diff --git "a/checkpoint-6176/trainer_state.json" "b/checkpoint-6176/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6176/trainer_state.json" @@ -0,0 +1,55618 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6176, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00016193020808031737, + "grad_norm": 2590324.25, + "learning_rate": 1e-05, + "loss": 27.9785, + "mean_token_accuracy": 0.10002041235566139, + "num_tokens": 1792.0, + "step": 1 + }, + { + "epoch": 0.00032386041616063474, + "grad_norm": 15321.634765625, + "learning_rate": 9.998380829015544e-06, + "loss": 12.7953, + "mean_token_accuracy": 0.1478351578116417, + "num_tokens": 3581.0, + "step": 2 + }, + { + "epoch": 0.00048579062424095217, + "grad_norm": 8141.26416015625, + "learning_rate": 9.99676165803109e-06, + "loss": 8.7196, + "mean_token_accuracy": 0.3560480624437332, + "num_tokens": 5371.0, + "step": 3 + }, + { + "epoch": 0.0006477208323212695, + "grad_norm": 4501.52099609375, + "learning_rate": 9.995142487046633e-06, + "loss": 6.0515, + "mean_token_accuracy": 0.5588205456733704, + "num_tokens": 7164.0, + "step": 4 + }, + { + "epoch": 0.0008096510404015869, + "grad_norm": 2391.61669921875, + "learning_rate": 9.993523316062177e-06, + "loss": 4.3382, + "mean_token_accuracy": 0.673895925283432, + "num_tokens": 8952.0, + "step": 5 + }, + { + "epoch": 0.0009715812484819043, + "grad_norm": 3996.810546875, + "learning_rate": 9.99190414507772e-06, + "loss": 3.6032, + "mean_token_accuracy": 0.6714812815189362, + "num_tokens": 10753.0, + "step": 6 + }, + { + "epoch": 0.0011335114565622216, + "grad_norm": 1601.2535400390625, + "learning_rate": 9.990284974093266e-06, + "loss": 2.967, + "mean_token_accuracy": 0.7058201134204865, + "num_tokens": 12540.0, + "step": 7 + }, + { + "epoch": 0.001295441664642539, + "grad_norm": 1023.8958740234375, + "learning_rate": 9.98866580310881e-06, + "loss": 2.5027, + "mean_token_accuracy": 0.7814523577690125, + "num_tokens": 14331.0, + "step": 8 + }, + { + "epoch": 0.0014573718727228565, + "grad_norm": 864.677734375, + "learning_rate": 9.987046632124353e-06, + "loss": 2.2716, + "mean_token_accuracy": 0.8148148059844971, + "num_tokens": 16118.0, + "step": 9 + }, + { + "epoch": 0.0016193020808031738, + "grad_norm": 865.9158325195312, + "learning_rate": 9.985427461139897e-06, + "loss": 2.0112, + "mean_token_accuracy": 0.8334065079689026, + "num_tokens": 17900.0, + "step": 10 + }, + { + "epoch": 0.0017812322888834911, + "grad_norm": 959.9893798828125, + "learning_rate": 9.983808290155442e-06, + "loss": 2.3178, + "mean_token_accuracy": 0.8049798905849457, + "num_tokens": 19694.0, + "step": 11 + }, + { + "epoch": 0.0019431624969638087, + "grad_norm": 732.5724487304688, + "learning_rate": 9.982189119170985e-06, + "loss": 1.9631, + "mean_token_accuracy": 0.8315800726413727, + "num_tokens": 21479.0, + "step": 12 + }, + { + "epoch": 0.002105092705044126, + "grad_norm": 734.8389282226562, + "learning_rate": 9.980569948186529e-06, + "loss": 1.9683, + "mean_token_accuracy": 0.8085106313228607, + "num_tokens": 23273.0, + "step": 13 + }, + { + "epoch": 0.0022670229131244433, + "grad_norm": 691.6798706054688, + "learning_rate": 9.978950777202073e-06, + "loss": 2.0535, + "mean_token_accuracy": 0.8219024240970612, + "num_tokens": 25065.0, + "step": 14 + }, + { + "epoch": 0.002428953121204761, + "grad_norm": 534.7596435546875, + "learning_rate": 9.977331606217618e-06, + "loss": 1.9077, + "mean_token_accuracy": 0.837618499994278, + "num_tokens": 26848.0, + "step": 15 + }, + { + "epoch": 0.002590883329285078, + "grad_norm": 492.1035461425781, + "learning_rate": 9.975712435233161e-06, + "loss": 1.6952, + "mean_token_accuracy": 0.8331745564937592, + "num_tokens": 28635.0, + "step": 16 + }, + { + "epoch": 0.0027528135373653955, + "grad_norm": 407.0179138183594, + "learning_rate": 9.974093264248705e-06, + "loss": 1.612, + "mean_token_accuracy": 0.8341322541236877, + "num_tokens": 30418.0, + "step": 17 + }, + { + "epoch": 0.002914743745445713, + "grad_norm": 465.31854248046875, + "learning_rate": 9.972474093264249e-06, + "loss": 1.804, + "mean_token_accuracy": 0.8161197006702423, + "num_tokens": 32218.0, + "step": 18 + }, + { + "epoch": 0.00307667395352603, + "grad_norm": 492.2633361816406, + "learning_rate": 9.970854922279794e-06, + "loss": 1.8954, + "mean_token_accuracy": 0.8170638978481293, + "num_tokens": 34020.0, + "step": 19 + }, + { + "epoch": 0.0032386041616063476, + "grad_norm": 367.78216552734375, + "learning_rate": 9.969235751295337e-06, + "loss": 1.8947, + "mean_token_accuracy": 0.8388837277889252, + "num_tokens": 35811.0, + "step": 20 + }, + { + "epoch": 0.003400534369686665, + "grad_norm": 357.5298156738281, + "learning_rate": 9.967616580310881e-06, + "loss": 1.7049, + "mean_token_accuracy": 0.8480354249477386, + "num_tokens": 37592.0, + "step": 21 + }, + { + "epoch": 0.0035624645777669823, + "grad_norm": 370.7132568359375, + "learning_rate": 9.965997409326426e-06, + "loss": 1.974, + "mean_token_accuracy": 0.8051095008850098, + "num_tokens": 39386.0, + "step": 22 + }, + { + "epoch": 0.0037243947858473, + "grad_norm": 415.8082580566406, + "learning_rate": 9.96437823834197e-06, + "loss": 1.9174, + "mean_token_accuracy": 0.8263655304908752, + "num_tokens": 41174.0, + "step": 23 + }, + { + "epoch": 0.0038863249939276173, + "grad_norm": 400.77783203125, + "learning_rate": 9.962759067357514e-06, + "loss": 2.4263, + "mean_token_accuracy": 0.7937062978744507, + "num_tokens": 42972.0, + "step": 24 + }, + { + "epoch": 0.004048255202007934, + "grad_norm": 316.3494873046875, + "learning_rate": 9.961139896373057e-06, + "loss": 1.5235, + "mean_token_accuracy": 0.8511984050273895, + "num_tokens": 44758.0, + "step": 25 + }, + { + "epoch": 0.004210185410088252, + "grad_norm": 332.2727355957031, + "learning_rate": 9.959520725388602e-06, + "loss": 1.6433, + "mean_token_accuracy": 0.8394160568714142, + "num_tokens": 46544.0, + "step": 26 + }, + { + "epoch": 0.0043721156181685695, + "grad_norm": 324.8083190917969, + "learning_rate": 9.957901554404146e-06, + "loss": 1.8186, + "mean_token_accuracy": 0.8315262198448181, + "num_tokens": 48335.0, + "step": 27 + }, + { + "epoch": 0.004534045826248887, + "grad_norm": 347.6258544921875, + "learning_rate": 9.95628238341969e-06, + "loss": 1.6425, + "mean_token_accuracy": 0.8448051810264587, + "num_tokens": 50130.0, + "step": 28 + }, + { + "epoch": 0.004695976034329204, + "grad_norm": 294.58636474609375, + "learning_rate": 9.954663212435233e-06, + "loss": 1.5152, + "mean_token_accuracy": 0.8535315692424774, + "num_tokens": 51915.0, + "step": 29 + }, + { + "epoch": 0.004857906242409522, + "grad_norm": 372.940185546875, + "learning_rate": 9.953044041450778e-06, + "loss": 1.9217, + "mean_token_accuracy": 0.81324702501297, + "num_tokens": 53711.0, + "step": 30 + }, + { + "epoch": 0.005019836450489839, + "grad_norm": 298.54095458984375, + "learning_rate": 9.951424870466322e-06, + "loss": 1.6812, + "mean_token_accuracy": 0.838798850774765, + "num_tokens": 55496.0, + "step": 31 + }, + { + "epoch": 0.005181766658570156, + "grad_norm": 319.18182373046875, + "learning_rate": 9.949805699481866e-06, + "loss": 1.6511, + "mean_token_accuracy": 0.822818785905838, + "num_tokens": 57302.0, + "step": 32 + }, + { + "epoch": 0.005343696866650474, + "grad_norm": 331.71240234375, + "learning_rate": 9.94818652849741e-06, + "loss": 1.9189, + "mean_token_accuracy": 0.8330687880516052, + "num_tokens": 59089.0, + "step": 33 + }, + { + "epoch": 0.005505627074730791, + "grad_norm": 272.4907531738281, + "learning_rate": 9.946567357512955e-06, + "loss": 1.4439, + "mean_token_accuracy": 0.8442226648330688, + "num_tokens": 60877.0, + "step": 34 + }, + { + "epoch": 0.005667557282811108, + "grad_norm": 219.92578125, + "learning_rate": 9.944948186528498e-06, + "loss": 1.4685, + "mean_token_accuracy": 0.8554767668247223, + "num_tokens": 62665.0, + "step": 35 + }, + { + "epoch": 0.005829487490891426, + "grad_norm": 267.6018981933594, + "learning_rate": 9.943329015544042e-06, + "loss": 1.628, + "mean_token_accuracy": 0.8447502851486206, + "num_tokens": 64454.0, + "step": 36 + }, + { + "epoch": 0.005991417698971743, + "grad_norm": 257.0301818847656, + "learning_rate": 9.941709844559585e-06, + "loss": 1.6321, + "mean_token_accuracy": 0.8432835936546326, + "num_tokens": 66234.0, + "step": 37 + }, + { + "epoch": 0.00615334790705206, + "grad_norm": 255.00665283203125, + "learning_rate": 9.94009067357513e-06, + "loss": 1.5121, + "mean_token_accuracy": 0.8339080214500427, + "num_tokens": 68029.0, + "step": 38 + }, + { + "epoch": 0.006315278115132378, + "grad_norm": 290.82635498046875, + "learning_rate": 9.938471502590674e-06, + "loss": 1.8195, + "mean_token_accuracy": 0.8093774914741516, + "num_tokens": 69824.0, + "step": 39 + }, + { + "epoch": 0.006477208323212695, + "grad_norm": 239.01097106933594, + "learning_rate": 9.936852331606218e-06, + "loss": 1.49, + "mean_token_accuracy": 0.8441810309886932, + "num_tokens": 71625.0, + "step": 40 + }, + { + "epoch": 0.006639138531293012, + "grad_norm": 281.5107421875, + "learning_rate": 9.935233160621763e-06, + "loss": 1.6659, + "mean_token_accuracy": 0.8286873996257782, + "num_tokens": 73423.0, + "step": 41 + }, + { + "epoch": 0.00680106873937333, + "grad_norm": 253.38821411132812, + "learning_rate": 9.933613989637307e-06, + "loss": 1.6001, + "mean_token_accuracy": 0.8374288380146027, + "num_tokens": 75218.0, + "step": 42 + }, + { + "epoch": 0.006962998947453647, + "grad_norm": 260.29718017578125, + "learning_rate": 9.93199481865285e-06, + "loss": 1.527, + "mean_token_accuracy": 0.8415662348270416, + "num_tokens": 77014.0, + "step": 43 + }, + { + "epoch": 0.0071249291555339645, + "grad_norm": 232.60289001464844, + "learning_rate": 9.930375647668394e-06, + "loss": 1.4487, + "mean_token_accuracy": 0.8467153310775757, + "num_tokens": 78800.0, + "step": 44 + }, + { + "epoch": 0.0072868593636142825, + "grad_norm": 232.3682861328125, + "learning_rate": 9.928756476683939e-06, + "loss": 1.2732, + "mean_token_accuracy": 0.865645170211792, + "num_tokens": 80587.0, + "step": 45 + }, + { + "epoch": 0.0074487895716946, + "grad_norm": 226.4539337158203, + "learning_rate": 9.927137305699483e-06, + "loss": 1.53, + "mean_token_accuracy": 0.8327760696411133, + "num_tokens": 82380.0, + "step": 46 + }, + { + "epoch": 0.007610719779774917, + "grad_norm": 223.02755737304688, + "learning_rate": 9.925518134715026e-06, + "loss": 1.3685, + "mean_token_accuracy": 0.8673881590366364, + "num_tokens": 84170.0, + "step": 47 + }, + { + "epoch": 0.007772649987855235, + "grad_norm": 199.50677490234375, + "learning_rate": 9.92389896373057e-06, + "loss": 1.7148, + "mean_token_accuracy": 0.8449667990207672, + "num_tokens": 85952.0, + "step": 48 + }, + { + "epoch": 0.007934580195935553, + "grad_norm": 229.42681884765625, + "learning_rate": 9.922279792746115e-06, + "loss": 1.5759, + "mean_token_accuracy": 0.8278166353702545, + "num_tokens": 87742.0, + "step": 49 + }, + { + "epoch": 0.008096510404015869, + "grad_norm": 218.40028381347656, + "learning_rate": 9.920660621761659e-06, + "loss": 1.3814, + "mean_token_accuracy": 0.8362470865249634, + "num_tokens": 89535.0, + "step": 50 + }, + { + "epoch": 0.008258440612096187, + "grad_norm": 250.39414978027344, + "learning_rate": 9.919041450777202e-06, + "loss": 1.6344, + "mean_token_accuracy": 0.8230555355548859, + "num_tokens": 91341.0, + "step": 51 + }, + { + "epoch": 0.008420370820176505, + "grad_norm": 223.5367889404297, + "learning_rate": 9.917422279792746e-06, + "loss": 1.651, + "mean_token_accuracy": 0.8363562524318695, + "num_tokens": 93134.0, + "step": 52 + }, + { + "epoch": 0.008582301028256821, + "grad_norm": 200.34713745117188, + "learning_rate": 9.915803108808291e-06, + "loss": 1.5119, + "mean_token_accuracy": 0.8532276153564453, + "num_tokens": 94932.0, + "step": 53 + }, + { + "epoch": 0.008744231236337139, + "grad_norm": 188.26747131347656, + "learning_rate": 9.914183937823835e-06, + "loss": 1.5943, + "mean_token_accuracy": 0.8267816007137299, + "num_tokens": 96720.0, + "step": 54 + }, + { + "epoch": 0.008906161444417457, + "grad_norm": 206.5025177001953, + "learning_rate": 9.912564766839378e-06, + "loss": 1.6023, + "mean_token_accuracy": 0.8385719060897827, + "num_tokens": 98516.0, + "step": 55 + }, + { + "epoch": 0.009068091652497773, + "grad_norm": 190.56814575195312, + "learning_rate": 9.910945595854922e-06, + "loss": 1.5202, + "mean_token_accuracy": 0.842502623796463, + "num_tokens": 100305.0, + "step": 56 + }, + { + "epoch": 0.009230021860578091, + "grad_norm": 189.0370330810547, + "learning_rate": 9.909326424870467e-06, + "loss": 1.4965, + "mean_token_accuracy": 0.8248813450336456, + "num_tokens": 102091.0, + "step": 57 + }, + { + "epoch": 0.009391952068658407, + "grad_norm": 188.38626098632812, + "learning_rate": 9.90770725388601e-06, + "loss": 1.4053, + "mean_token_accuracy": 0.8376361727714539, + "num_tokens": 103874.0, + "step": 58 + }, + { + "epoch": 0.009553882276738725, + "grad_norm": 166.5449676513672, + "learning_rate": 9.906088082901554e-06, + "loss": 1.1694, + "mean_token_accuracy": 0.8681569397449493, + "num_tokens": 105659.0, + "step": 59 + }, + { + "epoch": 0.009715812484819043, + "grad_norm": 293.2898254394531, + "learning_rate": 9.9044689119171e-06, + "loss": 1.6415, + "mean_token_accuracy": 0.8272086679935455, + "num_tokens": 107453.0, + "step": 60 + }, + { + "epoch": 0.00987774269289936, + "grad_norm": 192.3681640625, + "learning_rate": 9.902849740932643e-06, + "loss": 1.5366, + "mean_token_accuracy": 0.8456753194332123, + "num_tokens": 109237.0, + "step": 61 + }, + { + "epoch": 0.010039672900979678, + "grad_norm": 203.23016357421875, + "learning_rate": 9.901230569948187e-06, + "loss": 1.3134, + "mean_token_accuracy": 0.8415968716144562, + "num_tokens": 111039.0, + "step": 62 + }, + { + "epoch": 0.010201603109059995, + "grad_norm": 193.1846923828125, + "learning_rate": 9.89961139896373e-06, + "loss": 1.5576, + "mean_token_accuracy": 0.8363747000694275, + "num_tokens": 112826.0, + "step": 63 + }, + { + "epoch": 0.010363533317140312, + "grad_norm": 183.6842498779297, + "learning_rate": 9.897992227979276e-06, + "loss": 1.2775, + "mean_token_accuracy": 0.8560853004455566, + "num_tokens": 114616.0, + "step": 64 + }, + { + "epoch": 0.01052546352522063, + "grad_norm": 166.9985809326172, + "learning_rate": 9.89637305699482e-06, + "loss": 1.3046, + "mean_token_accuracy": 0.8644736707210541, + "num_tokens": 116401.0, + "step": 65 + }, + { + "epoch": 0.010687393733300948, + "grad_norm": 202.93296813964844, + "learning_rate": 9.894753886010363e-06, + "loss": 1.3966, + "mean_token_accuracy": 0.8367460072040558, + "num_tokens": 118195.0, + "step": 66 + }, + { + "epoch": 0.010849323941381264, + "grad_norm": 190.30007934570312, + "learning_rate": 9.893134715025907e-06, + "loss": 1.2746, + "mean_token_accuracy": 0.8372413516044617, + "num_tokens": 120002.0, + "step": 67 + }, + { + "epoch": 0.011011254149461582, + "grad_norm": 195.32217407226562, + "learning_rate": 9.891515544041452e-06, + "loss": 1.3609, + "mean_token_accuracy": 0.850742518901825, + "num_tokens": 121796.0, + "step": 68 + }, + { + "epoch": 0.0111731843575419, + "grad_norm": 188.3912811279297, + "learning_rate": 9.889896373056995e-06, + "loss": 1.248, + "mean_token_accuracy": 0.8554182052612305, + "num_tokens": 123585.0, + "step": 69 + }, + { + "epoch": 0.011335114565622216, + "grad_norm": 194.74266052246094, + "learning_rate": 9.888277202072539e-06, + "loss": 1.3703, + "mean_token_accuracy": 0.8520025014877319, + "num_tokens": 125374.0, + "step": 70 + }, + { + "epoch": 0.011497044773702534, + "grad_norm": 169.6554718017578, + "learning_rate": 9.886658031088083e-06, + "loss": 1.3225, + "mean_token_accuracy": 0.8728920221328735, + "num_tokens": 127162.0, + "step": 71 + }, + { + "epoch": 0.011658974981782852, + "grad_norm": 174.06930541992188, + "learning_rate": 9.885038860103628e-06, + "loss": 1.2812, + "mean_token_accuracy": 0.8392018675804138, + "num_tokens": 128954.0, + "step": 72 + }, + { + "epoch": 0.011820905189863168, + "grad_norm": 195.21556091308594, + "learning_rate": 9.883419689119171e-06, + "loss": 1.4616, + "mean_token_accuracy": 0.8464285731315613, + "num_tokens": 130746.0, + "step": 73 + }, + { + "epoch": 0.011982835397943486, + "grad_norm": 177.80078125, + "learning_rate": 9.881800518134715e-06, + "loss": 1.2312, + "mean_token_accuracy": 0.8633754253387451, + "num_tokens": 132535.0, + "step": 74 + }, + { + "epoch": 0.012144765606023804, + "grad_norm": 176.92259216308594, + "learning_rate": 9.880181347150259e-06, + "loss": 1.2379, + "mean_token_accuracy": 0.8515756130218506, + "num_tokens": 134323.0, + "step": 75 + }, + { + "epoch": 0.01230669581410412, + "grad_norm": 158.85617065429688, + "learning_rate": 9.878562176165804e-06, + "loss": 1.3788, + "mean_token_accuracy": 0.8581402599811554, + "num_tokens": 136110.0, + "step": 76 + }, + { + "epoch": 0.012468626022184438, + "grad_norm": 178.2767333984375, + "learning_rate": 9.876943005181348e-06, + "loss": 1.4577, + "mean_token_accuracy": 0.84085413813591, + "num_tokens": 137904.0, + "step": 77 + }, + { + "epoch": 0.012630556230264756, + "grad_norm": 158.47547912597656, + "learning_rate": 9.875323834196891e-06, + "loss": 1.3095, + "mean_token_accuracy": 0.8543533980846405, + "num_tokens": 139690.0, + "step": 78 + }, + { + "epoch": 0.012792486438345073, + "grad_norm": 203.07867431640625, + "learning_rate": 9.873704663212436e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.8548941910266876, + "num_tokens": 141477.0, + "step": 79 + }, + { + "epoch": 0.01295441664642539, + "grad_norm": 182.68307495117188, + "learning_rate": 9.87208549222798e-06, + "loss": 1.4291, + "mean_token_accuracy": 0.8384424149990082, + "num_tokens": 143267.0, + "step": 80 + }, + { + "epoch": 0.013116346854505708, + "grad_norm": 163.27781677246094, + "learning_rate": 9.870466321243524e-06, + "loss": 1.6462, + "mean_token_accuracy": 0.8403182923793793, + "num_tokens": 145067.0, + "step": 81 + }, + { + "epoch": 0.013278277062586025, + "grad_norm": 132.82940673828125, + "learning_rate": 9.868847150259067e-06, + "loss": 1.3092, + "mean_token_accuracy": 0.8508674502372742, + "num_tokens": 146854.0, + "step": 82 + }, + { + "epoch": 0.013440207270666343, + "grad_norm": 178.1233673095703, + "learning_rate": 9.867227979274612e-06, + "loss": 1.5437, + "mean_token_accuracy": 0.8374050855636597, + "num_tokens": 148649.0, + "step": 83 + }, + { + "epoch": 0.01360213747874666, + "grad_norm": 147.5355682373047, + "learning_rate": 9.865608808290156e-06, + "loss": 1.3282, + "mean_token_accuracy": 0.850056529045105, + "num_tokens": 150440.0, + "step": 84 + }, + { + "epoch": 0.013764067686826977, + "grad_norm": 180.29693603515625, + "learning_rate": 9.8639896373057e-06, + "loss": 1.5092, + "mean_token_accuracy": 0.8365455865859985, + "num_tokens": 152246.0, + "step": 85 + }, + { + "epoch": 0.013925997894907295, + "grad_norm": 186.56275939941406, + "learning_rate": 9.862370466321243e-06, + "loss": 1.2075, + "mean_token_accuracy": 0.8505065739154816, + "num_tokens": 154039.0, + "step": 86 + }, + { + "epoch": 0.014087928102987613, + "grad_norm": 169.25901794433594, + "learning_rate": 9.860751295336788e-06, + "loss": 1.5819, + "mean_token_accuracy": 0.8284255862236023, + "num_tokens": 155848.0, + "step": 87 + }, + { + "epoch": 0.014249858311067929, + "grad_norm": 154.59115600585938, + "learning_rate": 9.859132124352332e-06, + "loss": 1.3555, + "mean_token_accuracy": 0.840780645608902, + "num_tokens": 157649.0, + "step": 88 + }, + { + "epoch": 0.014411788519148247, + "grad_norm": 131.11549377441406, + "learning_rate": 9.857512953367876e-06, + "loss": 1.1094, + "mean_token_accuracy": 0.8674995005130768, + "num_tokens": 159444.0, + "step": 89 + }, + { + "epoch": 0.014573718727228565, + "grad_norm": 140.81982421875, + "learning_rate": 9.85589378238342e-06, + "loss": 1.242, + "mean_token_accuracy": 0.8588644564151764, + "num_tokens": 161238.0, + "step": 90 + }, + { + "epoch": 0.014735648935308881, + "grad_norm": 126.2032470703125, + "learning_rate": 9.854274611398965e-06, + "loss": 1.1502, + "mean_token_accuracy": 0.8686131536960602, + "num_tokens": 163024.0, + "step": 91 + }, + { + "epoch": 0.0148975791433892, + "grad_norm": 111.11841583251953, + "learning_rate": 9.852655440414508e-06, + "loss": 1.01, + "mean_token_accuracy": 0.8772802650928497, + "num_tokens": 164805.0, + "step": 92 + }, + { + "epoch": 0.015059509351469517, + "grad_norm": 139.14724731445312, + "learning_rate": 9.851036269430052e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.8784445524215698, + "num_tokens": 166597.0, + "step": 93 + }, + { + "epoch": 0.015221439559549833, + "grad_norm": 143.22569274902344, + "learning_rate": 9.849417098445595e-06, + "loss": 1.299, + "mean_token_accuracy": 0.8621516525745392, + "num_tokens": 168392.0, + "step": 94 + }, + { + "epoch": 0.015383369767630151, + "grad_norm": 166.4271697998047, + "learning_rate": 9.84779792746114e-06, + "loss": 1.4071, + "mean_token_accuracy": 0.8573804497718811, + "num_tokens": 170182.0, + "step": 95 + }, + { + "epoch": 0.01554529997571047, + "grad_norm": 152.1414031982422, + "learning_rate": 9.846178756476684e-06, + "loss": 1.3104, + "mean_token_accuracy": 0.8578595221042633, + "num_tokens": 171975.0, + "step": 96 + }, + { + "epoch": 0.015707230183790787, + "grad_norm": 157.1289825439453, + "learning_rate": 9.844559585492228e-06, + "loss": 1.6479, + "mean_token_accuracy": 0.8411366045475006, + "num_tokens": 173764.0, + "step": 97 + }, + { + "epoch": 0.015869160391871105, + "grad_norm": 143.55955505371094, + "learning_rate": 9.842940414507773e-06, + "loss": 1.1203, + "mean_token_accuracy": 0.8773466944694519, + "num_tokens": 175553.0, + "step": 98 + }, + { + "epoch": 0.01603109059995142, + "grad_norm": 153.3444061279297, + "learning_rate": 9.841321243523317e-06, + "loss": 1.144, + "mean_token_accuracy": 0.8682743906974792, + "num_tokens": 177346.0, + "step": 99 + }, + { + "epoch": 0.016193020808031738, + "grad_norm": 141.318115234375, + "learning_rate": 9.839702072538862e-06, + "loss": 1.268, + "mean_token_accuracy": 0.850713849067688, + "num_tokens": 179146.0, + "step": 100 + }, + { + "epoch": 0.016354951016112056, + "grad_norm": 226.34141540527344, + "learning_rate": 9.838082901554406e-06, + "loss": 1.4913, + "mean_token_accuracy": 0.8339845538139343, + "num_tokens": 180953.0, + "step": 101 + }, + { + "epoch": 0.016516881224192374, + "grad_norm": 149.2474822998047, + "learning_rate": 9.836463730569949e-06, + "loss": 1.3925, + "mean_token_accuracy": 0.8564562499523163, + "num_tokens": 182744.0, + "step": 102 + }, + { + "epoch": 0.01667881143227269, + "grad_norm": 156.7301025390625, + "learning_rate": 9.834844559585494e-06, + "loss": 1.265, + "mean_token_accuracy": 0.8668636083602905, + "num_tokens": 184534.0, + "step": 103 + }, + { + "epoch": 0.01684074164035301, + "grad_norm": 126.90946197509766, + "learning_rate": 9.833225388601038e-06, + "loss": 1.4091, + "mean_token_accuracy": 0.8362042903900146, + "num_tokens": 186327.0, + "step": 104 + }, + { + "epoch": 0.017002671848433324, + "grad_norm": 142.30873107910156, + "learning_rate": 9.831606217616582e-06, + "loss": 1.3331, + "mean_token_accuracy": 0.8343567252159119, + "num_tokens": 188129.0, + "step": 105 + }, + { + "epoch": 0.017164602056513642, + "grad_norm": 140.3267059326172, + "learning_rate": 9.829987046632125e-06, + "loss": 1.3125, + "mean_token_accuracy": 0.8535894751548767, + "num_tokens": 189921.0, + "step": 106 + }, + { + "epoch": 0.01732653226459396, + "grad_norm": 134.98789978027344, + "learning_rate": 9.82836787564767e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.8522522449493408, + "num_tokens": 191716.0, + "step": 107 + }, + { + "epoch": 0.017488462472674278, + "grad_norm": 145.98971557617188, + "learning_rate": 9.826748704663214e-06, + "loss": 1.4273, + "mean_token_accuracy": 0.8379662036895752, + "num_tokens": 193511.0, + "step": 108 + }, + { + "epoch": 0.017650392680754596, + "grad_norm": 135.8827362060547, + "learning_rate": 9.825129533678758e-06, + "loss": 1.2056, + "mean_token_accuracy": 0.8697141110897064, + "num_tokens": 195307.0, + "step": 109 + }, + { + "epoch": 0.017812322888834914, + "grad_norm": 121.84761810302734, + "learning_rate": 9.823510362694301e-06, + "loss": 1.1506, + "mean_token_accuracy": 0.8648231029510498, + "num_tokens": 197093.0, + "step": 110 + }, + { + "epoch": 0.01797425309691523, + "grad_norm": 148.14280700683594, + "learning_rate": 9.821891191709846e-06, + "loss": 1.5437, + "mean_token_accuracy": 0.84389927983284, + "num_tokens": 198893.0, + "step": 111 + }, + { + "epoch": 0.018136183304995546, + "grad_norm": 131.35768127441406, + "learning_rate": 9.82027202072539e-06, + "loss": 1.4677, + "mean_token_accuracy": 0.835106372833252, + "num_tokens": 200690.0, + "step": 112 + }, + { + "epoch": 0.018298113513075864, + "grad_norm": 129.72560119628906, + "learning_rate": 9.818652849740934e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.8749391734600067, + "num_tokens": 202474.0, + "step": 113 + }, + { + "epoch": 0.018460043721156182, + "grad_norm": 119.53143310546875, + "learning_rate": 9.817033678756477e-06, + "loss": 1.2916, + "mean_token_accuracy": 0.8645299077033997, + "num_tokens": 204251.0, + "step": 114 + }, + { + "epoch": 0.0186219739292365, + "grad_norm": 128.28497314453125, + "learning_rate": 9.815414507772023e-06, + "loss": 1.4726, + "mean_token_accuracy": 0.8503647744655609, + "num_tokens": 206044.0, + "step": 115 + }, + { + "epoch": 0.018783904137316815, + "grad_norm": 147.23959350585938, + "learning_rate": 9.813795336787566e-06, + "loss": 1.7437, + "mean_token_accuracy": 0.8233158588409424, + "num_tokens": 207838.0, + "step": 116 + }, + { + "epoch": 0.018945834345397133, + "grad_norm": 139.5876007080078, + "learning_rate": 9.81217616580311e-06, + "loss": 1.2578, + "mean_token_accuracy": 0.8539618849754333, + "num_tokens": 209631.0, + "step": 117 + }, + { + "epoch": 0.01910776455347745, + "grad_norm": 118.18374633789062, + "learning_rate": 9.810556994818653e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.8750790357589722, + "num_tokens": 211419.0, + "step": 118 + }, + { + "epoch": 0.01926969476155777, + "grad_norm": 163.70236206054688, + "learning_rate": 9.808937823834199e-06, + "loss": 1.333, + "mean_token_accuracy": 0.8524993658065796, + "num_tokens": 213209.0, + "step": 119 + }, + { + "epoch": 0.019431624969638087, + "grad_norm": 178.47259521484375, + "learning_rate": 9.807318652849742e-06, + "loss": 1.4292, + "mean_token_accuracy": 0.842380940914154, + "num_tokens": 215011.0, + "step": 120 + }, + { + "epoch": 0.019593555177718405, + "grad_norm": 102.3499755859375, + "learning_rate": 9.805699481865286e-06, + "loss": 0.827, + "mean_token_accuracy": 0.8908371031284332, + "num_tokens": 216789.0, + "step": 121 + }, + { + "epoch": 0.01975548538579872, + "grad_norm": 157.03076171875, + "learning_rate": 9.804080310880831e-06, + "loss": 1.6988, + "mean_token_accuracy": 0.8290434181690216, + "num_tokens": 218590.0, + "step": 122 + }, + { + "epoch": 0.019917415593879037, + "grad_norm": 133.3473358154297, + "learning_rate": 9.802461139896375e-06, + "loss": 1.2202, + "mean_token_accuracy": 0.8512350916862488, + "num_tokens": 220384.0, + "step": 123 + }, + { + "epoch": 0.020079345801959355, + "grad_norm": 117.95792388916016, + "learning_rate": 9.800841968911918e-06, + "loss": 1.2215, + "mean_token_accuracy": 0.8653438985347748, + "num_tokens": 222171.0, + "step": 124 + }, + { + "epoch": 0.020241276010039673, + "grad_norm": 115.05699920654297, + "learning_rate": 9.799222797927462e-06, + "loss": 1.2581, + "mean_token_accuracy": 0.8516252934932709, + "num_tokens": 223959.0, + "step": 125 + }, + { + "epoch": 0.02040320621811999, + "grad_norm": 124.35389709472656, + "learning_rate": 9.797603626943007e-06, + "loss": 1.2404, + "mean_token_accuracy": 0.8611111044883728, + "num_tokens": 225744.0, + "step": 126 + }, + { + "epoch": 0.02056513642620031, + "grad_norm": 111.64168548583984, + "learning_rate": 9.79598445595855e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.870034396648407, + "num_tokens": 227533.0, + "step": 127 + }, + { + "epoch": 0.020727066634280623, + "grad_norm": 110.30130004882812, + "learning_rate": 9.794365284974094e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.8764550089836121, + "num_tokens": 229320.0, + "step": 128 + }, + { + "epoch": 0.02088899684236094, + "grad_norm": 136.05819702148438, + "learning_rate": 9.792746113989638e-06, + "loss": 1.342, + "mean_token_accuracy": 0.8593847751617432, + "num_tokens": 231109.0, + "step": 129 + }, + { + "epoch": 0.02105092705044126, + "grad_norm": 155.8380126953125, + "learning_rate": 9.791126943005183e-06, + "loss": 1.3679, + "mean_token_accuracy": 0.8501408398151398, + "num_tokens": 232913.0, + "step": 130 + }, + { + "epoch": 0.021212857258521577, + "grad_norm": 112.73963928222656, + "learning_rate": 9.789507772020727e-06, + "loss": 1.1217, + "mean_token_accuracy": 0.87177973985672, + "num_tokens": 234698.0, + "step": 131 + }, + { + "epoch": 0.021374787466601895, + "grad_norm": 128.66358947753906, + "learning_rate": 9.78788860103627e-06, + "loss": 1.2164, + "mean_token_accuracy": 0.8356244564056396, + "num_tokens": 236484.0, + "step": 132 + }, + { + "epoch": 0.021536717674682213, + "grad_norm": 122.71953582763672, + "learning_rate": 9.786269430051814e-06, + "loss": 1.1271, + "mean_token_accuracy": 0.8730820715427399, + "num_tokens": 238280.0, + "step": 133 + }, + { + "epoch": 0.021698647882762528, + "grad_norm": 119.80425262451172, + "learning_rate": 9.78465025906736e-06, + "loss": 1.1418, + "mean_token_accuracy": 0.8698275983333588, + "num_tokens": 240077.0, + "step": 134 + }, + { + "epoch": 0.021860578090842846, + "grad_norm": 123.12356567382812, + "learning_rate": 9.783031088082903e-06, + "loss": 1.1351, + "mean_token_accuracy": 0.8713235259056091, + "num_tokens": 241861.0, + "step": 135 + }, + { + "epoch": 0.022022508298923164, + "grad_norm": 137.73861694335938, + "learning_rate": 9.781411917098446e-06, + "loss": 1.2338, + "mean_token_accuracy": 0.8567132651805878, + "num_tokens": 243652.0, + "step": 136 + }, + { + "epoch": 0.02218443850700348, + "grad_norm": 116.47329711914062, + "learning_rate": 9.77979274611399e-06, + "loss": 1.1096, + "mean_token_accuracy": 0.8680764138698578, + "num_tokens": 245437.0, + "step": 137 + }, + { + "epoch": 0.0223463687150838, + "grad_norm": 125.22412109375, + "learning_rate": 9.778173575129535e-06, + "loss": 1.3607, + "mean_token_accuracy": 0.8363218009471893, + "num_tokens": 247224.0, + "step": 138 + }, + { + "epoch": 0.022508298923164118, + "grad_norm": 129.5410919189453, + "learning_rate": 9.776554404145079e-06, + "loss": 1.4095, + "mean_token_accuracy": 0.8447043597698212, + "num_tokens": 249026.0, + "step": 139 + }, + { + "epoch": 0.022670229131244432, + "grad_norm": 134.6291961669922, + "learning_rate": 9.774935233160622e-06, + "loss": 1.5326, + "mean_token_accuracy": 0.8500875234603882, + "num_tokens": 250825.0, + "step": 140 + }, + { + "epoch": 0.02283215933932475, + "grad_norm": 147.4597930908203, + "learning_rate": 9.773316062176168e-06, + "loss": 1.2712, + "mean_token_accuracy": 0.8485411107540131, + "num_tokens": 252627.0, + "step": 141 + }, + { + "epoch": 0.022994089547405068, + "grad_norm": 132.6239471435547, + "learning_rate": 9.771696891191711e-06, + "loss": 1.3977, + "mean_token_accuracy": 0.8432987034320831, + "num_tokens": 254413.0, + "step": 142 + }, + { + "epoch": 0.023156019755485386, + "grad_norm": 126.1785659790039, + "learning_rate": 9.770077720207255e-06, + "loss": 1.3923, + "mean_token_accuracy": 0.8496575355529785, + "num_tokens": 256211.0, + "step": 143 + }, + { + "epoch": 0.023317949963565704, + "grad_norm": 123.70172882080078, + "learning_rate": 9.768458549222798e-06, + "loss": 1.4467, + "mean_token_accuracy": 0.8444103002548218, + "num_tokens": 258012.0, + "step": 144 + }, + { + "epoch": 0.023479880171646022, + "grad_norm": 109.92925262451172, + "learning_rate": 9.766839378238344e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.8661601543426514, + "num_tokens": 259800.0, + "step": 145 + }, + { + "epoch": 0.023641810379726336, + "grad_norm": 104.02082824707031, + "learning_rate": 9.765220207253887e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.8659451305866241, + "num_tokens": 261594.0, + "step": 146 + }, + { + "epoch": 0.023803740587806654, + "grad_norm": 115.35977172851562, + "learning_rate": 9.763601036269431e-06, + "loss": 1.1863, + "mean_token_accuracy": 0.8485915660858154, + "num_tokens": 263390.0, + "step": 147 + }, + { + "epoch": 0.023965670795886972, + "grad_norm": 98.9872055053711, + "learning_rate": 9.761981865284975e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.8793355226516724, + "num_tokens": 265184.0, + "step": 148 + }, + { + "epoch": 0.02412760100396729, + "grad_norm": 127.40291595458984, + "learning_rate": 9.76036269430052e-06, + "loss": 1.2655, + "mean_token_accuracy": 0.8649792373180389, + "num_tokens": 266977.0, + "step": 149 + }, + { + "epoch": 0.024289531212047608, + "grad_norm": 131.8263702392578, + "learning_rate": 9.758743523316063e-06, + "loss": 1.3194, + "mean_token_accuracy": 0.8503649830818176, + "num_tokens": 268763.0, + "step": 150 + }, + { + "epoch": 0.024451461420127926, + "grad_norm": 116.09439849853516, + "learning_rate": 9.757124352331607e-06, + "loss": 1.1478, + "mean_token_accuracy": 0.8628665804862976, + "num_tokens": 270566.0, + "step": 151 + }, + { + "epoch": 0.02461339162820824, + "grad_norm": 117.9870376586914, + "learning_rate": 9.75550518134715e-06, + "loss": 1.3406, + "mean_token_accuracy": 0.8555416464805603, + "num_tokens": 272355.0, + "step": 152 + }, + { + "epoch": 0.02477532183628856, + "grad_norm": 104.34040069580078, + "learning_rate": 9.753886010362696e-06, + "loss": 1.0825, + "mean_token_accuracy": 0.8654018640518188, + "num_tokens": 274134.0, + "step": 153 + }, + { + "epoch": 0.024937252044368877, + "grad_norm": 109.73971557617188, + "learning_rate": 9.75226683937824e-06, + "loss": 1.1913, + "mean_token_accuracy": 0.8550039529800415, + "num_tokens": 275922.0, + "step": 154 + }, + { + "epoch": 0.025099182252449195, + "grad_norm": 120.62830352783203, + "learning_rate": 9.750647668393783e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.8692438304424286, + "num_tokens": 277717.0, + "step": 155 + }, + { + "epoch": 0.025261112460529513, + "grad_norm": 119.07559967041016, + "learning_rate": 9.749028497409327e-06, + "loss": 1.3981, + "mean_token_accuracy": 0.8505167663097382, + "num_tokens": 279510.0, + "step": 156 + }, + { + "epoch": 0.02542304266860983, + "grad_norm": 111.41227722167969, + "learning_rate": 9.747409326424872e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.8773983418941498, + "num_tokens": 281299.0, + "step": 157 + }, + { + "epoch": 0.025584972876690145, + "grad_norm": 126.73580932617188, + "learning_rate": 9.745790155440416e-06, + "loss": 1.413, + "mean_token_accuracy": 0.837442934513092, + "num_tokens": 283092.0, + "step": 158 + }, + { + "epoch": 0.025746903084770463, + "grad_norm": 126.39917755126953, + "learning_rate": 9.744170984455959e-06, + "loss": 1.2451, + "mean_token_accuracy": 0.8299241065979004, + "num_tokens": 284898.0, + "step": 159 + }, + { + "epoch": 0.02590883329285078, + "grad_norm": 120.39822387695312, + "learning_rate": 9.742551813471504e-06, + "loss": 1.3836, + "mean_token_accuracy": 0.8522522449493408, + "num_tokens": 286693.0, + "step": 160 + }, + { + "epoch": 0.0260707635009311, + "grad_norm": 115.48530578613281, + "learning_rate": 9.740932642487048e-06, + "loss": 1.073, + "mean_token_accuracy": 0.8645526766777039, + "num_tokens": 288493.0, + "step": 161 + }, + { + "epoch": 0.026232693709011417, + "grad_norm": 122.31940460205078, + "learning_rate": 9.739313471502592e-06, + "loss": 1.3647, + "mean_token_accuracy": 0.8451739847660065, + "num_tokens": 290283.0, + "step": 162 + }, + { + "epoch": 0.026394623917091735, + "grad_norm": 92.15382385253906, + "learning_rate": 9.737694300518135e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.8815624713897705, + "num_tokens": 292072.0, + "step": 163 + }, + { + "epoch": 0.02655655412517205, + "grad_norm": 92.47595977783203, + "learning_rate": 9.73607512953368e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.8732142746448517, + "num_tokens": 293868.0, + "step": 164 + }, + { + "epoch": 0.026718484333252367, + "grad_norm": 114.09842681884766, + "learning_rate": 9.734455958549224e-06, + "loss": 1.2642, + "mean_token_accuracy": 0.839160829782486, + "num_tokens": 295666.0, + "step": 165 + }, + { + "epoch": 0.026880414541332685, + "grad_norm": 100.58135986328125, + "learning_rate": 9.732836787564768e-06, + "loss": 1.2702, + "mean_token_accuracy": 0.8551343381404877, + "num_tokens": 297461.0, + "step": 166 + }, + { + "epoch": 0.027042344749413003, + "grad_norm": 117.5189208984375, + "learning_rate": 9.731217616580311e-06, + "loss": 1.2679, + "mean_token_accuracy": 0.8572303652763367, + "num_tokens": 299253.0, + "step": 167 + }, + { + "epoch": 0.02720427495749332, + "grad_norm": 103.39527893066406, + "learning_rate": 9.729598445595857e-06, + "loss": 1.2223, + "mean_token_accuracy": 0.8658588528633118, + "num_tokens": 301056.0, + "step": 168 + }, + { + "epoch": 0.02736620516557364, + "grad_norm": 129.08644104003906, + "learning_rate": 9.7279792746114e-06, + "loss": 1.3509, + "mean_token_accuracy": 0.8439637720584869, + "num_tokens": 302850.0, + "step": 169 + }, + { + "epoch": 0.027528135373653954, + "grad_norm": 114.4383544921875, + "learning_rate": 9.726360103626944e-06, + "loss": 1.2728, + "mean_token_accuracy": 0.8337662220001221, + "num_tokens": 304656.0, + "step": 170 + }, + { + "epoch": 0.02769006558173427, + "grad_norm": 90.71302032470703, + "learning_rate": 9.724740932642487e-06, + "loss": 1.1357, + "mean_token_accuracy": 0.8639854788780212, + "num_tokens": 306440.0, + "step": 171 + }, + { + "epoch": 0.02785199578981459, + "grad_norm": 116.95759582519531, + "learning_rate": 9.723121761658033e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.8832117021083832, + "num_tokens": 308226.0, + "step": 172 + }, + { + "epoch": 0.028013925997894908, + "grad_norm": 150.5602264404297, + "learning_rate": 9.721502590673576e-06, + "loss": 1.2547, + "mean_token_accuracy": 0.8491332530975342, + "num_tokens": 310023.0, + "step": 173 + }, + { + "epoch": 0.028175856205975226, + "grad_norm": 92.75274658203125, + "learning_rate": 9.71988341968912e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.8627997934818268, + "num_tokens": 311812.0, + "step": 174 + }, + { + "epoch": 0.028337786414055544, + "grad_norm": 82.1319808959961, + "learning_rate": 9.718264248704663e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.9042253494262695, + "num_tokens": 313596.0, + "step": 175 + }, + { + "epoch": 0.028499716622135858, + "grad_norm": 119.94003295898438, + "learning_rate": 9.716645077720209e-06, + "loss": 1.1073, + "mean_token_accuracy": 0.8617897927761078, + "num_tokens": 315397.0, + "step": 176 + }, + { + "epoch": 0.028661646830216176, + "grad_norm": 121.94509887695312, + "learning_rate": 9.715025906735752e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.8742299377918243, + "num_tokens": 317187.0, + "step": 177 + }, + { + "epoch": 0.028823577038296494, + "grad_norm": 93.1749267578125, + "learning_rate": 9.713406735751296e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.8875661194324493, + "num_tokens": 318974.0, + "step": 178 + }, + { + "epoch": 0.028985507246376812, + "grad_norm": 97.17940521240234, + "learning_rate": 9.711787564766841e-06, + "loss": 1.1198, + "mean_token_accuracy": 0.8783540725708008, + "num_tokens": 320765.0, + "step": 179 + }, + { + "epoch": 0.02914743745445713, + "grad_norm": 131.29702758789062, + "learning_rate": 9.710168393782385e-06, + "loss": 1.5697, + "mean_token_accuracy": 0.8433793485164642, + "num_tokens": 322564.0, + "step": 180 + }, + { + "epoch": 0.029309367662537448, + "grad_norm": 119.54278564453125, + "learning_rate": 9.708549222797928e-06, + "loss": 1.5281, + "mean_token_accuracy": 0.8380559682846069, + "num_tokens": 324359.0, + "step": 181 + }, + { + "epoch": 0.029471297870617762, + "grad_norm": 97.37931060791016, + "learning_rate": 9.706930051813472e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.8845532238483429, + "num_tokens": 326148.0, + "step": 182 + }, + { + "epoch": 0.02963322807869808, + "grad_norm": 92.8282470703125, + "learning_rate": 9.705310880829017e-06, + "loss": 1.3343, + "mean_token_accuracy": 0.8527897298336029, + "num_tokens": 327943.0, + "step": 183 + }, + { + "epoch": 0.0297951582867784, + "grad_norm": 105.1593017578125, + "learning_rate": 9.70369170984456e-06, + "loss": 1.1945, + "mean_token_accuracy": 0.857710987329483, + "num_tokens": 329736.0, + "step": 184 + }, + { + "epoch": 0.029957088494858716, + "grad_norm": 103.52218627929688, + "learning_rate": 9.702072538860104e-06, + "loss": 1.1561, + "mean_token_accuracy": 0.8648359179496765, + "num_tokens": 331528.0, + "step": 185 + }, + { + "epoch": 0.030119018702939034, + "grad_norm": 106.11717987060547, + "learning_rate": 9.700453367875648e-06, + "loss": 1.2645, + "mean_token_accuracy": 0.8563829660415649, + "num_tokens": 333325.0, + "step": 186 + }, + { + "epoch": 0.030280948911019352, + "grad_norm": 102.07538604736328, + "learning_rate": 9.698834196891193e-06, + "loss": 1.165, + "mean_token_accuracy": 0.8626444339752197, + "num_tokens": 335121.0, + "step": 187 + }, + { + "epoch": 0.030442879119099667, + "grad_norm": 103.69478607177734, + "learning_rate": 9.697215025906737e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.8777984976768494, + "num_tokens": 336911.0, + "step": 188 + }, + { + "epoch": 0.030604809327179985, + "grad_norm": 102.62185668945312, + "learning_rate": 9.69559585492228e-06, + "loss": 1.1987, + "mean_token_accuracy": 0.8626984059810638, + "num_tokens": 338707.0, + "step": 189 + }, + { + "epoch": 0.030766739535260303, + "grad_norm": 86.39649963378906, + "learning_rate": 9.693976683937824e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.8647922873497009, + "num_tokens": 340500.0, + "step": 190 + }, + { + "epoch": 0.03092866974334062, + "grad_norm": 95.78366088867188, + "learning_rate": 9.69235751295337e-06, + "loss": 1.2656, + "mean_token_accuracy": 0.8473006188869476, + "num_tokens": 342300.0, + "step": 191 + }, + { + "epoch": 0.03109059995142094, + "grad_norm": 102.48013305664062, + "learning_rate": 9.690738341968913e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.8623949587345123, + "num_tokens": 344081.0, + "step": 192 + }, + { + "epoch": 0.03125253015950125, + "grad_norm": 93.49293518066406, + "learning_rate": 9.689119170984456e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.879696249961853, + "num_tokens": 345867.0, + "step": 193 + }, + { + "epoch": 0.031414460367581575, + "grad_norm": 79.96947479248047, + "learning_rate": 9.6875e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.894948273897171, + "num_tokens": 347655.0, + "step": 194 + }, + { + "epoch": 0.03157639057566189, + "grad_norm": 70.88719177246094, + "learning_rate": 9.685880829015545e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.879696249961853, + "num_tokens": 349441.0, + "step": 195 + }, + { + "epoch": 0.03173832078374221, + "grad_norm": 94.55944061279297, + "learning_rate": 9.684261658031089e-06, + "loss": 1.2195, + "mean_token_accuracy": 0.8540273606777191, + "num_tokens": 351234.0, + "step": 196 + }, + { + "epoch": 0.031900250991822525, + "grad_norm": 80.9522705078125, + "learning_rate": 9.682642487046632e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.893457680940628, + "num_tokens": 353018.0, + "step": 197 + }, + { + "epoch": 0.03206218119990284, + "grad_norm": 121.10845947265625, + "learning_rate": 9.681023316062178e-06, + "loss": 1.218, + "mean_token_accuracy": 0.8588652610778809, + "num_tokens": 354806.0, + "step": 198 + }, + { + "epoch": 0.03222411140798316, + "grad_norm": 89.33942413330078, + "learning_rate": 9.679404145077721e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.8747301995754242, + "num_tokens": 356597.0, + "step": 199 + }, + { + "epoch": 0.032386041616063475, + "grad_norm": 76.56890106201172, + "learning_rate": 9.677784974093265e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.880248099565506, + "num_tokens": 358376.0, + "step": 200 + }, + { + "epoch": 0.0325479718241438, + "grad_norm": 98.30876922607422, + "learning_rate": 9.676165803108809e-06, + "loss": 1.3148, + "mean_token_accuracy": 0.8574938774108887, + "num_tokens": 360168.0, + "step": 201 + }, + { + "epoch": 0.03270990203222411, + "grad_norm": 92.54721069335938, + "learning_rate": 9.674546632124354e-06, + "loss": 1.3033, + "mean_token_accuracy": 0.8603916168212891, + "num_tokens": 361959.0, + "step": 202 + }, + { + "epoch": 0.032871832240304426, + "grad_norm": 69.83611297607422, + "learning_rate": 9.672927461139897e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.899563193321228, + "num_tokens": 363750.0, + "step": 203 + }, + { + "epoch": 0.03303376244838475, + "grad_norm": 79.83929443359375, + "learning_rate": 9.671308290155441e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.8892720341682434, + "num_tokens": 365542.0, + "step": 204 + }, + { + "epoch": 0.03319569265646506, + "grad_norm": 104.99756622314453, + "learning_rate": 9.669689119170985e-06, + "loss": 1.2795, + "mean_token_accuracy": 0.8637835085391998, + "num_tokens": 367333.0, + "step": 205 + }, + { + "epoch": 0.03335762286454538, + "grad_norm": 78.70339965820312, + "learning_rate": 9.66806994818653e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.8583677411079407, + "num_tokens": 369120.0, + "step": 206 + }, + { + "epoch": 0.0335195530726257, + "grad_norm": 99.41761779785156, + "learning_rate": 9.666450777202073e-06, + "loss": 1.3328, + "mean_token_accuracy": 0.8484255969524384, + "num_tokens": 370909.0, + "step": 207 + }, + { + "epoch": 0.03368148328070602, + "grad_norm": 72.52552795410156, + "learning_rate": 9.664831606217617e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.890016108751297, + "num_tokens": 372694.0, + "step": 208 + }, + { + "epoch": 0.033843413488786334, + "grad_norm": 78.51555633544922, + "learning_rate": 9.66321243523316e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.8875661194324493, + "num_tokens": 374481.0, + "step": 209 + }, + { + "epoch": 0.03400534369686665, + "grad_norm": 92.80706024169922, + "learning_rate": 9.661593264248706e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.8640248775482178, + "num_tokens": 376280.0, + "step": 210 + }, + { + "epoch": 0.03416727390494697, + "grad_norm": 98.85427856445312, + "learning_rate": 9.65997409326425e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.8770212829113007, + "num_tokens": 378083.0, + "step": 211 + }, + { + "epoch": 0.034329204113027284, + "grad_norm": 99.2446060180664, + "learning_rate": 9.658354922279793e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.854392796754837, + "num_tokens": 379883.0, + "step": 212 + }, + { + "epoch": 0.034491134321107605, + "grad_norm": 92.38343811035156, + "learning_rate": 9.656735751295337e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.8636363744735718, + "num_tokens": 381681.0, + "step": 213 + }, + { + "epoch": 0.03465306452918792, + "grad_norm": 87.25627899169922, + "learning_rate": 9.655116580310882e-06, + "loss": 1.114, + "mean_token_accuracy": 0.8650175333023071, + "num_tokens": 383480.0, + "step": 214 + }, + { + "epoch": 0.034814994737268234, + "grad_norm": 78.69329833984375, + "learning_rate": 9.653497409326426e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.8893114328384399, + "num_tokens": 385272.0, + "step": 215 + }, + { + "epoch": 0.034976924945348556, + "grad_norm": 86.92837524414062, + "learning_rate": 9.651878238341969e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.8828863203525543, + "num_tokens": 387064.0, + "step": 216 + }, + { + "epoch": 0.03513885515342887, + "grad_norm": 73.95407104492188, + "learning_rate": 9.650259067357514e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.8853031992912292, + "num_tokens": 388855.0, + "step": 217 + }, + { + "epoch": 0.03530078536150919, + "grad_norm": 80.76441192626953, + "learning_rate": 9.648639896373058e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.8897058963775635, + "num_tokens": 390639.0, + "step": 218 + }, + { + "epoch": 0.035462715569589506, + "grad_norm": 101.61139678955078, + "learning_rate": 9.647020725388602e-06, + "loss": 1.4082, + "mean_token_accuracy": 0.8655229806900024, + "num_tokens": 392433.0, + "step": 219 + }, + { + "epoch": 0.03562464577766983, + "grad_norm": 81.81256103515625, + "learning_rate": 9.645401554404145e-06, + "loss": 1.2368, + "mean_token_accuracy": 0.8521648347377777, + "num_tokens": 394222.0, + "step": 220 + }, + { + "epoch": 0.03578657598575014, + "grad_norm": 85.47928619384766, + "learning_rate": 9.64378238341969e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.8652519881725311, + "num_tokens": 396009.0, + "step": 221 + }, + { + "epoch": 0.03594850619383046, + "grad_norm": 93.98357391357422, + "learning_rate": 9.642163212435234e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.8631469905376434, + "num_tokens": 397799.0, + "step": 222 + }, + { + "epoch": 0.03611043640191078, + "grad_norm": 89.43865966796875, + "learning_rate": 9.640544041450778e-06, + "loss": 1.1659, + "mean_token_accuracy": 0.86446213722229, + "num_tokens": 399591.0, + "step": 223 + }, + { + "epoch": 0.03627236660999109, + "grad_norm": 77.89927673339844, + "learning_rate": 9.638924870466321e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.8854834735393524, + "num_tokens": 401391.0, + "step": 224 + }, + { + "epoch": 0.036434296818071414, + "grad_norm": 84.23726654052734, + "learning_rate": 9.637305699481867e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.8648186326026917, + "num_tokens": 403185.0, + "step": 225 + }, + { + "epoch": 0.03659622702615173, + "grad_norm": 83.42762756347656, + "learning_rate": 9.63568652849741e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.8731481730937958, + "num_tokens": 404972.0, + "step": 226 + }, + { + "epoch": 0.03675815723423204, + "grad_norm": 90.8319320678711, + "learning_rate": 9.634067357512954e-06, + "loss": 1.2173, + "mean_token_accuracy": 0.8509507179260254, + "num_tokens": 406764.0, + "step": 227 + }, + { + "epoch": 0.036920087442312365, + "grad_norm": 71.17721557617188, + "learning_rate": 9.632448186528497e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.8913982510566711, + "num_tokens": 408543.0, + "step": 228 + }, + { + "epoch": 0.03708201765039268, + "grad_norm": 64.10284423828125, + "learning_rate": 9.630829015544043e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.89650759100914, + "num_tokens": 410326.0, + "step": 229 + }, + { + "epoch": 0.037243947858473, + "grad_norm": 81.04692840576172, + "learning_rate": 9.629209844559586e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.875975489616394, + "num_tokens": 412119.0, + "step": 230 + }, + { + "epoch": 0.037405878066553315, + "grad_norm": 84.19200897216797, + "learning_rate": 9.62759067357513e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.868961364030838, + "num_tokens": 413913.0, + "step": 231 + }, + { + "epoch": 0.03756780827463363, + "grad_norm": 92.13495635986328, + "learning_rate": 9.625971502590673e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.8625689744949341, + "num_tokens": 415701.0, + "step": 232 + }, + { + "epoch": 0.03772973848271395, + "grad_norm": 96.21236419677734, + "learning_rate": 9.624352331606219e-06, + "loss": 1.1076, + "mean_token_accuracy": 0.8659087419509888, + "num_tokens": 417489.0, + "step": 233 + }, + { + "epoch": 0.037891668690794265, + "grad_norm": 62.594482421875, + "learning_rate": 9.622733160621762e-06, + "loss": 0.711, + "mean_token_accuracy": 0.9117647111415863, + "num_tokens": 419273.0, + "step": 234 + }, + { + "epoch": 0.03805359889887459, + "grad_norm": 72.2493667602539, + "learning_rate": 9.621113989637306e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.8702192008495331, + "num_tokens": 421067.0, + "step": 235 + }, + { + "epoch": 0.0382155291069549, + "grad_norm": 76.11592864990234, + "learning_rate": 9.619494818652851e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.8894314765930176, + "num_tokens": 422858.0, + "step": 236 + }, + { + "epoch": 0.03837745931503522, + "grad_norm": 72.14783477783203, + "learning_rate": 9.617875647668395e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.8973430097103119, + "num_tokens": 424643.0, + "step": 237 + }, + { + "epoch": 0.03853938952311554, + "grad_norm": 61.82394027709961, + "learning_rate": 9.616256476683938e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.9017053246498108, + "num_tokens": 426429.0, + "step": 238 + }, + { + "epoch": 0.03870131973119585, + "grad_norm": 86.99252319335938, + "learning_rate": 9.614637305699482e-06, + "loss": 1.1055, + "mean_token_accuracy": 0.8616040349006653, + "num_tokens": 428215.0, + "step": 239 + }, + { + "epoch": 0.03886324993927617, + "grad_norm": 83.90782928466797, + "learning_rate": 9.613018134715027e-06, + "loss": 1.1475, + "mean_token_accuracy": 0.8636764287948608, + "num_tokens": 430005.0, + "step": 240 + }, + { + "epoch": 0.03902518014735649, + "grad_norm": 78.6772232055664, + "learning_rate": 9.61139896373057e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.8970588445663452, + "num_tokens": 431789.0, + "step": 241 + }, + { + "epoch": 0.03918711035543681, + "grad_norm": 101.41692352294922, + "learning_rate": 9.609779792746114e-06, + "loss": 1.1547, + "mean_token_accuracy": 0.8578526079654694, + "num_tokens": 433589.0, + "step": 242 + }, + { + "epoch": 0.039349040563517124, + "grad_norm": 83.16380310058594, + "learning_rate": 9.608160621761658e-06, + "loss": 1.1516, + "mean_token_accuracy": 0.8645991683006287, + "num_tokens": 435382.0, + "step": 243 + }, + { + "epoch": 0.03951097077159744, + "grad_norm": 75.57198333740234, + "learning_rate": 9.606541450777203e-06, + "loss": 1.1315, + "mean_token_accuracy": 0.8804621994495392, + "num_tokens": 437170.0, + "step": 244 + }, + { + "epoch": 0.03967290097967776, + "grad_norm": 89.52605438232422, + "learning_rate": 9.604922279792747e-06, + "loss": 1.1864, + "mean_token_accuracy": 0.8767799437046051, + "num_tokens": 438966.0, + "step": 245 + }, + { + "epoch": 0.039834831187758074, + "grad_norm": 78.80630493164062, + "learning_rate": 9.60330310880829e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.8812949657440186, + "num_tokens": 440756.0, + "step": 246 + }, + { + "epoch": 0.039996761395838396, + "grad_norm": 60.73417282104492, + "learning_rate": 9.601683937823834e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.8968297243118286, + "num_tokens": 442539.0, + "step": 247 + }, + { + "epoch": 0.04015869160391871, + "grad_norm": 92.85823059082031, + "learning_rate": 9.60006476683938e-06, + "loss": 1.3841, + "mean_token_accuracy": 0.8435491025447845, + "num_tokens": 444339.0, + "step": 248 + }, + { + "epoch": 0.04032062181199903, + "grad_norm": 82.19344329833984, + "learning_rate": 9.598445595854923e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.8718289136886597, + "num_tokens": 446131.0, + "step": 249 + }, + { + "epoch": 0.040482552020079346, + "grad_norm": 69.2301025390625, + "learning_rate": 9.596826424870466e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.8918404579162598, + "num_tokens": 447920.0, + "step": 250 + }, + { + "epoch": 0.04064448222815966, + "grad_norm": 73.9405517578125, + "learning_rate": 9.59520725388601e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.8873015940189362, + "num_tokens": 449716.0, + "step": 251 + }, + { + "epoch": 0.04080641243623998, + "grad_norm": 79.64219665527344, + "learning_rate": 9.593588082901555e-06, + "loss": 1.2847, + "mean_token_accuracy": 0.8657226860523224, + "num_tokens": 451511.0, + "step": 252 + }, + { + "epoch": 0.040968342644320296, + "grad_norm": 68.41842651367188, + "learning_rate": 9.591968911917099e-06, + "loss": 1.1943, + "mean_token_accuracy": 0.8614130318164825, + "num_tokens": 453297.0, + "step": 253 + }, + { + "epoch": 0.04113027285240062, + "grad_norm": 66.58094787597656, + "learning_rate": 9.590349740932642e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.8794757127761841, + "num_tokens": 455083.0, + "step": 254 + }, + { + "epoch": 0.04129220306048093, + "grad_norm": 78.18601989746094, + "learning_rate": 9.588730569948188e-06, + "loss": 1.3005, + "mean_token_accuracy": 0.8566032946109772, + "num_tokens": 456874.0, + "step": 255 + }, + { + "epoch": 0.04145413326856125, + "grad_norm": 70.24598693847656, + "learning_rate": 9.587111398963731e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.8764172494411469, + "num_tokens": 458677.0, + "step": 256 + }, + { + "epoch": 0.04161606347664157, + "grad_norm": 59.934486389160156, + "learning_rate": 9.585492227979275e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.873127281665802, + "num_tokens": 460457.0, + "step": 257 + }, + { + "epoch": 0.04177799368472188, + "grad_norm": 70.09595489501953, + "learning_rate": 9.583873056994819e-06, + "loss": 1.1482, + "mean_token_accuracy": 0.8628380000591278, + "num_tokens": 462253.0, + "step": 258 + }, + { + "epoch": 0.041939923892802204, + "grad_norm": 82.5488510131836, + "learning_rate": 9.582253886010364e-06, + "loss": 1.1594, + "mean_token_accuracy": 0.8703097105026245, + "num_tokens": 464049.0, + "step": 259 + }, + { + "epoch": 0.04210185410088252, + "grad_norm": 59.300804138183594, + "learning_rate": 9.580634715025907e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.8900361657142639, + "num_tokens": 465851.0, + "step": 260 + }, + { + "epoch": 0.04226378430896284, + "grad_norm": 81.78755950927734, + "learning_rate": 9.579015544041451e-06, + "loss": 1.4321, + "mean_token_accuracy": 0.8499999940395355, + "num_tokens": 467643.0, + "step": 261 + }, + { + "epoch": 0.042425714517043155, + "grad_norm": 77.30547332763672, + "learning_rate": 9.577396373056995e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.8765076100826263, + "num_tokens": 469430.0, + "step": 262 + }, + { + "epoch": 0.04258764472512347, + "grad_norm": 55.55488586425781, + "learning_rate": 9.57577720207254e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.8917297720909119, + "num_tokens": 471218.0, + "step": 263 + }, + { + "epoch": 0.04274957493320379, + "grad_norm": 82.53324890136719, + "learning_rate": 9.574158031088083e-06, + "loss": 0.969, + "mean_token_accuracy": 0.8780970573425293, + "num_tokens": 473009.0, + "step": 264 + }, + { + "epoch": 0.042911505141284105, + "grad_norm": 53.665287017822266, + "learning_rate": 9.572538860103627e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.8975433111190796, + "num_tokens": 474786.0, + "step": 265 + }, + { + "epoch": 0.043073435349364426, + "grad_norm": 69.20823669433594, + "learning_rate": 9.57091968911917e-06, + "loss": 1.2566, + "mean_token_accuracy": 0.879696249961853, + "num_tokens": 476572.0, + "step": 266 + }, + { + "epoch": 0.04323536555744474, + "grad_norm": 72.65019989013672, + "learning_rate": 9.569300518134716e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.8810641765594482, + "num_tokens": 478370.0, + "step": 267 + }, + { + "epoch": 0.043397295765525055, + "grad_norm": 57.39815139770508, + "learning_rate": 9.56768134715026e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.8956836760044098, + "num_tokens": 480150.0, + "step": 268 + }, + { + "epoch": 0.04355922597360538, + "grad_norm": 62.54112243652344, + "learning_rate": 9.566062176165803e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.893990695476532, + "num_tokens": 481926.0, + "step": 269 + }, + { + "epoch": 0.04372115618168569, + "grad_norm": 73.81266784667969, + "learning_rate": 9.564443005181347e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.8785529732704163, + "num_tokens": 483702.0, + "step": 270 + }, + { + "epoch": 0.04388308638976601, + "grad_norm": 78.97330474853516, + "learning_rate": 9.562823834196892e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.8996659815311432, + "num_tokens": 485493.0, + "step": 271 + }, + { + "epoch": 0.04404501659784633, + "grad_norm": 84.11725616455078, + "learning_rate": 9.561204663212436e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.8642781376838684, + "num_tokens": 487306.0, + "step": 272 + }, + { + "epoch": 0.04420694680592665, + "grad_norm": 81.29261779785156, + "learning_rate": 9.559585492227979e-06, + "loss": 1.2821, + "mean_token_accuracy": 0.8477867245674133, + "num_tokens": 489100.0, + "step": 273 + }, + { + "epoch": 0.04436887701400696, + "grad_norm": 62.31085968017578, + "learning_rate": 9.557966321243524e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.8943609595298767, + "num_tokens": 490896.0, + "step": 274 + }, + { + "epoch": 0.04453080722208728, + "grad_norm": 71.10617065429688, + "learning_rate": 9.556347150259068e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.8795271515846252, + "num_tokens": 492690.0, + "step": 275 + }, + { + "epoch": 0.0446927374301676, + "grad_norm": 83.48087310791016, + "learning_rate": 9.554727979274612e-06, + "loss": 1.159, + "mean_token_accuracy": 0.8639097809791565, + "num_tokens": 494494.0, + "step": 276 + }, + { + "epoch": 0.044854667638247914, + "grad_norm": 54.94813919067383, + "learning_rate": 9.553108808290155e-06, + "loss": 0.702, + "mean_token_accuracy": 0.9010069966316223, + "num_tokens": 496278.0, + "step": 277 + }, + { + "epoch": 0.045016597846328235, + "grad_norm": 64.06427001953125, + "learning_rate": 9.5514896373057e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.8793289959430695, + "num_tokens": 498062.0, + "step": 278 + }, + { + "epoch": 0.04517852805440855, + "grad_norm": 65.4741439819336, + "learning_rate": 9.549870466321244e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.8856909573078156, + "num_tokens": 499854.0, + "step": 279 + }, + { + "epoch": 0.045340458262488864, + "grad_norm": 66.4945297241211, + "learning_rate": 9.548251295336788e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.8850524425506592, + "num_tokens": 501645.0, + "step": 280 + }, + { + "epoch": 0.045502388470569186, + "grad_norm": 71.58250427246094, + "learning_rate": 9.546632124352331e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.8801409304141998, + "num_tokens": 503439.0, + "step": 281 + }, + { + "epoch": 0.0456643186786495, + "grad_norm": 60.97646713256836, + "learning_rate": 9.545012953367877e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.8957557082176208, + "num_tokens": 505229.0, + "step": 282 + }, + { + "epoch": 0.04582624888672982, + "grad_norm": 62.338478088378906, + "learning_rate": 9.54339378238342e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.8967047929763794, + "num_tokens": 507012.0, + "step": 283 + }, + { + "epoch": 0.045988179094810136, + "grad_norm": 70.01009368896484, + "learning_rate": 9.541774611398964e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.8655172288417816, + "num_tokens": 508814.0, + "step": 284 + }, + { + "epoch": 0.04615010930289046, + "grad_norm": 67.59593200683594, + "learning_rate": 9.540155440414507e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.8930011093616486, + "num_tokens": 510597.0, + "step": 285 + }, + { + "epoch": 0.04631203951097077, + "grad_norm": 70.6162109375, + "learning_rate": 9.538536269430053e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.8682743906974792, + "num_tokens": 512390.0, + "step": 286 + }, + { + "epoch": 0.046473969719051086, + "grad_norm": 61.420379638671875, + "learning_rate": 9.536917098445596e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.9003976583480835, + "num_tokens": 514173.0, + "step": 287 + }, + { + "epoch": 0.04663589992713141, + "grad_norm": 67.64603424072266, + "learning_rate": 9.53529792746114e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.8959807753562927, + "num_tokens": 515954.0, + "step": 288 + }, + { + "epoch": 0.04679783013521172, + "grad_norm": 67.98467254638672, + "learning_rate": 9.533678756476683e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.8892516791820526, + "num_tokens": 517744.0, + "step": 289 + }, + { + "epoch": 0.046959760343292044, + "grad_norm": 70.8077392578125, + "learning_rate": 9.532059585492229e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.8863345980644226, + "num_tokens": 519536.0, + "step": 290 + }, + { + "epoch": 0.04712169055137236, + "grad_norm": 62.81648254394531, + "learning_rate": 9.530440414507774e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.905844658613205, + "num_tokens": 521324.0, + "step": 291 + }, + { + "epoch": 0.04728362075945267, + "grad_norm": 65.97950744628906, + "learning_rate": 9.528821243523318e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.8746402859687805, + "num_tokens": 523115.0, + "step": 292 + }, + { + "epoch": 0.047445550967532994, + "grad_norm": 66.72357940673828, + "learning_rate": 9.527202072538861e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.9001141786575317, + "num_tokens": 524905.0, + "step": 293 + }, + { + "epoch": 0.04760748117561331, + "grad_norm": 80.37352752685547, + "learning_rate": 9.525582901554405e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.875331699848175, + "num_tokens": 526697.0, + "step": 294 + }, + { + "epoch": 0.04776941138369363, + "grad_norm": 54.132625579833984, + "learning_rate": 9.52396373056995e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.8980975151062012, + "num_tokens": 528474.0, + "step": 295 + }, + { + "epoch": 0.047931341591773945, + "grad_norm": 70.5918960571289, + "learning_rate": 9.522344559585494e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.8865539729595184, + "num_tokens": 530259.0, + "step": 296 + }, + { + "epoch": 0.048093271799854266, + "grad_norm": 72.65715026855469, + "learning_rate": 9.520725388601037e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.8867753744125366, + "num_tokens": 532053.0, + "step": 297 + }, + { + "epoch": 0.04825520200793458, + "grad_norm": 55.18707275390625, + "learning_rate": 9.519106217616582e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.9007679224014282, + "num_tokens": 533837.0, + "step": 298 + }, + { + "epoch": 0.048417132216014895, + "grad_norm": 77.32190704345703, + "learning_rate": 9.517487046632126e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.8728955388069153, + "num_tokens": 535630.0, + "step": 299 + }, + { + "epoch": 0.048579062424095217, + "grad_norm": 60.8797607421875, + "learning_rate": 9.51586787564767e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.8908167481422424, + "num_tokens": 537417.0, + "step": 300 + }, + { + "epoch": 0.04874099263217553, + "grad_norm": 56.40549087524414, + "learning_rate": 9.514248704663213e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.9058971703052521, + "num_tokens": 539205.0, + "step": 301 + }, + { + "epoch": 0.04890292284025585, + "grad_norm": 66.17485809326172, + "learning_rate": 9.512629533678758e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.8811090290546417, + "num_tokens": 540994.0, + "step": 302 + }, + { + "epoch": 0.04906485304833617, + "grad_norm": 59.788265228271484, + "learning_rate": 9.511010362694302e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.8920482099056244, + "num_tokens": 542795.0, + "step": 303 + }, + { + "epoch": 0.04922678325641648, + "grad_norm": 77.38652038574219, + "learning_rate": 9.509391191709846e-06, + "loss": 1.3083, + "mean_token_accuracy": 0.851588249206543, + "num_tokens": 544590.0, + "step": 304 + }, + { + "epoch": 0.0493887134644968, + "grad_norm": 80.82234191894531, + "learning_rate": 9.50777202072539e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.873657613992691, + "num_tokens": 546379.0, + "step": 305 + }, + { + "epoch": 0.04955064367257712, + "grad_norm": 62.7913932800293, + "learning_rate": 9.506152849740935e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.8920454680919647, + "num_tokens": 548170.0, + "step": 306 + }, + { + "epoch": 0.04971257388065744, + "grad_norm": 81.53544616699219, + "learning_rate": 9.504533678756478e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.8778295814990997, + "num_tokens": 549969.0, + "step": 307 + }, + { + "epoch": 0.04987450408873775, + "grad_norm": 64.9419174194336, + "learning_rate": 9.502914507772022e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.878756046295166, + "num_tokens": 551774.0, + "step": 308 + }, + { + "epoch": 0.050036434296818075, + "grad_norm": 63.24518966674805, + "learning_rate": 9.501295336787565e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.8807711601257324, + "num_tokens": 553567.0, + "step": 309 + }, + { + "epoch": 0.05019836450489839, + "grad_norm": 49.60218048095703, + "learning_rate": 9.49967616580311e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.9033337235450745, + "num_tokens": 555348.0, + "step": 310 + }, + { + "epoch": 0.050360294712978704, + "grad_norm": 49.804195404052734, + "learning_rate": 9.498056994818654e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.915304571390152, + "num_tokens": 557131.0, + "step": 311 + }, + { + "epoch": 0.050522224921059025, + "grad_norm": 78.1496810913086, + "learning_rate": 9.496437823834198e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.8697479069232941, + "num_tokens": 558926.0, + "step": 312 + }, + { + "epoch": 0.05068415512913934, + "grad_norm": 62.136505126953125, + "learning_rate": 9.494818652849741e-06, + "loss": 0.895, + "mean_token_accuracy": 0.8791474103927612, + "num_tokens": 560711.0, + "step": 313 + }, + { + "epoch": 0.05084608533721966, + "grad_norm": 61.435508728027344, + "learning_rate": 9.493199481865287e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.8876686692237854, + "num_tokens": 562508.0, + "step": 314 + }, + { + "epoch": 0.051008015545299976, + "grad_norm": 51.740135192871094, + "learning_rate": 9.49158031088083e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.8983990252017975, + "num_tokens": 564305.0, + "step": 315 + }, + { + "epoch": 0.05116994575338029, + "grad_norm": 64.10138702392578, + "learning_rate": 9.489961139896374e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.8905942142009735, + "num_tokens": 566091.0, + "step": 316 + }, + { + "epoch": 0.05133187596146061, + "grad_norm": 81.78770446777344, + "learning_rate": 9.488341968911919e-06, + "loss": 1.174, + "mean_token_accuracy": 0.8632535636425018, + "num_tokens": 567888.0, + "step": 317 + }, + { + "epoch": 0.051493806169540926, + "grad_norm": 69.4294662475586, + "learning_rate": 9.486722797927463e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.8850221633911133, + "num_tokens": 569669.0, + "step": 318 + }, + { + "epoch": 0.05165573637762125, + "grad_norm": 66.24795532226562, + "learning_rate": 9.485103626943006e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.8722862899303436, + "num_tokens": 571447.0, + "step": 319 + }, + { + "epoch": 0.05181766658570156, + "grad_norm": 56.03800582885742, + "learning_rate": 9.48348445595855e-06, + "loss": 0.885, + "mean_token_accuracy": 0.8868373930454254, + "num_tokens": 573243.0, + "step": 320 + }, + { + "epoch": 0.05197959679378188, + "grad_norm": 66.54347229003906, + "learning_rate": 9.481865284974095e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.8723843097686768, + "num_tokens": 575037.0, + "step": 321 + }, + { + "epoch": 0.0521415270018622, + "grad_norm": 58.032493591308594, + "learning_rate": 9.480246113989639e-06, + "loss": 0.982, + "mean_token_accuracy": 0.8778461813926697, + "num_tokens": 576820.0, + "step": 322 + }, + { + "epoch": 0.05230345720994251, + "grad_norm": 67.31257629394531, + "learning_rate": 9.478626943005182e-06, + "loss": 1.1216, + "mean_token_accuracy": 0.8691913485527039, + "num_tokens": 578621.0, + "step": 323 + }, + { + "epoch": 0.052465387418022834, + "grad_norm": 47.67085647583008, + "learning_rate": 9.477007772020726e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.9005842208862305, + "num_tokens": 580415.0, + "step": 324 + }, + { + "epoch": 0.05262731762610315, + "grad_norm": 58.07624816894531, + "learning_rate": 9.475388601036271e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.8872580230236053, + "num_tokens": 582202.0, + "step": 325 + }, + { + "epoch": 0.05278924783418347, + "grad_norm": 56.813499450683594, + "learning_rate": 9.473769430051815e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.884892076253891, + "num_tokens": 583992.0, + "step": 326 + }, + { + "epoch": 0.052951178042263784, + "grad_norm": 59.193092346191406, + "learning_rate": 9.472150259067358e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.875, + "num_tokens": 585776.0, + "step": 327 + }, + { + "epoch": 0.0531131082503441, + "grad_norm": 69.3732681274414, + "learning_rate": 9.470531088082902e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.8646662831306458, + "num_tokens": 587568.0, + "step": 328 + }, + { + "epoch": 0.05327503845842442, + "grad_norm": 65.90499877929688, + "learning_rate": 9.468911917098447e-06, + "loss": 1.2268, + "mean_token_accuracy": 0.8905171155929565, + "num_tokens": 589372.0, + "step": 329 + }, + { + "epoch": 0.053436968666504735, + "grad_norm": 73.06372833251953, + "learning_rate": 9.467292746113991e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.8717085123062134, + "num_tokens": 591164.0, + "step": 330 + }, + { + "epoch": 0.053598898874585056, + "grad_norm": 44.78315353393555, + "learning_rate": 9.465673575129534e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.9079106450080872, + "num_tokens": 592958.0, + "step": 331 + }, + { + "epoch": 0.05376082908266537, + "grad_norm": 50.79363250732422, + "learning_rate": 9.464054404145078e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.8981527090072632, + "num_tokens": 594755.0, + "step": 332 + }, + { + "epoch": 0.05392275929074569, + "grad_norm": 54.239864349365234, + "learning_rate": 9.462435233160623e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.8912129402160645, + "num_tokens": 596543.0, + "step": 333 + }, + { + "epoch": 0.05408468949882601, + "grad_norm": 60.15306091308594, + "learning_rate": 9.460816062176167e-06, + "loss": 1.1363, + "mean_token_accuracy": 0.8676398992538452, + "num_tokens": 598327.0, + "step": 334 + }, + { + "epoch": 0.05424661970690632, + "grad_norm": 53.625343322753906, + "learning_rate": 9.45919689119171e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.8868244886398315, + "num_tokens": 600122.0, + "step": 335 + }, + { + "epoch": 0.05440854991498664, + "grad_norm": 55.842166900634766, + "learning_rate": 9.457577720207256e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.8848241567611694, + "num_tokens": 601911.0, + "step": 336 + }, + { + "epoch": 0.05457048012306696, + "grad_norm": 52.97017288208008, + "learning_rate": 9.4559585492228e-06, + "loss": 0.844, + "mean_token_accuracy": 0.8799320757389069, + "num_tokens": 603706.0, + "step": 337 + }, + { + "epoch": 0.05473241033114728, + "grad_norm": 59.960540771484375, + "learning_rate": 9.454339378238343e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.8954392969608307, + "num_tokens": 605499.0, + "step": 338 + }, + { + "epoch": 0.05489434053922759, + "grad_norm": 59.53228759765625, + "learning_rate": 9.452720207253887e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.893928050994873, + "num_tokens": 607294.0, + "step": 339 + }, + { + "epoch": 0.05505627074730791, + "grad_norm": 55.91241455078125, + "learning_rate": 9.451101036269432e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.8915935754776001, + "num_tokens": 609091.0, + "step": 340 + }, + { + "epoch": 0.05521820095538823, + "grad_norm": 54.15565490722656, + "learning_rate": 9.449481865284975e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.8945436477661133, + "num_tokens": 610887.0, + "step": 341 + }, + { + "epoch": 0.05538013116346854, + "grad_norm": 64.91709899902344, + "learning_rate": 9.447862694300519e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.8832933604717255, + "num_tokens": 612673.0, + "step": 342 + }, + { + "epoch": 0.055542061371548865, + "grad_norm": 52.16558837890625, + "learning_rate": 9.446243523316063e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.893382340669632, + "num_tokens": 614457.0, + "step": 343 + }, + { + "epoch": 0.05570399157962918, + "grad_norm": 59.97270965576172, + "learning_rate": 9.444624352331608e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.8834813833236694, + "num_tokens": 616260.0, + "step": 344 + }, + { + "epoch": 0.055865921787709494, + "grad_norm": 50.6729621887207, + "learning_rate": 9.443005181347151e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.8850965797901154, + "num_tokens": 618042.0, + "step": 345 + }, + { + "epoch": 0.056027851995789815, + "grad_norm": 73.4538345336914, + "learning_rate": 9.441386010362695e-06, + "loss": 1.3164, + "mean_token_accuracy": 0.8522437214851379, + "num_tokens": 619845.0, + "step": 346 + }, + { + "epoch": 0.05618978220387013, + "grad_norm": 54.023887634277344, + "learning_rate": 9.439766839378239e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.8756336271762848, + "num_tokens": 621638.0, + "step": 347 + }, + { + "epoch": 0.05635171241195045, + "grad_norm": 44.938392639160156, + "learning_rate": 9.438147668393784e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.8960838913917542, + "num_tokens": 623429.0, + "step": 348 + }, + { + "epoch": 0.056513642620030766, + "grad_norm": 48.227046966552734, + "learning_rate": 9.436528497409328e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.8854352533817291, + "num_tokens": 625229.0, + "step": 349 + }, + { + "epoch": 0.05667557282811109, + "grad_norm": 55.08008575439453, + "learning_rate": 9.434909326424871e-06, + "loss": 0.809, + "mean_token_accuracy": 0.8933570086956024, + "num_tokens": 627022.0, + "step": 350 + }, + { + "epoch": 0.0568375030361914, + "grad_norm": 36.199302673339844, + "learning_rate": 9.433290155440415e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.923739492893219, + "num_tokens": 628810.0, + "step": 351 + }, + { + "epoch": 0.056999433244271716, + "grad_norm": 60.80564498901367, + "learning_rate": 9.43167098445596e-06, + "loss": 1.1439, + "mean_token_accuracy": 0.8571495413780212, + "num_tokens": 630615.0, + "step": 352 + }, + { + "epoch": 0.05716136345235204, + "grad_norm": 45.7132453918457, + "learning_rate": 9.430051813471504e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.8997512459754944, + "num_tokens": 632396.0, + "step": 353 + }, + { + "epoch": 0.05732329366043235, + "grad_norm": 51.128448486328125, + "learning_rate": 9.428432642487047e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.8830883800983429, + "num_tokens": 634173.0, + "step": 354 + }, + { + "epoch": 0.05748522386851267, + "grad_norm": 51.39332580566406, + "learning_rate": 9.426813471502592e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.88721963763237, + "num_tokens": 635960.0, + "step": 355 + }, + { + "epoch": 0.05764715407659299, + "grad_norm": 60.653045654296875, + "learning_rate": 9.425194300518136e-06, + "loss": 1.1926, + "mean_token_accuracy": 0.873776912689209, + "num_tokens": 637765.0, + "step": 356 + }, + { + "epoch": 0.0578090842846733, + "grad_norm": 57.790435791015625, + "learning_rate": 9.42357512953368e-06, + "loss": 1.175, + "mean_token_accuracy": 0.8596720099449158, + "num_tokens": 639562.0, + "step": 357 + }, + { + "epoch": 0.057971014492753624, + "grad_norm": 52.67477035522461, + "learning_rate": 9.421955958549223e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.8744454383850098, + "num_tokens": 641359.0, + "step": 358 + }, + { + "epoch": 0.05813294470083394, + "grad_norm": 52.826515197753906, + "learning_rate": 9.420336787564769e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.8944833278656006, + "num_tokens": 643155.0, + "step": 359 + }, + { + "epoch": 0.05829487490891426, + "grad_norm": 61.656402587890625, + "learning_rate": 9.418717616580312e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.8801389932632446, + "num_tokens": 644952.0, + "step": 360 + }, + { + "epoch": 0.058456805116994574, + "grad_norm": 55.095741271972656, + "learning_rate": 9.417098445595856e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.8971613943576813, + "num_tokens": 646736.0, + "step": 361 + }, + { + "epoch": 0.058618735325074896, + "grad_norm": 53.96144485473633, + "learning_rate": 9.4154792746114e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.8833284676074982, + "num_tokens": 648535.0, + "step": 362 + }, + { + "epoch": 0.05878066553315521, + "grad_norm": 51.875186920166016, + "learning_rate": 9.413860103626945e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.8789206147193909, + "num_tokens": 650319.0, + "step": 363 + }, + { + "epoch": 0.058942595741235525, + "grad_norm": 53.755615234375, + "learning_rate": 9.412240932642488e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.8697420656681061, + "num_tokens": 652115.0, + "step": 364 + }, + { + "epoch": 0.059104525949315846, + "grad_norm": 56.433921813964844, + "learning_rate": 9.410621761658032e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.8689115643501282, + "num_tokens": 653924.0, + "step": 365 + }, + { + "epoch": 0.05926645615739616, + "grad_norm": 56.5037956237793, + "learning_rate": 9.409002590673575e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.8764738440513611, + "num_tokens": 655719.0, + "step": 366 + }, + { + "epoch": 0.05942838636547648, + "grad_norm": 54.06751251220703, + "learning_rate": 9.40738341968912e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.9025388360023499, + "num_tokens": 657508.0, + "step": 367 + }, + { + "epoch": 0.0595903165735568, + "grad_norm": 50.80967330932617, + "learning_rate": 9.405764248704664e-06, + "loss": 0.856, + "mean_token_accuracy": 0.894341230392456, + "num_tokens": 659293.0, + "step": 368 + }, + { + "epoch": 0.05975224678163711, + "grad_norm": 53.56394577026367, + "learning_rate": 9.404145077720208e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.895600438117981, + "num_tokens": 661083.0, + "step": 369 + }, + { + "epoch": 0.05991417698971743, + "grad_norm": 40.78882598876953, + "learning_rate": 9.402525906735751e-06, + "loss": 0.634, + "mean_token_accuracy": 0.9120330810546875, + "num_tokens": 662868.0, + "step": 370 + }, + { + "epoch": 0.06007610719779775, + "grad_norm": 50.89396286010742, + "learning_rate": 9.400906735751297e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.8937198221683502, + "num_tokens": 664653.0, + "step": 371 + }, + { + "epoch": 0.06023803740587807, + "grad_norm": 60.67742919921875, + "learning_rate": 9.39928756476684e-06, + "loss": 1.1599, + "mean_token_accuracy": 0.8480996787548065, + "num_tokens": 666447.0, + "step": 372 + }, + { + "epoch": 0.06039996761395838, + "grad_norm": 59.49285888671875, + "learning_rate": 9.397668393782384e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.8697862327098846, + "num_tokens": 668236.0, + "step": 373 + }, + { + "epoch": 0.060561897822038704, + "grad_norm": 58.06311798095703, + "learning_rate": 9.396049222797929e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.8803423047065735, + "num_tokens": 670030.0, + "step": 374 + }, + { + "epoch": 0.06072382803011902, + "grad_norm": 45.018775939941406, + "learning_rate": 9.394430051813473e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.9039260447025299, + "num_tokens": 671823.0, + "step": 375 + }, + { + "epoch": 0.06088575823819933, + "grad_norm": 61.07163619995117, + "learning_rate": 9.392810880829016e-06, + "loss": 1.1467, + "mean_token_accuracy": 0.8736453056335449, + "num_tokens": 673620.0, + "step": 376 + }, + { + "epoch": 0.061047688446279655, + "grad_norm": 60.545082092285156, + "learning_rate": 9.39119170984456e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.8868243098258972, + "num_tokens": 675416.0, + "step": 377 + }, + { + "epoch": 0.06120961865435997, + "grad_norm": 50.514041900634766, + "learning_rate": 9.389572538860105e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.9117647111415863, + "num_tokens": 677200.0, + "step": 378 + }, + { + "epoch": 0.06137154886244029, + "grad_norm": 50.44256591796875, + "learning_rate": 9.387953367875649e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.8913865685462952, + "num_tokens": 678988.0, + "step": 379 + }, + { + "epoch": 0.061533479070520605, + "grad_norm": 67.69346618652344, + "learning_rate": 9.386334196891192e-06, + "loss": 1.2634, + "mean_token_accuracy": 0.8442807197570801, + "num_tokens": 680783.0, + "step": 380 + }, + { + "epoch": 0.06169540927860092, + "grad_norm": 56.003379821777344, + "learning_rate": 9.384715025906736e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.8904095888137817, + "num_tokens": 682578.0, + "step": 381 + }, + { + "epoch": 0.06185733948668124, + "grad_norm": 48.92418670654297, + "learning_rate": 9.383095854922281e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.9082009792327881, + "num_tokens": 684373.0, + "step": 382 + }, + { + "epoch": 0.062019269694761556, + "grad_norm": 66.3655014038086, + "learning_rate": 9.381476683937825e-06, + "loss": 0.999, + "mean_token_accuracy": 0.8667808473110199, + "num_tokens": 686171.0, + "step": 383 + }, + { + "epoch": 0.06218119990284188, + "grad_norm": 51.00679397583008, + "learning_rate": 9.379857512953368e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.9018518328666687, + "num_tokens": 687958.0, + "step": 384 + }, + { + "epoch": 0.06234313011092219, + "grad_norm": 60.59869384765625, + "learning_rate": 9.378238341968912e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.8720994889736176, + "num_tokens": 689751.0, + "step": 385 + }, + { + "epoch": 0.0625050603190025, + "grad_norm": 41.44636917114258, + "learning_rate": 9.376619170984457e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.8996130526065826, + "num_tokens": 691532.0, + "step": 386 + }, + { + "epoch": 0.06266699052708283, + "grad_norm": 54.59798812866211, + "learning_rate": 9.375000000000001e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.8741718530654907, + "num_tokens": 693322.0, + "step": 387 + }, + { + "epoch": 0.06282892073516315, + "grad_norm": 47.00348663330078, + "learning_rate": 9.373380829015544e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.88413867354393, + "num_tokens": 695110.0, + "step": 388 + }, + { + "epoch": 0.06299085094324346, + "grad_norm": 54.444454193115234, + "learning_rate": 9.371761658031088e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.8774765431880951, + "num_tokens": 696906.0, + "step": 389 + }, + { + "epoch": 0.06315278115132378, + "grad_norm": 58.430118560791016, + "learning_rate": 9.370142487046633e-06, + "loss": 1.1883, + "mean_token_accuracy": 0.86239293217659, + "num_tokens": 698713.0, + "step": 390 + }, + { + "epoch": 0.0633147113594041, + "grad_norm": 50.70392608642578, + "learning_rate": 9.368523316062177e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.879334956407547, + "num_tokens": 700515.0, + "step": 391 + }, + { + "epoch": 0.06347664156748442, + "grad_norm": 48.93309020996094, + "learning_rate": 9.36690414507772e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.8825734555721283, + "num_tokens": 702308.0, + "step": 392 + }, + { + "epoch": 0.06363857177556473, + "grad_norm": 45.962833404541016, + "learning_rate": 9.365284974093266e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.8962543606758118, + "num_tokens": 704100.0, + "step": 393 + }, + { + "epoch": 0.06380050198364505, + "grad_norm": 41.612205505371094, + "learning_rate": 9.36366580310881e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.8959369957447052, + "num_tokens": 705890.0, + "step": 394 + }, + { + "epoch": 0.06396243219172537, + "grad_norm": 53.03821563720703, + "learning_rate": 9.362046632124353e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.8670751750469208, + "num_tokens": 707673.0, + "step": 395 + }, + { + "epoch": 0.06412436239980568, + "grad_norm": 57.093421936035156, + "learning_rate": 9.360427461139897e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.8892857134342194, + "num_tokens": 709465.0, + "step": 396 + }, + { + "epoch": 0.064286292607886, + "grad_norm": 57.488162994384766, + "learning_rate": 9.358808290155442e-06, + "loss": 1.1046, + "mean_token_accuracy": 0.8744049370288849, + "num_tokens": 711264.0, + "step": 397 + }, + { + "epoch": 0.06444822281596632, + "grad_norm": 49.41898727416992, + "learning_rate": 9.357189119170985e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.8852111101150513, + "num_tokens": 713054.0, + "step": 398 + }, + { + "epoch": 0.06461015302404663, + "grad_norm": 47.096614837646484, + "learning_rate": 9.355569948186529e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.8815559446811676, + "num_tokens": 714845.0, + "step": 399 + }, + { + "epoch": 0.06477208323212695, + "grad_norm": 49.34243392944336, + "learning_rate": 9.353950777202073e-06, + "loss": 0.83, + "mean_token_accuracy": 0.8867270648479462, + "num_tokens": 716631.0, + "step": 400 + }, + { + "epoch": 0.06493401344020727, + "grad_norm": 57.568424224853516, + "learning_rate": 9.352331606217618e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.8923013508319855, + "num_tokens": 718439.0, + "step": 401 + }, + { + "epoch": 0.0650959436482876, + "grad_norm": 43.34575653076172, + "learning_rate": 9.350712435233161e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.8998723030090332, + "num_tokens": 720231.0, + "step": 402 + }, + { + "epoch": 0.0652578738563679, + "grad_norm": 42.3759880065918, + "learning_rate": 9.349093264248705e-06, + "loss": 0.744, + "mean_token_accuracy": 0.9106818735599518, + "num_tokens": 722021.0, + "step": 403 + }, + { + "epoch": 0.06541980406444822, + "grad_norm": 37.84474182128906, + "learning_rate": 9.347474093264249e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.9000000059604645, + "num_tokens": 723813.0, + "step": 404 + }, + { + "epoch": 0.06558173427252854, + "grad_norm": 50.4435920715332, + "learning_rate": 9.345854922279794e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.8763368427753448, + "num_tokens": 725608.0, + "step": 405 + }, + { + "epoch": 0.06574366448060885, + "grad_norm": 40.89650344848633, + "learning_rate": 9.344235751295338e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.8982283174991608, + "num_tokens": 727385.0, + "step": 406 + }, + { + "epoch": 0.06590559468868917, + "grad_norm": 39.34998321533203, + "learning_rate": 9.342616580310881e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.9014598429203033, + "num_tokens": 729171.0, + "step": 407 + }, + { + "epoch": 0.0660675248967695, + "grad_norm": 44.388153076171875, + "learning_rate": 9.340997409326425e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.9034444987773895, + "num_tokens": 730972.0, + "step": 408 + }, + { + "epoch": 0.06622945510484982, + "grad_norm": 47.50635528564453, + "learning_rate": 9.33937823834197e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.8852040767669678, + "num_tokens": 732771.0, + "step": 409 + }, + { + "epoch": 0.06639138531293012, + "grad_norm": 37.823299407958984, + "learning_rate": 9.337759067357514e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.9090404212474823, + "num_tokens": 734558.0, + "step": 410 + }, + { + "epoch": 0.06655331552101044, + "grad_norm": 53.0116081237793, + "learning_rate": 9.336139896373057e-06, + "loss": 1.03, + "mean_token_accuracy": 0.8753654956817627, + "num_tokens": 736357.0, + "step": 411 + }, + { + "epoch": 0.06671524572909077, + "grad_norm": 48.80860900878906, + "learning_rate": 9.334520725388602e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.8845965564250946, + "num_tokens": 738155.0, + "step": 412 + }, + { + "epoch": 0.06687717593717107, + "grad_norm": 40.512264251708984, + "learning_rate": 9.332901554404146e-06, + "loss": 0.651, + "mean_token_accuracy": 0.906927227973938, + "num_tokens": 739936.0, + "step": 413 + }, + { + "epoch": 0.0670391061452514, + "grad_norm": 47.08877182006836, + "learning_rate": 9.33128238341969e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.8892156779766083, + "num_tokens": 741734.0, + "step": 414 + }, + { + "epoch": 0.06720103635333172, + "grad_norm": 48.44396209716797, + "learning_rate": 9.329663212435233e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.8935215473175049, + "num_tokens": 743518.0, + "step": 415 + }, + { + "epoch": 0.06736296656141204, + "grad_norm": 48.781158447265625, + "learning_rate": 9.328044041450779e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.8889174461364746, + "num_tokens": 745309.0, + "step": 416 + }, + { + "epoch": 0.06752489676949235, + "grad_norm": 52.43247985839844, + "learning_rate": 9.326424870466322e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.8707874119281769, + "num_tokens": 747099.0, + "step": 417 + }, + { + "epoch": 0.06768682697757267, + "grad_norm": 45.427188873291016, + "learning_rate": 9.324805699481866e-06, + "loss": 0.903, + "mean_token_accuracy": 0.8879020512104034, + "num_tokens": 748887.0, + "step": 418 + }, + { + "epoch": 0.06784875718565299, + "grad_norm": 41.08964920043945, + "learning_rate": 9.32318652849741e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.8930870294570923, + "num_tokens": 750671.0, + "step": 419 + }, + { + "epoch": 0.0680106873937333, + "grad_norm": 59.60834503173828, + "learning_rate": 9.321567357512955e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.8592792749404907, + "num_tokens": 752481.0, + "step": 420 + }, + { + "epoch": 0.06817261760181362, + "grad_norm": 60.86803436279297, + "learning_rate": 9.319948186528498e-06, + "loss": 0.928, + "mean_token_accuracy": 0.8842214047908783, + "num_tokens": 754278.0, + "step": 421 + }, + { + "epoch": 0.06833454780989394, + "grad_norm": 48.281036376953125, + "learning_rate": 9.318329015544042e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.8768051266670227, + "num_tokens": 756066.0, + "step": 422 + }, + { + "epoch": 0.06849647801797425, + "grad_norm": 39.070858001708984, + "learning_rate": 9.316709844559585e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 757850.0, + "step": 423 + }, + { + "epoch": 0.06865840822605457, + "grad_norm": 43.524845123291016, + "learning_rate": 9.31509067357513e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.897176593542099, + "num_tokens": 759643.0, + "step": 424 + }, + { + "epoch": 0.06882033843413489, + "grad_norm": 49.84126281738281, + "learning_rate": 9.313471502590674e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.8762077391147614, + "num_tokens": 761437.0, + "step": 425 + }, + { + "epoch": 0.06898226864221521, + "grad_norm": 35.13079071044922, + "learning_rate": 9.311852331606218e-06, + "loss": 0.805, + "mean_token_accuracy": 0.9032507538795471, + "num_tokens": 763218.0, + "step": 426 + }, + { + "epoch": 0.06914419885029552, + "grad_norm": 53.83464431762695, + "learning_rate": 9.310233160621761e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.8874330222606659, + "num_tokens": 765014.0, + "step": 427 + }, + { + "epoch": 0.06930612905837584, + "grad_norm": 51.93909454345703, + "learning_rate": 9.308613989637307e-06, + "loss": 1.052, + "mean_token_accuracy": 0.8821428716182709, + "num_tokens": 766806.0, + "step": 428 + }, + { + "epoch": 0.06946805926645616, + "grad_norm": 41.9996223449707, + "learning_rate": 9.30699481865285e-06, + "loss": 0.805, + "mean_token_accuracy": 0.9021008610725403, + "num_tokens": 768594.0, + "step": 429 + }, + { + "epoch": 0.06962998947453647, + "grad_norm": 40.266807556152344, + "learning_rate": 9.305375647668394e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.893869936466217, + "num_tokens": 770380.0, + "step": 430 + }, + { + "epoch": 0.06979191968261679, + "grad_norm": 46.49650955200195, + "learning_rate": 9.303756476683939e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.8893324136734009, + "num_tokens": 772155.0, + "step": 431 + }, + { + "epoch": 0.06995384989069711, + "grad_norm": 47.63191604614258, + "learning_rate": 9.302137305699483e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.8974014818668365, + "num_tokens": 773940.0, + "step": 432 + }, + { + "epoch": 0.07011578009877743, + "grad_norm": 50.4738655090332, + "learning_rate": 9.300518134715026e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.8861073553562164, + "num_tokens": 775739.0, + "step": 433 + }, + { + "epoch": 0.07027771030685774, + "grad_norm": 44.69540023803711, + "learning_rate": 9.29889896373057e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.9032374024391174, + "num_tokens": 777530.0, + "step": 434 + }, + { + "epoch": 0.07043964051493806, + "grad_norm": 39.313316345214844, + "learning_rate": 9.297279792746115e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.8961609601974487, + "num_tokens": 779321.0, + "step": 435 + }, + { + "epoch": 0.07060157072301838, + "grad_norm": 44.265167236328125, + "learning_rate": 9.295660621761659e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.8937326967716217, + "num_tokens": 781115.0, + "step": 436 + }, + { + "epoch": 0.07076350093109869, + "grad_norm": 51.03866958618164, + "learning_rate": 9.294041450777202e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.8880982398986816, + "num_tokens": 782904.0, + "step": 437 + }, + { + "epoch": 0.07092543113917901, + "grad_norm": 41.49946975708008, + "learning_rate": 9.292422279792746e-06, + "loss": 0.756, + "mean_token_accuracy": 0.8954051733016968, + "num_tokens": 784684.0, + "step": 438 + }, + { + "epoch": 0.07108736134725933, + "grad_norm": 49.79719924926758, + "learning_rate": 9.290803108808291e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.8881118893623352, + "num_tokens": 786482.0, + "step": 439 + }, + { + "epoch": 0.07124929155533966, + "grad_norm": 34.358699798583984, + "learning_rate": 9.289183937823835e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.9086354076862335, + "num_tokens": 788268.0, + "step": 440 + }, + { + "epoch": 0.07141122176341996, + "grad_norm": 39.487125396728516, + "learning_rate": 9.287564766839378e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.8991561830043793, + "num_tokens": 790058.0, + "step": 441 + }, + { + "epoch": 0.07157315197150028, + "grad_norm": 55.616573333740234, + "learning_rate": 9.285945595854922e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.8776397407054901, + "num_tokens": 791848.0, + "step": 442 + }, + { + "epoch": 0.0717350821795806, + "grad_norm": 44.5058479309082, + "learning_rate": 9.284326424870467e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.8960431516170502, + "num_tokens": 793639.0, + "step": 443 + }, + { + "epoch": 0.07189701238766091, + "grad_norm": 40.10417175292969, + "learning_rate": 9.282707253886011e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.9124087691307068, + "num_tokens": 795425.0, + "step": 444 + }, + { + "epoch": 0.07205894259574123, + "grad_norm": 48.371299743652344, + "learning_rate": 9.281088082901554e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.8732331693172455, + "num_tokens": 797221.0, + "step": 445 + }, + { + "epoch": 0.07222087280382156, + "grad_norm": 41.847511291503906, + "learning_rate": 9.279468911917098e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.9116222262382507, + "num_tokens": 799016.0, + "step": 446 + }, + { + "epoch": 0.07238280301190186, + "grad_norm": 50.69091796875, + "learning_rate": 9.277849740932643e-06, + "loss": 1.1012, + "mean_token_accuracy": 0.8838366270065308, + "num_tokens": 800811.0, + "step": 447 + }, + { + "epoch": 0.07254473321998219, + "grad_norm": 37.527957916259766, + "learning_rate": 9.276230569948187e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.9081102907657623, + "num_tokens": 802595.0, + "step": 448 + }, + { + "epoch": 0.0727066634280625, + "grad_norm": 32.14083480834961, + "learning_rate": 9.27461139896373e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.9088995456695557, + "num_tokens": 804381.0, + "step": 449 + }, + { + "epoch": 0.07286859363614283, + "grad_norm": 57.70633316040039, + "learning_rate": 9.272992227979276e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.8817920386791229, + "num_tokens": 806170.0, + "step": 450 + }, + { + "epoch": 0.07303052384422314, + "grad_norm": 47.517555236816406, + "learning_rate": 9.27137305699482e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.8787415027618408, + "num_tokens": 807969.0, + "step": 451 + }, + { + "epoch": 0.07319245405230346, + "grad_norm": 36.685272216796875, + "learning_rate": 9.269753886010363e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.8994913697242737, + "num_tokens": 809750.0, + "step": 452 + }, + { + "epoch": 0.07335438426038378, + "grad_norm": 47.20469665527344, + "learning_rate": 9.268134715025907e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.8812949657440186, + "num_tokens": 811540.0, + "step": 453 + }, + { + "epoch": 0.07351631446846409, + "grad_norm": 33.17751693725586, + "learning_rate": 9.266515544041452e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.9102478623390198, + "num_tokens": 813331.0, + "step": 454 + }, + { + "epoch": 0.07367824467654441, + "grad_norm": 38.700862884521484, + "learning_rate": 9.264896373056995e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.8981804847717285, + "num_tokens": 815118.0, + "step": 455 + }, + { + "epoch": 0.07384017488462473, + "grad_norm": 47.53046798706055, + "learning_rate": 9.263277202072539e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.8719820380210876, + "num_tokens": 816910.0, + "step": 456 + }, + { + "epoch": 0.07400210509270505, + "grad_norm": 37.84265899658203, + "learning_rate": 9.261658031088083e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.89768186211586, + "num_tokens": 818705.0, + "step": 457 + }, + { + "epoch": 0.07416403530078536, + "grad_norm": 46.852474212646484, + "learning_rate": 9.260038860103628e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.8783625066280365, + "num_tokens": 820497.0, + "step": 458 + }, + { + "epoch": 0.07432596550886568, + "grad_norm": 48.34907913208008, + "learning_rate": 9.258419689119172e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.8811188638210297, + "num_tokens": 822295.0, + "step": 459 + }, + { + "epoch": 0.074487895716946, + "grad_norm": 47.38665771484375, + "learning_rate": 9.256800518134715e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.881118893623352, + "num_tokens": 824093.0, + "step": 460 + }, + { + "epoch": 0.07464982592502631, + "grad_norm": 43.687320709228516, + "learning_rate": 9.255181347150259e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.9018790423870087, + "num_tokens": 825890.0, + "step": 461 + }, + { + "epoch": 0.07481175613310663, + "grad_norm": 39.578758239746094, + "learning_rate": 9.253562176165804e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.8974613845348358, + "num_tokens": 827675.0, + "step": 462 + }, + { + "epoch": 0.07497368634118695, + "grad_norm": 46.43309783935547, + "learning_rate": 9.251943005181348e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.8959917724132538, + "num_tokens": 829466.0, + "step": 463 + }, + { + "epoch": 0.07513561654926726, + "grad_norm": 47.50029754638672, + "learning_rate": 9.250323834196891e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.8819444179534912, + "num_tokens": 831266.0, + "step": 464 + }, + { + "epoch": 0.07529754675734758, + "grad_norm": 47.62020492553711, + "learning_rate": 9.248704663212435e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.8851393163204193, + "num_tokens": 833057.0, + "step": 465 + }, + { + "epoch": 0.0754594769654279, + "grad_norm": 51.59745407104492, + "learning_rate": 9.24708549222798e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.8940199613571167, + "num_tokens": 834842.0, + "step": 466 + }, + { + "epoch": 0.07562140717350822, + "grad_norm": 56.6263313293457, + "learning_rate": 9.245466321243524e-06, + "loss": 1.2147, + "mean_token_accuracy": 0.8861211538314819, + "num_tokens": 836634.0, + "step": 467 + }, + { + "epoch": 0.07578333738158853, + "grad_norm": 45.93951416015625, + "learning_rate": 9.243847150259067e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.8855936825275421, + "num_tokens": 838417.0, + "step": 468 + }, + { + "epoch": 0.07594526758966885, + "grad_norm": 61.10692596435547, + "learning_rate": 9.242227979274612e-06, + "loss": 1.221, + "mean_token_accuracy": 0.8627976179122925, + "num_tokens": 840213.0, + "step": 469 + }, + { + "epoch": 0.07610719779774917, + "grad_norm": 45.47400665283203, + "learning_rate": 9.240608808290156e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.8870469331741333, + "num_tokens": 841999.0, + "step": 470 + }, + { + "epoch": 0.07626912800582948, + "grad_norm": 56.72896957397461, + "learning_rate": 9.2389896373057e-06, + "loss": 0.905, + "mean_token_accuracy": 0.8782622218132019, + "num_tokens": 843789.0, + "step": 471 + }, + { + "epoch": 0.0764310582139098, + "grad_norm": 35.1486701965332, + "learning_rate": 9.237370466321243e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.9051044583320618, + "num_tokens": 845575.0, + "step": 472 + }, + { + "epoch": 0.07659298842199012, + "grad_norm": 48.15449142456055, + "learning_rate": 9.235751295336789e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.8828417956829071, + "num_tokens": 847368.0, + "step": 473 + }, + { + "epoch": 0.07675491863007045, + "grad_norm": 29.401920318603516, + "learning_rate": 9.234132124352332e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.9149962067604065, + "num_tokens": 849151.0, + "step": 474 + }, + { + "epoch": 0.07691684883815075, + "grad_norm": 32.057533264160156, + "learning_rate": 9.232512953367876e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.9194042086601257, + "num_tokens": 850936.0, + "step": 475 + }, + { + "epoch": 0.07707877904623107, + "grad_norm": 36.40903854370117, + "learning_rate": 9.23089378238342e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.9207247197628021, + "num_tokens": 852725.0, + "step": 476 + }, + { + "epoch": 0.0772407092543114, + "grad_norm": 36.75739288330078, + "learning_rate": 9.229274611398965e-06, + "loss": 0.806, + "mean_token_accuracy": 0.903900682926178, + "num_tokens": 854518.0, + "step": 477 + }, + { + "epoch": 0.0774026394623917, + "grad_norm": 35.5255012512207, + "learning_rate": 9.227655440414508e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.8892035186290741, + "num_tokens": 856310.0, + "step": 478 + }, + { + "epoch": 0.07756456967047202, + "grad_norm": 43.96951675415039, + "learning_rate": 9.226036269430052e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.8875713050365448, + "num_tokens": 858106.0, + "step": 479 + }, + { + "epoch": 0.07772649987855235, + "grad_norm": 38.01785659790039, + "learning_rate": 9.224417098445595e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.8991561830043793, + "num_tokens": 859896.0, + "step": 480 + }, + { + "epoch": 0.07788843008663267, + "grad_norm": 32.69612121582031, + "learning_rate": 9.22279792746114e-06, + "loss": 0.69, + "mean_token_accuracy": 0.9075932800769806, + "num_tokens": 861689.0, + "step": 481 + }, + { + "epoch": 0.07805036029471298, + "grad_norm": 38.05558776855469, + "learning_rate": 9.221178756476684e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.9009661972522736, + "num_tokens": 863474.0, + "step": 482 + }, + { + "epoch": 0.0782122905027933, + "grad_norm": 50.20949172973633, + "learning_rate": 9.219559585492228e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.8779354095458984, + "num_tokens": 865272.0, + "step": 483 + }, + { + "epoch": 0.07837422071087362, + "grad_norm": 28.173141479492188, + "learning_rate": 9.217940414507773e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.9204521775245667, + "num_tokens": 867061.0, + "step": 484 + }, + { + "epoch": 0.07853615091895393, + "grad_norm": 46.980064392089844, + "learning_rate": 9.216321243523317e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.8888353109359741, + "num_tokens": 868861.0, + "step": 485 + }, + { + "epoch": 0.07869808112703425, + "grad_norm": 42.32093048095703, + "learning_rate": 9.214702072538862e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.8806146681308746, + "num_tokens": 870658.0, + "step": 486 + }, + { + "epoch": 0.07886001133511457, + "grad_norm": 43.08979034423828, + "learning_rate": 9.213082901554406e-06, + "loss": 0.82, + "mean_token_accuracy": 0.8889216184616089, + "num_tokens": 872467.0, + "step": 487 + }, + { + "epoch": 0.07902194154319488, + "grad_norm": 40.27311325073242, + "learning_rate": 9.21146373056995e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.8957039415836334, + "num_tokens": 874257.0, + "step": 488 + }, + { + "epoch": 0.0791838717512752, + "grad_norm": 41.04425048828125, + "learning_rate": 9.209844559585493e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.875, + "num_tokens": 876049.0, + "step": 489 + }, + { + "epoch": 0.07934580195935552, + "grad_norm": 49.12306594848633, + "learning_rate": 9.208225388601038e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.8899396359920502, + "num_tokens": 877843.0, + "step": 490 + }, + { + "epoch": 0.07950773216743584, + "grad_norm": 40.436012268066406, + "learning_rate": 9.206606217616582e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.8898065984249115, + "num_tokens": 879637.0, + "step": 491 + }, + { + "epoch": 0.07966966237551615, + "grad_norm": 38.13745880126953, + "learning_rate": 9.204987046632125e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.9014921188354492, + "num_tokens": 881423.0, + "step": 492 + }, + { + "epoch": 0.07983159258359647, + "grad_norm": 49.362247467041016, + "learning_rate": 9.20336787564767e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.892573893070221, + "num_tokens": 883214.0, + "step": 493 + }, + { + "epoch": 0.07999352279167679, + "grad_norm": 31.338422775268555, + "learning_rate": 9.201748704663214e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.9075321555137634, + "num_tokens": 885006.0, + "step": 494 + }, + { + "epoch": 0.0801554529997571, + "grad_norm": 40.81686019897461, + "learning_rate": 9.200129533678758e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.8969059884548187, + "num_tokens": 886809.0, + "step": 495 + }, + { + "epoch": 0.08031738320783742, + "grad_norm": 40.96434783935547, + "learning_rate": 9.198510362694301e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.8916457891464233, + "num_tokens": 888589.0, + "step": 496 + }, + { + "epoch": 0.08047931341591774, + "grad_norm": 37.90857696533203, + "learning_rate": 9.196891191709847e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.9121031761169434, + "num_tokens": 890385.0, + "step": 497 + }, + { + "epoch": 0.08064124362399806, + "grad_norm": 52.691017150878906, + "learning_rate": 9.19527202072539e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.8910714387893677, + "num_tokens": 892173.0, + "step": 498 + }, + { + "epoch": 0.08080317383207837, + "grad_norm": 49.31082534790039, + "learning_rate": 9.193652849740934e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.8654859662055969, + "num_tokens": 893966.0, + "step": 499 + }, + { + "epoch": 0.08096510404015869, + "grad_norm": 50.570438385009766, + "learning_rate": 9.192033678756477e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.8862743079662323, + "num_tokens": 895758.0, + "step": 500 + }, + { + "epoch": 0.08112703424823901, + "grad_norm": 33.87987518310547, + "learning_rate": 9.190414507772023e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.9067513644695282, + "num_tokens": 897538.0, + "step": 501 + }, + { + "epoch": 0.08128896445631932, + "grad_norm": 35.96215057373047, + "learning_rate": 9.188795336787566e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.9012077450752258, + "num_tokens": 899323.0, + "step": 502 + }, + { + "epoch": 0.08145089466439964, + "grad_norm": 43.420310974121094, + "learning_rate": 9.18717616580311e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.8935688436031342, + "num_tokens": 901117.0, + "step": 503 + }, + { + "epoch": 0.08161282487247996, + "grad_norm": 45.617618560791016, + "learning_rate": 9.185556994818653e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.8881153464317322, + "num_tokens": 902906.0, + "step": 504 + }, + { + "epoch": 0.08177475508056029, + "grad_norm": 45.894161224365234, + "learning_rate": 9.183937823834199e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.8745863139629364, + "num_tokens": 904713.0, + "step": 505 + }, + { + "epoch": 0.08193668528864059, + "grad_norm": 41.16225814819336, + "learning_rate": 9.182318652849742e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.8966099619865417, + "num_tokens": 906496.0, + "step": 506 + }, + { + "epoch": 0.08209861549672091, + "grad_norm": 30.52025032043457, + "learning_rate": 9.180699481865286e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.9081169068813324, + "num_tokens": 908291.0, + "step": 507 + }, + { + "epoch": 0.08226054570480124, + "grad_norm": 36.748321533203125, + "learning_rate": 9.17908031088083e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.9008110463619232, + "num_tokens": 910075.0, + "step": 508 + }, + { + "epoch": 0.08242247591288154, + "grad_norm": 34.352508544921875, + "learning_rate": 9.177461139896375e-06, + "loss": 0.582, + "mean_token_accuracy": 0.9062213599681854, + "num_tokens": 911875.0, + "step": 509 + }, + { + "epoch": 0.08258440612096186, + "grad_norm": 40.05860900878906, + "learning_rate": 9.175841968911918e-06, + "loss": 0.936, + "mean_token_accuracy": 0.8804570436477661, + "num_tokens": 913663.0, + "step": 510 + }, + { + "epoch": 0.08274633632904219, + "grad_norm": 49.722721099853516, + "learning_rate": 9.174222797927462e-06, + "loss": 0.933, + "mean_token_accuracy": 0.8785386979579926, + "num_tokens": 915463.0, + "step": 511 + }, + { + "epoch": 0.0829082665371225, + "grad_norm": 36.01008987426758, + "learning_rate": 9.172603626943007e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.8921568691730499, + "num_tokens": 917255.0, + "step": 512 + }, + { + "epoch": 0.08307019674520282, + "grad_norm": 46.750553131103516, + "learning_rate": 9.17098445595855e-06, + "loss": 0.861, + "mean_token_accuracy": 0.879907101392746, + "num_tokens": 919050.0, + "step": 513 + }, + { + "epoch": 0.08323212695328314, + "grad_norm": 36.83955001831055, + "learning_rate": 9.169365284974094e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.9160980880260468, + "num_tokens": 920836.0, + "step": 514 + }, + { + "epoch": 0.08339405716136346, + "grad_norm": 35.40216064453125, + "learning_rate": 9.167746113989638e-06, + "loss": 0.806, + "mean_token_accuracy": 0.8974076807498932, + "num_tokens": 922621.0, + "step": 515 + }, + { + "epoch": 0.08355598736944377, + "grad_norm": 42.34437942504883, + "learning_rate": 9.166126943005183e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.8676542043685913, + "num_tokens": 924420.0, + "step": 516 + }, + { + "epoch": 0.08371791757752409, + "grad_norm": 34.62520217895508, + "learning_rate": 9.164507772020727e-06, + "loss": 0.769, + "mean_token_accuracy": 0.8966503441333771, + "num_tokens": 926203.0, + "step": 517 + }, + { + "epoch": 0.08387984778560441, + "grad_norm": 45.12850570678711, + "learning_rate": 9.16288860103627e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.8656690716743469, + "num_tokens": 928001.0, + "step": 518 + }, + { + "epoch": 0.08404177799368472, + "grad_norm": 38.85281753540039, + "learning_rate": 9.161269430051814e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.8876811861991882, + "num_tokens": 929789.0, + "step": 519 + }, + { + "epoch": 0.08420370820176504, + "grad_norm": 35.66532897949219, + "learning_rate": 9.15965025906736e-06, + "loss": 0.818, + "mean_token_accuracy": 0.9037270545959473, + "num_tokens": 931570.0, + "step": 520 + }, + { + "epoch": 0.08436563840984536, + "grad_norm": 40.39627456665039, + "learning_rate": 9.158031088082903e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.8856909573078156, + "num_tokens": 933362.0, + "step": 521 + }, + { + "epoch": 0.08452756861792568, + "grad_norm": 46.52644729614258, + "learning_rate": 9.156411917098446e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.8760567903518677, + "num_tokens": 935156.0, + "step": 522 + }, + { + "epoch": 0.08468949882600599, + "grad_norm": 38.701576232910156, + "learning_rate": 9.15479274611399e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.8865248262882233, + "num_tokens": 936950.0, + "step": 523 + }, + { + "epoch": 0.08485142903408631, + "grad_norm": 41.84128952026367, + "learning_rate": 9.153173575129535e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.8804563581943512, + "num_tokens": 938746.0, + "step": 524 + }, + { + "epoch": 0.08501335924216663, + "grad_norm": 36.47675704956055, + "learning_rate": 9.151554404145079e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.8947421312332153, + "num_tokens": 940540.0, + "step": 525 + }, + { + "epoch": 0.08517528945024694, + "grad_norm": 37.19576644897461, + "learning_rate": 9.149935233160623e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.885046124458313, + "num_tokens": 942322.0, + "step": 526 + }, + { + "epoch": 0.08533721965832726, + "grad_norm": 37.5869255065918, + "learning_rate": 9.148316062176166e-06, + "loss": 0.696, + "mean_token_accuracy": 0.9103787243366241, + "num_tokens": 944123.0, + "step": 527 + }, + { + "epoch": 0.08549914986640758, + "grad_norm": 34.7048225402832, + "learning_rate": 9.146696891191711e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.9105429947376251, + "num_tokens": 945915.0, + "step": 528 + }, + { + "epoch": 0.0856610800744879, + "grad_norm": 33.568546295166016, + "learning_rate": 9.145077720207255e-06, + "loss": 0.701, + "mean_token_accuracy": 0.8984127044677734, + "num_tokens": 947702.0, + "step": 529 + }, + { + "epoch": 0.08582301028256821, + "grad_norm": 44.70387649536133, + "learning_rate": 9.143458549222799e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.8779865801334381, + "num_tokens": 949501.0, + "step": 530 + }, + { + "epoch": 0.08598494049064853, + "grad_norm": 41.08858871459961, + "learning_rate": 9.141839378238344e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.8736836612224579, + "num_tokens": 951290.0, + "step": 531 + }, + { + "epoch": 0.08614687069872885, + "grad_norm": 52.84129333496094, + "learning_rate": 9.140220207253887e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.8727866113185883, + "num_tokens": 953094.0, + "step": 532 + }, + { + "epoch": 0.08630880090680916, + "grad_norm": 45.39836502075195, + "learning_rate": 9.138601036269431e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.8755514621734619, + "num_tokens": 954902.0, + "step": 533 + }, + { + "epoch": 0.08647073111488948, + "grad_norm": 43.694156646728516, + "learning_rate": 9.136981865284975e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.8868613243103027, + "num_tokens": 956688.0, + "step": 534 + }, + { + "epoch": 0.0866326613229698, + "grad_norm": 44.96379089355469, + "learning_rate": 9.13536269430052e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.8791326880455017, + "num_tokens": 958473.0, + "step": 535 + }, + { + "epoch": 0.08679459153105011, + "grad_norm": 35.05347442626953, + "learning_rate": 9.133743523316063e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.9055226147174835, + "num_tokens": 960260.0, + "step": 536 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 40.07206726074219, + "learning_rate": 9.132124352331607e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.8996916711330414, + "num_tokens": 962051.0, + "step": 537 + }, + { + "epoch": 0.08711845194721075, + "grad_norm": 36.64631652832031, + "learning_rate": 9.13050518134715e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.9051094949245453, + "num_tokens": 963837.0, + "step": 538 + }, + { + "epoch": 0.08728038215529108, + "grad_norm": 34.931793212890625, + "learning_rate": 9.128886010362696e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.9032531678676605, + "num_tokens": 965628.0, + "step": 539 + }, + { + "epoch": 0.08744231236337138, + "grad_norm": 36.16720962524414, + "learning_rate": 9.12726683937824e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.8967038989067078, + "num_tokens": 967420.0, + "step": 540 + }, + { + "epoch": 0.0876042425714517, + "grad_norm": 30.684724807739258, + "learning_rate": 9.125647668393783e-06, + "loss": 0.6305, + "mean_token_accuracy": 0.9159165620803833, + "num_tokens": 969207.0, + "step": 541 + }, + { + "epoch": 0.08776617277953203, + "grad_norm": 44.079593658447266, + "learning_rate": 9.124028497409327e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.8889742493629456, + "num_tokens": 970989.0, + "step": 542 + }, + { + "epoch": 0.08792810298761233, + "grad_norm": 44.77708053588867, + "learning_rate": 9.122409326424872e-06, + "loss": 0.885, + "mean_token_accuracy": 0.8802955746650696, + "num_tokens": 972793.0, + "step": 543 + }, + { + "epoch": 0.08809003319569265, + "grad_norm": 39.766258239746094, + "learning_rate": 9.120790155440416e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.903517335653305, + "num_tokens": 974584.0, + "step": 544 + }, + { + "epoch": 0.08825196340377298, + "grad_norm": 46.68134307861328, + "learning_rate": 9.11917098445596e-06, + "loss": 0.932, + "mean_token_accuracy": 0.8750191330909729, + "num_tokens": 976376.0, + "step": 545 + }, + { + "epoch": 0.0884138936118533, + "grad_norm": 46.088077545166016, + "learning_rate": 9.117551813471503e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.8890588581562042, + "num_tokens": 978166.0, + "step": 546 + }, + { + "epoch": 0.0885758238199336, + "grad_norm": 36.8809700012207, + "learning_rate": 9.115932642487048e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.9067831337451935, + "num_tokens": 979957.0, + "step": 547 + }, + { + "epoch": 0.08873775402801393, + "grad_norm": 38.474056243896484, + "learning_rate": 9.114313471502592e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.8904609680175781, + "num_tokens": 981752.0, + "step": 548 + }, + { + "epoch": 0.08889968423609425, + "grad_norm": 47.807437896728516, + "learning_rate": 9.112694300518135e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.8764282763004303, + "num_tokens": 983539.0, + "step": 549 + }, + { + "epoch": 0.08906161444417456, + "grad_norm": 33.97437286376953, + "learning_rate": 9.11107512953368e-06, + "loss": 0.726, + "mean_token_accuracy": 0.9033996760845184, + "num_tokens": 985320.0, + "step": 550 + }, + { + "epoch": 0.08922354465225488, + "grad_norm": 46.7301139831543, + "learning_rate": 9.109455958549224e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.8827870488166809, + "num_tokens": 987121.0, + "step": 551 + }, + { + "epoch": 0.0893854748603352, + "grad_norm": 31.69512367248535, + "learning_rate": 9.107836787564768e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.9129291772842407, + "num_tokens": 988909.0, + "step": 552 + }, + { + "epoch": 0.08954740506841552, + "grad_norm": 35.732723236083984, + "learning_rate": 9.106217616580311e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.9083463847637177, + "num_tokens": 990703.0, + "step": 553 + }, + { + "epoch": 0.08970933527649583, + "grad_norm": 43.09870529174805, + "learning_rate": 9.104598445595857e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.8877540230751038, + "num_tokens": 992491.0, + "step": 554 + }, + { + "epoch": 0.08987126548457615, + "grad_norm": 42.41291046142578, + "learning_rate": 9.1029792746114e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.8805672228336334, + "num_tokens": 994279.0, + "step": 555 + }, + { + "epoch": 0.09003319569265647, + "grad_norm": 34.58682632446289, + "learning_rate": 9.101360103626944e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.9031945466995239, + "num_tokens": 996071.0, + "step": 556 + }, + { + "epoch": 0.09019512590073678, + "grad_norm": 36.18080520629883, + "learning_rate": 9.099740932642487e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.8992753624916077, + "num_tokens": 997861.0, + "step": 557 + }, + { + "epoch": 0.0903570561088171, + "grad_norm": 30.26384735107422, + "learning_rate": 9.098121761658033e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.8936090171337128, + "num_tokens": 999646.0, + "step": 558 + }, + { + "epoch": 0.09051898631689742, + "grad_norm": 43.25270080566406, + "learning_rate": 9.096502590673576e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.8781447112560272, + "num_tokens": 1001445.0, + "step": 559 + }, + { + "epoch": 0.09068091652497773, + "grad_norm": 47.86593246459961, + "learning_rate": 9.09488341968912e-06, + "loss": 1.1261, + "mean_token_accuracy": 0.8596596419811249, + "num_tokens": 1003240.0, + "step": 560 + }, + { + "epoch": 0.09084284673305805, + "grad_norm": 26.668771743774414, + "learning_rate": 9.093264248704663e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.9109138250350952, + "num_tokens": 1005021.0, + "step": 561 + }, + { + "epoch": 0.09100477694113837, + "grad_norm": 36.10108947753906, + "learning_rate": 9.091645077720209e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.8885050415992737, + "num_tokens": 1006801.0, + "step": 562 + }, + { + "epoch": 0.09116670714921869, + "grad_norm": 26.014118194580078, + "learning_rate": 9.090025906735752e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.9196553230285645, + "num_tokens": 1008586.0, + "step": 563 + }, + { + "epoch": 0.091328637357299, + "grad_norm": 36.54709243774414, + "learning_rate": 9.088406735751296e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.905587375164032, + "num_tokens": 1010373.0, + "step": 564 + }, + { + "epoch": 0.09149056756537932, + "grad_norm": 33.99188232421875, + "learning_rate": 9.08678756476684e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.8889986872673035, + "num_tokens": 1012155.0, + "step": 565 + }, + { + "epoch": 0.09165249777345964, + "grad_norm": 36.17535400390625, + "learning_rate": 9.085168393782385e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.8960067927837372, + "num_tokens": 1013946.0, + "step": 566 + }, + { + "epoch": 0.09181442798153995, + "grad_norm": 30.589298248291016, + "learning_rate": 9.083549222797928e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.89882692694664, + "num_tokens": 1015735.0, + "step": 567 + }, + { + "epoch": 0.09197635818962027, + "grad_norm": 31.206890106201172, + "learning_rate": 9.081930051813472e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.8982758522033691, + "num_tokens": 1017532.0, + "step": 568 + }, + { + "epoch": 0.0921382883977006, + "grad_norm": 38.077938079833984, + "learning_rate": 9.080310880829017e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.8916275501251221, + "num_tokens": 1019330.0, + "step": 569 + }, + { + "epoch": 0.09230021860578091, + "grad_norm": 38.95375061035156, + "learning_rate": 9.07869170984456e-06, + "loss": 0.856, + "mean_token_accuracy": 0.8853963315486908, + "num_tokens": 1021121.0, + "step": 570 + }, + { + "epoch": 0.09246214881386122, + "grad_norm": 33.88456344604492, + "learning_rate": 9.077072538860104e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.902599424123764, + "num_tokens": 1022911.0, + "step": 571 + }, + { + "epoch": 0.09262407902194154, + "grad_norm": 36.16845703125, + "learning_rate": 9.075453367875648e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.8960636854171753, + "num_tokens": 1024692.0, + "step": 572 + }, + { + "epoch": 0.09278600923002187, + "grad_norm": 35.536415100097656, + "learning_rate": 9.073834196891193e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.8763091266155243, + "num_tokens": 1026479.0, + "step": 573 + }, + { + "epoch": 0.09294793943810217, + "grad_norm": 45.72947311401367, + "learning_rate": 9.072215025906737e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.8699954450130463, + "num_tokens": 1028288.0, + "step": 574 + }, + { + "epoch": 0.0931098696461825, + "grad_norm": 43.1564826965332, + "learning_rate": 9.07059585492228e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.8852503001689911, + "num_tokens": 1030087.0, + "step": 575 + }, + { + "epoch": 0.09327179985426282, + "grad_norm": 39.70059585571289, + "learning_rate": 9.068976683937824e-06, + "loss": 1.3281, + "mean_token_accuracy": 0.8701861500740051, + "num_tokens": 1031884.0, + "step": 576 + }, + { + "epoch": 0.09343373006234312, + "grad_norm": 27.399688720703125, + "learning_rate": 9.06735751295337e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.914814829826355, + "num_tokens": 1033666.0, + "step": 577 + }, + { + "epoch": 0.09359566027042344, + "grad_norm": 47.35184097290039, + "learning_rate": 9.065738341968913e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.8777962028980255, + "num_tokens": 1035456.0, + "step": 578 + }, + { + "epoch": 0.09375759047850377, + "grad_norm": 39.08265686035156, + "learning_rate": 9.064119170984456e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.8805022239685059, + "num_tokens": 1037251.0, + "step": 579 + }, + { + "epoch": 0.09391952068658409, + "grad_norm": 38.741668701171875, + "learning_rate": 9.0625e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.9007575511932373, + "num_tokens": 1039035.0, + "step": 580 + }, + { + "epoch": 0.0940814508946644, + "grad_norm": 44.4018669128418, + "learning_rate": 9.060880829015545e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.8770685493946075, + "num_tokens": 1040823.0, + "step": 581 + }, + { + "epoch": 0.09424338110274472, + "grad_norm": 33.8218879699707, + "learning_rate": 9.059261658031089e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.8912815153598785, + "num_tokens": 1042611.0, + "step": 582 + }, + { + "epoch": 0.09440531131082504, + "grad_norm": 39.219520568847656, + "learning_rate": 9.057642487046633e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.8989204466342926, + "num_tokens": 1044408.0, + "step": 583 + }, + { + "epoch": 0.09456724151890535, + "grad_norm": 46.69636154174805, + "learning_rate": 9.056023316062176e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.8728821575641632, + "num_tokens": 1046212.0, + "step": 584 + }, + { + "epoch": 0.09472917172698567, + "grad_norm": 46.50071334838867, + "learning_rate": 9.054404145077721e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.9067711234092712, + "num_tokens": 1048013.0, + "step": 585 + }, + { + "epoch": 0.09489110193506599, + "grad_norm": 39.08499526977539, + "learning_rate": 9.052784974093265e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.8857594728469849, + "num_tokens": 1049805.0, + "step": 586 + }, + { + "epoch": 0.09505303214314631, + "grad_norm": 38.8447151184082, + "learning_rate": 9.051165803108809e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.8857826292514801, + "num_tokens": 1051587.0, + "step": 587 + }, + { + "epoch": 0.09521496235122662, + "grad_norm": 34.5479621887207, + "learning_rate": 9.049546632124354e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.8918128609657288, + "num_tokens": 1053367.0, + "step": 588 + }, + { + "epoch": 0.09537689255930694, + "grad_norm": 29.89374542236328, + "learning_rate": 9.047927461139897e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.9084370732307434, + "num_tokens": 1055152.0, + "step": 589 + }, + { + "epoch": 0.09553882276738726, + "grad_norm": 33.414695739746094, + "learning_rate": 9.046308290155441e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.9157062470912933, + "num_tokens": 1056949.0, + "step": 590 + }, + { + "epoch": 0.09570075297546757, + "grad_norm": 39.24887466430664, + "learning_rate": 9.044689119170985e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.8920360505580902, + "num_tokens": 1058739.0, + "step": 591 + }, + { + "epoch": 0.09586268318354789, + "grad_norm": 29.195756912231445, + "learning_rate": 9.04306994818653e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.9147091805934906, + "num_tokens": 1060532.0, + "step": 592 + }, + { + "epoch": 0.09602461339162821, + "grad_norm": 43.67234802246094, + "learning_rate": 9.041450777202073e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.8783040642738342, + "num_tokens": 1062332.0, + "step": 593 + }, + { + "epoch": 0.09618654359970853, + "grad_norm": 39.56077575683594, + "learning_rate": 9.039831606217617e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.8751722574234009, + "num_tokens": 1064124.0, + "step": 594 + }, + { + "epoch": 0.09634847380778884, + "grad_norm": 38.047847747802734, + "learning_rate": 9.03821243523316e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.8981804847717285, + "num_tokens": 1065911.0, + "step": 595 + }, + { + "epoch": 0.09651040401586916, + "grad_norm": 37.424049377441406, + "learning_rate": 9.036593264248706e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.8858951032161713, + "num_tokens": 1067703.0, + "step": 596 + }, + { + "epoch": 0.09667233422394948, + "grad_norm": 39.37641906738281, + "learning_rate": 9.03497409326425e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.9019958078861237, + "num_tokens": 1069491.0, + "step": 597 + }, + { + "epoch": 0.09683426443202979, + "grad_norm": 44.069984436035156, + "learning_rate": 9.033354922279793e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.8804119527339935, + "num_tokens": 1071286.0, + "step": 598 + }, + { + "epoch": 0.09699619464011011, + "grad_norm": 34.022003173828125, + "learning_rate": 9.031735751295337e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.9068345129489899, + "num_tokens": 1073077.0, + "step": 599 + }, + { + "epoch": 0.09715812484819043, + "grad_norm": 44.12709426879883, + "learning_rate": 9.030116580310882e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.876198798418045, + "num_tokens": 1074872.0, + "step": 600 + }, + { + "epoch": 0.09732005505627074, + "grad_norm": 33.81965255737305, + "learning_rate": 9.028497409326426e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.8938003480434418, + "num_tokens": 1076657.0, + "step": 601 + }, + { + "epoch": 0.09748198526435106, + "grad_norm": 40.71698760986328, + "learning_rate": 9.02687823834197e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.8902597427368164, + "num_tokens": 1078452.0, + "step": 602 + }, + { + "epoch": 0.09764391547243138, + "grad_norm": 32.85609436035156, + "learning_rate": 9.025259067357513e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.9024606645107269, + "num_tokens": 1080241.0, + "step": 603 + }, + { + "epoch": 0.0978058456805117, + "grad_norm": 30.258365631103516, + "learning_rate": 9.023639896373058e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.9113465547561646, + "num_tokens": 1082024.0, + "step": 604 + }, + { + "epoch": 0.09796777588859201, + "grad_norm": 45.52676010131836, + "learning_rate": 9.022020725388602e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.8769958019256592, + "num_tokens": 1083812.0, + "step": 605 + }, + { + "epoch": 0.09812970609667233, + "grad_norm": 31.065317153930664, + "learning_rate": 9.020401554404145e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.9060838520526886, + "num_tokens": 1085601.0, + "step": 606 + }, + { + "epoch": 0.09829163630475266, + "grad_norm": 30.893630981445312, + "learning_rate": 9.01878238341969e-06, + "loss": 0.71, + "mean_token_accuracy": 0.9122442603111267, + "num_tokens": 1087387.0, + "step": 607 + }, + { + "epoch": 0.09845356651283296, + "grad_norm": 36.142784118652344, + "learning_rate": 9.017163212435234e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.9053024351596832, + "num_tokens": 1089173.0, + "step": 608 + }, + { + "epoch": 0.09861549672091328, + "grad_norm": 40.01546859741211, + "learning_rate": 9.015544041450778e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.8971709907054901, + "num_tokens": 1090957.0, + "step": 609 + }, + { + "epoch": 0.0987774269289936, + "grad_norm": 33.17009353637695, + "learning_rate": 9.013924870466321e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.9080882370471954, + "num_tokens": 1092741.0, + "step": 610 + }, + { + "epoch": 0.09893935713707393, + "grad_norm": 44.18925857543945, + "learning_rate": 9.012305699481867e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.888827919960022, + "num_tokens": 1094523.0, + "step": 611 + }, + { + "epoch": 0.09910128734515423, + "grad_norm": 45.22432327270508, + "learning_rate": 9.01068652849741e-06, + "loss": 0.742, + "mean_token_accuracy": 0.8964992463588715, + "num_tokens": 1096325.0, + "step": 612 + }, + { + "epoch": 0.09926321755323456, + "grad_norm": 31.28477668762207, + "learning_rate": 9.009067357512954e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.9182723760604858, + "num_tokens": 1098119.0, + "step": 613 + }, + { + "epoch": 0.09942514776131488, + "grad_norm": 41.96350860595703, + "learning_rate": 9.007448186528497e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.9085756838321686, + "num_tokens": 1099915.0, + "step": 614 + }, + { + "epoch": 0.09958707796939519, + "grad_norm": 45.22574996948242, + "learning_rate": 9.005829015544043e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.8899290561676025, + "num_tokens": 1101718.0, + "step": 615 + }, + { + "epoch": 0.0997490081774755, + "grad_norm": 28.762096405029297, + "learning_rate": 9.004209844559586e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.9142857193946838, + "num_tokens": 1103510.0, + "step": 616 + }, + { + "epoch": 0.09991093838555583, + "grad_norm": 34.82968521118164, + "learning_rate": 9.00259067357513e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.9024765491485596, + "num_tokens": 1105299.0, + "step": 617 + }, + { + "epoch": 0.10007286859363615, + "grad_norm": 36.802894592285156, + "learning_rate": 9.000971502590673e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.8917339146137238, + "num_tokens": 1107097.0, + "step": 618 + }, + { + "epoch": 0.10023479880171646, + "grad_norm": 30.758092880249023, + "learning_rate": 8.999352331606219e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.9057396352291107, + "num_tokens": 1108885.0, + "step": 619 + }, + { + "epoch": 0.10039672900979678, + "grad_norm": 36.754512786865234, + "learning_rate": 8.997733160621762e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.9100378751754761, + "num_tokens": 1110673.0, + "step": 620 + }, + { + "epoch": 0.1005586592178771, + "grad_norm": 23.808488845825195, + "learning_rate": 8.996113989637306e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.9239267706871033, + "num_tokens": 1112461.0, + "step": 621 + }, + { + "epoch": 0.10072058942595741, + "grad_norm": 42.57846450805664, + "learning_rate": 8.99449481865285e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.8903903961181641, + "num_tokens": 1114265.0, + "step": 622 + }, + { + "epoch": 0.10088251963403773, + "grad_norm": 35.165733337402344, + "learning_rate": 8.992875647668395e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.8905109763145447, + "num_tokens": 1116051.0, + "step": 623 + }, + { + "epoch": 0.10104444984211805, + "grad_norm": 44.851806640625, + "learning_rate": 8.991256476683938e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.8868994116783142, + "num_tokens": 1117846.0, + "step": 624 + }, + { + "epoch": 0.10120638005019836, + "grad_norm": 30.463428497314453, + "learning_rate": 8.989637305699482e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.9128254354000092, + "num_tokens": 1119632.0, + "step": 625 + }, + { + "epoch": 0.10136831025827868, + "grad_norm": 34.8211669921875, + "learning_rate": 8.988018134715027e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.893382340669632, + "num_tokens": 1121416.0, + "step": 626 + }, + { + "epoch": 0.101530240466359, + "grad_norm": 35.37688446044922, + "learning_rate": 8.98639896373057e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.8952054679393768, + "num_tokens": 1123224.0, + "step": 627 + }, + { + "epoch": 0.10169217067443932, + "grad_norm": 34.03190994262695, + "learning_rate": 8.984779792746114e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.9028058052062988, + "num_tokens": 1125014.0, + "step": 628 + }, + { + "epoch": 0.10185410088251963, + "grad_norm": 41.09244155883789, + "learning_rate": 8.983160621761658e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.8963044285774231, + "num_tokens": 1126806.0, + "step": 629 + }, + { + "epoch": 0.10201603109059995, + "grad_norm": 36.194007873535156, + "learning_rate": 8.981541450777203e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.8953647315502167, + "num_tokens": 1128594.0, + "step": 630 + }, + { + "epoch": 0.10217796129868027, + "grad_norm": 42.55862808227539, + "learning_rate": 8.979922279792747e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.8936170041561127, + "num_tokens": 1130388.0, + "step": 631 + }, + { + "epoch": 0.10233989150676058, + "grad_norm": 30.45798683166504, + "learning_rate": 8.97830310880829e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.9172663688659668, + "num_tokens": 1132177.0, + "step": 632 + }, + { + "epoch": 0.1025018217148409, + "grad_norm": 34.06311798095703, + "learning_rate": 8.976683937823834e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.8865019977092743, + "num_tokens": 1133971.0, + "step": 633 + }, + { + "epoch": 0.10266375192292122, + "grad_norm": 34.95612335205078, + "learning_rate": 8.97506476683938e-06, + "loss": 0.757, + "mean_token_accuracy": 0.8997070789337158, + "num_tokens": 1135762.0, + "step": 634 + }, + { + "epoch": 0.10282568213100154, + "grad_norm": 37.71987533569336, + "learning_rate": 8.973445595854923e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.8888073265552521, + "num_tokens": 1137560.0, + "step": 635 + }, + { + "epoch": 0.10298761233908185, + "grad_norm": 41.074493408203125, + "learning_rate": 8.971826424870466e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.8853802680969238, + "num_tokens": 1139351.0, + "step": 636 + }, + { + "epoch": 0.10314954254716217, + "grad_norm": 36.32420349121094, + "learning_rate": 8.97020725388601e-06, + "loss": 0.774, + "mean_token_accuracy": 0.9025167226791382, + "num_tokens": 1141140.0, + "step": 637 + }, + { + "epoch": 0.1033114727552425, + "grad_norm": 31.13274574279785, + "learning_rate": 8.968588082901555e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.9094537794589996, + "num_tokens": 1142928.0, + "step": 638 + }, + { + "epoch": 0.1034734029633228, + "grad_norm": 36.037418365478516, + "learning_rate": 8.966968911917099e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.892691969871521, + "num_tokens": 1144719.0, + "step": 639 + }, + { + "epoch": 0.10363533317140312, + "grad_norm": 29.357982635498047, + "learning_rate": 8.965349740932643e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.9127601683139801, + "num_tokens": 1146507.0, + "step": 640 + }, + { + "epoch": 0.10379726337948345, + "grad_norm": 36.70446014404297, + "learning_rate": 8.963730569948186e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.8896499276161194, + "num_tokens": 1148318.0, + "step": 641 + }, + { + "epoch": 0.10395919358756377, + "grad_norm": 31.734821319580078, + "learning_rate": 8.962111398963731e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.9053531885147095, + "num_tokens": 1150105.0, + "step": 642 + }, + { + "epoch": 0.10412112379564407, + "grad_norm": 36.4366455078125, + "learning_rate": 8.960492227979275e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.8867652714252472, + "num_tokens": 1151906.0, + "step": 643 + }, + { + "epoch": 0.1042830540037244, + "grad_norm": 33.08853530883789, + "learning_rate": 8.958873056994819e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.8930742740631104, + "num_tokens": 1153699.0, + "step": 644 + }, + { + "epoch": 0.10444498421180472, + "grad_norm": 35.976219177246094, + "learning_rate": 8.957253886010364e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.9029101133346558, + "num_tokens": 1155489.0, + "step": 645 + }, + { + "epoch": 0.10460691441988502, + "grad_norm": 30.516292572021484, + "learning_rate": 8.955634715025907e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.9038292169570923, + "num_tokens": 1157271.0, + "step": 646 + }, + { + "epoch": 0.10476884462796535, + "grad_norm": 35.352481842041016, + "learning_rate": 8.954015544041451e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.9027210772037506, + "num_tokens": 1159070.0, + "step": 647 + }, + { + "epoch": 0.10493077483604567, + "grad_norm": 32.153175354003906, + "learning_rate": 8.952396373056995e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.89800626039505, + "num_tokens": 1160866.0, + "step": 648 + }, + { + "epoch": 0.10509270504412598, + "grad_norm": 36.029296875, + "learning_rate": 8.95077720207254e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.8847948610782623, + "num_tokens": 1162665.0, + "step": 649 + }, + { + "epoch": 0.1052546352522063, + "grad_norm": 29.155750274658203, + "learning_rate": 8.949158031088084e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 1164453.0, + "step": 650 + }, + { + "epoch": 0.10541656546028662, + "grad_norm": 30.928163528442383, + "learning_rate": 8.947538860103627e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.9058971703052521, + "num_tokens": 1166241.0, + "step": 651 + }, + { + "epoch": 0.10557849566836694, + "grad_norm": 33.00962448120117, + "learning_rate": 8.94591968911917e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.9033761322498322, + "num_tokens": 1168031.0, + "step": 652 + }, + { + "epoch": 0.10574042587644725, + "grad_norm": 38.731590270996094, + "learning_rate": 8.944300518134716e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.8935846984386444, + "num_tokens": 1169824.0, + "step": 653 + }, + { + "epoch": 0.10590235608452757, + "grad_norm": 46.37031936645508, + "learning_rate": 8.94268134715026e-06, + "loss": 1.1388, + "mean_token_accuracy": 0.8711753189563751, + "num_tokens": 1171623.0, + "step": 654 + }, + { + "epoch": 0.10606428629260789, + "grad_norm": 23.43425941467285, + "learning_rate": 8.941062176165803e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.9194042086601257, + "num_tokens": 1173408.0, + "step": 655 + }, + { + "epoch": 0.1062262165006882, + "grad_norm": 43.1533088684082, + "learning_rate": 8.939443005181347e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.8692688047885895, + "num_tokens": 1175203.0, + "step": 656 + }, + { + "epoch": 0.10638814670876852, + "grad_norm": 40.089820861816406, + "learning_rate": 8.937823834196892e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.9004509150981903, + "num_tokens": 1177007.0, + "step": 657 + }, + { + "epoch": 0.10655007691684884, + "grad_norm": 37.989830017089844, + "learning_rate": 8.936204663212436e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.8887249529361725, + "num_tokens": 1178797.0, + "step": 658 + }, + { + "epoch": 0.10671200712492916, + "grad_norm": 35.25375747680664, + "learning_rate": 8.93458549222798e-06, + "loss": 0.814, + "mean_token_accuracy": 0.9076961874961853, + "num_tokens": 1180591.0, + "step": 659 + }, + { + "epoch": 0.10687393733300947, + "grad_norm": 47.49795913696289, + "learning_rate": 8.932966321243523e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.8851646780967712, + "num_tokens": 1182390.0, + "step": 660 + }, + { + "epoch": 0.10703586754108979, + "grad_norm": 29.93039321899414, + "learning_rate": 8.931347150259068e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.9158540070056915, + "num_tokens": 1184187.0, + "step": 661 + }, + { + "epoch": 0.10719779774917011, + "grad_norm": 39.823307037353516, + "learning_rate": 8.929727979274612e-06, + "loss": 0.966, + "mean_token_accuracy": 0.8834685385227203, + "num_tokens": 1185980.0, + "step": 662 + }, + { + "epoch": 0.10735972795725042, + "grad_norm": 36.069766998291016, + "learning_rate": 8.928108808290155e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.8881153464317322, + "num_tokens": 1187769.0, + "step": 663 + }, + { + "epoch": 0.10752165816533074, + "grad_norm": 38.42499923706055, + "learning_rate": 8.9264896373057e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.882578045129776, + "num_tokens": 1189562.0, + "step": 664 + }, + { + "epoch": 0.10768358837341106, + "grad_norm": 43.1353874206543, + "learning_rate": 8.924870466321244e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.8968591690063477, + "num_tokens": 1191355.0, + "step": 665 + }, + { + "epoch": 0.10784551858149138, + "grad_norm": 26.2728214263916, + "learning_rate": 8.923251295336788e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.9280426800251007, + "num_tokens": 1193145.0, + "step": 666 + }, + { + "epoch": 0.10800744878957169, + "grad_norm": 37.30793762207031, + "learning_rate": 8.921632124352331e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.8986742198467255, + "num_tokens": 1194933.0, + "step": 667 + }, + { + "epoch": 0.10816937899765201, + "grad_norm": 38.652015686035156, + "learning_rate": 8.920012953367877e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.8946167230606079, + "num_tokens": 1196729.0, + "step": 668 + }, + { + "epoch": 0.10833130920573233, + "grad_norm": 34.655982971191406, + "learning_rate": 8.91839378238342e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.8971447348594666, + "num_tokens": 1198513.0, + "step": 669 + }, + { + "epoch": 0.10849323941381264, + "grad_norm": 41.78131866455078, + "learning_rate": 8.916774611398964e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.8818943798542023, + "num_tokens": 1200323.0, + "step": 670 + }, + { + "epoch": 0.10865516962189296, + "grad_norm": 35.171539306640625, + "learning_rate": 8.915155440414507e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.8848682343959808, + "num_tokens": 1202113.0, + "step": 671 + }, + { + "epoch": 0.10881709982997329, + "grad_norm": 33.28158950805664, + "learning_rate": 8.913536269430053e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.8812186121940613, + "num_tokens": 1203903.0, + "step": 672 + }, + { + "epoch": 0.10897903003805359, + "grad_norm": 38.598793029785156, + "learning_rate": 8.911917098445596e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.886401355266571, + "num_tokens": 1205697.0, + "step": 673 + }, + { + "epoch": 0.10914096024613391, + "grad_norm": 29.30876350402832, + "learning_rate": 8.91029792746114e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.9107498526573181, + "num_tokens": 1207478.0, + "step": 674 + }, + { + "epoch": 0.10930289045421424, + "grad_norm": 27.58310890197754, + "learning_rate": 8.908678756476683e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.9163265228271484, + "num_tokens": 1209277.0, + "step": 675 + }, + { + "epoch": 0.10946482066229456, + "grad_norm": 29.989320755004883, + "learning_rate": 8.907059585492229e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.9001165926456451, + "num_tokens": 1211070.0, + "step": 676 + }, + { + "epoch": 0.10962675087037486, + "grad_norm": 38.820762634277344, + "learning_rate": 8.905440414507774e-06, + "loss": 0.888, + "mean_token_accuracy": 0.893869936466217, + "num_tokens": 1212856.0, + "step": 677 + }, + { + "epoch": 0.10978868107845519, + "grad_norm": 28.231014251708984, + "learning_rate": 8.903821243523318e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.9022297263145447, + "num_tokens": 1214646.0, + "step": 678 + }, + { + "epoch": 0.10995061128653551, + "grad_norm": 26.858198165893555, + "learning_rate": 8.902202072538861e-06, + "loss": 0.6189, + "mean_token_accuracy": 0.9097744226455688, + "num_tokens": 1216424.0, + "step": 679 + }, + { + "epoch": 0.11011254149461581, + "grad_norm": 35.589088439941406, + "learning_rate": 8.900582901554405e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.8960067927837372, + "num_tokens": 1218215.0, + "step": 680 + }, + { + "epoch": 0.11027447170269614, + "grad_norm": 37.692317962646484, + "learning_rate": 8.89896373056995e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.889007955789566, + "num_tokens": 1220007.0, + "step": 681 + }, + { + "epoch": 0.11043640191077646, + "grad_norm": 32.157203674316406, + "learning_rate": 8.897344559585494e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.8979166448116302, + "num_tokens": 1221803.0, + "step": 682 + }, + { + "epoch": 0.11059833211885678, + "grad_norm": 27.846221923828125, + "learning_rate": 8.895725388601037e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.9058738052845001, + "num_tokens": 1223593.0, + "step": 683 + }, + { + "epoch": 0.11076026232693709, + "grad_norm": 31.302717208862305, + "learning_rate": 8.89410621761658e-06, + "loss": 0.848, + "mean_token_accuracy": 0.8885755240917206, + "num_tokens": 1225374.0, + "step": 684 + }, + { + "epoch": 0.11092219253501741, + "grad_norm": 34.415802001953125, + "learning_rate": 8.892487046632126e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.8927008211612701, + "num_tokens": 1227176.0, + "step": 685 + }, + { + "epoch": 0.11108412274309773, + "grad_norm": 38.12055587768555, + "learning_rate": 8.89086787564767e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.8892121016979218, + "num_tokens": 1228968.0, + "step": 686 + }, + { + "epoch": 0.11124605295117804, + "grad_norm": 39.62919616699219, + "learning_rate": 8.889248704663213e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.8866875171661377, + "num_tokens": 1230764.0, + "step": 687 + }, + { + "epoch": 0.11140798315925836, + "grad_norm": 29.010278701782227, + "learning_rate": 8.887629533678757e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.9052418172359467, + "num_tokens": 1232550.0, + "step": 688 + }, + { + "epoch": 0.11156991336733868, + "grad_norm": 28.291397094726562, + "learning_rate": 8.886010362694302e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.9096193909645081, + "num_tokens": 1234339.0, + "step": 689 + }, + { + "epoch": 0.11173184357541899, + "grad_norm": 34.68824768066406, + "learning_rate": 8.884391191709846e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.9032531678676605, + "num_tokens": 1236130.0, + "step": 690 + }, + { + "epoch": 0.11189377378349931, + "grad_norm": 38.88625717163086, + "learning_rate": 8.88277202072539e-06, + "loss": 1.1191, + "mean_token_accuracy": 0.8669329583644867, + "num_tokens": 1237919.0, + "step": 691 + }, + { + "epoch": 0.11205570399157963, + "grad_norm": 28.062644958496094, + "learning_rate": 8.881152849740935e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.9083270728588104, + "num_tokens": 1239704.0, + "step": 692 + }, + { + "epoch": 0.11221763419965995, + "grad_norm": 26.984647750854492, + "learning_rate": 8.879533678756478e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.9071076214313507, + "num_tokens": 1241495.0, + "step": 693 + }, + { + "epoch": 0.11237956440774026, + "grad_norm": 33.64918518066406, + "learning_rate": 8.877914507772022e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.9012126624584198, + "num_tokens": 1243292.0, + "step": 694 + }, + { + "epoch": 0.11254149461582058, + "grad_norm": 33.74709701538086, + "learning_rate": 8.876295336787565e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.88856241106987, + "num_tokens": 1245091.0, + "step": 695 + }, + { + "epoch": 0.1127034248239009, + "grad_norm": 32.96370315551758, + "learning_rate": 8.87467616580311e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.8838293552398682, + "num_tokens": 1246887.0, + "step": 696 + }, + { + "epoch": 0.11286535503198121, + "grad_norm": 27.40631675720215, + "learning_rate": 8.873056994818654e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.91847363114357, + "num_tokens": 1248681.0, + "step": 697 + }, + { + "epoch": 0.11302728524006153, + "grad_norm": 31.34441375732422, + "learning_rate": 8.871437823834198e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.8975626528263092, + "num_tokens": 1250476.0, + "step": 698 + }, + { + "epoch": 0.11318921544814185, + "grad_norm": 38.183109283447266, + "learning_rate": 8.869818652849741e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.8779591619968414, + "num_tokens": 1252273.0, + "step": 699 + }, + { + "epoch": 0.11335114565622217, + "grad_norm": 33.806419372558594, + "learning_rate": 8.868199481865287e-06, + "loss": 0.662, + "mean_token_accuracy": 0.9022125005722046, + "num_tokens": 1254070.0, + "step": 700 + }, + { + "epoch": 0.11351307586430248, + "grad_norm": 30.135028839111328, + "learning_rate": 8.86658031088083e-06, + "loss": 0.6622, + "mean_token_accuracy": 0.9025118350982666, + "num_tokens": 1255867.0, + "step": 701 + }, + { + "epoch": 0.1136750060723828, + "grad_norm": 29.978445053100586, + "learning_rate": 8.864961139896374e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.9041289985179901, + "num_tokens": 1257650.0, + "step": 702 + }, + { + "epoch": 0.11383693628046312, + "grad_norm": 30.304397583007812, + "learning_rate": 8.863341968911917e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.9004205167293549, + "num_tokens": 1259443.0, + "step": 703 + }, + { + "epoch": 0.11399886648854343, + "grad_norm": 26.6141357421875, + "learning_rate": 8.861722797927463e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.9057525396347046, + "num_tokens": 1261231.0, + "step": 704 + }, + { + "epoch": 0.11416079669662375, + "grad_norm": 45.95237731933594, + "learning_rate": 8.860103626943006e-06, + "loss": 1.2493, + "mean_token_accuracy": 0.8576714396476746, + "num_tokens": 1263038.0, + "step": 705 + }, + { + "epoch": 0.11432272690470408, + "grad_norm": 30.297012329101562, + "learning_rate": 8.85848445595855e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.8910098671913147, + "num_tokens": 1264835.0, + "step": 706 + }, + { + "epoch": 0.1144846571127844, + "grad_norm": 28.939964294433594, + "learning_rate": 8.856865284974094e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.9061901271343231, + "num_tokens": 1266625.0, + "step": 707 + }, + { + "epoch": 0.1146465873208647, + "grad_norm": 32.28270721435547, + "learning_rate": 8.855246113989639e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.8883920013904572, + "num_tokens": 1268424.0, + "step": 708 + }, + { + "epoch": 0.11480851752894503, + "grad_norm": 32.24588394165039, + "learning_rate": 8.853626943005182e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.8886352479457855, + "num_tokens": 1270223.0, + "step": 709 + }, + { + "epoch": 0.11497044773702535, + "grad_norm": 38.89261245727539, + "learning_rate": 8.852007772020726e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.8886702656745911, + "num_tokens": 1272013.0, + "step": 710 + }, + { + "epoch": 0.11513237794510565, + "grad_norm": 30.0701904296875, + "learning_rate": 8.850388601036271e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.8905204236507416, + "num_tokens": 1273808.0, + "step": 711 + }, + { + "epoch": 0.11529430815318598, + "grad_norm": 33.369205474853516, + "learning_rate": 8.848769430051815e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.8914824426174164, + "num_tokens": 1275605.0, + "step": 712 + }, + { + "epoch": 0.1154562383612663, + "grad_norm": 33.0394287109375, + "learning_rate": 8.847150259067358e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.8923611044883728, + "num_tokens": 1277405.0, + "step": 713 + }, + { + "epoch": 0.1156181685693466, + "grad_norm": 21.442638397216797, + "learning_rate": 8.845531088082902e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.9266505837440491, + "num_tokens": 1279190.0, + "step": 714 + }, + { + "epoch": 0.11578009877742693, + "grad_norm": 38.59785842895508, + "learning_rate": 8.843911917098447e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.8735786378383636, + "num_tokens": 1280986.0, + "step": 715 + }, + { + "epoch": 0.11594202898550725, + "grad_norm": 37.1681022644043, + "learning_rate": 8.842292746113991e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.8747104108333588, + "num_tokens": 1282786.0, + "step": 716 + }, + { + "epoch": 0.11610395919358757, + "grad_norm": 30.42218589782715, + "learning_rate": 8.840673575129535e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.8992751240730286, + "num_tokens": 1284575.0, + "step": 717 + }, + { + "epoch": 0.11626588940166788, + "grad_norm": 41.17758560180664, + "learning_rate": 8.839054404145078e-06, + "loss": 0.995, + "mean_token_accuracy": 0.8808045089244843, + "num_tokens": 1286372.0, + "step": 718 + }, + { + "epoch": 0.1164278196097482, + "grad_norm": 28.876922607421875, + "learning_rate": 8.837435233160623e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.8978873491287231, + "num_tokens": 1288168.0, + "step": 719 + }, + { + "epoch": 0.11658974981782852, + "grad_norm": 32.4138069152832, + "learning_rate": 8.835816062176167e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.9073280394077301, + "num_tokens": 1289960.0, + "step": 720 + }, + { + "epoch": 0.11675168002590883, + "grad_norm": 36.10765075683594, + "learning_rate": 8.83419689119171e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.9030250310897827, + "num_tokens": 1291760.0, + "step": 721 + }, + { + "epoch": 0.11691361023398915, + "grad_norm": 32.35591125488281, + "learning_rate": 8.832577720207254e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.8790132105350494, + "num_tokens": 1293545.0, + "step": 722 + }, + { + "epoch": 0.11707554044206947, + "grad_norm": 30.638582229614258, + "learning_rate": 8.8309585492228e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.8979054987430573, + "num_tokens": 1295341.0, + "step": 723 + }, + { + "epoch": 0.11723747065014979, + "grad_norm": 31.42913246154785, + "learning_rate": 8.829339378238343e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.8879020512104034, + "num_tokens": 1297129.0, + "step": 724 + }, + { + "epoch": 0.1173994008582301, + "grad_norm": 34.921226501464844, + "learning_rate": 8.827720207253887e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.895411878824234, + "num_tokens": 1298918.0, + "step": 725 + }, + { + "epoch": 0.11756133106631042, + "grad_norm": 33.83235168457031, + "learning_rate": 8.826101036269432e-06, + "loss": 0.787, + "mean_token_accuracy": 0.8928516805171967, + "num_tokens": 1300710.0, + "step": 726 + }, + { + "epoch": 0.11772326127439074, + "grad_norm": 35.45464324951172, + "learning_rate": 8.824481865284975e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.8833283185958862, + "num_tokens": 1302505.0, + "step": 727 + }, + { + "epoch": 0.11788519148247105, + "grad_norm": 23.681251525878906, + "learning_rate": 8.822862694300519e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.9216992855072021, + "num_tokens": 1304298.0, + "step": 728 + }, + { + "epoch": 0.11804712169055137, + "grad_norm": 28.124034881591797, + "learning_rate": 8.821243523316063e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.9000459313392639, + "num_tokens": 1306090.0, + "step": 729 + }, + { + "epoch": 0.11820905189863169, + "grad_norm": 30.2061824798584, + "learning_rate": 8.819624352331608e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.9000459313392639, + "num_tokens": 1307882.0, + "step": 730 + }, + { + "epoch": 0.11837098210671201, + "grad_norm": 27.444787979125977, + "learning_rate": 8.818005181347152e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.9054268598556519, + "num_tokens": 1309669.0, + "step": 731 + }, + { + "epoch": 0.11853291231479232, + "grad_norm": 30.661623001098633, + "learning_rate": 8.816386010362695e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.9084208011627197, + "num_tokens": 1311464.0, + "step": 732 + }, + { + "epoch": 0.11869484252287264, + "grad_norm": 18.264039993286133, + "learning_rate": 8.814766839378239e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.9261902272701263, + "num_tokens": 1313247.0, + "step": 733 + }, + { + "epoch": 0.11885677273095296, + "grad_norm": 28.01018714904785, + "learning_rate": 8.813147668393784e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.9010291695594788, + "num_tokens": 1315042.0, + "step": 734 + }, + { + "epoch": 0.11901870293903327, + "grad_norm": 25.607126235961914, + "learning_rate": 8.811528497409328e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.9112319052219391, + "num_tokens": 1316836.0, + "step": 735 + }, + { + "epoch": 0.1191806331471136, + "grad_norm": 44.4820556640625, + "learning_rate": 8.809909326424871e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.8848186731338501, + "num_tokens": 1318634.0, + "step": 736 + }, + { + "epoch": 0.11934256335519391, + "grad_norm": 38.70259475708008, + "learning_rate": 8.808290155440415e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.8745207190513611, + "num_tokens": 1320430.0, + "step": 737 + }, + { + "epoch": 0.11950449356327422, + "grad_norm": 39.5324592590332, + "learning_rate": 8.80667098445596e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.8870314955711365, + "num_tokens": 1322225.0, + "step": 738 + }, + { + "epoch": 0.11966642377135454, + "grad_norm": 29.941497802734375, + "learning_rate": 8.805051813471504e-06, + "loss": 0.6261, + "mean_token_accuracy": 0.89882692694664, + "num_tokens": 1324014.0, + "step": 739 + }, + { + "epoch": 0.11982835397943487, + "grad_norm": 38.259830474853516, + "learning_rate": 8.803432642487047e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.9024127423763275, + "num_tokens": 1325812.0, + "step": 740 + }, + { + "epoch": 0.11999028418751519, + "grad_norm": 32.23947525024414, + "learning_rate": 8.80181347150259e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.8921913802623749, + "num_tokens": 1327602.0, + "step": 741 + }, + { + "epoch": 0.1201522143955955, + "grad_norm": 26.683002471923828, + "learning_rate": 8.800194300518136e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.916133850812912, + "num_tokens": 1329388.0, + "step": 742 + }, + { + "epoch": 0.12031414460367582, + "grad_norm": 31.33710289001465, + "learning_rate": 8.79857512953368e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.8904584050178528, + "num_tokens": 1331174.0, + "step": 743 + }, + { + "epoch": 0.12047607481175614, + "grad_norm": 36.16375732421875, + "learning_rate": 8.796955958549223e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.8830054998397827, + "num_tokens": 1332968.0, + "step": 744 + }, + { + "epoch": 0.12063800501983644, + "grad_norm": 58.49775695800781, + "learning_rate": 8.795336787564769e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.8914824426174164, + "num_tokens": 1334765.0, + "step": 745 + }, + { + "epoch": 0.12079993522791677, + "grad_norm": 38.21027374267578, + "learning_rate": 8.793717616580312e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.8922150731086731, + "num_tokens": 1336547.0, + "step": 746 + }, + { + "epoch": 0.12096186543599709, + "grad_norm": 40.312984466552734, + "learning_rate": 8.792098445595856e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.8784005641937256, + "num_tokens": 1338347.0, + "step": 747 + }, + { + "epoch": 0.12112379564407741, + "grad_norm": 33.33052062988281, + "learning_rate": 8.7904792746114e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.8948729038238525, + "num_tokens": 1340144.0, + "step": 748 + }, + { + "epoch": 0.12128572585215772, + "grad_norm": 39.75764846801758, + "learning_rate": 8.788860103626945e-06, + "loss": 1.027, + "mean_token_accuracy": 0.8861840069293976, + "num_tokens": 1341937.0, + "step": 749 + }, + { + "epoch": 0.12144765606023804, + "grad_norm": 27.47093391418457, + "learning_rate": 8.787240932642488e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.8968908786773682, + "num_tokens": 1343731.0, + "step": 750 + }, + { + "epoch": 0.12160958626831836, + "grad_norm": 37.69817352294922, + "learning_rate": 8.785621761658032e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.8614116609096527, + "num_tokens": 1345539.0, + "step": 751 + }, + { + "epoch": 0.12177151647639867, + "grad_norm": 31.21765899658203, + "learning_rate": 8.784002590673575e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.8887874782085419, + "num_tokens": 1347320.0, + "step": 752 + }, + { + "epoch": 0.12193344668447899, + "grad_norm": 48.6146125793457, + "learning_rate": 8.78238341968912e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.8782394230365753, + "num_tokens": 1349111.0, + "step": 753 + }, + { + "epoch": 0.12209537689255931, + "grad_norm": 34.6551513671875, + "learning_rate": 8.780764248704664e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.8879284858703613, + "num_tokens": 1350908.0, + "step": 754 + }, + { + "epoch": 0.12225730710063963, + "grad_norm": 40.05681610107422, + "learning_rate": 8.779145077720208e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.8786833882331848, + "num_tokens": 1352708.0, + "step": 755 + }, + { + "epoch": 0.12241923730871994, + "grad_norm": 36.126956939697266, + "learning_rate": 8.777525906735751e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.8810479640960693, + "num_tokens": 1354497.0, + "step": 756 + }, + { + "epoch": 0.12258116751680026, + "grad_norm": 26.102785110473633, + "learning_rate": 8.775906735751297e-06, + "loss": 0.697, + "mean_token_accuracy": 0.9042248427867889, + "num_tokens": 1356289.0, + "step": 757 + }, + { + "epoch": 0.12274309772488058, + "grad_norm": 21.950023651123047, + "learning_rate": 8.77428756476684e-06, + "loss": 0.636, + "mean_token_accuracy": 0.9039300382137299, + "num_tokens": 1358082.0, + "step": 758 + }, + { + "epoch": 0.12290502793296089, + "grad_norm": 27.387094497680664, + "learning_rate": 8.772668393782384e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.8994036912918091, + "num_tokens": 1359872.0, + "step": 759 + }, + { + "epoch": 0.12306695814104121, + "grad_norm": 23.670595169067383, + "learning_rate": 8.771049222797927e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.920149177312851, + "num_tokens": 1361659.0, + "step": 760 + }, + { + "epoch": 0.12322888834912153, + "grad_norm": 34.474918365478516, + "learning_rate": 8.769430051813473e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.8982540667057037, + "num_tokens": 1363465.0, + "step": 761 + }, + { + "epoch": 0.12339081855720184, + "grad_norm": 42.511199951171875, + "learning_rate": 8.767810880829016e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.8815292119979858, + "num_tokens": 1365255.0, + "step": 762 + }, + { + "epoch": 0.12355274876528216, + "grad_norm": 25.6155948638916, + "learning_rate": 8.76619170984456e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.9093892574310303, + "num_tokens": 1367043.0, + "step": 763 + }, + { + "epoch": 0.12371467897336248, + "grad_norm": 35.98655319213867, + "learning_rate": 8.764572538860105e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.8880626261234283, + "num_tokens": 1368841.0, + "step": 764 + }, + { + "epoch": 0.1238766091814428, + "grad_norm": 34.600765228271484, + "learning_rate": 8.762953367875649e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.886418491601944, + "num_tokens": 1370635.0, + "step": 765 + }, + { + "epoch": 0.12403853938952311, + "grad_norm": 24.08222770690918, + "learning_rate": 8.761334196891192e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.9126984179019928, + "num_tokens": 1372422.0, + "step": 766 + }, + { + "epoch": 0.12420046959760343, + "grad_norm": 25.452463150024414, + "learning_rate": 8.759715025906736e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.9114472568035126, + "num_tokens": 1374215.0, + "step": 767 + }, + { + "epoch": 0.12436239980568375, + "grad_norm": 27.962905883789062, + "learning_rate": 8.758095854922281e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.9067831337451935, + "num_tokens": 1376006.0, + "step": 768 + }, + { + "epoch": 0.12452433001376406, + "grad_norm": 29.30245590209961, + "learning_rate": 8.756476683937825e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.8857174217700958, + "num_tokens": 1377789.0, + "step": 769 + }, + { + "epoch": 0.12468626022184438, + "grad_norm": 18.69622230529785, + "learning_rate": 8.754857512953368e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.9188180863857269, + "num_tokens": 1379572.0, + "step": 770 + }, + { + "epoch": 0.1248481904299247, + "grad_norm": 22.967805862426758, + "learning_rate": 8.753238341968912e-06, + "loss": 0.691, + "mean_token_accuracy": 0.9044117629528046, + "num_tokens": 1381356.0, + "step": 771 + }, + { + "epoch": 0.125010120638005, + "grad_norm": 23.452070236206055, + "learning_rate": 8.751619170984457e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.9067532122135162, + "num_tokens": 1383136.0, + "step": 772 + }, + { + "epoch": 0.12517205084608535, + "grad_norm": 34.1168098449707, + "learning_rate": 8.750000000000001e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.8844388723373413, + "num_tokens": 1384925.0, + "step": 773 + }, + { + "epoch": 0.12533398105416566, + "grad_norm": 30.217897415161133, + "learning_rate": 8.748380829015545e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.9047619104385376, + "num_tokens": 1386721.0, + "step": 774 + }, + { + "epoch": 0.12549591126224596, + "grad_norm": 28.14794921875, + "learning_rate": 8.746761658031088e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.9149396419525146, + "num_tokens": 1388515.0, + "step": 775 + }, + { + "epoch": 0.1256578414703263, + "grad_norm": 30.352880477905273, + "learning_rate": 8.745142487046633e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.8929492235183716, + "num_tokens": 1390306.0, + "step": 776 + }, + { + "epoch": 0.1258197716784066, + "grad_norm": 30.50710105895996, + "learning_rate": 8.743523316062177e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.9083270728588104, + "num_tokens": 1392091.0, + "step": 777 + }, + { + "epoch": 0.1259817018864869, + "grad_norm": 28.93914031982422, + "learning_rate": 8.74190414507772e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.9137163758277893, + "num_tokens": 1393881.0, + "step": 778 + }, + { + "epoch": 0.12614363209456725, + "grad_norm": 32.38669204711914, + "learning_rate": 8.740284974093264e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.8885893523693085, + "num_tokens": 1395680.0, + "step": 779 + }, + { + "epoch": 0.12630556230264756, + "grad_norm": 30.107797622680664, + "learning_rate": 8.73866580310881e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.9154411852359772, + "num_tokens": 1397464.0, + "step": 780 + }, + { + "epoch": 0.12646749251072786, + "grad_norm": 26.293188095092773, + "learning_rate": 8.737046632124353e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.908279538154602, + "num_tokens": 1399258.0, + "step": 781 + }, + { + "epoch": 0.1266294227188082, + "grad_norm": 35.28606033325195, + "learning_rate": 8.735427461139897e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.8830456137657166, + "num_tokens": 1401044.0, + "step": 782 + }, + { + "epoch": 0.1267913529268885, + "grad_norm": 27.882164001464844, + "learning_rate": 8.733808290155442e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.9055748581886292, + "num_tokens": 1402842.0, + "step": 783 + }, + { + "epoch": 0.12695328313496884, + "grad_norm": 25.17303466796875, + "learning_rate": 8.732189119170985e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.9154636561870575, + "num_tokens": 1404626.0, + "step": 784 + }, + { + "epoch": 0.12711521334304915, + "grad_norm": 26.61556053161621, + "learning_rate": 8.730569948186529e-06, + "loss": 0.6077, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 1406408.0, + "step": 785 + }, + { + "epoch": 0.12727714355112946, + "grad_norm": 33.89521408081055, + "learning_rate": 8.728950777202073e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.8943609595298767, + "num_tokens": 1408204.0, + "step": 786 + }, + { + "epoch": 0.1274390737592098, + "grad_norm": 28.257461547851562, + "learning_rate": 8.727331606217618e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.9011698365211487, + "num_tokens": 1409998.0, + "step": 787 + }, + { + "epoch": 0.1276010039672901, + "grad_norm": 34.67573547363281, + "learning_rate": 8.725712435233162e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.8919222056865692, + "num_tokens": 1411787.0, + "step": 788 + }, + { + "epoch": 0.1277629341753704, + "grad_norm": 35.834556579589844, + "learning_rate": 8.724093264248705e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.8900547325611115, + "num_tokens": 1413580.0, + "step": 789 + }, + { + "epoch": 0.12792486438345074, + "grad_norm": 38.697296142578125, + "learning_rate": 8.722474093264249e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.8904455006122589, + "num_tokens": 1415375.0, + "step": 790 + }, + { + "epoch": 0.12808679459153105, + "grad_norm": 39.40624237060547, + "learning_rate": 8.720854922279794e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.877743273973465, + "num_tokens": 1417165.0, + "step": 791 + }, + { + "epoch": 0.12824872479961136, + "grad_norm": 28.56023406982422, + "learning_rate": 8.719235751295338e-06, + "loss": 0.748, + "mean_token_accuracy": 0.9094496071338654, + "num_tokens": 1418962.0, + "step": 792 + }, + { + "epoch": 0.1284106550076917, + "grad_norm": 29.009428024291992, + "learning_rate": 8.717616580310881e-06, + "loss": 0.754, + "mean_token_accuracy": 0.8983812630176544, + "num_tokens": 1420758.0, + "step": 793 + }, + { + "epoch": 0.128572585215772, + "grad_norm": 32.4525260925293, + "learning_rate": 8.715997409326425e-06, + "loss": 0.723, + "mean_token_accuracy": 0.9010837972164154, + "num_tokens": 1422553.0, + "step": 794 + }, + { + "epoch": 0.1287345154238523, + "grad_norm": 29.586387634277344, + "learning_rate": 8.71437823834197e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.9029796719551086, + "num_tokens": 1424333.0, + "step": 795 + }, + { + "epoch": 0.12889644563193264, + "grad_norm": 25.813024520874023, + "learning_rate": 8.712759067357514e-06, + "loss": 0.839, + "mean_token_accuracy": 0.8931864202022552, + "num_tokens": 1426126.0, + "step": 796 + }, + { + "epoch": 0.12905837584001295, + "grad_norm": 27.574996948242188, + "learning_rate": 8.711139896373057e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.8982804119586945, + "num_tokens": 1427913.0, + "step": 797 + }, + { + "epoch": 0.12922030604809326, + "grad_norm": 31.408742904663086, + "learning_rate": 8.7095207253886e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.871626079082489, + "num_tokens": 1429705.0, + "step": 798 + }, + { + "epoch": 0.1293822362561736, + "grad_norm": 27.938873291015625, + "learning_rate": 8.707901554404146e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.8991718590259552, + "num_tokens": 1431495.0, + "step": 799 + }, + { + "epoch": 0.1295441664642539, + "grad_norm": 35.222267150878906, + "learning_rate": 8.70628238341969e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.8914916217327118, + "num_tokens": 1433283.0, + "step": 800 + }, + { + "epoch": 0.12970609667233424, + "grad_norm": 27.772075653076172, + "learning_rate": 8.704663212435233e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.8973808586597443, + "num_tokens": 1435068.0, + "step": 801 + }, + { + "epoch": 0.12986802688041454, + "grad_norm": 26.73502540588379, + "learning_rate": 8.703044041450779e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.8992805778980255, + "num_tokens": 1436858.0, + "step": 802 + }, + { + "epoch": 0.13002995708849485, + "grad_norm": 41.384403228759766, + "learning_rate": 8.701424870466322e-06, + "loss": 1.1225, + "mean_token_accuracy": 0.8648176491260529, + "num_tokens": 1438651.0, + "step": 803 + }, + { + "epoch": 0.1301918872965752, + "grad_norm": 29.48305892944336, + "learning_rate": 8.699805699481866e-06, + "loss": 0.735, + "mean_token_accuracy": 0.8992060720920563, + "num_tokens": 1440439.0, + "step": 804 + }, + { + "epoch": 0.1303538175046555, + "grad_norm": 36.73165512084961, + "learning_rate": 8.69818652849741e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.8806320428848267, + "num_tokens": 1442228.0, + "step": 805 + }, + { + "epoch": 0.1305157477127358, + "grad_norm": 24.63804054260254, + "learning_rate": 8.696567357512955e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.9103121757507324, + "num_tokens": 1444018.0, + "step": 806 + }, + { + "epoch": 0.13067767792081614, + "grad_norm": 31.458341598510742, + "learning_rate": 8.694948186528498e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.8865043222904205, + "num_tokens": 1445811.0, + "step": 807 + }, + { + "epoch": 0.13083960812889645, + "grad_norm": 34.828094482421875, + "learning_rate": 8.693329015544042e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.8981518447399139, + "num_tokens": 1447607.0, + "step": 808 + }, + { + "epoch": 0.13100153833697675, + "grad_norm": 34.25908279418945, + "learning_rate": 8.691709844559585e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.8959731459617615, + "num_tokens": 1449417.0, + "step": 809 + }, + { + "epoch": 0.1311634685450571, + "grad_norm": 36.357547760009766, + "learning_rate": 8.69009067357513e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.8840375542640686, + "num_tokens": 1451221.0, + "step": 810 + }, + { + "epoch": 0.1313253987531374, + "grad_norm": 39.3553352355957, + "learning_rate": 8.688471502590674e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.8818948864936829, + "num_tokens": 1453005.0, + "step": 811 + }, + { + "epoch": 0.1314873289612177, + "grad_norm": 27.179302215576172, + "learning_rate": 8.686852331606218e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.8962962925434113, + "num_tokens": 1454796.0, + "step": 812 + }, + { + "epoch": 0.13164925916929804, + "grad_norm": 25.398508071899414, + "learning_rate": 8.685233160621761e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.8970921039581299, + "num_tokens": 1456590.0, + "step": 813 + }, + { + "epoch": 0.13181118937737835, + "grad_norm": 32.33420181274414, + "learning_rate": 8.683613989637307e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.9021242558956146, + "num_tokens": 1458389.0, + "step": 814 + }, + { + "epoch": 0.13197311958545865, + "grad_norm": 27.961353302001953, + "learning_rate": 8.68199481865285e-06, + "loss": 0.802, + "mean_token_accuracy": 0.890135258436203, + "num_tokens": 1460174.0, + "step": 815 + }, + { + "epoch": 0.132135049793539, + "grad_norm": 17.271499633789062, + "learning_rate": 8.680375647668394e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.924217939376831, + "num_tokens": 1461963.0, + "step": 816 + }, + { + "epoch": 0.1322969800016193, + "grad_norm": 29.830921173095703, + "learning_rate": 8.678756476683938e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.9098977446556091, + "num_tokens": 1463763.0, + "step": 817 + }, + { + "epoch": 0.13245891020969963, + "grad_norm": 28.314706802368164, + "learning_rate": 8.677137305699483e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.891791820526123, + "num_tokens": 1465552.0, + "step": 818 + }, + { + "epoch": 0.13262084041777994, + "grad_norm": 29.0943603515625, + "learning_rate": 8.675518134715026e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.9017763137817383, + "num_tokens": 1467340.0, + "step": 819 + }, + { + "epoch": 0.13278277062586025, + "grad_norm": 32.64145278930664, + "learning_rate": 8.67389896373057e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.8762452006340027, + "num_tokens": 1469132.0, + "step": 820 + }, + { + "epoch": 0.13294470083394058, + "grad_norm": 22.297565460205078, + "learning_rate": 8.672279792746115e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.9077786207199097, + "num_tokens": 1470915.0, + "step": 821 + }, + { + "epoch": 0.1331066310420209, + "grad_norm": 23.34813117980957, + "learning_rate": 8.670660621761659e-06, + "loss": 0.701, + "mean_token_accuracy": 0.9093756973743439, + "num_tokens": 1472692.0, + "step": 822 + }, + { + "epoch": 0.1332685612501012, + "grad_norm": 34.98750305175781, + "learning_rate": 8.669041450777202e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.8653196096420288, + "num_tokens": 1474494.0, + "step": 823 + }, + { + "epoch": 0.13343049145818153, + "grad_norm": 29.603885650634766, + "learning_rate": 8.667422279792746e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.9051044583320618, + "num_tokens": 1476280.0, + "step": 824 + }, + { + "epoch": 0.13359242166626184, + "grad_norm": 33.01629638671875, + "learning_rate": 8.665803108808291e-06, + "loss": 0.756, + "mean_token_accuracy": 0.8935846984386444, + "num_tokens": 1478073.0, + "step": 825 + }, + { + "epoch": 0.13375435187434215, + "grad_norm": 32.65421676635742, + "learning_rate": 8.664183937823835e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.8876811861991882, + "num_tokens": 1479861.0, + "step": 826 + }, + { + "epoch": 0.13391628208242248, + "grad_norm": 24.05576515197754, + "learning_rate": 8.662564766839378e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.9151683151721954, + "num_tokens": 1481656.0, + "step": 827 + }, + { + "epoch": 0.1340782122905028, + "grad_norm": 32.75737762451172, + "learning_rate": 8.660945595854922e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.8908450901508331, + "num_tokens": 1483452.0, + "step": 828 + }, + { + "epoch": 0.1342401424985831, + "grad_norm": 27.679901123046875, + "learning_rate": 8.659326424870467e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.8928079307079315, + "num_tokens": 1485244.0, + "step": 829 + }, + { + "epoch": 0.13440207270666343, + "grad_norm": 21.084152221679688, + "learning_rate": 8.657707253886011e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 1487033.0, + "step": 830 + }, + { + "epoch": 0.13456400291474374, + "grad_norm": 29.352325439453125, + "learning_rate": 8.656088082901555e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.9055672287940979, + "num_tokens": 1488821.0, + "step": 831 + }, + { + "epoch": 0.13472593312282408, + "grad_norm": 35.51498031616211, + "learning_rate": 8.654468911917098e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.893150269985199, + "num_tokens": 1490614.0, + "step": 832 + }, + { + "epoch": 0.13488786333090438, + "grad_norm": 33.8050422668457, + "learning_rate": 8.652849740932643e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.8993945717811584, + "num_tokens": 1492402.0, + "step": 833 + }, + { + "epoch": 0.1350497935389847, + "grad_norm": 36.920982360839844, + "learning_rate": 8.651230569948187e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.8865518867969513, + "num_tokens": 1494186.0, + "step": 834 + }, + { + "epoch": 0.13521172374706503, + "grad_norm": 31.333059310913086, + "learning_rate": 8.64961139896373e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.9097591638565063, + "num_tokens": 1495975.0, + "step": 835 + }, + { + "epoch": 0.13537365395514533, + "grad_norm": 40.11623001098633, + "learning_rate": 8.647992227979274e-06, + "loss": 1.2166, + "mean_token_accuracy": 0.8759398460388184, + "num_tokens": 1497753.0, + "step": 836 + }, + { + "epoch": 0.13553558416322564, + "grad_norm": 25.671533584594727, + "learning_rate": 8.64637305699482e-06, + "loss": 0.6483, + "mean_token_accuracy": 0.9056722819805145, + "num_tokens": 1499541.0, + "step": 837 + }, + { + "epoch": 0.13569751437130598, + "grad_norm": 37.287193298339844, + "learning_rate": 8.644753886010363e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.8869949579238892, + "num_tokens": 1501329.0, + "step": 838 + }, + { + "epoch": 0.13585944457938628, + "grad_norm": 30.371614456176758, + "learning_rate": 8.643134715025907e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.8889376819133759, + "num_tokens": 1503111.0, + "step": 839 + }, + { + "epoch": 0.1360213747874666, + "grad_norm": 24.413530349731445, + "learning_rate": 8.641515544041452e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.9074074029922485, + "num_tokens": 1504893.0, + "step": 840 + }, + { + "epoch": 0.13618330499554693, + "grad_norm": 31.285934448242188, + "learning_rate": 8.639896373056996e-06, + "loss": 0.726, + "mean_token_accuracy": 0.8945697247982025, + "num_tokens": 1506680.0, + "step": 841 + }, + { + "epoch": 0.13634523520362724, + "grad_norm": 32.2459831237793, + "learning_rate": 8.638277202072539e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.8897615373134613, + "num_tokens": 1508482.0, + "step": 842 + }, + { + "epoch": 0.13650716541170754, + "grad_norm": 37.652164459228516, + "learning_rate": 8.636658031088083e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.8858367800712585, + "num_tokens": 1510283.0, + "step": 843 + }, + { + "epoch": 0.13666909561978788, + "grad_norm": 31.184146881103516, + "learning_rate": 8.635038860103628e-06, + "loss": 0.881, + "mean_token_accuracy": 0.8902844190597534, + "num_tokens": 1512077.0, + "step": 844 + }, + { + "epoch": 0.13683102582786819, + "grad_norm": 36.45187759399414, + "learning_rate": 8.633419689119172e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.8785934746265411, + "num_tokens": 1513877.0, + "step": 845 + }, + { + "epoch": 0.1369929560359485, + "grad_norm": 38.46100616455078, + "learning_rate": 8.631800518134715e-06, + "loss": 0.942, + "mean_token_accuracy": 0.8774697184562683, + "num_tokens": 1515682.0, + "step": 846 + }, + { + "epoch": 0.13715488624402883, + "grad_norm": 29.387264251708984, + "learning_rate": 8.630181347150259e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.8912192583084106, + "num_tokens": 1517487.0, + "step": 847 + }, + { + "epoch": 0.13731681645210914, + "grad_norm": 24.65268325805664, + "learning_rate": 8.628562176165804e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.9117217361927032, + "num_tokens": 1519271.0, + "step": 848 + }, + { + "epoch": 0.13747874666018947, + "grad_norm": 29.235944747924805, + "learning_rate": 8.626943005181348e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.8946630656719208, + "num_tokens": 1521078.0, + "step": 849 + }, + { + "epoch": 0.13764067686826978, + "grad_norm": 33.69078826904297, + "learning_rate": 8.625323834196891e-06, + "loss": 1.0, + "mean_token_accuracy": 0.8706004321575165, + "num_tokens": 1522875.0, + "step": 850 + }, + { + "epoch": 0.1378026070763501, + "grad_norm": 29.134723663330078, + "learning_rate": 8.623704663212435e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.9006756246089935, + "num_tokens": 1524658.0, + "step": 851 + }, + { + "epoch": 0.13796453728443042, + "grad_norm": 22.728116989135742, + "learning_rate": 8.62208549222798e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.9011110067367554, + "num_tokens": 1526443.0, + "step": 852 + }, + { + "epoch": 0.13812646749251073, + "grad_norm": 29.707117080688477, + "learning_rate": 8.620466321243524e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.8967532515525818, + "num_tokens": 1528227.0, + "step": 853 + }, + { + "epoch": 0.13828839770059104, + "grad_norm": 17.45728874206543, + "learning_rate": 8.618847150259067e-06, + "loss": 0.557, + "mean_token_accuracy": 0.9195680320262909, + "num_tokens": 1530012.0, + "step": 854 + }, + { + "epoch": 0.13845032790867137, + "grad_norm": 25.25996971130371, + "learning_rate": 8.61722797927461e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.8958131670951843, + "num_tokens": 1531812.0, + "step": 855 + }, + { + "epoch": 0.13861225811675168, + "grad_norm": 23.43341827392578, + "learning_rate": 8.615608808290156e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.8968254029750824, + "num_tokens": 1533606.0, + "step": 856 + }, + { + "epoch": 0.138774188324832, + "grad_norm": 24.679168701171875, + "learning_rate": 8.6139896373057e-06, + "loss": 0.677, + "mean_token_accuracy": 0.9079623520374298, + "num_tokens": 1535390.0, + "step": 857 + }, + { + "epoch": 0.13893611853291232, + "grad_norm": 27.431867599487305, + "learning_rate": 8.612370466321243e-06, + "loss": 0.6638, + "mean_token_accuracy": 0.9074242413043976, + "num_tokens": 1537184.0, + "step": 858 + }, + { + "epoch": 0.13909804874099263, + "grad_norm": 28.906810760498047, + "learning_rate": 8.610751295336789e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.9112429022789001, + "num_tokens": 1538977.0, + "step": 859 + }, + { + "epoch": 0.13925997894907294, + "grad_norm": 37.66145324707031, + "learning_rate": 8.609132124352332e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.8882094025611877, + "num_tokens": 1540775.0, + "step": 860 + }, + { + "epoch": 0.13942190915715327, + "grad_norm": 24.447853088378906, + "learning_rate": 8.607512953367876e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.9074721336364746, + "num_tokens": 1542568.0, + "step": 861 + }, + { + "epoch": 0.13958383936523358, + "grad_norm": 30.687110900878906, + "learning_rate": 8.60589378238342e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.8986208736896515, + "num_tokens": 1544366.0, + "step": 862 + }, + { + "epoch": 0.1397457695733139, + "grad_norm": 32.879188537597656, + "learning_rate": 8.604274611398965e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.8770364820957184, + "num_tokens": 1546161.0, + "step": 863 + }, + { + "epoch": 0.13990769978139422, + "grad_norm": 28.076580047607422, + "learning_rate": 8.602655440414508e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.8982598185539246, + "num_tokens": 1547948.0, + "step": 864 + }, + { + "epoch": 0.14006962998947453, + "grad_norm": 16.121753692626953, + "learning_rate": 8.601036269430052e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.9231182336807251, + "num_tokens": 1549733.0, + "step": 865 + }, + { + "epoch": 0.14023156019755487, + "grad_norm": 28.865304946899414, + "learning_rate": 8.599417098445595e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.8985010981559753, + "num_tokens": 1551529.0, + "step": 866 + }, + { + "epoch": 0.14039349040563517, + "grad_norm": 24.488113403320312, + "learning_rate": 8.59779792746114e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.9079061448574066, + "num_tokens": 1553313.0, + "step": 867 + }, + { + "epoch": 0.14055542061371548, + "grad_norm": 37.51773452758789, + "learning_rate": 8.596178756476684e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.8992283642292023, + "num_tokens": 1555113.0, + "step": 868 + }, + { + "epoch": 0.14071735082179582, + "grad_norm": 42.04418182373047, + "learning_rate": 8.594559585492228e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.8807623982429504, + "num_tokens": 1556910.0, + "step": 869 + }, + { + "epoch": 0.14087928102987612, + "grad_norm": 31.518220901489258, + "learning_rate": 8.592940414507773e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.89775151014328, + "num_tokens": 1558696.0, + "step": 870 + }, + { + "epoch": 0.14104121123795643, + "grad_norm": 29.54262924194336, + "learning_rate": 8.591321243523317e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.9021909236907959, + "num_tokens": 1560485.0, + "step": 871 + }, + { + "epoch": 0.14120314144603677, + "grad_norm": 25.617778778076172, + "learning_rate": 8.589702072538862e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.9239353239536285, + "num_tokens": 1562273.0, + "step": 872 + }, + { + "epoch": 0.14136507165411707, + "grad_norm": 29.190038681030273, + "learning_rate": 8.588082901554406e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.8864051103591919, + "num_tokens": 1564066.0, + "step": 873 + }, + { + "epoch": 0.14152700186219738, + "grad_norm": 27.611671447753906, + "learning_rate": 8.58646373056995e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.8939849734306335, + "num_tokens": 1565851.0, + "step": 874 + }, + { + "epoch": 0.14168893207027772, + "grad_norm": 24.773714065551758, + "learning_rate": 8.584844559585493e-06, + "loss": 0.6696, + "mean_token_accuracy": 0.9076147675514221, + "num_tokens": 1567644.0, + "step": 875 + }, + { + "epoch": 0.14185086227835803, + "grad_norm": 22.59209442138672, + "learning_rate": 8.583225388601038e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.9148893356323242, + "num_tokens": 1569438.0, + "step": 876 + }, + { + "epoch": 0.14201279248643833, + "grad_norm": 25.45064926147461, + "learning_rate": 8.581606217616582e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.9087953269481659, + "num_tokens": 1571224.0, + "step": 877 + }, + { + "epoch": 0.14217472269451867, + "grad_norm": 36.724063873291016, + "learning_rate": 8.579987046632125e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.8841786980628967, + "num_tokens": 1573019.0, + "step": 878 + }, + { + "epoch": 0.14233665290259898, + "grad_norm": 28.60527801513672, + "learning_rate": 8.578367875647669e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.9042792916297913, + "num_tokens": 1574811.0, + "step": 879 + }, + { + "epoch": 0.1424985831106793, + "grad_norm": 31.895648956298828, + "learning_rate": 8.576748704663214e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.8872102200984955, + "num_tokens": 1576597.0, + "step": 880 + }, + { + "epoch": 0.14266051331875962, + "grad_norm": 27.735748291015625, + "learning_rate": 8.575129533678758e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.9045013785362244, + "num_tokens": 1578392.0, + "step": 881 + }, + { + "epoch": 0.14282244352683993, + "grad_norm": 29.527376174926758, + "learning_rate": 8.573510362694301e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.8884373009204865, + "num_tokens": 1580173.0, + "step": 882 + }, + { + "epoch": 0.14298437373492026, + "grad_norm": 30.437082290649414, + "learning_rate": 8.571891191709845e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.8962447047233582, + "num_tokens": 1581965.0, + "step": 883 + }, + { + "epoch": 0.14314630394300057, + "grad_norm": 27.638748168945312, + "learning_rate": 8.57027202072539e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.8946863114833832, + "num_tokens": 1583762.0, + "step": 884 + }, + { + "epoch": 0.14330823415108088, + "grad_norm": 27.933658599853516, + "learning_rate": 8.568652849740934e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.8928240537643433, + "num_tokens": 1585553.0, + "step": 885 + }, + { + "epoch": 0.1434701643591612, + "grad_norm": 38.2045783996582, + "learning_rate": 8.567033678756477e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.8746964335441589, + "num_tokens": 1587352.0, + "step": 886 + }, + { + "epoch": 0.14363209456724152, + "grad_norm": 20.439668655395508, + "learning_rate": 8.565414507772023e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.9223517775535583, + "num_tokens": 1589148.0, + "step": 887 + }, + { + "epoch": 0.14379402477532183, + "grad_norm": 32.88519287109375, + "learning_rate": 8.563795336787566e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.893098384141922, + "num_tokens": 1590931.0, + "step": 888 + }, + { + "epoch": 0.14395595498340216, + "grad_norm": 23.225061416625977, + "learning_rate": 8.56217616580311e-06, + "loss": 0.6568, + "mean_token_accuracy": 0.9044384062290192, + "num_tokens": 1592725.0, + "step": 889 + }, + { + "epoch": 0.14411788519148247, + "grad_norm": 22.49410057067871, + "learning_rate": 8.560556994818653e-06, + "loss": 0.6638, + "mean_token_accuracy": 0.9103226661682129, + "num_tokens": 1594514.0, + "step": 890 + }, + { + "epoch": 0.14427981539956278, + "grad_norm": 26.509963989257812, + "learning_rate": 8.558937823834199e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.8860294222831726, + "num_tokens": 1596298.0, + "step": 891 + }, + { + "epoch": 0.1444417456076431, + "grad_norm": 26.991771697998047, + "learning_rate": 8.557318652849742e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.8896606862545013, + "num_tokens": 1598102.0, + "step": 892 + }, + { + "epoch": 0.14460367581572342, + "grad_norm": 26.024194717407227, + "learning_rate": 8.555699481865286e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.8957327306270599, + "num_tokens": 1599898.0, + "step": 893 + }, + { + "epoch": 0.14476560602380373, + "grad_norm": 30.685579299926758, + "learning_rate": 8.55408031088083e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.8886958956718445, + "num_tokens": 1601698.0, + "step": 894 + }, + { + "epoch": 0.14492753623188406, + "grad_norm": 28.975326538085938, + "learning_rate": 8.552461139896375e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.8850038945674896, + "num_tokens": 1603497.0, + "step": 895 + }, + { + "epoch": 0.14508946643996437, + "grad_norm": 32.98832321166992, + "learning_rate": 8.550841968911918e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.8881553113460541, + "num_tokens": 1605295.0, + "step": 896 + }, + { + "epoch": 0.1452513966480447, + "grad_norm": 26.0952205657959, + "learning_rate": 8.549222797927462e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.8973332643508911, + "num_tokens": 1607079.0, + "step": 897 + }, + { + "epoch": 0.145413326856125, + "grad_norm": 24.03562355041504, + "learning_rate": 8.547603626943006e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.9059873819351196, + "num_tokens": 1608867.0, + "step": 898 + }, + { + "epoch": 0.14557525706420532, + "grad_norm": 29.828283309936523, + "learning_rate": 8.54598445595855e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.8997685015201569, + "num_tokens": 1610658.0, + "step": 899 + }, + { + "epoch": 0.14573718727228566, + "grad_norm": 26.028789520263672, + "learning_rate": 8.544365284974094e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.8958526253700256, + "num_tokens": 1612449.0, + "step": 900 + }, + { + "epoch": 0.14589911748036596, + "grad_norm": 24.521835327148438, + "learning_rate": 8.542746113989638e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.8971920311450958, + "num_tokens": 1614243.0, + "step": 901 + }, + { + "epoch": 0.14606104768844627, + "grad_norm": 23.212135314941406, + "learning_rate": 8.541126943005182e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.9056142568588257, + "num_tokens": 1616041.0, + "step": 902 + }, + { + "epoch": 0.1462229778965266, + "grad_norm": 33.281070709228516, + "learning_rate": 8.539507772020727e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.8838293552398682, + "num_tokens": 1617837.0, + "step": 903 + }, + { + "epoch": 0.14638490810460691, + "grad_norm": 27.358675003051758, + "learning_rate": 8.53788860103627e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.9034389555454254, + "num_tokens": 1619629.0, + "step": 904 + }, + { + "epoch": 0.14654683831268722, + "grad_norm": 27.35979461669922, + "learning_rate": 8.536269430051814e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.8901889026165009, + "num_tokens": 1621414.0, + "step": 905 + }, + { + "epoch": 0.14670876852076756, + "grad_norm": 30.20436668395996, + "learning_rate": 8.53465025906736e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.9110654890537262, + "num_tokens": 1623207.0, + "step": 906 + }, + { + "epoch": 0.14687069872884786, + "grad_norm": 27.33577537536621, + "learning_rate": 8.533031088082903e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.8921325206756592, + "num_tokens": 1624997.0, + "step": 907 + }, + { + "epoch": 0.14703262893692817, + "grad_norm": 27.49772071838379, + "learning_rate": 8.531411917098447e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.9044117629528046, + "num_tokens": 1626781.0, + "step": 908 + }, + { + "epoch": 0.1471945591450085, + "grad_norm": 26.854997634887695, + "learning_rate": 8.52979274611399e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.9193868935108185, + "num_tokens": 1628566.0, + "step": 909 + }, + { + "epoch": 0.14735648935308882, + "grad_norm": 24.952070236206055, + "learning_rate": 8.528173575129535e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.9098546504974365, + "num_tokens": 1630366.0, + "step": 910 + }, + { + "epoch": 0.14751841956116912, + "grad_norm": 28.105072021484375, + "learning_rate": 8.526554404145079e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.88413867354393, + "num_tokens": 1632154.0, + "step": 911 + }, + { + "epoch": 0.14768034976924946, + "grad_norm": 28.145858764648438, + "learning_rate": 8.524935233160623e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.8932623863220215, + "num_tokens": 1633947.0, + "step": 912 + }, + { + "epoch": 0.14784227997732977, + "grad_norm": 20.32073974609375, + "learning_rate": 8.523316062176166e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.9208264350891113, + "num_tokens": 1635737.0, + "step": 913 + }, + { + "epoch": 0.1480042101854101, + "grad_norm": 26.286941528320312, + "learning_rate": 8.521696891191711e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.893796980381012, + "num_tokens": 1637522.0, + "step": 914 + }, + { + "epoch": 0.1481661403934904, + "grad_norm": 28.174116134643555, + "learning_rate": 8.520077720207255e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.8952020108699799, + "num_tokens": 1639301.0, + "step": 915 + }, + { + "epoch": 0.14832807060157072, + "grad_norm": 22.744596481323242, + "learning_rate": 8.518458549222799e-06, + "loss": 0.5981, + "mean_token_accuracy": 0.9134517908096313, + "num_tokens": 1641101.0, + "step": 916 + }, + { + "epoch": 0.14849000080965105, + "grad_norm": 25.569793701171875, + "learning_rate": 8.516839378238342e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.9087750613689423, + "num_tokens": 1642897.0, + "step": 917 + }, + { + "epoch": 0.14865193101773136, + "grad_norm": 27.837949752807617, + "learning_rate": 8.515220207253887e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.9123667478561401, + "num_tokens": 1644683.0, + "step": 918 + }, + { + "epoch": 0.14881386122581167, + "grad_norm": 24.422086715698242, + "learning_rate": 8.513601036269431e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.9062043726444244, + "num_tokens": 1646472.0, + "step": 919 + }, + { + "epoch": 0.148975791433892, + "grad_norm": 23.904264450073242, + "learning_rate": 8.511981865284975e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.9138985574245453, + "num_tokens": 1648251.0, + "step": 920 + }, + { + "epoch": 0.1491377216419723, + "grad_norm": 29.145042419433594, + "learning_rate": 8.510362694300518e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.8869743347167969, + "num_tokens": 1650046.0, + "step": 921 + }, + { + "epoch": 0.14929965185005262, + "grad_norm": 29.6810302734375, + "learning_rate": 8.508743523316064e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.8966501653194427, + "num_tokens": 1651848.0, + "step": 922 + }, + { + "epoch": 0.14946158205813295, + "grad_norm": 29.418107986450195, + "learning_rate": 8.507124352331607e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.8968901038169861, + "num_tokens": 1653642.0, + "step": 923 + }, + { + "epoch": 0.14962351226621326, + "grad_norm": 25.847246170043945, + "learning_rate": 8.50550518134715e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.9053932428359985, + "num_tokens": 1655429.0, + "step": 924 + }, + { + "epoch": 0.14978544247429357, + "grad_norm": 26.78089714050293, + "learning_rate": 8.503886010362696e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.8966714143753052, + "num_tokens": 1657222.0, + "step": 925 + }, + { + "epoch": 0.1499473726823739, + "grad_norm": 25.565675735473633, + "learning_rate": 8.50226683937824e-06, + "loss": 0.744, + "mean_token_accuracy": 0.9042780995368958, + "num_tokens": 1659004.0, + "step": 926 + }, + { + "epoch": 0.1501093028904542, + "grad_norm": 32.19752502441406, + "learning_rate": 8.500647668393783e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.8999948501586914, + "num_tokens": 1660795.0, + "step": 927 + }, + { + "epoch": 0.15027123309853452, + "grad_norm": 30.858156204223633, + "learning_rate": 8.499028497409327e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.9026522934436798, + "num_tokens": 1662594.0, + "step": 928 + }, + { + "epoch": 0.15043316330661485, + "grad_norm": 23.60885238647461, + "learning_rate": 8.497409326424872e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.9205905199050903, + "num_tokens": 1664384.0, + "step": 929 + }, + { + "epoch": 0.15059509351469516, + "grad_norm": 23.985355377197266, + "learning_rate": 8.495790155440416e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.9112013578414917, + "num_tokens": 1666166.0, + "step": 930 + }, + { + "epoch": 0.1507570237227755, + "grad_norm": 31.914487838745117, + "learning_rate": 8.49417098445596e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.9006539285182953, + "num_tokens": 1667960.0, + "step": 931 + }, + { + "epoch": 0.1509189539308558, + "grad_norm": 24.270599365234375, + "learning_rate": 8.492551813471503e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.9157386422157288, + "num_tokens": 1669757.0, + "step": 932 + }, + { + "epoch": 0.1510808841389361, + "grad_norm": 27.624757766723633, + "learning_rate": 8.490932642487048e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.9020979106426239, + "num_tokens": 1671542.0, + "step": 933 + }, + { + "epoch": 0.15124281434701645, + "grad_norm": 19.0823974609375, + "learning_rate": 8.489313471502592e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.9097693562507629, + "num_tokens": 1673331.0, + "step": 934 + }, + { + "epoch": 0.15140474455509675, + "grad_norm": 34.84246063232422, + "learning_rate": 8.487694300518135e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.8904704749584198, + "num_tokens": 1675126.0, + "step": 935 + }, + { + "epoch": 0.15156667476317706, + "grad_norm": 21.22995948791504, + "learning_rate": 8.486075129533679e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.916402131319046, + "num_tokens": 1676913.0, + "step": 936 + }, + { + "epoch": 0.1517286049712574, + "grad_norm": 28.371997833251953, + "learning_rate": 8.484455958549224e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.8929268419742584, + "num_tokens": 1678715.0, + "step": 937 + }, + { + "epoch": 0.1518905351793377, + "grad_norm": 31.93210220336914, + "learning_rate": 8.482836787564768e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.8849896490573883, + "num_tokens": 1680505.0, + "step": 938 + }, + { + "epoch": 0.152052465387418, + "grad_norm": 17.640134811401367, + "learning_rate": 8.481217616580311e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.9218370020389557, + "num_tokens": 1682286.0, + "step": 939 + }, + { + "epoch": 0.15221439559549835, + "grad_norm": 34.61970520019531, + "learning_rate": 8.479598445595855e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.8887709677219391, + "num_tokens": 1684086.0, + "step": 940 + }, + { + "epoch": 0.15237632580357865, + "grad_norm": 22.304615020751953, + "learning_rate": 8.4779792746114e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.9135932624340057, + "num_tokens": 1685887.0, + "step": 941 + }, + { + "epoch": 0.15253825601165896, + "grad_norm": 30.35696029663086, + "learning_rate": 8.476360103626944e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.9139194190502167, + "num_tokens": 1687689.0, + "step": 942 + }, + { + "epoch": 0.1527001862197393, + "grad_norm": 26.467973709106445, + "learning_rate": 8.474740932642487e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9063776135444641, + "num_tokens": 1689488.0, + "step": 943 + }, + { + "epoch": 0.1528621164278196, + "grad_norm": 34.51942825317383, + "learning_rate": 8.473121761658033e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.8851426243782043, + "num_tokens": 1691295.0, + "step": 944 + }, + { + "epoch": 0.15302404663589994, + "grad_norm": 28.47652244567871, + "learning_rate": 8.471502590673576e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.8971630930900574, + "num_tokens": 1693089.0, + "step": 945 + }, + { + "epoch": 0.15318597684398025, + "grad_norm": 27.88567352294922, + "learning_rate": 8.46988341968912e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.9126865565776825, + "num_tokens": 1694875.0, + "step": 946 + }, + { + "epoch": 0.15334790705206056, + "grad_norm": 25.989086151123047, + "learning_rate": 8.468264248704663e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.9117217361927032, + "num_tokens": 1696659.0, + "step": 947 + }, + { + "epoch": 0.1535098372601409, + "grad_norm": 28.449325561523438, + "learning_rate": 8.466645077720209e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.8941029608249664, + "num_tokens": 1698454.0, + "step": 948 + }, + { + "epoch": 0.1536717674682212, + "grad_norm": 30.008527755737305, + "learning_rate": 8.465025906735752e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.9015873074531555, + "num_tokens": 1700250.0, + "step": 949 + }, + { + "epoch": 0.1538336976763015, + "grad_norm": 25.61617660522461, + "learning_rate": 8.463406735751296e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.9069034159183502, + "num_tokens": 1702031.0, + "step": 950 + }, + { + "epoch": 0.15399562788438184, + "grad_norm": 32.07202911376953, + "learning_rate": 8.46178756476684e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.8929268419742584, + "num_tokens": 1703833.0, + "step": 951 + }, + { + "epoch": 0.15415755809246215, + "grad_norm": 28.24432373046875, + "learning_rate": 8.460168393782385e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.8913426399230957, + "num_tokens": 1705631.0, + "step": 952 + }, + { + "epoch": 0.15431948830054246, + "grad_norm": 26.870410919189453, + "learning_rate": 8.458549222797928e-06, + "loss": 0.834, + "mean_token_accuracy": 0.9130023717880249, + "num_tokens": 1707419.0, + "step": 953 + }, + { + "epoch": 0.1544814185086228, + "grad_norm": 30.178091049194336, + "learning_rate": 8.456930051813472e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.8924460411071777, + "num_tokens": 1709210.0, + "step": 954 + }, + { + "epoch": 0.1546433487167031, + "grad_norm": 30.722068786621094, + "learning_rate": 8.455310880829016e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.89723339676857, + "num_tokens": 1711004.0, + "step": 955 + }, + { + "epoch": 0.1548052789247834, + "grad_norm": 27.30564308166504, + "learning_rate": 8.45369170984456e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.9052592515945435, + "num_tokens": 1712800.0, + "step": 956 + }, + { + "epoch": 0.15496720913286374, + "grad_norm": 21.477548599243164, + "learning_rate": 8.452072538860104e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.9039260447025299, + "num_tokens": 1714593.0, + "step": 957 + }, + { + "epoch": 0.15512913934094405, + "grad_norm": 23.72429084777832, + "learning_rate": 8.450453367875648e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.9007092118263245, + "num_tokens": 1716387.0, + "step": 958 + }, + { + "epoch": 0.15529106954902436, + "grad_norm": 17.59723663330078, + "learning_rate": 8.448834196891193e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.9219769835472107, + "num_tokens": 1718168.0, + "step": 959 + }, + { + "epoch": 0.1554529997571047, + "grad_norm": 25.647672653198242, + "learning_rate": 8.447215025906737e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.8756858706474304, + "num_tokens": 1719962.0, + "step": 960 + }, + { + "epoch": 0.155614929965185, + "grad_norm": 25.639387130737305, + "learning_rate": 8.44559585492228e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.9118537902832031, + "num_tokens": 1721746.0, + "step": 961 + }, + { + "epoch": 0.15577686017326534, + "grad_norm": 34.39677810668945, + "learning_rate": 8.443976683937824e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.8847903907299042, + "num_tokens": 1723553.0, + "step": 962 + }, + { + "epoch": 0.15593879038134564, + "grad_norm": 24.351181030273438, + "learning_rate": 8.44235751295337e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.9055944085121155, + "num_tokens": 1725351.0, + "step": 963 + }, + { + "epoch": 0.15610072058942595, + "grad_norm": 33.06178665161133, + "learning_rate": 8.440738341968913e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.8937968611717224, + "num_tokens": 1727145.0, + "step": 964 + }, + { + "epoch": 0.1562626507975063, + "grad_norm": 24.48506736755371, + "learning_rate": 8.439119170984457e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 1728934.0, + "step": 965 + }, + { + "epoch": 0.1564245810055866, + "grad_norm": 24.079999923706055, + "learning_rate": 8.4375e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.9053837954998016, + "num_tokens": 1730720.0, + "step": 966 + }, + { + "epoch": 0.1565865112136669, + "grad_norm": 28.945960998535156, + "learning_rate": 8.435880829015545e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.9163067042827606, + "num_tokens": 1732506.0, + "step": 967 + }, + { + "epoch": 0.15674844142174724, + "grad_norm": 29.045324325561523, + "learning_rate": 8.434261658031089e-06, + "loss": 0.778, + "mean_token_accuracy": 0.8924980163574219, + "num_tokens": 1734294.0, + "step": 968 + }, + { + "epoch": 0.15691037162982754, + "grad_norm": 24.221921920776367, + "learning_rate": 8.432642487046633e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.9034090936183929, + "num_tokens": 1736082.0, + "step": 969 + }, + { + "epoch": 0.15707230183790785, + "grad_norm": 26.38144874572754, + "learning_rate": 8.431023316062176e-06, + "loss": 0.729, + "mean_token_accuracy": 0.9141156673431396, + "num_tokens": 1737873.0, + "step": 970 + }, + { + "epoch": 0.1572342320459882, + "grad_norm": 20.605619430541992, + "learning_rate": 8.429404145077721e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.925168365240097, + "num_tokens": 1739652.0, + "step": 971 + }, + { + "epoch": 0.1573961622540685, + "grad_norm": 28.105173110961914, + "learning_rate": 8.427784974093265e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.89723339676857, + "num_tokens": 1741446.0, + "step": 972 + }, + { + "epoch": 0.1575580924621488, + "grad_norm": 29.532089233398438, + "learning_rate": 8.426165803108809e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.9031603336334229, + "num_tokens": 1743237.0, + "step": 973 + }, + { + "epoch": 0.15772002267022914, + "grad_norm": 36.528770446777344, + "learning_rate": 8.424546632124352e-06, + "loss": 1.1743, + "mean_token_accuracy": 0.8727026879787445, + "num_tokens": 1745047.0, + "step": 974 + }, + { + "epoch": 0.15788195287830945, + "grad_norm": 28.37747573852539, + "learning_rate": 8.422927461139897e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.9088670015335083, + "num_tokens": 1746844.0, + "step": 975 + }, + { + "epoch": 0.15804388308638975, + "grad_norm": 33.5258674621582, + "learning_rate": 8.421308290155441e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.880923718214035, + "num_tokens": 1748650.0, + "step": 976 + }, + { + "epoch": 0.1582058132944701, + "grad_norm": 24.976877212524414, + "learning_rate": 8.419689119170985e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.9019651114940643, + "num_tokens": 1750447.0, + "step": 977 + }, + { + "epoch": 0.1583677435025504, + "grad_norm": 29.21996307373047, + "learning_rate": 8.41806994818653e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.8776244223117828, + "num_tokens": 1752245.0, + "step": 978 + }, + { + "epoch": 0.15852967371063073, + "grad_norm": 25.497209548950195, + "learning_rate": 8.416450777202074e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.9057921469211578, + "num_tokens": 1754033.0, + "step": 979 + }, + { + "epoch": 0.15869160391871104, + "grad_norm": 25.13727569580078, + "learning_rate": 8.414831606217617e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.9112793803215027, + "num_tokens": 1755827.0, + "step": 980 + }, + { + "epoch": 0.15885353412679135, + "grad_norm": 32.218650817871094, + "learning_rate": 8.41321243523316e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.9064554274082184, + "num_tokens": 1757617.0, + "step": 981 + }, + { + "epoch": 0.15901546433487168, + "grad_norm": 23.252197265625, + "learning_rate": 8.411593264248706e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.9141464829444885, + "num_tokens": 1759397.0, + "step": 982 + }, + { + "epoch": 0.159177394542952, + "grad_norm": 33.847816467285156, + "learning_rate": 8.40997409326425e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.8996093273162842, + "num_tokens": 1761188.0, + "step": 983 + }, + { + "epoch": 0.1593393247510323, + "grad_norm": 26.308055877685547, + "learning_rate": 8.408354922279793e-06, + "loss": 0.6681, + "mean_token_accuracy": 0.9154391586780548, + "num_tokens": 1762984.0, + "step": 984 + }, + { + "epoch": 0.15950125495911263, + "grad_norm": 23.857769012451172, + "learning_rate": 8.406735751295337e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.9074297845363617, + "num_tokens": 1764766.0, + "step": 985 + }, + { + "epoch": 0.15966318516719294, + "grad_norm": 29.106836318969727, + "learning_rate": 8.405116580310882e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.8929536640644073, + "num_tokens": 1766566.0, + "step": 986 + }, + { + "epoch": 0.15982511537527325, + "grad_norm": 21.778444290161133, + "learning_rate": 8.403497409326426e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.9108070731163025, + "num_tokens": 1768347.0, + "step": 987 + }, + { + "epoch": 0.15998704558335358, + "grad_norm": 25.258974075317383, + "learning_rate": 8.40187823834197e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.9010291695594788, + "num_tokens": 1770142.0, + "step": 988 + }, + { + "epoch": 0.1601489757914339, + "grad_norm": 28.957754135131836, + "learning_rate": 8.400259067357513e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.8993730545043945, + "num_tokens": 1771942.0, + "step": 989 + }, + { + "epoch": 0.1603109059995142, + "grad_norm": 34.045379638671875, + "learning_rate": 8.398639896373058e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.8682598173618317, + "num_tokens": 1773734.0, + "step": 990 + }, + { + "epoch": 0.16047283620759453, + "grad_norm": 29.821964263916016, + "learning_rate": 8.397020725388602e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.8944444358348846, + "num_tokens": 1775530.0, + "step": 991 + }, + { + "epoch": 0.16063476641567484, + "grad_norm": 23.459064483642578, + "learning_rate": 8.395401554404145e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.9061990678310394, + "num_tokens": 1777318.0, + "step": 992 + }, + { + "epoch": 0.16079669662375518, + "grad_norm": 23.6264705657959, + "learning_rate": 8.393782383419689e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.906389594078064, + "num_tokens": 1779096.0, + "step": 993 + }, + { + "epoch": 0.16095862683183548, + "grad_norm": 35.015262603759766, + "learning_rate": 8.392163212435234e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.8844228684902191, + "num_tokens": 1780885.0, + "step": 994 + }, + { + "epoch": 0.1611205570399158, + "grad_norm": 34.203609466552734, + "learning_rate": 8.390544041450778e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.881540060043335, + "num_tokens": 1782675.0, + "step": 995 + }, + { + "epoch": 0.16128248724799613, + "grad_norm": 22.607555389404297, + "learning_rate": 8.388924870466321e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.9079106450080872, + "num_tokens": 1784469.0, + "step": 996 + }, + { + "epoch": 0.16144441745607643, + "grad_norm": 27.346338272094727, + "learning_rate": 8.387305699481867e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.919334203004837, + "num_tokens": 1786266.0, + "step": 997 + }, + { + "epoch": 0.16160634766415674, + "grad_norm": 26.722373962402344, + "learning_rate": 8.38568652849741e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.8961466252803802, + "num_tokens": 1788047.0, + "step": 998 + }, + { + "epoch": 0.16176827787223708, + "grad_norm": 32.32159423828125, + "learning_rate": 8.384067357512954e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.895600438117981, + "num_tokens": 1789837.0, + "step": 999 + }, + { + "epoch": 0.16193020808031738, + "grad_norm": 32.08845520019531, + "learning_rate": 8.382448186528497e-06, + "loss": 0.891, + "mean_token_accuracy": 0.8835561871528625, + "num_tokens": 1791641.0, + "step": 1000 + }, + { + "epoch": 0.1620921382883977, + "grad_norm": 27.835010528564453, + "learning_rate": 8.380829015544043e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.8854427933692932, + "num_tokens": 1793441.0, + "step": 1001 + }, + { + "epoch": 0.16225406849647803, + "grad_norm": 22.59184455871582, + "learning_rate": 8.379209844559586e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.910397082567215, + "num_tokens": 1795230.0, + "step": 1002 + }, + { + "epoch": 0.16241599870455833, + "grad_norm": 16.92840003967285, + "learning_rate": 8.37759067357513e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.9187537133693695, + "num_tokens": 1797014.0, + "step": 1003 + }, + { + "epoch": 0.16257792891263864, + "grad_norm": 27.888029098510742, + "learning_rate": 8.375971502590673e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.9067992568016052, + "num_tokens": 1798805.0, + "step": 1004 + }, + { + "epoch": 0.16273985912071898, + "grad_norm": 29.029632568359375, + "learning_rate": 8.374352331606219e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.8823952972888947, + "num_tokens": 1800606.0, + "step": 1005 + }, + { + "epoch": 0.16290178932879928, + "grad_norm": 16.296430587768555, + "learning_rate": 8.372733160621762e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.9232524335384369, + "num_tokens": 1802391.0, + "step": 1006 + }, + { + "epoch": 0.1630637195368796, + "grad_norm": 30.251874923706055, + "learning_rate": 8.371113989637306e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.8787719011306763, + "num_tokens": 1804192.0, + "step": 1007 + }, + { + "epoch": 0.16322564974495993, + "grad_norm": 26.44344139099121, + "learning_rate": 8.36949481865285e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.9017285704612732, + "num_tokens": 1805978.0, + "step": 1008 + }, + { + "epoch": 0.16338757995304024, + "grad_norm": 28.625341415405273, + "learning_rate": 8.367875647668395e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.8992329835891724, + "num_tokens": 1807779.0, + "step": 1009 + }, + { + "epoch": 0.16354951016112057, + "grad_norm": 26.60886573791504, + "learning_rate": 8.366256476683938e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.9022475481033325, + "num_tokens": 1809567.0, + "step": 1010 + }, + { + "epoch": 0.16371144036920088, + "grad_norm": 21.12505340576172, + "learning_rate": 8.364637305699482e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.9216565489768982, + "num_tokens": 1811360.0, + "step": 1011 + }, + { + "epoch": 0.16387337057728119, + "grad_norm": 33.39212417602539, + "learning_rate": 8.363018134715026e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.8789272010326385, + "num_tokens": 1813161.0, + "step": 1012 + }, + { + "epoch": 0.16403530078536152, + "grad_norm": 28.788116455078125, + "learning_rate": 8.36139896373057e-06, + "loss": 0.84, + "mean_token_accuracy": 0.8966071605682373, + "num_tokens": 1814962.0, + "step": 1013 + }, + { + "epoch": 0.16419723099344183, + "grad_norm": 22.292428970336914, + "learning_rate": 8.359779792746114e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.9087591171264648, + "num_tokens": 1816748.0, + "step": 1014 + }, + { + "epoch": 0.16435916120152214, + "grad_norm": 29.944517135620117, + "learning_rate": 8.358160621761658e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.8854166567325592, + "num_tokens": 1818539.0, + "step": 1015 + }, + { + "epoch": 0.16452109140960247, + "grad_norm": 18.673097610473633, + "learning_rate": 8.356541450777203e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.9097744226455688, + "num_tokens": 1820317.0, + "step": 1016 + }, + { + "epoch": 0.16468302161768278, + "grad_norm": 19.84192657470703, + "learning_rate": 8.354922279792747e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.9169273376464844, + "num_tokens": 1822106.0, + "step": 1017 + }, + { + "epoch": 0.1648449518257631, + "grad_norm": 27.13695526123047, + "learning_rate": 8.35330310880829e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.9027547836303711, + "num_tokens": 1823896.0, + "step": 1018 + }, + { + "epoch": 0.16500688203384342, + "grad_norm": 23.334693908691406, + "learning_rate": 8.351683937823834e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.9060479700565338, + "num_tokens": 1825685.0, + "step": 1019 + }, + { + "epoch": 0.16516881224192373, + "grad_norm": 18.940563201904297, + "learning_rate": 8.35006476683938e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.9340487122535706, + "num_tokens": 1827485.0, + "step": 1020 + }, + { + "epoch": 0.16533074245000404, + "grad_norm": 35.45963668823242, + "learning_rate": 8.348445595854923e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.8938707709312439, + "num_tokens": 1829279.0, + "step": 1021 + }, + { + "epoch": 0.16549267265808437, + "grad_norm": 20.04072380065918, + "learning_rate": 8.346826424870467e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 1831064.0, + "step": 1022 + }, + { + "epoch": 0.16565460286616468, + "grad_norm": 29.936227798461914, + "learning_rate": 8.34520725388601e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.905802845954895, + "num_tokens": 1832860.0, + "step": 1023 + }, + { + "epoch": 0.165816533074245, + "grad_norm": 24.985944747924805, + "learning_rate": 8.343588082901555e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.9070305228233337, + "num_tokens": 1834651.0, + "step": 1024 + }, + { + "epoch": 0.16597846328232532, + "grad_norm": 28.883068084716797, + "learning_rate": 8.341968911917099e-06, + "loss": 0.781, + "mean_token_accuracy": 0.9091353714466095, + "num_tokens": 1836449.0, + "step": 1025 + }, + { + "epoch": 0.16614039349040563, + "grad_norm": 27.938495635986328, + "learning_rate": 8.340349740932643e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.9004205167293549, + "num_tokens": 1838242.0, + "step": 1026 + }, + { + "epoch": 0.16630232369848597, + "grad_norm": 23.80014419555664, + "learning_rate": 8.338730569948186e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.9061383306980133, + "num_tokens": 1840030.0, + "step": 1027 + }, + { + "epoch": 0.16646425390656627, + "grad_norm": 36.050506591796875, + "learning_rate": 8.337111398963731e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.8878731727600098, + "num_tokens": 1841836.0, + "step": 1028 + }, + { + "epoch": 0.16662618411464658, + "grad_norm": 23.888402938842773, + "learning_rate": 8.335492227979275e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.9055112302303314, + "num_tokens": 1843633.0, + "step": 1029 + }, + { + "epoch": 0.16678811432272692, + "grad_norm": 19.903118133544922, + "learning_rate": 8.333873056994819e-06, + "loss": 0.608, + "mean_token_accuracy": 0.9089947044849396, + "num_tokens": 1845420.0, + "step": 1030 + }, + { + "epoch": 0.16695004453080722, + "grad_norm": 26.717721939086914, + "learning_rate": 8.332253886010362e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.897081196308136, + "num_tokens": 1847213.0, + "step": 1031 + }, + { + "epoch": 0.16711197473888753, + "grad_norm": 29.56040382385254, + "learning_rate": 8.330634715025908e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.900211364030838, + "num_tokens": 1849007.0, + "step": 1032 + }, + { + "epoch": 0.16727390494696787, + "grad_norm": 26.610042572021484, + "learning_rate": 8.329015544041451e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.9204458296298981, + "num_tokens": 1850808.0, + "step": 1033 + }, + { + "epoch": 0.16743583515504817, + "grad_norm": 31.945354461669922, + "learning_rate": 8.327396373056995e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.8885893523693085, + "num_tokens": 1852607.0, + "step": 1034 + }, + { + "epoch": 0.16759776536312848, + "grad_norm": 29.442951202392578, + "learning_rate": 8.32577720207254e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.9102682769298553, + "num_tokens": 1854398.0, + "step": 1035 + }, + { + "epoch": 0.16775969557120882, + "grad_norm": 27.62604522705078, + "learning_rate": 8.324158031088084e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.8967592716217041, + "num_tokens": 1856181.0, + "step": 1036 + }, + { + "epoch": 0.16792162577928912, + "grad_norm": 25.561246871948242, + "learning_rate": 8.322538860103627e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.9116471707820892, + "num_tokens": 1857976.0, + "step": 1037 + }, + { + "epoch": 0.16808355598736943, + "grad_norm": 19.984628677368164, + "learning_rate": 8.32091968911917e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.9274227619171143, + "num_tokens": 1859763.0, + "step": 1038 + }, + { + "epoch": 0.16824548619544977, + "grad_norm": 30.77582359313965, + "learning_rate": 8.319300518134716e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.8869130909442902, + "num_tokens": 1861558.0, + "step": 1039 + }, + { + "epoch": 0.16840741640353007, + "grad_norm": 30.961000442504883, + "learning_rate": 8.31768134715026e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.8897708058357239, + "num_tokens": 1863350.0, + "step": 1040 + }, + { + "epoch": 0.16856934661161038, + "grad_norm": 24.89232063293457, + "learning_rate": 8.316062176165803e-06, + "loss": 0.658, + "mean_token_accuracy": 0.9166927635669708, + "num_tokens": 1865139.0, + "step": 1041 + }, + { + "epoch": 0.16873127681969072, + "grad_norm": 23.86810874938965, + "learning_rate": 8.314443005181347e-06, + "loss": 0.686, + "mean_token_accuracy": 0.9107033312320709, + "num_tokens": 1866931.0, + "step": 1042 + }, + { + "epoch": 0.16889320702777103, + "grad_norm": 31.158031463623047, + "learning_rate": 8.312823834196892e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.8963263034820557, + "num_tokens": 1868732.0, + "step": 1043 + }, + { + "epoch": 0.16905513723585136, + "grad_norm": 28.18429183959961, + "learning_rate": 8.311204663212436e-06, + "loss": 0.702, + "mean_token_accuracy": 0.8967308700084686, + "num_tokens": 1870543.0, + "step": 1044 + }, + { + "epoch": 0.16921706744393167, + "grad_norm": 30.117345809936523, + "learning_rate": 8.30958549222798e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.8870314955711365, + "num_tokens": 1872338.0, + "step": 1045 + }, + { + "epoch": 0.16937899765201198, + "grad_norm": 27.53786277770996, + "learning_rate": 8.307966321243523e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.8953185379505157, + "num_tokens": 1874127.0, + "step": 1046 + }, + { + "epoch": 0.1695409278600923, + "grad_norm": 25.65018653869629, + "learning_rate": 8.306347150259068e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.904900074005127, + "num_tokens": 1875923.0, + "step": 1047 + }, + { + "epoch": 0.16970285806817262, + "grad_norm": 29.309276580810547, + "learning_rate": 8.304727979274612e-06, + "loss": 0.779, + "mean_token_accuracy": 0.8931216895580292, + "num_tokens": 1877717.0, + "step": 1048 + }, + { + "epoch": 0.16986478827625293, + "grad_norm": 29.517623901367188, + "learning_rate": 8.303108808290155e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.9039066433906555, + "num_tokens": 1879510.0, + "step": 1049 + }, + { + "epoch": 0.17002671848433326, + "grad_norm": 25.233959197998047, + "learning_rate": 8.301489637305699e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.9136288166046143, + "num_tokens": 1881300.0, + "step": 1050 + }, + { + "epoch": 0.17018864869241357, + "grad_norm": 27.839366912841797, + "learning_rate": 8.299870466321244e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.8921431005001068, + "num_tokens": 1883090.0, + "step": 1051 + }, + { + "epoch": 0.17035057890049388, + "grad_norm": 28.8481388092041, + "learning_rate": 8.298251295336788e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.8960882127285004, + "num_tokens": 1884881.0, + "step": 1052 + }, + { + "epoch": 0.1705125091085742, + "grad_norm": 31.49526596069336, + "learning_rate": 8.296632124352331e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.9040741920471191, + "num_tokens": 1886674.0, + "step": 1053 + }, + { + "epoch": 0.17067443931665452, + "grad_norm": 27.991830825805664, + "learning_rate": 8.295012953367877e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.9014880955219269, + "num_tokens": 1888470.0, + "step": 1054 + }, + { + "epoch": 0.17083636952473483, + "grad_norm": 29.720895767211914, + "learning_rate": 8.29339378238342e-06, + "loss": 0.681, + "mean_token_accuracy": 0.8990960717201233, + "num_tokens": 1890258.0, + "step": 1055 + }, + { + "epoch": 0.17099829973281516, + "grad_norm": 26.77786636352539, + "learning_rate": 8.291774611398964e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.9031760692596436, + "num_tokens": 1892049.0, + "step": 1056 + }, + { + "epoch": 0.17116022994089547, + "grad_norm": 15.971278190612793, + "learning_rate": 8.290155440414507e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.9282234609127045, + "num_tokens": 1893839.0, + "step": 1057 + }, + { + "epoch": 0.1713221601489758, + "grad_norm": 26.042957305908203, + "learning_rate": 8.288536269430053e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.8944527804851532, + "num_tokens": 1895634.0, + "step": 1058 + }, + { + "epoch": 0.1714840903570561, + "grad_norm": 22.951515197753906, + "learning_rate": 8.286917098445596e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.912091463804245, + "num_tokens": 1897430.0, + "step": 1059 + }, + { + "epoch": 0.17164602056513642, + "grad_norm": 26.006912231445312, + "learning_rate": 8.28529792746114e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.9104984700679779, + "num_tokens": 1899210.0, + "step": 1060 + }, + { + "epoch": 0.17180795077321676, + "grad_norm": 25.629560470581055, + "learning_rate": 8.283678756476683e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.9049533605575562, + "num_tokens": 1901006.0, + "step": 1061 + }, + { + "epoch": 0.17196988098129706, + "grad_norm": 33.309391021728516, + "learning_rate": 8.282059585492229e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.9005594551563263, + "num_tokens": 1902811.0, + "step": 1062 + }, + { + "epoch": 0.17213181118937737, + "grad_norm": 27.263444900512695, + "learning_rate": 8.280440414507774e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.9054373502731323, + "num_tokens": 1904608.0, + "step": 1063 + }, + { + "epoch": 0.1722937413974577, + "grad_norm": 23.66631507873535, + "learning_rate": 8.278821243523318e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.9117647111415863, + "num_tokens": 1906392.0, + "step": 1064 + }, + { + "epoch": 0.172455671605538, + "grad_norm": 26.183629989624023, + "learning_rate": 8.277202072538861e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.8985491693019867, + "num_tokens": 1908179.0, + "step": 1065 + }, + { + "epoch": 0.17261760181361832, + "grad_norm": 25.572322845458984, + "learning_rate": 8.275582901554405e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.904900074005127, + "num_tokens": 1909975.0, + "step": 1066 + }, + { + "epoch": 0.17277953202169866, + "grad_norm": 24.543785095214844, + "learning_rate": 8.27396373056995e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.9144460260868073, + "num_tokens": 1911756.0, + "step": 1067 + }, + { + "epoch": 0.17294146222977896, + "grad_norm": 34.99859619140625, + "learning_rate": 8.272344559585494e-06, + "loss": 0.74, + "mean_token_accuracy": 0.9032531678676605, + "num_tokens": 1913547.0, + "step": 1068 + }, + { + "epoch": 0.17310339243785927, + "grad_norm": 21.34604263305664, + "learning_rate": 8.270725388601037e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.9059401154518127, + "num_tokens": 1915346.0, + "step": 1069 + }, + { + "epoch": 0.1732653226459396, + "grad_norm": 22.849685668945312, + "learning_rate": 8.269106217616581e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.9062369465827942, + "num_tokens": 1917135.0, + "step": 1070 + }, + { + "epoch": 0.17342725285401991, + "grad_norm": 30.736602783203125, + "learning_rate": 8.267487046632126e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.902877688407898, + "num_tokens": 1918925.0, + "step": 1071 + }, + { + "epoch": 0.17358918306210022, + "grad_norm": 38.222381591796875, + "learning_rate": 8.26586787564767e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.8739229142665863, + "num_tokens": 1920719.0, + "step": 1072 + }, + { + "epoch": 0.17375111327018056, + "grad_norm": 30.508625030517578, + "learning_rate": 8.264248704663213e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.8942881524562836, + "num_tokens": 1922505.0, + "step": 1073 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 27.91756248474121, + "learning_rate": 8.262629533678757e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.9043585360050201, + "num_tokens": 1924303.0, + "step": 1074 + }, + { + "epoch": 0.1740749736863412, + "grad_norm": 29.92466926574707, + "learning_rate": 8.261010362694302e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.8922041058540344, + "num_tokens": 1926093.0, + "step": 1075 + }, + { + "epoch": 0.1742369038944215, + "grad_norm": 21.300933837890625, + "learning_rate": 8.259391191709846e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.9121207296848297, + "num_tokens": 1927879.0, + "step": 1076 + }, + { + "epoch": 0.17439883410250182, + "grad_norm": 24.171720504760742, + "learning_rate": 8.25777202072539e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.9152413010597229, + "num_tokens": 1929662.0, + "step": 1077 + }, + { + "epoch": 0.17456076431058215, + "grad_norm": 30.51509666442871, + "learning_rate": 8.256152849740933e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.9052110016345978, + "num_tokens": 1931448.0, + "step": 1078 + }, + { + "epoch": 0.17472269451866246, + "grad_norm": 25.310985565185547, + "learning_rate": 8.254533678756478e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.9041759967803955, + "num_tokens": 1933244.0, + "step": 1079 + }, + { + "epoch": 0.17488462472674277, + "grad_norm": 29.202320098876953, + "learning_rate": 8.252914507772022e-06, + "loss": 0.685, + "mean_token_accuracy": 0.8978347778320312, + "num_tokens": 1935032.0, + "step": 1080 + }, + { + "epoch": 0.1750465549348231, + "grad_norm": 27.092933654785156, + "learning_rate": 8.251295336787565e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.904411792755127, + "num_tokens": 1936816.0, + "step": 1081 + }, + { + "epoch": 0.1752084851429034, + "grad_norm": 26.014677047729492, + "learning_rate": 8.24967616580311e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.9114806056022644, + "num_tokens": 1938610.0, + "step": 1082 + }, + { + "epoch": 0.17537041535098372, + "grad_norm": 25.77332305908203, + "learning_rate": 8.248056994818654e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.9077786207199097, + "num_tokens": 1940393.0, + "step": 1083 + }, + { + "epoch": 0.17553234555906405, + "grad_norm": 32.63511657714844, + "learning_rate": 8.246437823834198e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.8903985619544983, + "num_tokens": 1942187.0, + "step": 1084 + }, + { + "epoch": 0.17569427576714436, + "grad_norm": 28.944568634033203, + "learning_rate": 8.244818652849741e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.8952281475067139, + "num_tokens": 1943983.0, + "step": 1085 + }, + { + "epoch": 0.17585620597522467, + "grad_norm": 29.824087142944336, + "learning_rate": 8.243199481865287e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.8979211151599884, + "num_tokens": 1945769.0, + "step": 1086 + }, + { + "epoch": 0.176018136183305, + "grad_norm": 27.063369750976562, + "learning_rate": 8.24158031088083e-06, + "loss": 0.741, + "mean_token_accuracy": 0.9059762358665466, + "num_tokens": 1947558.0, + "step": 1087 + }, + { + "epoch": 0.1761800663913853, + "grad_norm": 33.166141510009766, + "learning_rate": 8.239961139896374e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.8994308412075043, + "num_tokens": 1949358.0, + "step": 1088 + }, + { + "epoch": 0.17634199659946562, + "grad_norm": 25.398836135864258, + "learning_rate": 8.238341968911918e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.9055555462837219, + "num_tokens": 1951145.0, + "step": 1089 + }, + { + "epoch": 0.17650392680754595, + "grad_norm": 22.27993392944336, + "learning_rate": 8.236722797927463e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.9088341891765594, + "num_tokens": 1952931.0, + "step": 1090 + }, + { + "epoch": 0.17666585701562626, + "grad_norm": 21.60903549194336, + "learning_rate": 8.235103626943006e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.9133562743663788, + "num_tokens": 1954720.0, + "step": 1091 + }, + { + "epoch": 0.1768277872237066, + "grad_norm": 31.3790283203125, + "learning_rate": 8.23348445595855e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.8852774798870087, + "num_tokens": 1956511.0, + "step": 1092 + }, + { + "epoch": 0.1769897174317869, + "grad_norm": 31.63407325744629, + "learning_rate": 8.231865284974094e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.9042253494262695, + "num_tokens": 1958305.0, + "step": 1093 + }, + { + "epoch": 0.1771516476398672, + "grad_norm": 22.23463249206543, + "learning_rate": 8.230246113989639e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.9158298671245575, + "num_tokens": 1960099.0, + "step": 1094 + }, + { + "epoch": 0.17731357784794755, + "grad_norm": 32.00529098510742, + "learning_rate": 8.228626943005182e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.8856922388076782, + "num_tokens": 1961890.0, + "step": 1095 + }, + { + "epoch": 0.17747550805602785, + "grad_norm": 27.947002410888672, + "learning_rate": 8.227007772020726e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.890306144952774, + "num_tokens": 1963693.0, + "step": 1096 + }, + { + "epoch": 0.17763743826410816, + "grad_norm": 25.10782241821289, + "learning_rate": 8.22538860103627e-06, + "loss": 0.72, + "mean_token_accuracy": 0.9003545939922333, + "num_tokens": 1965486.0, + "step": 1097 + }, + { + "epoch": 0.1777993684721885, + "grad_norm": 29.25543212890625, + "learning_rate": 8.223769430051815e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.8910053074359894, + "num_tokens": 1967273.0, + "step": 1098 + }, + { + "epoch": 0.1779612986802688, + "grad_norm": 28.227872848510742, + "learning_rate": 8.222150259067359e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.9027804732322693, + "num_tokens": 1969072.0, + "step": 1099 + }, + { + "epoch": 0.1781232288883491, + "grad_norm": 29.319116592407227, + "learning_rate": 8.220531088082902e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.8921104967594147, + "num_tokens": 1970872.0, + "step": 1100 + }, + { + "epoch": 0.17828515909642945, + "grad_norm": 31.41961669921875, + "learning_rate": 8.218911917098447e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.8847214579582214, + "num_tokens": 1972679.0, + "step": 1101 + }, + { + "epoch": 0.17844708930450975, + "grad_norm": 28.50559425354004, + "learning_rate": 8.217292746113991e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.8995442986488342, + "num_tokens": 1974469.0, + "step": 1102 + }, + { + "epoch": 0.17860901951259006, + "grad_norm": 16.01573371887207, + "learning_rate": 8.215673575129535e-06, + "loss": 0.524, + "mean_token_accuracy": 0.9239532649517059, + "num_tokens": 1976244.0, + "step": 1103 + }, + { + "epoch": 0.1787709497206704, + "grad_norm": 27.766342163085938, + "learning_rate": 8.214054404145078e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.8985978960990906, + "num_tokens": 1978032.0, + "step": 1104 + }, + { + "epoch": 0.1789328799287507, + "grad_norm": 28.34012222290039, + "learning_rate": 8.212435233160623e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.8863832950592041, + "num_tokens": 1979816.0, + "step": 1105 + }, + { + "epoch": 0.17909481013683104, + "grad_norm": 29.48569107055664, + "learning_rate": 8.210816062176167e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.891687273979187, + "num_tokens": 1981604.0, + "step": 1106 + }, + { + "epoch": 0.17925674034491135, + "grad_norm": 25.87701988220215, + "learning_rate": 8.20919689119171e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.8857634961605072, + "num_tokens": 1983393.0, + "step": 1107 + }, + { + "epoch": 0.17941867055299165, + "grad_norm": 29.637012481689453, + "learning_rate": 8.207577720207254e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.8990444839000702, + "num_tokens": 1985182.0, + "step": 1108 + }, + { + "epoch": 0.179580600761072, + "grad_norm": 26.58379554748535, + "learning_rate": 8.2059585492228e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.9144144356250763, + "num_tokens": 1986986.0, + "step": 1109 + }, + { + "epoch": 0.1797425309691523, + "grad_norm": 17.786781311035156, + "learning_rate": 8.204339378238343e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.9175507426261902, + "num_tokens": 1988777.0, + "step": 1110 + }, + { + "epoch": 0.1799044611772326, + "grad_norm": 26.482398986816406, + "learning_rate": 8.202720207253887e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.9134517908096313, + "num_tokens": 1990577.0, + "step": 1111 + }, + { + "epoch": 0.18006639138531294, + "grad_norm": 22.108814239501953, + "learning_rate": 8.20110103626943e-06, + "loss": 0.6283, + "mean_token_accuracy": 0.9064748287200928, + "num_tokens": 1992367.0, + "step": 1112 + }, + { + "epoch": 0.18022832159339325, + "grad_norm": 29.971628189086914, + "learning_rate": 8.199481865284976e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.9059009552001953, + "num_tokens": 1994155.0, + "step": 1113 + }, + { + "epoch": 0.18039025180147356, + "grad_norm": 28.439516067504883, + "learning_rate": 8.197862694300519e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.8909341096878052, + "num_tokens": 1995942.0, + "step": 1114 + }, + { + "epoch": 0.1805521820095539, + "grad_norm": 39.925140380859375, + "learning_rate": 8.196243523316063e-06, + "loss": 1.1399, + "mean_token_accuracy": 0.8619047701358795, + "num_tokens": 1997751.0, + "step": 1115 + }, + { + "epoch": 0.1807141122176342, + "grad_norm": 28.248197555541992, + "learning_rate": 8.194624352331606e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.88520547747612, + "num_tokens": 1999559.0, + "step": 1116 + }, + { + "epoch": 0.1808760424257145, + "grad_norm": 24.39079475402832, + "learning_rate": 8.193005181347152e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.9162560105323792, + "num_tokens": 2001356.0, + "step": 1117 + }, + { + "epoch": 0.18103797263379484, + "grad_norm": 24.221967697143555, + "learning_rate": 8.191386010362695e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.9056521952152252, + "num_tokens": 2003156.0, + "step": 1118 + }, + { + "epoch": 0.18119990284187515, + "grad_norm": 26.90912437438965, + "learning_rate": 8.189766839378239e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.8972624838352203, + "num_tokens": 2004941.0, + "step": 1119 + }, + { + "epoch": 0.18136183304995546, + "grad_norm": 21.62910270690918, + "learning_rate": 8.188147668393784e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.9056650102138519, + "num_tokens": 2006738.0, + "step": 1120 + }, + { + "epoch": 0.1815237632580358, + "grad_norm": 27.202850341796875, + "learning_rate": 8.186528497409328e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.8865191042423248, + "num_tokens": 2008532.0, + "step": 1121 + }, + { + "epoch": 0.1816856934661161, + "grad_norm": 34.75956344604492, + "learning_rate": 8.184909326424871e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.877687931060791, + "num_tokens": 2010329.0, + "step": 1122 + }, + { + "epoch": 0.18184762367419643, + "grad_norm": 18.468286514282227, + "learning_rate": 8.183290155440415e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.9209109842777252, + "num_tokens": 2012119.0, + "step": 1123 + }, + { + "epoch": 0.18200955388227674, + "grad_norm": 29.60638999938965, + "learning_rate": 8.18167098445596e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.8882653117179871, + "num_tokens": 2013918.0, + "step": 1124 + }, + { + "epoch": 0.18217148409035705, + "grad_norm": 21.280981063842773, + "learning_rate": 8.180051813471504e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.9124059975147247, + "num_tokens": 2015703.0, + "step": 1125 + }, + { + "epoch": 0.18233341429843739, + "grad_norm": 19.67123031616211, + "learning_rate": 8.178432642487047e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.9148550927639008, + "num_tokens": 2017497.0, + "step": 1126 + }, + { + "epoch": 0.1824953445065177, + "grad_norm": 23.823774337768555, + "learning_rate": 8.176813471502591e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.8936701118946075, + "num_tokens": 2019282.0, + "step": 1127 + }, + { + "epoch": 0.182657274714598, + "grad_norm": 24.950214385986328, + "learning_rate": 8.175194300518136e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.91737300157547, + "num_tokens": 2021070.0, + "step": 1128 + }, + { + "epoch": 0.18281920492267834, + "grad_norm": 23.3730525970459, + "learning_rate": 8.17357512953368e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.9131805896759033, + "num_tokens": 2022858.0, + "step": 1129 + }, + { + "epoch": 0.18298113513075864, + "grad_norm": 28.625581741333008, + "learning_rate": 8.171955958549223e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.8903346657752991, + "num_tokens": 2024653.0, + "step": 1130 + }, + { + "epoch": 0.18314306533883895, + "grad_norm": 17.436969757080078, + "learning_rate": 8.170336787564767e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.924433171749115, + "num_tokens": 2026430.0, + "step": 1131 + }, + { + "epoch": 0.18330499554691929, + "grad_norm": 27.786771774291992, + "learning_rate": 8.168717616580312e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.8920877575874329, + "num_tokens": 2028228.0, + "step": 1132 + }, + { + "epoch": 0.1834669257549996, + "grad_norm": 24.240581512451172, + "learning_rate": 8.167098445595856e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.9045454561710358, + "num_tokens": 2030023.0, + "step": 1133 + }, + { + "epoch": 0.1836288559630799, + "grad_norm": 22.884143829345703, + "learning_rate": 8.1654792746114e-06, + "loss": 0.662, + "mean_token_accuracy": 0.9100719392299652, + "num_tokens": 2031813.0, + "step": 1134 + }, + { + "epoch": 0.18379078617116024, + "grad_norm": 21.736400604248047, + "learning_rate": 8.163860103626943e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.9108347296714783, + "num_tokens": 2033594.0, + "step": 1135 + }, + { + "epoch": 0.18395271637924054, + "grad_norm": 17.320335388183594, + "learning_rate": 8.162240932642488e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.9178501665592194, + "num_tokens": 2035374.0, + "step": 1136 + }, + { + "epoch": 0.18411464658732085, + "grad_norm": 22.587438583374023, + "learning_rate": 8.160621761658032e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.9135279059410095, + "num_tokens": 2037175.0, + "step": 1137 + }, + { + "epoch": 0.1842765767954012, + "grad_norm": 26.97759246826172, + "learning_rate": 8.159002590673575e-06, + "loss": 0.738, + "mean_token_accuracy": 0.9051197171211243, + "num_tokens": 2038972.0, + "step": 1138 + }, + { + "epoch": 0.1844385070034815, + "grad_norm": 19.594486236572266, + "learning_rate": 8.15738341968912e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.9057179093360901, + "num_tokens": 2040760.0, + "step": 1139 + }, + { + "epoch": 0.18460043721156183, + "grad_norm": 25.995878219604492, + "learning_rate": 8.155764248704664e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.9039260447025299, + "num_tokens": 2042553.0, + "step": 1140 + }, + { + "epoch": 0.18476236741964214, + "grad_norm": 22.171667098999023, + "learning_rate": 8.154145077720208e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.908413290977478, + "num_tokens": 2044337.0, + "step": 1141 + }, + { + "epoch": 0.18492429762772244, + "grad_norm": 22.2275333404541, + "learning_rate": 8.152525906735751e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.9176007807254791, + "num_tokens": 2046128.0, + "step": 1142 + }, + { + "epoch": 0.18508622783580278, + "grad_norm": 27.31783676147461, + "learning_rate": 8.150906735751297e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.8987409472465515, + "num_tokens": 2047914.0, + "step": 1143 + }, + { + "epoch": 0.1852481580438831, + "grad_norm": 30.983585357666016, + "learning_rate": 8.14928756476684e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.896200567483902, + "num_tokens": 2049712.0, + "step": 1144 + }, + { + "epoch": 0.1854100882519634, + "grad_norm": 25.47585678100586, + "learning_rate": 8.147668393782384e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.8934580087661743, + "num_tokens": 2051505.0, + "step": 1145 + }, + { + "epoch": 0.18557201846004373, + "grad_norm": 21.676660537719727, + "learning_rate": 8.146049222797928e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.917391300201416, + "num_tokens": 2053295.0, + "step": 1146 + }, + { + "epoch": 0.18573394866812404, + "grad_norm": 23.765411376953125, + "learning_rate": 8.144430051813473e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.8993055522441864, + "num_tokens": 2055095.0, + "step": 1147 + }, + { + "epoch": 0.18589587887620435, + "grad_norm": 21.997718811035156, + "learning_rate": 8.142810880829016e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.9146656692028046, + "num_tokens": 2056888.0, + "step": 1148 + }, + { + "epoch": 0.18605780908428468, + "grad_norm": 24.783058166503906, + "learning_rate": 8.14119170984456e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.8886054456233978, + "num_tokens": 2058687.0, + "step": 1149 + }, + { + "epoch": 0.186219739292365, + "grad_norm": 36.05255126953125, + "learning_rate": 8.139572538860104e-06, + "loss": 1.1645, + "mean_token_accuracy": 0.8693121671676636, + "num_tokens": 2060481.0, + "step": 1150 + }, + { + "epoch": 0.1863816695004453, + "grad_norm": 27.821964263916016, + "learning_rate": 8.137953367875649e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.8832116723060608, + "num_tokens": 2062267.0, + "step": 1151 + }, + { + "epoch": 0.18654359970852563, + "grad_norm": 22.230712890625, + "learning_rate": 8.136334196891192e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.9081102907657623, + "num_tokens": 2064051.0, + "step": 1152 + }, + { + "epoch": 0.18670552991660594, + "grad_norm": 24.18702507019043, + "learning_rate": 8.134715025906736e-06, + "loss": 0.736, + "mean_token_accuracy": 0.9047702252864838, + "num_tokens": 2065846.0, + "step": 1153 + }, + { + "epoch": 0.18686746012468625, + "grad_norm": 25.95163345336914, + "learning_rate": 8.13309585492228e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.8866084218025208, + "num_tokens": 2067649.0, + "step": 1154 + }, + { + "epoch": 0.18702939033276658, + "grad_norm": 20.390827178955078, + "learning_rate": 8.131476683937825e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.9115451872348785, + "num_tokens": 2069432.0, + "step": 1155 + }, + { + "epoch": 0.1871913205408469, + "grad_norm": 18.583084106445312, + "learning_rate": 8.129857512953369e-06, + "loss": 0.5579, + "mean_token_accuracy": 0.9198294281959534, + "num_tokens": 2071218.0, + "step": 1156 + }, + { + "epoch": 0.18735325074892722, + "grad_norm": 28.328243255615234, + "learning_rate": 8.128238341968912e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.897201418876648, + "num_tokens": 2073013.0, + "step": 1157 + }, + { + "epoch": 0.18751518095700753, + "grad_norm": 25.317455291748047, + "learning_rate": 8.126619170984457e-06, + "loss": 0.779, + "mean_token_accuracy": 0.8967473804950714, + "num_tokens": 2074806.0, + "step": 1158 + }, + { + "epoch": 0.18767711116508784, + "grad_norm": 24.809972763061523, + "learning_rate": 8.125000000000001e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.8940375447273254, + "num_tokens": 2076610.0, + "step": 1159 + }, + { + "epoch": 0.18783904137316818, + "grad_norm": 20.725688934326172, + "learning_rate": 8.123380829015545e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.9077828824520111, + "num_tokens": 2078393.0, + "step": 1160 + }, + { + "epoch": 0.18800097158124848, + "grad_norm": 19.541519165039062, + "learning_rate": 8.121761658031088e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.9143867790699005, + "num_tokens": 2080173.0, + "step": 1161 + }, + { + "epoch": 0.1881629017893288, + "grad_norm": 21.553709030151367, + "learning_rate": 8.120142487046633e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.9043911099433899, + "num_tokens": 2081957.0, + "step": 1162 + }, + { + "epoch": 0.18832483199740913, + "grad_norm": 25.712669372558594, + "learning_rate": 8.118523316062177e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.8952664136886597, + "num_tokens": 2083746.0, + "step": 1163 + }, + { + "epoch": 0.18848676220548943, + "grad_norm": 15.06070613861084, + "learning_rate": 8.11690414507772e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.9184591770172119, + "num_tokens": 2085528.0, + "step": 1164 + }, + { + "epoch": 0.18864869241356974, + "grad_norm": 33.2811164855957, + "learning_rate": 8.115284974093264e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.8748107850551605, + "num_tokens": 2087331.0, + "step": 1165 + }, + { + "epoch": 0.18881062262165008, + "grad_norm": 28.879222869873047, + "learning_rate": 8.11366580310881e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.8837940394878387, + "num_tokens": 2089135.0, + "step": 1166 + }, + { + "epoch": 0.18897255282973038, + "grad_norm": 18.189077377319336, + "learning_rate": 8.112046632124353e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.9051094651222229, + "num_tokens": 2090921.0, + "step": 1167 + }, + { + "epoch": 0.1891344830378107, + "grad_norm": 25.741928100585938, + "learning_rate": 8.110427461139897e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.9044477939605713, + "num_tokens": 2092716.0, + "step": 1168 + }, + { + "epoch": 0.18929641324589103, + "grad_norm": 24.907745361328125, + "learning_rate": 8.10880829015544e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.9029007852077484, + "num_tokens": 2094506.0, + "step": 1169 + }, + { + "epoch": 0.18945834345397133, + "grad_norm": 26.88646697998047, + "learning_rate": 8.107189119170986e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.9039436280727386, + "num_tokens": 2096299.0, + "step": 1170 + }, + { + "epoch": 0.18962027366205167, + "grad_norm": 16.537071228027344, + "learning_rate": 8.105569948186529e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.9260563254356384, + "num_tokens": 2098095.0, + "step": 1171 + }, + { + "epoch": 0.18978220387013198, + "grad_norm": 24.197826385498047, + "learning_rate": 8.103950777202073e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.8968545496463776, + "num_tokens": 2099887.0, + "step": 1172 + }, + { + "epoch": 0.18994413407821228, + "grad_norm": 24.82110595703125, + "learning_rate": 8.102331606217616e-06, + "loss": 0.6624, + "mean_token_accuracy": 0.9094041585922241, + "num_tokens": 2101674.0, + "step": 1173 + }, + { + "epoch": 0.19010606428629262, + "grad_norm": 23.7404842376709, + "learning_rate": 8.100712435233162e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.9000329375267029, + "num_tokens": 2103456.0, + "step": 1174 + }, + { + "epoch": 0.19026799449437293, + "grad_norm": 28.095535278320312, + "learning_rate": 8.099093264248705e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.8889317512512207, + "num_tokens": 2105256.0, + "step": 1175 + }, + { + "epoch": 0.19042992470245323, + "grad_norm": 24.71026039123535, + "learning_rate": 8.097474093264249e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.9141661822795868, + "num_tokens": 2107048.0, + "step": 1176 + }, + { + "epoch": 0.19059185491053357, + "grad_norm": 26.66474723815918, + "learning_rate": 8.095854922279794e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.8950007855892181, + "num_tokens": 2108836.0, + "step": 1177 + }, + { + "epoch": 0.19075378511861388, + "grad_norm": 34.18571090698242, + "learning_rate": 8.094235751295338e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.8766702711582184, + "num_tokens": 2110646.0, + "step": 1178 + }, + { + "epoch": 0.19091571532669419, + "grad_norm": 21.079694747924805, + "learning_rate": 8.092616580310881e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.909546822309494, + "num_tokens": 2112434.0, + "step": 1179 + }, + { + "epoch": 0.19107764553477452, + "grad_norm": 22.957441329956055, + "learning_rate": 8.090997409326425e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.9119808673858643, + "num_tokens": 2114228.0, + "step": 1180 + }, + { + "epoch": 0.19123957574285483, + "grad_norm": 29.247337341308594, + "learning_rate": 8.08937823834197e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.8916361033916473, + "num_tokens": 2116026.0, + "step": 1181 + }, + { + "epoch": 0.19140150595093514, + "grad_norm": 23.098365783691406, + "learning_rate": 8.087759067357514e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.9030748903751373, + "num_tokens": 2117806.0, + "step": 1182 + }, + { + "epoch": 0.19156343615901547, + "grad_norm": 25.975364685058594, + "learning_rate": 8.086139896373057e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.911080926656723, + "num_tokens": 2119600.0, + "step": 1183 + }, + { + "epoch": 0.19172536636709578, + "grad_norm": 18.41541290283203, + "learning_rate": 8.084520725388601e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.9174720048904419, + "num_tokens": 2121391.0, + "step": 1184 + }, + { + "epoch": 0.1918872965751761, + "grad_norm": 30.505582809448242, + "learning_rate": 8.082901554404146e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.8689639568328857, + "num_tokens": 2123201.0, + "step": 1185 + }, + { + "epoch": 0.19204922678325642, + "grad_norm": 21.6943416595459, + "learning_rate": 8.08128238341969e-06, + "loss": 0.658, + "mean_token_accuracy": 0.906897246837616, + "num_tokens": 2124991.0, + "step": 1186 + }, + { + "epoch": 0.19221115699133673, + "grad_norm": 20.60589027404785, + "learning_rate": 8.079663212435233e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.9109354317188263, + "num_tokens": 2126772.0, + "step": 1187 + }, + { + "epoch": 0.19237308719941706, + "grad_norm": 28.147605895996094, + "learning_rate": 8.078044041450777e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.8932117521762848, + "num_tokens": 2128565.0, + "step": 1188 + }, + { + "epoch": 0.19253501740749737, + "grad_norm": 23.695560455322266, + "learning_rate": 8.076424870466322e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.9094942808151245, + "num_tokens": 2130353.0, + "step": 1189 + }, + { + "epoch": 0.19269694761557768, + "grad_norm": 21.533119201660156, + "learning_rate": 8.074805699481866e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.8966437876224518, + "num_tokens": 2132136.0, + "step": 1190 + }, + { + "epoch": 0.19285887782365801, + "grad_norm": 27.630273818969727, + "learning_rate": 8.07318652849741e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.901408463716507, + "num_tokens": 2133932.0, + "step": 1191 + }, + { + "epoch": 0.19302080803173832, + "grad_norm": 26.547698974609375, + "learning_rate": 8.071567357512955e-06, + "loss": 0.758, + "mean_token_accuracy": 0.8882890045642853, + "num_tokens": 2135721.0, + "step": 1192 + }, + { + "epoch": 0.19318273823981863, + "grad_norm": 27.16016960144043, + "learning_rate": 8.069948186528498e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.9013746976852417, + "num_tokens": 2137507.0, + "step": 1193 + }, + { + "epoch": 0.19334466844789897, + "grad_norm": 25.121185302734375, + "learning_rate": 8.068329015544042e-06, + "loss": 0.745, + "mean_token_accuracy": 0.9100438058376312, + "num_tokens": 2139296.0, + "step": 1194 + }, + { + "epoch": 0.19350659865597927, + "grad_norm": 29.57895278930664, + "learning_rate": 8.066709844559585e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.8931781053543091, + "num_tokens": 2141105.0, + "step": 1195 + }, + { + "epoch": 0.19366852886405958, + "grad_norm": 22.85805320739746, + "learning_rate": 8.06509067357513e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.9187187254428864, + "num_tokens": 2142900.0, + "step": 1196 + }, + { + "epoch": 0.19383045907213992, + "grad_norm": 26.71902847290039, + "learning_rate": 8.063471502590674e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.9058204889297485, + "num_tokens": 2144698.0, + "step": 1197 + }, + { + "epoch": 0.19399238928022022, + "grad_norm": 23.67783546447754, + "learning_rate": 8.061852331606218e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.9172877967357635, + "num_tokens": 2146488.0, + "step": 1198 + }, + { + "epoch": 0.19415431948830053, + "grad_norm": 35.40565490722656, + "learning_rate": 8.060233160621762e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.8943355679512024, + "num_tokens": 2148283.0, + "step": 1199 + }, + { + "epoch": 0.19431624969638087, + "grad_norm": 30.098793029785156, + "learning_rate": 8.058613989637307e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.8916083872318268, + "num_tokens": 2150081.0, + "step": 1200 + }, + { + "epoch": 0.19447817990446117, + "grad_norm": 26.206090927124023, + "learning_rate": 8.05699481865285e-06, + "loss": 0.6646, + "mean_token_accuracy": 0.9059523940086365, + "num_tokens": 2151880.0, + "step": 1201 + }, + { + "epoch": 0.19464011011254148, + "grad_norm": 36.07347106933594, + "learning_rate": 8.055375647668394e-06, + "loss": 0.966, + "mean_token_accuracy": 0.87706458568573, + "num_tokens": 2153684.0, + "step": 1202 + }, + { + "epoch": 0.19480204032062182, + "grad_norm": 25.063003540039062, + "learning_rate": 8.053756476683938e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.9081102907657623, + "num_tokens": 2155468.0, + "step": 1203 + }, + { + "epoch": 0.19496397052870212, + "grad_norm": 28.45714569091797, + "learning_rate": 8.052137305699483e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.8988896012306213, + "num_tokens": 2157257.0, + "step": 1204 + }, + { + "epoch": 0.19512590073678246, + "grad_norm": 30.118324279785156, + "learning_rate": 8.050518134715026e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.8998035788536072, + "num_tokens": 2159058.0, + "step": 1205 + }, + { + "epoch": 0.19528783094486277, + "grad_norm": 15.193723678588867, + "learning_rate": 8.04889896373057e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.929380863904953, + "num_tokens": 2160839.0, + "step": 1206 + }, + { + "epoch": 0.19544976115294307, + "grad_norm": 26.35215950012207, + "learning_rate": 8.047279792746114e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.9021909236907959, + "num_tokens": 2162637.0, + "step": 1207 + }, + { + "epoch": 0.1956116913610234, + "grad_norm": 26.451229095458984, + "learning_rate": 8.045660621761659e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.9034482836723328, + "num_tokens": 2164439.0, + "step": 1208 + }, + { + "epoch": 0.19577362156910372, + "grad_norm": 25.747333526611328, + "learning_rate": 8.044041450777202e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.8984929025173187, + "num_tokens": 2166227.0, + "step": 1209 + }, + { + "epoch": 0.19593555177718402, + "grad_norm": 23.462419509887695, + "learning_rate": 8.042422279792746e-06, + "loss": 0.6088, + "mean_token_accuracy": 0.9098878800868988, + "num_tokens": 2168028.0, + "step": 1210 + }, + { + "epoch": 0.19609748198526436, + "grad_norm": 16.312673568725586, + "learning_rate": 8.040803108808291e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9318181872367859, + "num_tokens": 2169804.0, + "step": 1211 + }, + { + "epoch": 0.19625941219334467, + "grad_norm": 26.479970932006836, + "learning_rate": 8.039183937823835e-06, + "loss": 0.708, + "mean_token_accuracy": 0.901408463716507, + "num_tokens": 2171600.0, + "step": 1212 + }, + { + "epoch": 0.19642134240142498, + "grad_norm": 30.41205406188965, + "learning_rate": 8.037564766839379e-06, + "loss": 0.767, + "mean_token_accuracy": 0.8999904990196228, + "num_tokens": 2173403.0, + "step": 1213 + }, + { + "epoch": 0.1965832726095053, + "grad_norm": 25.26466178894043, + "learning_rate": 8.035945595854922e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.907131016254425, + "num_tokens": 2175193.0, + "step": 1214 + }, + { + "epoch": 0.19674520281758562, + "grad_norm": 21.494007110595703, + "learning_rate": 8.034326424870467e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.9097758233547211, + "num_tokens": 2176982.0, + "step": 1215 + }, + { + "epoch": 0.19690713302566593, + "grad_norm": 25.302734375, + "learning_rate": 8.032707253886011e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.9087297320365906, + "num_tokens": 2178767.0, + "step": 1216 + }, + { + "epoch": 0.19706906323374626, + "grad_norm": 24.172443389892578, + "learning_rate": 8.031088082901555e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.9043652415275574, + "num_tokens": 2180551.0, + "step": 1217 + }, + { + "epoch": 0.19723099344182657, + "grad_norm": 36.29187774658203, + "learning_rate": 8.029468911917098e-06, + "loss": 0.906, + "mean_token_accuracy": 0.8779591619968414, + "num_tokens": 2182348.0, + "step": 1218 + }, + { + "epoch": 0.1973929236499069, + "grad_norm": 23.735004425048828, + "learning_rate": 8.027849740932643e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.8912636637687683, + "num_tokens": 2184145.0, + "step": 1219 + }, + { + "epoch": 0.1975548538579872, + "grad_norm": 30.058345794677734, + "learning_rate": 8.026230569948187e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.8915935754776001, + "num_tokens": 2185942.0, + "step": 1220 + }, + { + "epoch": 0.19771678406606752, + "grad_norm": 23.397602081298828, + "learning_rate": 8.02461139896373e-06, + "loss": 0.722, + "mean_token_accuracy": 0.9078834652900696, + "num_tokens": 2187736.0, + "step": 1221 + }, + { + "epoch": 0.19787871427414785, + "grad_norm": 27.18838882446289, + "learning_rate": 8.022992227979274e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.901430606842041, + "num_tokens": 2189522.0, + "step": 1222 + }, + { + "epoch": 0.19804064448222816, + "grad_norm": 23.468795776367188, + "learning_rate": 8.02137305699482e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.9081010818481445, + "num_tokens": 2191317.0, + "step": 1223 + }, + { + "epoch": 0.19820257469030847, + "grad_norm": 31.06011199951172, + "learning_rate": 8.019753886010363e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.9059420526027679, + "num_tokens": 2193117.0, + "step": 1224 + }, + { + "epoch": 0.1983645048983888, + "grad_norm": 22.679725646972656, + "learning_rate": 8.018134715025907e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.9179934859275818, + "num_tokens": 2194909.0, + "step": 1225 + }, + { + "epoch": 0.1985264351064691, + "grad_norm": 23.59375, + "learning_rate": 8.01651554404145e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.8968297243118286, + "num_tokens": 2196692.0, + "step": 1226 + }, + { + "epoch": 0.19868836531454942, + "grad_norm": 23.235851287841797, + "learning_rate": 8.014896373056996e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.9032624065876007, + "num_tokens": 2198495.0, + "step": 1227 + }, + { + "epoch": 0.19885029552262976, + "grad_norm": 27.12327003479004, + "learning_rate": 8.01327720207254e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.9052418172359467, + "num_tokens": 2200281.0, + "step": 1228 + }, + { + "epoch": 0.19901222573071006, + "grad_norm": 20.638362884521484, + "learning_rate": 8.011658031088083e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.9024864137172699, + "num_tokens": 2202071.0, + "step": 1229 + }, + { + "epoch": 0.19917415593879037, + "grad_norm": 22.3620662689209, + "learning_rate": 8.010038860103628e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.9149899184703827, + "num_tokens": 2203865.0, + "step": 1230 + }, + { + "epoch": 0.1993360861468707, + "grad_norm": 26.976482391357422, + "learning_rate": 8.008419689119172e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.8991561830043793, + "num_tokens": 2205655.0, + "step": 1231 + }, + { + "epoch": 0.199498016354951, + "grad_norm": 29.312589645385742, + "learning_rate": 8.006800518134715e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.8712643682956696, + "num_tokens": 2207462.0, + "step": 1232 + }, + { + "epoch": 0.19965994656303132, + "grad_norm": 25.790000915527344, + "learning_rate": 8.005181347150259e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.9038500487804413, + "num_tokens": 2209255.0, + "step": 1233 + }, + { + "epoch": 0.19982187677111166, + "grad_norm": 26.386384963989258, + "learning_rate": 8.003562176165804e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.9032407402992249, + "num_tokens": 2211046.0, + "step": 1234 + }, + { + "epoch": 0.19998380697919196, + "grad_norm": 31.159879684448242, + "learning_rate": 8.001943005181348e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.8943355679512024, + "num_tokens": 2212841.0, + "step": 1235 + }, + { + "epoch": 0.2001457371872723, + "grad_norm": 31.24150848388672, + "learning_rate": 8.000323834196891e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.8791281580924988, + "num_tokens": 2214633.0, + "step": 1236 + }, + { + "epoch": 0.2003076673953526, + "grad_norm": 16.259525299072266, + "learning_rate": 7.998704663212435e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.9250216782093048, + "num_tokens": 2216425.0, + "step": 1237 + }, + { + "epoch": 0.20046959760343291, + "grad_norm": 26.028379440307617, + "learning_rate": 7.99708549222798e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.8952554762363434, + "num_tokens": 2218214.0, + "step": 1238 + }, + { + "epoch": 0.20063152781151325, + "grad_norm": 30.29619026184082, + "learning_rate": 7.995466321243524e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.9125567078590393, + "num_tokens": 2220011.0, + "step": 1239 + }, + { + "epoch": 0.20079345801959356, + "grad_norm": 21.31903076171875, + "learning_rate": 7.993847150259067e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.9077857732772827, + "num_tokens": 2221794.0, + "step": 1240 + }, + { + "epoch": 0.20095538822767386, + "grad_norm": 23.54694938659668, + "learning_rate": 7.992227979274611e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.9064182341098785, + "num_tokens": 2223584.0, + "step": 1241 + }, + { + "epoch": 0.2011173184357542, + "grad_norm": 31.1170597076416, + "learning_rate": 7.990608808290156e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.8809839189052582, + "num_tokens": 2225387.0, + "step": 1242 + }, + { + "epoch": 0.2012792486438345, + "grad_norm": 24.913646697998047, + "learning_rate": 7.9889896373057e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.9072401821613312, + "num_tokens": 2227179.0, + "step": 1243 + }, + { + "epoch": 0.20144117885191481, + "grad_norm": 27.373817443847656, + "learning_rate": 7.987370466321243e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.9032531678676605, + "num_tokens": 2228970.0, + "step": 1244 + }, + { + "epoch": 0.20160310905999515, + "grad_norm": 16.3076171875, + "learning_rate": 7.985751295336787e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.925621896982193, + "num_tokens": 2230751.0, + "step": 1245 + }, + { + "epoch": 0.20176503926807546, + "grad_norm": 21.297365188598633, + "learning_rate": 7.984132124352332e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.9205517172813416, + "num_tokens": 2232540.0, + "step": 1246 + }, + { + "epoch": 0.20192696947615577, + "grad_norm": 26.522666931152344, + "learning_rate": 7.982512953367876e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.9116956293582916, + "num_tokens": 2234335.0, + "step": 1247 + }, + { + "epoch": 0.2020888996842361, + "grad_norm": 29.628372192382812, + "learning_rate": 7.98089378238342e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.8961202502250671, + "num_tokens": 2236126.0, + "step": 1248 + }, + { + "epoch": 0.2022508298923164, + "grad_norm": 22.819133758544922, + "learning_rate": 7.979274611398965e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.9080882370471954, + "num_tokens": 2237910.0, + "step": 1249 + }, + { + "epoch": 0.20241276010039672, + "grad_norm": 25.56781768798828, + "learning_rate": 7.977655440414508e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.9072797000408173, + "num_tokens": 2239702.0, + "step": 1250 + }, + { + "epoch": 0.20257469030847705, + "grad_norm": 27.90135955810547, + "learning_rate": 7.976036269430052e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.9027777910232544, + "num_tokens": 2241481.0, + "step": 1251 + }, + { + "epoch": 0.20273662051655736, + "grad_norm": 26.237934112548828, + "learning_rate": 7.974417098445595e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.9003292620182037, + "num_tokens": 2243274.0, + "step": 1252 + }, + { + "epoch": 0.2028985507246377, + "grad_norm": 29.386486053466797, + "learning_rate": 7.97279792746114e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.8825460076332092, + "num_tokens": 2245076.0, + "step": 1253 + }, + { + "epoch": 0.203060480932718, + "grad_norm": 19.233985900878906, + "learning_rate": 7.971178756476684e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.921707957983017, + "num_tokens": 2246882.0, + "step": 1254 + }, + { + "epoch": 0.2032224111407983, + "grad_norm": 29.620250701904297, + "learning_rate": 7.969559585492228e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.8967578411102295, + "num_tokens": 2248675.0, + "step": 1255 + }, + { + "epoch": 0.20338434134887864, + "grad_norm": 28.40350341796875, + "learning_rate": 7.967940414507773e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.9082458913326263, + "num_tokens": 2250470.0, + "step": 1256 + }, + { + "epoch": 0.20354627155695895, + "grad_norm": 33.82870864868164, + "learning_rate": 7.966321243523317e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.8770425021648407, + "num_tokens": 2252271.0, + "step": 1257 + }, + { + "epoch": 0.20370820176503926, + "grad_norm": 21.204627990722656, + "learning_rate": 7.964702072538862e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.9131805896759033, + "num_tokens": 2254059.0, + "step": 1258 + }, + { + "epoch": 0.2038701319731196, + "grad_norm": 19.29145622253418, + "learning_rate": 7.963082901554406e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.9252963960170746, + "num_tokens": 2255852.0, + "step": 1259 + }, + { + "epoch": 0.2040320621811999, + "grad_norm": 15.525376319885254, + "learning_rate": 7.96146373056995e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.9204832017421722, + "num_tokens": 2257640.0, + "step": 1260 + }, + { + "epoch": 0.2041939923892802, + "grad_norm": 23.608671188354492, + "learning_rate": 7.959844559585493e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.9208798110485077, + "num_tokens": 2259443.0, + "step": 1261 + }, + { + "epoch": 0.20435592259736055, + "grad_norm": 29.058277130126953, + "learning_rate": 7.958225388601038e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.8783470988273621, + "num_tokens": 2261243.0, + "step": 1262 + }, + { + "epoch": 0.20451785280544085, + "grad_norm": 29.889188766479492, + "learning_rate": 7.956606217616582e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.886211633682251, + "num_tokens": 2263036.0, + "step": 1263 + }, + { + "epoch": 0.20467978301352116, + "grad_norm": 22.74662971496582, + "learning_rate": 7.954987046632125e-06, + "loss": 0.734, + "mean_token_accuracy": 0.905089259147644, + "num_tokens": 2264822.0, + "step": 1264 + }, + { + "epoch": 0.2048417132216015, + "grad_norm": 24.45282745361328, + "learning_rate": 7.953367875647669e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.9092390239238739, + "num_tokens": 2266610.0, + "step": 1265 + }, + { + "epoch": 0.2050036434296818, + "grad_norm": 22.241226196289062, + "learning_rate": 7.951748704663214e-06, + "loss": 0.6192, + "mean_token_accuracy": 0.9075387418270111, + "num_tokens": 2268403.0, + "step": 1266 + }, + { + "epoch": 0.2051655736377621, + "grad_norm": 21.609891891479492, + "learning_rate": 7.950129533678758e-06, + "loss": 0.6528, + "mean_token_accuracy": 0.9169946312904358, + "num_tokens": 2270193.0, + "step": 1267 + }, + { + "epoch": 0.20532750384584245, + "grad_norm": 27.25162696838379, + "learning_rate": 7.948510362694301e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.9016619026660919, + "num_tokens": 2271980.0, + "step": 1268 + }, + { + "epoch": 0.20548943405392275, + "grad_norm": 22.569896697998047, + "learning_rate": 7.946891191709845e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.9053504168987274, + "num_tokens": 2273776.0, + "step": 1269 + }, + { + "epoch": 0.2056513642620031, + "grad_norm": 18.745363235473633, + "learning_rate": 7.94527202072539e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.9148510098457336, + "num_tokens": 2275558.0, + "step": 1270 + }, + { + "epoch": 0.2058132944700834, + "grad_norm": 24.38509750366211, + "learning_rate": 7.943652849740934e-06, + "loss": 0.694, + "mean_token_accuracy": 0.9044477939605713, + "num_tokens": 2277353.0, + "step": 1271 + }, + { + "epoch": 0.2059752246781637, + "grad_norm": 26.728092193603516, + "learning_rate": 7.942033678756477e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.9018024206161499, + "num_tokens": 2279150.0, + "step": 1272 + }, + { + "epoch": 0.20613715488624404, + "grad_norm": 29.9021053314209, + "learning_rate": 7.940414507772021e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.8949199318885803, + "num_tokens": 2280948.0, + "step": 1273 + }, + { + "epoch": 0.20629908509432435, + "grad_norm": 24.022016525268555, + "learning_rate": 7.938795336787566e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.8990746736526489, + "num_tokens": 2282757.0, + "step": 1274 + }, + { + "epoch": 0.20646101530240465, + "grad_norm": 20.628925323486328, + "learning_rate": 7.93717616580311e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.9062825739383698, + "num_tokens": 2284546.0, + "step": 1275 + }, + { + "epoch": 0.206622945510485, + "grad_norm": 22.150596618652344, + "learning_rate": 7.935556994818653e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.9142856895923615, + "num_tokens": 2286338.0, + "step": 1276 + }, + { + "epoch": 0.2067848757185653, + "grad_norm": 22.98813819885254, + "learning_rate": 7.933937823834199e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.9059523940086365, + "num_tokens": 2288137.0, + "step": 1277 + }, + { + "epoch": 0.2069468059266456, + "grad_norm": 25.81568717956543, + "learning_rate": 7.932318652849742e-06, + "loss": 0.762, + "mean_token_accuracy": 0.8968901038169861, + "num_tokens": 2289931.0, + "step": 1278 + }, + { + "epoch": 0.20710873613472594, + "grad_norm": 24.10573959350586, + "learning_rate": 7.930699481865286e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.9096696376800537, + "num_tokens": 2291731.0, + "step": 1279 + }, + { + "epoch": 0.20727066634280625, + "grad_norm": 22.798585891723633, + "learning_rate": 7.92908031088083e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.8974413573741913, + "num_tokens": 2293517.0, + "step": 1280 + }, + { + "epoch": 0.20743259655088656, + "grad_norm": 25.903820037841797, + "learning_rate": 7.927461139896375e-06, + "loss": 0.728, + "mean_token_accuracy": 0.9001071751117706, + "num_tokens": 2295309.0, + "step": 1281 + }, + { + "epoch": 0.2075945267589669, + "grad_norm": 27.341861724853516, + "learning_rate": 7.925841968911918e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.9090731143951416, + "num_tokens": 2297107.0, + "step": 1282 + }, + { + "epoch": 0.2077564569670472, + "grad_norm": 15.652795791625977, + "learning_rate": 7.924222797927462e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.9236221313476562, + "num_tokens": 2298894.0, + "step": 1283 + }, + { + "epoch": 0.20791838717512753, + "grad_norm": 36.49123764038086, + "learning_rate": 7.922603626943006e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.9106518626213074, + "num_tokens": 2300697.0, + "step": 1284 + }, + { + "epoch": 0.20808031738320784, + "grad_norm": 22.572795867919922, + "learning_rate": 7.920984455958551e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.9148550927639008, + "num_tokens": 2302479.0, + "step": 1285 + }, + { + "epoch": 0.20824224759128815, + "grad_norm": 26.504154205322266, + "learning_rate": 7.919365284974094e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.904900074005127, + "num_tokens": 2304264.0, + "step": 1286 + }, + { + "epoch": 0.20840417779936848, + "grad_norm": 23.916187286376953, + "learning_rate": 7.917746113989638e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.9089095592498779, + "num_tokens": 2306061.0, + "step": 1287 + }, + { + "epoch": 0.2085661080074488, + "grad_norm": 25.13608741760254, + "learning_rate": 7.916126943005182e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.905089259147644, + "num_tokens": 2307847.0, + "step": 1288 + }, + { + "epoch": 0.2087280382155291, + "grad_norm": 20.095993041992188, + "learning_rate": 7.914507772020727e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.913159966468811, + "num_tokens": 2309635.0, + "step": 1289 + }, + { + "epoch": 0.20888996842360943, + "grad_norm": 25.800636291503906, + "learning_rate": 7.91288860103627e-06, + "loss": 0.687, + "mean_token_accuracy": 0.9041857421398163, + "num_tokens": 2311429.0, + "step": 1290 + }, + { + "epoch": 0.20905189863168974, + "grad_norm": 21.290725708007812, + "learning_rate": 7.911269430051814e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.9139508605003357, + "num_tokens": 2313220.0, + "step": 1291 + }, + { + "epoch": 0.20921382883977005, + "grad_norm": 31.271337509155273, + "learning_rate": 7.909650259067358e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.8817271590232849, + "num_tokens": 2315026.0, + "step": 1292 + }, + { + "epoch": 0.20937575904785038, + "grad_norm": 25.913667678833008, + "learning_rate": 7.908031088082903e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.9037662148475647, + "num_tokens": 2316819.0, + "step": 1293 + }, + { + "epoch": 0.2095376892559307, + "grad_norm": 33.95661544799805, + "learning_rate": 7.906411917098447e-06, + "loss": 0.926, + "mean_token_accuracy": 0.8770685493946075, + "num_tokens": 2318607.0, + "step": 1294 + }, + { + "epoch": 0.209699619464011, + "grad_norm": 22.09990692138672, + "learning_rate": 7.90479274611399e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.908687949180603, + "num_tokens": 2320404.0, + "step": 1295 + }, + { + "epoch": 0.20986154967209134, + "grad_norm": 26.62694549560547, + "learning_rate": 7.903173575129535e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.9004559218883514, + "num_tokens": 2322197.0, + "step": 1296 + }, + { + "epoch": 0.21002347988017164, + "grad_norm": 31.434181213378906, + "learning_rate": 7.901554404145079e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.8788608312606812, + "num_tokens": 2323981.0, + "step": 1297 + }, + { + "epoch": 0.21018541008825195, + "grad_norm": 27.5379581451416, + "learning_rate": 7.899935233160623e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.8913753032684326, + "num_tokens": 2325786.0, + "step": 1298 + }, + { + "epoch": 0.21034734029633229, + "grad_norm": 15.636994361877441, + "learning_rate": 7.898316062176166e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.9166475236415863, + "num_tokens": 2327562.0, + "step": 1299 + }, + { + "epoch": 0.2105092705044126, + "grad_norm": 19.72308921813965, + "learning_rate": 7.896696891191711e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.9030748903751373, + "num_tokens": 2329353.0, + "step": 1300 + }, + { + "epoch": 0.21067120071249293, + "grad_norm": 18.04261016845703, + "learning_rate": 7.895077720207255e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.9189277589321136, + "num_tokens": 2331136.0, + "step": 1301 + }, + { + "epoch": 0.21083313092057324, + "grad_norm": 23.98662757873535, + "learning_rate": 7.893458549222799e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.8980670273303986, + "num_tokens": 2332942.0, + "step": 1302 + }, + { + "epoch": 0.21099506112865354, + "grad_norm": 18.525197982788086, + "learning_rate": 7.891839378238342e-06, + "loss": 0.5254, + "mean_token_accuracy": 0.9305733144283295, + "num_tokens": 2334728.0, + "step": 1303 + }, + { + "epoch": 0.21115699133673388, + "grad_norm": 16.44396209716797, + "learning_rate": 7.890220207253888e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.9205682873725891, + "num_tokens": 2336517.0, + "step": 1304 + }, + { + "epoch": 0.2113189215448142, + "grad_norm": 19.87498664855957, + "learning_rate": 7.888601036269431e-06, + "loss": 0.5981, + "mean_token_accuracy": 0.9141328632831573, + "num_tokens": 2338321.0, + "step": 1305 + }, + { + "epoch": 0.2114808517528945, + "grad_norm": 22.984481811523438, + "learning_rate": 7.886981865284975e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.9144723415374756, + "num_tokens": 2340113.0, + "step": 1306 + }, + { + "epoch": 0.21164278196097483, + "grad_norm": 28.53995704650879, + "learning_rate": 7.885362694300518e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.8944272994995117, + "num_tokens": 2341897.0, + "step": 1307 + }, + { + "epoch": 0.21180471216905514, + "grad_norm": 13.00845718383789, + "learning_rate": 7.883743523316064e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.9289297759532928, + "num_tokens": 2343690.0, + "step": 1308 + }, + { + "epoch": 0.21196664237713544, + "grad_norm": 20.79195213317871, + "learning_rate": 7.882124352331607e-06, + "loss": 0.641, + "mean_token_accuracy": 0.9079380929470062, + "num_tokens": 2345472.0, + "step": 1309 + }, + { + "epoch": 0.21212857258521578, + "grad_norm": 27.601594924926758, + "learning_rate": 7.88050518134715e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.89560467004776, + "num_tokens": 2347271.0, + "step": 1310 + }, + { + "epoch": 0.2122905027932961, + "grad_norm": 21.75929832458496, + "learning_rate": 7.878886010362694e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.9186292290687561, + "num_tokens": 2349065.0, + "step": 1311 + }, + { + "epoch": 0.2124524330013764, + "grad_norm": 24.202491760253906, + "learning_rate": 7.87726683937824e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.9069488048553467, + "num_tokens": 2350854.0, + "step": 1312 + }, + { + "epoch": 0.21261436320945673, + "grad_norm": 26.516664505004883, + "learning_rate": 7.875647668393783e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.8975427448749542, + "num_tokens": 2352640.0, + "step": 1313 + }, + { + "epoch": 0.21277629341753704, + "grad_norm": 29.516916275024414, + "learning_rate": 7.874028497409327e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.9042788743972778, + "num_tokens": 2354434.0, + "step": 1314 + }, + { + "epoch": 0.21293822362561735, + "grad_norm": 25.39187240600586, + "learning_rate": 7.872409326424872e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.8921815752983093, + "num_tokens": 2356225.0, + "step": 1315 + }, + { + "epoch": 0.21310015383369768, + "grad_norm": 27.50579833984375, + "learning_rate": 7.870790155440416e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.9000816643238068, + "num_tokens": 2358017.0, + "step": 1316 + }, + { + "epoch": 0.213262084041778, + "grad_norm": 26.496875762939453, + "learning_rate": 7.86917098445596e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.8958283066749573, + "num_tokens": 2359817.0, + "step": 1317 + }, + { + "epoch": 0.21342401424985832, + "grad_norm": 25.31161117553711, + "learning_rate": 7.867551813471503e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.8931970596313477, + "num_tokens": 2361601.0, + "step": 1318 + }, + { + "epoch": 0.21358594445793863, + "grad_norm": 25.23062515258789, + "learning_rate": 7.865932642487048e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.9007633626461029, + "num_tokens": 2363375.0, + "step": 1319 + }, + { + "epoch": 0.21374787466601894, + "grad_norm": 21.320762634277344, + "learning_rate": 7.864313471502592e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.9121415317058563, + "num_tokens": 2365172.0, + "step": 1320 + }, + { + "epoch": 0.21390980487409927, + "grad_norm": 26.12787628173828, + "learning_rate": 7.862694300518135e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.9006058275699615, + "num_tokens": 2366956.0, + "step": 1321 + }, + { + "epoch": 0.21407173508217958, + "grad_norm": 23.60651206970215, + "learning_rate": 7.861075129533679e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.9182727932929993, + "num_tokens": 2368762.0, + "step": 1322 + }, + { + "epoch": 0.2142336652902599, + "grad_norm": 24.133113861083984, + "learning_rate": 7.859455958549224e-06, + "loss": 0.6514, + "mean_token_accuracy": 0.9066033959388733, + "num_tokens": 2370563.0, + "step": 1323 + }, + { + "epoch": 0.21439559549834022, + "grad_norm": 23.532142639160156, + "learning_rate": 7.857836787564768e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.8986014127731323, + "num_tokens": 2372361.0, + "step": 1324 + }, + { + "epoch": 0.21455752570642053, + "grad_norm": 21.116100311279297, + "learning_rate": 7.856217616580311e-06, + "loss": 0.608, + "mean_token_accuracy": 0.9245029091835022, + "num_tokens": 2374151.0, + "step": 1325 + }, + { + "epoch": 0.21471945591450084, + "grad_norm": 28.5745792388916, + "learning_rate": 7.854598445595855e-06, + "loss": 0.756, + "mean_token_accuracy": 0.8998612463474274, + "num_tokens": 2375942.0, + "step": 1326 + }, + { + "epoch": 0.21488138612258117, + "grad_norm": 19.671836853027344, + "learning_rate": 7.8529792746114e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.8993055522441864, + "num_tokens": 2377730.0, + "step": 1327 + }, + { + "epoch": 0.21504331633066148, + "grad_norm": 26.114513397216797, + "learning_rate": 7.851360103626944e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.9104059636592865, + "num_tokens": 2379521.0, + "step": 1328 + }, + { + "epoch": 0.2152052465387418, + "grad_norm": 23.38545799255371, + "learning_rate": 7.849740932642487e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.903900682926178, + "num_tokens": 2381314.0, + "step": 1329 + }, + { + "epoch": 0.21536717674682213, + "grad_norm": 23.690956115722656, + "learning_rate": 7.848121761658031e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.9098140597343445, + "num_tokens": 2383115.0, + "step": 1330 + }, + { + "epoch": 0.21552910695490243, + "grad_norm": 25.65019416809082, + "learning_rate": 7.846502590673576e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.8888489007949829, + "num_tokens": 2384906.0, + "step": 1331 + }, + { + "epoch": 0.21569103716298277, + "grad_norm": 24.10000991821289, + "learning_rate": 7.84488341968912e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.9091245234012604, + "num_tokens": 2386703.0, + "step": 1332 + }, + { + "epoch": 0.21585296737106308, + "grad_norm": 21.570556640625, + "learning_rate": 7.843264248704663e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.9097693562507629, + "num_tokens": 2388492.0, + "step": 1333 + }, + { + "epoch": 0.21601489757914338, + "grad_norm": 20.557296752929688, + "learning_rate": 7.841645077720209e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.906470000743866, + "num_tokens": 2390282.0, + "step": 1334 + }, + { + "epoch": 0.21617682778722372, + "grad_norm": 31.200876235961914, + "learning_rate": 7.840025906735752e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.8734805285930634, + "num_tokens": 2392078.0, + "step": 1335 + }, + { + "epoch": 0.21633875799530403, + "grad_norm": 23.863723754882812, + "learning_rate": 7.838406735751296e-06, + "loss": 0.781, + "mean_token_accuracy": 0.894610196352005, + "num_tokens": 2393865.0, + "step": 1336 + }, + { + "epoch": 0.21650068820338433, + "grad_norm": 24.848957061767578, + "learning_rate": 7.83678756476684e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.9015037715435028, + "num_tokens": 2395650.0, + "step": 1337 + }, + { + "epoch": 0.21666261841146467, + "grad_norm": 32.46807098388672, + "learning_rate": 7.835168393782385e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.8708485960960388, + "num_tokens": 2397448.0, + "step": 1338 + }, + { + "epoch": 0.21682454861954498, + "grad_norm": 28.673524856567383, + "learning_rate": 7.833549222797928e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.8889916837215424, + "num_tokens": 2399239.0, + "step": 1339 + }, + { + "epoch": 0.21698647882762528, + "grad_norm": 20.588809967041016, + "learning_rate": 7.831930051813472e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9190376698970795, + "num_tokens": 2401035.0, + "step": 1340 + }, + { + "epoch": 0.21714840903570562, + "grad_norm": 22.57328987121582, + "learning_rate": 7.830310880829016e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.9100719392299652, + "num_tokens": 2402825.0, + "step": 1341 + }, + { + "epoch": 0.21731033924378593, + "grad_norm": 27.874149322509766, + "learning_rate": 7.828691709844561e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.8975524306297302, + "num_tokens": 2404620.0, + "step": 1342 + }, + { + "epoch": 0.21747226945186623, + "grad_norm": 18.996997833251953, + "learning_rate": 7.827072538860104e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.9154095649719238, + "num_tokens": 2406404.0, + "step": 1343 + }, + { + "epoch": 0.21763419965994657, + "grad_norm": 29.685951232910156, + "learning_rate": 7.825453367875648e-06, + "loss": 1.1648, + "mean_token_accuracy": 0.8810355961322784, + "num_tokens": 2408209.0, + "step": 1344 + }, + { + "epoch": 0.21779612986802688, + "grad_norm": 22.576805114746094, + "learning_rate": 7.823834196891192e-06, + "loss": 0.659, + "mean_token_accuracy": 0.913159966468811, + "num_tokens": 2409997.0, + "step": 1345 + }, + { + "epoch": 0.21795806007610719, + "grad_norm": 25.6392765045166, + "learning_rate": 7.822215025906737e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.9051474332809448, + "num_tokens": 2411792.0, + "step": 1346 + }, + { + "epoch": 0.21811999028418752, + "grad_norm": 24.15930938720703, + "learning_rate": 7.82059585492228e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.9027804732322693, + "num_tokens": 2413591.0, + "step": 1347 + }, + { + "epoch": 0.21828192049226783, + "grad_norm": 23.67548370361328, + "learning_rate": 7.818976683937824e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.8874224722385406, + "num_tokens": 2415387.0, + "step": 1348 + }, + { + "epoch": 0.21844385070034816, + "grad_norm": 18.736164093017578, + "learning_rate": 7.817357512953368e-06, + "loss": 0.591, + "mean_token_accuracy": 0.920550525188446, + "num_tokens": 2417176.0, + "step": 1349 + }, + { + "epoch": 0.21860578090842847, + "grad_norm": 18.090635299682617, + "learning_rate": 7.815738341968913e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.9181873500347137, + "num_tokens": 2418957.0, + "step": 1350 + }, + { + "epoch": 0.21876771111650878, + "grad_norm": 22.60944938659668, + "learning_rate": 7.814119170984457e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.9181602001190186, + "num_tokens": 2420738.0, + "step": 1351 + }, + { + "epoch": 0.2189296413245891, + "grad_norm": 23.433828353881836, + "learning_rate": 7.8125e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.9080882370471954, + "num_tokens": 2422522.0, + "step": 1352 + }, + { + "epoch": 0.21909157153266942, + "grad_norm": 21.873239517211914, + "learning_rate": 7.810880829015545e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.9010319709777832, + "num_tokens": 2424316.0, + "step": 1353 + }, + { + "epoch": 0.21925350174074973, + "grad_norm": 24.422439575195312, + "learning_rate": 7.809261658031089e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.9116041362285614, + "num_tokens": 2426112.0, + "step": 1354 + }, + { + "epoch": 0.21941543194883006, + "grad_norm": 19.918426513671875, + "learning_rate": 7.807642487046633e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.9134254455566406, + "num_tokens": 2427901.0, + "step": 1355 + }, + { + "epoch": 0.21957736215691037, + "grad_norm": 28.144012451171875, + "learning_rate": 7.806023316062176e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.8800454437732697, + "num_tokens": 2429701.0, + "step": 1356 + }, + { + "epoch": 0.21973929236499068, + "grad_norm": 24.157426834106445, + "learning_rate": 7.804404145077721e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.9004964828491211, + "num_tokens": 2431494.0, + "step": 1357 + }, + { + "epoch": 0.21990122257307101, + "grad_norm": 19.312145233154297, + "learning_rate": 7.802784974093265e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.9266602396965027, + "num_tokens": 2433279.0, + "step": 1358 + }, + { + "epoch": 0.22006315278115132, + "grad_norm": 20.21554946899414, + "learning_rate": 7.801165803108809e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.924761027097702, + "num_tokens": 2435070.0, + "step": 1359 + }, + { + "epoch": 0.22022508298923163, + "grad_norm": 24.232200622558594, + "learning_rate": 7.799546632124352e-06, + "loss": 0.799, + "mean_token_accuracy": 0.8945572972297668, + "num_tokens": 2436857.0, + "step": 1360 + }, + { + "epoch": 0.22038701319731197, + "grad_norm": 30.001169204711914, + "learning_rate": 7.797927461139898e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.8915297389030457, + "num_tokens": 2438655.0, + "step": 1361 + }, + { + "epoch": 0.22054894340539227, + "grad_norm": 28.108449935913086, + "learning_rate": 7.796308290155441e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.8862321972846985, + "num_tokens": 2440459.0, + "step": 1362 + }, + { + "epoch": 0.22071087361347258, + "grad_norm": 26.04998207092285, + "learning_rate": 7.794689119170985e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.8954448401927948, + "num_tokens": 2442258.0, + "step": 1363 + }, + { + "epoch": 0.22087280382155292, + "grad_norm": 21.462345123291016, + "learning_rate": 7.793069948186528e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.9084370732307434, + "num_tokens": 2444043.0, + "step": 1364 + }, + { + "epoch": 0.22103473402963322, + "grad_norm": 18.6356258392334, + "learning_rate": 7.791450777202074e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.9255318939685822, + "num_tokens": 2445837.0, + "step": 1365 + }, + { + "epoch": 0.22119666423771356, + "grad_norm": 24.375577926635742, + "learning_rate": 7.789831606217617e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.8933209776878357, + "num_tokens": 2447628.0, + "step": 1366 + }, + { + "epoch": 0.22135859444579387, + "grad_norm": 20.464832305908203, + "learning_rate": 7.78821243523316e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.9079841077327728, + "num_tokens": 2449422.0, + "step": 1367 + }, + { + "epoch": 0.22152052465387417, + "grad_norm": 19.878673553466797, + "learning_rate": 7.786593264248704e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.9013539850711823, + "num_tokens": 2451207.0, + "step": 1368 + }, + { + "epoch": 0.2216824548619545, + "grad_norm": 17.676183700561523, + "learning_rate": 7.78497409326425e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.9192405343055725, + "num_tokens": 2452991.0, + "step": 1369 + }, + { + "epoch": 0.22184438507003482, + "grad_norm": 22.46243667602539, + "learning_rate": 7.783354922279793e-06, + "loss": 0.736, + "mean_token_accuracy": 0.9061065912246704, + "num_tokens": 2454780.0, + "step": 1370 + }, + { + "epoch": 0.22200631527811512, + "grad_norm": 19.763105392456055, + "learning_rate": 7.781735751295337e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.9067688584327698, + "num_tokens": 2456572.0, + "step": 1371 + }, + { + "epoch": 0.22216824548619546, + "grad_norm": 27.176179885864258, + "learning_rate": 7.780116580310882e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.9007352888584137, + "num_tokens": 2458356.0, + "step": 1372 + }, + { + "epoch": 0.22233017569427577, + "grad_norm": 19.885753631591797, + "learning_rate": 7.778497409326426e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.9202898740768433, + "num_tokens": 2460144.0, + "step": 1373 + }, + { + "epoch": 0.22249210590235607, + "grad_norm": 25.12118911743164, + "learning_rate": 7.77687823834197e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.8936170041561127, + "num_tokens": 2461938.0, + "step": 1374 + }, + { + "epoch": 0.2226540361104364, + "grad_norm": 24.92267417907715, + "learning_rate": 7.775259067357513e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.8918404579162598, + "num_tokens": 2463727.0, + "step": 1375 + }, + { + "epoch": 0.22281596631851672, + "grad_norm": 20.28502082824707, + "learning_rate": 7.773639896373058e-06, + "loss": 0.6384, + "mean_token_accuracy": 0.919762909412384, + "num_tokens": 2465514.0, + "step": 1376 + }, + { + "epoch": 0.22297789652659702, + "grad_norm": 29.23663330078125, + "learning_rate": 7.772020725388602e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.8926453292369843, + "num_tokens": 2467313.0, + "step": 1377 + }, + { + "epoch": 0.22313982673467736, + "grad_norm": 25.073808670043945, + "learning_rate": 7.770401554404145e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.8994308412075043, + "num_tokens": 2469113.0, + "step": 1378 + }, + { + "epoch": 0.22330175694275767, + "grad_norm": 27.79206085205078, + "learning_rate": 7.768782383419689e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.8808779120445251, + "num_tokens": 2470909.0, + "step": 1379 + }, + { + "epoch": 0.22346368715083798, + "grad_norm": 25.242015838623047, + "learning_rate": 7.767163212435234e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.9044065773487091, + "num_tokens": 2472693.0, + "step": 1380 + }, + { + "epoch": 0.2236256173589183, + "grad_norm": 24.53001594543457, + "learning_rate": 7.765544041450778e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.9142078757286072, + "num_tokens": 2474484.0, + "step": 1381 + }, + { + "epoch": 0.22378754756699862, + "grad_norm": 20.68842887878418, + "learning_rate": 7.763924870466321e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.9108880758285522, + "num_tokens": 2476265.0, + "step": 1382 + }, + { + "epoch": 0.22394947777507895, + "grad_norm": 23.500314712524414, + "learning_rate": 7.762305699481865e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.9125318229198456, + "num_tokens": 2478052.0, + "step": 1383 + }, + { + "epoch": 0.22411140798315926, + "grad_norm": 28.060489654541016, + "learning_rate": 7.76068652849741e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.9050595164299011, + "num_tokens": 2479848.0, + "step": 1384 + }, + { + "epoch": 0.22427333819123957, + "grad_norm": 24.152925491333008, + "learning_rate": 7.759067357512954e-06, + "loss": 0.758, + "mean_token_accuracy": 0.9088743031024933, + "num_tokens": 2481635.0, + "step": 1385 + }, + { + "epoch": 0.2244352683993199, + "grad_norm": 24.893659591674805, + "learning_rate": 7.757448186528497e-06, + "loss": 0.714, + "mean_token_accuracy": 0.9049295783042908, + "num_tokens": 2483431.0, + "step": 1386 + }, + { + "epoch": 0.2245971986074002, + "grad_norm": 24.605436325073242, + "learning_rate": 7.755829015544041e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.9062369465827942, + "num_tokens": 2485220.0, + "step": 1387 + }, + { + "epoch": 0.22475912881548052, + "grad_norm": 16.942914962768555, + "learning_rate": 7.754209844559586e-06, + "loss": 0.535, + "mean_token_accuracy": 0.9213924705982208, + "num_tokens": 2487012.0, + "step": 1388 + }, + { + "epoch": 0.22492105902356085, + "grad_norm": 29.950647354125977, + "learning_rate": 7.75259067357513e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.8884143531322479, + "num_tokens": 2488802.0, + "step": 1389 + }, + { + "epoch": 0.22508298923164116, + "grad_norm": 22.724166870117188, + "learning_rate": 7.750971502590674e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.9154545664787292, + "num_tokens": 2490596.0, + "step": 1390 + }, + { + "epoch": 0.22524491943972147, + "grad_norm": 23.553930282592773, + "learning_rate": 7.749352331606219e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.9071327447891235, + "num_tokens": 2492399.0, + "step": 1391 + }, + { + "epoch": 0.2254068496478018, + "grad_norm": 15.872113227844238, + "learning_rate": 7.747733160621762e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.9289695024490356, + "num_tokens": 2494192.0, + "step": 1392 + }, + { + "epoch": 0.2255687798558821, + "grad_norm": 24.228591918945312, + "learning_rate": 7.746113989637306e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.8924717307090759, + "num_tokens": 2495983.0, + "step": 1393 + }, + { + "epoch": 0.22573071006396242, + "grad_norm": 19.374597549438477, + "learning_rate": 7.74449481865285e-06, + "loss": 0.666, + "mean_token_accuracy": 0.9094203114509583, + "num_tokens": 2497771.0, + "step": 1394 + }, + { + "epoch": 0.22589264027204276, + "grad_norm": 24.548786163330078, + "learning_rate": 7.742875647668395e-06, + "loss": 0.733, + "mean_token_accuracy": 0.8859421610832214, + "num_tokens": 2499555.0, + "step": 1395 + }, + { + "epoch": 0.22605457048012306, + "grad_norm": 23.346195220947266, + "learning_rate": 7.741256476683938e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.8956018388271332, + "num_tokens": 2501346.0, + "step": 1396 + }, + { + "epoch": 0.2262165006882034, + "grad_norm": 33.626216888427734, + "learning_rate": 7.739637305699482e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.8953339755535126, + "num_tokens": 2503154.0, + "step": 1397 + }, + { + "epoch": 0.2263784308962837, + "grad_norm": 18.985727310180664, + "learning_rate": 7.738018134715026e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.9245029091835022, + "num_tokens": 2504944.0, + "step": 1398 + }, + { + "epoch": 0.226540361104364, + "grad_norm": 25.161592483520508, + "learning_rate": 7.736398963730571e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.9025547206401825, + "num_tokens": 2506733.0, + "step": 1399 + }, + { + "epoch": 0.22670229131244435, + "grad_norm": 19.465166091918945, + "learning_rate": 7.734779792746114e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.9239541888237, + "num_tokens": 2508521.0, + "step": 1400 + }, + { + "epoch": 0.22686422152052466, + "grad_norm": 21.66914176940918, + "learning_rate": 7.733160621761658e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.9044117629528046, + "num_tokens": 2510305.0, + "step": 1401 + }, + { + "epoch": 0.22702615172860496, + "grad_norm": 28.106422424316406, + "learning_rate": 7.731541450777202e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.8937198221683502, + "num_tokens": 2512090.0, + "step": 1402 + }, + { + "epoch": 0.2271880819366853, + "grad_norm": 14.6756010055542, + "learning_rate": 7.729922279792747e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.9160583913326263, + "num_tokens": 2513876.0, + "step": 1403 + }, + { + "epoch": 0.2273500121447656, + "grad_norm": 25.356327056884766, + "learning_rate": 7.72830310880829e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.902165025472641, + "num_tokens": 2515685.0, + "step": 1404 + }, + { + "epoch": 0.22751194235284591, + "grad_norm": 21.3913516998291, + "learning_rate": 7.726683937823834e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.9084541201591492, + "num_tokens": 2517470.0, + "step": 1405 + }, + { + "epoch": 0.22767387256092625, + "grad_norm": 22.52349853515625, + "learning_rate": 7.725064766839378e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.9082632958889008, + "num_tokens": 2519265.0, + "step": 1406 + }, + { + "epoch": 0.22783580276900656, + "grad_norm": 20.94371223449707, + "learning_rate": 7.723445595854923e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.9111787378787994, + "num_tokens": 2521046.0, + "step": 1407 + }, + { + "epoch": 0.22799773297708686, + "grad_norm": 19.096118927001953, + "learning_rate": 7.721826424870467e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.9067419171333313, + "num_tokens": 2522836.0, + "step": 1408 + }, + { + "epoch": 0.2281596631851672, + "grad_norm": 21.31864356994629, + "learning_rate": 7.72020725388601e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.9032630920410156, + "num_tokens": 2524627.0, + "step": 1409 + }, + { + "epoch": 0.2283215933932475, + "grad_norm": 13.813851356506348, + "learning_rate": 7.718588082901555e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.9407802522182465, + "num_tokens": 2526427.0, + "step": 1410 + }, + { + "epoch": 0.22848352360132781, + "grad_norm": 21.170127868652344, + "learning_rate": 7.716968911917099e-06, + "loss": 0.6638, + "mean_token_accuracy": 0.9124966859817505, + "num_tokens": 2528213.0, + "step": 1411 + }, + { + "epoch": 0.22864545380940815, + "grad_norm": 27.441308975219727, + "learning_rate": 7.715349740932643e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.8875938653945923, + "num_tokens": 2530002.0, + "step": 1412 + }, + { + "epoch": 0.22880738401748846, + "grad_norm": 23.334585189819336, + "learning_rate": 7.713730569948186e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.9046095311641693, + "num_tokens": 2531797.0, + "step": 1413 + }, + { + "epoch": 0.2289693142255688, + "grad_norm": 18.853336334228516, + "learning_rate": 7.712111398963732e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.9182054400444031, + "num_tokens": 2533578.0, + "step": 1414 + }, + { + "epoch": 0.2291312444336491, + "grad_norm": 23.185396194458008, + "learning_rate": 7.710492227979275e-06, + "loss": 0.626, + "mean_token_accuracy": 0.9049992561340332, + "num_tokens": 2535374.0, + "step": 1415 + }, + { + "epoch": 0.2292931746417294, + "grad_norm": 14.400371551513672, + "learning_rate": 7.708873056994819e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.9228314161300659, + "num_tokens": 2537157.0, + "step": 1416 + }, + { + "epoch": 0.22945510484980974, + "grad_norm": 21.116069793701172, + "learning_rate": 7.707253886010362e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.9159381687641144, + "num_tokens": 2538943.0, + "step": 1417 + }, + { + "epoch": 0.22961703505789005, + "grad_norm": 22.72123146057129, + "learning_rate": 7.705634715025908e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.89585942029953, + "num_tokens": 2540732.0, + "step": 1418 + }, + { + "epoch": 0.22977896526597036, + "grad_norm": 23.89801025390625, + "learning_rate": 7.704015544041451e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.8941701948642731, + "num_tokens": 2542528.0, + "step": 1419 + }, + { + "epoch": 0.2299408954740507, + "grad_norm": 28.52039909362793, + "learning_rate": 7.702396373056995e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.8997378349304199, + "num_tokens": 2544319.0, + "step": 1420 + }, + { + "epoch": 0.230102825682131, + "grad_norm": 24.92635726928711, + "learning_rate": 7.700777202072538e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.9124040901660919, + "num_tokens": 2546105.0, + "step": 1421 + }, + { + "epoch": 0.2302647558902113, + "grad_norm": 27.74716567993164, + "learning_rate": 7.699158031088084e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.9103453755378723, + "num_tokens": 2547896.0, + "step": 1422 + }, + { + "epoch": 0.23042668609829164, + "grad_norm": 29.32065200805664, + "learning_rate": 7.697538860103627e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.8966663181781769, + "num_tokens": 2549679.0, + "step": 1423 + }, + { + "epoch": 0.23058861630637195, + "grad_norm": 23.518152236938477, + "learning_rate": 7.69591968911917e-06, + "loss": 0.678, + "mean_token_accuracy": 0.9049295783042908, + "num_tokens": 2551475.0, + "step": 1424 + }, + { + "epoch": 0.23075054651445226, + "grad_norm": 20.571474075317383, + "learning_rate": 7.694300518134716e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.9113828539848328, + "num_tokens": 2553269.0, + "step": 1425 + }, + { + "epoch": 0.2309124767225326, + "grad_norm": 19.52507781982422, + "learning_rate": 7.69268134715026e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.9212669432163239, + "num_tokens": 2555047.0, + "step": 1426 + }, + { + "epoch": 0.2310744069306129, + "grad_norm": 22.284969329833984, + "learning_rate": 7.691062176165803e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.9127790331840515, + "num_tokens": 2556834.0, + "step": 1427 + }, + { + "epoch": 0.2312363371386932, + "grad_norm": 24.386430740356445, + "learning_rate": 7.689443005181347e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.8994667828083038, + "num_tokens": 2558624.0, + "step": 1428 + }, + { + "epoch": 0.23139826734677355, + "grad_norm": 22.268789291381836, + "learning_rate": 7.687823834196892e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.9182723760604858, + "num_tokens": 2560418.0, + "step": 1429 + }, + { + "epoch": 0.23156019755485385, + "grad_norm": 20.271615982055664, + "learning_rate": 7.686204663212436e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.9018568992614746, + "num_tokens": 2562195.0, + "step": 1430 + }, + { + "epoch": 0.2317221277629342, + "grad_norm": 33.752899169921875, + "learning_rate": 7.68458549222798e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.8899519145488739, + "num_tokens": 2563997.0, + "step": 1431 + }, + { + "epoch": 0.2318840579710145, + "grad_norm": 27.174192428588867, + "learning_rate": 7.682966321243523e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.8941729366779327, + "num_tokens": 2565782.0, + "step": 1432 + }, + { + "epoch": 0.2320459881790948, + "grad_norm": 24.73139190673828, + "learning_rate": 7.681347150259068e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.9002998471260071, + "num_tokens": 2567577.0, + "step": 1433 + }, + { + "epoch": 0.23220791838717514, + "grad_norm": 24.544334411621094, + "learning_rate": 7.679727979274612e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.9112384617328644, + "num_tokens": 2569382.0, + "step": 1434 + }, + { + "epoch": 0.23236984859525545, + "grad_norm": 13.470335960388184, + "learning_rate": 7.678108808290155e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.924838125705719, + "num_tokens": 2571173.0, + "step": 1435 + }, + { + "epoch": 0.23253177880333575, + "grad_norm": 25.5701904296875, + "learning_rate": 7.676489637305699e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.9034899771213531, + "num_tokens": 2572965.0, + "step": 1436 + }, + { + "epoch": 0.2326937090114161, + "grad_norm": 30.022069931030273, + "learning_rate": 7.674870466321244e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.8899728953838348, + "num_tokens": 2574777.0, + "step": 1437 + }, + { + "epoch": 0.2328556392194964, + "grad_norm": 19.45071792602539, + "learning_rate": 7.673251295336788e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.9142857193946838, + "num_tokens": 2576569.0, + "step": 1438 + }, + { + "epoch": 0.2330175694275767, + "grad_norm": 18.2053165435791, + "learning_rate": 7.671632124352331e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.9208124577999115, + "num_tokens": 2578358.0, + "step": 1439 + }, + { + "epoch": 0.23317949963565704, + "grad_norm": 23.79119873046875, + "learning_rate": 7.670012953367875e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.9053932428359985, + "num_tokens": 2580145.0, + "step": 1440 + }, + { + "epoch": 0.23334142984373735, + "grad_norm": 27.382652282714844, + "learning_rate": 7.66839378238342e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.9009722173213959, + "num_tokens": 2581951.0, + "step": 1441 + }, + { + "epoch": 0.23350336005181765, + "grad_norm": 28.890094757080078, + "learning_rate": 7.666774611398964e-06, + "loss": 0.845, + "mean_token_accuracy": 0.8961202502250671, + "num_tokens": 2583742.0, + "step": 1442 + }, + { + "epoch": 0.233665290259898, + "grad_norm": 32.617794036865234, + "learning_rate": 7.665155440414507e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.8866921663284302, + "num_tokens": 2585545.0, + "step": 1443 + }, + { + "epoch": 0.2338272204679783, + "grad_norm": 31.685365676879883, + "learning_rate": 7.663536269430053e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.8888835310935974, + "num_tokens": 2587345.0, + "step": 1444 + }, + { + "epoch": 0.23398915067605863, + "grad_norm": 20.850082397460938, + "learning_rate": 7.661917098445596e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.8996683359146118, + "num_tokens": 2589126.0, + "step": 1445 + }, + { + "epoch": 0.23415108088413894, + "grad_norm": 25.50518226623535, + "learning_rate": 7.66029792746114e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.9110672175884247, + "num_tokens": 2590919.0, + "step": 1446 + }, + { + "epoch": 0.23431301109221925, + "grad_norm": 25.487224578857422, + "learning_rate": 7.658678756476684e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.9054646492004395, + "num_tokens": 2592716.0, + "step": 1447 + }, + { + "epoch": 0.23447494130029958, + "grad_norm": 26.57265281677246, + "learning_rate": 7.657059585492229e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.907892107963562, + "num_tokens": 2594511.0, + "step": 1448 + }, + { + "epoch": 0.2346368715083799, + "grad_norm": 20.960603713989258, + "learning_rate": 7.655440414507772e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 2596297.0, + "step": 1449 + }, + { + "epoch": 0.2347988017164602, + "grad_norm": 29.576522827148438, + "learning_rate": 7.653821243523318e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.9039513766765594, + "num_tokens": 2598090.0, + "step": 1450 + }, + { + "epoch": 0.23496073192454053, + "grad_norm": 20.15174102783203, + "learning_rate": 7.652202072538861e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.9192526638507843, + "num_tokens": 2599875.0, + "step": 1451 + }, + { + "epoch": 0.23512266213262084, + "grad_norm": 23.13926887512207, + "learning_rate": 7.650582901554405e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.9043422639369965, + "num_tokens": 2601667.0, + "step": 1452 + }, + { + "epoch": 0.23528459234070115, + "grad_norm": 29.10275650024414, + "learning_rate": 7.64896373056995e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.887701541185379, + "num_tokens": 2603455.0, + "step": 1453 + }, + { + "epoch": 0.23544652254878148, + "grad_norm": 23.72844123840332, + "learning_rate": 7.647344559585494e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.9113768041133881, + "num_tokens": 2605249.0, + "step": 1454 + }, + { + "epoch": 0.2356084527568618, + "grad_norm": 29.564889907836914, + "learning_rate": 7.645725388601037e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.8921815752983093, + "num_tokens": 2607040.0, + "step": 1455 + }, + { + "epoch": 0.2357703829649421, + "grad_norm": 27.300100326538086, + "learning_rate": 7.644106217616581e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.8922254145145416, + "num_tokens": 2608831.0, + "step": 1456 + }, + { + "epoch": 0.23593231317302243, + "grad_norm": 22.000505447387695, + "learning_rate": 7.642487046632126e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.9094942808151245, + "num_tokens": 2610619.0, + "step": 1457 + }, + { + "epoch": 0.23609424338110274, + "grad_norm": 26.95020866394043, + "learning_rate": 7.64086787564767e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.9104224443435669, + "num_tokens": 2612410.0, + "step": 1458 + }, + { + "epoch": 0.23625617358918305, + "grad_norm": 24.554372787475586, + "learning_rate": 7.639248704663213e-06, + "loss": 0.68, + "mean_token_accuracy": 0.9136690497398376, + "num_tokens": 2614200.0, + "step": 1459 + }, + { + "epoch": 0.23641810379726338, + "grad_norm": 23.83368492126465, + "learning_rate": 7.637629533678757e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.8920835256576538, + "num_tokens": 2615999.0, + "step": 1460 + }, + { + "epoch": 0.2365800340053437, + "grad_norm": 22.855274200439453, + "learning_rate": 7.636010362694302e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.9004510045051575, + "num_tokens": 2617792.0, + "step": 1461 + }, + { + "epoch": 0.23674196421342403, + "grad_norm": 25.45528221130371, + "learning_rate": 7.634391191709846e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.8968993723392487, + "num_tokens": 2619585.0, + "step": 1462 + }, + { + "epoch": 0.23690389442150434, + "grad_norm": 24.026029586791992, + "learning_rate": 7.63277202072539e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.8956202864646912, + "num_tokens": 2621385.0, + "step": 1463 + }, + { + "epoch": 0.23706582462958464, + "grad_norm": 15.273926734924316, + "learning_rate": 7.631152849740933e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.9177290201187134, + "num_tokens": 2623176.0, + "step": 1464 + }, + { + "epoch": 0.23722775483766498, + "grad_norm": 21.250120162963867, + "learning_rate": 7.629533678756478e-06, + "loss": 0.6154, + "mean_token_accuracy": 0.9167449176311493, + "num_tokens": 2624965.0, + "step": 1465 + }, + { + "epoch": 0.23738968504574529, + "grad_norm": 18.954471588134766, + "learning_rate": 7.627914507772022e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.9067012369632721, + "num_tokens": 2626758.0, + "step": 1466 + }, + { + "epoch": 0.2375516152538256, + "grad_norm": 16.815805435180664, + "learning_rate": 7.6262953367875655e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.9197114408016205, + "num_tokens": 2628556.0, + "step": 1467 + }, + { + "epoch": 0.23771354546190593, + "grad_norm": 23.18590545654297, + "learning_rate": 7.62467616580311e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.890966385602951, + "num_tokens": 2630344.0, + "step": 1468 + }, + { + "epoch": 0.23787547566998624, + "grad_norm": 20.75792694091797, + "learning_rate": 7.6230569948186535e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.9175784289836884, + "num_tokens": 2632134.0, + "step": 1469 + }, + { + "epoch": 0.23803740587806654, + "grad_norm": 23.482240676879883, + "learning_rate": 7.621437823834198e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 2633922.0, + "step": 1470 + }, + { + "epoch": 0.23819933608614688, + "grad_norm": 17.71479034423828, + "learning_rate": 7.619818652849742e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.9216595590114594, + "num_tokens": 2635715.0, + "step": 1471 + }, + { + "epoch": 0.2383612662942272, + "grad_norm": 20.87181282043457, + "learning_rate": 7.618199481865286e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.9134100675582886, + "num_tokens": 2637492.0, + "step": 1472 + }, + { + "epoch": 0.2385231965023075, + "grad_norm": 20.5545597076416, + "learning_rate": 7.61658031088083e-06, + "loss": 0.677, + "mean_token_accuracy": 0.911565750837326, + "num_tokens": 2639295.0, + "step": 1473 + }, + { + "epoch": 0.23868512671038783, + "grad_norm": 29.069311141967773, + "learning_rate": 7.614961139896374e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.901127815246582, + "num_tokens": 2641080.0, + "step": 1474 + }, + { + "epoch": 0.23884705691846814, + "grad_norm": 20.82093048095703, + "learning_rate": 7.6133419689119184e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.9105582237243652, + "num_tokens": 2642871.0, + "step": 1475 + }, + { + "epoch": 0.23900898712654844, + "grad_norm": 18.608184814453125, + "learning_rate": 7.611722797927462e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.9069086015224457, + "num_tokens": 2644662.0, + "step": 1476 + }, + { + "epoch": 0.23917091733462878, + "grad_norm": 25.548845291137695, + "learning_rate": 7.6101036269430065e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.8976739048957825, + "num_tokens": 2646448.0, + "step": 1477 + }, + { + "epoch": 0.2393328475427091, + "grad_norm": 24.109821319580078, + "learning_rate": 7.60848445595855e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.9069280624389648, + "num_tokens": 2648250.0, + "step": 1478 + }, + { + "epoch": 0.23949477775078942, + "grad_norm": 17.441059112548828, + "learning_rate": 7.6068652849740945e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.9265454113483429, + "num_tokens": 2650048.0, + "step": 1479 + }, + { + "epoch": 0.23965670795886973, + "grad_norm": 20.806262969970703, + "learning_rate": 7.605246113989638e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.9220424890518188, + "num_tokens": 2651843.0, + "step": 1480 + }, + { + "epoch": 0.23981863816695004, + "grad_norm": 29.643178939819336, + "learning_rate": 7.6036269430051825e-06, + "loss": 0.89, + "mean_token_accuracy": 0.8886658549308777, + "num_tokens": 2653655.0, + "step": 1481 + }, + { + "epoch": 0.23998056837503037, + "grad_norm": 28.434528350830078, + "learning_rate": 7.602007772020726e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.8941605985164642, + "num_tokens": 2655441.0, + "step": 1482 + }, + { + "epoch": 0.24014249858311068, + "grad_norm": 26.569814682006836, + "learning_rate": 7.6003886010362705e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.8942823112010956, + "num_tokens": 2657237.0, + "step": 1483 + }, + { + "epoch": 0.240304428791191, + "grad_norm": 27.218181610107422, + "learning_rate": 7.598769430051814e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.8952403366565704, + "num_tokens": 2659026.0, + "step": 1484 + }, + { + "epoch": 0.24046635899927132, + "grad_norm": 18.8972110748291, + "learning_rate": 7.5971502590673586e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.9118140041828156, + "num_tokens": 2660810.0, + "step": 1485 + }, + { + "epoch": 0.24062828920735163, + "grad_norm": 22.67765235900879, + "learning_rate": 7.595531088082902e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.911750465631485, + "num_tokens": 2662595.0, + "step": 1486 + }, + { + "epoch": 0.24079021941543194, + "grad_norm": 16.718015670776367, + "learning_rate": 7.593911917098447e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.9213612377643585, + "num_tokens": 2664374.0, + "step": 1487 + }, + { + "epoch": 0.24095214962351227, + "grad_norm": 17.105724334716797, + "learning_rate": 7.59229274611399e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.9253042638301849, + "num_tokens": 2666167.0, + "step": 1488 + }, + { + "epoch": 0.24111407983159258, + "grad_norm": 19.515966415405273, + "learning_rate": 7.590673575129535e-06, + "loss": 0.5879, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 2667953.0, + "step": 1489 + }, + { + "epoch": 0.2412760100396729, + "grad_norm": 21.645244598388672, + "learning_rate": 7.589054404145079e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.9079285264015198, + "num_tokens": 2669747.0, + "step": 1490 + }, + { + "epoch": 0.24143794024775322, + "grad_norm": 30.58285140991211, + "learning_rate": 7.587435233160623e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.8922710418701172, + "num_tokens": 2671537.0, + "step": 1491 + }, + { + "epoch": 0.24159987045583353, + "grad_norm": 22.627206802368164, + "learning_rate": 7.585816062176167e-06, + "loss": 0.65, + "mean_token_accuracy": 0.9050639867782593, + "num_tokens": 2673323.0, + "step": 1492 + }, + { + "epoch": 0.24176180066391384, + "grad_norm": 24.889347076416016, + "learning_rate": 7.584196891191711e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.9036940634250641, + "num_tokens": 2675115.0, + "step": 1493 + }, + { + "epoch": 0.24192373087199417, + "grad_norm": 21.869319915771484, + "learning_rate": 7.582577720207255e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.9047642946243286, + "num_tokens": 2676909.0, + "step": 1494 + }, + { + "epoch": 0.24208566108007448, + "grad_norm": 19.944337844848633, + "learning_rate": 7.580958549222799e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.9129863977432251, + "num_tokens": 2678697.0, + "step": 1495 + }, + { + "epoch": 0.24224759128815482, + "grad_norm": 18.30863380432129, + "learning_rate": 7.579339378238343e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.9129549264907837, + "num_tokens": 2680484.0, + "step": 1496 + }, + { + "epoch": 0.24240952149623513, + "grad_norm": 23.380239486694336, + "learning_rate": 7.577720207253887e-06, + "loss": 0.6162, + "mean_token_accuracy": 0.9076152145862579, + "num_tokens": 2682267.0, + "step": 1497 + }, + { + "epoch": 0.24257145170431543, + "grad_norm": 26.253238677978516, + "learning_rate": 7.576101036269431e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.8954682946205139, + "num_tokens": 2684056.0, + "step": 1498 + }, + { + "epoch": 0.24273338191239577, + "grad_norm": 16.309751510620117, + "learning_rate": 7.574481865284975e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.9154388010501862, + "num_tokens": 2685828.0, + "step": 1499 + }, + { + "epoch": 0.24289531212047608, + "grad_norm": 25.659961700439453, + "learning_rate": 7.572862694300519e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.9036984443664551, + "num_tokens": 2687610.0, + "step": 1500 + }, + { + "epoch": 0.24305724232855638, + "grad_norm": 21.450206756591797, + "learning_rate": 7.571243523316063e-06, + "loss": 0.584, + "mean_token_accuracy": 0.912291944026947, + "num_tokens": 2689407.0, + "step": 1501 + }, + { + "epoch": 0.24321917253663672, + "grad_norm": 22.383747100830078, + "learning_rate": 7.569624352331607e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.9118537902832031, + "num_tokens": 2691191.0, + "step": 1502 + }, + { + "epoch": 0.24338110274471703, + "grad_norm": 24.435007095336914, + "learning_rate": 7.568005181347151e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.9078405201435089, + "num_tokens": 2692987.0, + "step": 1503 + }, + { + "epoch": 0.24354303295279733, + "grad_norm": 22.43130874633789, + "learning_rate": 7.566386010362695e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.9187643826007843, + "num_tokens": 2694782.0, + "step": 1504 + }, + { + "epoch": 0.24370496316087767, + "grad_norm": 23.468494415283203, + "learning_rate": 7.564766839378239e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.9134188592433929, + "num_tokens": 2696582.0, + "step": 1505 + }, + { + "epoch": 0.24386689336895798, + "grad_norm": 27.195268630981445, + "learning_rate": 7.563147668393783e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.8979646265506744, + "num_tokens": 2698368.0, + "step": 1506 + }, + { + "epoch": 0.24402882357703828, + "grad_norm": 27.289867401123047, + "learning_rate": 7.561528497409327e-06, + "loss": 0.82, + "mean_token_accuracy": 0.9015287756919861, + "num_tokens": 2700171.0, + "step": 1507 + }, + { + "epoch": 0.24419075378511862, + "grad_norm": 30.325847625732422, + "learning_rate": 7.559909326424871e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.8940750360488892, + "num_tokens": 2701957.0, + "step": 1508 + }, + { + "epoch": 0.24435268399319893, + "grad_norm": 22.628604888916016, + "learning_rate": 7.558290155440416e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.9064748287200928, + "num_tokens": 2703747.0, + "step": 1509 + }, + { + "epoch": 0.24451461420127926, + "grad_norm": 22.936546325683594, + "learning_rate": 7.556670984455959e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.9130063951015472, + "num_tokens": 2705533.0, + "step": 1510 + }, + { + "epoch": 0.24467654440935957, + "grad_norm": 25.915376663208008, + "learning_rate": 7.555051813471504e-06, + "loss": 0.687, + "mean_token_accuracy": 0.904751181602478, + "num_tokens": 2707328.0, + "step": 1511 + }, + { + "epoch": 0.24483847461743988, + "grad_norm": 23.123489379882812, + "learning_rate": 7.553432642487047e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.91131791472435, + "num_tokens": 2709122.0, + "step": 1512 + }, + { + "epoch": 0.2450004048255202, + "grad_norm": 18.64593505859375, + "learning_rate": 7.551813471502592e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.9169561862945557, + "num_tokens": 2710912.0, + "step": 1513 + }, + { + "epoch": 0.24516233503360052, + "grad_norm": 28.47271728515625, + "learning_rate": 7.550194300518135e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.8945723176002502, + "num_tokens": 2712718.0, + "step": 1514 + }, + { + "epoch": 0.24532426524168083, + "grad_norm": 29.037677764892578, + "learning_rate": 7.54857512953368e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.8883761167526245, + "num_tokens": 2714507.0, + "step": 1515 + }, + { + "epoch": 0.24548619544976116, + "grad_norm": 23.622699737548828, + "learning_rate": 7.546955958549223e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.9115781486034393, + "num_tokens": 2716301.0, + "step": 1516 + }, + { + "epoch": 0.24564812565784147, + "grad_norm": 18.688987731933594, + "learning_rate": 7.545336787564768e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.9199221730232239, + "num_tokens": 2718089.0, + "step": 1517 + }, + { + "epoch": 0.24581005586592178, + "grad_norm": 20.6220703125, + "learning_rate": 7.543717616580311e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.9208124577999115, + "num_tokens": 2719878.0, + "step": 1518 + }, + { + "epoch": 0.2459719860740021, + "grad_norm": 26.859193801879883, + "learning_rate": 7.542098445595856e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.896783173084259, + "num_tokens": 2721671.0, + "step": 1519 + }, + { + "epoch": 0.24613391628208242, + "grad_norm": 25.553131103515625, + "learning_rate": 7.5404792746113994e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.8910236060619354, + "num_tokens": 2723458.0, + "step": 1520 + }, + { + "epoch": 0.24629584649016273, + "grad_norm": 23.847808837890625, + "learning_rate": 7.538860103626944e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.9069312810897827, + "num_tokens": 2725249.0, + "step": 1521 + }, + { + "epoch": 0.24645777669824306, + "grad_norm": 33.21686935424805, + "learning_rate": 7.5372409326424875e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.8881908357143402, + "num_tokens": 2727038.0, + "step": 1522 + }, + { + "epoch": 0.24661970690632337, + "grad_norm": 26.16434097290039, + "learning_rate": 7.535621761658032e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.8986344635486603, + "num_tokens": 2728826.0, + "step": 1523 + }, + { + "epoch": 0.24678163711440368, + "grad_norm": 21.566883087158203, + "learning_rate": 7.5340025906735755e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.915032148361206, + "num_tokens": 2730609.0, + "step": 1524 + }, + { + "epoch": 0.24694356732248401, + "grad_norm": 16.49151039123535, + "learning_rate": 7.53238341968912e-06, + "loss": 0.5304, + "mean_token_accuracy": 0.9265749752521515, + "num_tokens": 2732406.0, + "step": 1525 + }, + { + "epoch": 0.24710549753056432, + "grad_norm": 24.9566650390625, + "learning_rate": 7.5307642487046635e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.9080615937709808, + "num_tokens": 2734200.0, + "step": 1526 + }, + { + "epoch": 0.24726742773864466, + "grad_norm": 24.29615592956543, + "learning_rate": 7.529145077720208e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.9031609296798706, + "num_tokens": 2735989.0, + "step": 1527 + }, + { + "epoch": 0.24742935794672496, + "grad_norm": 24.336101531982422, + "learning_rate": 7.527525906735752e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.8999948799610138, + "num_tokens": 2737781.0, + "step": 1528 + }, + { + "epoch": 0.24759128815480527, + "grad_norm": 23.452478408813477, + "learning_rate": 7.525906735751296e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 2739575.0, + "step": 1529 + }, + { + "epoch": 0.2477532183628856, + "grad_norm": 18.691591262817383, + "learning_rate": 7.52428756476684e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.9154922664165497, + "num_tokens": 2741370.0, + "step": 1530 + }, + { + "epoch": 0.24791514857096592, + "grad_norm": 23.151039123535156, + "learning_rate": 7.522668393782384e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.9080419540405273, + "num_tokens": 2743165.0, + "step": 1531 + }, + { + "epoch": 0.24807707877904622, + "grad_norm": 26.80721092224121, + "learning_rate": 7.5210492227979284e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.8945767283439636, + "num_tokens": 2744952.0, + "step": 1532 + }, + { + "epoch": 0.24823900898712656, + "grad_norm": 28.548725128173828, + "learning_rate": 7.519430051813472e-06, + "loss": 0.707, + "mean_token_accuracy": 0.89616858959198, + "num_tokens": 2746753.0, + "step": 1533 + }, + { + "epoch": 0.24840093919520687, + "grad_norm": 25.250219345092773, + "learning_rate": 7.5178108808290165e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.8999183177947998, + "num_tokens": 2748545.0, + "step": 1534 + }, + { + "epoch": 0.24856286940328717, + "grad_norm": 19.515411376953125, + "learning_rate": 7.51619170984456e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.9077614545822144, + "num_tokens": 2750328.0, + "step": 1535 + }, + { + "epoch": 0.2487247996113675, + "grad_norm": 21.264278411865234, + "learning_rate": 7.5145725388601045e-06, + "loss": 0.646, + "mean_token_accuracy": 0.9023066759109497, + "num_tokens": 2752106.0, + "step": 1536 + }, + { + "epoch": 0.24888672981944782, + "grad_norm": 22.406787872314453, + "learning_rate": 7.512953367875648e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.9196296334266663, + "num_tokens": 2753903.0, + "step": 1537 + }, + { + "epoch": 0.24904866002752812, + "grad_norm": 23.076675415039062, + "learning_rate": 7.5113341968911925e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.9081918001174927, + "num_tokens": 2755698.0, + "step": 1538 + }, + { + "epoch": 0.24921059023560846, + "grad_norm": 27.663761138916016, + "learning_rate": 7.509715025906736e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.8968591690063477, + "num_tokens": 2757491.0, + "step": 1539 + }, + { + "epoch": 0.24937252044368877, + "grad_norm": 26.995481491088867, + "learning_rate": 7.5080958549222805e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.8955065310001373, + "num_tokens": 2759281.0, + "step": 1540 + }, + { + "epoch": 0.24953445065176907, + "grad_norm": 27.979995727539062, + "learning_rate": 7.506476683937824e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.9012860655784607, + "num_tokens": 2761077.0, + "step": 1541 + }, + { + "epoch": 0.2496963808598494, + "grad_norm": 24.379438400268555, + "learning_rate": 7.5048575129533686e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.9179728329181671, + "num_tokens": 2762866.0, + "step": 1542 + }, + { + "epoch": 0.24985831106792972, + "grad_norm": 25.636383056640625, + "learning_rate": 7.503238341968912e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.9112319052219391, + "num_tokens": 2764660.0, + "step": 1543 + }, + { + "epoch": 0.25002024127601, + "grad_norm": 20.341447830200195, + "learning_rate": 7.501619170984457e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.908687949180603, + "num_tokens": 2766445.0, + "step": 1544 + }, + { + "epoch": 0.25018217148409033, + "grad_norm": 20.21607780456543, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.9089393615722656, + "num_tokens": 2768232.0, + "step": 1545 + }, + { + "epoch": 0.2503441016921707, + "grad_norm": 22.839038848876953, + "learning_rate": 7.498380829015545e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.9091029465198517, + "num_tokens": 2770019.0, + "step": 1546 + }, + { + "epoch": 0.250506031900251, + "grad_norm": 31.88857650756836, + "learning_rate": 7.496761658031089e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.8856574296951294, + "num_tokens": 2771811.0, + "step": 1547 + }, + { + "epoch": 0.2506679621083313, + "grad_norm": 23.820371627807617, + "learning_rate": 7.495142487046633e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.9087075293064117, + "num_tokens": 2773607.0, + "step": 1548 + }, + { + "epoch": 0.2508298923164116, + "grad_norm": 32.36301803588867, + "learning_rate": 7.493523316062177e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.8912698328495026, + "num_tokens": 2775403.0, + "step": 1549 + }, + { + "epoch": 0.2509918225244919, + "grad_norm": 30.207796096801758, + "learning_rate": 7.491904145077721e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.8877179026603699, + "num_tokens": 2777200.0, + "step": 1550 + }, + { + "epoch": 0.2511537527325723, + "grad_norm": 29.130632400512695, + "learning_rate": 7.490284974093265e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.8989678025245667, + "num_tokens": 2778989.0, + "step": 1551 + }, + { + "epoch": 0.2513156829406526, + "grad_norm": 29.353410720825195, + "learning_rate": 7.488665803108809e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.8890840411186218, + "num_tokens": 2780771.0, + "step": 1552 + }, + { + "epoch": 0.2514776131487329, + "grad_norm": 29.334426879882812, + "learning_rate": 7.487046632124353e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.9125295579433441, + "num_tokens": 2782568.0, + "step": 1553 + }, + { + "epoch": 0.2516395433568132, + "grad_norm": 17.115327835083008, + "learning_rate": 7.485427461139897e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.9213072657585144, + "num_tokens": 2784359.0, + "step": 1554 + }, + { + "epoch": 0.2518014735648935, + "grad_norm": 26.857919692993164, + "learning_rate": 7.483808290155441e-06, + "loss": 0.609, + "mean_token_accuracy": 0.9131987988948822, + "num_tokens": 2786148.0, + "step": 1555 + }, + { + "epoch": 0.2519634037729738, + "grad_norm": 17.469654083251953, + "learning_rate": 7.482189119170985e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.9192526638507843, + "num_tokens": 2787933.0, + "step": 1556 + }, + { + "epoch": 0.2521253339810542, + "grad_norm": 29.414236068725586, + "learning_rate": 7.480569948186529e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.8974026143550873, + "num_tokens": 2789728.0, + "step": 1557 + }, + { + "epoch": 0.2522872641891345, + "grad_norm": 17.6724796295166, + "learning_rate": 7.478950777202073e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 2791519.0, + "step": 1558 + }, + { + "epoch": 0.2524491943972148, + "grad_norm": 26.13325309753418, + "learning_rate": 7.477331606217617e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.9007092118263245, + "num_tokens": 2793313.0, + "step": 1559 + }, + { + "epoch": 0.2526111246052951, + "grad_norm": 27.62114906311035, + "learning_rate": 7.475712435233161e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.8849717974662781, + "num_tokens": 2795103.0, + "step": 1560 + }, + { + "epoch": 0.2527730548133754, + "grad_norm": 22.445478439331055, + "learning_rate": 7.474093264248705e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.9126871824264526, + "num_tokens": 2796891.0, + "step": 1561 + }, + { + "epoch": 0.2529349850214557, + "grad_norm": 26.90172576904297, + "learning_rate": 7.472474093264249e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.8858237564563751, + "num_tokens": 2798683.0, + "step": 1562 + }, + { + "epoch": 0.2530969152295361, + "grad_norm": 16.617937088012695, + "learning_rate": 7.470854922279793e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.9143631160259247, + "num_tokens": 2800464.0, + "step": 1563 + }, + { + "epoch": 0.2532588454376164, + "grad_norm": 18.556692123413086, + "learning_rate": 7.469235751295338e-06, + "loss": 0.5806, + "mean_token_accuracy": 0.9084370732307434, + "num_tokens": 2802249.0, + "step": 1564 + }, + { + "epoch": 0.2534207756456967, + "grad_norm": 20.311861038208008, + "learning_rate": 7.467616580310881e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.9232409298419952, + "num_tokens": 2804035.0, + "step": 1565 + }, + { + "epoch": 0.253582705853777, + "grad_norm": 24.73240852355957, + "learning_rate": 7.465997409326426e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.898207426071167, + "num_tokens": 2805821.0, + "step": 1566 + }, + { + "epoch": 0.2537446360618573, + "grad_norm": 25.900110244750977, + "learning_rate": 7.464378238341969e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.9127525687217712, + "num_tokens": 2807608.0, + "step": 1567 + }, + { + "epoch": 0.2539065662699377, + "grad_norm": 20.611909866333008, + "learning_rate": 7.462759067357514e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.9211897850036621, + "num_tokens": 2809396.0, + "step": 1568 + }, + { + "epoch": 0.254068496478018, + "grad_norm": 22.453723907470703, + "learning_rate": 7.461139896373057e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.9144104421138763, + "num_tokens": 2811178.0, + "step": 1569 + }, + { + "epoch": 0.2542304266860983, + "grad_norm": 33.576324462890625, + "learning_rate": 7.459520725388602e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.8982540667057037, + "num_tokens": 2812984.0, + "step": 1570 + }, + { + "epoch": 0.2543923568941786, + "grad_norm": 19.841344833374023, + "learning_rate": 7.457901554404145e-06, + "loss": 0.677, + "mean_token_accuracy": 0.9145643413066864, + "num_tokens": 2814777.0, + "step": 1571 + }, + { + "epoch": 0.2545542871022589, + "grad_norm": 22.187772750854492, + "learning_rate": 7.45628238341969e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.9186760783195496, + "num_tokens": 2816571.0, + "step": 1572 + }, + { + "epoch": 0.2547162173103392, + "grad_norm": 18.278175354003906, + "learning_rate": 7.454663212435233e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.9174720048904419, + "num_tokens": 2818362.0, + "step": 1573 + }, + { + "epoch": 0.2548781475184196, + "grad_norm": 17.979129791259766, + "learning_rate": 7.453044041450778e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.9128985702991486, + "num_tokens": 2820162.0, + "step": 1574 + }, + { + "epoch": 0.2550400777264999, + "grad_norm": 19.199766159057617, + "learning_rate": 7.4514248704663214e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.9079736173152924, + "num_tokens": 2821957.0, + "step": 1575 + }, + { + "epoch": 0.2552020079345802, + "grad_norm": 19.891965866088867, + "learning_rate": 7.449805699481866e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.9185689091682434, + "num_tokens": 2823739.0, + "step": 1576 + }, + { + "epoch": 0.2553639381426605, + "grad_norm": 32.130130767822266, + "learning_rate": 7.4481865284974095e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.8872180283069611, + "num_tokens": 2825543.0, + "step": 1577 + }, + { + "epoch": 0.2555258683507408, + "grad_norm": 14.756477355957031, + "learning_rate": 7.446567357512954e-06, + "loss": 0.5659, + "mean_token_accuracy": 0.9263225495815277, + "num_tokens": 2827340.0, + "step": 1578 + }, + { + "epoch": 0.2556877985588211, + "grad_norm": 19.128576278686523, + "learning_rate": 7.4449481865284975e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.9171972870826721, + "num_tokens": 2829130.0, + "step": 1579 + }, + { + "epoch": 0.2558497287669015, + "grad_norm": 18.862079620361328, + "learning_rate": 7.443329015544042e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.9112637341022491, + "num_tokens": 2830912.0, + "step": 1580 + }, + { + "epoch": 0.2560116589749818, + "grad_norm": 26.105390548706055, + "learning_rate": 7.4417098445595855e-06, + "loss": 0.6622, + "mean_token_accuracy": 0.9000000059604645, + "num_tokens": 2832714.0, + "step": 1581 + }, + { + "epoch": 0.2561735891830621, + "grad_norm": 22.801639556884766, + "learning_rate": 7.44009067357513e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.9014279842376709, + "num_tokens": 2834500.0, + "step": 1582 + }, + { + "epoch": 0.2563355193911424, + "grad_norm": 28.046396255493164, + "learning_rate": 7.438471502590674e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.898670494556427, + "num_tokens": 2836289.0, + "step": 1583 + }, + { + "epoch": 0.2564974495992227, + "grad_norm": 15.66650390625, + "learning_rate": 7.436852331606218e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.9282343983650208, + "num_tokens": 2838079.0, + "step": 1584 + }, + { + "epoch": 0.2566593798073031, + "grad_norm": 23.62172508239746, + "learning_rate": 7.435233160621762e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.9111754298210144, + "num_tokens": 2839873.0, + "step": 1585 + }, + { + "epoch": 0.2568213100153834, + "grad_norm": 27.808300018310547, + "learning_rate": 7.433613989637306e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.8953336775302887, + "num_tokens": 2841662.0, + "step": 1586 + }, + { + "epoch": 0.2569832402234637, + "grad_norm": 18.4217472076416, + "learning_rate": 7.4319948186528504e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.9146904051303864, + "num_tokens": 2843444.0, + "step": 1587 + }, + { + "epoch": 0.257145170431544, + "grad_norm": 21.971416473388672, + "learning_rate": 7.430375647668394e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.9037989974021912, + "num_tokens": 2845236.0, + "step": 1588 + }, + { + "epoch": 0.2573071006396243, + "grad_norm": 21.468717575073242, + "learning_rate": 7.4287564766839385e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.9122180640697479, + "num_tokens": 2847021.0, + "step": 1589 + }, + { + "epoch": 0.2574690308477046, + "grad_norm": 18.815839767456055, + "learning_rate": 7.427137305699482e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 2848815.0, + "step": 1590 + }, + { + "epoch": 0.257630961055785, + "grad_norm": 22.126934051513672, + "learning_rate": 7.4255181347150265e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.9103114902973175, + "num_tokens": 2850607.0, + "step": 1591 + }, + { + "epoch": 0.2577928912638653, + "grad_norm": 24.832630157470703, + "learning_rate": 7.42389896373057e-06, + "loss": 0.744, + "mean_token_accuracy": 0.8955419659614563, + "num_tokens": 2852398.0, + "step": 1592 + }, + { + "epoch": 0.2579548214719456, + "grad_norm": 24.52344512939453, + "learning_rate": 7.4222797927461145e-06, + "loss": 0.648, + "mean_token_accuracy": 0.9103802740573883, + "num_tokens": 2854189.0, + "step": 1593 + }, + { + "epoch": 0.2581167516800259, + "grad_norm": 14.720680236816406, + "learning_rate": 7.420660621761658e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9260977506637573, + "num_tokens": 2855985.0, + "step": 1594 + }, + { + "epoch": 0.2582786818881062, + "grad_norm": 25.270793914794922, + "learning_rate": 7.4190414507772025e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.9138705134391785, + "num_tokens": 2857764.0, + "step": 1595 + }, + { + "epoch": 0.2584406120961865, + "grad_norm": 20.666845321655273, + "learning_rate": 7.417422279792746e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.9033996760845184, + "num_tokens": 2859545.0, + "step": 1596 + }, + { + "epoch": 0.2586025423042669, + "grad_norm": 24.13327407836914, + "learning_rate": 7.4158031088082906e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.8925729393959045, + "num_tokens": 2861345.0, + "step": 1597 + }, + { + "epoch": 0.2587644725123472, + "grad_norm": 19.873493194580078, + "learning_rate": 7.414183937823834e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.917929470539093, + "num_tokens": 2863137.0, + "step": 1598 + }, + { + "epoch": 0.2589264027204275, + "grad_norm": 19.287906646728516, + "learning_rate": 7.412564766839379e-06, + "loss": 0.635, + "mean_token_accuracy": 0.9122960269451141, + "num_tokens": 2864924.0, + "step": 1599 + }, + { + "epoch": 0.2590883329285078, + "grad_norm": 25.044971466064453, + "learning_rate": 7.410945595854922e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.9067248702049255, + "num_tokens": 2866712.0, + "step": 1600 + }, + { + "epoch": 0.2592502631365881, + "grad_norm": 24.437191009521484, + "learning_rate": 7.409326424870467e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.8893246352672577, + "num_tokens": 2868495.0, + "step": 1601 + }, + { + "epoch": 0.2594121933446685, + "grad_norm": 21.227680206298828, + "learning_rate": 7.407707253886011e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.9082276821136475, + "num_tokens": 2870291.0, + "step": 1602 + }, + { + "epoch": 0.2595741235527488, + "grad_norm": 29.17960548400879, + "learning_rate": 7.406088082901555e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.8758048117160797, + "num_tokens": 2872085.0, + "step": 1603 + }, + { + "epoch": 0.2597360537608291, + "grad_norm": 22.167009353637695, + "learning_rate": 7.404468911917099e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.8975377082824707, + "num_tokens": 2873880.0, + "step": 1604 + }, + { + "epoch": 0.2598979839689094, + "grad_norm": 33.898197174072266, + "learning_rate": 7.402849740932643e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.8941076397895813, + "num_tokens": 2875675.0, + "step": 1605 + }, + { + "epoch": 0.2600599141769897, + "grad_norm": 22.588411331176758, + "learning_rate": 7.401230569948187e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.9095667600631714, + "num_tokens": 2877465.0, + "step": 1606 + }, + { + "epoch": 0.26022184438507, + "grad_norm": 25.015331268310547, + "learning_rate": 7.399611398963731e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.9037989974021912, + "num_tokens": 2879257.0, + "step": 1607 + }, + { + "epoch": 0.2603837745931504, + "grad_norm": 25.3990535736084, + "learning_rate": 7.397992227979275e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.8942881524562836, + "num_tokens": 2881043.0, + "step": 1608 + }, + { + "epoch": 0.2605457048012307, + "grad_norm": 22.725894927978516, + "learning_rate": 7.396373056994819e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.9011540710926056, + "num_tokens": 2882838.0, + "step": 1609 + }, + { + "epoch": 0.260707635009311, + "grad_norm": 24.916545867919922, + "learning_rate": 7.394753886010363e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.9055270254611969, + "num_tokens": 2884636.0, + "step": 1610 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 18.58320426940918, + "learning_rate": 7.393134715025907e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.9201717376708984, + "num_tokens": 2886436.0, + "step": 1611 + }, + { + "epoch": 0.2610314954254716, + "grad_norm": 21.788318634033203, + "learning_rate": 7.391515544041451e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.9121415317058563, + "num_tokens": 2888233.0, + "step": 1612 + }, + { + "epoch": 0.2611934256335519, + "grad_norm": 24.78152847290039, + "learning_rate": 7.389896373056995e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.9063853025436401, + "num_tokens": 2890043.0, + "step": 1613 + }, + { + "epoch": 0.2613553558416323, + "grad_norm": 19.024290084838867, + "learning_rate": 7.388277202072539e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.9004613161087036, + "num_tokens": 2891825.0, + "step": 1614 + }, + { + "epoch": 0.2615172860497126, + "grad_norm": 19.56563949584961, + "learning_rate": 7.386658031088083e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.9070870876312256, + "num_tokens": 2893617.0, + "step": 1615 + }, + { + "epoch": 0.2616792162577929, + "grad_norm": 16.62118911743164, + "learning_rate": 7.385038860103627e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.9163140058517456, + "num_tokens": 2895404.0, + "step": 1616 + }, + { + "epoch": 0.2618411464658732, + "grad_norm": 19.283964157104492, + "learning_rate": 7.383419689119171e-06, + "loss": 0.579, + "mean_token_accuracy": 0.91215580701828, + "num_tokens": 2897200.0, + "step": 1617 + }, + { + "epoch": 0.2620030766739535, + "grad_norm": 24.186052322387695, + "learning_rate": 7.381800518134715e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.8960174918174744, + "num_tokens": 2898991.0, + "step": 1618 + }, + { + "epoch": 0.26216500688203387, + "grad_norm": 18.09012794494629, + "learning_rate": 7.380181347150259e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.9062369465827942, + "num_tokens": 2900780.0, + "step": 1619 + }, + { + "epoch": 0.2623269370901142, + "grad_norm": 21.075101852416992, + "learning_rate": 7.378562176165803e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.9239674508571625, + "num_tokens": 2902569.0, + "step": 1620 + }, + { + "epoch": 0.2624888672981945, + "grad_norm": 20.784603118896484, + "learning_rate": 7.376943005181348e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.9206287264823914, + "num_tokens": 2904358.0, + "step": 1621 + }, + { + "epoch": 0.2626507975062748, + "grad_norm": 26.173858642578125, + "learning_rate": 7.375323834196891e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.8859971463680267, + "num_tokens": 2906150.0, + "step": 1622 + }, + { + "epoch": 0.2628127277143551, + "grad_norm": 26.334566116333008, + "learning_rate": 7.373704663212436e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.9070670008659363, + "num_tokens": 2907942.0, + "step": 1623 + }, + { + "epoch": 0.2629746579224354, + "grad_norm": 20.980764389038086, + "learning_rate": 7.372085492227979e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9102904498577118, + "num_tokens": 2909734.0, + "step": 1624 + }, + { + "epoch": 0.26313658813051577, + "grad_norm": 18.446298599243164, + "learning_rate": 7.370466321243524e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.9192083179950714, + "num_tokens": 2911518.0, + "step": 1625 + }, + { + "epoch": 0.2632985183385961, + "grad_norm": 19.16767692565918, + "learning_rate": 7.368847150259067e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.901695728302002, + "num_tokens": 2913304.0, + "step": 1626 + }, + { + "epoch": 0.2634604485466764, + "grad_norm": 21.383743286132812, + "learning_rate": 7.367227979274612e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.9109818339347839, + "num_tokens": 2915086.0, + "step": 1627 + }, + { + "epoch": 0.2636223787547567, + "grad_norm": 24.10162353515625, + "learning_rate": 7.365608808290155e-06, + "loss": 0.833, + "mean_token_accuracy": 0.9025388360023499, + "num_tokens": 2916875.0, + "step": 1628 + }, + { + "epoch": 0.263784308962837, + "grad_norm": 19.051870346069336, + "learning_rate": 7.3639896373057e-06, + "loss": 0.624, + "mean_token_accuracy": 0.9239382445812225, + "num_tokens": 2918675.0, + "step": 1629 + }, + { + "epoch": 0.2639462391709173, + "grad_norm": 21.700407028198242, + "learning_rate": 7.362370466321243e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.9085765480995178, + "num_tokens": 2920471.0, + "step": 1630 + }, + { + "epoch": 0.26410816937899767, + "grad_norm": 25.165332794189453, + "learning_rate": 7.360751295336788e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.9000000059604645, + "num_tokens": 2922273.0, + "step": 1631 + }, + { + "epoch": 0.264270099587078, + "grad_norm": 30.421682357788086, + "learning_rate": 7.3591321243523314e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.8911159336566925, + "num_tokens": 2924070.0, + "step": 1632 + }, + { + "epoch": 0.2644320297951583, + "grad_norm": 20.775630950927734, + "learning_rate": 7.357512953367876e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.9061861932277679, + "num_tokens": 2925869.0, + "step": 1633 + }, + { + "epoch": 0.2645939600032386, + "grad_norm": 15.506750106811523, + "learning_rate": 7.3558937823834195e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9258370995521545, + "num_tokens": 2927664.0, + "step": 1634 + }, + { + "epoch": 0.2647558902113189, + "grad_norm": 20.23809814453125, + "learning_rate": 7.354274611398964e-06, + "loss": 0.606, + "mean_token_accuracy": 0.9184607565402985, + "num_tokens": 2929458.0, + "step": 1635 + }, + { + "epoch": 0.26491782041939926, + "grad_norm": 25.841655731201172, + "learning_rate": 7.3526554404145075e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.9018123745918274, + "num_tokens": 2931244.0, + "step": 1636 + }, + { + "epoch": 0.26507975062747957, + "grad_norm": 27.68209457397461, + "learning_rate": 7.351036269430052e-06, + "loss": 0.746, + "mean_token_accuracy": 0.9044131338596344, + "num_tokens": 2933048.0, + "step": 1637 + }, + { + "epoch": 0.2652416808355599, + "grad_norm": 18.19292640686035, + "learning_rate": 7.3494170984455955e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.9142778217792511, + "num_tokens": 2934829.0, + "step": 1638 + }, + { + "epoch": 0.2654036110436402, + "grad_norm": 21.33133888244629, + "learning_rate": 7.34779792746114e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.9137547016143799, + "num_tokens": 2936619.0, + "step": 1639 + }, + { + "epoch": 0.2655655412517205, + "grad_norm": 24.50990867614746, + "learning_rate": 7.346178756476684e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.8989050984382629, + "num_tokens": 2938408.0, + "step": 1640 + }, + { + "epoch": 0.2657274714598008, + "grad_norm": 26.69629669189453, + "learning_rate": 7.344559585492228e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.8951023519039154, + "num_tokens": 2940216.0, + "step": 1641 + }, + { + "epoch": 0.26588940166788116, + "grad_norm": 28.35828971862793, + "learning_rate": 7.342940414507773e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.8862600028514862, + "num_tokens": 2942009.0, + "step": 1642 + }, + { + "epoch": 0.26605133187596147, + "grad_norm": 26.645549774169922, + "learning_rate": 7.341321243523317e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.897817462682724, + "num_tokens": 2943805.0, + "step": 1643 + }, + { + "epoch": 0.2662132620840418, + "grad_norm": 20.410158157348633, + "learning_rate": 7.339702072538861e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.9127601683139801, + "num_tokens": 2945593.0, + "step": 1644 + }, + { + "epoch": 0.2663751922921221, + "grad_norm": 24.349576950073242, + "learning_rate": 7.338082901554405e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.8949275612831116, + "num_tokens": 2947381.0, + "step": 1645 + }, + { + "epoch": 0.2665371225002024, + "grad_norm": 25.9034423828125, + "learning_rate": 7.336463730569949e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.9089070856571198, + "num_tokens": 2949178.0, + "step": 1646 + }, + { + "epoch": 0.26669905270828276, + "grad_norm": 18.9997501373291, + "learning_rate": 7.334844559585494e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.9262779355049133, + "num_tokens": 2950975.0, + "step": 1647 + }, + { + "epoch": 0.26686098291636307, + "grad_norm": 26.151365280151367, + "learning_rate": 7.333225388601037e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.8992022573947906, + "num_tokens": 2952766.0, + "step": 1648 + }, + { + "epoch": 0.2670229131244434, + "grad_norm": 21.182880401611328, + "learning_rate": 7.331606217616582e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.9083566069602966, + "num_tokens": 2954551.0, + "step": 1649 + }, + { + "epoch": 0.2671848433325237, + "grad_norm": 22.718381881713867, + "learning_rate": 7.329987046632125e-06, + "loss": 0.6221, + "mean_token_accuracy": 0.9123508334159851, + "num_tokens": 2956337.0, + "step": 1650 + }, + { + "epoch": 0.267346773540604, + "grad_norm": 12.982319831848145, + "learning_rate": 7.32836787564767e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.9285956621170044, + "num_tokens": 2958115.0, + "step": 1651 + }, + { + "epoch": 0.2675087037486843, + "grad_norm": 20.37946319580078, + "learning_rate": 7.326748704663213e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.9067419171333313, + "num_tokens": 2959905.0, + "step": 1652 + }, + { + "epoch": 0.26767063395676466, + "grad_norm": 20.764528274536133, + "learning_rate": 7.325129533678758e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.9079882204532623, + "num_tokens": 2961700.0, + "step": 1653 + }, + { + "epoch": 0.26783256416484497, + "grad_norm": 23.021770477294922, + "learning_rate": 7.3235103626943014e-06, + "loss": 0.666, + "mean_token_accuracy": 0.9037662148475647, + "num_tokens": 2963493.0, + "step": 1654 + }, + { + "epoch": 0.2679944943729253, + "grad_norm": 21.963050842285156, + "learning_rate": 7.321891191709846e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.9121578335762024, + "num_tokens": 2965278.0, + "step": 1655 + }, + { + "epoch": 0.2681564245810056, + "grad_norm": 22.99561309814453, + "learning_rate": 7.3202720207253895e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.904386967420578, + "num_tokens": 2967072.0, + "step": 1656 + }, + { + "epoch": 0.2683183547890859, + "grad_norm": 24.727148056030273, + "learning_rate": 7.318652849740934e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.9037568271160126, + "num_tokens": 2968864.0, + "step": 1657 + }, + { + "epoch": 0.2684802849971662, + "grad_norm": 20.770639419555664, + "learning_rate": 7.3170336787564775e-06, + "loss": 0.655, + "mean_token_accuracy": 0.9138047397136688, + "num_tokens": 2970654.0, + "step": 1658 + }, + { + "epoch": 0.26864221520524656, + "grad_norm": 25.987524032592773, + "learning_rate": 7.315414507772022e-06, + "loss": 0.707, + "mean_token_accuracy": 0.9010467231273651, + "num_tokens": 2972439.0, + "step": 1659 + }, + { + "epoch": 0.26880414541332687, + "grad_norm": 22.091861724853516, + "learning_rate": 7.3137953367875655e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.913192093372345, + "num_tokens": 2974227.0, + "step": 1660 + }, + { + "epoch": 0.2689660756214072, + "grad_norm": 21.8024845123291, + "learning_rate": 7.31217616580311e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.9130645990371704, + "num_tokens": 2976026.0, + "step": 1661 + }, + { + "epoch": 0.2691280058294875, + "grad_norm": 16.567657470703125, + "learning_rate": 7.3105569948186535e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 2977816.0, + "step": 1662 + }, + { + "epoch": 0.2692899360375678, + "grad_norm": 25.517282485961914, + "learning_rate": 7.308937823834198e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.9107862710952759, + "num_tokens": 2979608.0, + "step": 1663 + }, + { + "epoch": 0.26945186624564815, + "grad_norm": 13.7872896194458, + "learning_rate": 7.3073186528497416e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.9232734441757202, + "num_tokens": 2981381.0, + "step": 1664 + }, + { + "epoch": 0.26961379645372846, + "grad_norm": 17.210819244384766, + "learning_rate": 7.305699481865286e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.9283071458339691, + "num_tokens": 2983172.0, + "step": 1665 + }, + { + "epoch": 0.26977572666180877, + "grad_norm": 20.563995361328125, + "learning_rate": 7.3040803108808304e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.9100719392299652, + "num_tokens": 2984962.0, + "step": 1666 + }, + { + "epoch": 0.2699376568698891, + "grad_norm": 20.487215042114258, + "learning_rate": 7.302461139896374e-06, + "loss": 0.601, + "mean_token_accuracy": 0.9241819083690643, + "num_tokens": 2986764.0, + "step": 1667 + }, + { + "epoch": 0.2700995870779694, + "grad_norm": 19.923784255981445, + "learning_rate": 7.3008419689119185e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.9166505932807922, + "num_tokens": 2988564.0, + "step": 1668 + }, + { + "epoch": 0.2702615172860497, + "grad_norm": 28.150285720825195, + "learning_rate": 7.299222797927462e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.8988725543022156, + "num_tokens": 2990362.0, + "step": 1669 + }, + { + "epoch": 0.27042344749413005, + "grad_norm": 25.16350746154785, + "learning_rate": 7.2976036269430065e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.9053639471530914, + "num_tokens": 2992149.0, + "step": 1670 + }, + { + "epoch": 0.27058537770221036, + "grad_norm": 31.642532348632812, + "learning_rate": 7.29598445595855e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.8931206166744232, + "num_tokens": 2993950.0, + "step": 1671 + }, + { + "epoch": 0.27074730791029067, + "grad_norm": 29.09742546081543, + "learning_rate": 7.2943652849740945e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.9048761129379272, + "num_tokens": 2995745.0, + "step": 1672 + }, + { + "epoch": 0.270909238118371, + "grad_norm": 29.280715942382812, + "learning_rate": 7.292746113989638e-06, + "loss": 0.705, + "mean_token_accuracy": 0.8885361850261688, + "num_tokens": 2997553.0, + "step": 1673 + }, + { + "epoch": 0.2710711683264513, + "grad_norm": 22.281274795532227, + "learning_rate": 7.2911269430051825e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.9023892879486084, + "num_tokens": 2999352.0, + "step": 1674 + }, + { + "epoch": 0.2712330985345316, + "grad_norm": 27.497831344604492, + "learning_rate": 7.289507772020726e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.9021909236907959, + "num_tokens": 3001141.0, + "step": 1675 + }, + { + "epoch": 0.27139502874261195, + "grad_norm": 27.839120864868164, + "learning_rate": 7.2878886010362706e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.9010110795497894, + "num_tokens": 3002946.0, + "step": 1676 + }, + { + "epoch": 0.27155695895069226, + "grad_norm": 22.24791145324707, + "learning_rate": 7.286269430051814e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.9094203114509583, + "num_tokens": 3004734.0, + "step": 1677 + }, + { + "epoch": 0.27171888915877257, + "grad_norm": 26.267562866210938, + "learning_rate": 7.284650259067359e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9125371277332306, + "num_tokens": 3006542.0, + "step": 1678 + }, + { + "epoch": 0.2718808193668529, + "grad_norm": 22.462438583374023, + "learning_rate": 7.283031088082902e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.9124087691307068, + "num_tokens": 3008328.0, + "step": 1679 + }, + { + "epoch": 0.2720427495749332, + "grad_norm": 25.66066551208496, + "learning_rate": 7.281411917098447e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.9113828539848328, + "num_tokens": 3010122.0, + "step": 1680 + }, + { + "epoch": 0.27220467978301355, + "grad_norm": 26.425907135009766, + "learning_rate": 7.27979274611399e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.9058688282966614, + "num_tokens": 3011920.0, + "step": 1681 + }, + { + "epoch": 0.27236660999109386, + "grad_norm": 21.59770965576172, + "learning_rate": 7.278173575129535e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.894313633441925, + "num_tokens": 3013708.0, + "step": 1682 + }, + { + "epoch": 0.27252854019917416, + "grad_norm": 26.245925903320312, + "learning_rate": 7.276554404145078e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.8905109763145447, + "num_tokens": 3015494.0, + "step": 1683 + }, + { + "epoch": 0.27269047040725447, + "grad_norm": 28.3322811126709, + "learning_rate": 7.274935233160623e-06, + "loss": 0.85, + "mean_token_accuracy": 0.8906124830245972, + "num_tokens": 3017289.0, + "step": 1684 + }, + { + "epoch": 0.2728524006153348, + "grad_norm": 20.236648559570312, + "learning_rate": 7.273316062176167e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.9073867499828339, + "num_tokens": 3019082.0, + "step": 1685 + }, + { + "epoch": 0.2730143308234151, + "grad_norm": 22.558427810668945, + "learning_rate": 7.271696891191711e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.901033878326416, + "num_tokens": 3020877.0, + "step": 1686 + }, + { + "epoch": 0.27317626103149545, + "grad_norm": 20.01369857788086, + "learning_rate": 7.270077720207255e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.9148764908313751, + "num_tokens": 3022671.0, + "step": 1687 + }, + { + "epoch": 0.27333819123957576, + "grad_norm": 19.840303421020508, + "learning_rate": 7.268458549222799e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.9163931012153625, + "num_tokens": 3024458.0, + "step": 1688 + }, + { + "epoch": 0.27350012144765606, + "grad_norm": 24.691333770751953, + "learning_rate": 7.266839378238343e-06, + "loss": 0.717, + "mean_token_accuracy": 0.8982333838939667, + "num_tokens": 3026245.0, + "step": 1689 + }, + { + "epoch": 0.27366205165573637, + "grad_norm": 22.79033660888672, + "learning_rate": 7.265220207253887e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.9070870876312256, + "num_tokens": 3028037.0, + "step": 1690 + }, + { + "epoch": 0.2738239818638167, + "grad_norm": 15.833498001098633, + "learning_rate": 7.263601036269431e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.9146759510040283, + "num_tokens": 3029818.0, + "step": 1691 + }, + { + "epoch": 0.273985912071897, + "grad_norm": 17.392908096313477, + "learning_rate": 7.261981865284975e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.9181021451950073, + "num_tokens": 3031611.0, + "step": 1692 + }, + { + "epoch": 0.27414784227997735, + "grad_norm": 24.336748123168945, + "learning_rate": 7.260362694300519e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.8917735517024994, + "num_tokens": 3033400.0, + "step": 1693 + }, + { + "epoch": 0.27430977248805766, + "grad_norm": 19.489585876464844, + "learning_rate": 7.258743523316063e-06, + "loss": 0.67, + "mean_token_accuracy": 0.9124087393283844, + "num_tokens": 3035186.0, + "step": 1694 + }, + { + "epoch": 0.27447170269613796, + "grad_norm": 26.379592895507812, + "learning_rate": 7.257124352331607e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.8939060866832733, + "num_tokens": 3036981.0, + "step": 1695 + }, + { + "epoch": 0.27463363290421827, + "grad_norm": 18.416854858398438, + "learning_rate": 7.255505181347151e-06, + "loss": 0.548, + "mean_token_accuracy": 0.9272640645503998, + "num_tokens": 3038768.0, + "step": 1696 + }, + { + "epoch": 0.2747955631122986, + "grad_norm": 21.19996452331543, + "learning_rate": 7.253886010362695e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.9184423983097076, + "num_tokens": 3040561.0, + "step": 1697 + }, + { + "epoch": 0.27495749332037894, + "grad_norm": 17.445430755615234, + "learning_rate": 7.252266839378239e-06, + "loss": 0.599, + "mean_token_accuracy": 0.9213643670082092, + "num_tokens": 3042353.0, + "step": 1698 + }, + { + "epoch": 0.27511942352845925, + "grad_norm": 16.73468589782715, + "learning_rate": 7.250647668393783e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 3044139.0, + "step": 1699 + }, + { + "epoch": 0.27528135373653956, + "grad_norm": 24.797021865844727, + "learning_rate": 7.249028497409327e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.9191596508026123, + "num_tokens": 3045947.0, + "step": 1700 + }, + { + "epoch": 0.27544328394461987, + "grad_norm": 21.3189754486084, + "learning_rate": 7.247409326424871e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.9075706899166107, + "num_tokens": 3047740.0, + "step": 1701 + }, + { + "epoch": 0.2756052141527002, + "grad_norm": 17.231767654418945, + "learning_rate": 7.245790155440415e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.9214285910129547, + "num_tokens": 3049532.0, + "step": 1702 + }, + { + "epoch": 0.2757671443607805, + "grad_norm": 27.29370880126953, + "learning_rate": 7.244170984455959e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.8919772505760193, + "num_tokens": 3051322.0, + "step": 1703 + }, + { + "epoch": 0.27592907456886084, + "grad_norm": 25.817386627197266, + "learning_rate": 7.242551813471504e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.8861111104488373, + "num_tokens": 3053113.0, + "step": 1704 + }, + { + "epoch": 0.27609100477694115, + "grad_norm": 17.878170013427734, + "learning_rate": 7.240932642487047e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.9097945094108582, + "num_tokens": 3054903.0, + "step": 1705 + }, + { + "epoch": 0.27625293498502146, + "grad_norm": 25.780912399291992, + "learning_rate": 7.239313471502592e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.8893822431564331, + "num_tokens": 3056703.0, + "step": 1706 + }, + { + "epoch": 0.27641486519310177, + "grad_norm": 35.73030471801758, + "learning_rate": 7.237694300518135e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.8657047748565674, + "num_tokens": 3058510.0, + "step": 1707 + }, + { + "epoch": 0.2765767954011821, + "grad_norm": 22.286853790283203, + "learning_rate": 7.23607512953368e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.9059281051158905, + "num_tokens": 3060298.0, + "step": 1708 + }, + { + "epoch": 0.2767387256092624, + "grad_norm": 31.68348503112793, + "learning_rate": 7.234455958549223e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.8848258852958679, + "num_tokens": 3062094.0, + "step": 1709 + }, + { + "epoch": 0.27690065581734274, + "grad_norm": 34.010719299316406, + "learning_rate": 7.232836787564768e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.8824006617069244, + "num_tokens": 3063887.0, + "step": 1710 + }, + { + "epoch": 0.27706258602542305, + "grad_norm": 21.52349281311035, + "learning_rate": 7.2312176165803114e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.9139194190502167, + "num_tokens": 3065689.0, + "step": 1711 + }, + { + "epoch": 0.27722451623350336, + "grad_norm": 37.346290588378906, + "learning_rate": 7.229598445595856e-06, + "loss": 1.0875, + "mean_token_accuracy": 0.8657718300819397, + "num_tokens": 3067499.0, + "step": 1712 + }, + { + "epoch": 0.27738644644158367, + "grad_norm": 27.85877799987793, + "learning_rate": 7.2279792746113995e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.8947355449199677, + "num_tokens": 3069296.0, + "step": 1713 + }, + { + "epoch": 0.277548376649664, + "grad_norm": 23.893590927124023, + "learning_rate": 7.226360103626944e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.89560467004776, + "num_tokens": 3071095.0, + "step": 1714 + }, + { + "epoch": 0.27771030685774434, + "grad_norm": 20.309354782104492, + "learning_rate": 7.2247409326424875e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.9127551019191742, + "num_tokens": 3072894.0, + "step": 1715 + }, + { + "epoch": 0.27787223706582465, + "grad_norm": 26.50943946838379, + "learning_rate": 7.223121761658032e-06, + "loss": 0.735, + "mean_token_accuracy": 0.9046299159526825, + "num_tokens": 3074689.0, + "step": 1716 + }, + { + "epoch": 0.27803416727390495, + "grad_norm": 20.479528427124023, + "learning_rate": 7.2215025906735755e-06, + "loss": 0.5773, + "mean_token_accuracy": 0.9187424778938293, + "num_tokens": 3076484.0, + "step": 1717 + }, + { + "epoch": 0.27819609748198526, + "grad_norm": 16.82716178894043, + "learning_rate": 7.21988341968912e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.9288500547409058, + "num_tokens": 3078277.0, + "step": 1718 + }, + { + "epoch": 0.27835802769006557, + "grad_norm": 19.977067947387695, + "learning_rate": 7.2182642487046635e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.9223257005214691, + "num_tokens": 3080071.0, + "step": 1719 + }, + { + "epoch": 0.2785199578981459, + "grad_norm": 22.197608947753906, + "learning_rate": 7.216645077720208e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.8984741866588593, + "num_tokens": 3081857.0, + "step": 1720 + }, + { + "epoch": 0.27868188810622624, + "grad_norm": 24.387502670288086, + "learning_rate": 7.2150259067357516e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.8981804847717285, + "num_tokens": 3083644.0, + "step": 1721 + }, + { + "epoch": 0.27884381831430655, + "grad_norm": 20.709949493408203, + "learning_rate": 7.213406735751296e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.9100719392299652, + "num_tokens": 3085434.0, + "step": 1722 + }, + { + "epoch": 0.27900574852238685, + "grad_norm": 20.250320434570312, + "learning_rate": 7.2117875647668404e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.9085853397846222, + "num_tokens": 3087230.0, + "step": 1723 + }, + { + "epoch": 0.27916767873046716, + "grad_norm": 21.463279724121094, + "learning_rate": 7.210168393782384e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.8989899158477783, + "num_tokens": 3089018.0, + "step": 1724 + }, + { + "epoch": 0.27932960893854747, + "grad_norm": 23.75426483154297, + "learning_rate": 7.2085492227979285e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.9062369465827942, + "num_tokens": 3090807.0, + "step": 1725 + }, + { + "epoch": 0.2794915391466278, + "grad_norm": 17.86284828186035, + "learning_rate": 7.206930051813472e-06, + "loss": 0.5551, + "mean_token_accuracy": 0.920451283454895, + "num_tokens": 3092608.0, + "step": 1726 + }, + { + "epoch": 0.27965346935470814, + "grad_norm": 21.230356216430664, + "learning_rate": 7.2053108808290165e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.9052238762378693, + "num_tokens": 3094394.0, + "step": 1727 + }, + { + "epoch": 0.27981539956278845, + "grad_norm": 27.15345573425293, + "learning_rate": 7.20369170984456e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.8992336392402649, + "num_tokens": 3096184.0, + "step": 1728 + }, + { + "epoch": 0.27997732977086875, + "grad_norm": 18.63810920715332, + "learning_rate": 7.2020725388601045e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.9218443036079407, + "num_tokens": 3097978.0, + "step": 1729 + }, + { + "epoch": 0.28013925997894906, + "grad_norm": 19.0826416015625, + "learning_rate": 7.200453367875648e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.9006539285182953, + "num_tokens": 3099772.0, + "step": 1730 + }, + { + "epoch": 0.28030119018702937, + "grad_norm": 22.469587326049805, + "learning_rate": 7.1988341968911925e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.9026418626308441, + "num_tokens": 3101570.0, + "step": 1731 + }, + { + "epoch": 0.28046312039510973, + "grad_norm": 18.425203323364258, + "learning_rate": 7.197215025906736e-06, + "loss": 0.5702, + "mean_token_accuracy": 0.9202331602573395, + "num_tokens": 3103358.0, + "step": 1732 + }, + { + "epoch": 0.28062505060319004, + "grad_norm": 17.37875747680664, + "learning_rate": 7.1955958549222806e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.924378901720047, + "num_tokens": 3105148.0, + "step": 1733 + }, + { + "epoch": 0.28078698081127035, + "grad_norm": 28.775325775146484, + "learning_rate": 7.193976683937824e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.8898792564868927, + "num_tokens": 3106949.0, + "step": 1734 + }, + { + "epoch": 0.28094891101935066, + "grad_norm": 26.76213264465332, + "learning_rate": 7.192357512953369e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.9092342555522919, + "num_tokens": 3108746.0, + "step": 1735 + }, + { + "epoch": 0.28111084122743096, + "grad_norm": 21.826457977294922, + "learning_rate": 7.190738341968912e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.9163931012153625, + "num_tokens": 3110533.0, + "step": 1736 + }, + { + "epoch": 0.28127277143551127, + "grad_norm": 19.561569213867188, + "learning_rate": 7.189119170984457e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.921950489282608, + "num_tokens": 3112327.0, + "step": 1737 + }, + { + "epoch": 0.28143470164359163, + "grad_norm": 16.025388717651367, + "learning_rate": 7.1875e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.9285677969455719, + "num_tokens": 3114119.0, + "step": 1738 + }, + { + "epoch": 0.28159663185167194, + "grad_norm": 21.772972106933594, + "learning_rate": 7.185880829015545e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.9229403436183929, + "num_tokens": 3115891.0, + "step": 1739 + }, + { + "epoch": 0.28175856205975225, + "grad_norm": 31.67912483215332, + "learning_rate": 7.184261658031088e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.8773398995399475, + "num_tokens": 3117688.0, + "step": 1740 + }, + { + "epoch": 0.28192049226783256, + "grad_norm": 24.92591094970703, + "learning_rate": 7.182642487046633e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.9142156839370728, + "num_tokens": 3119480.0, + "step": 1741 + }, + { + "epoch": 0.28208242247591286, + "grad_norm": 16.902267456054688, + "learning_rate": 7.181023316062177e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.9216987490653992, + "num_tokens": 3121272.0, + "step": 1742 + }, + { + "epoch": 0.28224435268399317, + "grad_norm": 18.500478744506836, + "learning_rate": 7.179404145077721e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.9199725091457367, + "num_tokens": 3123059.0, + "step": 1743 + }, + { + "epoch": 0.28240628289207353, + "grad_norm": 25.750652313232422, + "learning_rate": 7.177784974093265e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.8999382853507996, + "num_tokens": 3124850.0, + "step": 1744 + }, + { + "epoch": 0.28256821310015384, + "grad_norm": 21.59621238708496, + "learning_rate": 7.176165803108809e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.9203707277774811, + "num_tokens": 3126651.0, + "step": 1745 + }, + { + "epoch": 0.28273014330823415, + "grad_norm": 14.908434867858887, + "learning_rate": 7.174546632124353e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.9211378395557404, + "num_tokens": 3128442.0, + "step": 1746 + }, + { + "epoch": 0.28289207351631446, + "grad_norm": 8.731510162353516, + "learning_rate": 7.172927461139897e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.9343991577625275, + "num_tokens": 3130228.0, + "step": 1747 + }, + { + "epoch": 0.28305400372439476, + "grad_norm": 22.36594581604004, + "learning_rate": 7.171308290155441e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.9206026494503021, + "num_tokens": 3132017.0, + "step": 1748 + }, + { + "epoch": 0.28321593393247513, + "grad_norm": 26.78276252746582, + "learning_rate": 7.169689119170985e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.8984403610229492, + "num_tokens": 3133805.0, + "step": 1749 + }, + { + "epoch": 0.28337786414055544, + "grad_norm": 22.339689254760742, + "learning_rate": 7.168069948186529e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.9100719392299652, + "num_tokens": 3135595.0, + "step": 1750 + }, + { + "epoch": 0.28353979434863574, + "grad_norm": 24.507793426513672, + "learning_rate": 7.166450777202073e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.9116646647453308, + "num_tokens": 3137390.0, + "step": 1751 + }, + { + "epoch": 0.28370172455671605, + "grad_norm": 31.657018661499023, + "learning_rate": 7.164831606217617e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.8889145851135254, + "num_tokens": 3139181.0, + "step": 1752 + }, + { + "epoch": 0.28386365476479636, + "grad_norm": 25.023855209350586, + "learning_rate": 7.163212435233161e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.920797735452652, + "num_tokens": 3140958.0, + "step": 1753 + }, + { + "epoch": 0.28402558497287667, + "grad_norm": 25.699691772460938, + "learning_rate": 7.161593264248705e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.9119867086410522, + "num_tokens": 3142754.0, + "step": 1754 + }, + { + "epoch": 0.28418751518095703, + "grad_norm": 23.910308837890625, + "learning_rate": 7.159974093264249e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.9109405279159546, + "num_tokens": 3144547.0, + "step": 1755 + }, + { + "epoch": 0.28434944538903734, + "grad_norm": 27.815820693969727, + "learning_rate": 7.158354922279793e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.9046299159526825, + "num_tokens": 3146342.0, + "step": 1756 + }, + { + "epoch": 0.28451137559711764, + "grad_norm": 13.852578163146973, + "learning_rate": 7.156735751295337e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9316923022270203, + "num_tokens": 3148132.0, + "step": 1757 + }, + { + "epoch": 0.28467330580519795, + "grad_norm": 27.006715774536133, + "learning_rate": 7.155116580310881e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.8957136273384094, + "num_tokens": 3149922.0, + "step": 1758 + }, + { + "epoch": 0.28483523601327826, + "grad_norm": 27.09081268310547, + "learning_rate": 7.153497409326425e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.9068683385848999, + "num_tokens": 3151724.0, + "step": 1759 + }, + { + "epoch": 0.2849971662213586, + "grad_norm": 18.78533935546875, + "learning_rate": 7.151878238341969e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.9096638560295105, + "num_tokens": 3153512.0, + "step": 1760 + }, + { + "epoch": 0.28515909642943893, + "grad_norm": 21.3704833984375, + "learning_rate": 7.150259067357514e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.9099134206771851, + "num_tokens": 3155301.0, + "step": 1761 + }, + { + "epoch": 0.28532102663751924, + "grad_norm": 26.896732330322266, + "learning_rate": 7.148639896373057e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.8941701948642731, + "num_tokens": 3157097.0, + "step": 1762 + }, + { + "epoch": 0.28548295684559954, + "grad_norm": 21.581382751464844, + "learning_rate": 7.147020725388602e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9101685881614685, + "num_tokens": 3158898.0, + "step": 1763 + }, + { + "epoch": 0.28564488705367985, + "grad_norm": 23.770286560058594, + "learning_rate": 7.145401554404145e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.9125587046146393, + "num_tokens": 3160684.0, + "step": 1764 + }, + { + "epoch": 0.28580681726176016, + "grad_norm": 24.074739456176758, + "learning_rate": 7.14378238341969e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9010489284992218, + "num_tokens": 3162479.0, + "step": 1765 + }, + { + "epoch": 0.2859687474698405, + "grad_norm": 29.391033172607422, + "learning_rate": 7.1421632124352334e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.8920139968395233, + "num_tokens": 3164278.0, + "step": 1766 + }, + { + "epoch": 0.28613067767792083, + "grad_norm": 20.008272171020508, + "learning_rate": 7.140544041450778e-06, + "loss": 0.606, + "mean_token_accuracy": 0.9132780730724335, + "num_tokens": 3166067.0, + "step": 1767 + }, + { + "epoch": 0.28629260788600114, + "grad_norm": 29.234251022338867, + "learning_rate": 7.1389248704663215e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.8939980864524841, + "num_tokens": 3167861.0, + "step": 1768 + }, + { + "epoch": 0.28645453809408145, + "grad_norm": 27.4943904876709, + "learning_rate": 7.137305699481866e-06, + "loss": 0.756, + "mean_token_accuracy": 0.8960067927837372, + "num_tokens": 3169652.0, + "step": 1769 + }, + { + "epoch": 0.28661646830216175, + "grad_norm": 23.386791229248047, + "learning_rate": 7.1356865284974095e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.9003976583480835, + "num_tokens": 3171435.0, + "step": 1770 + }, + { + "epoch": 0.28677839851024206, + "grad_norm": 25.822498321533203, + "learning_rate": 7.134067357512954e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.8897051215171814, + "num_tokens": 3173228.0, + "step": 1771 + }, + { + "epoch": 0.2869403287183224, + "grad_norm": 33.0545654296875, + "learning_rate": 7.1324481865284975e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.8997684121131897, + "num_tokens": 3175028.0, + "step": 1772 + }, + { + "epoch": 0.28710225892640273, + "grad_norm": 16.903104782104492, + "learning_rate": 7.130829015544042e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.9195243418216705, + "num_tokens": 3176801.0, + "step": 1773 + }, + { + "epoch": 0.28726418913448304, + "grad_norm": 28.1411190032959, + "learning_rate": 7.1292098445595855e-06, + "loss": 0.827, + "mean_token_accuracy": 0.8897902071475983, + "num_tokens": 3178585.0, + "step": 1774 + }, + { + "epoch": 0.28742611934256335, + "grad_norm": 22.95647430419922, + "learning_rate": 7.12759067357513e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.9054373502731323, + "num_tokens": 3180382.0, + "step": 1775 + }, + { + "epoch": 0.28758804955064365, + "grad_norm": 17.32350730895996, + "learning_rate": 7.1259715025906736e-06, + "loss": 0.5548, + "mean_token_accuracy": 0.927619993686676, + "num_tokens": 3182184.0, + "step": 1776 + }, + { + "epoch": 0.287749979758724, + "grad_norm": 33.343257904052734, + "learning_rate": 7.124352331606218e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.9024864137172699, + "num_tokens": 3183983.0, + "step": 1777 + }, + { + "epoch": 0.2879119099668043, + "grad_norm": 23.17249870300293, + "learning_rate": 7.1227331606217624e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.9051577150821686, + "num_tokens": 3185769.0, + "step": 1778 + }, + { + "epoch": 0.28807384017488463, + "grad_norm": 24.464021682739258, + "learning_rate": 7.121113989637306e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.9078470766544342, + "num_tokens": 3187563.0, + "step": 1779 + }, + { + "epoch": 0.28823577038296494, + "grad_norm": 25.535337448120117, + "learning_rate": 7.1194948186528505e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.9045390188694, + "num_tokens": 3189366.0, + "step": 1780 + }, + { + "epoch": 0.28839770059104525, + "grad_norm": 27.85541534423828, + "learning_rate": 7.117875647668394e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.9077968001365662, + "num_tokens": 3191160.0, + "step": 1781 + }, + { + "epoch": 0.28855963079912555, + "grad_norm": 19.054771423339844, + "learning_rate": 7.1162564766839385e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.9195241630077362, + "num_tokens": 3192958.0, + "step": 1782 + }, + { + "epoch": 0.2887215610072059, + "grad_norm": 25.0560302734375, + "learning_rate": 7.114637305699482e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.8999121785163879, + "num_tokens": 3194740.0, + "step": 1783 + }, + { + "epoch": 0.2888834912152862, + "grad_norm": 24.85127067565918, + "learning_rate": 7.1130181347150265e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.8858367800712585, + "num_tokens": 3196541.0, + "step": 1784 + }, + { + "epoch": 0.28904542142336653, + "grad_norm": 22.763168334960938, + "learning_rate": 7.11139896373057e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.9073200225830078, + "num_tokens": 3198323.0, + "step": 1785 + }, + { + "epoch": 0.28920735163144684, + "grad_norm": 18.682348251342773, + "learning_rate": 7.1097797927461145e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.9149402678012848, + "num_tokens": 3200128.0, + "step": 1786 + }, + { + "epoch": 0.28936928183952715, + "grad_norm": 26.31068992614746, + "learning_rate": 7.108160621761658e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.9076961874961853, + "num_tokens": 3201922.0, + "step": 1787 + }, + { + "epoch": 0.28953121204760746, + "grad_norm": 23.188451766967773, + "learning_rate": 7.1065414507772026e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.8951772749423981, + "num_tokens": 3203711.0, + "step": 1788 + }, + { + "epoch": 0.2896931422556878, + "grad_norm": 21.997140884399414, + "learning_rate": 7.104922279792746e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.9054292142391205, + "num_tokens": 3205509.0, + "step": 1789 + }, + { + "epoch": 0.2898550724637681, + "grad_norm": 20.585386276245117, + "learning_rate": 7.103303108808291e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.9077979624271393, + "num_tokens": 3207293.0, + "step": 1790 + }, + { + "epoch": 0.29001700267184843, + "grad_norm": 21.726736068725586, + "learning_rate": 7.101683937823834e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.903900682926178, + "num_tokens": 3209076.0, + "step": 1791 + }, + { + "epoch": 0.29017893287992874, + "grad_norm": 21.304414749145508, + "learning_rate": 7.100064766839379e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.9222372174263, + "num_tokens": 3210871.0, + "step": 1792 + }, + { + "epoch": 0.29034086308800905, + "grad_norm": 21.047353744506836, + "learning_rate": 7.098445595854922e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9229517877101898, + "num_tokens": 3212668.0, + "step": 1793 + }, + { + "epoch": 0.2905027932960894, + "grad_norm": 22.387699127197266, + "learning_rate": 7.096826424870467e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.9159420430660248, + "num_tokens": 3214453.0, + "step": 1794 + }, + { + "epoch": 0.2906647235041697, + "grad_norm": 16.854310989379883, + "learning_rate": 7.09520725388601e-06, + "loss": 0.583, + "mean_token_accuracy": 0.9225809872150421, + "num_tokens": 3216249.0, + "step": 1795 + }, + { + "epoch": 0.29082665371225, + "grad_norm": 17.550506591796875, + "learning_rate": 7.093588082901555e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.9181357622146606, + "num_tokens": 3218042.0, + "step": 1796 + }, + { + "epoch": 0.29098858392033033, + "grad_norm": 25.162944793701172, + "learning_rate": 7.091968911917099e-06, + "loss": 0.708, + "mean_token_accuracy": 0.9017778038978577, + "num_tokens": 3219839.0, + "step": 1797 + }, + { + "epoch": 0.29115051412841064, + "grad_norm": 18.8907527923584, + "learning_rate": 7.090349740932643e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.9176688194274902, + "num_tokens": 3221630.0, + "step": 1798 + }, + { + "epoch": 0.29131244433649095, + "grad_norm": 24.114839553833008, + "learning_rate": 7.088730569948187e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.9061359763145447, + "num_tokens": 3223419.0, + "step": 1799 + }, + { + "epoch": 0.2914743745445713, + "grad_norm": 21.52777671813965, + "learning_rate": 7.087111398963731e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.9120011925697327, + "num_tokens": 3225215.0, + "step": 1800 + }, + { + "epoch": 0.2916363047526516, + "grad_norm": 16.458261489868164, + "learning_rate": 7.085492227979275e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.92044797539711, + "num_tokens": 3227003.0, + "step": 1801 + }, + { + "epoch": 0.29179823496073193, + "grad_norm": 24.96923065185547, + "learning_rate": 7.083873056994819e-06, + "loss": 0.656, + "mean_token_accuracy": 0.8987536430358887, + "num_tokens": 3228792.0, + "step": 1802 + }, + { + "epoch": 0.29196016516881224, + "grad_norm": 18.641719818115234, + "learning_rate": 7.082253886010363e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.9200254082679749, + "num_tokens": 3230579.0, + "step": 1803 + }, + { + "epoch": 0.29212209537689254, + "grad_norm": 23.91495704650879, + "learning_rate": 7.080634715025907e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.9035947918891907, + "num_tokens": 3232371.0, + "step": 1804 + }, + { + "epoch": 0.29228402558497285, + "grad_norm": 25.538455963134766, + "learning_rate": 7.079015544041451e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.9019423723220825, + "num_tokens": 3234148.0, + "step": 1805 + }, + { + "epoch": 0.2924459557930532, + "grad_norm": 21.477170944213867, + "learning_rate": 7.077396373056995e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.9062043726444244, + "num_tokens": 3235937.0, + "step": 1806 + }, + { + "epoch": 0.2926078860011335, + "grad_norm": 30.938232421875, + "learning_rate": 7.075777202072539e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.9032630920410156, + "num_tokens": 3237728.0, + "step": 1807 + }, + { + "epoch": 0.29276981620921383, + "grad_norm": 24.801923751831055, + "learning_rate": 7.074158031088083e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.8962906301021576, + "num_tokens": 3239510.0, + "step": 1808 + }, + { + "epoch": 0.29293174641729414, + "grad_norm": 25.62843132019043, + "learning_rate": 7.072538860103627e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.9031884074211121, + "num_tokens": 3241310.0, + "step": 1809 + }, + { + "epoch": 0.29309367662537444, + "grad_norm": 24.041162490844727, + "learning_rate": 7.070919689119171e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.9052418172359467, + "num_tokens": 3243096.0, + "step": 1810 + }, + { + "epoch": 0.2932556068334548, + "grad_norm": 23.216135025024414, + "learning_rate": 7.069300518134715e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.9085957109928131, + "num_tokens": 3244891.0, + "step": 1811 + }, + { + "epoch": 0.2934175370415351, + "grad_norm": 29.72441291809082, + "learning_rate": 7.067681347150259e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.8975876569747925, + "num_tokens": 3246686.0, + "step": 1812 + }, + { + "epoch": 0.2935794672496154, + "grad_norm": 26.464025497436523, + "learning_rate": 7.066062176165803e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.9070670008659363, + "num_tokens": 3248478.0, + "step": 1813 + }, + { + "epoch": 0.29374139745769573, + "grad_norm": 20.82701873779297, + "learning_rate": 7.064443005181347e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.9132352769374847, + "num_tokens": 3250266.0, + "step": 1814 + }, + { + "epoch": 0.29390332766577604, + "grad_norm": 27.885255813598633, + "learning_rate": 7.062823834196891e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.902446061372757, + "num_tokens": 3252067.0, + "step": 1815 + }, + { + "epoch": 0.29406525787385634, + "grad_norm": 24.843610763549805, + "learning_rate": 7.061204663212436e-06, + "loss": 0.6487, + "mean_token_accuracy": 0.9192118346691132, + "num_tokens": 3253864.0, + "step": 1816 + }, + { + "epoch": 0.2942271880819367, + "grad_norm": 25.647825241088867, + "learning_rate": 7.059585492227979e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.9078470766544342, + "num_tokens": 3255658.0, + "step": 1817 + }, + { + "epoch": 0.294389118290017, + "grad_norm": 19.608985900878906, + "learning_rate": 7.057966321243524e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.9183256030082703, + "num_tokens": 3257439.0, + "step": 1818 + }, + { + "epoch": 0.2945510484980973, + "grad_norm": 22.01833724975586, + "learning_rate": 7.056347150259067e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.9080341756343842, + "num_tokens": 3259234.0, + "step": 1819 + }, + { + "epoch": 0.29471297870617763, + "grad_norm": 23.9237060546875, + "learning_rate": 7.054727979274612e-06, + "loss": 0.659, + "mean_token_accuracy": 0.9042107164859772, + "num_tokens": 3261017.0, + "step": 1820 + }, + { + "epoch": 0.29487490891425794, + "grad_norm": 25.75096321105957, + "learning_rate": 7.053108808290155e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.9050742387771606, + "num_tokens": 3262810.0, + "step": 1821 + }, + { + "epoch": 0.29503683912233825, + "grad_norm": 22.666175842285156, + "learning_rate": 7.0514896373057e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.9119242131710052, + "num_tokens": 3264595.0, + "step": 1822 + }, + { + "epoch": 0.2951987693304186, + "grad_norm": 24.539077758789062, + "learning_rate": 7.0498704663212434e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.9094203114509583, + "num_tokens": 3266383.0, + "step": 1823 + }, + { + "epoch": 0.2953606995384989, + "grad_norm": 14.791534423828125, + "learning_rate": 7.048251295336788e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.9255244135856628, + "num_tokens": 3268177.0, + "step": 1824 + }, + { + "epoch": 0.2955226297465792, + "grad_norm": 20.986412048339844, + "learning_rate": 7.0466321243523315e-06, + "loss": 0.641, + "mean_token_accuracy": 0.9122835099697113, + "num_tokens": 3269963.0, + "step": 1825 + }, + { + "epoch": 0.29568455995465953, + "grad_norm": 20.484474182128906, + "learning_rate": 7.045012953367876e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.9147412180900574, + "num_tokens": 3271745.0, + "step": 1826 + }, + { + "epoch": 0.29584649016273984, + "grad_norm": 21.436328887939453, + "learning_rate": 7.0433937823834195e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.9102211594581604, + "num_tokens": 3273524.0, + "step": 1827 + }, + { + "epoch": 0.2960084203708202, + "grad_norm": 20.828622817993164, + "learning_rate": 7.041774611398964e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.9098861515522003, + "num_tokens": 3275314.0, + "step": 1828 + }, + { + "epoch": 0.2961703505789005, + "grad_norm": 25.984052658081055, + "learning_rate": 7.0401554404145075e-06, + "loss": 0.839, + "mean_token_accuracy": 0.8964285552501678, + "num_tokens": 3277106.0, + "step": 1829 + }, + { + "epoch": 0.2963322807869808, + "grad_norm": 17.318105697631836, + "learning_rate": 7.038536269430052e-06, + "loss": 0.55, + "mean_token_accuracy": 0.9267389476299286, + "num_tokens": 3278891.0, + "step": 1830 + }, + { + "epoch": 0.2964942109950611, + "grad_norm": 21.48662567138672, + "learning_rate": 7.0369170984455956e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9152255654335022, + "num_tokens": 3280676.0, + "step": 1831 + }, + { + "epoch": 0.29665614120314143, + "grad_norm": 29.47977638244629, + "learning_rate": 7.03529792746114e-06, + "loss": 0.827, + "mean_token_accuracy": 0.8855248391628265, + "num_tokens": 3282467.0, + "step": 1832 + }, + { + "epoch": 0.29681807141122174, + "grad_norm": 22.44864273071289, + "learning_rate": 7.0336787564766836e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.9077857732772827, + "num_tokens": 3284250.0, + "step": 1833 + }, + { + "epoch": 0.2969800016193021, + "grad_norm": 23.180383682250977, + "learning_rate": 7.032059585492228e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.9025468528270721, + "num_tokens": 3286037.0, + "step": 1834 + }, + { + "epoch": 0.2971419318273824, + "grad_norm": 19.097293853759766, + "learning_rate": 7.030440414507773e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.9049040675163269, + "num_tokens": 3287823.0, + "step": 1835 + }, + { + "epoch": 0.2973038620354627, + "grad_norm": 18.602781295776367, + "learning_rate": 7.028821243523317e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.9163228571414948, + "num_tokens": 3289610.0, + "step": 1836 + }, + { + "epoch": 0.297465792243543, + "grad_norm": 20.606163024902344, + "learning_rate": 7.027202072538861e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9134344756603241, + "num_tokens": 3291399.0, + "step": 1837 + }, + { + "epoch": 0.29762772245162333, + "grad_norm": 29.514991760253906, + "learning_rate": 7.025582901554405e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.9006385803222656, + "num_tokens": 3293191.0, + "step": 1838 + }, + { + "epoch": 0.29778965265970364, + "grad_norm": 24.596303939819336, + "learning_rate": 7.023963730569949e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.9115384519100189, + "num_tokens": 3294986.0, + "step": 1839 + }, + { + "epoch": 0.297951582867784, + "grad_norm": 15.325115203857422, + "learning_rate": 7.022344559585493e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.9207678437232971, + "num_tokens": 3296763.0, + "step": 1840 + }, + { + "epoch": 0.2981135130758643, + "grad_norm": 25.706947326660156, + "learning_rate": 7.020725388601037e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.8916953504085541, + "num_tokens": 3298552.0, + "step": 1841 + }, + { + "epoch": 0.2982754432839446, + "grad_norm": 12.791678428649902, + "learning_rate": 7.019106217616582e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.9204900860786438, + "num_tokens": 3300341.0, + "step": 1842 + }, + { + "epoch": 0.2984373734920249, + "grad_norm": 27.136098861694336, + "learning_rate": 7.017487046632125e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.8882094025611877, + "num_tokens": 3302139.0, + "step": 1843 + }, + { + "epoch": 0.29859930370010523, + "grad_norm": 16.981441497802734, + "learning_rate": 7.01586787564767e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.9280426800251007, + "num_tokens": 3303929.0, + "step": 1844 + }, + { + "epoch": 0.2987612339081856, + "grad_norm": 23.834144592285156, + "learning_rate": 7.0142487046632134e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.8959557414054871, + "num_tokens": 3305727.0, + "step": 1845 + }, + { + "epoch": 0.2989231641162659, + "grad_norm": 17.492734909057617, + "learning_rate": 7.012629533678758e-06, + "loss": 0.534, + "mean_token_accuracy": 0.9230892956256866, + "num_tokens": 3307512.0, + "step": 1846 + }, + { + "epoch": 0.2990850943243462, + "grad_norm": 24.705543518066406, + "learning_rate": 7.0110103626943015e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.894191324710846, + "num_tokens": 3309298.0, + "step": 1847 + }, + { + "epoch": 0.2992470245324265, + "grad_norm": 17.670211791992188, + "learning_rate": 7.009391191709846e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.920273095369339, + "num_tokens": 3311086.0, + "step": 1848 + }, + { + "epoch": 0.2994089547405068, + "grad_norm": 22.412593841552734, + "learning_rate": 7.0077720207253895e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.9022997617721558, + "num_tokens": 3312875.0, + "step": 1849 + }, + { + "epoch": 0.29957088494858714, + "grad_norm": 21.903331756591797, + "learning_rate": 7.006152849740934e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.9199481308460236, + "num_tokens": 3314662.0, + "step": 1850 + }, + { + "epoch": 0.2997328151566675, + "grad_norm": 22.382152557373047, + "learning_rate": 7.0045336787564775e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.9107106626033783, + "num_tokens": 3316455.0, + "step": 1851 + }, + { + "epoch": 0.2998947453647478, + "grad_norm": 16.95178985595703, + "learning_rate": 7.002914507772022e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.9163933396339417, + "num_tokens": 3318242.0, + "step": 1852 + }, + { + "epoch": 0.3000566755728281, + "grad_norm": 21.244850158691406, + "learning_rate": 7.0012953367875655e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.9016563296318054, + "num_tokens": 3320039.0, + "step": 1853 + }, + { + "epoch": 0.3002186057809084, + "grad_norm": 30.909950256347656, + "learning_rate": 6.99967616580311e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.8684979379177094, + "num_tokens": 3321840.0, + "step": 1854 + }, + { + "epoch": 0.30038053598898873, + "grad_norm": 22.09109878540039, + "learning_rate": 6.9980569948186536e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.9082706868648529, + "num_tokens": 3323625.0, + "step": 1855 + }, + { + "epoch": 0.30054246619706904, + "grad_norm": 21.22285270690918, + "learning_rate": 6.996437823834198e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.9114378988742828, + "num_tokens": 3325408.0, + "step": 1856 + }, + { + "epoch": 0.3007043964051494, + "grad_norm": 16.059595108032227, + "learning_rate": 6.994818652849742e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.9188898801803589, + "num_tokens": 3327203.0, + "step": 1857 + }, + { + "epoch": 0.3008663266132297, + "grad_norm": 24.401905059814453, + "learning_rate": 6.993199481865286e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.9079736173152924, + "num_tokens": 3328998.0, + "step": 1858 + }, + { + "epoch": 0.30102825682131, + "grad_norm": 15.001967430114746, + "learning_rate": 6.99158031088083e-06, + "loss": 0.5801, + "mean_token_accuracy": 0.9214604198932648, + "num_tokens": 3330778.0, + "step": 1859 + }, + { + "epoch": 0.3011901870293903, + "grad_norm": 17.746395111083984, + "learning_rate": 6.989961139896374e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.9071033895015717, + "num_tokens": 3332559.0, + "step": 1860 + }, + { + "epoch": 0.30135211723747063, + "grad_norm": 18.552631378173828, + "learning_rate": 6.9883419689119185e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.9260731339454651, + "num_tokens": 3334355.0, + "step": 1861 + }, + { + "epoch": 0.301514047445551, + "grad_norm": 15.442876815795898, + "learning_rate": 6.986722797927462e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.9165207743644714, + "num_tokens": 3336154.0, + "step": 1862 + }, + { + "epoch": 0.3016759776536313, + "grad_norm": 18.62020492553711, + "learning_rate": 6.9851036269430065e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.9289631247520447, + "num_tokens": 3337948.0, + "step": 1863 + }, + { + "epoch": 0.3018379078617116, + "grad_norm": 26.801982879638672, + "learning_rate": 6.98348445595855e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.9023066759109497, + "num_tokens": 3339737.0, + "step": 1864 + }, + { + "epoch": 0.3019998380697919, + "grad_norm": 23.987979888916016, + "learning_rate": 6.9818652849740945e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.9144199192523956, + "num_tokens": 3341529.0, + "step": 1865 + }, + { + "epoch": 0.3021617682778722, + "grad_norm": 25.172924041748047, + "learning_rate": 6.980246113989638e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.9084325432777405, + "num_tokens": 3343325.0, + "step": 1866 + }, + { + "epoch": 0.30232369848595253, + "grad_norm": 15.044196128845215, + "learning_rate": 6.9786269430051826e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9206465184688568, + "num_tokens": 3345114.0, + "step": 1867 + }, + { + "epoch": 0.3024856286940329, + "grad_norm": 21.617923736572266, + "learning_rate": 6.977007772020726e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.8904095888137817, + "num_tokens": 3346909.0, + "step": 1868 + }, + { + "epoch": 0.3026475589021132, + "grad_norm": 23.95100212097168, + "learning_rate": 6.975388601036271e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.8981540501117706, + "num_tokens": 3348696.0, + "step": 1869 + }, + { + "epoch": 0.3028094891101935, + "grad_norm": 19.04384422302246, + "learning_rate": 6.973769430051814e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.9258203208446503, + "num_tokens": 3350491.0, + "step": 1870 + }, + { + "epoch": 0.3029714193182738, + "grad_norm": 16.7266845703125, + "learning_rate": 6.972150259067359e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.9291283786296844, + "num_tokens": 3352271.0, + "step": 1871 + }, + { + "epoch": 0.3031333495263541, + "grad_norm": 18.93291473388672, + "learning_rate": 6.970531088082902e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.9159350991249084, + "num_tokens": 3354068.0, + "step": 1872 + }, + { + "epoch": 0.3032952797344345, + "grad_norm": 18.34059715270996, + "learning_rate": 6.968911917098447e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 3355854.0, + "step": 1873 + }, + { + "epoch": 0.3034572099425148, + "grad_norm": 18.49833869934082, + "learning_rate": 6.96729274611399e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.9137681126594543, + "num_tokens": 3357644.0, + "step": 1874 + }, + { + "epoch": 0.3036191401505951, + "grad_norm": 25.430721282958984, + "learning_rate": 6.965673575129535e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.8993876874446869, + "num_tokens": 3359434.0, + "step": 1875 + }, + { + "epoch": 0.3037810703586754, + "grad_norm": 27.58957862854004, + "learning_rate": 6.964054404145078e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.8917474746704102, + "num_tokens": 3361223.0, + "step": 1876 + }, + { + "epoch": 0.3039430005667557, + "grad_norm": 23.025615692138672, + "learning_rate": 6.962435233160623e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.9156656563282013, + "num_tokens": 3363018.0, + "step": 1877 + }, + { + "epoch": 0.304104930774836, + "grad_norm": 19.629365921020508, + "learning_rate": 6.960816062176166e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.9144199192523956, + "num_tokens": 3364810.0, + "step": 1878 + }, + { + "epoch": 0.3042668609829164, + "grad_norm": 19.332963943481445, + "learning_rate": 6.959196891191711e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.9159740209579468, + "num_tokens": 3366596.0, + "step": 1879 + }, + { + "epoch": 0.3044287911909967, + "grad_norm": 20.373083114624023, + "learning_rate": 6.957577720207255e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.9054433107376099, + "num_tokens": 3368383.0, + "step": 1880 + }, + { + "epoch": 0.304590721399077, + "grad_norm": 13.793593406677246, + "learning_rate": 6.955958549222799e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9342925548553467, + "num_tokens": 3370169.0, + "step": 1881 + }, + { + "epoch": 0.3047526516071573, + "grad_norm": 15.495427131652832, + "learning_rate": 6.954339378238343e-06, + "loss": 0.5459, + "mean_token_accuracy": 0.9253092110157013, + "num_tokens": 3371962.0, + "step": 1882 + }, + { + "epoch": 0.3049145818152376, + "grad_norm": 26.103240966796875, + "learning_rate": 6.952720207253887e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.9024867117404938, + "num_tokens": 3373751.0, + "step": 1883 + }, + { + "epoch": 0.3050765120233179, + "grad_norm": 22.593032836914062, + "learning_rate": 6.951101036269431e-06, + "loss": 0.652, + "mean_token_accuracy": 0.9052592515945435, + "num_tokens": 3375547.0, + "step": 1884 + }, + { + "epoch": 0.3052384422313983, + "grad_norm": 19.726655960083008, + "learning_rate": 6.949481865284975e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.9178866446018219, + "num_tokens": 3377351.0, + "step": 1885 + }, + { + "epoch": 0.3054003724394786, + "grad_norm": 30.640186309814453, + "learning_rate": 6.947862694300519e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.8836050927639008, + "num_tokens": 3379145.0, + "step": 1886 + }, + { + "epoch": 0.3055623026475589, + "grad_norm": 21.694368362426758, + "learning_rate": 6.946243523316063e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.913159966468811, + "num_tokens": 3380933.0, + "step": 1887 + }, + { + "epoch": 0.3057242328556392, + "grad_norm": 24.265811920166016, + "learning_rate": 6.944624352331607e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.9049978852272034, + "num_tokens": 3382719.0, + "step": 1888 + }, + { + "epoch": 0.3058861630637195, + "grad_norm": 25.403676986694336, + "learning_rate": 6.943005181347151e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.897176593542099, + "num_tokens": 3384512.0, + "step": 1889 + }, + { + "epoch": 0.3060480932717999, + "grad_norm": 28.32275390625, + "learning_rate": 6.941386010362695e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.8904704749584198, + "num_tokens": 3386307.0, + "step": 1890 + }, + { + "epoch": 0.3062100234798802, + "grad_norm": 13.344834327697754, + "learning_rate": 6.939766839378239e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9352190792560577, + "num_tokens": 3388096.0, + "step": 1891 + }, + { + "epoch": 0.3063719536879605, + "grad_norm": 19.27501678466797, + "learning_rate": 6.938147668393783e-06, + "loss": 0.642, + "mean_token_accuracy": 0.9164286553859711, + "num_tokens": 3389883.0, + "step": 1892 + }, + { + "epoch": 0.3065338838960408, + "grad_norm": 24.48785400390625, + "learning_rate": 6.936528497409327e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.8967473804950714, + "num_tokens": 3391676.0, + "step": 1893 + }, + { + "epoch": 0.3066958141041211, + "grad_norm": 17.873153686523438, + "learning_rate": 6.934909326424871e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.918720155954361, + "num_tokens": 3393458.0, + "step": 1894 + }, + { + "epoch": 0.3068577443122014, + "grad_norm": 20.680383682250977, + "learning_rate": 6.933290155440415e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.9086743593215942, + "num_tokens": 3395244.0, + "step": 1895 + }, + { + "epoch": 0.3070196745202818, + "grad_norm": 23.652812957763672, + "learning_rate": 6.931670984455959e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.9061359763145447, + "num_tokens": 3397033.0, + "step": 1896 + }, + { + "epoch": 0.3071816047283621, + "grad_norm": 26.407339096069336, + "learning_rate": 6.930051813471503e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.9091712236404419, + "num_tokens": 3398810.0, + "step": 1897 + }, + { + "epoch": 0.3073435349364424, + "grad_norm": 21.341291427612305, + "learning_rate": 6.928432642487047e-06, + "loss": 0.594, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 3400604.0, + "step": 1898 + }, + { + "epoch": 0.3075054651445227, + "grad_norm": 15.613903999328613, + "learning_rate": 6.926813471502592e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.9220707416534424, + "num_tokens": 3402398.0, + "step": 1899 + }, + { + "epoch": 0.307667395352603, + "grad_norm": 25.65532875061035, + "learning_rate": 6.925194300518135e-06, + "loss": 0.6464, + "mean_token_accuracy": 0.9087617993354797, + "num_tokens": 3404195.0, + "step": 1900 + }, + { + "epoch": 0.3078293255606833, + "grad_norm": 27.368011474609375, + "learning_rate": 6.92357512953368e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.8879284858703613, + "num_tokens": 3405992.0, + "step": 1901 + }, + { + "epoch": 0.3079912557687637, + "grad_norm": 23.894681930541992, + "learning_rate": 6.9219559585492234e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.9103172123432159, + "num_tokens": 3407793.0, + "step": 1902 + }, + { + "epoch": 0.308153185976844, + "grad_norm": 26.8140811920166, + "learning_rate": 6.920336787564768e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.903620183467865, + "num_tokens": 3409595.0, + "step": 1903 + }, + { + "epoch": 0.3083151161849243, + "grad_norm": 21.525659561157227, + "learning_rate": 6.9187176165803115e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.9226754009723663, + "num_tokens": 3411379.0, + "step": 1904 + }, + { + "epoch": 0.3084770463930046, + "grad_norm": 17.20766830444336, + "learning_rate": 6.917098445595856e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.9230810403823853, + "num_tokens": 3413165.0, + "step": 1905 + }, + { + "epoch": 0.3086389766010849, + "grad_norm": 24.366456985473633, + "learning_rate": 6.9154792746113995e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.9044228792190552, + "num_tokens": 3414968.0, + "step": 1906 + }, + { + "epoch": 0.3088009068091653, + "grad_norm": 31.981435775756836, + "learning_rate": 6.913860103626944e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.8914006948471069, + "num_tokens": 3416765.0, + "step": 1907 + }, + { + "epoch": 0.3089628370172456, + "grad_norm": 22.964876174926758, + "learning_rate": 6.9122409326424875e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.9024867117404938, + "num_tokens": 3418554.0, + "step": 1908 + }, + { + "epoch": 0.3091247672253259, + "grad_norm": 23.065261840820312, + "learning_rate": 6.910621761658032e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.9148924648761749, + "num_tokens": 3420336.0, + "step": 1909 + }, + { + "epoch": 0.3092866974334062, + "grad_norm": 21.18498992919922, + "learning_rate": 6.9090025906735755e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.9206026494503021, + "num_tokens": 3422125.0, + "step": 1910 + }, + { + "epoch": 0.3094486276414865, + "grad_norm": 20.91034507751465, + "learning_rate": 6.90738341968912e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.9143702983856201, + "num_tokens": 3423917.0, + "step": 1911 + }, + { + "epoch": 0.3096105578495668, + "grad_norm": 21.986989974975586, + "learning_rate": 6.9057642487046636e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.9142578542232513, + "num_tokens": 3425697.0, + "step": 1912 + }, + { + "epoch": 0.3097724880576472, + "grad_norm": 25.399215698242188, + "learning_rate": 6.904145077720208e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.9093185067176819, + "num_tokens": 3427496.0, + "step": 1913 + }, + { + "epoch": 0.3099344182657275, + "grad_norm": 19.495424270629883, + "learning_rate": 6.902525906735752e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.9114588499069214, + "num_tokens": 3429291.0, + "step": 1914 + }, + { + "epoch": 0.3100963484738078, + "grad_norm": 23.71278953552246, + "learning_rate": 6.900906735751296e-06, + "loss": 0.6681, + "mean_token_accuracy": 0.9090617895126343, + "num_tokens": 3431088.0, + "step": 1915 + }, + { + "epoch": 0.3102582786818881, + "grad_norm": 15.605432510375977, + "learning_rate": 6.89928756476684e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.9214173555374146, + "num_tokens": 3432867.0, + "step": 1916 + }, + { + "epoch": 0.3104202088899684, + "grad_norm": 19.36934471130371, + "learning_rate": 6.897668393782384e-06, + "loss": 0.613, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 3434652.0, + "step": 1917 + }, + { + "epoch": 0.3105821390980487, + "grad_norm": 24.261260986328125, + "learning_rate": 6.8960492227979285e-06, + "loss": 0.72, + "mean_token_accuracy": 0.8996146023273468, + "num_tokens": 3436443.0, + "step": 1918 + }, + { + "epoch": 0.3107440693061291, + "grad_norm": 18.746381759643555, + "learning_rate": 6.894430051813472e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.9184607565402985, + "num_tokens": 3438237.0, + "step": 1919 + }, + { + "epoch": 0.3109059995142094, + "grad_norm": 25.644567489624023, + "learning_rate": 6.8928108808290165e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.8938788175582886, + "num_tokens": 3440039.0, + "step": 1920 + }, + { + "epoch": 0.3110679297222897, + "grad_norm": 18.823265075683594, + "learning_rate": 6.89119170984456e-06, + "loss": 0.6231, + "mean_token_accuracy": 0.9175146520137787, + "num_tokens": 3441830.0, + "step": 1921 + }, + { + "epoch": 0.31122985993037, + "grad_norm": 34.07335662841797, + "learning_rate": 6.8895725388601046e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.8788793087005615, + "num_tokens": 3443631.0, + "step": 1922 + }, + { + "epoch": 0.3113917901384503, + "grad_norm": 24.817089080810547, + "learning_rate": 6.887953367875648e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.9011238813400269, + "num_tokens": 3445426.0, + "step": 1923 + }, + { + "epoch": 0.31155372034653067, + "grad_norm": 17.930213928222656, + "learning_rate": 6.886334196891193e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.924413800239563, + "num_tokens": 3447217.0, + "step": 1924 + }, + { + "epoch": 0.311715650554611, + "grad_norm": 21.13175392150879, + "learning_rate": 6.884715025906736e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.9273166060447693, + "num_tokens": 3449017.0, + "step": 1925 + }, + { + "epoch": 0.3118775807626913, + "grad_norm": 22.420001983642578, + "learning_rate": 6.883095854922281e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.9131964445114136, + "num_tokens": 3450805.0, + "step": 1926 + }, + { + "epoch": 0.3120395109707716, + "grad_norm": 18.214536666870117, + "learning_rate": 6.881476683937824e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.9271321892738342, + "num_tokens": 3452591.0, + "step": 1927 + }, + { + "epoch": 0.3122014411788519, + "grad_norm": 22.271533966064453, + "learning_rate": 6.879857512953369e-06, + "loss": 0.5691, + "mean_token_accuracy": 0.9136956036090851, + "num_tokens": 3454392.0, + "step": 1928 + }, + { + "epoch": 0.3123633713869322, + "grad_norm": 30.941083908081055, + "learning_rate": 6.878238341968912e-06, + "loss": 1.1504, + "mean_token_accuracy": 0.8910346925258636, + "num_tokens": 3456186.0, + "step": 1929 + }, + { + "epoch": 0.3125253015950126, + "grad_norm": 19.465234756469727, + "learning_rate": 6.876619170984457e-06, + "loss": 0.5998, + "mean_token_accuracy": 0.9128319621086121, + "num_tokens": 3457971.0, + "step": 1930 + }, + { + "epoch": 0.3126872318030929, + "grad_norm": 16.59850311279297, + "learning_rate": 6.875e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.919047623872757, + "num_tokens": 3459755.0, + "step": 1931 + }, + { + "epoch": 0.3128491620111732, + "grad_norm": 20.765493392944336, + "learning_rate": 6.873380829015545e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.9211459457874298, + "num_tokens": 3461546.0, + "step": 1932 + }, + { + "epoch": 0.3130110922192535, + "grad_norm": 26.060544967651367, + "learning_rate": 6.871761658031088e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.9129863977432251, + "num_tokens": 3463334.0, + "step": 1933 + }, + { + "epoch": 0.3131730224273338, + "grad_norm": 19.259361267089844, + "learning_rate": 6.870142487046633e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.9171325266361237, + "num_tokens": 3465124.0, + "step": 1934 + }, + { + "epoch": 0.3133349526354141, + "grad_norm": 32.98956298828125, + "learning_rate": 6.868523316062176e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.8826494514942169, + "num_tokens": 3466917.0, + "step": 1935 + }, + { + "epoch": 0.3134968828434945, + "grad_norm": 25.334579467773438, + "learning_rate": 6.866904145077721e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.906528502702713, + "num_tokens": 3468697.0, + "step": 1936 + }, + { + "epoch": 0.3136588130515748, + "grad_norm": 21.44000244140625, + "learning_rate": 6.865284974093265e-06, + "loss": 0.6089, + "mean_token_accuracy": 0.9114553928375244, + "num_tokens": 3470501.0, + "step": 1937 + }, + { + "epoch": 0.3138207432596551, + "grad_norm": 22.911813735961914, + "learning_rate": 6.863665803108809e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.9065613150596619, + "num_tokens": 3472302.0, + "step": 1938 + }, + { + "epoch": 0.3139826734677354, + "grad_norm": 25.368274688720703, + "learning_rate": 6.862046632124353e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.8980347812175751, + "num_tokens": 3474096.0, + "step": 1939 + }, + { + "epoch": 0.3141446036758157, + "grad_norm": 24.771181106567383, + "learning_rate": 6.860427461139897e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.9033122956752777, + "num_tokens": 3475877.0, + "step": 1940 + }, + { + "epoch": 0.31430653388389607, + "grad_norm": 19.39136505126953, + "learning_rate": 6.858808290155441e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.9175740778446198, + "num_tokens": 3477656.0, + "step": 1941 + }, + { + "epoch": 0.3144684640919764, + "grad_norm": 24.30678367614746, + "learning_rate": 6.857189119170985e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.9056722819805145, + "num_tokens": 3479444.0, + "step": 1942 + }, + { + "epoch": 0.3146303943000567, + "grad_norm": 15.824822425842285, + "learning_rate": 6.855569948186529e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.9248643219470978, + "num_tokens": 3481222.0, + "step": 1943 + }, + { + "epoch": 0.314792324508137, + "grad_norm": 29.671920776367188, + "learning_rate": 6.853950777202073e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.8927496075630188, + "num_tokens": 3483014.0, + "step": 1944 + }, + { + "epoch": 0.3149542547162173, + "grad_norm": 21.041963577270508, + "learning_rate": 6.852331606217617e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.9127962291240692, + "num_tokens": 3484801.0, + "step": 1945 + }, + { + "epoch": 0.3151161849242976, + "grad_norm": 28.425615310668945, + "learning_rate": 6.850712435233161e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.8961202502250671, + "num_tokens": 3486592.0, + "step": 1946 + }, + { + "epoch": 0.31527811513237797, + "grad_norm": 24.378219604492188, + "learning_rate": 6.849093264248705e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.8991971015930176, + "num_tokens": 3488382.0, + "step": 1947 + }, + { + "epoch": 0.3154400453404583, + "grad_norm": 30.380659103393555, + "learning_rate": 6.847474093264249e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.8788124322891235, + "num_tokens": 3490166.0, + "step": 1948 + }, + { + "epoch": 0.3156019755485386, + "grad_norm": 17.325603485107422, + "learning_rate": 6.845854922279793e-06, + "loss": 0.5398, + "mean_token_accuracy": 0.9238302707672119, + "num_tokens": 3491954.0, + "step": 1949 + }, + { + "epoch": 0.3157639057566189, + "grad_norm": 14.562023162841797, + "learning_rate": 6.844235751295337e-06, + "loss": 0.557, + "mean_token_accuracy": 0.9157631993293762, + "num_tokens": 3493739.0, + "step": 1950 + }, + { + "epoch": 0.3159258359646992, + "grad_norm": 19.599218368530273, + "learning_rate": 6.842616580310881e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.9235645532608032, + "num_tokens": 3495538.0, + "step": 1951 + }, + { + "epoch": 0.3160877661727795, + "grad_norm": 17.00807762145996, + "learning_rate": 6.840997409326425e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.9196504652500153, + "num_tokens": 3497324.0, + "step": 1952 + }, + { + "epoch": 0.31624969638085987, + "grad_norm": 21.016956329345703, + "learning_rate": 6.839378238341969e-06, + "loss": 0.6203, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 3499117.0, + "step": 1953 + }, + { + "epoch": 0.3164116265889402, + "grad_norm": 19.136619567871094, + "learning_rate": 6.837759067357513e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.9066585004329681, + "num_tokens": 3500909.0, + "step": 1954 + }, + { + "epoch": 0.3165735567970205, + "grad_norm": 18.964651107788086, + "learning_rate": 6.836139896373057e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.9048619270324707, + "num_tokens": 3502704.0, + "step": 1955 + }, + { + "epoch": 0.3167354870051008, + "grad_norm": 25.121936798095703, + "learning_rate": 6.834520725388602e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.905919760465622, + "num_tokens": 3504502.0, + "step": 1956 + }, + { + "epoch": 0.3168974172131811, + "grad_norm": 23.669727325439453, + "learning_rate": 6.8329015544041454e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.9120011925697327, + "num_tokens": 3506298.0, + "step": 1957 + }, + { + "epoch": 0.31705934742126146, + "grad_norm": 21.615951538085938, + "learning_rate": 6.83128238341969e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.9109025001525879, + "num_tokens": 3508080.0, + "step": 1958 + }, + { + "epoch": 0.31722127762934177, + "grad_norm": 21.237550735473633, + "learning_rate": 6.8296632124352335e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.9176004827022552, + "num_tokens": 3509871.0, + "step": 1959 + }, + { + "epoch": 0.3173832078374221, + "grad_norm": 24.932605743408203, + "learning_rate": 6.828044041450778e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.8964575529098511, + "num_tokens": 3511663.0, + "step": 1960 + }, + { + "epoch": 0.3175451380455024, + "grad_norm": 21.199174880981445, + "learning_rate": 6.8264248704663215e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.9043275713920593, + "num_tokens": 3513456.0, + "step": 1961 + }, + { + "epoch": 0.3177070682535827, + "grad_norm": 22.532611846923828, + "learning_rate": 6.824805699481866e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.9090404212474823, + "num_tokens": 3515243.0, + "step": 1962 + }, + { + "epoch": 0.317868998461663, + "grad_norm": 17.341398239135742, + "learning_rate": 6.8231865284974095e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.9179342985153198, + "num_tokens": 3517047.0, + "step": 1963 + }, + { + "epoch": 0.31803092866974336, + "grad_norm": 25.42489242553711, + "learning_rate": 6.821567357512954e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.8838652670383453, + "num_tokens": 3518844.0, + "step": 1964 + }, + { + "epoch": 0.31819285887782367, + "grad_norm": 17.559890747070312, + "learning_rate": 6.8199481865284975e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.9091269969940186, + "num_tokens": 3520631.0, + "step": 1965 + }, + { + "epoch": 0.318354789085904, + "grad_norm": 21.703004837036133, + "learning_rate": 6.818329015544042e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.926225483417511, + "num_tokens": 3522414.0, + "step": 1966 + }, + { + "epoch": 0.3185167192939843, + "grad_norm": 25.73619270324707, + "learning_rate": 6.8167098445595856e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.8907374143600464, + "num_tokens": 3524201.0, + "step": 1967 + }, + { + "epoch": 0.3186786495020646, + "grad_norm": 21.41761589050293, + "learning_rate": 6.81509067357513e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.9065134227275848, + "num_tokens": 3525993.0, + "step": 1968 + }, + { + "epoch": 0.3188405797101449, + "grad_norm": 16.376726150512695, + "learning_rate": 6.813471502590674e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.9169117510318756, + "num_tokens": 3527781.0, + "step": 1969 + }, + { + "epoch": 0.31900250991822526, + "grad_norm": 28.338106155395508, + "learning_rate": 6.811852331606218e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.8929156363010406, + "num_tokens": 3529573.0, + "step": 1970 + }, + { + "epoch": 0.31916444012630557, + "grad_norm": 19.332244873046875, + "learning_rate": 6.810233160621762e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.9200184643268585, + "num_tokens": 3531372.0, + "step": 1971 + }, + { + "epoch": 0.3193263703343859, + "grad_norm": 20.007776260375977, + "learning_rate": 6.808613989637306e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.910084992647171, + "num_tokens": 3533162.0, + "step": 1972 + }, + { + "epoch": 0.3194883005424662, + "grad_norm": 13.949797630310059, + "learning_rate": 6.80699481865285e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9313608109951019, + "num_tokens": 3534951.0, + "step": 1973 + }, + { + "epoch": 0.3196502307505465, + "grad_norm": 22.358320236206055, + "learning_rate": 6.805375647668394e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.9104166626930237, + "num_tokens": 3536742.0, + "step": 1974 + }, + { + "epoch": 0.31981216095862686, + "grad_norm": 28.46381950378418, + "learning_rate": 6.8037564766839385e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.8937085866928101, + "num_tokens": 3538545.0, + "step": 1975 + }, + { + "epoch": 0.31997409116670716, + "grad_norm": 19.395126342773438, + "learning_rate": 6.802137305699482e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.9068088531494141, + "num_tokens": 3540336.0, + "step": 1976 + }, + { + "epoch": 0.32013602137478747, + "grad_norm": 25.95215606689453, + "learning_rate": 6.8005181347150265e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.8960882127285004, + "num_tokens": 3542127.0, + "step": 1977 + }, + { + "epoch": 0.3202979515828678, + "grad_norm": 18.66096305847168, + "learning_rate": 6.79889896373057e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.9215896725654602, + "num_tokens": 3543919.0, + "step": 1978 + }, + { + "epoch": 0.3204598817909481, + "grad_norm": 25.1312313079834, + "learning_rate": 6.7972797927461146e-06, + "loss": 0.719, + "mean_token_accuracy": 0.906470000743866, + "num_tokens": 3545709.0, + "step": 1979 + }, + { + "epoch": 0.3206218119990284, + "grad_norm": 18.188488006591797, + "learning_rate": 6.795660621761658e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.9198676943778992, + "num_tokens": 3547495.0, + "step": 1980 + }, + { + "epoch": 0.32078374220710876, + "grad_norm": 18.340505599975586, + "learning_rate": 6.794041450777203e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.9162003695964813, + "num_tokens": 3549281.0, + "step": 1981 + }, + { + "epoch": 0.32094567241518906, + "grad_norm": 25.959964752197266, + "learning_rate": 6.792422279792746e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.9021967649459839, + "num_tokens": 3551069.0, + "step": 1982 + }, + { + "epoch": 0.3211076026232694, + "grad_norm": 26.811525344848633, + "learning_rate": 6.790803108808291e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.8895547986030579, + "num_tokens": 3552863.0, + "step": 1983 + }, + { + "epoch": 0.3212695328313497, + "grad_norm": 26.007314682006836, + "learning_rate": 6.789183937823834e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.9056684076786041, + "num_tokens": 3554649.0, + "step": 1984 + }, + { + "epoch": 0.32143146303943, + "grad_norm": 31.120197296142578, + "learning_rate": 6.787564766839379e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.8775861859321594, + "num_tokens": 3556446.0, + "step": 1985 + }, + { + "epoch": 0.32159339324751035, + "grad_norm": 18.893774032592773, + "learning_rate": 6.785945595854922e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.9166885316371918, + "num_tokens": 3558234.0, + "step": 1986 + }, + { + "epoch": 0.32175532345559066, + "grad_norm": 33.02800750732422, + "learning_rate": 6.784326424870467e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.886833906173706, + "num_tokens": 3560037.0, + "step": 1987 + }, + { + "epoch": 0.32191725366367097, + "grad_norm": 21.673070907592773, + "learning_rate": 6.78270725388601e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.9175745248794556, + "num_tokens": 3561828.0, + "step": 1988 + }, + { + "epoch": 0.3220791838717513, + "grad_norm": 21.844501495361328, + "learning_rate": 6.781088082901555e-06, + "loss": 0.6004, + "mean_token_accuracy": 0.9182330667972565, + "num_tokens": 3563632.0, + "step": 1989 + }, + { + "epoch": 0.3222411140798316, + "grad_norm": 13.191789627075195, + "learning_rate": 6.779468911917098e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.9226222932338715, + "num_tokens": 3565415.0, + "step": 1990 + }, + { + "epoch": 0.3224030442879119, + "grad_norm": 28.812782287597656, + "learning_rate": 6.777849740932643e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.8956834375858307, + "num_tokens": 3567205.0, + "step": 1991 + }, + { + "epoch": 0.32256497449599225, + "grad_norm": 9.942994117736816, + "learning_rate": 6.776230569948186e-06, + "loss": 0.448, + "mean_token_accuracy": 0.9384453892707825, + "num_tokens": 3568993.0, + "step": 1992 + }, + { + "epoch": 0.32272690470407256, + "grad_norm": 21.88311195373535, + "learning_rate": 6.774611398963731e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.9302331507205963, + "num_tokens": 3570792.0, + "step": 1993 + }, + { + "epoch": 0.32288883491215287, + "grad_norm": 24.2215576171875, + "learning_rate": 6.772992227979275e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.9200254082679749, + "num_tokens": 3572579.0, + "step": 1994 + }, + { + "epoch": 0.3230507651202332, + "grad_norm": 20.88166046142578, + "learning_rate": 6.771373056994819e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.9164022207260132, + "num_tokens": 3574366.0, + "step": 1995 + }, + { + "epoch": 0.3232126953283135, + "grad_norm": 22.844417572021484, + "learning_rate": 6.769753886010363e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.9099134206771851, + "num_tokens": 3576155.0, + "step": 1996 + }, + { + "epoch": 0.3233746255363938, + "grad_norm": 21.03485870361328, + "learning_rate": 6.768134715025907e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.9197080433368683, + "num_tokens": 3577941.0, + "step": 1997 + }, + { + "epoch": 0.32353655574447415, + "grad_norm": 27.215187072753906, + "learning_rate": 6.766515544041451e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.9036587774753571, + "num_tokens": 3579731.0, + "step": 1998 + }, + { + "epoch": 0.32369848595255446, + "grad_norm": 22.64470672607422, + "learning_rate": 6.764896373056995e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.9181357622146606, + "num_tokens": 3581524.0, + "step": 1999 + }, + { + "epoch": 0.32386041616063477, + "grad_norm": 22.80707550048828, + "learning_rate": 6.763277202072539e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.9168103933334351, + "num_tokens": 3583325.0, + "step": 2000 + }, + { + "epoch": 0.3240223463687151, + "grad_norm": 26.383079528808594, + "learning_rate": 6.761658031088083e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.9136288166046143, + "num_tokens": 3585115.0, + "step": 2001 + }, + { + "epoch": 0.3241842765767954, + "grad_norm": 25.72802734375, + "learning_rate": 6.760038860103627e-06, + "loss": 0.626, + "mean_token_accuracy": 0.9035947918891907, + "num_tokens": 3586907.0, + "step": 2002 + }, + { + "epoch": 0.32434620678487575, + "grad_norm": 24.603004455566406, + "learning_rate": 6.758419689119171e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.9002188444137573, + "num_tokens": 3588709.0, + "step": 2003 + }, + { + "epoch": 0.32450813699295605, + "grad_norm": 20.02543830871582, + "learning_rate": 6.756800518134715e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.9172143638134003, + "num_tokens": 3590497.0, + "step": 2004 + }, + { + "epoch": 0.32467006720103636, + "grad_norm": 23.4384822845459, + "learning_rate": 6.755181347150259e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.9175007045269012, + "num_tokens": 3592275.0, + "step": 2005 + }, + { + "epoch": 0.32483199740911667, + "grad_norm": 28.369173049926758, + "learning_rate": 6.753562176165803e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.902777761220932, + "num_tokens": 3594075.0, + "step": 2006 + }, + { + "epoch": 0.324993927617197, + "grad_norm": 20.84299087524414, + "learning_rate": 6.751943005181347e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.9120141863822937, + "num_tokens": 3595872.0, + "step": 2007 + }, + { + "epoch": 0.3251558578252773, + "grad_norm": 27.899316787719727, + "learning_rate": 6.750323834196891e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.9035714268684387, + "num_tokens": 3597664.0, + "step": 2008 + }, + { + "epoch": 0.32531778803335765, + "grad_norm": 28.193729400634766, + "learning_rate": 6.748704663212435e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.9147864580154419, + "num_tokens": 3599458.0, + "step": 2009 + }, + { + "epoch": 0.32547971824143795, + "grad_norm": 21.17216682434082, + "learning_rate": 6.747085492227979e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.9143323600292206, + "num_tokens": 3601250.0, + "step": 2010 + }, + { + "epoch": 0.32564164844951826, + "grad_norm": 20.555675506591797, + "learning_rate": 6.745466321243524e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.9260968863964081, + "num_tokens": 3603046.0, + "step": 2011 + }, + { + "epoch": 0.32580357865759857, + "grad_norm": 25.259862899780273, + "learning_rate": 6.743847150259067e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.9092986583709717, + "num_tokens": 3604834.0, + "step": 2012 + }, + { + "epoch": 0.3259655088656789, + "grad_norm": 24.507728576660156, + "learning_rate": 6.742227979274612e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.8953706622123718, + "num_tokens": 3606623.0, + "step": 2013 + }, + { + "epoch": 0.3261274390737592, + "grad_norm": 24.899192810058594, + "learning_rate": 6.7406088082901554e-06, + "loss": 0.841, + "mean_token_accuracy": 0.9067947864532471, + "num_tokens": 3608403.0, + "step": 2014 + }, + { + "epoch": 0.32628936928183955, + "grad_norm": 15.977191925048828, + "learning_rate": 6.7389896373057e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.9286187887191772, + "num_tokens": 3610195.0, + "step": 2015 + }, + { + "epoch": 0.32645129948991986, + "grad_norm": 17.4147891998291, + "learning_rate": 6.7373704663212435e-06, + "loss": 0.597, + "mean_token_accuracy": 0.9253731369972229, + "num_tokens": 3611975.0, + "step": 2016 + }, + { + "epoch": 0.32661322969800016, + "grad_norm": 24.433765411376953, + "learning_rate": 6.735751295336788e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.8985491693019867, + "num_tokens": 3613762.0, + "step": 2017 + }, + { + "epoch": 0.32677515990608047, + "grad_norm": 25.415340423583984, + "learning_rate": 6.7341321243523315e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.8961718380451202, + "num_tokens": 3615549.0, + "step": 2018 + }, + { + "epoch": 0.3269370901141608, + "grad_norm": 28.617385864257812, + "learning_rate": 6.732512953367876e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.8899040818214417, + "num_tokens": 3617350.0, + "step": 2019 + }, + { + "epoch": 0.32709902032224114, + "grad_norm": 24.829530715942383, + "learning_rate": 6.7308937823834195e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.8891937732696533, + "num_tokens": 3619132.0, + "step": 2020 + }, + { + "epoch": 0.32726095053032145, + "grad_norm": 26.703296661376953, + "learning_rate": 6.729274611398964e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.905139833688736, + "num_tokens": 3620926.0, + "step": 2021 + }, + { + "epoch": 0.32742288073840176, + "grad_norm": 31.412715911865234, + "learning_rate": 6.7276554404145076e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.8864319622516632, + "num_tokens": 3622728.0, + "step": 2022 + }, + { + "epoch": 0.32758481094648206, + "grad_norm": 16.676950454711914, + "learning_rate": 6.726036269430052e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.9283071458339691, + "num_tokens": 3624519.0, + "step": 2023 + }, + { + "epoch": 0.32774674115456237, + "grad_norm": 26.585298538208008, + "learning_rate": 6.724417098445596e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.8922753632068634, + "num_tokens": 3626319.0, + "step": 2024 + }, + { + "epoch": 0.3279086713626427, + "grad_norm": 16.874208450317383, + "learning_rate": 6.72279792746114e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.9233440160751343, + "num_tokens": 3628118.0, + "step": 2025 + }, + { + "epoch": 0.32807060157072304, + "grad_norm": 23.043498992919922, + "learning_rate": 6.721178756476684e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.9078603386878967, + "num_tokens": 3629901.0, + "step": 2026 + }, + { + "epoch": 0.32823253177880335, + "grad_norm": 25.120412826538086, + "learning_rate": 6.719559585492228e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.896541953086853, + "num_tokens": 3631704.0, + "step": 2027 + }, + { + "epoch": 0.32839446198688366, + "grad_norm": 20.261638641357422, + "learning_rate": 6.717940414507773e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.9192405343055725, + "num_tokens": 3633488.0, + "step": 2028 + }, + { + "epoch": 0.32855639219496396, + "grad_norm": 27.570838928222656, + "learning_rate": 6.716321243523317e-06, + "loss": 0.76, + "mean_token_accuracy": 0.8926311731338501, + "num_tokens": 3635280.0, + "step": 2029 + }, + { + "epoch": 0.32871832240304427, + "grad_norm": 17.54373550415039, + "learning_rate": 6.714702072538861e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.9218370020389557, + "num_tokens": 3637061.0, + "step": 2030 + }, + { + "epoch": 0.3288802526111246, + "grad_norm": 22.089427947998047, + "learning_rate": 6.713082901554405e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.9107623100280762, + "num_tokens": 3638843.0, + "step": 2031 + }, + { + "epoch": 0.32904218281920494, + "grad_norm": 27.783069610595703, + "learning_rate": 6.711463730569949e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.8858237564563751, + "num_tokens": 3640644.0, + "step": 2032 + }, + { + "epoch": 0.32920411302728525, + "grad_norm": 24.617048263549805, + "learning_rate": 6.709844559585493e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.8977023065090179, + "num_tokens": 3642439.0, + "step": 2033 + }, + { + "epoch": 0.32936604323536556, + "grad_norm": 16.71503257751465, + "learning_rate": 6.708225388601037e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.9290726780891418, + "num_tokens": 3644219.0, + "step": 2034 + }, + { + "epoch": 0.32952797344344587, + "grad_norm": 23.588836669921875, + "learning_rate": 6.706606217616581e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.9159002006053925, + "num_tokens": 3646017.0, + "step": 2035 + }, + { + "epoch": 0.3296899036515262, + "grad_norm": 18.560108184814453, + "learning_rate": 6.7049870466321254e-06, + "loss": 0.613, + "mean_token_accuracy": 0.9233440160751343, + "num_tokens": 3647816.0, + "step": 2036 + }, + { + "epoch": 0.32985183385960654, + "grad_norm": 26.22181510925293, + "learning_rate": 6.70336787564767e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.8948275744915009, + "num_tokens": 3649613.0, + "step": 2037 + }, + { + "epoch": 0.33001376406768684, + "grad_norm": 18.475557327270508, + "learning_rate": 6.7017487046632135e-06, + "loss": 0.554, + "mean_token_accuracy": 0.9198883175849915, + "num_tokens": 3651412.0, + "step": 2038 + }, + { + "epoch": 0.33017569427576715, + "grad_norm": 18.607297897338867, + "learning_rate": 6.700129533678758e-06, + "loss": 0.5602, + "mean_token_accuracy": 0.9221243560314178, + "num_tokens": 3653206.0, + "step": 2039 + }, + { + "epoch": 0.33033762448384746, + "grad_norm": 20.52931022644043, + "learning_rate": 6.6985103626943015e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.9153079688549042, + "num_tokens": 3655000.0, + "step": 2040 + }, + { + "epoch": 0.33049955469192777, + "grad_norm": 25.807109832763672, + "learning_rate": 6.696891191709846e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.893869936466217, + "num_tokens": 3656786.0, + "step": 2041 + }, + { + "epoch": 0.3306614849000081, + "grad_norm": 22.523910522460938, + "learning_rate": 6.6952720207253895e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.9076103568077087, + "num_tokens": 3658579.0, + "step": 2042 + }, + { + "epoch": 0.33082341510808844, + "grad_norm": 31.554567337036133, + "learning_rate": 6.693652849740934e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.9000000059604645, + "num_tokens": 3660371.0, + "step": 2043 + }, + { + "epoch": 0.33098534531616874, + "grad_norm": 19.515472412109375, + "learning_rate": 6.6920336787564775e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.9102682769298553, + "num_tokens": 3662162.0, + "step": 2044 + }, + { + "epoch": 0.33114727552424905, + "grad_norm": 24.487789154052734, + "learning_rate": 6.690414507772022e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.8914916217327118, + "num_tokens": 3663950.0, + "step": 2045 + }, + { + "epoch": 0.33130920573232936, + "grad_norm": 18.697772979736328, + "learning_rate": 6.6887953367875656e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.925220400094986, + "num_tokens": 3665743.0, + "step": 2046 + }, + { + "epoch": 0.33147113594040967, + "grad_norm": 13.087461471557617, + "learning_rate": 6.68717616580311e-06, + "loss": 0.5326, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 3667529.0, + "step": 2047 + }, + { + "epoch": 0.33163306614849, + "grad_norm": 24.593027114868164, + "learning_rate": 6.685556994818654e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.911445826292038, + "num_tokens": 3669324.0, + "step": 2048 + }, + { + "epoch": 0.33179499635657034, + "grad_norm": 20.102643966674805, + "learning_rate": 6.683937823834198e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 3671115.0, + "step": 2049 + }, + { + "epoch": 0.33195692656465065, + "grad_norm": 25.548480987548828, + "learning_rate": 6.682318652849742e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.89896559715271, + "num_tokens": 3672914.0, + "step": 2050 + }, + { + "epoch": 0.33211885677273095, + "grad_norm": 26.462007522583008, + "learning_rate": 6.680699481865286e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.8949579894542694, + "num_tokens": 3674702.0, + "step": 2051 + }, + { + "epoch": 0.33228078698081126, + "grad_norm": 18.874984741210938, + "learning_rate": 6.67908031088083e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.91366907954216, + "num_tokens": 3676492.0, + "step": 2052 + }, + { + "epoch": 0.33244271718889157, + "grad_norm": 26.781715393066406, + "learning_rate": 6.677461139896374e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.8842191100120544, + "num_tokens": 3678297.0, + "step": 2053 + }, + { + "epoch": 0.33260464739697193, + "grad_norm": 21.541614532470703, + "learning_rate": 6.675841968911918e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.9157900214195251, + "num_tokens": 3680082.0, + "step": 2054 + }, + { + "epoch": 0.33276657760505224, + "grad_norm": 15.959024429321289, + "learning_rate": 6.674222797927462e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.9268921315670013, + "num_tokens": 3681867.0, + "step": 2055 + }, + { + "epoch": 0.33292850781313255, + "grad_norm": 18.403867721557617, + "learning_rate": 6.6726036269430065e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.9185185432434082, + "num_tokens": 3683649.0, + "step": 2056 + }, + { + "epoch": 0.33309043802121285, + "grad_norm": 17.219331741333008, + "learning_rate": 6.67098445595855e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.9277708232402802, + "num_tokens": 3685438.0, + "step": 2057 + }, + { + "epoch": 0.33325236822929316, + "grad_norm": 25.002901077270508, + "learning_rate": 6.6693652849740946e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.9075932800769806, + "num_tokens": 3687231.0, + "step": 2058 + }, + { + "epoch": 0.33341429843737347, + "grad_norm": 17.4008846282959, + "learning_rate": 6.667746113989638e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.9212316572666168, + "num_tokens": 3689023.0, + "step": 2059 + }, + { + "epoch": 0.33357622864545383, + "grad_norm": 20.410114288330078, + "learning_rate": 6.666126943005183e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.9204900860786438, + "num_tokens": 3690812.0, + "step": 2060 + }, + { + "epoch": 0.33373815885353414, + "grad_norm": 22.921175003051758, + "learning_rate": 6.664507772020726e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.9034899771213531, + "num_tokens": 3692604.0, + "step": 2061 + }, + { + "epoch": 0.33390008906161445, + "grad_norm": 19.56877326965332, + "learning_rate": 6.662888601036271e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.9231182336807251, + "num_tokens": 3694389.0, + "step": 2062 + }, + { + "epoch": 0.33406201926969475, + "grad_norm": 25.663497924804688, + "learning_rate": 6.661269430051814e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.9108623564243317, + "num_tokens": 3696179.0, + "step": 2063 + }, + { + "epoch": 0.33422394947777506, + "grad_norm": 21.264484405517578, + "learning_rate": 6.659650259067359e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.9215896725654602, + "num_tokens": 3697971.0, + "step": 2064 + }, + { + "epoch": 0.33438587968585537, + "grad_norm": 21.311599731445312, + "learning_rate": 6.658031088082902e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.9121031761169434, + "num_tokens": 3699767.0, + "step": 2065 + }, + { + "epoch": 0.33454780989393573, + "grad_norm": 28.692983627319336, + "learning_rate": 6.656411917098447e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.9006994962692261, + "num_tokens": 3701570.0, + "step": 2066 + }, + { + "epoch": 0.33470974010201604, + "grad_norm": 23.254192352294922, + "learning_rate": 6.65479274611399e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.8967473804950714, + "num_tokens": 3703363.0, + "step": 2067 + }, + { + "epoch": 0.33487167031009635, + "grad_norm": 26.036184310913086, + "learning_rate": 6.653173575129535e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.9035947918891907, + "num_tokens": 3705155.0, + "step": 2068 + }, + { + "epoch": 0.33503360051817666, + "grad_norm": 22.794485092163086, + "learning_rate": 6.651554404145078e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.9043233096599579, + "num_tokens": 3706940.0, + "step": 2069 + }, + { + "epoch": 0.33519553072625696, + "grad_norm": 21.834421157836914, + "learning_rate": 6.649935233160623e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.9057921469211578, + "num_tokens": 3708728.0, + "step": 2070 + }, + { + "epoch": 0.3353574609343373, + "grad_norm": 25.597959518432617, + "learning_rate": 6.648316062176166e-06, + "loss": 0.713, + "mean_token_accuracy": 0.9058879911899567, + "num_tokens": 3710514.0, + "step": 2071 + }, + { + "epoch": 0.33551939114241763, + "grad_norm": 15.884485244750977, + "learning_rate": 6.646696891191711e-06, + "loss": 0.5761, + "mean_token_accuracy": 0.9224588871002197, + "num_tokens": 3712297.0, + "step": 2072 + }, + { + "epoch": 0.33568132135049794, + "grad_norm": 21.887197494506836, + "learning_rate": 6.645077720207254e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.8964285850524902, + "num_tokens": 3714089.0, + "step": 2073 + }, + { + "epoch": 0.33584325155857825, + "grad_norm": 16.8392276763916, + "learning_rate": 6.643458549222799e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.9265734255313873, + "num_tokens": 3715887.0, + "step": 2074 + }, + { + "epoch": 0.33600518176665856, + "grad_norm": 18.458192825317383, + "learning_rate": 6.641839378238343e-06, + "loss": 0.656, + "mean_token_accuracy": 0.9099322259426117, + "num_tokens": 3717676.0, + "step": 2075 + }, + { + "epoch": 0.33616711197473886, + "grad_norm": 21.561832427978516, + "learning_rate": 6.640220207253887e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.9246582388877869, + "num_tokens": 3719466.0, + "step": 2076 + }, + { + "epoch": 0.3363290421828192, + "grad_norm": 16.147790908813477, + "learning_rate": 6.638601036269431e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.9244604408740997, + "num_tokens": 3721256.0, + "step": 2077 + }, + { + "epoch": 0.33649097239089953, + "grad_norm": 27.431909561157227, + "learning_rate": 6.636981865284975e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.9035714268684387, + "num_tokens": 3723048.0, + "step": 2078 + }, + { + "epoch": 0.33665290259897984, + "grad_norm": 23.539583206176758, + "learning_rate": 6.635362694300519e-06, + "loss": 0.636, + "mean_token_accuracy": 0.9092437028884888, + "num_tokens": 3724836.0, + "step": 2079 + }, + { + "epoch": 0.33681483280706015, + "grad_norm": 30.017309188842773, + "learning_rate": 6.633743523316063e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.8976097106933594, + "num_tokens": 3726641.0, + "step": 2080 + }, + { + "epoch": 0.33697676301514046, + "grad_norm": 15.944644927978516, + "learning_rate": 6.632124352331607e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.9175926148891449, + "num_tokens": 3728420.0, + "step": 2081 + }, + { + "epoch": 0.33713869322322076, + "grad_norm": 16.257055282592773, + "learning_rate": 6.630505181347151e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.9328828752040863, + "num_tokens": 3730215.0, + "step": 2082 + }, + { + "epoch": 0.3373006234313011, + "grad_norm": 23.728219985961914, + "learning_rate": 6.628886010362695e-06, + "loss": 0.598, + "mean_token_accuracy": 0.909731537103653, + "num_tokens": 3732016.0, + "step": 2083 + }, + { + "epoch": 0.33746255363938144, + "grad_norm": 24.099153518676758, + "learning_rate": 6.627266839378239e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.9021464586257935, + "num_tokens": 3733804.0, + "step": 2084 + }, + { + "epoch": 0.33762448384746174, + "grad_norm": 24.35784912109375, + "learning_rate": 6.625647668393783e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.9089610874652863, + "num_tokens": 3735591.0, + "step": 2085 + }, + { + "epoch": 0.33778641405554205, + "grad_norm": 25.602277755737305, + "learning_rate": 6.624028497409327e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.9007353186607361, + "num_tokens": 3737375.0, + "step": 2086 + }, + { + "epoch": 0.33794834426362236, + "grad_norm": 26.737590789794922, + "learning_rate": 6.622409326424871e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.900648832321167, + "num_tokens": 3739168.0, + "step": 2087 + }, + { + "epoch": 0.3381102744717027, + "grad_norm": 15.450510025024414, + "learning_rate": 6.620790155440415e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.9195210933685303, + "num_tokens": 3740953.0, + "step": 2088 + }, + { + "epoch": 0.33827220467978303, + "grad_norm": 25.912057876586914, + "learning_rate": 6.619170984455959e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.9024069905281067, + "num_tokens": 3742741.0, + "step": 2089 + }, + { + "epoch": 0.33843413488786334, + "grad_norm": 19.65656852722168, + "learning_rate": 6.617551813471503e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.9061065912246704, + "num_tokens": 3744530.0, + "step": 2090 + }, + { + "epoch": 0.33859606509594364, + "grad_norm": 25.34670066833496, + "learning_rate": 6.615932642487047e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.8944444358348846, + "num_tokens": 3746317.0, + "step": 2091 + }, + { + "epoch": 0.33875799530402395, + "grad_norm": 18.848058700561523, + "learning_rate": 6.614313471502591e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.9214125573635101, + "num_tokens": 3748109.0, + "step": 2092 + }, + { + "epoch": 0.33891992551210426, + "grad_norm": 26.715635299682617, + "learning_rate": 6.6126943005181354e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.9032630920410156, + "num_tokens": 3749900.0, + "step": 2093 + }, + { + "epoch": 0.3390818557201846, + "grad_norm": 21.10277557373047, + "learning_rate": 6.61107512953368e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.9136128425598145, + "num_tokens": 3751690.0, + "step": 2094 + }, + { + "epoch": 0.33924378592826493, + "grad_norm": 20.28023338317871, + "learning_rate": 6.6094559585492235e-06, + "loss": 0.585, + "mean_token_accuracy": 0.9159272611141205, + "num_tokens": 3753487.0, + "step": 2095 + }, + { + "epoch": 0.33940571613634524, + "grad_norm": 24.7639102935791, + "learning_rate": 6.607836787564768e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.901890754699707, + "num_tokens": 3755275.0, + "step": 2096 + }, + { + "epoch": 0.33956764634442554, + "grad_norm": 27.058752059936523, + "learning_rate": 6.6062176165803115e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.896896243095398, + "num_tokens": 3757078.0, + "step": 2097 + }, + { + "epoch": 0.33972957655250585, + "grad_norm": 19.066463470458984, + "learning_rate": 6.604598445595856e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.9209109842777252, + "num_tokens": 3758868.0, + "step": 2098 + }, + { + "epoch": 0.3398915067605862, + "grad_norm": 25.428539276123047, + "learning_rate": 6.6029792746113995e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.9128378331661224, + "num_tokens": 3760668.0, + "step": 2099 + }, + { + "epoch": 0.3400534369686665, + "grad_norm": 20.141963958740234, + "learning_rate": 6.601360103626944e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.9193500280380249, + "num_tokens": 3762464.0, + "step": 2100 + }, + { + "epoch": 0.34021536717674683, + "grad_norm": 24.149559020996094, + "learning_rate": 6.5997409326424875e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.9061971306800842, + "num_tokens": 3764264.0, + "step": 2101 + }, + { + "epoch": 0.34037729738482714, + "grad_norm": 20.161081314086914, + "learning_rate": 6.598121761658032e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.9059343636035919, + "num_tokens": 3766052.0, + "step": 2102 + }, + { + "epoch": 0.34053922759290745, + "grad_norm": 25.36886215209961, + "learning_rate": 6.5965025906735756e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.9120689630508423, + "num_tokens": 3767849.0, + "step": 2103 + }, + { + "epoch": 0.34070115780098775, + "grad_norm": 25.93685531616211, + "learning_rate": 6.59488341968912e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.9087276458740234, + "num_tokens": 3769635.0, + "step": 2104 + }, + { + "epoch": 0.3408630880090681, + "grad_norm": 32.673309326171875, + "learning_rate": 6.593264248704664e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.8947907388210297, + "num_tokens": 3771423.0, + "step": 2105 + }, + { + "epoch": 0.3410250182171484, + "grad_norm": 24.944774627685547, + "learning_rate": 6.591645077720208e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.8978102207183838, + "num_tokens": 3773209.0, + "step": 2106 + }, + { + "epoch": 0.34118694842522873, + "grad_norm": 24.814537048339844, + "learning_rate": 6.590025906735752e-06, + "loss": 0.6365, + "mean_token_accuracy": 0.9089886546134949, + "num_tokens": 3775007.0, + "step": 2107 + }, + { + "epoch": 0.34134887863330904, + "grad_norm": 16.94260597229004, + "learning_rate": 6.588406735751296e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.9143631160259247, + "num_tokens": 3776788.0, + "step": 2108 + }, + { + "epoch": 0.34151080884138935, + "grad_norm": 18.645910263061523, + "learning_rate": 6.58678756476684e-06, + "loss": 0.6291, + "mean_token_accuracy": 0.9120418429374695, + "num_tokens": 3778584.0, + "step": 2109 + }, + { + "epoch": 0.34167273904946965, + "grad_norm": 22.338497161865234, + "learning_rate": 6.585168393782384e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.9230088293552399, + "num_tokens": 3780369.0, + "step": 2110 + }, + { + "epoch": 0.34183466925755, + "grad_norm": 27.6822509765625, + "learning_rate": 6.583549222797928e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.9130434989929199, + "num_tokens": 3782157.0, + "step": 2111 + }, + { + "epoch": 0.3419965994656303, + "grad_norm": 17.167701721191406, + "learning_rate": 6.581930051813472e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.9244965612888336, + "num_tokens": 3783947.0, + "step": 2112 + }, + { + "epoch": 0.34215852967371063, + "grad_norm": 18.025650024414062, + "learning_rate": 6.5803108808290166e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.9195785224437714, + "num_tokens": 3785732.0, + "step": 2113 + }, + { + "epoch": 0.34232045988179094, + "grad_norm": 24.95975112915039, + "learning_rate": 6.57869170984456e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.9143518507480621, + "num_tokens": 3787523.0, + "step": 2114 + }, + { + "epoch": 0.34248239008987125, + "grad_norm": 19.770832061767578, + "learning_rate": 6.577072538860105e-06, + "loss": 0.553, + "mean_token_accuracy": 0.9280538260936737, + "num_tokens": 3789313.0, + "step": 2115 + }, + { + "epoch": 0.3426443202979516, + "grad_norm": 31.38514518737793, + "learning_rate": 6.575453367875648e-06, + "loss": 0.78, + "mean_token_accuracy": 0.8878780007362366, + "num_tokens": 3791102.0, + "step": 2116 + }, + { + "epoch": 0.3428062505060319, + "grad_norm": 29.965070724487305, + "learning_rate": 6.573834196891193e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.8893055617809296, + "num_tokens": 3792894.0, + "step": 2117 + }, + { + "epoch": 0.3429681807141122, + "grad_norm": 24.24552345275879, + "learning_rate": 6.572215025906736e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.8999391794204712, + "num_tokens": 3794675.0, + "step": 2118 + }, + { + "epoch": 0.34313011092219253, + "grad_norm": 18.665618896484375, + "learning_rate": 6.570595854922281e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.9210539758205414, + "num_tokens": 3796465.0, + "step": 2119 + }, + { + "epoch": 0.34329204113027284, + "grad_norm": 24.936668395996094, + "learning_rate": 6.568976683937824e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.9131924510002136, + "num_tokens": 3798266.0, + "step": 2120 + }, + { + "epoch": 0.34345397133835315, + "grad_norm": 27.47084617614746, + "learning_rate": 6.567357512953369e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.8929530084133148, + "num_tokens": 3800067.0, + "step": 2121 + }, + { + "epoch": 0.3436159015464335, + "grad_norm": 24.947195053100586, + "learning_rate": 6.565738341968912e-06, + "loss": 0.5777, + "mean_token_accuracy": 0.9059751033782959, + "num_tokens": 3801864.0, + "step": 2122 + }, + { + "epoch": 0.3437778317545138, + "grad_norm": 21.40584373474121, + "learning_rate": 6.564119170984457e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.9168370068073273, + "num_tokens": 3803663.0, + "step": 2123 + }, + { + "epoch": 0.3439397619625941, + "grad_norm": 20.94043731689453, + "learning_rate": 6.5625e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.9150566458702087, + "num_tokens": 3805434.0, + "step": 2124 + }, + { + "epoch": 0.34410169217067443, + "grad_norm": 26.31580352783203, + "learning_rate": 6.560880829015545e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.90427565574646, + "num_tokens": 3807228.0, + "step": 2125 + }, + { + "epoch": 0.34426362237875474, + "grad_norm": 21.0064754486084, + "learning_rate": 6.559261658031088e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.913382351398468, + "num_tokens": 3809017.0, + "step": 2126 + }, + { + "epoch": 0.34442555258683505, + "grad_norm": 30.202377319335938, + "learning_rate": 6.557642487046633e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.9010576605796814, + "num_tokens": 3810822.0, + "step": 2127 + }, + { + "epoch": 0.3445874827949154, + "grad_norm": 23.299419403076172, + "learning_rate": 6.556023316062176e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.9135643839836121, + "num_tokens": 3812613.0, + "step": 2128 + }, + { + "epoch": 0.3447494130029957, + "grad_norm": 21.97918701171875, + "learning_rate": 6.554404145077721e-06, + "loss": 0.5576, + "mean_token_accuracy": 0.919034868478775, + "num_tokens": 3814409.0, + "step": 2129 + }, + { + "epoch": 0.344911343211076, + "grad_norm": 25.32720947265625, + "learning_rate": 6.552784974093264e-06, + "loss": 0.743, + "mean_token_accuracy": 0.9148935973644257, + "num_tokens": 3816203.0, + "step": 2130 + }, + { + "epoch": 0.34507327341915633, + "grad_norm": 26.97684669494629, + "learning_rate": 6.551165803108809e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.9024765491485596, + "num_tokens": 3817992.0, + "step": 2131 + }, + { + "epoch": 0.34523520362723664, + "grad_norm": 26.395828247070312, + "learning_rate": 6.549546632124353e-06, + "loss": 0.6384, + "mean_token_accuracy": 0.9059554934501648, + "num_tokens": 3819791.0, + "step": 2132 + }, + { + "epoch": 0.345397133835317, + "grad_norm": 26.689434051513672, + "learning_rate": 6.547927461139897e-06, + "loss": 0.6337, + "mean_token_accuracy": 0.9123508334159851, + "num_tokens": 3821577.0, + "step": 2133 + }, + { + "epoch": 0.3455590640433973, + "grad_norm": 16.77696990966797, + "learning_rate": 6.546308290155441e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.9193372428417206, + "num_tokens": 3823360.0, + "step": 2134 + }, + { + "epoch": 0.3457209942514776, + "grad_norm": 21.114429473876953, + "learning_rate": 6.544689119170985e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.9181883335113525, + "num_tokens": 3825152.0, + "step": 2135 + }, + { + "epoch": 0.3458829244595579, + "grad_norm": 24.115955352783203, + "learning_rate": 6.543069948186529e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9202331602573395, + "num_tokens": 3826940.0, + "step": 2136 + }, + { + "epoch": 0.34604485466763824, + "grad_norm": 26.51895523071289, + "learning_rate": 6.541450777202073e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.8995098173618317, + "num_tokens": 3828732.0, + "step": 2137 + }, + { + "epoch": 0.34620678487571854, + "grad_norm": 29.941570281982422, + "learning_rate": 6.539831606217617e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.8849684298038483, + "num_tokens": 3830531.0, + "step": 2138 + }, + { + "epoch": 0.3463687150837989, + "grad_norm": 17.610179901123047, + "learning_rate": 6.538212435233161e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.919669508934021, + "num_tokens": 3832317.0, + "step": 2139 + }, + { + "epoch": 0.3465306452918792, + "grad_norm": 21.39354705810547, + "learning_rate": 6.536593264248705e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.9195118546485901, + "num_tokens": 3834125.0, + "step": 2140 + }, + { + "epoch": 0.3466925754999595, + "grad_norm": 18.09645652770996, + "learning_rate": 6.534974093264249e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.911347508430481, + "num_tokens": 3835919.0, + "step": 2141 + }, + { + "epoch": 0.34685450570803983, + "grad_norm": 21.196632385253906, + "learning_rate": 6.533354922279793e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.917339563369751, + "num_tokens": 3837709.0, + "step": 2142 + }, + { + "epoch": 0.34701643591612014, + "grad_norm": 30.876632690429688, + "learning_rate": 6.531735751295337e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.8922254145145416, + "num_tokens": 3839500.0, + "step": 2143 + }, + { + "epoch": 0.34717836612420044, + "grad_norm": 22.115489959716797, + "learning_rate": 6.530116580310881e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.9021967649459839, + "num_tokens": 3841288.0, + "step": 2144 + }, + { + "epoch": 0.3473402963322808, + "grad_norm": 22.1717586517334, + "learning_rate": 6.528497409326425e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.9206287264823914, + "num_tokens": 3843077.0, + "step": 2145 + }, + { + "epoch": 0.3475022265403611, + "grad_norm": 15.205970764160156, + "learning_rate": 6.526878238341969e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.9209504723548889, + "num_tokens": 3844867.0, + "step": 2146 + }, + { + "epoch": 0.3476641567484414, + "grad_norm": 14.341293334960938, + "learning_rate": 6.525259067357513e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.930633544921875, + "num_tokens": 3846653.0, + "step": 2147 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 32.03675079345703, + "learning_rate": 6.5236398963730574e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.8916361033916473, + "num_tokens": 3848451.0, + "step": 2148 + }, + { + "epoch": 0.34798801716460204, + "grad_norm": 23.610078811645508, + "learning_rate": 6.522020725388601e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.9120039641857147, + "num_tokens": 3850247.0, + "step": 2149 + }, + { + "epoch": 0.3481499473726824, + "grad_norm": 24.427082061767578, + "learning_rate": 6.5204015544041455e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.9093185067176819, + "num_tokens": 3852046.0, + "step": 2150 + }, + { + "epoch": 0.3483118775807627, + "grad_norm": 28.468671798706055, + "learning_rate": 6.51878238341969e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9067831337451935, + "num_tokens": 3853837.0, + "step": 2151 + }, + { + "epoch": 0.348473807788843, + "grad_norm": 19.28902816772461, + "learning_rate": 6.5171632124352335e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.9191673398017883, + "num_tokens": 3855621.0, + "step": 2152 + }, + { + "epoch": 0.3486357379969233, + "grad_norm": 19.29517936706543, + "learning_rate": 6.515544041450778e-06, + "loss": 0.5749, + "mean_token_accuracy": 0.9207557141780853, + "num_tokens": 3857411.0, + "step": 2153 + }, + { + "epoch": 0.34879766820500363, + "grad_norm": 21.733097076416016, + "learning_rate": 6.5139248704663215e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.9153417944908142, + "num_tokens": 3859207.0, + "step": 2154 + }, + { + "epoch": 0.34895959841308394, + "grad_norm": 16.114097595214844, + "learning_rate": 6.512305699481866e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.9319444596767426, + "num_tokens": 3860998.0, + "step": 2155 + }, + { + "epoch": 0.3491215286211643, + "grad_norm": 22.43841552734375, + "learning_rate": 6.5106865284974095e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.9116838276386261, + "num_tokens": 3862793.0, + "step": 2156 + }, + { + "epoch": 0.3492834588292446, + "grad_norm": 14.085238456726074, + "learning_rate": 6.509067357512954e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.9301860332489014, + "num_tokens": 3864577.0, + "step": 2157 + }, + { + "epoch": 0.3494453890373249, + "grad_norm": 29.33224105834961, + "learning_rate": 6.5074481865284976e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.8738393187522888, + "num_tokens": 3866366.0, + "step": 2158 + }, + { + "epoch": 0.3496073192454052, + "grad_norm": 21.894073486328125, + "learning_rate": 6.505829015544042e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.9097852110862732, + "num_tokens": 3868155.0, + "step": 2159 + }, + { + "epoch": 0.34976924945348553, + "grad_norm": 25.372777938842773, + "learning_rate": 6.504209844559586e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.9064182341098785, + "num_tokens": 3869945.0, + "step": 2160 + }, + { + "epoch": 0.34993117966156584, + "grad_norm": 31.39737892150879, + "learning_rate": 6.50259067357513e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.9008542001247406, + "num_tokens": 3871749.0, + "step": 2161 + }, + { + "epoch": 0.3500931098696462, + "grad_norm": 25.322107315063477, + "learning_rate": 6.500971502590674e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.8953647315502167, + "num_tokens": 3873537.0, + "step": 2162 + }, + { + "epoch": 0.3502550400777265, + "grad_norm": 23.296398162841797, + "learning_rate": 6.499352331606218e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.9176002144813538, + "num_tokens": 3875328.0, + "step": 2163 + }, + { + "epoch": 0.3504169702858068, + "grad_norm": 28.476030349731445, + "learning_rate": 6.497733160621762e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.902803361415863, + "num_tokens": 3877124.0, + "step": 2164 + }, + { + "epoch": 0.3505789004938871, + "grad_norm": 26.659847259521484, + "learning_rate": 6.496113989637306e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.901033878326416, + "num_tokens": 3878919.0, + "step": 2165 + }, + { + "epoch": 0.35074083070196743, + "grad_norm": 23.27151870727539, + "learning_rate": 6.49449481865285e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.8963005542755127, + "num_tokens": 3880720.0, + "step": 2166 + }, + { + "epoch": 0.3509027609100478, + "grad_norm": 21.743480682373047, + "learning_rate": 6.492875647668394e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.9157631993293762, + "num_tokens": 3882505.0, + "step": 2167 + }, + { + "epoch": 0.3510646911181281, + "grad_norm": 33.06528091430664, + "learning_rate": 6.491256476683938e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.8974468111991882, + "num_tokens": 3884308.0, + "step": 2168 + }, + { + "epoch": 0.3512266213262084, + "grad_norm": 21.615249633789062, + "learning_rate": 6.489637305699482e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.9198540151119232, + "num_tokens": 3886107.0, + "step": 2169 + }, + { + "epoch": 0.3513885515342887, + "grad_norm": 23.438983917236328, + "learning_rate": 6.4880181347150266e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.9104995429515839, + "num_tokens": 3887898.0, + "step": 2170 + }, + { + "epoch": 0.351550481742369, + "grad_norm": 24.982452392578125, + "learning_rate": 6.48639896373057e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.9085317552089691, + "num_tokens": 3889694.0, + "step": 2171 + }, + { + "epoch": 0.35171241195044933, + "grad_norm": 22.238332748413086, + "learning_rate": 6.484779792746115e-06, + "loss": 0.6205, + "mean_token_accuracy": 0.9106506407260895, + "num_tokens": 3891474.0, + "step": 2172 + }, + { + "epoch": 0.3518743421585297, + "grad_norm": 19.65492057800293, + "learning_rate": 6.483160621761658e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.9171359837055206, + "num_tokens": 3893263.0, + "step": 2173 + }, + { + "epoch": 0.35203627236661, + "grad_norm": 28.740571975708008, + "learning_rate": 6.481541450777203e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.8981566727161407, + "num_tokens": 3895070.0, + "step": 2174 + }, + { + "epoch": 0.3521982025746903, + "grad_norm": 20.463491439819336, + "learning_rate": 6.479922279792746e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.9199904799461365, + "num_tokens": 3896857.0, + "step": 2175 + }, + { + "epoch": 0.3523601327827706, + "grad_norm": 28.632444381713867, + "learning_rate": 6.478303108808291e-06, + "loss": 0.6384, + "mean_token_accuracy": 0.9181406795978546, + "num_tokens": 3898648.0, + "step": 2176 + }, + { + "epoch": 0.3525220629908509, + "grad_norm": 31.122787475585938, + "learning_rate": 6.476683937823834e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.8806885182857513, + "num_tokens": 3900445.0, + "step": 2177 + }, + { + "epoch": 0.35268399319893123, + "grad_norm": 39.111812591552734, + "learning_rate": 6.475064766839379e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.8923469483852386, + "num_tokens": 3902244.0, + "step": 2178 + }, + { + "epoch": 0.3528459234070116, + "grad_norm": 26.414106369018555, + "learning_rate": 6.473445595854922e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.9079371392726898, + "num_tokens": 3904038.0, + "step": 2179 + }, + { + "epoch": 0.3530078536150919, + "grad_norm": 16.560407638549805, + "learning_rate": 6.471826424870467e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.923355907201767, + "num_tokens": 3905823.0, + "step": 2180 + }, + { + "epoch": 0.3531697838231722, + "grad_norm": 30.784868240356445, + "learning_rate": 6.47020725388601e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.8981938660144806, + "num_tokens": 3907620.0, + "step": 2181 + }, + { + "epoch": 0.3533317140312525, + "grad_norm": 19.335500717163086, + "learning_rate": 6.468588082901555e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.9160934388637543, + "num_tokens": 3909406.0, + "step": 2182 + }, + { + "epoch": 0.3534936442393328, + "grad_norm": 21.249637603759766, + "learning_rate": 6.466968911917098e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.9219701290130615, + "num_tokens": 3911200.0, + "step": 2183 + }, + { + "epoch": 0.3536555744474132, + "grad_norm": 28.769323348999023, + "learning_rate": 6.465349740932643e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.8975040316581726, + "num_tokens": 3912985.0, + "step": 2184 + }, + { + "epoch": 0.3538175046554935, + "grad_norm": 19.6943416595459, + "learning_rate": 6.463730569948186e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 3914783.0, + "step": 2185 + }, + { + "epoch": 0.3539794348635738, + "grad_norm": 19.53598403930664, + "learning_rate": 6.462111398963731e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.9122383296489716, + "num_tokens": 3916568.0, + "step": 2186 + }, + { + "epoch": 0.3541413650716541, + "grad_norm": 26.573444366455078, + "learning_rate": 6.460492227979274e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.9058031737804413, + "num_tokens": 3918357.0, + "step": 2187 + }, + { + "epoch": 0.3543032952797344, + "grad_norm": 24.73661994934082, + "learning_rate": 6.458873056994819e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.9100933074951172, + "num_tokens": 3920144.0, + "step": 2188 + }, + { + "epoch": 0.35446522548781473, + "grad_norm": 24.69605255126953, + "learning_rate": 6.457253886010363e-06, + "loss": 0.782, + "mean_token_accuracy": 0.8998858034610748, + "num_tokens": 3921946.0, + "step": 2189 + }, + { + "epoch": 0.3546271556958951, + "grad_norm": 17.774948120117188, + "learning_rate": 6.455634715025907e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.9215146005153656, + "num_tokens": 3923738.0, + "step": 2190 + }, + { + "epoch": 0.3547890859039754, + "grad_norm": 19.63977813720703, + "learning_rate": 6.454015544041451e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.9250841736793518, + "num_tokens": 3925517.0, + "step": 2191 + }, + { + "epoch": 0.3549510161120557, + "grad_norm": 21.64784812927246, + "learning_rate": 6.452396373056995e-06, + "loss": 0.5961, + "mean_token_accuracy": 0.9041826725006104, + "num_tokens": 3927303.0, + "step": 2192 + }, + { + "epoch": 0.355112946320136, + "grad_norm": 25.56424331665039, + "learning_rate": 6.450777202072539e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.9089855253696442, + "num_tokens": 3929103.0, + "step": 2193 + }, + { + "epoch": 0.3552748765282163, + "grad_norm": 25.165279388427734, + "learning_rate": 6.449158031088083e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.901846170425415, + "num_tokens": 3930890.0, + "step": 2194 + }, + { + "epoch": 0.35543680673629663, + "grad_norm": 20.730913162231445, + "learning_rate": 6.447538860103627e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.9091269969940186, + "num_tokens": 3932677.0, + "step": 2195 + }, + { + "epoch": 0.355598736944377, + "grad_norm": 20.076528549194336, + "learning_rate": 6.445919689119171e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.9171116650104523, + "num_tokens": 3934475.0, + "step": 2196 + }, + { + "epoch": 0.3557606671524573, + "grad_norm": 22.78238868713379, + "learning_rate": 6.444300518134715e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.9063973128795624, + "num_tokens": 3936265.0, + "step": 2197 + }, + { + "epoch": 0.3559225973605376, + "grad_norm": 29.081985473632812, + "learning_rate": 6.442681347150259e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.8993506729602814, + "num_tokens": 3938085.0, + "step": 2198 + }, + { + "epoch": 0.3560845275686179, + "grad_norm": 24.60321807861328, + "learning_rate": 6.441062176165803e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.8968862593173981, + "num_tokens": 3939876.0, + "step": 2199 + }, + { + "epoch": 0.3562464577766982, + "grad_norm": 20.562898635864258, + "learning_rate": 6.439443005181347e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.9092437028884888, + "num_tokens": 3941664.0, + "step": 2200 + }, + { + "epoch": 0.3564083879847786, + "grad_norm": 24.693836212158203, + "learning_rate": 6.437823834196891e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9002995491027832, + "num_tokens": 3943447.0, + "step": 2201 + }, + { + "epoch": 0.3565703181928589, + "grad_norm": 17.60163688659668, + "learning_rate": 6.436204663212435e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.9113799929618835, + "num_tokens": 3945230.0, + "step": 2202 + }, + { + "epoch": 0.3567322484009392, + "grad_norm": 17.729053497314453, + "learning_rate": 6.434585492227979e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.9117646813392639, + "num_tokens": 3947014.0, + "step": 2203 + }, + { + "epoch": 0.3568941786090195, + "grad_norm": 17.950485229492188, + "learning_rate": 6.432966321243523e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.9147887229919434, + "num_tokens": 3948808.0, + "step": 2204 + }, + { + "epoch": 0.3570561088170998, + "grad_norm": 20.892547607421875, + "learning_rate": 6.4313471502590674e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.9107877314090729, + "num_tokens": 3950599.0, + "step": 2205 + }, + { + "epoch": 0.3572180390251801, + "grad_norm": 28.4982852935791, + "learning_rate": 6.429727979274611e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.8985221683979034, + "num_tokens": 3952396.0, + "step": 2206 + }, + { + "epoch": 0.3573799692332605, + "grad_norm": 19.08439826965332, + "learning_rate": 6.4281088082901555e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.9081169068813324, + "num_tokens": 3954180.0, + "step": 2207 + }, + { + "epoch": 0.3575418994413408, + "grad_norm": 18.782014846801758, + "learning_rate": 6.4264896373057e-06, + "loss": 0.6324, + "mean_token_accuracy": 0.9076087176799774, + "num_tokens": 3955974.0, + "step": 2208 + }, + { + "epoch": 0.3577038296494211, + "grad_norm": 17.24158477783203, + "learning_rate": 6.4248704663212435e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.9196909368038177, + "num_tokens": 3957760.0, + "step": 2209 + }, + { + "epoch": 0.3578657598575014, + "grad_norm": 19.374765396118164, + "learning_rate": 6.423251295336788e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.9154887795448303, + "num_tokens": 3959556.0, + "step": 2210 + }, + { + "epoch": 0.3580276900655817, + "grad_norm": 17.251399993896484, + "learning_rate": 6.4216321243523315e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.9157369434833527, + "num_tokens": 3961342.0, + "step": 2211 + }, + { + "epoch": 0.3581896202736621, + "grad_norm": 18.58675193786621, + "learning_rate": 6.420012953367876e-06, + "loss": 0.623, + "mean_token_accuracy": 0.9122377634048462, + "num_tokens": 3963127.0, + "step": 2212 + }, + { + "epoch": 0.3583515504817424, + "grad_norm": 23.050987243652344, + "learning_rate": 6.4183937823834196e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.9127601683139801, + "num_tokens": 3964915.0, + "step": 2213 + }, + { + "epoch": 0.3585134806898227, + "grad_norm": 16.583791732788086, + "learning_rate": 6.416774611398964e-06, + "loss": 0.5465, + "mean_token_accuracy": 0.919117659330368, + "num_tokens": 3966699.0, + "step": 2214 + }, + { + "epoch": 0.358675410897903, + "grad_norm": 18.57876968383789, + "learning_rate": 6.415155440414508e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.9094203114509583, + "num_tokens": 3968487.0, + "step": 2215 + }, + { + "epoch": 0.3588373411059833, + "grad_norm": 20.355701446533203, + "learning_rate": 6.413536269430052e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9210199117660522, + "num_tokens": 3970277.0, + "step": 2216 + }, + { + "epoch": 0.3589992713140636, + "grad_norm": 20.74054527282715, + "learning_rate": 6.411917098445596e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.9163228571414948, + "num_tokens": 3972064.0, + "step": 2217 + }, + { + "epoch": 0.359161201522144, + "grad_norm": 19.709436416625977, + "learning_rate": 6.41029792746114e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.9106077551841736, + "num_tokens": 3973855.0, + "step": 2218 + }, + { + "epoch": 0.3593231317302243, + "grad_norm": 27.718469619750977, + "learning_rate": 6.408678756476684e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.9007360637187958, + "num_tokens": 3975647.0, + "step": 2219 + }, + { + "epoch": 0.3594850619383046, + "grad_norm": 21.05332374572754, + "learning_rate": 6.407059585492228e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.9156555533409119, + "num_tokens": 3977432.0, + "step": 2220 + }, + { + "epoch": 0.3596469921463849, + "grad_norm": 23.723247528076172, + "learning_rate": 6.405440414507773e-06, + "loss": 0.6236, + "mean_token_accuracy": 0.9197080135345459, + "num_tokens": 3979218.0, + "step": 2221 + }, + { + "epoch": 0.3598089223544652, + "grad_norm": 22.432613372802734, + "learning_rate": 6.403821243523317e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.9116883277893066, + "num_tokens": 3981002.0, + "step": 2222 + }, + { + "epoch": 0.3599708525625455, + "grad_norm": 22.70639991760254, + "learning_rate": 6.402202072538861e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.9172360301017761, + "num_tokens": 3982792.0, + "step": 2223 + }, + { + "epoch": 0.3601327827706259, + "grad_norm": 20.778104782104492, + "learning_rate": 6.400582901554405e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9178501665592194, + "num_tokens": 3984572.0, + "step": 2224 + }, + { + "epoch": 0.3602947129787062, + "grad_norm": 17.046594619750977, + "learning_rate": 6.398963730569949e-06, + "loss": 0.524, + "mean_token_accuracy": 0.9230892956256866, + "num_tokens": 3986357.0, + "step": 2225 + }, + { + "epoch": 0.3604566431867865, + "grad_norm": 24.413124084472656, + "learning_rate": 6.397344559585493e-06, + "loss": 0.681, + "mean_token_accuracy": 0.9015937745571136, + "num_tokens": 3988163.0, + "step": 2226 + }, + { + "epoch": 0.3606185733948668, + "grad_norm": 23.731372833251953, + "learning_rate": 6.3957253886010374e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.9119867086410522, + "num_tokens": 3989959.0, + "step": 2227 + }, + { + "epoch": 0.3607805036029471, + "grad_norm": 26.18189239501953, + "learning_rate": 6.394106217616581e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.9135818481445312, + "num_tokens": 3991748.0, + "step": 2228 + }, + { + "epoch": 0.3609424338110275, + "grad_norm": 21.4954891204834, + "learning_rate": 6.3924870466321255e-06, + "loss": 0.571, + "mean_token_accuracy": 0.9179131388664246, + "num_tokens": 3993539.0, + "step": 2229 + }, + { + "epoch": 0.3611043640191078, + "grad_norm": 25.930837631225586, + "learning_rate": 6.390867875647669e-06, + "loss": 0.743, + "mean_token_accuracy": 0.9045799672603607, + "num_tokens": 3995334.0, + "step": 2230 + }, + { + "epoch": 0.3612662942271881, + "grad_norm": 29.53824806213379, + "learning_rate": 6.3892487046632135e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.9051370322704315, + "num_tokens": 3997129.0, + "step": 2231 + }, + { + "epoch": 0.3614282244352684, + "grad_norm": 25.533300399780273, + "learning_rate": 6.387629533678757e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.9019704461097717, + "num_tokens": 3998926.0, + "step": 2232 + }, + { + "epoch": 0.3615901546433487, + "grad_norm": 27.277572631835938, + "learning_rate": 6.3860103626943015e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.8994092047214508, + "num_tokens": 4000716.0, + "step": 2233 + }, + { + "epoch": 0.361752084851429, + "grad_norm": 24.49983787536621, + "learning_rate": 6.384391191709846e-06, + "loss": 0.6236, + "mean_token_accuracy": 0.9057773351669312, + "num_tokens": 4002504.0, + "step": 2234 + }, + { + "epoch": 0.3619140150595094, + "grad_norm": 25.49953269958496, + "learning_rate": 6.3827720207253895e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.9103453755378723, + "num_tokens": 4004295.0, + "step": 2235 + }, + { + "epoch": 0.3620759452675897, + "grad_norm": 29.299388885498047, + "learning_rate": 6.381152849740934e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.9023735225200653, + "num_tokens": 4006084.0, + "step": 2236 + }, + { + "epoch": 0.36223787547567, + "grad_norm": 22.609750747680664, + "learning_rate": 6.3795336787564776e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.9089886546134949, + "num_tokens": 4007882.0, + "step": 2237 + }, + { + "epoch": 0.3623998056837503, + "grad_norm": 21.020383834838867, + "learning_rate": 6.377914507772022e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.9249706864356995, + "num_tokens": 4009674.0, + "step": 2238 + }, + { + "epoch": 0.3625617358918306, + "grad_norm": 22.250280380249023, + "learning_rate": 6.376295336787566e-06, + "loss": 0.5649, + "mean_token_accuracy": 0.9104297459125519, + "num_tokens": 4011465.0, + "step": 2239 + }, + { + "epoch": 0.3627236660999109, + "grad_norm": 26.73594856262207, + "learning_rate": 6.37467616580311e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.8925926089286804, + "num_tokens": 4013256.0, + "step": 2240 + }, + { + "epoch": 0.3628855963079913, + "grad_norm": 22.481779098510742, + "learning_rate": 6.373056994818654e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.9039260447025299, + "num_tokens": 4015049.0, + "step": 2241 + }, + { + "epoch": 0.3630475265160716, + "grad_norm": 23.815908432006836, + "learning_rate": 6.371437823834198e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.9049826860427856, + "num_tokens": 4016835.0, + "step": 2242 + }, + { + "epoch": 0.3632094567241519, + "grad_norm": 15.922857284545898, + "learning_rate": 6.369818652849742e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9239558577537537, + "num_tokens": 4018623.0, + "step": 2243 + }, + { + "epoch": 0.3633713869322322, + "grad_norm": 17.92227554321289, + "learning_rate": 6.368199481865286e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.9259096682071686, + "num_tokens": 4020405.0, + "step": 2244 + }, + { + "epoch": 0.3635333171403125, + "grad_norm": 24.845996856689453, + "learning_rate": 6.36658031088083e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.897082507610321, + "num_tokens": 4022199.0, + "step": 2245 + }, + { + "epoch": 0.36369524734839287, + "grad_norm": 30.46639633178711, + "learning_rate": 6.364961139896374e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.8785386979579926, + "num_tokens": 4023999.0, + "step": 2246 + }, + { + "epoch": 0.3638571775564732, + "grad_norm": 28.35569190979004, + "learning_rate": 6.363341968911918e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.9060223400592804, + "num_tokens": 4025798.0, + "step": 2247 + }, + { + "epoch": 0.3640191077645535, + "grad_norm": 19.480100631713867, + "learning_rate": 6.361722797927462e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.9233989119529724, + "num_tokens": 4027584.0, + "step": 2248 + }, + { + "epoch": 0.3641810379726338, + "grad_norm": 27.425073623657227, + "learning_rate": 6.360103626943006e-06, + "loss": 0.745, + "mean_token_accuracy": 0.8896499276161194, + "num_tokens": 4029386.0, + "step": 2249 + }, + { + "epoch": 0.3643429681807141, + "grad_norm": 23.494970321655273, + "learning_rate": 6.35848445595855e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.9052592515945435, + "num_tokens": 4031182.0, + "step": 2250 + }, + { + "epoch": 0.3645048983887944, + "grad_norm": 24.000337600708008, + "learning_rate": 6.356865284974094e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.9052910208702087, + "num_tokens": 4032969.0, + "step": 2251 + }, + { + "epoch": 0.36466682859687477, + "grad_norm": 26.413236618041992, + "learning_rate": 6.355246113989638e-06, + "loss": 0.638, + "mean_token_accuracy": 0.9183934032917023, + "num_tokens": 4034764.0, + "step": 2252 + }, + { + "epoch": 0.3648287588049551, + "grad_norm": 28.493099212646484, + "learning_rate": 6.353626943005183e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.9046049118041992, + "num_tokens": 4036559.0, + "step": 2253 + }, + { + "epoch": 0.3649906890130354, + "grad_norm": 28.178659439086914, + "learning_rate": 6.352007772020726e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.9089753031730652, + "num_tokens": 4038357.0, + "step": 2254 + }, + { + "epoch": 0.3651526192211157, + "grad_norm": 28.810771942138672, + "learning_rate": 6.350388601036271e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.9027248620986938, + "num_tokens": 4040157.0, + "step": 2255 + }, + { + "epoch": 0.365314549429196, + "grad_norm": 18.55508041381836, + "learning_rate": 6.348769430051814e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.9272255897521973, + "num_tokens": 4041944.0, + "step": 2256 + }, + { + "epoch": 0.3654764796372763, + "grad_norm": 15.460658073425293, + "learning_rate": 6.347150259067359e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.9333333373069763, + "num_tokens": 4043726.0, + "step": 2257 + }, + { + "epoch": 0.36563840984535667, + "grad_norm": 29.21817398071289, + "learning_rate": 6.345531088082902e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.8981158137321472, + "num_tokens": 4045513.0, + "step": 2258 + }, + { + "epoch": 0.365800340053437, + "grad_norm": 30.518543243408203, + "learning_rate": 6.343911917098447e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.8950586915016174, + "num_tokens": 4047311.0, + "step": 2259 + }, + { + "epoch": 0.3659622702615173, + "grad_norm": 21.996871948242188, + "learning_rate": 6.34229274611399e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.9106077551841736, + "num_tokens": 4049102.0, + "step": 2260 + }, + { + "epoch": 0.3661242004695976, + "grad_norm": 23.002965927124023, + "learning_rate": 6.340673575129535e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.9161014258861542, + "num_tokens": 4050887.0, + "step": 2261 + }, + { + "epoch": 0.3662861306776779, + "grad_norm": 21.776159286499023, + "learning_rate": 6.339054404145078e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.9082373082637787, + "num_tokens": 4052673.0, + "step": 2262 + }, + { + "epoch": 0.36644806088575826, + "grad_norm": 18.76465606689453, + "learning_rate": 6.337435233160623e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.922358363866806, + "num_tokens": 4054468.0, + "step": 2263 + }, + { + "epoch": 0.36660999109383857, + "grad_norm": 30.67425537109375, + "learning_rate": 6.335816062176166e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.8983929753303528, + "num_tokens": 4056273.0, + "step": 2264 + }, + { + "epoch": 0.3667719213019189, + "grad_norm": 23.984634399414062, + "learning_rate": 6.334196891191711e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.9116408228874207, + "num_tokens": 4058079.0, + "step": 2265 + }, + { + "epoch": 0.3669338515099992, + "grad_norm": 25.59101676940918, + "learning_rate": 6.332577720207254e-06, + "loss": 0.65, + "mean_token_accuracy": 0.9103172123432159, + "num_tokens": 4059880.0, + "step": 2266 + }, + { + "epoch": 0.3670957817180795, + "grad_norm": 25.86172866821289, + "learning_rate": 6.330958549222799e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.913968563079834, + "num_tokens": 4061671.0, + "step": 2267 + }, + { + "epoch": 0.3672577119261598, + "grad_norm": 22.27694320678711, + "learning_rate": 6.329339378238342e-06, + "loss": 0.5915, + "mean_token_accuracy": 0.9078137874603271, + "num_tokens": 4063476.0, + "step": 2268 + }, + { + "epoch": 0.36741964213424017, + "grad_norm": 23.27398681640625, + "learning_rate": 6.327720207253887e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.905844658613205, + "num_tokens": 4065264.0, + "step": 2269 + }, + { + "epoch": 0.3675815723423205, + "grad_norm": 23.89768409729004, + "learning_rate": 6.326101036269431e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.8995938897132874, + "num_tokens": 4067046.0, + "step": 2270 + }, + { + "epoch": 0.3677435025504008, + "grad_norm": 22.042600631713867, + "learning_rate": 6.324481865284975e-06, + "loss": 0.5969, + "mean_token_accuracy": 0.9139508605003357, + "num_tokens": 4068837.0, + "step": 2271 + }, + { + "epoch": 0.3679054327584811, + "grad_norm": 14.952237129211426, + "learning_rate": 6.322862694300519e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.9237650036811829, + "num_tokens": 4070638.0, + "step": 2272 + }, + { + "epoch": 0.3680673629665614, + "grad_norm": 22.978012084960938, + "learning_rate": 6.321243523316063e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.9067129492759705, + "num_tokens": 4072429.0, + "step": 2273 + }, + { + "epoch": 0.3682292931746417, + "grad_norm": 25.961809158325195, + "learning_rate": 6.319624352331607e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.8986742198467255, + "num_tokens": 4074217.0, + "step": 2274 + }, + { + "epoch": 0.36839122338272207, + "grad_norm": 27.614992141723633, + "learning_rate": 6.318005181347151e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.8922211229801178, + "num_tokens": 4076015.0, + "step": 2275 + }, + { + "epoch": 0.3685531535908024, + "grad_norm": 20.41364097595215, + "learning_rate": 6.316386010362695e-06, + "loss": 0.65, + "mean_token_accuracy": 0.9083916246891022, + "num_tokens": 4077800.0, + "step": 2276 + }, + { + "epoch": 0.3687150837988827, + "grad_norm": 32.38789749145508, + "learning_rate": 6.314766839378239e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.9021503329277039, + "num_tokens": 4079597.0, + "step": 2277 + }, + { + "epoch": 0.368877014006963, + "grad_norm": 23.38788414001465, + "learning_rate": 6.313147668393783e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.9045549929141998, + "num_tokens": 4081392.0, + "step": 2278 + }, + { + "epoch": 0.3690389442150433, + "grad_norm": 23.370681762695312, + "learning_rate": 6.311528497409327e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.9151688516139984, + "num_tokens": 4083175.0, + "step": 2279 + }, + { + "epoch": 0.36920087442312366, + "grad_norm": 21.638273239135742, + "learning_rate": 6.309909326424871e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.9138127863407135, + "num_tokens": 4084965.0, + "step": 2280 + }, + { + "epoch": 0.36936280463120397, + "grad_norm": 22.466955184936523, + "learning_rate": 6.308290155440415e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.9143323600292206, + "num_tokens": 4086757.0, + "step": 2281 + }, + { + "epoch": 0.3695247348392843, + "grad_norm": 24.6545352935791, + "learning_rate": 6.306670984455959e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.9001071751117706, + "num_tokens": 4088549.0, + "step": 2282 + }, + { + "epoch": 0.3696866650473646, + "grad_norm": 15.135995864868164, + "learning_rate": 6.305051813471503e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.9280020892620087, + "num_tokens": 4090339.0, + "step": 2283 + }, + { + "epoch": 0.3698485952554449, + "grad_norm": 31.99400520324707, + "learning_rate": 6.3034326424870474e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.8988343775272369, + "num_tokens": 4092138.0, + "step": 2284 + }, + { + "epoch": 0.3700105254635252, + "grad_norm": 22.13029670715332, + "learning_rate": 6.301813471502591e-06, + "loss": 0.6305, + "mean_token_accuracy": 0.9074974656105042, + "num_tokens": 4093931.0, + "step": 2285 + }, + { + "epoch": 0.37017245567160556, + "grad_norm": 19.34627342224121, + "learning_rate": 6.3001943005181355e-06, + "loss": 0.579, + "mean_token_accuracy": 0.9171972870826721, + "num_tokens": 4095721.0, + "step": 2286 + }, + { + "epoch": 0.37033438587968587, + "grad_norm": 24.71289825439453, + "learning_rate": 6.298575129533679e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.8978580832481384, + "num_tokens": 4097507.0, + "step": 2287 + }, + { + "epoch": 0.3704963160877662, + "grad_norm": 28.232088088989258, + "learning_rate": 6.2969559585492235e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.9048126935958862, + "num_tokens": 4099303.0, + "step": 2288 + }, + { + "epoch": 0.3706582462958465, + "grad_norm": 25.628108978271484, + "learning_rate": 6.295336787564768e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.892824113368988, + "num_tokens": 4101094.0, + "step": 2289 + }, + { + "epoch": 0.3708201765039268, + "grad_norm": 19.739723205566406, + "learning_rate": 6.2937176165803115e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.9203381836414337, + "num_tokens": 4102882.0, + "step": 2290 + }, + { + "epoch": 0.3709821067120071, + "grad_norm": 17.226787567138672, + "learning_rate": 6.292098445595856e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.9204832017421722, + "num_tokens": 4104670.0, + "step": 2291 + }, + { + "epoch": 0.37114403692008746, + "grad_norm": 22.36802101135254, + "learning_rate": 6.2904792746113995e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 4106459.0, + "step": 2292 + }, + { + "epoch": 0.37130596712816777, + "grad_norm": 15.932029724121094, + "learning_rate": 6.288860103626944e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9266433119773865, + "num_tokens": 4108256.0, + "step": 2293 + }, + { + "epoch": 0.3714678973362481, + "grad_norm": 21.709257125854492, + "learning_rate": 6.2872409326424876e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.9003114402294159, + "num_tokens": 4110029.0, + "step": 2294 + }, + { + "epoch": 0.3716298275443284, + "grad_norm": 35.92877960205078, + "learning_rate": 6.285621761658032e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.8998287618160248, + "num_tokens": 4111839.0, + "step": 2295 + }, + { + "epoch": 0.3717917577524087, + "grad_norm": 18.95516014099121, + "learning_rate": 6.284002590673576e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.9136690497398376, + "num_tokens": 4113629.0, + "step": 2296 + }, + { + "epoch": 0.37195368796048905, + "grad_norm": 21.056535720825195, + "learning_rate": 6.28238341968912e-06, + "loss": 0.61, + "mean_token_accuracy": 0.9245029091835022, + "num_tokens": 4115419.0, + "step": 2297 + }, + { + "epoch": 0.37211561816856936, + "grad_norm": 26.51069450378418, + "learning_rate": 6.280764248704664e-06, + "loss": 0.632, + "mean_token_accuracy": 0.9092245101928711, + "num_tokens": 4117217.0, + "step": 2298 + }, + { + "epoch": 0.37227754837664967, + "grad_norm": 19.568313598632812, + "learning_rate": 6.279145077720208e-06, + "loss": 0.545, + "mean_token_accuracy": 0.9250357449054718, + "num_tokens": 4119009.0, + "step": 2299 + }, + { + "epoch": 0.37243947858473, + "grad_norm": 23.084980010986328, + "learning_rate": 6.277525906735752e-06, + "loss": 0.615, + "mean_token_accuracy": 0.9119922816753387, + "num_tokens": 4120805.0, + "step": 2300 + }, + { + "epoch": 0.3726014087928103, + "grad_norm": 17.704103469848633, + "learning_rate": 6.275906735751296e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.9284208714962006, + "num_tokens": 4122582.0, + "step": 2301 + }, + { + "epoch": 0.3727633390008906, + "grad_norm": 26.454050064086914, + "learning_rate": 6.27428756476684e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.9072712361812592, + "num_tokens": 4124383.0, + "step": 2302 + }, + { + "epoch": 0.37292526920897096, + "grad_norm": 20.5181827545166, + "learning_rate": 6.272668393782384e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.918178141117096, + "num_tokens": 4126176.0, + "step": 2303 + }, + { + "epoch": 0.37308719941705126, + "grad_norm": 28.848773956298828, + "learning_rate": 6.271049222797928e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.8857142925262451, + "num_tokens": 4127968.0, + "step": 2304 + }, + { + "epoch": 0.37324912962513157, + "grad_norm": 18.847166061401367, + "learning_rate": 6.269430051813472e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.9130389094352722, + "num_tokens": 4129756.0, + "step": 2305 + }, + { + "epoch": 0.3734110598332119, + "grad_norm": 30.33755874633789, + "learning_rate": 6.267810880829016e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.9020931124687195, + "num_tokens": 4131554.0, + "step": 2306 + }, + { + "epoch": 0.3735729900412922, + "grad_norm": 15.333966255187988, + "learning_rate": 6.26619170984456e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 4133340.0, + "step": 2307 + }, + { + "epoch": 0.3737349202493725, + "grad_norm": 17.22572898864746, + "learning_rate": 6.264572538860105e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.9240403473377228, + "num_tokens": 4135128.0, + "step": 2308 + }, + { + "epoch": 0.37389685045745286, + "grad_norm": 23.395233154296875, + "learning_rate": 6.262953367875648e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.9148142039775848, + "num_tokens": 4136909.0, + "step": 2309 + }, + { + "epoch": 0.37405878066553316, + "grad_norm": 25.794370651245117, + "learning_rate": 6.261334196891193e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.9094164371490479, + "num_tokens": 4138696.0, + "step": 2310 + }, + { + "epoch": 0.37422071087361347, + "grad_norm": 21.045114517211914, + "learning_rate": 6.259715025906736e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.919584333896637, + "num_tokens": 4140482.0, + "step": 2311 + }, + { + "epoch": 0.3743826410816938, + "grad_norm": 24.307363510131836, + "learning_rate": 6.258095854922281e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.9070360660552979, + "num_tokens": 4142274.0, + "step": 2312 + }, + { + "epoch": 0.3745445712897741, + "grad_norm": 28.865230560302734, + "learning_rate": 6.256476683937824e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.8979166448116302, + "num_tokens": 4144070.0, + "step": 2313 + }, + { + "epoch": 0.37470650149785445, + "grad_norm": 22.208402633666992, + "learning_rate": 6.254857512953369e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.9111519455909729, + "num_tokens": 4145862.0, + "step": 2314 + }, + { + "epoch": 0.37486843170593476, + "grad_norm": 20.481834411621094, + "learning_rate": 6.253238341968912e-06, + "loss": 0.5823, + "mean_token_accuracy": 0.9198294281959534, + "num_tokens": 4147648.0, + "step": 2315 + }, + { + "epoch": 0.37503036191401506, + "grad_norm": 17.121299743652344, + "learning_rate": 6.251619170984457e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.9275209903717041, + "num_tokens": 4149436.0, + "step": 2316 + }, + { + "epoch": 0.37519229212209537, + "grad_norm": 21.6956729888916, + "learning_rate": 6.25e-06, + "loss": 0.559, + "mean_token_accuracy": 0.9205268919467926, + "num_tokens": 4151225.0, + "step": 2317 + }, + { + "epoch": 0.3753542223301757, + "grad_norm": 20.43297576904297, + "learning_rate": 6.248380829015545e-06, + "loss": 0.635, + "mean_token_accuracy": 0.908904105424881, + "num_tokens": 4153023.0, + "step": 2318 + }, + { + "epoch": 0.375516152538256, + "grad_norm": 23.611289978027344, + "learning_rate": 6.246761658031088e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.9193286001682281, + "num_tokens": 4154820.0, + "step": 2319 + }, + { + "epoch": 0.37567808274633635, + "grad_norm": 31.101444244384766, + "learning_rate": 6.245142487046633e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.8932262659072876, + "num_tokens": 4156613.0, + "step": 2320 + }, + { + "epoch": 0.37584001295441666, + "grad_norm": 27.796873092651367, + "learning_rate": 6.243523316062176e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.9142303168773651, + "num_tokens": 4158405.0, + "step": 2321 + }, + { + "epoch": 0.37600194316249697, + "grad_norm": 19.307470321655273, + "learning_rate": 6.241904145077721e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.9273574352264404, + "num_tokens": 4160205.0, + "step": 2322 + }, + { + "epoch": 0.3761638733705773, + "grad_norm": 32.03840255737305, + "learning_rate": 6.240284974093264e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.88413867354393, + "num_tokens": 4161993.0, + "step": 2323 + }, + { + "epoch": 0.3763258035786576, + "grad_norm": 34.219757080078125, + "learning_rate": 6.238665803108809e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.882758617401123, + "num_tokens": 4163795.0, + "step": 2324 + }, + { + "epoch": 0.37648773378673794, + "grad_norm": 23.71923065185547, + "learning_rate": 6.237046632124352e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.9079736173152924, + "num_tokens": 4165578.0, + "step": 2325 + }, + { + "epoch": 0.37664966399481825, + "grad_norm": 37.05076599121094, + "learning_rate": 6.235427461139897e-06, + "loss": 1.143, + "mean_token_accuracy": 0.8762303590774536, + "num_tokens": 4167371.0, + "step": 2326 + }, + { + "epoch": 0.37681159420289856, + "grad_norm": 19.135000228881836, + "learning_rate": 6.233808290155441e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.9110513627529144, + "num_tokens": 4169153.0, + "step": 2327 + }, + { + "epoch": 0.37697352441097887, + "grad_norm": 18.642662048339844, + "learning_rate": 6.232189119170985e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.9231814444065094, + "num_tokens": 4170939.0, + "step": 2328 + }, + { + "epoch": 0.3771354546190592, + "grad_norm": 28.72759437561035, + "learning_rate": 6.230569948186529e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.8905594348907471, + "num_tokens": 4172734.0, + "step": 2329 + }, + { + "epoch": 0.3772973848271395, + "grad_norm": 28.877880096435547, + "learning_rate": 6.228950777202073e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.9017039239406586, + "num_tokens": 4174531.0, + "step": 2330 + }, + { + "epoch": 0.37745931503521984, + "grad_norm": 29.932432174682617, + "learning_rate": 6.227331606217617e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.9043845534324646, + "num_tokens": 4176326.0, + "step": 2331 + }, + { + "epoch": 0.37762124524330015, + "grad_norm": 25.98511505126953, + "learning_rate": 6.225712435233161e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.9164335429668427, + "num_tokens": 4178111.0, + "step": 2332 + }, + { + "epoch": 0.37778317545138046, + "grad_norm": 25.36676597595215, + "learning_rate": 6.224093264248705e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.916067510843277, + "num_tokens": 4179909.0, + "step": 2333 + }, + { + "epoch": 0.37794510565946077, + "grad_norm": 26.015789031982422, + "learning_rate": 6.222474093264249e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.8996115028858185, + "num_tokens": 4181699.0, + "step": 2334 + }, + { + "epoch": 0.3781070358675411, + "grad_norm": 23.07358741760254, + "learning_rate": 6.220854922279793e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.9124854505062103, + "num_tokens": 4183486.0, + "step": 2335 + }, + { + "epoch": 0.3782689660756214, + "grad_norm": 20.87764549255371, + "learning_rate": 6.219235751295337e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.9129863977432251, + "num_tokens": 4185274.0, + "step": 2336 + }, + { + "epoch": 0.37843089628370175, + "grad_norm": 21.269840240478516, + "learning_rate": 6.217616580310881e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.9165872633457184, + "num_tokens": 4187061.0, + "step": 2337 + }, + { + "epoch": 0.37859282649178205, + "grad_norm": 21.05706787109375, + "learning_rate": 6.215997409326425e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.9277969002723694, + "num_tokens": 4188850.0, + "step": 2338 + }, + { + "epoch": 0.37875475669986236, + "grad_norm": 20.515602111816406, + "learning_rate": 6.2143782383419694e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.9243930280208588, + "num_tokens": 4190640.0, + "step": 2339 + }, + { + "epoch": 0.37891668690794267, + "grad_norm": 24.510225296020508, + "learning_rate": 6.212759067357513e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.8986153602600098, + "num_tokens": 4192437.0, + "step": 2340 + }, + { + "epoch": 0.379078617116023, + "grad_norm": 19.635196685791016, + "learning_rate": 6.2111398963730575e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.9179104566574097, + "num_tokens": 4194217.0, + "step": 2341 + }, + { + "epoch": 0.37924054732410334, + "grad_norm": 21.90341567993164, + "learning_rate": 6.209520725388601e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.9139773845672607, + "num_tokens": 4196008.0, + "step": 2342 + }, + { + "epoch": 0.37940247753218365, + "grad_norm": 18.383474349975586, + "learning_rate": 6.2079015544041455e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.9220010936260223, + "num_tokens": 4197789.0, + "step": 2343 + }, + { + "epoch": 0.37956440774026395, + "grad_norm": 24.39859390258789, + "learning_rate": 6.206282383419689e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.9136646091938019, + "num_tokens": 4199579.0, + "step": 2344 + }, + { + "epoch": 0.37972633794834426, + "grad_norm": 28.421485900878906, + "learning_rate": 6.2046632124352335e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.9034113883972168, + "num_tokens": 4201372.0, + "step": 2345 + }, + { + "epoch": 0.37988826815642457, + "grad_norm": 22.188867568969727, + "learning_rate": 6.203044041450778e-06, + "loss": 0.5955, + "mean_token_accuracy": 0.9139243066310883, + "num_tokens": 4203163.0, + "step": 2346 + }, + { + "epoch": 0.3800501983645049, + "grad_norm": 26.85664176940918, + "learning_rate": 6.2014248704663215e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.9104297459125519, + "num_tokens": 4204954.0, + "step": 2347 + }, + { + "epoch": 0.38021212857258524, + "grad_norm": 21.927711486816406, + "learning_rate": 6.199805699481866e-06, + "loss": 0.5582, + "mean_token_accuracy": 0.9123508334159851, + "num_tokens": 4206740.0, + "step": 2348 + }, + { + "epoch": 0.38037405878066555, + "grad_norm": 33.48126983642578, + "learning_rate": 6.1981865284974096e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.8973762691020966, + "num_tokens": 4208554.0, + "step": 2349 + }, + { + "epoch": 0.38053598898874585, + "grad_norm": 31.373567581176758, + "learning_rate": 6.196567357512954e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.9085317552089691, + "num_tokens": 4210350.0, + "step": 2350 + }, + { + "epoch": 0.38069791919682616, + "grad_norm": 22.877470016479492, + "learning_rate": 6.194948186528498e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.9247430562973022, + "num_tokens": 4212141.0, + "step": 2351 + }, + { + "epoch": 0.38085984940490647, + "grad_norm": 29.325273513793945, + "learning_rate": 6.193329015544042e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.9041364789009094, + "num_tokens": 4213935.0, + "step": 2352 + }, + { + "epoch": 0.3810217796129868, + "grad_norm": 31.117767333984375, + "learning_rate": 6.191709844559586e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.8759579062461853, + "num_tokens": 4215737.0, + "step": 2353 + }, + { + "epoch": 0.38118370982106714, + "grad_norm": 26.284822463989258, + "learning_rate": 6.19009067357513e-06, + "loss": 0.654, + "mean_token_accuracy": 0.9106282591819763, + "num_tokens": 4217540.0, + "step": 2354 + }, + { + "epoch": 0.38134564002914745, + "grad_norm": 22.111026763916016, + "learning_rate": 6.188471502590674e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.9267310798168182, + "num_tokens": 4219325.0, + "step": 2355 + }, + { + "epoch": 0.38150757023722776, + "grad_norm": 27.846500396728516, + "learning_rate": 6.186852331606218e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.9067532122135162, + "num_tokens": 4221105.0, + "step": 2356 + }, + { + "epoch": 0.38166950044530806, + "grad_norm": 25.400686264038086, + "learning_rate": 6.185233160621762e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.9107513427734375, + "num_tokens": 4222895.0, + "step": 2357 + }, + { + "epoch": 0.38183143065338837, + "grad_norm": 18.665996551513672, + "learning_rate": 6.183613989637306e-06, + "loss": 0.6434, + "mean_token_accuracy": 0.9037568271160126, + "num_tokens": 4224687.0, + "step": 2358 + }, + { + "epoch": 0.38199336086146873, + "grad_norm": 14.849250793457031, + "learning_rate": 6.18199481865285e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.9334261417388916, + "num_tokens": 4226470.0, + "step": 2359 + }, + { + "epoch": 0.38215529106954904, + "grad_norm": 21.723913192749023, + "learning_rate": 6.180375647668394e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.9247430562973022, + "num_tokens": 4228261.0, + "step": 2360 + }, + { + "epoch": 0.38231722127762935, + "grad_norm": 24.744943618774414, + "learning_rate": 6.178756476683938e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.911750465631485, + "num_tokens": 4230046.0, + "step": 2361 + }, + { + "epoch": 0.38247915148570966, + "grad_norm": 27.28764533996582, + "learning_rate": 6.177137305699482e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.9223533868789673, + "num_tokens": 4231844.0, + "step": 2362 + }, + { + "epoch": 0.38264108169378996, + "grad_norm": 26.96405792236328, + "learning_rate": 6.175518134715026e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.8986698091030121, + "num_tokens": 4233633.0, + "step": 2363 + }, + { + "epoch": 0.38280301190187027, + "grad_norm": 22.760257720947266, + "learning_rate": 6.17389896373057e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.9163931012153625, + "num_tokens": 4235420.0, + "step": 2364 + }, + { + "epoch": 0.38296494210995063, + "grad_norm": 29.48963165283203, + "learning_rate": 6.172279792746115e-06, + "loss": 0.756, + "mean_token_accuracy": 0.896064817905426, + "num_tokens": 4237211.0, + "step": 2365 + }, + { + "epoch": 0.38312687231803094, + "grad_norm": 21.912670135498047, + "learning_rate": 6.170660621761658e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.9205766022205353, + "num_tokens": 4239000.0, + "step": 2366 + }, + { + "epoch": 0.38328880252611125, + "grad_norm": 19.380645751953125, + "learning_rate": 6.169041450777203e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9288247227668762, + "num_tokens": 4240793.0, + "step": 2367 + }, + { + "epoch": 0.38345073273419156, + "grad_norm": 19.986095428466797, + "learning_rate": 6.167422279792746e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.920409768819809, + "num_tokens": 4242581.0, + "step": 2368 + }, + { + "epoch": 0.38361266294227186, + "grad_norm": 15.863972663879395, + "learning_rate": 6.165803108808291e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.9266417622566223, + "num_tokens": 4244365.0, + "step": 2369 + }, + { + "epoch": 0.3837745931503522, + "grad_norm": 19.17365837097168, + "learning_rate": 6.164183937823834e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.9221530854701996, + "num_tokens": 4246148.0, + "step": 2370 + }, + { + "epoch": 0.38393652335843254, + "grad_norm": 28.560333251953125, + "learning_rate": 6.162564766839379e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.9154411554336548, + "num_tokens": 4247932.0, + "step": 2371 + }, + { + "epoch": 0.38409845356651284, + "grad_norm": 28.11639976501465, + "learning_rate": 6.160945595854922e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.9031755924224854, + "num_tokens": 4249733.0, + "step": 2372 + }, + { + "epoch": 0.38426038377459315, + "grad_norm": 24.67559242248535, + "learning_rate": 6.159326424870467e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.9055226147174835, + "num_tokens": 4251520.0, + "step": 2373 + }, + { + "epoch": 0.38442231398267346, + "grad_norm": 31.49180793762207, + "learning_rate": 6.15770725388601e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.9009661972522736, + "num_tokens": 4253314.0, + "step": 2374 + }, + { + "epoch": 0.38458424419075377, + "grad_norm": 24.148473739624023, + "learning_rate": 6.156088082901555e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.9177156090736389, + "num_tokens": 4255104.0, + "step": 2375 + }, + { + "epoch": 0.38474617439883413, + "grad_norm": 31.747270584106445, + "learning_rate": 6.154468911917098e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.9109589159488678, + "num_tokens": 4256908.0, + "step": 2376 + }, + { + "epoch": 0.38490810460691444, + "grad_norm": 25.216537475585938, + "learning_rate": 6.152849740932643e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 4258700.0, + "step": 2377 + }, + { + "epoch": 0.38507003481499474, + "grad_norm": 14.640902519226074, + "learning_rate": 6.151230569948186e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9241215884685516, + "num_tokens": 4260489.0, + "step": 2378 + }, + { + "epoch": 0.38523196502307505, + "grad_norm": 34.02447509765625, + "learning_rate": 6.149611398963731e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.8783153593540192, + "num_tokens": 4262288.0, + "step": 2379 + }, + { + "epoch": 0.38539389523115536, + "grad_norm": 15.521620750427246, + "learning_rate": 6.147992227979274e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9316252768039703, + "num_tokens": 4264078.0, + "step": 2380 + }, + { + "epoch": 0.38555582543923567, + "grad_norm": 26.370441436767578, + "learning_rate": 6.146373056994819e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.8977884352207184, + "num_tokens": 4265864.0, + "step": 2381 + }, + { + "epoch": 0.38571775564731603, + "grad_norm": 34.29618453979492, + "learning_rate": 6.144753886010362e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.8916840553283691, + "num_tokens": 4267653.0, + "step": 2382 + }, + { + "epoch": 0.38587968585539634, + "grad_norm": 25.426679611206055, + "learning_rate": 6.143134715025907e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.8966927230358124, + "num_tokens": 4269446.0, + "step": 2383 + }, + { + "epoch": 0.38604161606347664, + "grad_norm": 22.19244956970215, + "learning_rate": 6.141515544041451e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9310924410820007, + "num_tokens": 4271234.0, + "step": 2384 + }, + { + "epoch": 0.38620354627155695, + "grad_norm": 23.652576446533203, + "learning_rate": 6.139896373056995e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.9228723645210266, + "num_tokens": 4273019.0, + "step": 2385 + }, + { + "epoch": 0.38636547647963726, + "grad_norm": 18.39960289001465, + "learning_rate": 6.138277202072539e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9268921315670013, + "num_tokens": 4274804.0, + "step": 2386 + }, + { + "epoch": 0.38652740668771757, + "grad_norm": 20.631511688232422, + "learning_rate": 6.136658031088083e-06, + "loss": 0.643, + "mean_token_accuracy": 0.9215146005153656, + "num_tokens": 4276596.0, + "step": 2387 + }, + { + "epoch": 0.38668933689579793, + "grad_norm": 24.41177749633789, + "learning_rate": 6.135038860103627e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.9176002144813538, + "num_tokens": 4278387.0, + "step": 2388 + }, + { + "epoch": 0.38685126710387824, + "grad_norm": 41.67818069458008, + "learning_rate": 6.133419689119171e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.8757992386817932, + "num_tokens": 4280173.0, + "step": 2389 + }, + { + "epoch": 0.38701319731195855, + "grad_norm": 21.251855850219727, + "learning_rate": 6.131800518134715e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.9172651469707489, + "num_tokens": 4281976.0, + "step": 2390 + }, + { + "epoch": 0.38717512752003885, + "grad_norm": 29.321077346801758, + "learning_rate": 6.130181347150259e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.8996478021144867, + "num_tokens": 4283768.0, + "step": 2391 + }, + { + "epoch": 0.38733705772811916, + "grad_norm": 30.430627822875977, + "learning_rate": 6.128562176165803e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.9084491729736328, + "num_tokens": 4285552.0, + "step": 2392 + }, + { + "epoch": 0.3874989879361995, + "grad_norm": 33.49610900878906, + "learning_rate": 6.126943005181347e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.8874330222606659, + "num_tokens": 4287348.0, + "step": 2393 + }, + { + "epoch": 0.38766091814427983, + "grad_norm": 24.73077964782715, + "learning_rate": 6.125323834196891e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 4289146.0, + "step": 2394 + }, + { + "epoch": 0.38782284835236014, + "grad_norm": 33.180877685546875, + "learning_rate": 6.123704663212435e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.9076961874961853, + "num_tokens": 4290940.0, + "step": 2395 + }, + { + "epoch": 0.38798477856044045, + "grad_norm": 35.44948196411133, + "learning_rate": 6.1220854922279794e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.8878780007362366, + "num_tokens": 4292729.0, + "step": 2396 + }, + { + "epoch": 0.38814670876852075, + "grad_norm": 29.867279052734375, + "learning_rate": 6.120466321243523e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.9082009792327881, + "num_tokens": 4294524.0, + "step": 2397 + }, + { + "epoch": 0.38830863897660106, + "grad_norm": 17.9906005859375, + "learning_rate": 6.1188471502590675e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.9147235453128815, + "num_tokens": 4296294.0, + "step": 2398 + }, + { + "epoch": 0.3884705691846814, + "grad_norm": 29.192676544189453, + "learning_rate": 6.117227979274611e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.8953312635421753, + "num_tokens": 4298083.0, + "step": 2399 + }, + { + "epoch": 0.38863249939276173, + "grad_norm": 22.05814552307129, + "learning_rate": 6.1156088082901555e-06, + "loss": 0.5492, + "mean_token_accuracy": 0.9166885912418365, + "num_tokens": 4299871.0, + "step": 2400 + }, + { + "epoch": 0.38879442960084204, + "grad_norm": 23.100486755371094, + "learning_rate": 6.113989637305699e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.9197114408016205, + "num_tokens": 4301669.0, + "step": 2401 + }, + { + "epoch": 0.38895635980892235, + "grad_norm": 26.80182647705078, + "learning_rate": 6.1123704663212435e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.9133472442626953, + "num_tokens": 4303458.0, + "step": 2402 + }, + { + "epoch": 0.38911829001700265, + "grad_norm": 31.55597686767578, + "learning_rate": 6.110751295336788e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.8886907696723938, + "num_tokens": 4305264.0, + "step": 2403 + }, + { + "epoch": 0.38928022022508296, + "grad_norm": 31.774499893188477, + "learning_rate": 6.1091321243523316e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.8809821903705597, + "num_tokens": 4307053.0, + "step": 2404 + }, + { + "epoch": 0.3894421504331633, + "grad_norm": 27.53371238708496, + "learning_rate": 6.107512953367876e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.9090012311935425, + "num_tokens": 4308839.0, + "step": 2405 + }, + { + "epoch": 0.38960408064124363, + "grad_norm": 21.76186180114746, + "learning_rate": 6.10589378238342e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.925220400094986, + "num_tokens": 4310632.0, + "step": 2406 + }, + { + "epoch": 0.38976601084932394, + "grad_norm": 16.794729232788086, + "learning_rate": 6.104274611398964e-06, + "loss": 0.5307, + "mean_token_accuracy": 0.9231884181499481, + "num_tokens": 4312417.0, + "step": 2407 + }, + { + "epoch": 0.38992794105740425, + "grad_norm": 19.790908813476562, + "learning_rate": 6.102655440414508e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.9195210933685303, + "num_tokens": 4314202.0, + "step": 2408 + }, + { + "epoch": 0.39008987126548456, + "grad_norm": 27.3967227935791, + "learning_rate": 6.101036269430052e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.9037568271160126, + "num_tokens": 4315994.0, + "step": 2409 + }, + { + "epoch": 0.3902518014735649, + "grad_norm": 27.468692779541016, + "learning_rate": 6.099417098445596e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.9087953269481659, + "num_tokens": 4317780.0, + "step": 2410 + }, + { + "epoch": 0.3904137316816452, + "grad_norm": 13.325836181640625, + "learning_rate": 6.09779792746114e-06, + "loss": 0.5389, + "mean_token_accuracy": 0.9294127523899078, + "num_tokens": 4319561.0, + "step": 2411 + }, + { + "epoch": 0.39057566188972553, + "grad_norm": 23.503740310668945, + "learning_rate": 6.096178756476684e-06, + "loss": 0.6462, + "mean_token_accuracy": 0.9142034649848938, + "num_tokens": 4321363.0, + "step": 2412 + }, + { + "epoch": 0.39073759209780584, + "grad_norm": 23.477584838867188, + "learning_rate": 6.094559585492228e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.9085765480995178, + "num_tokens": 4323159.0, + "step": 2413 + }, + { + "epoch": 0.39089952230588615, + "grad_norm": 27.98770523071289, + "learning_rate": 6.092940414507773e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.9034899771213531, + "num_tokens": 4324951.0, + "step": 2414 + }, + { + "epoch": 0.39106145251396646, + "grad_norm": 23.318613052368164, + "learning_rate": 6.091321243523317e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.9146631062030792, + "num_tokens": 4326756.0, + "step": 2415 + }, + { + "epoch": 0.3912233827220468, + "grad_norm": 20.08234977722168, + "learning_rate": 6.089702072538861e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.9100168347358704, + "num_tokens": 4328535.0, + "step": 2416 + }, + { + "epoch": 0.3913853129301271, + "grad_norm": 24.074081420898438, + "learning_rate": 6.088082901554405e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.9077968001365662, + "num_tokens": 4330329.0, + "step": 2417 + }, + { + "epoch": 0.39154724313820743, + "grad_norm": 29.092697143554688, + "learning_rate": 6.0864637305699494e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.88978111743927, + "num_tokens": 4332122.0, + "step": 2418 + }, + { + "epoch": 0.39170917334628774, + "grad_norm": 20.12398338317871, + "learning_rate": 6.084844559585493e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.9107279777526855, + "num_tokens": 4333914.0, + "step": 2419 + }, + { + "epoch": 0.39187110355436805, + "grad_norm": 23.617050170898438, + "learning_rate": 6.0832253886010375e-06, + "loss": 0.584, + "mean_token_accuracy": 0.9223276674747467, + "num_tokens": 4335709.0, + "step": 2420 + }, + { + "epoch": 0.39203303376244836, + "grad_norm": 27.276884078979492, + "learning_rate": 6.081606217616581e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.8941934108734131, + "num_tokens": 4337501.0, + "step": 2421 + }, + { + "epoch": 0.3921949639705287, + "grad_norm": 24.665903091430664, + "learning_rate": 6.0799870466321255e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.9157529175281525, + "num_tokens": 4339297.0, + "step": 2422 + }, + { + "epoch": 0.39235689417860903, + "grad_norm": 20.519485473632812, + "learning_rate": 6.078367875647669e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.9202898740768433, + "num_tokens": 4341085.0, + "step": 2423 + }, + { + "epoch": 0.39251882438668934, + "grad_norm": 23.69145393371582, + "learning_rate": 6.0767487046632135e-06, + "loss": 0.727, + "mean_token_accuracy": 0.9065680205821991, + "num_tokens": 4342876.0, + "step": 2424 + }, + { + "epoch": 0.39268075459476964, + "grad_norm": 25.522798538208008, + "learning_rate": 6.075129533678757e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.9090236127376556, + "num_tokens": 4344663.0, + "step": 2425 + }, + { + "epoch": 0.39284268480284995, + "grad_norm": 17.07276153564453, + "learning_rate": 6.0735103626943015e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9265037775039673, + "num_tokens": 4346448.0, + "step": 2426 + }, + { + "epoch": 0.3930046150109303, + "grad_norm": 13.80805492401123, + "learning_rate": 6.071891191709845e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.9304717183113098, + "num_tokens": 4348248.0, + "step": 2427 + }, + { + "epoch": 0.3931665452190106, + "grad_norm": 18.124977111816406, + "learning_rate": 6.0702720207253896e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.9284613132476807, + "num_tokens": 4350039.0, + "step": 2428 + }, + { + "epoch": 0.39332847542709093, + "grad_norm": 22.377042770385742, + "learning_rate": 6.068652849740934e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.9244702756404877, + "num_tokens": 4351827.0, + "step": 2429 + }, + { + "epoch": 0.39349040563517124, + "grad_norm": 18.47957420349121, + "learning_rate": 6.067033678756478e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.9189484119415283, + "num_tokens": 4353623.0, + "step": 2430 + }, + { + "epoch": 0.39365233584325154, + "grad_norm": 17.989002227783203, + "learning_rate": 6.065414507772022e-06, + "loss": 0.54, + "mean_token_accuracy": 0.9215906858444214, + "num_tokens": 4355415.0, + "step": 2431 + }, + { + "epoch": 0.39381426605133185, + "grad_norm": 30.640718460083008, + "learning_rate": 6.063795336787566e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.8983498811721802, + "num_tokens": 4357222.0, + "step": 2432 + }, + { + "epoch": 0.3939761962594122, + "grad_norm": 21.26382064819336, + "learning_rate": 6.06217616580311e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.9168076515197754, + "num_tokens": 4359022.0, + "step": 2433 + }, + { + "epoch": 0.3941381264674925, + "grad_norm": 17.125774383544922, + "learning_rate": 6.060556994818654e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.9148924648761749, + "num_tokens": 4360804.0, + "step": 2434 + }, + { + "epoch": 0.39430005667557283, + "grad_norm": 19.612777709960938, + "learning_rate": 6.058937823834198e-06, + "loss": 0.6028, + "mean_token_accuracy": 0.9131964445114136, + "num_tokens": 4362592.0, + "step": 2435 + }, + { + "epoch": 0.39446198688365314, + "grad_norm": 19.06717300415039, + "learning_rate": 6.057318652849742e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.9174337685108185, + "num_tokens": 4364371.0, + "step": 2436 + }, + { + "epoch": 0.39462391709173344, + "grad_norm": 17.325023651123047, + "learning_rate": 6.055699481865286e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.9102470278739929, + "num_tokens": 4366150.0, + "step": 2437 + }, + { + "epoch": 0.3947858472998138, + "grad_norm": 27.475378036499023, + "learning_rate": 6.05408031088083e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.897081196308136, + "num_tokens": 4367943.0, + "step": 2438 + }, + { + "epoch": 0.3949477775078941, + "grad_norm": 16.568401336669922, + "learning_rate": 6.052461139896374e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.9233269393444061, + "num_tokens": 4369729.0, + "step": 2439 + }, + { + "epoch": 0.3951097077159744, + "grad_norm": 24.641258239746094, + "learning_rate": 6.050841968911918e-06, + "loss": 0.63, + "mean_token_accuracy": 0.9069159030914307, + "num_tokens": 4371531.0, + "step": 2440 + }, + { + "epoch": 0.39527163792405473, + "grad_norm": 21.62548065185547, + "learning_rate": 6.049222797927462e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.9185331463813782, + "num_tokens": 4373326.0, + "step": 2441 + }, + { + "epoch": 0.39543356813213504, + "grad_norm": 17.75751304626465, + "learning_rate": 6.047603626943006e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.9304511249065399, + "num_tokens": 4375111.0, + "step": 2442 + }, + { + "epoch": 0.39559549834021535, + "grad_norm": 23.21023178100586, + "learning_rate": 6.04598445595855e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.9090500473976135, + "num_tokens": 4376898.0, + "step": 2443 + }, + { + "epoch": 0.3957574285482957, + "grad_norm": 33.61475372314453, + "learning_rate": 6.044365284974094e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.9015286862850189, + "num_tokens": 4378694.0, + "step": 2444 + }, + { + "epoch": 0.395919358756376, + "grad_norm": 26.48158073425293, + "learning_rate": 6.042746113989638e-06, + "loss": 0.6273, + "mean_token_accuracy": 0.9106450378894806, + "num_tokens": 4380486.0, + "step": 2445 + }, + { + "epoch": 0.3960812889644563, + "grad_norm": 19.514726638793945, + "learning_rate": 6.041126943005182e-06, + "loss": 0.565, + "mean_token_accuracy": 0.9172885715961456, + "num_tokens": 4382264.0, + "step": 2446 + }, + { + "epoch": 0.39624321917253663, + "grad_norm": 22.42393684387207, + "learning_rate": 6.039507772020726e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.9151678681373596, + "num_tokens": 4384059.0, + "step": 2447 + }, + { + "epoch": 0.39640514938061694, + "grad_norm": 33.834651947021484, + "learning_rate": 6.037888601036271e-06, + "loss": 0.964, + "mean_token_accuracy": 0.8863383233547211, + "num_tokens": 4385852.0, + "step": 2448 + }, + { + "epoch": 0.39656707958869725, + "grad_norm": 25.878007888793945, + "learning_rate": 6.036269430051814e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.9095016121864319, + "num_tokens": 4387651.0, + "step": 2449 + }, + { + "epoch": 0.3967290097967776, + "grad_norm": 33.61030197143555, + "learning_rate": 6.034650259067359e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.8814473152160645, + "num_tokens": 4389450.0, + "step": 2450 + }, + { + "epoch": 0.3968909400048579, + "grad_norm": 24.599184036254883, + "learning_rate": 6.033031088082902e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.9265101552009583, + "num_tokens": 4391248.0, + "step": 2451 + }, + { + "epoch": 0.3970528702129382, + "grad_norm": 36.088565826416016, + "learning_rate": 6.031411917098447e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.8985527157783508, + "num_tokens": 4393056.0, + "step": 2452 + }, + { + "epoch": 0.39721480042101853, + "grad_norm": 23.43155860900879, + "learning_rate": 6.02979274611399e-06, + "loss": 0.763, + "mean_token_accuracy": 0.9103802740573883, + "num_tokens": 4394847.0, + "step": 2453 + }, + { + "epoch": 0.39737673062909884, + "grad_norm": 27.536840438842773, + "learning_rate": 6.028173575129535e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.9161564707756042, + "num_tokens": 4396646.0, + "step": 2454 + }, + { + "epoch": 0.3975386608371792, + "grad_norm": 27.58855438232422, + "learning_rate": 6.026554404145078e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.9101727306842804, + "num_tokens": 4398437.0, + "step": 2455 + }, + { + "epoch": 0.3977005910452595, + "grad_norm": 29.029890060424805, + "learning_rate": 6.024935233160623e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.9041210412979126, + "num_tokens": 4400230.0, + "step": 2456 + }, + { + "epoch": 0.3978625212533398, + "grad_norm": 29.050634384155273, + "learning_rate": 6.023316062176166e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.9099523723125458, + "num_tokens": 4402020.0, + "step": 2457 + }, + { + "epoch": 0.3980244514614201, + "grad_norm": 27.5640811920166, + "learning_rate": 6.021696891191711e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.9092592597007751, + "num_tokens": 4403807.0, + "step": 2458 + }, + { + "epoch": 0.39818638166950043, + "grad_norm": 15.775254249572754, + "learning_rate": 6.020077720207254e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.9245341718196869, + "num_tokens": 4405597.0, + "step": 2459 + }, + { + "epoch": 0.39834831187758074, + "grad_norm": 31.133451461791992, + "learning_rate": 6.018458549222799e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.8978846967220306, + "num_tokens": 4407393.0, + "step": 2460 + }, + { + "epoch": 0.3985102420856611, + "grad_norm": 23.698078155517578, + "learning_rate": 6.016839378238342e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.9162943363189697, + "num_tokens": 4409192.0, + "step": 2461 + }, + { + "epoch": 0.3986721722937414, + "grad_norm": 22.433738708496094, + "learning_rate": 6.015220207253887e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.9181863963603973, + "num_tokens": 4410985.0, + "step": 2462 + }, + { + "epoch": 0.3988341025018217, + "grad_norm": 27.3464412689209, + "learning_rate": 6.0136010362694304e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.8985449969768524, + "num_tokens": 4412772.0, + "step": 2463 + }, + { + "epoch": 0.398996032709902, + "grad_norm": 15.620551109313965, + "learning_rate": 6.011981865284975e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.9259096682071686, + "num_tokens": 4414554.0, + "step": 2464 + }, + { + "epoch": 0.39915796291798233, + "grad_norm": 30.190011978149414, + "learning_rate": 6.0103626943005185e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.9055112302303314, + "num_tokens": 4416351.0, + "step": 2465 + }, + { + "epoch": 0.39931989312606264, + "grad_norm": 21.78976821899414, + "learning_rate": 6.008743523316063e-06, + "loss": 0.638, + "mean_token_accuracy": 0.9120462834835052, + "num_tokens": 4418135.0, + "step": 2466 + }, + { + "epoch": 0.399481823334143, + "grad_norm": 23.980592727661133, + "learning_rate": 6.007124352331607e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.9081102907657623, + "num_tokens": 4419919.0, + "step": 2467 + }, + { + "epoch": 0.3996437535422233, + "grad_norm": 29.508264541625977, + "learning_rate": 6.005505181347151e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.9135153293609619, + "num_tokens": 4421719.0, + "step": 2468 + }, + { + "epoch": 0.3998056837503036, + "grad_norm": 20.48666763305664, + "learning_rate": 6.003886010362695e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.9116883277893066, + "num_tokens": 4423503.0, + "step": 2469 + }, + { + "epoch": 0.3999676139583839, + "grad_norm": 22.19963836669922, + "learning_rate": 6.002266839378239e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.9145896732807159, + "num_tokens": 4425296.0, + "step": 2470 + }, + { + "epoch": 0.40012954416646423, + "grad_norm": 18.056947708129883, + "learning_rate": 6.000647668393783e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.9283071458339691, + "num_tokens": 4427087.0, + "step": 2471 + }, + { + "epoch": 0.4002914743745446, + "grad_norm": 24.245954513549805, + "learning_rate": 5.999028497409327e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.9065713882446289, + "num_tokens": 4428878.0, + "step": 2472 + }, + { + "epoch": 0.4004534045826249, + "grad_norm": 30.069059371948242, + "learning_rate": 5.997409326424871e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.9020979106426239, + "num_tokens": 4430676.0, + "step": 2473 + }, + { + "epoch": 0.4006153347907052, + "grad_norm": 16.61443328857422, + "learning_rate": 5.995790155440415e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.929848313331604, + "num_tokens": 4432473.0, + "step": 2474 + }, + { + "epoch": 0.4007772649987855, + "grad_norm": 23.960824966430664, + "learning_rate": 5.9941709844559594e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.9051418602466583, + "num_tokens": 4434270.0, + "step": 2475 + }, + { + "epoch": 0.40093919520686583, + "grad_norm": 23.24922752380371, + "learning_rate": 5.992551813471503e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.9059259295463562, + "num_tokens": 4436067.0, + "step": 2476 + }, + { + "epoch": 0.40110112541494614, + "grad_norm": 25.58210563659668, + "learning_rate": 5.9909326424870475e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.9025649130344391, + "num_tokens": 4437856.0, + "step": 2477 + }, + { + "epoch": 0.4012630556230265, + "grad_norm": 27.10450553894043, + "learning_rate": 5.989313471502591e-06, + "loss": 0.721, + "mean_token_accuracy": 0.9046140313148499, + "num_tokens": 4439653.0, + "step": 2478 + }, + { + "epoch": 0.4014249858311068, + "grad_norm": 23.10318374633789, + "learning_rate": 5.9876943005181355e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.9073809385299683, + "num_tokens": 4441455.0, + "step": 2479 + }, + { + "epoch": 0.4015869160391871, + "grad_norm": 22.148962020874023, + "learning_rate": 5.986075129533679e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.9035920202732086, + "num_tokens": 4443247.0, + "step": 2480 + }, + { + "epoch": 0.4017488462472674, + "grad_norm": 22.3599796295166, + "learning_rate": 5.9844559585492235e-06, + "loss": 0.6438, + "mean_token_accuracy": 0.9138582646846771, + "num_tokens": 4445037.0, + "step": 2481 + }, + { + "epoch": 0.40191077645534773, + "grad_norm": 25.71211814880371, + "learning_rate": 5.982836787564767e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.9142748713493347, + "num_tokens": 4446830.0, + "step": 2482 + }, + { + "epoch": 0.40207270666342804, + "grad_norm": 16.857341766357422, + "learning_rate": 5.9812176165803116e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.9236375987529755, + "num_tokens": 4448617.0, + "step": 2483 + }, + { + "epoch": 0.4022346368715084, + "grad_norm": 25.236000061035156, + "learning_rate": 5.979598445595855e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.9066280722618103, + "num_tokens": 4450406.0, + "step": 2484 + }, + { + "epoch": 0.4023965670795887, + "grad_norm": 22.98699378967285, + "learning_rate": 5.9779792746114e-06, + "loss": 0.5689, + "mean_token_accuracy": 0.926809549331665, + "num_tokens": 4452204.0, + "step": 2485 + }, + { + "epoch": 0.402558497287669, + "grad_norm": 20.611967086791992, + "learning_rate": 5.976360103626944e-06, + "loss": 0.58, + "mean_token_accuracy": 0.9121921360492706, + "num_tokens": 4454001.0, + "step": 2486 + }, + { + "epoch": 0.4027204274957493, + "grad_norm": 25.8209228515625, + "learning_rate": 5.974740932642488e-06, + "loss": 0.662, + "mean_token_accuracy": 0.9118140041828156, + "num_tokens": 4455785.0, + "step": 2487 + }, + { + "epoch": 0.40288235770382963, + "grad_norm": 19.91486167907715, + "learning_rate": 5.973121761658032e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.9110361337661743, + "num_tokens": 4457576.0, + "step": 2488 + }, + { + "epoch": 0.40304428791191, + "grad_norm": 19.50437355041504, + "learning_rate": 5.971502590673576e-06, + "loss": 0.622, + "mean_token_accuracy": 0.9277893602848053, + "num_tokens": 4459365.0, + "step": 2489 + }, + { + "epoch": 0.4032062181199903, + "grad_norm": 22.586467742919922, + "learning_rate": 5.96988341968912e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.923235297203064, + "num_tokens": 4461163.0, + "step": 2490 + }, + { + "epoch": 0.4033681483280706, + "grad_norm": 22.85340690612793, + "learning_rate": 5.968264248704664e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.9136938452720642, + "num_tokens": 4462954.0, + "step": 2491 + }, + { + "epoch": 0.4035300785361509, + "grad_norm": 27.774314880371094, + "learning_rate": 5.966645077720208e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.8940350711345673, + "num_tokens": 4464768.0, + "step": 2492 + }, + { + "epoch": 0.4036920087442312, + "grad_norm": 23.874713897705078, + "learning_rate": 5.965025906735752e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.9095029234886169, + "num_tokens": 4466567.0, + "step": 2493 + }, + { + "epoch": 0.40385393895231153, + "grad_norm": 24.43355941772461, + "learning_rate": 5.963406735751296e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.9071428775787354, + "num_tokens": 4468359.0, + "step": 2494 + }, + { + "epoch": 0.4040158691603919, + "grad_norm": 25.318588256835938, + "learning_rate": 5.96178756476684e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.89723339676857, + "num_tokens": 4470153.0, + "step": 2495 + }, + { + "epoch": 0.4041777993684722, + "grad_norm": 28.589645385742188, + "learning_rate": 5.960168393782384e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.9009498357772827, + "num_tokens": 4471957.0, + "step": 2496 + }, + { + "epoch": 0.4043397295765525, + "grad_norm": 25.1281681060791, + "learning_rate": 5.958549222797928e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.9164653122425079, + "num_tokens": 4473768.0, + "step": 2497 + }, + { + "epoch": 0.4045016597846328, + "grad_norm": 21.27931785583496, + "learning_rate": 5.956930051813472e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.9097070395946503, + "num_tokens": 4475557.0, + "step": 2498 + }, + { + "epoch": 0.4046635899927131, + "grad_norm": 17.604877471923828, + "learning_rate": 5.955310880829016e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.9176002144813538, + "num_tokens": 4477348.0, + "step": 2499 + }, + { + "epoch": 0.40482552020079343, + "grad_norm": 21.642637252807617, + "learning_rate": 5.95369170984456e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.9219181835651398, + "num_tokens": 4479129.0, + "step": 2500 + }, + { + "epoch": 0.4049874504088738, + "grad_norm": 26.70907211303711, + "learning_rate": 5.952072538860104e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.8918569087982178, + "num_tokens": 4480919.0, + "step": 2501 + }, + { + "epoch": 0.4051493806169541, + "grad_norm": 24.135648727416992, + "learning_rate": 5.950453367875648e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.9158806204795837, + "num_tokens": 4482705.0, + "step": 2502 + }, + { + "epoch": 0.4053113108250344, + "grad_norm": 21.401960372924805, + "learning_rate": 5.948834196891193e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.9153079688549042, + "num_tokens": 4484499.0, + "step": 2503 + }, + { + "epoch": 0.4054732410331147, + "grad_norm": 17.22169303894043, + "learning_rate": 5.947215025906736e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.9126032292842865, + "num_tokens": 4486285.0, + "step": 2504 + }, + { + "epoch": 0.405635171241195, + "grad_norm": 22.238935470581055, + "learning_rate": 5.945595854922281e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.9169968664646149, + "num_tokens": 4488074.0, + "step": 2505 + }, + { + "epoch": 0.4057971014492754, + "grad_norm": 20.01361656188965, + "learning_rate": 5.943976683937824e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.9195588231086731, + "num_tokens": 4489872.0, + "step": 2506 + }, + { + "epoch": 0.4059590316573557, + "grad_norm": 21.853452682495117, + "learning_rate": 5.942357512953369e-06, + "loss": 0.5928, + "mean_token_accuracy": 0.9114184975624084, + "num_tokens": 4491666.0, + "step": 2507 + }, + { + "epoch": 0.406120961865436, + "grad_norm": 14.27613353729248, + "learning_rate": 5.940738341968912e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.9321029782295227, + "num_tokens": 4493443.0, + "step": 2508 + }, + { + "epoch": 0.4062828920735163, + "grad_norm": 20.791099548339844, + "learning_rate": 5.939119170984457e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.9173325896263123, + "num_tokens": 4495245.0, + "step": 2509 + }, + { + "epoch": 0.4064448222815966, + "grad_norm": 17.984691619873047, + "learning_rate": 5.9375e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.9258756041526794, + "num_tokens": 4497039.0, + "step": 2510 + }, + { + "epoch": 0.4066067524896769, + "grad_norm": 26.65800666809082, + "learning_rate": 5.935880829015545e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.9114285707473755, + "num_tokens": 4498841.0, + "step": 2511 + }, + { + "epoch": 0.4067686826977573, + "grad_norm": 24.68552017211914, + "learning_rate": 5.934261658031088e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.9046815931797028, + "num_tokens": 4500635.0, + "step": 2512 + }, + { + "epoch": 0.4069306129058376, + "grad_norm": 19.272958755493164, + "learning_rate": 5.932642487046633e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.9239901900291443, + "num_tokens": 4502424.0, + "step": 2513 + }, + { + "epoch": 0.4070925431139179, + "grad_norm": 16.699970245361328, + "learning_rate": 5.931023316062176e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 4504211.0, + "step": 2514 + }, + { + "epoch": 0.4072544733219982, + "grad_norm": 27.650733947753906, + "learning_rate": 5.929404145077721e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.896987795829773, + "num_tokens": 4506014.0, + "step": 2515 + }, + { + "epoch": 0.4074164035300785, + "grad_norm": 25.33139991760254, + "learning_rate": 5.927784974093264e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.9077353179454803, + "num_tokens": 4507819.0, + "step": 2516 + }, + { + "epoch": 0.4075783337381588, + "grad_norm": 20.26680564880371, + "learning_rate": 5.926165803108809e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.9172661602497101, + "num_tokens": 4509609.0, + "step": 2517 + }, + { + "epoch": 0.4077402639462392, + "grad_norm": 23.118213653564453, + "learning_rate": 5.9245466321243524e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.9091197550296783, + "num_tokens": 4511396.0, + "step": 2518 + }, + { + "epoch": 0.4079021941543195, + "grad_norm": 24.033912658691406, + "learning_rate": 5.922927461139897e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.9103405475616455, + "num_tokens": 4513198.0, + "step": 2519 + }, + { + "epoch": 0.4080641243623998, + "grad_norm": 26.5091495513916, + "learning_rate": 5.9213082901554405e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.9003831446170807, + "num_tokens": 4514990.0, + "step": 2520 + }, + { + "epoch": 0.4082260545704801, + "grad_norm": 25.187744140625, + "learning_rate": 5.919689119170985e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9122793972492218, + "num_tokens": 4516786.0, + "step": 2521 + }, + { + "epoch": 0.4083879847785604, + "grad_norm": 18.701908111572266, + "learning_rate": 5.918069948186529e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.9224390387535095, + "num_tokens": 4518582.0, + "step": 2522 + }, + { + "epoch": 0.4085499149866408, + "grad_norm": 29.930334091186523, + "learning_rate": 5.916450777202073e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.9078470766544342, + "num_tokens": 4520376.0, + "step": 2523 + }, + { + "epoch": 0.4087118451947211, + "grad_norm": 28.683332443237305, + "learning_rate": 5.914831606217617e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.8981540501117706, + "num_tokens": 4522163.0, + "step": 2524 + }, + { + "epoch": 0.4088737754028014, + "grad_norm": 22.226905822753906, + "learning_rate": 5.913212435233161e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.9238445162773132, + "num_tokens": 4523951.0, + "step": 2525 + }, + { + "epoch": 0.4090357056108817, + "grad_norm": 21.099422454833984, + "learning_rate": 5.911593264248705e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.9116721749305725, + "num_tokens": 4525746.0, + "step": 2526 + }, + { + "epoch": 0.409197635818962, + "grad_norm": 26.98863983154297, + "learning_rate": 5.909974093264249e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.9043259620666504, + "num_tokens": 4527540.0, + "step": 2527 + }, + { + "epoch": 0.4093595660270423, + "grad_norm": 21.997154235839844, + "learning_rate": 5.908354922279793e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.9259183704853058, + "num_tokens": 4529349.0, + "step": 2528 + }, + { + "epoch": 0.4095214962351227, + "grad_norm": 19.95395851135254, + "learning_rate": 5.906735751295337e-06, + "loss": 0.573, + "mean_token_accuracy": 0.91847363114357, + "num_tokens": 4531143.0, + "step": 2529 + }, + { + "epoch": 0.409683426443203, + "grad_norm": 31.12894630432129, + "learning_rate": 5.9051165803108814e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.9041050970554352, + "num_tokens": 4532947.0, + "step": 2530 + }, + { + "epoch": 0.4098453566512833, + "grad_norm": 20.292251586914062, + "learning_rate": 5.903497409326425e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9309569299221039, + "num_tokens": 4534734.0, + "step": 2531 + }, + { + "epoch": 0.4100072868593636, + "grad_norm": 19.76032257080078, + "learning_rate": 5.9018782383419695e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.9219315946102142, + "num_tokens": 4536528.0, + "step": 2532 + }, + { + "epoch": 0.4101692170674439, + "grad_norm": 22.101547241210938, + "learning_rate": 5.900259067357513e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.9174394011497498, + "num_tokens": 4538316.0, + "step": 2533 + }, + { + "epoch": 0.4103311472755242, + "grad_norm": 22.76506805419922, + "learning_rate": 5.8986398963730575e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.9249706864356995, + "num_tokens": 4540108.0, + "step": 2534 + }, + { + "epoch": 0.4104930774836046, + "grad_norm": 22.418729782104492, + "learning_rate": 5.897020725388601e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.9114282727241516, + "num_tokens": 4541891.0, + "step": 2535 + }, + { + "epoch": 0.4106550076916849, + "grad_norm": 28.73288345336914, + "learning_rate": 5.8954015544041455e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.9032674133777618, + "num_tokens": 4543690.0, + "step": 2536 + }, + { + "epoch": 0.4108169378997652, + "grad_norm": 31.85521125793457, + "learning_rate": 5.893782383419689e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.9028123617172241, + "num_tokens": 4545480.0, + "step": 2537 + }, + { + "epoch": 0.4109788681078455, + "grad_norm": 24.487714767456055, + "learning_rate": 5.8921632124352335e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.9132784605026245, + "num_tokens": 4547271.0, + "step": 2538 + }, + { + "epoch": 0.4111407983159258, + "grad_norm": 27.29911994934082, + "learning_rate": 5.890544041450777e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.9045549929141998, + "num_tokens": 4549066.0, + "step": 2539 + }, + { + "epoch": 0.4113027285240062, + "grad_norm": 20.49338150024414, + "learning_rate": 5.8889248704663216e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.9243475496768951, + "num_tokens": 4550856.0, + "step": 2540 + }, + { + "epoch": 0.4114646587320865, + "grad_norm": 16.157089233398438, + "learning_rate": 5.887305699481866e-06, + "loss": 0.492, + "mean_token_accuracy": 0.9228169620037079, + "num_tokens": 4552640.0, + "step": 2541 + }, + { + "epoch": 0.4116265889401668, + "grad_norm": 27.82122039794922, + "learning_rate": 5.88568652849741e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.8979591727256775, + "num_tokens": 4554446.0, + "step": 2542 + }, + { + "epoch": 0.4117885191482471, + "grad_norm": 29.573129653930664, + "learning_rate": 5.884067357512954e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.895780086517334, + "num_tokens": 4556246.0, + "step": 2543 + }, + { + "epoch": 0.4119504493563274, + "grad_norm": 21.327186584472656, + "learning_rate": 5.882448186528498e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.9154887795448303, + "num_tokens": 4558042.0, + "step": 2544 + }, + { + "epoch": 0.4121123795644077, + "grad_norm": 24.890348434448242, + "learning_rate": 5.880829015544042e-06, + "loss": 0.726, + "mean_token_accuracy": 0.8984703123569489, + "num_tokens": 4559832.0, + "step": 2545 + }, + { + "epoch": 0.4122743097724881, + "grad_norm": 20.519004821777344, + "learning_rate": 5.879209844559586e-06, + "loss": 0.544, + "mean_token_accuracy": 0.919334203004837, + "num_tokens": 4561629.0, + "step": 2546 + }, + { + "epoch": 0.4124362399805684, + "grad_norm": 20.222881317138672, + "learning_rate": 5.87759067357513e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.9250216782093048, + "num_tokens": 4563421.0, + "step": 2547 + }, + { + "epoch": 0.4125981701886487, + "grad_norm": 34.028263092041016, + "learning_rate": 5.875971502590674e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.9096132516860962, + "num_tokens": 4565221.0, + "step": 2548 + }, + { + "epoch": 0.412760100396729, + "grad_norm": 18.328582763671875, + "learning_rate": 5.874352331606218e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.9250421226024628, + "num_tokens": 4567013.0, + "step": 2549 + }, + { + "epoch": 0.4129220306048093, + "grad_norm": 23.221229553222656, + "learning_rate": 5.872733160621762e-06, + "loss": 0.5967, + "mean_token_accuracy": 0.9072261154651642, + "num_tokens": 4568803.0, + "step": 2550 + }, + { + "epoch": 0.4130839608128897, + "grad_norm": 22.880868911743164, + "learning_rate": 5.871113989637306e-06, + "loss": 0.594, + "mean_token_accuracy": 0.920152485370636, + "num_tokens": 4570590.0, + "step": 2551 + }, + { + "epoch": 0.41324589102097, + "grad_norm": 18.815189361572266, + "learning_rate": 5.86949481865285e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.922222226858139, + "num_tokens": 4572372.0, + "step": 2552 + }, + { + "epoch": 0.4134078212290503, + "grad_norm": 31.15639305114746, + "learning_rate": 5.867875647668394e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.8942348957061768, + "num_tokens": 4574158.0, + "step": 2553 + }, + { + "epoch": 0.4135697514371306, + "grad_norm": 16.219648361206055, + "learning_rate": 5.866256476683938e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.9253065884113312, + "num_tokens": 4575951.0, + "step": 2554 + }, + { + "epoch": 0.4137316816452109, + "grad_norm": 25.33356285095215, + "learning_rate": 5.864637305699482e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 4577742.0, + "step": 2555 + }, + { + "epoch": 0.4138936118532912, + "grad_norm": 22.65717887878418, + "learning_rate": 5.863018134715026e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 4579524.0, + "step": 2556 + }, + { + "epoch": 0.4140555420613716, + "grad_norm": 22.198528289794922, + "learning_rate": 5.86139896373057e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.911347508430481, + "num_tokens": 4581318.0, + "step": 2557 + }, + { + "epoch": 0.4142174722694519, + "grad_norm": 26.533159255981445, + "learning_rate": 5.859779792746114e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.9003607630729675, + "num_tokens": 4583119.0, + "step": 2558 + }, + { + "epoch": 0.4143794024775322, + "grad_norm": 24.138341903686523, + "learning_rate": 5.858160621761658e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.9162554144859314, + "num_tokens": 4584916.0, + "step": 2559 + }, + { + "epoch": 0.4145413326856125, + "grad_norm": 25.73813819885254, + "learning_rate": 5.856541450777203e-06, + "loss": 0.632, + "mean_token_accuracy": 0.9055944085121155, + "num_tokens": 4586714.0, + "step": 2560 + }, + { + "epoch": 0.4147032628936928, + "grad_norm": 37.1443977355957, + "learning_rate": 5.854922279792746e-06, + "loss": 1.1386, + "mean_token_accuracy": 0.8717955052852631, + "num_tokens": 4588507.0, + "step": 2561 + }, + { + "epoch": 0.4148651931017731, + "grad_norm": 25.781566619873047, + "learning_rate": 5.853303108808291e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.9101727306842804, + "num_tokens": 4590298.0, + "step": 2562 + }, + { + "epoch": 0.4150271233098535, + "grad_norm": 17.320585250854492, + "learning_rate": 5.851683937823834e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.9257739782333374, + "num_tokens": 4592092.0, + "step": 2563 + }, + { + "epoch": 0.4151890535179338, + "grad_norm": 25.535747528076172, + "learning_rate": 5.850064766839379e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.9130645990371704, + "num_tokens": 4593891.0, + "step": 2564 + }, + { + "epoch": 0.4153509837260141, + "grad_norm": 24.325716018676758, + "learning_rate": 5.848445595854922e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.9164724051952362, + "num_tokens": 4595690.0, + "step": 2565 + }, + { + "epoch": 0.4155129139340944, + "grad_norm": 27.941818237304688, + "learning_rate": 5.846826424870467e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.8991561830043793, + "num_tokens": 4597480.0, + "step": 2566 + }, + { + "epoch": 0.4156748441421747, + "grad_norm": 22.496826171875, + "learning_rate": 5.84520725388601e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.9260774850845337, + "num_tokens": 4599276.0, + "step": 2567 + }, + { + "epoch": 0.41583677435025507, + "grad_norm": 27.053525924682617, + "learning_rate": 5.843588082901555e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.9051958322525024, + "num_tokens": 4601062.0, + "step": 2568 + }, + { + "epoch": 0.4159987045583354, + "grad_norm": 22.22924041748047, + "learning_rate": 5.841968911917098e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.9153555035591125, + "num_tokens": 4602846.0, + "step": 2569 + }, + { + "epoch": 0.4161606347664157, + "grad_norm": 33.67448425292969, + "learning_rate": 5.840349740932643e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.8908644616603851, + "num_tokens": 4604642.0, + "step": 2570 + }, + { + "epoch": 0.416322564974496, + "grad_norm": 24.905147552490234, + "learning_rate": 5.838730569948186e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.9090061187744141, + "num_tokens": 4606428.0, + "step": 2571 + }, + { + "epoch": 0.4164844951825763, + "grad_norm": 24.318157196044922, + "learning_rate": 5.837111398963731e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.9092437028884888, + "num_tokens": 4608216.0, + "step": 2572 + }, + { + "epoch": 0.4166464253906566, + "grad_norm": 24.842966079711914, + "learning_rate": 5.835492227979274e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.9167156219482422, + "num_tokens": 4610014.0, + "step": 2573 + }, + { + "epoch": 0.41680835559873697, + "grad_norm": 25.95463752746582, + "learning_rate": 5.833873056994819e-06, + "loss": 0.905, + "mean_token_accuracy": 0.8942229747772217, + "num_tokens": 4611798.0, + "step": 2574 + }, + { + "epoch": 0.4169702858068173, + "grad_norm": 23.649118423461914, + "learning_rate": 5.8322538860103624e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.9157062470912933, + "num_tokens": 4613595.0, + "step": 2575 + }, + { + "epoch": 0.4171322160148976, + "grad_norm": 30.974651336669922, + "learning_rate": 5.830634715025907e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.8999438583850861, + "num_tokens": 4615387.0, + "step": 2576 + }, + { + "epoch": 0.4172941462229779, + "grad_norm": 19.067163467407227, + "learning_rate": 5.8290155440414505e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.920550525188446, + "num_tokens": 4617176.0, + "step": 2577 + }, + { + "epoch": 0.4174560764310582, + "grad_norm": 26.5705623626709, + "learning_rate": 5.827396373056995e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.901753157377243, + "num_tokens": 4618973.0, + "step": 2578 + }, + { + "epoch": 0.4176180066391385, + "grad_norm": 30.52427101135254, + "learning_rate": 5.825777202072539e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.906272828578949, + "num_tokens": 4620762.0, + "step": 2579 + }, + { + "epoch": 0.41777993684721887, + "grad_norm": 31.000513076782227, + "learning_rate": 5.824158031088083e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.9050009846687317, + "num_tokens": 4622557.0, + "step": 2580 + }, + { + "epoch": 0.4179418670552992, + "grad_norm": 30.3168888092041, + "learning_rate": 5.822538860103627e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.9071656465530396, + "num_tokens": 4624359.0, + "step": 2581 + }, + { + "epoch": 0.4181037972633795, + "grad_norm": 27.74933433532715, + "learning_rate": 5.820919689119171e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.9094203114509583, + "num_tokens": 4626147.0, + "step": 2582 + }, + { + "epoch": 0.4182657274714598, + "grad_norm": 26.21828269958496, + "learning_rate": 5.819300518134715e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.9160181879997253, + "num_tokens": 4627945.0, + "step": 2583 + }, + { + "epoch": 0.4184276576795401, + "grad_norm": 27.86294937133789, + "learning_rate": 5.817681347150259e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.9113768041133881, + "num_tokens": 4629739.0, + "step": 2584 + }, + { + "epoch": 0.41858958788762046, + "grad_norm": 24.900253295898438, + "learning_rate": 5.8160621761658034e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.9100041687488556, + "num_tokens": 4631528.0, + "step": 2585 + }, + { + "epoch": 0.41875151809570077, + "grad_norm": 25.233060836791992, + "learning_rate": 5.814443005181347e-06, + "loss": 0.6313, + "mean_token_accuracy": 0.9164092838764191, + "num_tokens": 4633328.0, + "step": 2586 + }, + { + "epoch": 0.4189134483037811, + "grad_norm": 19.95783042907715, + "learning_rate": 5.8128238341968915e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.9197080135345459, + "num_tokens": 4635114.0, + "step": 2587 + }, + { + "epoch": 0.4190753785118614, + "grad_norm": 21.245548248291016, + "learning_rate": 5.811204663212435e-06, + "loss": 0.5853, + "mean_token_accuracy": 0.916770339012146, + "num_tokens": 4636904.0, + "step": 2588 + }, + { + "epoch": 0.4192373087199417, + "grad_norm": 21.612083435058594, + "learning_rate": 5.8095854922279795e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.9161564707756042, + "num_tokens": 4638703.0, + "step": 2589 + }, + { + "epoch": 0.419399238928022, + "grad_norm": 19.873619079589844, + "learning_rate": 5.807966321243523e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.9268577098846436, + "num_tokens": 4640502.0, + "step": 2590 + }, + { + "epoch": 0.41956116913610236, + "grad_norm": 29.5637149810791, + "learning_rate": 5.8063471502590675e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.8982490599155426, + "num_tokens": 4642289.0, + "step": 2591 + }, + { + "epoch": 0.41972309934418267, + "grad_norm": 27.144763946533203, + "learning_rate": 5.804727979274611e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.908977746963501, + "num_tokens": 4644085.0, + "step": 2592 + }, + { + "epoch": 0.419885029552263, + "grad_norm": 16.945966720581055, + "learning_rate": 5.8031088082901555e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.9172511100769043, + "num_tokens": 4645863.0, + "step": 2593 + }, + { + "epoch": 0.4200469597603433, + "grad_norm": 28.04581069946289, + "learning_rate": 5.801489637305699e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.9052250683307648, + "num_tokens": 4647660.0, + "step": 2594 + }, + { + "epoch": 0.4202088899684236, + "grad_norm": 22.998987197875977, + "learning_rate": 5.7998704663212436e-06, + "loss": 0.6566, + "mean_token_accuracy": 0.9103288650512695, + "num_tokens": 4649451.0, + "step": 2595 + }, + { + "epoch": 0.4203708201765039, + "grad_norm": 24.98158836364746, + "learning_rate": 5.798251295336787e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.9150382578372955, + "num_tokens": 4651245.0, + "step": 2596 + }, + { + "epoch": 0.42053275038458426, + "grad_norm": 23.99442481994629, + "learning_rate": 5.796632124352332e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.9081889390945435, + "num_tokens": 4653039.0, + "step": 2597 + }, + { + "epoch": 0.42069468059266457, + "grad_norm": 22.756681442260742, + "learning_rate": 5.795012953367876e-06, + "loss": 0.62, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 4654833.0, + "step": 2598 + }, + { + "epoch": 0.4208566108007449, + "grad_norm": 32.483375549316406, + "learning_rate": 5.79339378238342e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.9015027582645416, + "num_tokens": 4656629.0, + "step": 2599 + }, + { + "epoch": 0.4210185410088252, + "grad_norm": 22.955522537231445, + "learning_rate": 5.791774611398964e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.9139266312122345, + "num_tokens": 4658408.0, + "step": 2600 + }, + { + "epoch": 0.4211804712169055, + "grad_norm": 28.10015106201172, + "learning_rate": 5.790155440414508e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.9017808437347412, + "num_tokens": 4660216.0, + "step": 2601 + }, + { + "epoch": 0.42134240142498586, + "grad_norm": 29.612756729125977, + "learning_rate": 5.788536269430052e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.9055380523204803, + "num_tokens": 4662024.0, + "step": 2602 + }, + { + "epoch": 0.42150433163306616, + "grad_norm": 27.647302627563477, + "learning_rate": 5.786917098445596e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.901695728302002, + "num_tokens": 4663810.0, + "step": 2603 + }, + { + "epoch": 0.4216662618411465, + "grad_norm": 20.27496910095215, + "learning_rate": 5.78529792746114e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.9195804297924042, + "num_tokens": 4665608.0, + "step": 2604 + }, + { + "epoch": 0.4218281920492268, + "grad_norm": 23.218393325805664, + "learning_rate": 5.783678756476684e-06, + "loss": 0.625, + "mean_token_accuracy": 0.9081760048866272, + "num_tokens": 4667403.0, + "step": 2605 + }, + { + "epoch": 0.4219901222573071, + "grad_norm": 25.345605850219727, + "learning_rate": 5.782059585492228e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.8941427767276764, + "num_tokens": 4669196.0, + "step": 2606 + }, + { + "epoch": 0.4221520524653874, + "grad_norm": 20.053924560546875, + "learning_rate": 5.780440414507773e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.9227321743965149, + "num_tokens": 4670993.0, + "step": 2607 + }, + { + "epoch": 0.42231398267346776, + "grad_norm": 26.87257194519043, + "learning_rate": 5.778821243523317e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.8981329500675201, + "num_tokens": 4672789.0, + "step": 2608 + }, + { + "epoch": 0.42247591288154807, + "grad_norm": 12.919868469238281, + "learning_rate": 5.7772020725388614e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.9254679083824158, + "num_tokens": 4674569.0, + "step": 2609 + }, + { + "epoch": 0.4226378430896284, + "grad_norm": 23.54468536376953, + "learning_rate": 5.775582901554405e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.9104477763175964, + "num_tokens": 4676349.0, + "step": 2610 + }, + { + "epoch": 0.4227997732977087, + "grad_norm": 24.3798828125, + "learning_rate": 5.7739637305699495e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.9140931963920593, + "num_tokens": 4678141.0, + "step": 2611 + }, + { + "epoch": 0.422961703505789, + "grad_norm": 12.141080856323242, + "learning_rate": 5.772344559585493e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.9399600327014923, + "num_tokens": 4679936.0, + "step": 2612 + }, + { + "epoch": 0.4231236337138693, + "grad_norm": 18.956920623779297, + "learning_rate": 5.7707253886010375e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.9241921901702881, + "num_tokens": 4681726.0, + "step": 2613 + }, + { + "epoch": 0.42328556392194966, + "grad_norm": 27.292633056640625, + "learning_rate": 5.769106217616581e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.9077271521091461, + "num_tokens": 4683520.0, + "step": 2614 + }, + { + "epoch": 0.42344749413002997, + "grad_norm": 21.6551456451416, + "learning_rate": 5.7674870466321255e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.921777218580246, + "num_tokens": 4685326.0, + "step": 2615 + }, + { + "epoch": 0.4236094243381103, + "grad_norm": 20.61473846435547, + "learning_rate": 5.765867875647669e-06, + "loss": 0.652, + "mean_token_accuracy": 0.920273095369339, + "num_tokens": 4687114.0, + "step": 2616 + }, + { + "epoch": 0.4237713545461906, + "grad_norm": 26.408039093017578, + "learning_rate": 5.7642487046632135e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9165661931037903, + "num_tokens": 4688914.0, + "step": 2617 + }, + { + "epoch": 0.4239332847542709, + "grad_norm": 32.399925231933594, + "learning_rate": 5.762629533678757e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.9081632792949677, + "num_tokens": 4690720.0, + "step": 2618 + }, + { + "epoch": 0.42409521496235125, + "grad_norm": 15.553979873657227, + "learning_rate": 5.7610103626943016e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.93031245470047, + "num_tokens": 4692505.0, + "step": 2619 + }, + { + "epoch": 0.42425714517043156, + "grad_norm": 24.6832275390625, + "learning_rate": 5.759391191709845e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.9002940058708191, + "num_tokens": 4694298.0, + "step": 2620 + }, + { + "epoch": 0.42441907537851187, + "grad_norm": 34.95283126831055, + "learning_rate": 5.75777202072539e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.8843482434749603, + "num_tokens": 4696096.0, + "step": 2621 + }, + { + "epoch": 0.4245810055865922, + "grad_norm": 30.824203491210938, + "learning_rate": 5.756152849740933e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.8984484672546387, + "num_tokens": 4697891.0, + "step": 2622 + }, + { + "epoch": 0.4247429357946725, + "grad_norm": 31.794389724731445, + "learning_rate": 5.754533678756478e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.8897947072982788, + "num_tokens": 4699685.0, + "step": 2623 + }, + { + "epoch": 0.4249048660027528, + "grad_norm": 27.11253547668457, + "learning_rate": 5.752914507772022e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.8947382271289825, + "num_tokens": 4701473.0, + "step": 2624 + }, + { + "epoch": 0.42506679621083315, + "grad_norm": 23.858304977416992, + "learning_rate": 5.751295336787566e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.902512788772583, + "num_tokens": 4703262.0, + "step": 2625 + }, + { + "epoch": 0.42522872641891346, + "grad_norm": 23.854270935058594, + "learning_rate": 5.74967616580311e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.9186400771141052, + "num_tokens": 4705057.0, + "step": 2626 + }, + { + "epoch": 0.42539065662699377, + "grad_norm": 21.63140106201172, + "learning_rate": 5.748056994818654e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.9125266671180725, + "num_tokens": 4706843.0, + "step": 2627 + }, + { + "epoch": 0.4255525868350741, + "grad_norm": 36.0816650390625, + "learning_rate": 5.746437823834198e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.8799603283405304, + "num_tokens": 4708646.0, + "step": 2628 + }, + { + "epoch": 0.4257145170431544, + "grad_norm": 26.497888565063477, + "learning_rate": 5.744818652849742e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.9111787378787994, + "num_tokens": 4710440.0, + "step": 2629 + }, + { + "epoch": 0.4258764472512347, + "grad_norm": 18.768795013427734, + "learning_rate": 5.743199481865286e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.9124966859817505, + "num_tokens": 4712226.0, + "step": 2630 + }, + { + "epoch": 0.42603837745931505, + "grad_norm": 17.265892028808594, + "learning_rate": 5.74158031088083e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.9233953356742859, + "num_tokens": 4714011.0, + "step": 2631 + }, + { + "epoch": 0.42620030766739536, + "grad_norm": 19.992643356323242, + "learning_rate": 5.739961139896374e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.9145896732807159, + "num_tokens": 4715804.0, + "step": 2632 + }, + { + "epoch": 0.42636223787547567, + "grad_norm": 29.299583435058594, + "learning_rate": 5.738341968911918e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.8843642771244049, + "num_tokens": 4717608.0, + "step": 2633 + }, + { + "epoch": 0.426524168083556, + "grad_norm": 31.09081268310547, + "learning_rate": 5.736722797927462e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.9000816643238068, + "num_tokens": 4719400.0, + "step": 2634 + }, + { + "epoch": 0.4266860982916363, + "grad_norm": 15.340228080749512, + "learning_rate": 5.735103626943006e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.9304481148719788, + "num_tokens": 4721185.0, + "step": 2635 + }, + { + "epoch": 0.42684802849971665, + "grad_norm": 16.546255111694336, + "learning_rate": 5.73348445595855e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.9294085204601288, + "num_tokens": 4722966.0, + "step": 2636 + }, + { + "epoch": 0.42700995870779695, + "grad_norm": 23.766008377075195, + "learning_rate": 5.731865284974094e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.9143702983856201, + "num_tokens": 4724758.0, + "step": 2637 + }, + { + "epoch": 0.42717188891587726, + "grad_norm": 24.576339721679688, + "learning_rate": 5.730246113989638e-06, + "loss": 0.625, + "mean_token_accuracy": 0.9040851294994354, + "num_tokens": 4726552.0, + "step": 2638 + }, + { + "epoch": 0.42733381912395757, + "grad_norm": 29.197771072387695, + "learning_rate": 5.728626943005182e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.8945578336715698, + "num_tokens": 4728358.0, + "step": 2639 + }, + { + "epoch": 0.4274957493320379, + "grad_norm": 25.663074493408203, + "learning_rate": 5.727007772020726e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.9103405475616455, + "num_tokens": 4730160.0, + "step": 2640 + }, + { + "epoch": 0.4276576795401182, + "grad_norm": 25.778417587280273, + "learning_rate": 5.72538860103627e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.9197080135345459, + "num_tokens": 4731946.0, + "step": 2641 + }, + { + "epoch": 0.42781960974819855, + "grad_norm": 22.788490295410156, + "learning_rate": 5.723769430051814e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.910306453704834, + "num_tokens": 4733748.0, + "step": 2642 + }, + { + "epoch": 0.42798153995627886, + "grad_norm": 21.9490909576416, + "learning_rate": 5.722150259067359e-06, + "loss": 0.613, + "mean_token_accuracy": 0.9189872145652771, + "num_tokens": 4735533.0, + "step": 2643 + }, + { + "epoch": 0.42814347016435916, + "grad_norm": 25.787572860717773, + "learning_rate": 5.720531088082902e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.9083473086357117, + "num_tokens": 4737338.0, + "step": 2644 + }, + { + "epoch": 0.42830540037243947, + "grad_norm": 26.58540153503418, + "learning_rate": 5.718911917098447e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 4739138.0, + "step": 2645 + }, + { + "epoch": 0.4284673305805198, + "grad_norm": 23.15785026550293, + "learning_rate": 5.71729274611399e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.9148764908313751, + "num_tokens": 4740932.0, + "step": 2646 + }, + { + "epoch": 0.4286292607886001, + "grad_norm": 34.17253112792969, + "learning_rate": 5.715673575129535e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.8918783962726593, + "num_tokens": 4742741.0, + "step": 2647 + }, + { + "epoch": 0.42879119099668045, + "grad_norm": 29.94915008544922, + "learning_rate": 5.714054404145078e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.887706845998764, + "num_tokens": 4744529.0, + "step": 2648 + }, + { + "epoch": 0.42895312120476076, + "grad_norm": 17.295637130737305, + "learning_rate": 5.712435233160623e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.9282300472259521, + "num_tokens": 4746320.0, + "step": 2649 + }, + { + "epoch": 0.42911505141284106, + "grad_norm": 18.61489486694336, + "learning_rate": 5.710816062176166e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.9180035591125488, + "num_tokens": 4748100.0, + "step": 2650 + }, + { + "epoch": 0.42927698162092137, + "grad_norm": 22.395387649536133, + "learning_rate": 5.709196891191711e-06, + "loss": 0.682, + "mean_token_accuracy": 0.9140350818634033, + "num_tokens": 4749880.0, + "step": 2651 + }, + { + "epoch": 0.4294389118290017, + "grad_norm": 27.54579734802246, + "learning_rate": 5.707577720207254e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.9027763307094574, + "num_tokens": 4751669.0, + "step": 2652 + }, + { + "epoch": 0.42960084203708204, + "grad_norm": 25.062528610229492, + "learning_rate": 5.705958549222799e-06, + "loss": 0.5874, + "mean_token_accuracy": 0.9062761068344116, + "num_tokens": 4753458.0, + "step": 2653 + }, + { + "epoch": 0.42976277224516235, + "grad_norm": 23.13579559326172, + "learning_rate": 5.7043393782383424e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.9072912335395813, + "num_tokens": 4755249.0, + "step": 2654 + }, + { + "epoch": 0.42992470245324266, + "grad_norm": 26.464677810668945, + "learning_rate": 5.702720207253887e-06, + "loss": 0.68, + "mean_token_accuracy": 0.8968591690063477, + "num_tokens": 4757042.0, + "step": 2655 + }, + { + "epoch": 0.43008663266132297, + "grad_norm": 22.059852600097656, + "learning_rate": 5.7011010362694305e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.9116941690444946, + "num_tokens": 4758837.0, + "step": 2656 + }, + { + "epoch": 0.4302485628694033, + "grad_norm": 29.53142738342285, + "learning_rate": 5.699481865284975e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.9084957540035248, + "num_tokens": 4760633.0, + "step": 2657 + }, + { + "epoch": 0.4304104930774836, + "grad_norm": 20.650423049926758, + "learning_rate": 5.6978626943005185e-06, + "loss": 0.559, + "mean_token_accuracy": 0.9237982928752899, + "num_tokens": 4762421.0, + "step": 2658 + }, + { + "epoch": 0.43057242328556394, + "grad_norm": 19.217164993286133, + "learning_rate": 5.696243523316063e-06, + "loss": 0.564, + "mean_token_accuracy": 0.9206821024417877, + "num_tokens": 4764210.0, + "step": 2659 + }, + { + "epoch": 0.43073435349364425, + "grad_norm": 22.50171661376953, + "learning_rate": 5.6946243523316065e-06, + "loss": 0.585, + "mean_token_accuracy": 0.9168067276477814, + "num_tokens": 4765998.0, + "step": 2660 + }, + { + "epoch": 0.43089628370172456, + "grad_norm": 21.96280288696289, + "learning_rate": 5.693005181347151e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.9148834943771362, + "num_tokens": 4767793.0, + "step": 2661 + }, + { + "epoch": 0.43105821390980487, + "grad_norm": 13.62598705291748, + "learning_rate": 5.691386010362695e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.9291044771671295, + "num_tokens": 4769573.0, + "step": 2662 + }, + { + "epoch": 0.4312201441178852, + "grad_norm": 20.593812942504883, + "learning_rate": 5.689766839378239e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.9189277589321136, + "num_tokens": 4771356.0, + "step": 2663 + }, + { + "epoch": 0.43138207432596554, + "grad_norm": 27.30518341064453, + "learning_rate": 5.688147668393783e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.9087708294391632, + "num_tokens": 4773153.0, + "step": 2664 + }, + { + "epoch": 0.43154400453404584, + "grad_norm": 24.983089447021484, + "learning_rate": 5.686528497409327e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.9202521741390228, + "num_tokens": 4774941.0, + "step": 2665 + }, + { + "epoch": 0.43170593474212615, + "grad_norm": 21.732349395751953, + "learning_rate": 5.6849093264248714e-06, + "loss": 0.5606, + "mean_token_accuracy": 0.9137163758277893, + "num_tokens": 4776731.0, + "step": 2666 + }, + { + "epoch": 0.43186786495020646, + "grad_norm": 22.275178909301758, + "learning_rate": 5.683290155440415e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.9053651690483093, + "num_tokens": 4778515.0, + "step": 2667 + }, + { + "epoch": 0.43202979515828677, + "grad_norm": 17.078319549560547, + "learning_rate": 5.6816709844559595e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.9252963960170746, + "num_tokens": 4780308.0, + "step": 2668 + }, + { + "epoch": 0.4321917253663671, + "grad_norm": 27.664405822753906, + "learning_rate": 5.680051813471503e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.8983109295368195, + "num_tokens": 4782095.0, + "step": 2669 + }, + { + "epoch": 0.43235365557444744, + "grad_norm": 28.662857055664062, + "learning_rate": 5.6784326424870475e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.8996916711330414, + "num_tokens": 4783886.0, + "step": 2670 + }, + { + "epoch": 0.43251558578252775, + "grad_norm": 28.25349235534668, + "learning_rate": 5.676813471502591e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.9098878800868988, + "num_tokens": 4785687.0, + "step": 2671 + }, + { + "epoch": 0.43267751599060805, + "grad_norm": 19.321746826171875, + "learning_rate": 5.6751943005181355e-06, + "loss": 0.559, + "mean_token_accuracy": 0.919584333896637, + "num_tokens": 4787473.0, + "step": 2672 + }, + { + "epoch": 0.43283944619868836, + "grad_norm": 21.610492706298828, + "learning_rate": 5.673575129533679e-06, + "loss": 0.539, + "mean_token_accuracy": 0.9187147319316864, + "num_tokens": 4789267.0, + "step": 2673 + }, + { + "epoch": 0.43300137640676867, + "grad_norm": 27.249958038330078, + "learning_rate": 5.6719559585492236e-06, + "loss": 0.581, + "mean_token_accuracy": 0.92306187748909, + "num_tokens": 4791065.0, + "step": 2674 + }, + { + "epoch": 0.433163306614849, + "grad_norm": 21.888282775878906, + "learning_rate": 5.670336787564767e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.9181873500347137, + "num_tokens": 4792858.0, + "step": 2675 + }, + { + "epoch": 0.43332523682292934, + "grad_norm": 27.923105239868164, + "learning_rate": 5.668717616580312e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.9148550927639008, + "num_tokens": 4794652.0, + "step": 2676 + }, + { + "epoch": 0.43348716703100965, + "grad_norm": 19.418935775756836, + "learning_rate": 5.667098445595855e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.9209782183170319, + "num_tokens": 4796430.0, + "step": 2677 + }, + { + "epoch": 0.43364909723908995, + "grad_norm": 25.08905792236328, + "learning_rate": 5.6654792746114e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.9261982440948486, + "num_tokens": 4798213.0, + "step": 2678 + }, + { + "epoch": 0.43381102744717026, + "grad_norm": 26.60637664794922, + "learning_rate": 5.663860103626943e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.9145744144916534, + "num_tokens": 4800005.0, + "step": 2679 + }, + { + "epoch": 0.43397295765525057, + "grad_norm": 18.8463077545166, + "learning_rate": 5.662240932642488e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9267310798168182, + "num_tokens": 4801790.0, + "step": 2680 + }, + { + "epoch": 0.43413488786333093, + "grad_norm": 29.289709091186523, + "learning_rate": 5.660621761658032e-06, + "loss": 0.671, + "mean_token_accuracy": 0.9097487032413483, + "num_tokens": 4803577.0, + "step": 2681 + }, + { + "epoch": 0.43429681807141124, + "grad_norm": 21.096981048583984, + "learning_rate": 5.659002590673576e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.9295774698257446, + "num_tokens": 4805373.0, + "step": 2682 + }, + { + "epoch": 0.43445874827949155, + "grad_norm": 32.044883728027344, + "learning_rate": 5.65738341968912e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.9010662138462067, + "num_tokens": 4807180.0, + "step": 2683 + }, + { + "epoch": 0.43462067848757185, + "grad_norm": 25.138193130493164, + "learning_rate": 5.655764248704664e-06, + "loss": 0.663, + "mean_token_accuracy": 0.9160937964916229, + "num_tokens": 4808966.0, + "step": 2684 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 19.392642974853516, + "learning_rate": 5.654145077720208e-06, + "loss": 0.508, + "mean_token_accuracy": 0.9236772358417511, + "num_tokens": 4810753.0, + "step": 2685 + }, + { + "epoch": 0.43494453890373247, + "grad_norm": 21.204151153564453, + "learning_rate": 5.652525906735752e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.9176872968673706, + "num_tokens": 4812545.0, + "step": 2686 + }, + { + "epoch": 0.43510646911181283, + "grad_norm": 26.812856674194336, + "learning_rate": 5.650906735751296e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.9075387418270111, + "num_tokens": 4814338.0, + "step": 2687 + }, + { + "epoch": 0.43526839931989314, + "grad_norm": 33.402442932128906, + "learning_rate": 5.64928756476684e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.8943662047386169, + "num_tokens": 4816134.0, + "step": 2688 + }, + { + "epoch": 0.43543032952797345, + "grad_norm": 20.40118980407715, + "learning_rate": 5.647668393782384e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.9147412180900574, + "num_tokens": 4817916.0, + "step": 2689 + }, + { + "epoch": 0.43559225973605376, + "grad_norm": 26.93141746520996, + "learning_rate": 5.646049222797928e-06, + "loss": 0.6172, + "mean_token_accuracy": 0.9041759967803955, + "num_tokens": 4819712.0, + "step": 2690 + }, + { + "epoch": 0.43575418994413406, + "grad_norm": 28.92831039428711, + "learning_rate": 5.644430051813472e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.8897416591644287, + "num_tokens": 4821505.0, + "step": 2691 + }, + { + "epoch": 0.43591612015221437, + "grad_norm": 22.93592643737793, + "learning_rate": 5.642810880829016e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.9180963933467865, + "num_tokens": 4823297.0, + "step": 2692 + }, + { + "epoch": 0.43607805036029473, + "grad_norm": 12.93975830078125, + "learning_rate": 5.64119170984456e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.9337658882141113, + "num_tokens": 4825081.0, + "step": 2693 + }, + { + "epoch": 0.43623998056837504, + "grad_norm": 32.457054138183594, + "learning_rate": 5.639572538860104e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.8971927165985107, + "num_tokens": 4826875.0, + "step": 2694 + }, + { + "epoch": 0.43640191077645535, + "grad_norm": 19.667940139770508, + "learning_rate": 5.637953367875648e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 4828660.0, + "step": 2695 + }, + { + "epoch": 0.43656384098453566, + "grad_norm": 22.991769790649414, + "learning_rate": 5.636334196891192e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 4830447.0, + "step": 2696 + }, + { + "epoch": 0.43672577119261596, + "grad_norm": 25.189119338989258, + "learning_rate": 5.634715025906736e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.9027777910232544, + "num_tokens": 4832238.0, + "step": 2697 + }, + { + "epoch": 0.4368877014006963, + "grad_norm": 22.71463966369629, + "learning_rate": 5.63309585492228e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.9167017042636871, + "num_tokens": 4834026.0, + "step": 2698 + }, + { + "epoch": 0.43704963160877663, + "grad_norm": 26.27680778503418, + "learning_rate": 5.631476683937824e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.9130810499191284, + "num_tokens": 4835827.0, + "step": 2699 + }, + { + "epoch": 0.43721156181685694, + "grad_norm": 23.116701126098633, + "learning_rate": 5.629857512953369e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.9161287248134613, + "num_tokens": 4837625.0, + "step": 2700 + }, + { + "epoch": 0.43737349202493725, + "grad_norm": 18.830284118652344, + "learning_rate": 5.628238341968912e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.9265037775039673, + "num_tokens": 4839410.0, + "step": 2701 + }, + { + "epoch": 0.43753542223301756, + "grad_norm": 20.59864616394043, + "learning_rate": 5.626619170984457e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.9170937538146973, + "num_tokens": 4841200.0, + "step": 2702 + }, + { + "epoch": 0.43769735244109786, + "grad_norm": 24.57659339904785, + "learning_rate": 5.625e-06, + "loss": 0.5824, + "mean_token_accuracy": 0.9157004952430725, + "num_tokens": 4842985.0, + "step": 2703 + }, + { + "epoch": 0.4378592826491782, + "grad_norm": 26.68417739868164, + "learning_rate": 5.623380829015545e-06, + "loss": 0.746, + "mean_token_accuracy": 0.9068345129489899, + "num_tokens": 4844776.0, + "step": 2704 + }, + { + "epoch": 0.43802121285725854, + "grad_norm": 17.324947357177734, + "learning_rate": 5.621761658031088e-06, + "loss": 0.5752, + "mean_token_accuracy": 0.9229723215103149, + "num_tokens": 4846562.0, + "step": 2705 + }, + { + "epoch": 0.43818314306533884, + "grad_norm": 23.368986129760742, + "learning_rate": 5.620142487046633e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.9187802076339722, + "num_tokens": 4848356.0, + "step": 2706 + }, + { + "epoch": 0.43834507327341915, + "grad_norm": 26.392820358276367, + "learning_rate": 5.618523316062176e-06, + "loss": 0.66, + "mean_token_accuracy": 0.9090151488780975, + "num_tokens": 4850141.0, + "step": 2707 + }, + { + "epoch": 0.43850700348149946, + "grad_norm": 19.447568893432617, + "learning_rate": 5.616904145077721e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.9331349432468414, + "num_tokens": 4851937.0, + "step": 2708 + }, + { + "epoch": 0.43866893368957977, + "grad_norm": 31.07088279724121, + "learning_rate": 5.6152849740932644e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.905844658613205, + "num_tokens": 4853725.0, + "step": 2709 + }, + { + "epoch": 0.43883086389766013, + "grad_norm": 27.62972640991211, + "learning_rate": 5.613665803108809e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.9139773845672607, + "num_tokens": 4855516.0, + "step": 2710 + }, + { + "epoch": 0.43899279410574044, + "grad_norm": 23.208091735839844, + "learning_rate": 5.6120466321243525e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.9099584519863129, + "num_tokens": 4857304.0, + "step": 2711 + }, + { + "epoch": 0.43915472431382074, + "grad_norm": 17.770944595336914, + "learning_rate": 5.610427461139897e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9229517877101898, + "num_tokens": 4859101.0, + "step": 2712 + }, + { + "epoch": 0.43931665452190105, + "grad_norm": 27.526386260986328, + "learning_rate": 5.6088082901554405e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.8963922262191772, + "num_tokens": 4860892.0, + "step": 2713 + }, + { + "epoch": 0.43947858472998136, + "grad_norm": 30.08072853088379, + "learning_rate": 5.607189119170985e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.9060161411762238, + "num_tokens": 4862690.0, + "step": 2714 + }, + { + "epoch": 0.4396405149380617, + "grad_norm": 23.267772674560547, + "learning_rate": 5.6055699481865285e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.9203906953334808, + "num_tokens": 4864478.0, + "step": 2715 + }, + { + "epoch": 0.43980244514614203, + "grad_norm": 21.03306770324707, + "learning_rate": 5.603950777202073e-06, + "loss": 0.6328, + "mean_token_accuracy": 0.9191176295280457, + "num_tokens": 4866262.0, + "step": 2716 + }, + { + "epoch": 0.43996437535422234, + "grad_norm": 28.422283172607422, + "learning_rate": 5.6023316062176165e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.9212790131568909, + "num_tokens": 4868052.0, + "step": 2717 + }, + { + "epoch": 0.44012630556230264, + "grad_norm": 24.285396575927734, + "learning_rate": 5.600712435233161e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.9147909283638, + "num_tokens": 4869847.0, + "step": 2718 + }, + { + "epoch": 0.44028823577038295, + "grad_norm": 27.072790145874023, + "learning_rate": 5.599093264248705e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.9072340428829193, + "num_tokens": 4871650.0, + "step": 2719 + }, + { + "epoch": 0.44045016597846326, + "grad_norm": 26.667236328125, + "learning_rate": 5.597474093264249e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.9160805642604828, + "num_tokens": 4873436.0, + "step": 2720 + }, + { + "epoch": 0.4406120961865436, + "grad_norm": 34.54020690917969, + "learning_rate": 5.5958549222797934e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.898889034986496, + "num_tokens": 4875215.0, + "step": 2721 + }, + { + "epoch": 0.44077402639462393, + "grad_norm": 23.151290893554688, + "learning_rate": 5.594235751295337e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.9230088293552399, + "num_tokens": 4877000.0, + "step": 2722 + }, + { + "epoch": 0.44093595660270424, + "grad_norm": 33.38759994506836, + "learning_rate": 5.5926165803108815e-06, + "loss": 0.67, + "mean_token_accuracy": 0.9074074029922485, + "num_tokens": 4878800.0, + "step": 2723 + }, + { + "epoch": 0.44109788681078455, + "grad_norm": 26.60508918762207, + "learning_rate": 5.590997409326425e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.9129863977432251, + "num_tokens": 4880588.0, + "step": 2724 + }, + { + "epoch": 0.44125981701886485, + "grad_norm": 28.57872772216797, + "learning_rate": 5.5893782383419695e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.9103121757507324, + "num_tokens": 4882378.0, + "step": 2725 + }, + { + "epoch": 0.44142174722694516, + "grad_norm": 21.66533851623535, + "learning_rate": 5.587759067357513e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.9250536262989044, + "num_tokens": 4884183.0, + "step": 2726 + }, + { + "epoch": 0.4415836774350255, + "grad_norm": 25.022130966186523, + "learning_rate": 5.5861398963730575e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.925781637430191, + "num_tokens": 4885978.0, + "step": 2727 + }, + { + "epoch": 0.44174560764310583, + "grad_norm": 25.449390411376953, + "learning_rate": 5.584520725388601e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.9050511717796326, + "num_tokens": 4887764.0, + "step": 2728 + }, + { + "epoch": 0.44190753785118614, + "grad_norm": 26.575929641723633, + "learning_rate": 5.5829015544041455e-06, + "loss": 0.705, + "mean_token_accuracy": 0.9071428775787354, + "num_tokens": 4889556.0, + "step": 2729 + }, + { + "epoch": 0.44206946805926645, + "grad_norm": 28.210323333740234, + "learning_rate": 5.581282383419689e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.9019704461097717, + "num_tokens": 4891353.0, + "step": 2730 + }, + { + "epoch": 0.44223139826734675, + "grad_norm": 25.475828170776367, + "learning_rate": 5.5796632124352336e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.9227039813995361, + "num_tokens": 4893136.0, + "step": 2731 + }, + { + "epoch": 0.4423933284754271, + "grad_norm": 28.722536087036133, + "learning_rate": 5.578044041450777e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.9136646091938019, + "num_tokens": 4894926.0, + "step": 2732 + }, + { + "epoch": 0.4425552586835074, + "grad_norm": 25.91667938232422, + "learning_rate": 5.576424870466322e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.9045088589191437, + "num_tokens": 4896711.0, + "step": 2733 + }, + { + "epoch": 0.44271718889158773, + "grad_norm": 23.53471565246582, + "learning_rate": 5.574805699481865e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.9170056283473969, + "num_tokens": 4898500.0, + "step": 2734 + }, + { + "epoch": 0.44287911909966804, + "grad_norm": 25.055017471313477, + "learning_rate": 5.57318652849741e-06, + "loss": 0.6123, + "mean_token_accuracy": 0.9089095592498779, + "num_tokens": 4900297.0, + "step": 2735 + }, + { + "epoch": 0.44304104930774835, + "grad_norm": 26.53508186340332, + "learning_rate": 5.571567357512954e-06, + "loss": 0.5755, + "mean_token_accuracy": 0.9150429666042328, + "num_tokens": 4902092.0, + "step": 2736 + }, + { + "epoch": 0.44320297951582865, + "grad_norm": 27.02204704284668, + "learning_rate": 5.569948186528498e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.9140793681144714, + "num_tokens": 4903883.0, + "step": 2737 + }, + { + "epoch": 0.443364909723909, + "grad_norm": 28.95026206970215, + "learning_rate": 5.568329015544042e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.9164723455905914, + "num_tokens": 4905682.0, + "step": 2738 + }, + { + "epoch": 0.4435268399319893, + "grad_norm": 24.438188552856445, + "learning_rate": 5.566709844559586e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.9187915623188019, + "num_tokens": 4907465.0, + "step": 2739 + }, + { + "epoch": 0.44368877014006963, + "grad_norm": 16.558094024658203, + "learning_rate": 5.56509067357513e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.925567239522934, + "num_tokens": 4909246.0, + "step": 2740 + }, + { + "epoch": 0.44385070034814994, + "grad_norm": 35.34246826171875, + "learning_rate": 5.563471502590674e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.8841721713542938, + "num_tokens": 4911043.0, + "step": 2741 + }, + { + "epoch": 0.44401263055623025, + "grad_norm": 23.990554809570312, + "learning_rate": 5.561852331606218e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.9163069427013397, + "num_tokens": 4912829.0, + "step": 2742 + }, + { + "epoch": 0.44417456076431056, + "grad_norm": 20.967924118041992, + "learning_rate": 5.560233160621762e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.920550525188446, + "num_tokens": 4914618.0, + "step": 2743 + }, + { + "epoch": 0.4443364909723909, + "grad_norm": 23.729198455810547, + "learning_rate": 5.558613989637306e-06, + "loss": 0.543, + "mean_token_accuracy": 0.9198492169380188, + "num_tokens": 4916416.0, + "step": 2744 + }, + { + "epoch": 0.4444984211804712, + "grad_norm": 25.297008514404297, + "learning_rate": 5.55699481865285e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.9157738089561462, + "num_tokens": 4918212.0, + "step": 2745 + }, + { + "epoch": 0.44466035138855153, + "grad_norm": 16.53269386291504, + "learning_rate": 5.555375647668394e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.929892361164093, + "num_tokens": 4920010.0, + "step": 2746 + }, + { + "epoch": 0.44482228159663184, + "grad_norm": 22.38106918334961, + "learning_rate": 5.553756476683938e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.9132690131664276, + "num_tokens": 4921799.0, + "step": 2747 + }, + { + "epoch": 0.44498421180471215, + "grad_norm": 18.34514617919922, + "learning_rate": 5.552137305699482e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9303542673587799, + "num_tokens": 4923584.0, + "step": 2748 + }, + { + "epoch": 0.4451461420127925, + "grad_norm": 17.57979965209961, + "learning_rate": 5.550518134715026e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.9276595711708069, + "num_tokens": 4925372.0, + "step": 2749 + }, + { + "epoch": 0.4453080722208728, + "grad_norm": 28.670560836791992, + "learning_rate": 5.54889896373057e-06, + "loss": 0.725, + "mean_token_accuracy": 0.9060908854007721, + "num_tokens": 4927170.0, + "step": 2750 + }, + { + "epoch": 0.4454700024289531, + "grad_norm": 29.89539909362793, + "learning_rate": 5.547279792746114e-06, + "loss": 0.773, + "mean_token_accuracy": 0.9009331166744232, + "num_tokens": 4928966.0, + "step": 2751 + }, + { + "epoch": 0.44563193263703343, + "grad_norm": 23.308883666992188, + "learning_rate": 5.545660621761658e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.9170055389404297, + "num_tokens": 4930755.0, + "step": 2752 + }, + { + "epoch": 0.44579386284511374, + "grad_norm": 17.085216522216797, + "learning_rate": 5.544041450777202e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.925621896982193, + "num_tokens": 4932536.0, + "step": 2753 + }, + { + "epoch": 0.44595579305319405, + "grad_norm": 24.243133544921875, + "learning_rate": 5.542422279792746e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.9160574078559875, + "num_tokens": 4934321.0, + "step": 2754 + }, + { + "epoch": 0.4461177232612744, + "grad_norm": 22.24656867980957, + "learning_rate": 5.540803108808291e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.915139764547348, + "num_tokens": 4936117.0, + "step": 2755 + }, + { + "epoch": 0.4462796534693547, + "grad_norm": 22.625072479248047, + "learning_rate": 5.539183937823834e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.9067419171333313, + "num_tokens": 4937907.0, + "step": 2756 + }, + { + "epoch": 0.446441583677435, + "grad_norm": 16.466907501220703, + "learning_rate": 5.537564766839379e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9284502267837524, + "num_tokens": 4939698.0, + "step": 2757 + }, + { + "epoch": 0.44660351388551534, + "grad_norm": 29.324201583862305, + "learning_rate": 5.535945595854922e-06, + "loss": 0.702, + "mean_token_accuracy": 0.9055489599704742, + "num_tokens": 4941494.0, + "step": 2758 + }, + { + "epoch": 0.44676544409359564, + "grad_norm": 28.939008712768555, + "learning_rate": 5.534326424870467e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.8974235355854034, + "num_tokens": 4943279.0, + "step": 2759 + }, + { + "epoch": 0.44692737430167595, + "grad_norm": 29.352474212646484, + "learning_rate": 5.53270725388601e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.8951492607593536, + "num_tokens": 4945065.0, + "step": 2760 + }, + { + "epoch": 0.4470893045097563, + "grad_norm": 29.990007400512695, + "learning_rate": 5.531088082901555e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.8965224027633667, + "num_tokens": 4946868.0, + "step": 2761 + }, + { + "epoch": 0.4472512347178366, + "grad_norm": 32.23251724243164, + "learning_rate": 5.529468911917098e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.8844271302223206, + "num_tokens": 4948666.0, + "step": 2762 + }, + { + "epoch": 0.44741316492591693, + "grad_norm": 27.487356185913086, + "learning_rate": 5.527849740932643e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.9192523658275604, + "num_tokens": 4950463.0, + "step": 2763 + }, + { + "epoch": 0.44757509513399724, + "grad_norm": 27.68252182006836, + "learning_rate": 5.526230569948186e-06, + "loss": 0.612, + "mean_token_accuracy": 0.9084639251232147, + "num_tokens": 4952248.0, + "step": 2764 + }, + { + "epoch": 0.44773702534207754, + "grad_norm": 15.705184936523438, + "learning_rate": 5.524611398963731e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.9345941543579102, + "num_tokens": 4954036.0, + "step": 2765 + }, + { + "epoch": 0.4478989555501579, + "grad_norm": 21.034793853759766, + "learning_rate": 5.5229922279792744e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.9187709391117096, + "num_tokens": 4955833.0, + "step": 2766 + }, + { + "epoch": 0.4480608857582382, + "grad_norm": 15.66590404510498, + "learning_rate": 5.521373056994819e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.927371621131897, + "num_tokens": 4957619.0, + "step": 2767 + }, + { + "epoch": 0.4482228159663185, + "grad_norm": 18.492515563964844, + "learning_rate": 5.5197538860103625e-06, + "loss": 0.5607, + "mean_token_accuracy": 0.9262706935405731, + "num_tokens": 4959416.0, + "step": 2768 + }, + { + "epoch": 0.44838474617439883, + "grad_norm": 18.984676361083984, + "learning_rate": 5.518134715025907e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 4961209.0, + "step": 2769 + }, + { + "epoch": 0.44854667638247914, + "grad_norm": 27.814537048339844, + "learning_rate": 5.5165155440414505e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.8949275612831116, + "num_tokens": 4962997.0, + "step": 2770 + }, + { + "epoch": 0.44870860659055944, + "grad_norm": 19.05231475830078, + "learning_rate": 5.514896373056995e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.920273095369339, + "num_tokens": 4964785.0, + "step": 2771 + }, + { + "epoch": 0.4488705367986398, + "grad_norm": 25.37510871887207, + "learning_rate": 5.5132772020725385e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 4966583.0, + "step": 2772 + }, + { + "epoch": 0.4490324670067201, + "grad_norm": 25.628129959106445, + "learning_rate": 5.511658031088083e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.9076103568077087, + "num_tokens": 4968376.0, + "step": 2773 + }, + { + "epoch": 0.4491943972148004, + "grad_norm": 29.199174880981445, + "learning_rate": 5.510038860103627e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.8978102207183838, + "num_tokens": 4970162.0, + "step": 2774 + }, + { + "epoch": 0.44935632742288073, + "grad_norm": 23.482973098754883, + "learning_rate": 5.508419689119171e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.9113682210445404, + "num_tokens": 4971956.0, + "step": 2775 + }, + { + "epoch": 0.44951825763096104, + "grad_norm": 19.054073333740234, + "learning_rate": 5.5068005181347154e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.9225352108478546, + "num_tokens": 4973752.0, + "step": 2776 + }, + { + "epoch": 0.4496801878390414, + "grad_norm": 26.444379806518555, + "learning_rate": 5.505181347150259e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.9127170443534851, + "num_tokens": 4975550.0, + "step": 2777 + }, + { + "epoch": 0.4498421180471217, + "grad_norm": 19.205793380737305, + "learning_rate": 5.5035621761658035e-06, + "loss": 0.515, + "mean_token_accuracy": 0.9288500547409058, + "num_tokens": 4977343.0, + "step": 2778 + }, + { + "epoch": 0.450004048255202, + "grad_norm": 15.34057903289795, + "learning_rate": 5.501943005181347e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9248889684677124, + "num_tokens": 4979135.0, + "step": 2779 + }, + { + "epoch": 0.4501659784632823, + "grad_norm": 27.74591064453125, + "learning_rate": 5.5003238341968915e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.9089954197406769, + "num_tokens": 4980943.0, + "step": 2780 + }, + { + "epoch": 0.45032790867136263, + "grad_norm": 21.331445693969727, + "learning_rate": 5.498704663212435e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9219819009304047, + "num_tokens": 4982737.0, + "step": 2781 + }, + { + "epoch": 0.45048983887944294, + "grad_norm": 17.33263397216797, + "learning_rate": 5.4970854922279795e-06, + "loss": 0.5332, + "mean_token_accuracy": 0.9224945604801178, + "num_tokens": 4984520.0, + "step": 2782 + }, + { + "epoch": 0.4506517690875233, + "grad_norm": 20.298730850219727, + "learning_rate": 5.495466321243523e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9211378395557404, + "num_tokens": 4986311.0, + "step": 2783 + }, + { + "epoch": 0.4508136992956036, + "grad_norm": 21.53404426574707, + "learning_rate": 5.4938471502590675e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.924717366695404, + "num_tokens": 4988102.0, + "step": 2784 + }, + { + "epoch": 0.4509756295036839, + "grad_norm": 29.463748931884766, + "learning_rate": 5.492227979274611e-06, + "loss": 0.761, + "mean_token_accuracy": 0.9069536328315735, + "num_tokens": 4989915.0, + "step": 2785 + }, + { + "epoch": 0.4511375597117642, + "grad_norm": 32.4338264465332, + "learning_rate": 5.4906088082901556e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.8913461565971375, + "num_tokens": 4991701.0, + "step": 2786 + }, + { + "epoch": 0.45129948991984453, + "grad_norm": 20.215728759765625, + "learning_rate": 5.488989637305699e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 4993492.0, + "step": 2787 + }, + { + "epoch": 0.45146142012792484, + "grad_norm": 19.93372344970703, + "learning_rate": 5.487370466321244e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.9151683151721954, + "num_tokens": 4995287.0, + "step": 2788 + }, + { + "epoch": 0.4516233503360052, + "grad_norm": 26.035869598388672, + "learning_rate": 5.485751295336787e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.9004205167293549, + "num_tokens": 4997080.0, + "step": 2789 + }, + { + "epoch": 0.4517852805440855, + "grad_norm": 25.034929275512695, + "learning_rate": 5.484132124352332e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.9066450893878937, + "num_tokens": 4998871.0, + "step": 2790 + }, + { + "epoch": 0.4519472107521658, + "grad_norm": 30.935081481933594, + "learning_rate": 5.482512953367875e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.8970410823822021, + "num_tokens": 5000665.0, + "step": 2791 + }, + { + "epoch": 0.4521091409602461, + "grad_norm": 26.603248596191406, + "learning_rate": 5.48089378238342e-06, + "loss": 0.644, + "mean_token_accuracy": 0.9118581116199493, + "num_tokens": 5002461.0, + "step": 2792 + }, + { + "epoch": 0.45227107116832643, + "grad_norm": 17.50041961669922, + "learning_rate": 5.479274611398964e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.9292457401752472, + "num_tokens": 5004256.0, + "step": 2793 + }, + { + "epoch": 0.4524330013764068, + "grad_norm": 30.962717056274414, + "learning_rate": 5.477655440414508e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.8976273834705353, + "num_tokens": 5006051.0, + "step": 2794 + }, + { + "epoch": 0.4525949315844871, + "grad_norm": 14.432768821716309, + "learning_rate": 5.476036269430052e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.9181868433952332, + "num_tokens": 5007832.0, + "step": 2795 + }, + { + "epoch": 0.4527568617925674, + "grad_norm": 28.438678741455078, + "learning_rate": 5.474417098445596e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.9127098321914673, + "num_tokens": 5009618.0, + "step": 2796 + }, + { + "epoch": 0.4529187920006477, + "grad_norm": 14.729020118713379, + "learning_rate": 5.47279792746114e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9290726780891418, + "num_tokens": 5011398.0, + "step": 2797 + }, + { + "epoch": 0.453080722208728, + "grad_norm": 22.016693115234375, + "learning_rate": 5.471178756476684e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.9120418429374695, + "num_tokens": 5013194.0, + "step": 2798 + }, + { + "epoch": 0.45324265241680833, + "grad_norm": 21.031702041625977, + "learning_rate": 5.469559585492228e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.9257246553897858, + "num_tokens": 5014988.0, + "step": 2799 + }, + { + "epoch": 0.4534045826248887, + "grad_norm": 17.902660369873047, + "learning_rate": 5.4679404145077734e-06, + "loss": 0.6153, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 5016777.0, + "step": 2800 + }, + { + "epoch": 0.453566512832969, + "grad_norm": 12.702520370483398, + "learning_rate": 5.466321243523317e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.9298729598522186, + "num_tokens": 5018574.0, + "step": 2801 + }, + { + "epoch": 0.4537284430410493, + "grad_norm": 19.156356811523438, + "learning_rate": 5.4647020725388615e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.925709456205368, + "num_tokens": 5020369.0, + "step": 2802 + }, + { + "epoch": 0.4538903732491296, + "grad_norm": 28.117366790771484, + "learning_rate": 5.463082901554405e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.9012333154678345, + "num_tokens": 5022163.0, + "step": 2803 + }, + { + "epoch": 0.4540523034572099, + "grad_norm": 17.92461395263672, + "learning_rate": 5.4614637305699495e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.9154166281223297, + "num_tokens": 5023949.0, + "step": 2804 + }, + { + "epoch": 0.45421423366529023, + "grad_norm": 25.747995376586914, + "learning_rate": 5.459844559585493e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.922680139541626, + "num_tokens": 5025745.0, + "step": 2805 + }, + { + "epoch": 0.4543761638733706, + "grad_norm": 22.67438316345215, + "learning_rate": 5.4582253886010375e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.9175146520137787, + "num_tokens": 5027536.0, + "step": 2806 + }, + { + "epoch": 0.4545380940814509, + "grad_norm": 28.96368980407715, + "learning_rate": 5.456606217616581e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.8960694372653961, + "num_tokens": 5029336.0, + "step": 2807 + }, + { + "epoch": 0.4547000242895312, + "grad_norm": 36.49106979370117, + "learning_rate": 5.4549870466321255e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.8745629489421844, + "num_tokens": 5031135.0, + "step": 2808 + }, + { + "epoch": 0.4548619544976115, + "grad_norm": 24.831716537475586, + "learning_rate": 5.453367875647669e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.9065735042095184, + "num_tokens": 5032925.0, + "step": 2809 + }, + { + "epoch": 0.45502388470569183, + "grad_norm": 25.967815399169922, + "learning_rate": 5.4517487046632136e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.9176878929138184, + "num_tokens": 5034717.0, + "step": 2810 + }, + { + "epoch": 0.4551858149137722, + "grad_norm": 23.92116928100586, + "learning_rate": 5.450129533678757e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.9033152163028717, + "num_tokens": 5036507.0, + "step": 2811 + }, + { + "epoch": 0.4553477451218525, + "grad_norm": 16.223268508911133, + "learning_rate": 5.448510362694302e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9236077964305878, + "num_tokens": 5038293.0, + "step": 2812 + }, + { + "epoch": 0.4555096753299328, + "grad_norm": 21.59657096862793, + "learning_rate": 5.446891191709845e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.9093892574310303, + "num_tokens": 5040081.0, + "step": 2813 + }, + { + "epoch": 0.4556716055380131, + "grad_norm": 18.043031692504883, + "learning_rate": 5.44527202072539e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.9277969002723694, + "num_tokens": 5041870.0, + "step": 2814 + }, + { + "epoch": 0.4558335357460934, + "grad_norm": 21.975814819335938, + "learning_rate": 5.443652849740933e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.9263225495815277, + "num_tokens": 5043667.0, + "step": 2815 + }, + { + "epoch": 0.45599546595417373, + "grad_norm": 26.610212326049805, + "learning_rate": 5.442033678756478e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.9009661972522736, + "num_tokens": 5045461.0, + "step": 2816 + }, + { + "epoch": 0.4561573961622541, + "grad_norm": 16.82908058166504, + "learning_rate": 5.440414507772021e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.916636049747467, + "num_tokens": 5047249.0, + "step": 2817 + }, + { + "epoch": 0.4563193263703344, + "grad_norm": 24.216522216796875, + "learning_rate": 5.438795336787566e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.9120439887046814, + "num_tokens": 5049044.0, + "step": 2818 + }, + { + "epoch": 0.4564812565784147, + "grad_norm": 25.455324172973633, + "learning_rate": 5.43717616580311e-06, + "loss": 0.594, + "mean_token_accuracy": 0.9079841077327728, + "num_tokens": 5050838.0, + "step": 2819 + }, + { + "epoch": 0.456643186786495, + "grad_norm": 23.583614349365234, + "learning_rate": 5.435556994818654e-06, + "loss": 0.6547, + "mean_token_accuracy": 0.9151848256587982, + "num_tokens": 5052633.0, + "step": 2820 + }, + { + "epoch": 0.4568051169945753, + "grad_norm": 24.683320999145508, + "learning_rate": 5.433937823834198e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.8986204266548157, + "num_tokens": 5054432.0, + "step": 2821 + }, + { + "epoch": 0.45696704720265563, + "grad_norm": 24.798282623291016, + "learning_rate": 5.432318652849742e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.9121578335762024, + "num_tokens": 5056217.0, + "step": 2822 + }, + { + "epoch": 0.457128977410736, + "grad_norm": 32.945404052734375, + "learning_rate": 5.430699481865286e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.882340282201767, + "num_tokens": 5058018.0, + "step": 2823 + }, + { + "epoch": 0.4572909076188163, + "grad_norm": 13.521900177001953, + "learning_rate": 5.42908031088083e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.9336929321289062, + "num_tokens": 5059801.0, + "step": 2824 + }, + { + "epoch": 0.4574528378268966, + "grad_norm": 22.05010414123535, + "learning_rate": 5.427461139896374e-06, + "loss": 0.571, + "mean_token_accuracy": 0.9081010818481445, + "num_tokens": 5061596.0, + "step": 2825 + }, + { + "epoch": 0.4576147680349769, + "grad_norm": 25.248748779296875, + "learning_rate": 5.425841968911918e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.9033879339694977, + "num_tokens": 5063388.0, + "step": 2826 + }, + { + "epoch": 0.4577766982430572, + "grad_norm": 23.87033462524414, + "learning_rate": 5.424222797927462e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.913968563079834, + "num_tokens": 5065179.0, + "step": 2827 + }, + { + "epoch": 0.4579386284511376, + "grad_norm": 26.319520950317383, + "learning_rate": 5.422603626943006e-06, + "loss": 0.662, + "mean_token_accuracy": 0.9100414216518402, + "num_tokens": 5066969.0, + "step": 2828 + }, + { + "epoch": 0.4581005586592179, + "grad_norm": 13.703821182250977, + "learning_rate": 5.42098445595855e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.9357960820198059, + "num_tokens": 5068762.0, + "step": 2829 + }, + { + "epoch": 0.4582624888672982, + "grad_norm": 20.925565719604492, + "learning_rate": 5.419365284974094e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.9188898801803589, + "num_tokens": 5070557.0, + "step": 2830 + }, + { + "epoch": 0.4584244190753785, + "grad_norm": 25.780550003051758, + "learning_rate": 5.417746113989638e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.91847363114357, + "num_tokens": 5072351.0, + "step": 2831 + }, + { + "epoch": 0.4585863492834588, + "grad_norm": 27.270395278930664, + "learning_rate": 5.416126943005182e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.8855840265750885, + "num_tokens": 5074151.0, + "step": 2832 + }, + { + "epoch": 0.4587482794915391, + "grad_norm": 28.422182083129883, + "learning_rate": 5.414507772020726e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.9022475481033325, + "num_tokens": 5075939.0, + "step": 2833 + }, + { + "epoch": 0.4589102096996195, + "grad_norm": 22.179424285888672, + "learning_rate": 5.41288860103627e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.9132466912269592, + "num_tokens": 5077716.0, + "step": 2834 + }, + { + "epoch": 0.4590721399076998, + "grad_norm": 17.141340255737305, + "learning_rate": 5.411269430051814e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.9219367802143097, + "num_tokens": 5079498.0, + "step": 2835 + }, + { + "epoch": 0.4592340701157801, + "grad_norm": 21.801841735839844, + "learning_rate": 5.409650259067358e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.9222591519355774, + "num_tokens": 5081279.0, + "step": 2836 + }, + { + "epoch": 0.4593960003238604, + "grad_norm": 24.38442039489746, + "learning_rate": 5.408031088082902e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.9234082102775574, + "num_tokens": 5083077.0, + "step": 2837 + }, + { + "epoch": 0.4595579305319407, + "grad_norm": 12.665369987487793, + "learning_rate": 5.406411917098447e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.9268797039985657, + "num_tokens": 5084862.0, + "step": 2838 + }, + { + "epoch": 0.459719860740021, + "grad_norm": 20.18532371520996, + "learning_rate": 5.40479274611399e-06, + "loss": 0.56, + "mean_token_accuracy": 0.9214285612106323, + "num_tokens": 5086654.0, + "step": 2839 + }, + { + "epoch": 0.4598817909481014, + "grad_norm": 20.738492965698242, + "learning_rate": 5.403173575129535e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.919047623872757, + "num_tokens": 5088438.0, + "step": 2840 + }, + { + "epoch": 0.4600437211561817, + "grad_norm": 21.79051971435547, + "learning_rate": 5.401554404145078e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.9157062470912933, + "num_tokens": 5090235.0, + "step": 2841 + }, + { + "epoch": 0.460205651364262, + "grad_norm": 30.287353515625, + "learning_rate": 5.399935233160623e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.9126865565776825, + "num_tokens": 5092021.0, + "step": 2842 + }, + { + "epoch": 0.4603675815723423, + "grad_norm": 21.694766998291016, + "learning_rate": 5.398316062176166e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.9133446216583252, + "num_tokens": 5093799.0, + "step": 2843 + }, + { + "epoch": 0.4605295117804226, + "grad_norm": 30.974061965942383, + "learning_rate": 5.396696891191711e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.897817462682724, + "num_tokens": 5095595.0, + "step": 2844 + }, + { + "epoch": 0.460691441988503, + "grad_norm": 21.503873825073242, + "learning_rate": 5.3950777202072544e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.9208074510097504, + "num_tokens": 5097385.0, + "step": 2845 + }, + { + "epoch": 0.4608533721965833, + "grad_norm": 23.618362426757812, + "learning_rate": 5.393458549222799e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.9154929518699646, + "num_tokens": 5099181.0, + "step": 2846 + }, + { + "epoch": 0.4610153024046636, + "grad_norm": 16.37499237060547, + "learning_rate": 5.3918393782383425e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.9311820566654205, + "num_tokens": 5100969.0, + "step": 2847 + }, + { + "epoch": 0.4611772326127439, + "grad_norm": 22.36064910888672, + "learning_rate": 5.390220207253887e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.9070101678371429, + "num_tokens": 5102750.0, + "step": 2848 + }, + { + "epoch": 0.4613391628208242, + "grad_norm": 30.884986877441406, + "learning_rate": 5.3886010362694305e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.9010878205299377, + "num_tokens": 5104544.0, + "step": 2849 + }, + { + "epoch": 0.4615010930289045, + "grad_norm": 21.944713592529297, + "learning_rate": 5.386981865284975e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.925709456205368, + "num_tokens": 5106339.0, + "step": 2850 + }, + { + "epoch": 0.4616630232369849, + "grad_norm": 23.72605323791504, + "learning_rate": 5.3853626943005185e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.9118026793003082, + "num_tokens": 5108135.0, + "step": 2851 + }, + { + "epoch": 0.4618249534450652, + "grad_norm": 27.90321922302246, + "learning_rate": 5.383743523316063e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.9058675467967987, + "num_tokens": 5109932.0, + "step": 2852 + }, + { + "epoch": 0.4619868836531455, + "grad_norm": 23.34131622314453, + "learning_rate": 5.3821243523316065e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.9053634703159332, + "num_tokens": 5111729.0, + "step": 2853 + }, + { + "epoch": 0.4621488138612258, + "grad_norm": 24.6957950592041, + "learning_rate": 5.380505181347151e-06, + "loss": 0.69, + "mean_token_accuracy": 0.9039099514484406, + "num_tokens": 5113521.0, + "step": 2854 + }, + { + "epoch": 0.4623107440693061, + "grad_norm": 13.516589164733887, + "learning_rate": 5.3788860103626946e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.9321138858795166, + "num_tokens": 5115313.0, + "step": 2855 + }, + { + "epoch": 0.4624726742773864, + "grad_norm": 18.10955810546875, + "learning_rate": 5.377266839378239e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.9263710677623749, + "num_tokens": 5117097.0, + "step": 2856 + }, + { + "epoch": 0.4626346044854668, + "grad_norm": 19.24322509765625, + "learning_rate": 5.3756476683937834e-06, + "loss": 0.566, + "mean_token_accuracy": 0.921171635389328, + "num_tokens": 5118888.0, + "step": 2857 + }, + { + "epoch": 0.4627965346935471, + "grad_norm": 16.20059585571289, + "learning_rate": 5.374028497409327e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9195095896720886, + "num_tokens": 5120674.0, + "step": 2858 + }, + { + "epoch": 0.4629584649016274, + "grad_norm": 23.226848602294922, + "learning_rate": 5.3724093264248715e-06, + "loss": 0.69, + "mean_token_accuracy": 0.9109677076339722, + "num_tokens": 5122465.0, + "step": 2859 + }, + { + "epoch": 0.4631203951097077, + "grad_norm": 25.412384033203125, + "learning_rate": 5.370790155440415e-06, + "loss": 0.6229, + "mean_token_accuracy": 0.9193286001682281, + "num_tokens": 5124262.0, + "step": 2860 + }, + { + "epoch": 0.463282325317788, + "grad_norm": 23.752702713012695, + "learning_rate": 5.3691709844559595e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.9160798192024231, + "num_tokens": 5126048.0, + "step": 2861 + }, + { + "epoch": 0.4634442555258684, + "grad_norm": 20.44335174560547, + "learning_rate": 5.367551813471503e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.9229323267936707, + "num_tokens": 5127833.0, + "step": 2862 + }, + { + "epoch": 0.4636061857339487, + "grad_norm": 18.631301879882812, + "learning_rate": 5.3659326424870475e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 5129621.0, + "step": 2863 + }, + { + "epoch": 0.463768115942029, + "grad_norm": 18.931396484375, + "learning_rate": 5.364313471502591e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.9220321774482727, + "num_tokens": 5131415.0, + "step": 2864 + }, + { + "epoch": 0.4639300461501093, + "grad_norm": 8.640852928161621, + "learning_rate": 5.3626943005181356e-06, + "loss": 0.45, + "mean_token_accuracy": 0.9319812953472137, + "num_tokens": 5133192.0, + "step": 2865 + }, + { + "epoch": 0.4640919763581896, + "grad_norm": 19.540283203125, + "learning_rate": 5.361075129533679e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9296531975269318, + "num_tokens": 5134974.0, + "step": 2866 + }, + { + "epoch": 0.4642539065662699, + "grad_norm": 19.483684539794922, + "learning_rate": 5.359455958549224e-06, + "loss": 0.5599, + "mean_token_accuracy": 0.9189277589321136, + "num_tokens": 5136757.0, + "step": 2867 + }, + { + "epoch": 0.4644158367743503, + "grad_norm": 19.582748413085938, + "learning_rate": 5.357836787564767e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 5138550.0, + "step": 2868 + }, + { + "epoch": 0.4645777669824306, + "grad_norm": 25.55223846435547, + "learning_rate": 5.356217616580312e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.9044606685638428, + "num_tokens": 5140334.0, + "step": 2869 + }, + { + "epoch": 0.4647396971905109, + "grad_norm": 27.396039962768555, + "learning_rate": 5.354598445595855e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.9157095551490784, + "num_tokens": 5142119.0, + "step": 2870 + }, + { + "epoch": 0.4649016273985912, + "grad_norm": 17.082460403442383, + "learning_rate": 5.3529792746114e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.9304637312889099, + "num_tokens": 5143918.0, + "step": 2871 + }, + { + "epoch": 0.4650635576066715, + "grad_norm": 27.788162231445312, + "learning_rate": 5.351360103626943e-06, + "loss": 0.5862, + "mean_token_accuracy": 0.9222605228424072, + "num_tokens": 5145713.0, + "step": 2872 + }, + { + "epoch": 0.4652254878147518, + "grad_norm": 28.16967010498047, + "learning_rate": 5.349740932642488e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.9074721336364746, + "num_tokens": 5147506.0, + "step": 2873 + }, + { + "epoch": 0.4653874180228322, + "grad_norm": 19.386356353759766, + "learning_rate": 5.348121761658031e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.9280538260936737, + "num_tokens": 5149296.0, + "step": 2874 + }, + { + "epoch": 0.4655493482309125, + "grad_norm": 27.669445037841797, + "learning_rate": 5.346502590673576e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.9144883453845978, + "num_tokens": 5151089.0, + "step": 2875 + }, + { + "epoch": 0.4657112784389928, + "grad_norm": 25.646547317504883, + "learning_rate": 5.34488341968912e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.9154929518699646, + "num_tokens": 5152885.0, + "step": 2876 + }, + { + "epoch": 0.4658732086470731, + "grad_norm": 12.353198051452637, + "learning_rate": 5.343264248704664e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.9358339011669159, + "num_tokens": 5154662.0, + "step": 2877 + }, + { + "epoch": 0.4660351388551534, + "grad_norm": 28.414976119995117, + "learning_rate": 5.341645077720208e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.9062483906745911, + "num_tokens": 5156452.0, + "step": 2878 + }, + { + "epoch": 0.46619706906323377, + "grad_norm": 21.641206741333008, + "learning_rate": 5.340025906735752e-06, + "loss": 0.5588, + "mean_token_accuracy": 0.9247687458992004, + "num_tokens": 5158243.0, + "step": 2879 + }, + { + "epoch": 0.4663589992713141, + "grad_norm": 27.501628875732422, + "learning_rate": 5.338406735751296e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.9107498526573181, + "num_tokens": 5160035.0, + "step": 2880 + }, + { + "epoch": 0.4665209294793944, + "grad_norm": 30.56386375427246, + "learning_rate": 5.33678756476684e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.9067992568016052, + "num_tokens": 5161826.0, + "step": 2881 + }, + { + "epoch": 0.4666828596874747, + "grad_norm": 16.881032943725586, + "learning_rate": 5.335168393782384e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9225217700004578, + "num_tokens": 5163609.0, + "step": 2882 + }, + { + "epoch": 0.466844789895555, + "grad_norm": 21.027667999267578, + "learning_rate": 5.333549222797928e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.9174043536186218, + "num_tokens": 5165399.0, + "step": 2883 + }, + { + "epoch": 0.4670067201036353, + "grad_norm": 27.815034866333008, + "learning_rate": 5.331930051813472e-06, + "loss": 0.6248, + "mean_token_accuracy": 0.9103453755378723, + "num_tokens": 5167190.0, + "step": 2884 + }, + { + "epoch": 0.46716865031171567, + "grad_norm": 22.24852752685547, + "learning_rate": 5.330310880829016e-06, + "loss": 0.548, + "mean_token_accuracy": 0.9185613691806793, + "num_tokens": 5168984.0, + "step": 2885 + }, + { + "epoch": 0.467330580519796, + "grad_norm": 31.455965042114258, + "learning_rate": 5.32869170984456e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.8947044312953949, + "num_tokens": 5170781.0, + "step": 2886 + }, + { + "epoch": 0.4674925107278763, + "grad_norm": 24.850784301757812, + "learning_rate": 5.327072538860104e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.9040032625198364, + "num_tokens": 5172573.0, + "step": 2887 + }, + { + "epoch": 0.4676544409359566, + "grad_norm": 29.294872283935547, + "learning_rate": 5.325453367875648e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.9121934175491333, + "num_tokens": 5174370.0, + "step": 2888 + }, + { + "epoch": 0.4678163711440369, + "grad_norm": 24.771955490112305, + "learning_rate": 5.323834196891192e-06, + "loss": 0.584, + "mean_token_accuracy": 0.9157004952430725, + "num_tokens": 5176155.0, + "step": 2889 + }, + { + "epoch": 0.46797830135211727, + "grad_norm": 22.884859085083008, + "learning_rate": 5.322215025906736e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9274798929691315, + "num_tokens": 5177943.0, + "step": 2890 + }, + { + "epoch": 0.4681402315601976, + "grad_norm": 25.530378341674805, + "learning_rate": 5.32059585492228e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.9188898801803589, + "num_tokens": 5179738.0, + "step": 2891 + }, + { + "epoch": 0.4683021617682779, + "grad_norm": 36.57127380371094, + "learning_rate": 5.318976683937824e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.8946167230606079, + "num_tokens": 5181534.0, + "step": 2892 + }, + { + "epoch": 0.4684640919763582, + "grad_norm": 24.721677780151367, + "learning_rate": 5.317357512953368e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.9181610941886902, + "num_tokens": 5183327.0, + "step": 2893 + }, + { + "epoch": 0.4686260221844385, + "grad_norm": 29.362308502197266, + "learning_rate": 5.315738341968912e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.9158130884170532, + "num_tokens": 5185124.0, + "step": 2894 + }, + { + "epoch": 0.4687879523925188, + "grad_norm": 18.696104049682617, + "learning_rate": 5.314119170984457e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.92356076836586, + "num_tokens": 5186910.0, + "step": 2895 + }, + { + "epoch": 0.46894988260059917, + "grad_norm": 18.09763526916504, + "learning_rate": 5.3125e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.9239495694637299, + "num_tokens": 5188698.0, + "step": 2896 + }, + { + "epoch": 0.4691118128086795, + "grad_norm": 20.345279693603516, + "learning_rate": 5.310880829015545e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.928205132484436, + "num_tokens": 5190488.0, + "step": 2897 + }, + { + "epoch": 0.4692737430167598, + "grad_norm": 21.41593360900879, + "learning_rate": 5.309261658031088e-06, + "loss": 0.5923, + "mean_token_accuracy": 0.9211202263832092, + "num_tokens": 5192279.0, + "step": 2898 + }, + { + "epoch": 0.4694356732248401, + "grad_norm": 22.710124969482422, + "learning_rate": 5.307642487046633e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.9117647111415863, + "num_tokens": 5194063.0, + "step": 2899 + }, + { + "epoch": 0.4695976034329204, + "grad_norm": 26.75022315979004, + "learning_rate": 5.3060233160621764e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.9060872197151184, + "num_tokens": 5195863.0, + "step": 2900 + }, + { + "epoch": 0.4697595336410007, + "grad_norm": 18.813129425048828, + "learning_rate": 5.304404145077721e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.9177893698215485, + "num_tokens": 5197655.0, + "step": 2901 + }, + { + "epoch": 0.46992146384908107, + "grad_norm": 25.961584091186523, + "learning_rate": 5.3027849740932645e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.9199818968772888, + "num_tokens": 5199465.0, + "step": 2902 + }, + { + "epoch": 0.4700833940571614, + "grad_norm": 26.278345108032227, + "learning_rate": 5.301165803108809e-06, + "loss": 0.5734, + "mean_token_accuracy": 0.9147329926490784, + "num_tokens": 5201258.0, + "step": 2903 + }, + { + "epoch": 0.4702453242652417, + "grad_norm": 22.301340103149414, + "learning_rate": 5.2995466321243525e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.9258565604686737, + "num_tokens": 5203053.0, + "step": 2904 + }, + { + "epoch": 0.470407254473322, + "grad_norm": 24.686975479125977, + "learning_rate": 5.297927461139897e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.9020900130271912, + "num_tokens": 5204841.0, + "step": 2905 + }, + { + "epoch": 0.4705691846814023, + "grad_norm": 23.020736694335938, + "learning_rate": 5.2963082901554405e-06, + "loss": 0.5858, + "mean_token_accuracy": 0.9146870970726013, + "num_tokens": 5206633.0, + "step": 2906 + }, + { + "epoch": 0.47073111488948266, + "grad_norm": 19.296911239624023, + "learning_rate": 5.294689119170985e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.9206710755825043, + "num_tokens": 5208423.0, + "step": 2907 + }, + { + "epoch": 0.47089304509756297, + "grad_norm": 31.81449317932129, + "learning_rate": 5.2930699481865285e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.9136128425598145, + "num_tokens": 5210213.0, + "step": 2908 + }, + { + "epoch": 0.4710549753056433, + "grad_norm": 17.322669982910156, + "learning_rate": 5.291450777202073e-06, + "loss": 0.5352, + "mean_token_accuracy": 0.9223621189594269, + "num_tokens": 5212008.0, + "step": 2909 + }, + { + "epoch": 0.4712169055137236, + "grad_norm": 29.945405960083008, + "learning_rate": 5.2898316062176166e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.9190655052661896, + "num_tokens": 5213814.0, + "step": 2910 + }, + { + "epoch": 0.4713788357218039, + "grad_norm": 28.96602439880371, + "learning_rate": 5.288212435233161e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.8920530378818512, + "num_tokens": 5215605.0, + "step": 2911 + }, + { + "epoch": 0.4715407659298842, + "grad_norm": 31.174510955810547, + "learning_rate": 5.286593264248705e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.9063625335693359, + "num_tokens": 5217417.0, + "step": 2912 + }, + { + "epoch": 0.47170269613796456, + "grad_norm": 28.651702880859375, + "learning_rate": 5.284974093264249e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.9039416313171387, + "num_tokens": 5219210.0, + "step": 2913 + }, + { + "epoch": 0.47186462634604487, + "grad_norm": 27.769548416137695, + "learning_rate": 5.2833549222797935e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9201631844043732, + "num_tokens": 5220997.0, + "step": 2914 + }, + { + "epoch": 0.4720265565541252, + "grad_norm": 27.756732940673828, + "learning_rate": 5.281735751295337e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.9071381092071533, + "num_tokens": 5222789.0, + "step": 2915 + }, + { + "epoch": 0.4721884867622055, + "grad_norm": 25.227214813232422, + "learning_rate": 5.2801165803108815e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.91366907954216, + "num_tokens": 5224579.0, + "step": 2916 + }, + { + "epoch": 0.4723504169702858, + "grad_norm": 28.95039939880371, + "learning_rate": 5.278497409326425e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.9052895903587341, + "num_tokens": 5226376.0, + "step": 2917 + }, + { + "epoch": 0.4725123471783661, + "grad_norm": 18.444847106933594, + "learning_rate": 5.2768782383419695e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.9286729693412781, + "num_tokens": 5228169.0, + "step": 2918 + }, + { + "epoch": 0.47267427738644646, + "grad_norm": 32.053131103515625, + "learning_rate": 5.275259067357513e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.9108525216579437, + "num_tokens": 5229950.0, + "step": 2919 + }, + { + "epoch": 0.47283620759452677, + "grad_norm": 30.479393005371094, + "learning_rate": 5.2736398963730575e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.9009911119937897, + "num_tokens": 5231744.0, + "step": 2920 + }, + { + "epoch": 0.4729981378026071, + "grad_norm": 29.874521255493164, + "learning_rate": 5.272020725388601e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.9142303168773651, + "num_tokens": 5233536.0, + "step": 2921 + }, + { + "epoch": 0.4731600680106874, + "grad_norm": 19.59438705444336, + "learning_rate": 5.2704015544041456e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.9199579954147339, + "num_tokens": 5235324.0, + "step": 2922 + }, + { + "epoch": 0.4733219982187677, + "grad_norm": 30.61916160583496, + "learning_rate": 5.268782383419689e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.8919772505760193, + "num_tokens": 5237114.0, + "step": 2923 + }, + { + "epoch": 0.47348392842684806, + "grad_norm": 22.354694366455078, + "learning_rate": 5.267163212435234e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.9122854769229889, + "num_tokens": 5238911.0, + "step": 2924 + }, + { + "epoch": 0.47364585863492836, + "grad_norm": 18.398378372192383, + "learning_rate": 5.265544041450777e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9285568594932556, + "num_tokens": 5240703.0, + "step": 2925 + }, + { + "epoch": 0.47380778884300867, + "grad_norm": 18.617631912231445, + "learning_rate": 5.263924870466322e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.9296690821647644, + "num_tokens": 5242485.0, + "step": 2926 + }, + { + "epoch": 0.473969719051089, + "grad_norm": 30.68703269958496, + "learning_rate": 5.262305699481865e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.9034347832202911, + "num_tokens": 5244276.0, + "step": 2927 + }, + { + "epoch": 0.4741316492591693, + "grad_norm": 24.17237663269043, + "learning_rate": 5.26068652849741e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.9156014919281006, + "num_tokens": 5246061.0, + "step": 2928 + }, + { + "epoch": 0.4742935794672496, + "grad_norm": 28.16291046142578, + "learning_rate": 5.259067357512953e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.9071002006530762, + "num_tokens": 5247842.0, + "step": 2929 + }, + { + "epoch": 0.47445550967532996, + "grad_norm": 20.256938934326172, + "learning_rate": 5.257448186528498e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9247859120368958, + "num_tokens": 5249631.0, + "step": 2930 + }, + { + "epoch": 0.47461743988341026, + "grad_norm": 31.084367752075195, + "learning_rate": 5.255829015544041e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.8967473804950714, + "num_tokens": 5251424.0, + "step": 2931 + }, + { + "epoch": 0.47477937009149057, + "grad_norm": 29.46352767944336, + "learning_rate": 5.254209844559586e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.9000969231128693, + "num_tokens": 5253216.0, + "step": 2932 + }, + { + "epoch": 0.4749413002995709, + "grad_norm": 27.638547897338867, + "learning_rate": 5.25259067357513e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.9124966859817505, + "num_tokens": 5255002.0, + "step": 2933 + }, + { + "epoch": 0.4751032305076512, + "grad_norm": 24.843135833740234, + "learning_rate": 5.250971502590674e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.9029503166675568, + "num_tokens": 5256792.0, + "step": 2934 + }, + { + "epoch": 0.4752651607157315, + "grad_norm": 24.82257080078125, + "learning_rate": 5.249352331606218e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.9170937538146973, + "num_tokens": 5258582.0, + "step": 2935 + }, + { + "epoch": 0.47542709092381186, + "grad_norm": 26.72260856628418, + "learning_rate": 5.247733160621762e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.915909081697464, + "num_tokens": 5260366.0, + "step": 2936 + }, + { + "epoch": 0.47558902113189216, + "grad_norm": 19.526792526245117, + "learning_rate": 5.246113989637306e-06, + "loss": 0.5785, + "mean_token_accuracy": 0.9261710345745087, + "num_tokens": 5262149.0, + "step": 2937 + }, + { + "epoch": 0.47575095133997247, + "grad_norm": 27.608476638793945, + "learning_rate": 5.24449481865285e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.9204832017421722, + "num_tokens": 5263937.0, + "step": 2938 + }, + { + "epoch": 0.4759128815480528, + "grad_norm": 20.302509307861328, + "learning_rate": 5.242875647668394e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.9283144772052765, + "num_tokens": 5265728.0, + "step": 2939 + }, + { + "epoch": 0.4760748117561331, + "grad_norm": 27.7907772064209, + "learning_rate": 5.241256476683938e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.9052895903587341, + "num_tokens": 5267525.0, + "step": 2940 + }, + { + "epoch": 0.47623674196421345, + "grad_norm": 24.1446590423584, + "learning_rate": 5.239637305699482e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.9198294281959534, + "num_tokens": 5269311.0, + "step": 2941 + }, + { + "epoch": 0.47639867217229376, + "grad_norm": 26.522228240966797, + "learning_rate": 5.238018134715026e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.9090448617935181, + "num_tokens": 5271118.0, + "step": 2942 + }, + { + "epoch": 0.47656060238037407, + "grad_norm": 17.190567016601562, + "learning_rate": 5.23639896373057e-06, + "loss": 0.5789, + "mean_token_accuracy": 0.9256495237350464, + "num_tokens": 5272899.0, + "step": 2943 + }, + { + "epoch": 0.4767225325884544, + "grad_norm": 24.518218994140625, + "learning_rate": 5.234779792746114e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.9070110321044922, + "num_tokens": 5274701.0, + "step": 2944 + }, + { + "epoch": 0.4768844627965347, + "grad_norm": 27.35640525817871, + "learning_rate": 5.233160621761658e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.9179335236549377, + "num_tokens": 5276494.0, + "step": 2945 + }, + { + "epoch": 0.477046393004615, + "grad_norm": 16.56838607788086, + "learning_rate": 5.231541450777202e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.9310924410820007, + "num_tokens": 5278282.0, + "step": 2946 + }, + { + "epoch": 0.47720832321269535, + "grad_norm": 21.148740768432617, + "learning_rate": 5.229922279792746e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 5280070.0, + "step": 2947 + }, + { + "epoch": 0.47737025342077566, + "grad_norm": 28.640382766723633, + "learning_rate": 5.22830310880829e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.9025510251522064, + "num_tokens": 5281869.0, + "step": 2948 + }, + { + "epoch": 0.47753218362885597, + "grad_norm": 25.223848342895508, + "learning_rate": 5.226683937823834e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.9097852110862732, + "num_tokens": 5283658.0, + "step": 2949 + }, + { + "epoch": 0.4776941138369363, + "grad_norm": 19.175058364868164, + "learning_rate": 5.225064766839378e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.930431067943573, + "num_tokens": 5285444.0, + "step": 2950 + }, + { + "epoch": 0.4778560440450166, + "grad_norm": 25.689964294433594, + "learning_rate": 5.223445595854922e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.9234882295131683, + "num_tokens": 5287243.0, + "step": 2951 + }, + { + "epoch": 0.4780179742530969, + "grad_norm": 27.593448638916016, + "learning_rate": 5.221826424870467e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.9061359763145447, + "num_tokens": 5289032.0, + "step": 2952 + }, + { + "epoch": 0.47817990446117725, + "grad_norm": 19.62053680419922, + "learning_rate": 5.22020725388601e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.925144374370575, + "num_tokens": 5290825.0, + "step": 2953 + }, + { + "epoch": 0.47834183466925756, + "grad_norm": 14.325468063354492, + "learning_rate": 5.218588082901555e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.9305888414382935, + "num_tokens": 5292611.0, + "step": 2954 + }, + { + "epoch": 0.47850376487733787, + "grad_norm": 28.998231887817383, + "learning_rate": 5.216968911917098e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.9110942184925079, + "num_tokens": 5294404.0, + "step": 2955 + }, + { + "epoch": 0.4786656950854182, + "grad_norm": 22.359689712524414, + "learning_rate": 5.215349740932643e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.9170056283473969, + "num_tokens": 5296193.0, + "step": 2956 + }, + { + "epoch": 0.4788276252934985, + "grad_norm": 29.983604431152344, + "learning_rate": 5.2137305699481864e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.8951023519039154, + "num_tokens": 5298001.0, + "step": 2957 + }, + { + "epoch": 0.47898955550157885, + "grad_norm": 25.94442367553711, + "learning_rate": 5.212111398963731e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.916137307882309, + "num_tokens": 5299811.0, + "step": 2958 + }, + { + "epoch": 0.47915148570965915, + "grad_norm": 18.060644149780273, + "learning_rate": 5.2104922279792745e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9217325150966644, + "num_tokens": 5301604.0, + "step": 2959 + }, + { + "epoch": 0.47931341591773946, + "grad_norm": 27.37018394470215, + "learning_rate": 5.208873056994819e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.9093098342418671, + "num_tokens": 5303391.0, + "step": 2960 + }, + { + "epoch": 0.47947534612581977, + "grad_norm": 32.403934478759766, + "learning_rate": 5.2072538860103625e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.89775151014328, + "num_tokens": 5305177.0, + "step": 2961 + }, + { + "epoch": 0.4796372763339001, + "grad_norm": 26.418136596679688, + "learning_rate": 5.205634715025907e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.9303635060787201, + "num_tokens": 5306963.0, + "step": 2962 + }, + { + "epoch": 0.4797992065419804, + "grad_norm": 16.83273696899414, + "learning_rate": 5.2040155440414505e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.9293177723884583, + "num_tokens": 5308744.0, + "step": 2963 + }, + { + "epoch": 0.47996113675006075, + "grad_norm": 21.232816696166992, + "learning_rate": 5.202396373056995e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.9186103940010071, + "num_tokens": 5310526.0, + "step": 2964 + }, + { + "epoch": 0.48012306695814105, + "grad_norm": 21.434364318847656, + "learning_rate": 5.2007772020725386e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.9103163778781891, + "num_tokens": 5312306.0, + "step": 2965 + }, + { + "epoch": 0.48028499716622136, + "grad_norm": 27.445018768310547, + "learning_rate": 5.199158031088083e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.9068088531494141, + "num_tokens": 5314097.0, + "step": 2966 + }, + { + "epoch": 0.48044692737430167, + "grad_norm": 21.907724380493164, + "learning_rate": 5.197538860103627e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.9169968664646149, + "num_tokens": 5315886.0, + "step": 2967 + }, + { + "epoch": 0.480608857582382, + "grad_norm": 21.375431060791016, + "learning_rate": 5.195919689119171e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.9162167906761169, + "num_tokens": 5317673.0, + "step": 2968 + }, + { + "epoch": 0.4807707877904623, + "grad_norm": 29.330354690551758, + "learning_rate": 5.1943005181347155e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.9070242643356323, + "num_tokens": 5319465.0, + "step": 2969 + }, + { + "epoch": 0.48093271799854265, + "grad_norm": 28.010086059570312, + "learning_rate": 5.192681347150259e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.903486430644989, + "num_tokens": 5321246.0, + "step": 2970 + }, + { + "epoch": 0.48109464820662295, + "grad_norm": 31.358449935913086, + "learning_rate": 5.1910621761658035e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.9015277624130249, + "num_tokens": 5323052.0, + "step": 2971 + }, + { + "epoch": 0.48125657841470326, + "grad_norm": 25.900650024414062, + "learning_rate": 5.189443005181347e-06, + "loss": 0.747, + "mean_token_accuracy": 0.9117058515548706, + "num_tokens": 5324836.0, + "step": 2972 + }, + { + "epoch": 0.48141850862278357, + "grad_norm": 18.731250762939453, + "learning_rate": 5.1878238341968915e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.9287525415420532, + "num_tokens": 5326629.0, + "step": 2973 + }, + { + "epoch": 0.4815804388308639, + "grad_norm": 30.721891403198242, + "learning_rate": 5.186204663212435e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.9066331386566162, + "num_tokens": 5328430.0, + "step": 2974 + }, + { + "epoch": 0.48174236903894424, + "grad_norm": 18.47876739501953, + "learning_rate": 5.1845854922279795e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.9295634925365448, + "num_tokens": 5330226.0, + "step": 2975 + }, + { + "epoch": 0.48190429924702455, + "grad_norm": 17.191940307617188, + "learning_rate": 5.182966321243523e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.9239878058433533, + "num_tokens": 5332014.0, + "step": 2976 + }, + { + "epoch": 0.48206622945510486, + "grad_norm": 30.276260375976562, + "learning_rate": 5.1813471502590676e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.9014598727226257, + "num_tokens": 5333800.0, + "step": 2977 + }, + { + "epoch": 0.48222815966318516, + "grad_norm": 20.260787963867188, + "learning_rate": 5.179727979274611e-06, + "loss": 0.5451, + "mean_token_accuracy": 0.9211378395557404, + "num_tokens": 5335591.0, + "step": 2978 + }, + { + "epoch": 0.48239008987126547, + "grad_norm": 21.172622680664062, + "learning_rate": 5.178108808290156e-06, + "loss": 0.6007, + "mean_token_accuracy": 0.9207678437232971, + "num_tokens": 5337368.0, + "step": 2979 + }, + { + "epoch": 0.4825520200793458, + "grad_norm": 25.751380920410156, + "learning_rate": 5.176489637305699e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.9152900874614716, + "num_tokens": 5339162.0, + "step": 2980 + }, + { + "epoch": 0.48271395028742614, + "grad_norm": 16.326499938964844, + "learning_rate": 5.174870466321244e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.9212839901447296, + "num_tokens": 5340954.0, + "step": 2981 + }, + { + "epoch": 0.48287588049550645, + "grad_norm": 22.657331466674805, + "learning_rate": 5.173251295336787e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.9103802740573883, + "num_tokens": 5342745.0, + "step": 2982 + }, + { + "epoch": 0.48303781070358676, + "grad_norm": 27.320241928100586, + "learning_rate": 5.171632124352332e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.9119634330272675, + "num_tokens": 5344541.0, + "step": 2983 + }, + { + "epoch": 0.48319974091166706, + "grad_norm": 26.14234161376953, + "learning_rate": 5.170012953367875e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.9091990888118744, + "num_tokens": 5346328.0, + "step": 2984 + }, + { + "epoch": 0.48336167111974737, + "grad_norm": 35.31435775756836, + "learning_rate": 5.16839378238342e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.9124755561351776, + "num_tokens": 5348126.0, + "step": 2985 + }, + { + "epoch": 0.4835236013278277, + "grad_norm": 22.177021026611328, + "learning_rate": 5.166774611398963e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.924717366695404, + "num_tokens": 5349917.0, + "step": 2986 + }, + { + "epoch": 0.48368553153590804, + "grad_norm": 20.260520935058594, + "learning_rate": 5.165155440414508e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.9114106595516205, + "num_tokens": 5351700.0, + "step": 2987 + }, + { + "epoch": 0.48384746174398835, + "grad_norm": 22.355167388916016, + "learning_rate": 5.163536269430052e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 5353485.0, + "step": 2988 + }, + { + "epoch": 0.48400939195206866, + "grad_norm": 23.14637565612793, + "learning_rate": 5.161917098445596e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.9128788113594055, + "num_tokens": 5355284.0, + "step": 2989 + }, + { + "epoch": 0.48417132216014896, + "grad_norm": 21.406024932861328, + "learning_rate": 5.16029792746114e-06, + "loss": 0.5813, + "mean_token_accuracy": 0.9181225001811981, + "num_tokens": 5357065.0, + "step": 2990 + }, + { + "epoch": 0.4843332523682293, + "grad_norm": 16.63289451599121, + "learning_rate": 5.158678756476684e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.9243197441101074, + "num_tokens": 5358868.0, + "step": 2991 + }, + { + "epoch": 0.48449518257630964, + "grad_norm": 22.046833038330078, + "learning_rate": 5.157059585492228e-06, + "loss": 0.624, + "mean_token_accuracy": 0.9187424778938293, + "num_tokens": 5360663.0, + "step": 2992 + }, + { + "epoch": 0.48465711278438994, + "grad_norm": 26.663820266723633, + "learning_rate": 5.155440414507773e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.9023735225200653, + "num_tokens": 5362452.0, + "step": 2993 + }, + { + "epoch": 0.48481904299247025, + "grad_norm": 24.68050193786621, + "learning_rate": 5.153821243523317e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.9097970426082611, + "num_tokens": 5364242.0, + "step": 2994 + }, + { + "epoch": 0.48498097320055056, + "grad_norm": 27.26254653930664, + "learning_rate": 5.1522020725388615e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.8963338732719421, + "num_tokens": 5366032.0, + "step": 2995 + }, + { + "epoch": 0.48514290340863087, + "grad_norm": 21.771852493286133, + "learning_rate": 5.150582901554405e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.9178571403026581, + "num_tokens": 5367824.0, + "step": 2996 + }, + { + "epoch": 0.4853048336167112, + "grad_norm": 18.900741577148438, + "learning_rate": 5.1489637305699495e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9257739782333374, + "num_tokens": 5369618.0, + "step": 2997 + }, + { + "epoch": 0.48546676382479154, + "grad_norm": 37.06663131713867, + "learning_rate": 5.147344559585493e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.9115897119045258, + "num_tokens": 5371401.0, + "step": 2998 + }, + { + "epoch": 0.48562869403287184, + "grad_norm": 27.482330322265625, + "learning_rate": 5.1457253886010375e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.9165938198566437, + "num_tokens": 5373200.0, + "step": 2999 + }, + { + "epoch": 0.48579062424095215, + "grad_norm": 22.539199829101562, + "learning_rate": 5.144106217616581e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.9197974801063538, + "num_tokens": 5374986.0, + "step": 3000 + }, + { + "epoch": 0.48595255444903246, + "grad_norm": 24.7188777923584, + "learning_rate": 5.1424870466321256e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.9130434989929199, + "num_tokens": 5376774.0, + "step": 3001 + }, + { + "epoch": 0.48611448465711277, + "grad_norm": 27.121692657470703, + "learning_rate": 5.140867875647669e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.9027452766895294, + "num_tokens": 5378582.0, + "step": 3002 + }, + { + "epoch": 0.48627641486519313, + "grad_norm": 32.31459426879883, + "learning_rate": 5.139248704663214e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.9036190807819366, + "num_tokens": 5380364.0, + "step": 3003 + }, + { + "epoch": 0.48643834507327344, + "grad_norm": 18.34467124938965, + "learning_rate": 5.137629533678757e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.9326991438865662, + "num_tokens": 5382157.0, + "step": 3004 + }, + { + "epoch": 0.48660027528135374, + "grad_norm": 31.747346878051758, + "learning_rate": 5.136010362694302e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.9042553305625916, + "num_tokens": 5383951.0, + "step": 3005 + }, + { + "epoch": 0.48676220548943405, + "grad_norm": 35.95388412475586, + "learning_rate": 5.134391191709845e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.8956834673881531, + "num_tokens": 5385741.0, + "step": 3006 + }, + { + "epoch": 0.48692413569751436, + "grad_norm": 33.44746398925781, + "learning_rate": 5.13277202072539e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.9012691080570221, + "num_tokens": 5387527.0, + "step": 3007 + }, + { + "epoch": 0.48708606590559467, + "grad_norm": 25.80466079711914, + "learning_rate": 5.131152849740933e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.8974113762378693, + "num_tokens": 5389321.0, + "step": 3008 + }, + { + "epoch": 0.48724799611367503, + "grad_norm": 35.45637512207031, + "learning_rate": 5.129533678756478e-06, + "loss": 1.035, + "mean_token_accuracy": 0.8858107924461365, + "num_tokens": 5391121.0, + "step": 3009 + }, + { + "epoch": 0.48740992632175534, + "grad_norm": 25.505640029907227, + "learning_rate": 5.127914507772021e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.9088345766067505, + "num_tokens": 5392906.0, + "step": 3010 + }, + { + "epoch": 0.48757185652983565, + "grad_norm": 24.818864822387695, + "learning_rate": 5.126295336787566e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.909489631652832, + "num_tokens": 5394683.0, + "step": 3011 + }, + { + "epoch": 0.48773378673791595, + "grad_norm": 28.48109245300293, + "learning_rate": 5.124676165803109e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.9065458476543427, + "num_tokens": 5396485.0, + "step": 3012 + }, + { + "epoch": 0.48789571694599626, + "grad_norm": 36.29601287841797, + "learning_rate": 5.123056994818654e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.8908576667308807, + "num_tokens": 5398281.0, + "step": 3013 + }, + { + "epoch": 0.48805764715407657, + "grad_norm": 22.847387313842773, + "learning_rate": 5.121437823834198e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.9162943363189697, + "num_tokens": 5400080.0, + "step": 3014 + }, + { + "epoch": 0.48821957736215693, + "grad_norm": 24.827085494995117, + "learning_rate": 5.119818652849742e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.9145347476005554, + "num_tokens": 5401871.0, + "step": 3015 + }, + { + "epoch": 0.48838150757023724, + "grad_norm": 27.44051742553711, + "learning_rate": 5.118199481865286e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.8899087309837341, + "num_tokens": 5403664.0, + "step": 3016 + }, + { + "epoch": 0.48854343777831755, + "grad_norm": 20.68699836730957, + "learning_rate": 5.11658031088083e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.9229323267936707, + "num_tokens": 5405449.0, + "step": 3017 + }, + { + "epoch": 0.48870536798639785, + "grad_norm": 21.14838409423828, + "learning_rate": 5.114961139896374e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.9212149381637573, + "num_tokens": 5407240.0, + "step": 3018 + }, + { + "epoch": 0.48886729819447816, + "grad_norm": 24.232927322387695, + "learning_rate": 5.113341968911918e-06, + "loss": 0.6271, + "mean_token_accuracy": 0.9087995290756226, + "num_tokens": 5409027.0, + "step": 3019 + }, + { + "epoch": 0.4890292284025585, + "grad_norm": 24.3666934967041, + "learning_rate": 5.111722797927462e-06, + "loss": 0.57, + "mean_token_accuracy": 0.9190065264701843, + "num_tokens": 5410835.0, + "step": 3020 + }, + { + "epoch": 0.48919115861063883, + "grad_norm": 31.462709426879883, + "learning_rate": 5.110103626943006e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.8957125246524811, + "num_tokens": 5412625.0, + "step": 3021 + }, + { + "epoch": 0.48935308881871914, + "grad_norm": 39.11698913574219, + "learning_rate": 5.10848445595855e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.895560085773468, + "num_tokens": 5414424.0, + "step": 3022 + }, + { + "epoch": 0.48951501902679945, + "grad_norm": 19.440439224243164, + "learning_rate": 5.106865284974094e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.9205517172813416, + "num_tokens": 5416213.0, + "step": 3023 + }, + { + "epoch": 0.48967694923487975, + "grad_norm": 23.849119186401367, + "learning_rate": 5.105246113989638e-06, + "loss": 0.565, + "mean_token_accuracy": 0.9218370020389557, + "num_tokens": 5418006.0, + "step": 3024 + }, + { + "epoch": 0.48983887944296006, + "grad_norm": 30.319480895996094, + "learning_rate": 5.103626943005182e-06, + "loss": 0.764, + "mean_token_accuracy": 0.8976001739501953, + "num_tokens": 5419811.0, + "step": 3025 + }, + { + "epoch": 0.4900008096510404, + "grad_norm": 25.434173583984375, + "learning_rate": 5.102007772020726e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.9194001257419586, + "num_tokens": 5421608.0, + "step": 3026 + }, + { + "epoch": 0.49016273985912073, + "grad_norm": 18.40189552307129, + "learning_rate": 5.10038860103627e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.923776239156723, + "num_tokens": 5423393.0, + "step": 3027 + }, + { + "epoch": 0.49032467006720104, + "grad_norm": 22.239166259765625, + "learning_rate": 5.098769430051814e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.9151596128940582, + "num_tokens": 5425176.0, + "step": 3028 + }, + { + "epoch": 0.49048660027528135, + "grad_norm": 28.200517654418945, + "learning_rate": 5.097150259067358e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.9147758781909943, + "num_tokens": 5426970.0, + "step": 3029 + }, + { + "epoch": 0.49064853048336166, + "grad_norm": 11.054727554321289, + "learning_rate": 5.095531088082902e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.940364271402359, + "num_tokens": 5428768.0, + "step": 3030 + }, + { + "epoch": 0.49081046069144196, + "grad_norm": 19.688766479492188, + "learning_rate": 5.093911917098446e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.9120773077011108, + "num_tokens": 5430553.0, + "step": 3031 + }, + { + "epoch": 0.4909723908995223, + "grad_norm": 23.41379737854004, + "learning_rate": 5.09229274611399e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.9201680421829224, + "num_tokens": 5432341.0, + "step": 3032 + }, + { + "epoch": 0.49113432110760263, + "grad_norm": 13.503064155578613, + "learning_rate": 5.090673575129535e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.9277893602848053, + "num_tokens": 5434130.0, + "step": 3033 + }, + { + "epoch": 0.49129625131568294, + "grad_norm": 21.570785522460938, + "learning_rate": 5.089054404145078e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.9215146005153656, + "num_tokens": 5435922.0, + "step": 3034 + }, + { + "epoch": 0.49145818152376325, + "grad_norm": 19.1263427734375, + "learning_rate": 5.087435233160623e-06, + "loss": 0.539, + "mean_token_accuracy": 0.9208633005619049, + "num_tokens": 5437712.0, + "step": 3035 + }, + { + "epoch": 0.49162011173184356, + "grad_norm": 22.503528594970703, + "learning_rate": 5.0858160621761664e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.9201492369174957, + "num_tokens": 5439498.0, + "step": 3036 + }, + { + "epoch": 0.4917820419399239, + "grad_norm": 19.080575942993164, + "learning_rate": 5.084196891191711e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.9224673211574554, + "num_tokens": 5441281.0, + "step": 3037 + }, + { + "epoch": 0.4919439721480042, + "grad_norm": 22.11385154724121, + "learning_rate": 5.0825777202072545e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.9240615367889404, + "num_tokens": 5443070.0, + "step": 3038 + }, + { + "epoch": 0.49210590235608453, + "grad_norm": 21.53031349182129, + "learning_rate": 5.080958549222799e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.9276065528392792, + "num_tokens": 5444872.0, + "step": 3039 + }, + { + "epoch": 0.49226783256416484, + "grad_norm": 38.85481643676758, + "learning_rate": 5.0793393782383425e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.8947316110134125, + "num_tokens": 5446668.0, + "step": 3040 + }, + { + "epoch": 0.49242976277224515, + "grad_norm": 21.9334774017334, + "learning_rate": 5.077720207253887e-06, + "loss": 0.5597, + "mean_token_accuracy": 0.9217071831226349, + "num_tokens": 5448461.0, + "step": 3041 + }, + { + "epoch": 0.49259169298032546, + "grad_norm": 25.68813133239746, + "learning_rate": 5.0761010362694305e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.9102693796157837, + "num_tokens": 5450240.0, + "step": 3042 + }, + { + "epoch": 0.4927536231884058, + "grad_norm": 26.759706497192383, + "learning_rate": 5.074481865284975e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.9143833816051483, + "num_tokens": 5452032.0, + "step": 3043 + }, + { + "epoch": 0.49291555339648613, + "grad_norm": 28.56093406677246, + "learning_rate": 5.0728626943005186e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.90159872174263, + "num_tokens": 5453818.0, + "step": 3044 + }, + { + "epoch": 0.49307748360456644, + "grad_norm": 27.25768280029297, + "learning_rate": 5.071243523316063e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.9106077551841736, + "num_tokens": 5455609.0, + "step": 3045 + }, + { + "epoch": 0.49323941381264674, + "grad_norm": 22.179088592529297, + "learning_rate": 5.069624352331607e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.9246068596839905, + "num_tokens": 5457400.0, + "step": 3046 + }, + { + "epoch": 0.49340134402072705, + "grad_norm": 29.503969192504883, + "learning_rate": 5.068005181347151e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.9218687415122986, + "num_tokens": 5459195.0, + "step": 3047 + }, + { + "epoch": 0.49356327422880736, + "grad_norm": 31.340065002441406, + "learning_rate": 5.066386010362695e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.9051197171211243, + "num_tokens": 5460992.0, + "step": 3048 + }, + { + "epoch": 0.4937252044368877, + "grad_norm": 19.16147232055664, + "learning_rate": 5.064766839378239e-06, + "loss": 0.505, + "mean_token_accuracy": 0.9289807379245758, + "num_tokens": 5462785.0, + "step": 3049 + }, + { + "epoch": 0.49388713464496803, + "grad_norm": 25.503454208374023, + "learning_rate": 5.063147668393783e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.9213643670082092, + "num_tokens": 5464577.0, + "step": 3050 + }, + { + "epoch": 0.49404906485304834, + "grad_norm": 29.64555549621582, + "learning_rate": 5.061528497409327e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.9025370180606842, + "num_tokens": 5466376.0, + "step": 3051 + }, + { + "epoch": 0.49421099506112864, + "grad_norm": 26.07890510559082, + "learning_rate": 5.0599093264248715e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.9245370626449585, + "num_tokens": 5468167.0, + "step": 3052 + }, + { + "epoch": 0.49437292526920895, + "grad_norm": 29.486907958984375, + "learning_rate": 5.058290155440415e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.8994667828083038, + "num_tokens": 5469957.0, + "step": 3053 + }, + { + "epoch": 0.4945348554772893, + "grad_norm": 34.593841552734375, + "learning_rate": 5.0566709844559595e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.9030612111091614, + "num_tokens": 5471756.0, + "step": 3054 + }, + { + "epoch": 0.4946967856853696, + "grad_norm": 36.80933380126953, + "learning_rate": 5.055051813471503e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.8978835940361023, + "num_tokens": 5473543.0, + "step": 3055 + }, + { + "epoch": 0.49485871589344993, + "grad_norm": 20.747751235961914, + "learning_rate": 5.0534326424870476e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.9206026494503021, + "num_tokens": 5475332.0, + "step": 3056 + }, + { + "epoch": 0.49502064610153024, + "grad_norm": 33.02836227416992, + "learning_rate": 5.051813471502591e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.9016009867191315, + "num_tokens": 5477129.0, + "step": 3057 + }, + { + "epoch": 0.49518257630961054, + "grad_norm": 29.7998104095459, + "learning_rate": 5.050194300518136e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.9015302062034607, + "num_tokens": 5478925.0, + "step": 3058 + }, + { + "epoch": 0.49534450651769085, + "grad_norm": 29.436275482177734, + "learning_rate": 5.048575129533679e-06, + "loss": 0.6536, + "mean_token_accuracy": 0.9068117141723633, + "num_tokens": 5480715.0, + "step": 3059 + }, + { + "epoch": 0.4955064367257712, + "grad_norm": 20.1005859375, + "learning_rate": 5.046955958549224e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.9200661182403564, + "num_tokens": 5482501.0, + "step": 3060 + }, + { + "epoch": 0.4956683669338515, + "grad_norm": 44.073246002197266, + "learning_rate": 5.045336787564767e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.883424699306488, + "num_tokens": 5484304.0, + "step": 3061 + }, + { + "epoch": 0.49583029714193183, + "grad_norm": 22.542896270751953, + "learning_rate": 5.043717616580312e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.9309523701667786, + "num_tokens": 5486091.0, + "step": 3062 + }, + { + "epoch": 0.49599222735001214, + "grad_norm": 14.202231407165527, + "learning_rate": 5.042098445595855e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.929380863904953, + "num_tokens": 5487872.0, + "step": 3063 + }, + { + "epoch": 0.49615415755809245, + "grad_norm": 29.298137664794922, + "learning_rate": 5.0404792746114e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.9045239984989166, + "num_tokens": 5489668.0, + "step": 3064 + }, + { + "epoch": 0.49631608776617275, + "grad_norm": 24.303762435913086, + "learning_rate": 5.038860103626943e-06, + "loss": 0.5539, + "mean_token_accuracy": 0.9151596128940582, + "num_tokens": 5491451.0, + "step": 3065 + }, + { + "epoch": 0.4964780179742531, + "grad_norm": 24.276966094970703, + "learning_rate": 5.037240932642488e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.9299584329128265, + "num_tokens": 5493249.0, + "step": 3066 + }, + { + "epoch": 0.4966399481823334, + "grad_norm": 28.88763427734375, + "learning_rate": 5.035621761658031e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.9113465547561646, + "num_tokens": 5495032.0, + "step": 3067 + }, + { + "epoch": 0.49680187839041373, + "grad_norm": 14.301376342773438, + "learning_rate": 5.034002590673576e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.9395537674427032, + "num_tokens": 5496825.0, + "step": 3068 + }, + { + "epoch": 0.49696380859849404, + "grad_norm": 35.044189453125, + "learning_rate": 5.032383419689119e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.8989678025245667, + "num_tokens": 5498614.0, + "step": 3069 + }, + { + "epoch": 0.49712573880657435, + "grad_norm": 26.74432945251465, + "learning_rate": 5.030764248704664e-06, + "loss": 0.588, + "mean_token_accuracy": 0.9208264350891113, + "num_tokens": 5500404.0, + "step": 3070 + }, + { + "epoch": 0.4972876690146547, + "grad_norm": 25.613605499267578, + "learning_rate": 5.029145077720208e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.9144460260868073, + "num_tokens": 5502185.0, + "step": 3071 + }, + { + "epoch": 0.497449599222735, + "grad_norm": 27.34073829650879, + "learning_rate": 5.027525906735752e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.9103453755378723, + "num_tokens": 5503976.0, + "step": 3072 + }, + { + "epoch": 0.4976115294308153, + "grad_norm": 17.895008087158203, + "learning_rate": 5.025906735751296e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9291643500328064, + "num_tokens": 5505770.0, + "step": 3073 + }, + { + "epoch": 0.49777345963889563, + "grad_norm": 21.76251983642578, + "learning_rate": 5.02428756476684e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.9258969724178314, + "num_tokens": 5507564.0, + "step": 3074 + }, + { + "epoch": 0.49793538984697594, + "grad_norm": 30.428112030029297, + "learning_rate": 5.022668393782384e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.9073173403739929, + "num_tokens": 5509357.0, + "step": 3075 + }, + { + "epoch": 0.49809732005505625, + "grad_norm": 32.509490966796875, + "learning_rate": 5.021049222797928e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.9009523689746857, + "num_tokens": 5511159.0, + "step": 3076 + }, + { + "epoch": 0.4982592502631366, + "grad_norm": 22.4287166595459, + "learning_rate": 5.019430051813472e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.9184782803058624, + "num_tokens": 5512953.0, + "step": 3077 + }, + { + "epoch": 0.4984211804712169, + "grad_norm": 31.916921615600586, + "learning_rate": 5.017810880829016e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.896999180316925, + "num_tokens": 5514746.0, + "step": 3078 + }, + { + "epoch": 0.4985831106792972, + "grad_norm": 24.6771297454834, + "learning_rate": 5.01619170984456e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.9120295643806458, + "num_tokens": 5516544.0, + "step": 3079 + }, + { + "epoch": 0.49874504088737753, + "grad_norm": 24.779296875, + "learning_rate": 5.014572538860104e-06, + "loss": 0.615, + "mean_token_accuracy": 0.9171842634677887, + "num_tokens": 5518334.0, + "step": 3080 + }, + { + "epoch": 0.49890697109545784, + "grad_norm": 13.718116760253906, + "learning_rate": 5.012953367875648e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.9293532371520996, + "num_tokens": 5520115.0, + "step": 3081 + }, + { + "epoch": 0.49906890130353815, + "grad_norm": 23.878095626831055, + "learning_rate": 5.011334196891192e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9238230586051941, + "num_tokens": 5521913.0, + "step": 3082 + }, + { + "epoch": 0.4992308315116185, + "grad_norm": 23.816272735595703, + "learning_rate": 5.009715025906736e-06, + "loss": 0.589, + "mean_token_accuracy": 0.9171972870826721, + "num_tokens": 5523703.0, + "step": 3083 + }, + { + "epoch": 0.4993927617196988, + "grad_norm": 25.58169174194336, + "learning_rate": 5.00809585492228e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.9080895781517029, + "num_tokens": 5525509.0, + "step": 3084 + }, + { + "epoch": 0.4995546919277791, + "grad_norm": 26.280405044555664, + "learning_rate": 5.006476683937824e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.9057921469211578, + "num_tokens": 5527297.0, + "step": 3085 + }, + { + "epoch": 0.49971662213585943, + "grad_norm": 25.21198272705078, + "learning_rate": 5.004857512953368e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.9059889316558838, + "num_tokens": 5529086.0, + "step": 3086 + }, + { + "epoch": 0.49987855234393974, + "grad_norm": 22.442089080810547, + "learning_rate": 5.003238341968912e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.9139516949653625, + "num_tokens": 5530877.0, + "step": 3087 + }, + { + "epoch": 0.50004048255202, + "grad_norm": 21.99388885498047, + "learning_rate": 5.001619170984456e-06, + "loss": 0.6024, + "mean_token_accuracy": 0.9236485958099365, + "num_tokens": 5532664.0, + "step": 3088 + }, + { + "epoch": 0.5002024127601004, + "grad_norm": 23.198711395263672, + "learning_rate": 5e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.927532434463501, + "num_tokens": 5534452.0, + "step": 3089 + }, + { + "epoch": 0.5003643429681807, + "grad_norm": 34.14920425415039, + "learning_rate": 4.998380829015545e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.8876847326755524, + "num_tokens": 5536249.0, + "step": 3090 + }, + { + "epoch": 0.5005262731762611, + "grad_norm": 30.41759490966797, + "learning_rate": 4.9967616580310884e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.8971927165985107, + "num_tokens": 5538043.0, + "step": 3091 + }, + { + "epoch": 0.5006882033843414, + "grad_norm": 33.016761779785156, + "learning_rate": 4.995142487046633e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.906521737575531, + "num_tokens": 5539833.0, + "step": 3092 + }, + { + "epoch": 0.5008501335924217, + "grad_norm": 18.510164260864258, + "learning_rate": 4.9935233160621765e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.9204118847846985, + "num_tokens": 5541622.0, + "step": 3093 + }, + { + "epoch": 0.501012063800502, + "grad_norm": 22.790176391601562, + "learning_rate": 4.991904145077721e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.920228511095047, + "num_tokens": 5543409.0, + "step": 3094 + }, + { + "epoch": 0.5011739940085823, + "grad_norm": 19.51079750061035, + "learning_rate": 4.9902849740932645e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.928893506526947, + "num_tokens": 5545202.0, + "step": 3095 + }, + { + "epoch": 0.5013359242166626, + "grad_norm": 31.858049392700195, + "learning_rate": 4.988665803108809e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.9172256290912628, + "num_tokens": 5547004.0, + "step": 3096 + }, + { + "epoch": 0.5014978544247429, + "grad_norm": 23.543933868408203, + "learning_rate": 4.9870466321243525e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.9173052906990051, + "num_tokens": 5548794.0, + "step": 3097 + }, + { + "epoch": 0.5016597846328232, + "grad_norm": 23.4957332611084, + "learning_rate": 4.985427461139897e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.9063536524772644, + "num_tokens": 5550584.0, + "step": 3098 + }, + { + "epoch": 0.5018217148409035, + "grad_norm": 33.95772171020508, + "learning_rate": 4.9838082901554405e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.8994092047214508, + "num_tokens": 5552374.0, + "step": 3099 + }, + { + "epoch": 0.5019836450489839, + "grad_norm": 25.336759567260742, + "learning_rate": 4.982189119170985e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.9056650102138519, + "num_tokens": 5554171.0, + "step": 3100 + }, + { + "epoch": 0.5021455752570642, + "grad_norm": 27.51428985595703, + "learning_rate": 4.9805699481865286e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.9198970198631287, + "num_tokens": 5555970.0, + "step": 3101 + }, + { + "epoch": 0.5023075054651446, + "grad_norm": 24.61184310913086, + "learning_rate": 4.978950777202073e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.919047623872757, + "num_tokens": 5557754.0, + "step": 3102 + }, + { + "epoch": 0.5024694356732249, + "grad_norm": 30.319303512573242, + "learning_rate": 4.977331606217617e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.8855316936969757, + "num_tokens": 5559554.0, + "step": 3103 + }, + { + "epoch": 0.5026313658813052, + "grad_norm": 24.716384887695312, + "learning_rate": 4.975712435233161e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.9092400968074799, + "num_tokens": 5561332.0, + "step": 3104 + }, + { + "epoch": 0.5027932960893855, + "grad_norm": 23.923810958862305, + "learning_rate": 4.974093264248705e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.9146403074264526, + "num_tokens": 5563125.0, + "step": 3105 + }, + { + "epoch": 0.5029552262974658, + "grad_norm": 28.690021514892578, + "learning_rate": 4.972474093264249e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.9021420180797577, + "num_tokens": 5564923.0, + "step": 3106 + }, + { + "epoch": 0.5031171565055461, + "grad_norm": 23.697677612304688, + "learning_rate": 4.970854922279793e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.9196009635925293, + "num_tokens": 5566721.0, + "step": 3107 + }, + { + "epoch": 0.5032790867136264, + "grad_norm": 24.035978317260742, + "learning_rate": 4.969235751295337e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.908129870891571, + "num_tokens": 5568515.0, + "step": 3108 + }, + { + "epoch": 0.5034410169217067, + "grad_norm": 17.372512817382812, + "learning_rate": 4.9676165803108815e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.9220321774482727, + "num_tokens": 5570309.0, + "step": 3109 + }, + { + "epoch": 0.503602947129787, + "grad_norm": 31.005531311035156, + "learning_rate": 4.965997409326425e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.9067992568016052, + "num_tokens": 5572100.0, + "step": 3110 + }, + { + "epoch": 0.5037648773378673, + "grad_norm": 22.811525344848633, + "learning_rate": 4.9643782383419695e-06, + "loss": 0.5433, + "mean_token_accuracy": 0.9227941036224365, + "num_tokens": 5573884.0, + "step": 3111 + }, + { + "epoch": 0.5039268075459477, + "grad_norm": 33.46623229980469, + "learning_rate": 4.962759067357513e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.9057226777076721, + "num_tokens": 5575682.0, + "step": 3112 + }, + { + "epoch": 0.504088737754028, + "grad_norm": 22.027273178100586, + "learning_rate": 4.9611398963730576e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.9185907244682312, + "num_tokens": 5577477.0, + "step": 3113 + }, + { + "epoch": 0.5042506679621084, + "grad_norm": 27.57921600341797, + "learning_rate": 4.959520725388601e-06, + "loss": 0.6476, + "mean_token_accuracy": 0.9070360660552979, + "num_tokens": 5579269.0, + "step": 3114 + }, + { + "epoch": 0.5044125981701887, + "grad_norm": 27.70969581604004, + "learning_rate": 4.957901554404146e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.9250127077102661, + "num_tokens": 5581062.0, + "step": 3115 + }, + { + "epoch": 0.504574528378269, + "grad_norm": 32.048744201660156, + "learning_rate": 4.956282383419689e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.9077905118465424, + "num_tokens": 5582867.0, + "step": 3116 + }, + { + "epoch": 0.5047364585863493, + "grad_norm": 30.043624877929688, + "learning_rate": 4.954663212435234e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.8915310800075531, + "num_tokens": 5584656.0, + "step": 3117 + }, + { + "epoch": 0.5048983887944296, + "grad_norm": 16.69049072265625, + "learning_rate": 4.953044041450777e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9252786040306091, + "num_tokens": 5586449.0, + "step": 3118 + }, + { + "epoch": 0.5050603190025099, + "grad_norm": 24.523834228515625, + "learning_rate": 4.951424870466322e-06, + "loss": 0.749, + "mean_token_accuracy": 0.8919464945793152, + "num_tokens": 5588239.0, + "step": 3119 + }, + { + "epoch": 0.5052222492105902, + "grad_norm": 18.17458152770996, + "learning_rate": 4.949805699481865e-06, + "loss": 0.629, + "mean_token_accuracy": 0.9185140430927277, + "num_tokens": 5590021.0, + "step": 3120 + }, + { + "epoch": 0.5053841794186705, + "grad_norm": 27.78891372680664, + "learning_rate": 4.94818652849741e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.9081889390945435, + "num_tokens": 5591815.0, + "step": 3121 + }, + { + "epoch": 0.5055461096267508, + "grad_norm": 26.635297775268555, + "learning_rate": 4.946567357512953e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.9131884276866913, + "num_tokens": 5593615.0, + "step": 3122 + }, + { + "epoch": 0.5057080398348311, + "grad_norm": 15.540252685546875, + "learning_rate": 4.944948186528498e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.926991730928421, + "num_tokens": 5595401.0, + "step": 3123 + }, + { + "epoch": 0.5058699700429115, + "grad_norm": 25.944604873657227, + "learning_rate": 4.943329015544041e-06, + "loss": 0.544, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 5597186.0, + "step": 3124 + }, + { + "epoch": 0.5060319002509919, + "grad_norm": 27.062040328979492, + "learning_rate": 4.941709844559586e-06, + "loss": 0.6089, + "mean_token_accuracy": 0.9166885912418365, + "num_tokens": 5598974.0, + "step": 3125 + }, + { + "epoch": 0.5061938304590722, + "grad_norm": 35.009090423583984, + "learning_rate": 4.940090673575129e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.8907828330993652, + "num_tokens": 5600762.0, + "step": 3126 + }, + { + "epoch": 0.5063557606671525, + "grad_norm": 23.119449615478516, + "learning_rate": 4.938471502590674e-06, + "loss": 0.589, + "mean_token_accuracy": 0.9234496355056763, + "num_tokens": 5602547.0, + "step": 3127 + }, + { + "epoch": 0.5065176908752328, + "grad_norm": 31.09634780883789, + "learning_rate": 4.936852331606218e-06, + "loss": 0.685, + "mean_token_accuracy": 0.8980016112327576, + "num_tokens": 5604333.0, + "step": 3128 + }, + { + "epoch": 0.5066796210833131, + "grad_norm": 23.628664016723633, + "learning_rate": 4.935233160621762e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.8974365592002869, + "num_tokens": 5606127.0, + "step": 3129 + }, + { + "epoch": 0.5068415512913934, + "grad_norm": 23.543800354003906, + "learning_rate": 4.933613989637306e-06, + "loss": 0.6196, + "mean_token_accuracy": 0.9214285612106323, + "num_tokens": 5607919.0, + "step": 3130 + }, + { + "epoch": 0.5070034814994737, + "grad_norm": 18.900848388671875, + "learning_rate": 4.93199481865285e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.9285568594932556, + "num_tokens": 5609711.0, + "step": 3131 + }, + { + "epoch": 0.507165411707554, + "grad_norm": 23.889690399169922, + "learning_rate": 4.930375647668394e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.9176711738109589, + "num_tokens": 5611513.0, + "step": 3132 + }, + { + "epoch": 0.5073273419156343, + "grad_norm": 25.092618942260742, + "learning_rate": 4.928756476683938e-06, + "loss": 0.6514, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 5613297.0, + "step": 3133 + }, + { + "epoch": 0.5074892721237146, + "grad_norm": 22.767011642456055, + "learning_rate": 4.927137305699482e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.9263209402561188, + "num_tokens": 5615095.0, + "step": 3134 + }, + { + "epoch": 0.507651202331795, + "grad_norm": 36.97871398925781, + "learning_rate": 4.925518134715026e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.9035714268684387, + "num_tokens": 5616887.0, + "step": 3135 + }, + { + "epoch": 0.5078131325398754, + "grad_norm": 31.4343318939209, + "learning_rate": 4.92389896373057e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.8829307556152344, + "num_tokens": 5618672.0, + "step": 3136 + }, + { + "epoch": 0.5079750627479557, + "grad_norm": 24.045391082763672, + "learning_rate": 4.922279792746114e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.9164723455905914, + "num_tokens": 5620471.0, + "step": 3137 + }, + { + "epoch": 0.508136992956036, + "grad_norm": 23.62331771850586, + "learning_rate": 4.920660621761658e-06, + "loss": 0.5726, + "mean_token_accuracy": 0.9205517172813416, + "num_tokens": 5622260.0, + "step": 3138 + }, + { + "epoch": 0.5082989231641163, + "grad_norm": 23.12000274658203, + "learning_rate": 4.919041450777203e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.9199735522270203, + "num_tokens": 5624047.0, + "step": 3139 + }, + { + "epoch": 0.5084608533721966, + "grad_norm": 32.7731819152832, + "learning_rate": 4.917422279792747e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.8896402716636658, + "num_tokens": 5625848.0, + "step": 3140 + }, + { + "epoch": 0.5086227835802769, + "grad_norm": 25.463199615478516, + "learning_rate": 4.915803108808291e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9107498526573181, + "num_tokens": 5627629.0, + "step": 3141 + }, + { + "epoch": 0.5087847137883572, + "grad_norm": 29.587570190429688, + "learning_rate": 4.914183937823835e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.9059281051158905, + "num_tokens": 5629417.0, + "step": 3142 + }, + { + "epoch": 0.5089466439964375, + "grad_norm": 24.350513458251953, + "learning_rate": 4.912564766839379e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.9228169620037079, + "num_tokens": 5631201.0, + "step": 3143 + }, + { + "epoch": 0.5091085742045178, + "grad_norm": 18.100515365600586, + "learning_rate": 4.910945595854923e-06, + "loss": 0.5784, + "mean_token_accuracy": 0.9295460283756256, + "num_tokens": 5632997.0, + "step": 3144 + }, + { + "epoch": 0.5092705044125981, + "grad_norm": 28.804550170898438, + "learning_rate": 4.909326424870467e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.8866026103496552, + "num_tokens": 5634791.0, + "step": 3145 + }, + { + "epoch": 0.5094324346206784, + "grad_norm": 29.25710678100586, + "learning_rate": 4.907707253886011e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.907142847776413, + "num_tokens": 5636583.0, + "step": 3146 + }, + { + "epoch": 0.5095943648287588, + "grad_norm": 31.203798294067383, + "learning_rate": 4.906088082901555e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.901867538690567, + "num_tokens": 5638380.0, + "step": 3147 + }, + { + "epoch": 0.5097562950368392, + "grad_norm": 27.93453598022461, + "learning_rate": 4.904468911917099e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.9079633057117462, + "num_tokens": 5640176.0, + "step": 3148 + }, + { + "epoch": 0.5099182252449195, + "grad_norm": 27.137514114379883, + "learning_rate": 4.902849740932643e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.9014925360679626, + "num_tokens": 5641962.0, + "step": 3149 + }, + { + "epoch": 0.5100801554529998, + "grad_norm": 27.561450958251953, + "learning_rate": 4.901230569948187e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.908223420381546, + "num_tokens": 5643757.0, + "step": 3150 + }, + { + "epoch": 0.5102420856610801, + "grad_norm": 31.550500869750977, + "learning_rate": 4.899611398963731e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.9077979624271393, + "num_tokens": 5645541.0, + "step": 3151 + }, + { + "epoch": 0.5104040158691604, + "grad_norm": 36.09164047241211, + "learning_rate": 4.897992227979275e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.8914532661437988, + "num_tokens": 5647337.0, + "step": 3152 + }, + { + "epoch": 0.5105659460772407, + "grad_norm": 19.480131149291992, + "learning_rate": 4.896373056994819e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.9304347932338715, + "num_tokens": 5649122.0, + "step": 3153 + }, + { + "epoch": 0.510727876285321, + "grad_norm": 16.59333610534668, + "learning_rate": 4.894753886010363e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.9314528703689575, + "num_tokens": 5650911.0, + "step": 3154 + }, + { + "epoch": 0.5108898064934013, + "grad_norm": 26.649478912353516, + "learning_rate": 4.893134715025907e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.9250216782093048, + "num_tokens": 5652703.0, + "step": 3155 + }, + { + "epoch": 0.5110517367014816, + "grad_norm": 21.956741333007812, + "learning_rate": 4.891515544041451e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9214300215244293, + "num_tokens": 5654496.0, + "step": 3156 + }, + { + "epoch": 0.5112136669095619, + "grad_norm": 30.248903274536133, + "learning_rate": 4.889896373056995e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.9186813235282898, + "num_tokens": 5656291.0, + "step": 3157 + }, + { + "epoch": 0.5113755971176422, + "grad_norm": 18.268016815185547, + "learning_rate": 4.8882772020725394e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.9188909530639648, + "num_tokens": 5658074.0, + "step": 3158 + }, + { + "epoch": 0.5115375273257227, + "grad_norm": 24.37735939025879, + "learning_rate": 4.886658031088084e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.9012155830860138, + "num_tokens": 5659870.0, + "step": 3159 + }, + { + "epoch": 0.511699457533803, + "grad_norm": 21.751869201660156, + "learning_rate": 4.8850388601036275e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.9097758233547211, + "num_tokens": 5661659.0, + "step": 3160 + }, + { + "epoch": 0.5118613877418833, + "grad_norm": 19.527822494506836, + "learning_rate": 4.883419689119172e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.9254851043224335, + "num_tokens": 5663465.0, + "step": 3161 + }, + { + "epoch": 0.5120233179499636, + "grad_norm": 26.259260177612305, + "learning_rate": 4.8818005181347155e-06, + "loss": 0.598, + "mean_token_accuracy": 0.9196009635925293, + "num_tokens": 5665263.0, + "step": 3162 + }, + { + "epoch": 0.5121852481580439, + "grad_norm": 22.342472076416016, + "learning_rate": 4.88018134715026e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.9328171610832214, + "num_tokens": 5667058.0, + "step": 3163 + }, + { + "epoch": 0.5123471783661242, + "grad_norm": 18.374914169311523, + "learning_rate": 4.8785621761658035e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.9212393462657928, + "num_tokens": 5668850.0, + "step": 3164 + }, + { + "epoch": 0.5125091085742045, + "grad_norm": 22.808595657348633, + "learning_rate": 4.876943005181348e-06, + "loss": 0.6087, + "mean_token_accuracy": 0.9309645891189575, + "num_tokens": 5670636.0, + "step": 3165 + }, + { + "epoch": 0.5126710387822848, + "grad_norm": 21.37262535095215, + "learning_rate": 4.8753238341968915e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.9253092110157013, + "num_tokens": 5672429.0, + "step": 3166 + }, + { + "epoch": 0.5128329689903651, + "grad_norm": 21.01248550415039, + "learning_rate": 4.873704663212436e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.9208920300006866, + "num_tokens": 5674218.0, + "step": 3167 + }, + { + "epoch": 0.5129948991984454, + "grad_norm": 19.431440353393555, + "learning_rate": 4.8720854922279796e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.9172360301017761, + "num_tokens": 5676008.0, + "step": 3168 + }, + { + "epoch": 0.5131568294065257, + "grad_norm": 25.13711929321289, + "learning_rate": 4.870466321243524e-06, + "loss": 0.5437, + "mean_token_accuracy": 0.9104059636592865, + "num_tokens": 5677799.0, + "step": 3169 + }, + { + "epoch": 0.5133187596146062, + "grad_norm": 20.09469223022461, + "learning_rate": 4.868847150259068e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.9192849397659302, + "num_tokens": 5679583.0, + "step": 3170 + }, + { + "epoch": 0.5134806898226865, + "grad_norm": 14.375593185424805, + "learning_rate": 4.867227979274612e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.9386600852012634, + "num_tokens": 5681372.0, + "step": 3171 + }, + { + "epoch": 0.5136426200307668, + "grad_norm": 21.145713806152344, + "learning_rate": 4.865608808290156e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.9228060841560364, + "num_tokens": 5683169.0, + "step": 3172 + }, + { + "epoch": 0.5138045502388471, + "grad_norm": 16.526351928710938, + "learning_rate": 4.8639896373057e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.9345445930957794, + "num_tokens": 5684956.0, + "step": 3173 + }, + { + "epoch": 0.5139664804469274, + "grad_norm": 26.206796646118164, + "learning_rate": 4.862370466321244e-06, + "loss": 0.784, + "mean_token_accuracy": 0.9096069931983948, + "num_tokens": 5686754.0, + "step": 3174 + }, + { + "epoch": 0.5141284106550077, + "grad_norm": 22.978458404541016, + "learning_rate": 4.860751295336788e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.9222849309444427, + "num_tokens": 5688547.0, + "step": 3175 + }, + { + "epoch": 0.514290340863088, + "grad_norm": 34.91913604736328, + "learning_rate": 4.859132124352332e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.9001225531101227, + "num_tokens": 5690339.0, + "step": 3176 + }, + { + "epoch": 0.5144522710711683, + "grad_norm": 21.252050399780273, + "learning_rate": 4.857512953367876e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9163931012153625, + "num_tokens": 5692126.0, + "step": 3177 + }, + { + "epoch": 0.5146142012792486, + "grad_norm": 16.567798614501953, + "learning_rate": 4.8558937823834205e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.930802047252655, + "num_tokens": 5693912.0, + "step": 3178 + }, + { + "epoch": 0.5147761314873289, + "grad_norm": 24.064016342163086, + "learning_rate": 4.854274611398964e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.9217752516269684, + "num_tokens": 5695705.0, + "step": 3179 + }, + { + "epoch": 0.5149380616954092, + "grad_norm": 24.991971969604492, + "learning_rate": 4.8526554404145086e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.9147057235240936, + "num_tokens": 5697497.0, + "step": 3180 + }, + { + "epoch": 0.5150999919034897, + "grad_norm": 29.236038208007812, + "learning_rate": 4.851036269430052e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.9132352769374847, + "num_tokens": 5699285.0, + "step": 3181 + }, + { + "epoch": 0.51526192211157, + "grad_norm": 21.264066696166992, + "learning_rate": 4.849417098445597e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.9265628457069397, + "num_tokens": 5701069.0, + "step": 3182 + }, + { + "epoch": 0.5154238523196503, + "grad_norm": 24.06795883178711, + "learning_rate": 4.84779792746114e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.9138657748699188, + "num_tokens": 5702848.0, + "step": 3183 + }, + { + "epoch": 0.5155857825277306, + "grad_norm": 30.608985900878906, + "learning_rate": 4.846178756476685e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.9011540710926056, + "num_tokens": 5704643.0, + "step": 3184 + }, + { + "epoch": 0.5157477127358109, + "grad_norm": 27.0435848236084, + "learning_rate": 4.844559585492228e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.9168742299079895, + "num_tokens": 5706444.0, + "step": 3185 + }, + { + "epoch": 0.5159096429438912, + "grad_norm": 25.459152221679688, + "learning_rate": 4.842940414507773e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.9194777309894562, + "num_tokens": 5708230.0, + "step": 3186 + }, + { + "epoch": 0.5160715731519715, + "grad_norm": 34.461395263671875, + "learning_rate": 4.841321243523316e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.8959807753562927, + "num_tokens": 5710011.0, + "step": 3187 + }, + { + "epoch": 0.5162335033600518, + "grad_norm": 22.986474990844727, + "learning_rate": 4.839702072538861e-06, + "loss": 0.5628, + "mean_token_accuracy": 0.9218654632568359, + "num_tokens": 5711804.0, + "step": 3188 + }, + { + "epoch": 0.5163954335681321, + "grad_norm": 19.097637176513672, + "learning_rate": 4.838082901554404e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9208074510097504, + "num_tokens": 5713594.0, + "step": 3189 + }, + { + "epoch": 0.5165573637762124, + "grad_norm": 32.008941650390625, + "learning_rate": 4.836463730569949e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.9115604162216187, + "num_tokens": 5715390.0, + "step": 3190 + }, + { + "epoch": 0.5167192939842927, + "grad_norm": 19.023216247558594, + "learning_rate": 4.834844559585492e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.9270983338356018, + "num_tokens": 5717176.0, + "step": 3191 + }, + { + "epoch": 0.516881224192373, + "grad_norm": 21.860477447509766, + "learning_rate": 4.833225388601037e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.920981764793396, + "num_tokens": 5718966.0, + "step": 3192 + }, + { + "epoch": 0.5170431544004535, + "grad_norm": 30.08136749267578, + "learning_rate": 4.83160621761658e-06, + "loss": 0.5458, + "mean_token_accuracy": 0.9120581448078156, + "num_tokens": 5720782.0, + "step": 3193 + }, + { + "epoch": 0.5172050846085338, + "grad_norm": 24.68195915222168, + "learning_rate": 4.829987046632125e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.9226865768432617, + "num_tokens": 5722578.0, + "step": 3194 + }, + { + "epoch": 0.5173670148166141, + "grad_norm": 24.32181739807129, + "learning_rate": 4.828367875647668e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.9149965345859528, + "num_tokens": 5724384.0, + "step": 3195 + }, + { + "epoch": 0.5175289450246944, + "grad_norm": 16.229299545288086, + "learning_rate": 4.826748704663213e-06, + "loss": 0.6334, + "mean_token_accuracy": 0.9291443824768066, + "num_tokens": 5726164.0, + "step": 3196 + }, + { + "epoch": 0.5176908752327747, + "grad_norm": 20.368091583251953, + "learning_rate": 4.825129533678757e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.9257525205612183, + "num_tokens": 5727944.0, + "step": 3197 + }, + { + "epoch": 0.517852805440855, + "grad_norm": 31.058956146240234, + "learning_rate": 4.823510362694301e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.9202437698841095, + "num_tokens": 5729757.0, + "step": 3198 + }, + { + "epoch": 0.5180147356489353, + "grad_norm": 24.40435791015625, + "learning_rate": 4.821891191709845e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.9134595096111298, + "num_tokens": 5731546.0, + "step": 3199 + }, + { + "epoch": 0.5181766658570156, + "grad_norm": 25.60504150390625, + "learning_rate": 4.820272020725389e-06, + "loss": 0.52, + "mean_token_accuracy": 0.9207285344600677, + "num_tokens": 5733335.0, + "step": 3200 + }, + { + "epoch": 0.5183385960650959, + "grad_norm": 33.42374038696289, + "learning_rate": 4.818652849740933e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.8986698091030121, + "num_tokens": 5735133.0, + "step": 3201 + }, + { + "epoch": 0.5185005262731762, + "grad_norm": 31.141822814941406, + "learning_rate": 4.817033678756477e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.9061620235443115, + "num_tokens": 5736922.0, + "step": 3202 + }, + { + "epoch": 0.5186624564812565, + "grad_norm": 27.583694458007812, + "learning_rate": 4.815414507772021e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.9141042828559875, + "num_tokens": 5738713.0, + "step": 3203 + }, + { + "epoch": 0.518824386689337, + "grad_norm": 24.821687698364258, + "learning_rate": 4.813795336787565e-06, + "loss": 0.5783, + "mean_token_accuracy": 0.9173012375831604, + "num_tokens": 5740503.0, + "step": 3204 + }, + { + "epoch": 0.5189863168974173, + "grad_norm": 22.333023071289062, + "learning_rate": 4.812176165803109e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.9267153441905975, + "num_tokens": 5742288.0, + "step": 3205 + }, + { + "epoch": 0.5191482471054976, + "grad_norm": 27.891273498535156, + "learning_rate": 4.810556994818653e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.9145390093326569, + "num_tokens": 5744081.0, + "step": 3206 + }, + { + "epoch": 0.5193101773135779, + "grad_norm": 42.02451705932617, + "learning_rate": 4.808937823834197e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.8926622867584229, + "num_tokens": 5745890.0, + "step": 3207 + }, + { + "epoch": 0.5194721075216582, + "grad_norm": 26.871685028076172, + "learning_rate": 4.807318652849741e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.9087412357330322, + "num_tokens": 5747675.0, + "step": 3208 + }, + { + "epoch": 0.5196340377297385, + "grad_norm": 28.634862899780273, + "learning_rate": 4.805699481865285e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.9107352197170258, + "num_tokens": 5749467.0, + "step": 3209 + }, + { + "epoch": 0.5197959679378188, + "grad_norm": 16.785398483276367, + "learning_rate": 4.804080310880829e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9255028665065765, + "num_tokens": 5751247.0, + "step": 3210 + }, + { + "epoch": 0.5199578981458991, + "grad_norm": 29.749080657958984, + "learning_rate": 4.802461139896373e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.9064182341098785, + "num_tokens": 5753037.0, + "step": 3211 + }, + { + "epoch": 0.5201198283539794, + "grad_norm": 21.897811889648438, + "learning_rate": 4.800841968911917e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.9309405386447906, + "num_tokens": 5754824.0, + "step": 3212 + }, + { + "epoch": 0.5202817585620597, + "grad_norm": 39.71262741088867, + "learning_rate": 4.799222797927461e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.893934041261673, + "num_tokens": 5756618.0, + "step": 3213 + }, + { + "epoch": 0.52044368877014, + "grad_norm": 19.655574798583984, + "learning_rate": 4.797603626943005e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.931654691696167, + "num_tokens": 5758408.0, + "step": 3214 + }, + { + "epoch": 0.5206056189782204, + "grad_norm": 34.73652648925781, + "learning_rate": 4.7959844559585494e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.8951486647129059, + "num_tokens": 5760206.0, + "step": 3215 + }, + { + "epoch": 0.5207675491863007, + "grad_norm": 25.752023696899414, + "learning_rate": 4.794365284974094e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.9186748266220093, + "num_tokens": 5762000.0, + "step": 3216 + }, + { + "epoch": 0.5209294793943811, + "grad_norm": 23.28451156616211, + "learning_rate": 4.7927461139896375e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.9148925542831421, + "num_tokens": 5763793.0, + "step": 3217 + }, + { + "epoch": 0.5210914096024614, + "grad_norm": 24.838577270507812, + "learning_rate": 4.791126943005182e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.917545735836029, + "num_tokens": 5765572.0, + "step": 3218 + }, + { + "epoch": 0.5212533398105417, + "grad_norm": 21.141876220703125, + "learning_rate": 4.7895077720207255e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9260774850845337, + "num_tokens": 5767368.0, + "step": 3219 + }, + { + "epoch": 0.521415270018622, + "grad_norm": 33.55743408203125, + "learning_rate": 4.78788860103627e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.8968351483345032, + "num_tokens": 5769171.0, + "step": 3220 + }, + { + "epoch": 0.5215772002267023, + "grad_norm": 32.839271545410156, + "learning_rate": 4.7862694300518135e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.9036430418491364, + "num_tokens": 5770963.0, + "step": 3221 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 29.301279067993164, + "learning_rate": 4.784650259067358e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.9256495237350464, + "num_tokens": 5772744.0, + "step": 3222 + }, + { + "epoch": 0.5219010606428629, + "grad_norm": 20.398630142211914, + "learning_rate": 4.7830310880829015e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.9255533218383789, + "num_tokens": 5774538.0, + "step": 3223 + }, + { + "epoch": 0.5220629908509432, + "grad_norm": 21.35122299194336, + "learning_rate": 4.781411917098446e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.9153417944908142, + "num_tokens": 5776334.0, + "step": 3224 + }, + { + "epoch": 0.5222249210590235, + "grad_norm": 20.598468780517578, + "learning_rate": 4.7797927461139896e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.9174016416072845, + "num_tokens": 5778112.0, + "step": 3225 + }, + { + "epoch": 0.5223868512671038, + "grad_norm": 23.522504806518555, + "learning_rate": 4.778173575129534e-06, + "loss": 0.627, + "mean_token_accuracy": 0.9160583913326263, + "num_tokens": 5779898.0, + "step": 3226 + }, + { + "epoch": 0.5225487814751842, + "grad_norm": 16.0715389251709, + "learning_rate": 4.776554404145078e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.9371417462825775, + "num_tokens": 5781697.0, + "step": 3227 + }, + { + "epoch": 0.5227107116832646, + "grad_norm": 18.35127830505371, + "learning_rate": 4.774935233160622e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.9282300472259521, + "num_tokens": 5783488.0, + "step": 3228 + }, + { + "epoch": 0.5228726418913449, + "grad_norm": 28.10984992980957, + "learning_rate": 4.773316062176166e-06, + "loss": 0.6624, + "mean_token_accuracy": 0.9129156172275543, + "num_tokens": 5785287.0, + "step": 3229 + }, + { + "epoch": 0.5230345720994252, + "grad_norm": 25.66130256652832, + "learning_rate": 4.77169689119171e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.9164130985736847, + "num_tokens": 5787098.0, + "step": 3230 + }, + { + "epoch": 0.5231965023075055, + "grad_norm": 18.254972457885742, + "learning_rate": 4.770077720207254e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.924217939376831, + "num_tokens": 5788887.0, + "step": 3231 + }, + { + "epoch": 0.5233584325155858, + "grad_norm": 17.91209602355957, + "learning_rate": 4.768458549222798e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.9255494475364685, + "num_tokens": 5790669.0, + "step": 3232 + }, + { + "epoch": 0.5235203627236661, + "grad_norm": 24.230043411254883, + "learning_rate": 4.766839378238342e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.9195504486560822, + "num_tokens": 5792466.0, + "step": 3233 + }, + { + "epoch": 0.5236822929317464, + "grad_norm": 22.80183219909668, + "learning_rate": 4.765220207253887e-06, + "loss": 0.6187, + "mean_token_accuracy": 0.9176159799098969, + "num_tokens": 5794256.0, + "step": 3234 + }, + { + "epoch": 0.5238442231398267, + "grad_norm": 22.230703353881836, + "learning_rate": 4.7636010362694306e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.9278229773044586, + "num_tokens": 5796045.0, + "step": 3235 + }, + { + "epoch": 0.524006153347907, + "grad_norm": 23.797130584716797, + "learning_rate": 4.761981865284975e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.9162554144859314, + "num_tokens": 5797842.0, + "step": 3236 + }, + { + "epoch": 0.5241680835559873, + "grad_norm": 28.26338005065918, + "learning_rate": 4.760362694300519e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.9147329926490784, + "num_tokens": 5799635.0, + "step": 3237 + }, + { + "epoch": 0.5243300137640677, + "grad_norm": 32.87785339355469, + "learning_rate": 4.758743523316063e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.8946414291858673, + "num_tokens": 5801423.0, + "step": 3238 + }, + { + "epoch": 0.524491943972148, + "grad_norm": 26.376771926879883, + "learning_rate": 4.757124352331607e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.9102478623390198, + "num_tokens": 5803214.0, + "step": 3239 + }, + { + "epoch": 0.5246538741802284, + "grad_norm": 22.24866485595703, + "learning_rate": 4.755505181347151e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.9244824051856995, + "num_tokens": 5805004.0, + "step": 3240 + }, + { + "epoch": 0.5248158043883087, + "grad_norm": 16.914201736450195, + "learning_rate": 4.753886010362695e-06, + "loss": 0.5114, + "mean_token_accuracy": 0.9229840040206909, + "num_tokens": 5806789.0, + "step": 3241 + }, + { + "epoch": 0.524977734596389, + "grad_norm": 25.600482940673828, + "learning_rate": 4.752266839378239e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.9145643413066864, + "num_tokens": 5808582.0, + "step": 3242 + }, + { + "epoch": 0.5251396648044693, + "grad_norm": 21.872465133666992, + "learning_rate": 4.750647668393783e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.9221014678478241, + "num_tokens": 5810376.0, + "step": 3243 + }, + { + "epoch": 0.5253015950125496, + "grad_norm": 24.721261978149414, + "learning_rate": 4.749028497409327e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9139400720596313, + "num_tokens": 5812168.0, + "step": 3244 + }, + { + "epoch": 0.5254635252206299, + "grad_norm": 29.457611083984375, + "learning_rate": 4.747409326424871e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.9130252003669739, + "num_tokens": 5813956.0, + "step": 3245 + }, + { + "epoch": 0.5256254554287102, + "grad_norm": 27.688304901123047, + "learning_rate": 4.745790155440415e-06, + "loss": 0.725, + "mean_token_accuracy": 0.9120689630508423, + "num_tokens": 5815753.0, + "step": 3246 + }, + { + "epoch": 0.5257873856367905, + "grad_norm": 22.55941390991211, + "learning_rate": 4.7441709844559596e-06, + "loss": 0.609, + "mean_token_accuracy": 0.9209627509117126, + "num_tokens": 5817543.0, + "step": 3247 + }, + { + "epoch": 0.5259493158448708, + "grad_norm": 22.25295066833496, + "learning_rate": 4.742551813471503e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.9259218573570251, + "num_tokens": 5819325.0, + "step": 3248 + }, + { + "epoch": 0.5261112460529512, + "grad_norm": 26.40397071838379, + "learning_rate": 4.740932642487048e-06, + "loss": 0.5601, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 5821117.0, + "step": 3249 + }, + { + "epoch": 0.5262731762610315, + "grad_norm": 27.036094665527344, + "learning_rate": 4.739313471502591e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.9033782184123993, + "num_tokens": 5822906.0, + "step": 3250 + }, + { + "epoch": 0.5264351064691118, + "grad_norm": 22.441499710083008, + "learning_rate": 4.737694300518136e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.9305555820465088, + "num_tokens": 5824706.0, + "step": 3251 + }, + { + "epoch": 0.5265970366771922, + "grad_norm": 29.68516731262207, + "learning_rate": 4.736075129533679e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.8968908786773682, + "num_tokens": 5826500.0, + "step": 3252 + }, + { + "epoch": 0.5267589668852725, + "grad_norm": 14.240768432617188, + "learning_rate": 4.734455958549224e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.9335511922836304, + "num_tokens": 5828283.0, + "step": 3253 + }, + { + "epoch": 0.5269208970933528, + "grad_norm": 30.80999183654785, + "learning_rate": 4.732836787564767e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.9068345129489899, + "num_tokens": 5830074.0, + "step": 3254 + }, + { + "epoch": 0.5270828273014331, + "grad_norm": 16.276378631591797, + "learning_rate": 4.731217616580312e-06, + "loss": 0.524, + "mean_token_accuracy": 0.9244604408740997, + "num_tokens": 5831864.0, + "step": 3255 + }, + { + "epoch": 0.5272447575095134, + "grad_norm": 30.29820442199707, + "learning_rate": 4.729598445595855e-06, + "loss": 0.855, + "mean_token_accuracy": 0.8985449969768524, + "num_tokens": 5833651.0, + "step": 3256 + }, + { + "epoch": 0.5274066877175937, + "grad_norm": 21.22198486328125, + "learning_rate": 4.7279792746114e-06, + "loss": 0.5453, + "mean_token_accuracy": 0.9183647632598877, + "num_tokens": 5835444.0, + "step": 3257 + }, + { + "epoch": 0.527568617925674, + "grad_norm": 25.498836517333984, + "learning_rate": 4.726360103626943e-06, + "loss": 0.5817, + "mean_token_accuracy": 0.9121601581573486, + "num_tokens": 5837241.0, + "step": 3258 + }, + { + "epoch": 0.5277305481337543, + "grad_norm": 20.432722091674805, + "learning_rate": 4.724740932642488e-06, + "loss": 0.5515, + "mean_token_accuracy": 0.9210049211978912, + "num_tokens": 5839031.0, + "step": 3259 + }, + { + "epoch": 0.5278924783418346, + "grad_norm": 22.283710479736328, + "learning_rate": 4.723121761658031e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.9211459457874298, + "num_tokens": 5840822.0, + "step": 3260 + }, + { + "epoch": 0.528054408549915, + "grad_norm": 24.2774658203125, + "learning_rate": 4.721502590673576e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.9052200019359589, + "num_tokens": 5842618.0, + "step": 3261 + }, + { + "epoch": 0.5282163387579953, + "grad_norm": 32.79093933105469, + "learning_rate": 4.719883419689119e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.9064554274082184, + "num_tokens": 5844408.0, + "step": 3262 + }, + { + "epoch": 0.5283782689660756, + "grad_norm": 29.020587921142578, + "learning_rate": 4.718264248704664e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.9155213236808777, + "num_tokens": 5846193.0, + "step": 3263 + }, + { + "epoch": 0.528540199174156, + "grad_norm": 26.841073989868164, + "learning_rate": 4.716645077720207e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 5847977.0, + "step": 3264 + }, + { + "epoch": 0.5287021293822363, + "grad_norm": 24.67628288269043, + "learning_rate": 4.715025906735752e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.9150060415267944, + "num_tokens": 5849771.0, + "step": 3265 + }, + { + "epoch": 0.5288640595903166, + "grad_norm": 13.892492294311523, + "learning_rate": 4.713406735751296e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.9319115877151489, + "num_tokens": 5851562.0, + "step": 3266 + }, + { + "epoch": 0.5290259897983969, + "grad_norm": 25.920000076293945, + "learning_rate": 4.71178756476684e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.9067660570144653, + "num_tokens": 5853352.0, + "step": 3267 + }, + { + "epoch": 0.5291879200064772, + "grad_norm": 32.46107864379883, + "learning_rate": 4.710168393782384e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.9011238813400269, + "num_tokens": 5855147.0, + "step": 3268 + }, + { + "epoch": 0.5293498502145575, + "grad_norm": 25.851436614990234, + "learning_rate": 4.708549222797928e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.9108265936374664, + "num_tokens": 5856940.0, + "step": 3269 + }, + { + "epoch": 0.5295117804226378, + "grad_norm": 33.249656677246094, + "learning_rate": 4.706930051813472e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.8916058242321014, + "num_tokens": 5858719.0, + "step": 3270 + }, + { + "epoch": 0.5296737106307181, + "grad_norm": 19.06661605834961, + "learning_rate": 4.705310880829016e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.9184104800224304, + "num_tokens": 5860513.0, + "step": 3271 + }, + { + "epoch": 0.5298356408387985, + "grad_norm": 22.882741928100586, + "learning_rate": 4.70369170984456e-06, + "loss": 0.57, + "mean_token_accuracy": 0.9269450306892395, + "num_tokens": 5862299.0, + "step": 3272 + }, + { + "epoch": 0.5299975710468788, + "grad_norm": 31.506717681884766, + "learning_rate": 4.702072538860104e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.894209623336792, + "num_tokens": 5864094.0, + "step": 3273 + }, + { + "epoch": 0.5301595012549591, + "grad_norm": 23.07085418701172, + "learning_rate": 4.700453367875648e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.9136485755443573, + "num_tokens": 5865895.0, + "step": 3274 + }, + { + "epoch": 0.5303214314630394, + "grad_norm": 28.44403648376465, + "learning_rate": 4.698834196891192e-06, + "loss": 0.622, + "mean_token_accuracy": 0.9064748287200928, + "num_tokens": 5867685.0, + "step": 3275 + }, + { + "epoch": 0.5304833616711198, + "grad_norm": 15.705784797668457, + "learning_rate": 4.697215025906736e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.9300665259361267, + "num_tokens": 5869483.0, + "step": 3276 + }, + { + "epoch": 0.5306452918792001, + "grad_norm": 22.40789794921875, + "learning_rate": 4.69559585492228e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.909722238779068, + "num_tokens": 5871271.0, + "step": 3277 + }, + { + "epoch": 0.5308072220872804, + "grad_norm": 29.81925392150879, + "learning_rate": 4.693976683937824e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.8999184370040894, + "num_tokens": 5873072.0, + "step": 3278 + }, + { + "epoch": 0.5309691522953607, + "grad_norm": 30.950777053833008, + "learning_rate": 4.692357512953368e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.8946863114833832, + "num_tokens": 5874869.0, + "step": 3279 + }, + { + "epoch": 0.531131082503441, + "grad_norm": 21.334619522094727, + "learning_rate": 4.690738341968912e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9268405139446259, + "num_tokens": 5876668.0, + "step": 3280 + }, + { + "epoch": 0.5312930127115213, + "grad_norm": 30.601539611816406, + "learning_rate": 4.689119170984456e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.9077341854572296, + "num_tokens": 5878451.0, + "step": 3281 + }, + { + "epoch": 0.5314549429196016, + "grad_norm": 28.87685203552246, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.9045454561710358, + "num_tokens": 5880246.0, + "step": 3282 + }, + { + "epoch": 0.531616873127682, + "grad_norm": 14.6832275390625, + "learning_rate": 4.685880829015544e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.9363949596881866, + "num_tokens": 5882041.0, + "step": 3283 + }, + { + "epoch": 0.5317788033357623, + "grad_norm": 29.889806747436523, + "learning_rate": 4.6842616580310885e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.9056248366832733, + "num_tokens": 5883839.0, + "step": 3284 + }, + { + "epoch": 0.5319407335438426, + "grad_norm": 22.20687484741211, + "learning_rate": 4.682642487046633e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.9266775846481323, + "num_tokens": 5885622.0, + "step": 3285 + }, + { + "epoch": 0.5321026637519229, + "grad_norm": 22.255115509033203, + "learning_rate": 4.6810233160621765e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.915778249502182, + "num_tokens": 5887408.0, + "step": 3286 + }, + { + "epoch": 0.5322645939600033, + "grad_norm": 24.455339431762695, + "learning_rate": 4.679404145077721e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.9180491268634796, + "num_tokens": 5889200.0, + "step": 3287 + }, + { + "epoch": 0.5324265241680836, + "grad_norm": 23.976558685302734, + "learning_rate": 4.6777849740932645e-06, + "loss": 0.5587, + "mean_token_accuracy": 0.9140793681144714, + "num_tokens": 5890991.0, + "step": 3288 + }, + { + "epoch": 0.5325884543761639, + "grad_norm": 22.580522537231445, + "learning_rate": 4.676165803108809e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9313608109951019, + "num_tokens": 5892780.0, + "step": 3289 + }, + { + "epoch": 0.5327503845842442, + "grad_norm": 25.41205406188965, + "learning_rate": 4.6745466321243525e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.9052004218101501, + "num_tokens": 5894577.0, + "step": 3290 + }, + { + "epoch": 0.5329123147923245, + "grad_norm": 17.420106887817383, + "learning_rate": 4.672927461139897e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.9288175106048584, + "num_tokens": 5896370.0, + "step": 3291 + }, + { + "epoch": 0.5330742450004048, + "grad_norm": 34.279117584228516, + "learning_rate": 4.6713082901554406e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.904699444770813, + "num_tokens": 5898177.0, + "step": 3292 + }, + { + "epoch": 0.5332361752084851, + "grad_norm": 21.430469512939453, + "learning_rate": 4.669689119170985e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.9122180640697479, + "num_tokens": 5899962.0, + "step": 3293 + }, + { + "epoch": 0.5333981054165655, + "grad_norm": 22.764860153198242, + "learning_rate": 4.668069948186529e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.9151371717453003, + "num_tokens": 5901765.0, + "step": 3294 + }, + { + "epoch": 0.5335600356246458, + "grad_norm": 23.35373306274414, + "learning_rate": 4.666450777202073e-06, + "loss": 0.5751, + "mean_token_accuracy": 0.9236221313476562, + "num_tokens": 5903552.0, + "step": 3295 + }, + { + "epoch": 0.5337219658327261, + "grad_norm": 23.841981887817383, + "learning_rate": 4.664831606217617e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.9072797000408173, + "num_tokens": 5905344.0, + "step": 3296 + }, + { + "epoch": 0.5338838960408064, + "grad_norm": 15.606760025024414, + "learning_rate": 4.663212435233161e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9268648028373718, + "num_tokens": 5907131.0, + "step": 3297 + }, + { + "epoch": 0.5340458262488867, + "grad_norm": 17.10336685180664, + "learning_rate": 4.661593264248705e-06, + "loss": 0.534, + "mean_token_accuracy": 0.9301941990852356, + "num_tokens": 5908915.0, + "step": 3298 + }, + { + "epoch": 0.534207756456967, + "grad_norm": 20.359485626220703, + "learning_rate": 4.659974093264249e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.9192011952400208, + "num_tokens": 5910711.0, + "step": 3299 + }, + { + "epoch": 0.5343696866650474, + "grad_norm": 21.940690994262695, + "learning_rate": 4.658354922279793e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.9133472442626953, + "num_tokens": 5912500.0, + "step": 3300 + }, + { + "epoch": 0.5345316168731277, + "grad_norm": 27.70854949951172, + "learning_rate": 4.656735751295337e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.9068073034286499, + "num_tokens": 5914301.0, + "step": 3301 + }, + { + "epoch": 0.534693547081208, + "grad_norm": 16.91778564453125, + "learning_rate": 4.655116580310881e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.916292667388916, + "num_tokens": 5916090.0, + "step": 3302 + }, + { + "epoch": 0.5348554772892883, + "grad_norm": 26.937002182006836, + "learning_rate": 4.653497409326425e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.9205682873725891, + "num_tokens": 5917879.0, + "step": 3303 + }, + { + "epoch": 0.5350174074973686, + "grad_norm": 24.79198455810547, + "learning_rate": 4.6518782383419696e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.9208920300006866, + "num_tokens": 5919668.0, + "step": 3304 + }, + { + "epoch": 0.5351793377054489, + "grad_norm": 21.163257598876953, + "learning_rate": 4.650259067357513e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.9231884181499481, + "num_tokens": 5921453.0, + "step": 3305 + }, + { + "epoch": 0.5353412679135293, + "grad_norm": 32.04161071777344, + "learning_rate": 4.648639896373058e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.9063608050346375, + "num_tokens": 5923242.0, + "step": 3306 + }, + { + "epoch": 0.5355031981216096, + "grad_norm": 20.623491287231445, + "learning_rate": 4.647020725388601e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.9171499609947205, + "num_tokens": 5925032.0, + "step": 3307 + }, + { + "epoch": 0.5356651283296899, + "grad_norm": 24.486120223999023, + "learning_rate": 4.645401554404146e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.9154887795448303, + "num_tokens": 5926828.0, + "step": 3308 + }, + { + "epoch": 0.5358270585377702, + "grad_norm": 17.307029724121094, + "learning_rate": 4.643782383419689e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.9227039813995361, + "num_tokens": 5928611.0, + "step": 3309 + }, + { + "epoch": 0.5359889887458505, + "grad_norm": 23.798681259155273, + "learning_rate": 4.642163212435234e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.913159966468811, + "num_tokens": 5930399.0, + "step": 3310 + }, + { + "epoch": 0.5361509189539309, + "grad_norm": 25.3358154296875, + "learning_rate": 4.640544041450777e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.9119522571563721, + "num_tokens": 5932182.0, + "step": 3311 + }, + { + "epoch": 0.5363128491620112, + "grad_norm": 41.298397064208984, + "learning_rate": 4.638924870466322e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.8780686259269714, + "num_tokens": 5933979.0, + "step": 3312 + }, + { + "epoch": 0.5364747793700915, + "grad_norm": 27.232192993164062, + "learning_rate": 4.637305699481865e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.9053481817245483, + "num_tokens": 5935776.0, + "step": 3313 + }, + { + "epoch": 0.5366367095781718, + "grad_norm": 17.515901565551758, + "learning_rate": 4.63568652849741e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.9252215027809143, + "num_tokens": 5937557.0, + "step": 3314 + }, + { + "epoch": 0.5367986397862521, + "grad_norm": 30.752872467041016, + "learning_rate": 4.634067357512953e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.9034347832202911, + "num_tokens": 5939348.0, + "step": 3315 + }, + { + "epoch": 0.5369605699943324, + "grad_norm": 28.745569229125977, + "learning_rate": 4.632448186528498e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.9015345275402069, + "num_tokens": 5941134.0, + "step": 3316 + }, + { + "epoch": 0.5371225002024128, + "grad_norm": 21.264015197753906, + "learning_rate": 4.630829015544041e-06, + "loss": 0.6348, + "mean_token_accuracy": 0.9246070981025696, + "num_tokens": 5942938.0, + "step": 3317 + }, + { + "epoch": 0.5372844304104931, + "grad_norm": 24.994874954223633, + "learning_rate": 4.629209844559586e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.9150778949260712, + "num_tokens": 5944721.0, + "step": 3318 + }, + { + "epoch": 0.5374463606185734, + "grad_norm": 27.732799530029297, + "learning_rate": 4.627590673575129e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.9074074327945709, + "num_tokens": 5946503.0, + "step": 3319 + }, + { + "epoch": 0.5376082908266537, + "grad_norm": 16.936355590820312, + "learning_rate": 4.625971502590674e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9231182336807251, + "num_tokens": 5948288.0, + "step": 3320 + }, + { + "epoch": 0.537770221034734, + "grad_norm": 24.789987564086914, + "learning_rate": 4.624352331606217e-06, + "loss": 0.613, + "mean_token_accuracy": 0.9123079180717468, + "num_tokens": 5950085.0, + "step": 3321 + }, + { + "epoch": 0.5379321512428143, + "grad_norm": 10.298727989196777, + "learning_rate": 4.622733160621762e-06, + "loss": 0.456, + "mean_token_accuracy": 0.9349911510944366, + "num_tokens": 5951874.0, + "step": 3322 + }, + { + "epoch": 0.5380940814508947, + "grad_norm": 21.85140037536621, + "learning_rate": 4.621113989637306e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.9145390093326569, + "num_tokens": 5953657.0, + "step": 3323 + }, + { + "epoch": 0.538256011658975, + "grad_norm": 29.054805755615234, + "learning_rate": 4.61949481865285e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.9007092118263245, + "num_tokens": 5955451.0, + "step": 3324 + }, + { + "epoch": 0.5384179418670553, + "grad_norm": 15.269128799438477, + "learning_rate": 4.617875647668394e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.9285130798816681, + "num_tokens": 5957243.0, + "step": 3325 + }, + { + "epoch": 0.5385798720751356, + "grad_norm": 27.347911834716797, + "learning_rate": 4.616256476683938e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.9085317552089691, + "num_tokens": 5959039.0, + "step": 3326 + }, + { + "epoch": 0.5387418022832159, + "grad_norm": 26.464378356933594, + "learning_rate": 4.614637305699482e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.9178784787654877, + "num_tokens": 5960831.0, + "step": 3327 + }, + { + "epoch": 0.5389037324912963, + "grad_norm": 20.680988311767578, + "learning_rate": 4.613018134715026e-06, + "loss": 0.6419, + "mean_token_accuracy": 0.9202331602573395, + "num_tokens": 5962619.0, + "step": 3328 + }, + { + "epoch": 0.5390656626993766, + "grad_norm": 22.59561538696289, + "learning_rate": 4.61139896373057e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.9267317354679108, + "num_tokens": 5964405.0, + "step": 3329 + }, + { + "epoch": 0.5392275929074569, + "grad_norm": 25.261756896972656, + "learning_rate": 4.609779792746114e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.9053235650062561, + "num_tokens": 5966202.0, + "step": 3330 + }, + { + "epoch": 0.5393895231155372, + "grad_norm": 18.882295608520508, + "learning_rate": 4.608160621761658e-06, + "loss": 0.5269, + "mean_token_accuracy": 0.918519139289856, + "num_tokens": 5967985.0, + "step": 3331 + }, + { + "epoch": 0.5395514533236175, + "grad_norm": 20.336748123168945, + "learning_rate": 4.606541450777203e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.9278618693351746, + "num_tokens": 5969788.0, + "step": 3332 + }, + { + "epoch": 0.5397133835316978, + "grad_norm": 29.584484100341797, + "learning_rate": 4.604922279792746e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.8960237205028534, + "num_tokens": 5971587.0, + "step": 3333 + }, + { + "epoch": 0.5398753137397782, + "grad_norm": 27.636363983154297, + "learning_rate": 4.603303108808291e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.9137344062328339, + "num_tokens": 5973388.0, + "step": 3334 + }, + { + "epoch": 0.5400372439478585, + "grad_norm": 23.142065048217773, + "learning_rate": 4.601683937823835e-06, + "loss": 0.5569, + "mean_token_accuracy": 0.9152278006076813, + "num_tokens": 5975182.0, + "step": 3335 + }, + { + "epoch": 0.5401991741559388, + "grad_norm": 31.19795036315918, + "learning_rate": 4.600064766839379e-06, + "loss": 0.6246, + "mean_token_accuracy": 0.9099322259426117, + "num_tokens": 5976971.0, + "step": 3336 + }, + { + "epoch": 0.5403611043640191, + "grad_norm": 27.82547950744629, + "learning_rate": 4.598445595854923e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.9072259664535522, + "num_tokens": 5978763.0, + "step": 3337 + }, + { + "epoch": 0.5405230345720994, + "grad_norm": 24.315357208251953, + "learning_rate": 4.596826424870467e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.9188596606254578, + "num_tokens": 5980571.0, + "step": 3338 + }, + { + "epoch": 0.5406849647801797, + "grad_norm": 27.79184913635254, + "learning_rate": 4.595207253886011e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.9031609296798706, + "num_tokens": 5982372.0, + "step": 3339 + }, + { + "epoch": 0.5408468949882601, + "grad_norm": 25.85364532470703, + "learning_rate": 4.593588082901555e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.9119634628295898, + "num_tokens": 5984180.0, + "step": 3340 + }, + { + "epoch": 0.5410088251963404, + "grad_norm": 15.509039878845215, + "learning_rate": 4.591968911917099e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.9353419542312622, + "num_tokens": 5985970.0, + "step": 3341 + }, + { + "epoch": 0.5411707554044207, + "grad_norm": 20.80098533630371, + "learning_rate": 4.590349740932643e-06, + "loss": 0.5612, + "mean_token_accuracy": 0.9211459457874298, + "num_tokens": 5987761.0, + "step": 3342 + }, + { + "epoch": 0.541332685612501, + "grad_norm": 24.21620750427246, + "learning_rate": 4.588730569948187e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.9127168655395508, + "num_tokens": 5989548.0, + "step": 3343 + }, + { + "epoch": 0.5414946158205813, + "grad_norm": 32.148719787597656, + "learning_rate": 4.587111398963731e-06, + "loss": 0.907, + "mean_token_accuracy": 0.9039174318313599, + "num_tokens": 5991352.0, + "step": 3344 + }, + { + "epoch": 0.5416565460286616, + "grad_norm": 22.388635635375977, + "learning_rate": 4.585492227979275e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.9076087176799774, + "num_tokens": 5993134.0, + "step": 3345 + }, + { + "epoch": 0.541818476236742, + "grad_norm": 32.44016647338867, + "learning_rate": 4.583873056994819e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.9028957486152649, + "num_tokens": 5994934.0, + "step": 3346 + }, + { + "epoch": 0.5419804064448223, + "grad_norm": 23.766708374023438, + "learning_rate": 4.582253886010363e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9224817752838135, + "num_tokens": 5996730.0, + "step": 3347 + }, + { + "epoch": 0.5421423366529026, + "grad_norm": 19.531980514526367, + "learning_rate": 4.580634715025907e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9212393462657928, + "num_tokens": 5998522.0, + "step": 3348 + }, + { + "epoch": 0.5423042668609829, + "grad_norm": 29.46169090270996, + "learning_rate": 4.5790155440414514e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.9092390239238739, + "num_tokens": 6000310.0, + "step": 3349 + }, + { + "epoch": 0.5424661970690632, + "grad_norm": 25.975584030151367, + "learning_rate": 4.577396373056995e-06, + "loss": 0.532, + "mean_token_accuracy": 0.9221243560314178, + "num_tokens": 6002104.0, + "step": 3350 + }, + { + "epoch": 0.5426281272771436, + "grad_norm": 22.205385208129883, + "learning_rate": 4.5757772020725395e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 6003897.0, + "step": 3351 + }, + { + "epoch": 0.5427900574852239, + "grad_norm": 24.8353271484375, + "learning_rate": 4.574158031088083e-06, + "loss": 0.626, + "mean_token_accuracy": 0.9058675467967987, + "num_tokens": 6005694.0, + "step": 3352 + }, + { + "epoch": 0.5429519876933042, + "grad_norm": 33.30318832397461, + "learning_rate": 4.5725388601036275e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.9043208360671997, + "num_tokens": 6007500.0, + "step": 3353 + }, + { + "epoch": 0.5431139179013845, + "grad_norm": 21.73312759399414, + "learning_rate": 4.570919689119172e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.9169534146785736, + "num_tokens": 6009289.0, + "step": 3354 + }, + { + "epoch": 0.5432758481094648, + "grad_norm": 26.999486923217773, + "learning_rate": 4.5693005181347155e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.9213786423206329, + "num_tokens": 6011080.0, + "step": 3355 + }, + { + "epoch": 0.5434377783175451, + "grad_norm": 22.595088958740234, + "learning_rate": 4.56768134715026e-06, + "loss": 0.6032, + "mean_token_accuracy": 0.9203667938709259, + "num_tokens": 6012880.0, + "step": 3356 + }, + { + "epoch": 0.5435997085256254, + "grad_norm": 24.318208694458008, + "learning_rate": 4.5660621761658035e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 6014667.0, + "step": 3357 + }, + { + "epoch": 0.5437616387337058, + "grad_norm": 24.818958282470703, + "learning_rate": 4.564443005181348e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.9182733595371246, + "num_tokens": 6016461.0, + "step": 3358 + }, + { + "epoch": 0.5439235689417861, + "grad_norm": 20.26692008972168, + "learning_rate": 4.5628238341968916e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.9258498251438141, + "num_tokens": 6018254.0, + "step": 3359 + }, + { + "epoch": 0.5440854991498664, + "grad_norm": 25.99869155883789, + "learning_rate": 4.561204663212436e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.9171532690525055, + "num_tokens": 6020043.0, + "step": 3360 + }, + { + "epoch": 0.5442474293579467, + "grad_norm": 22.137868881225586, + "learning_rate": 4.55958549222798e-06, + "loss": 0.5536, + "mean_token_accuracy": 0.9120462834835052, + "num_tokens": 6021827.0, + "step": 3361 + }, + { + "epoch": 0.5444093595660271, + "grad_norm": 33.363590240478516, + "learning_rate": 4.557966321243524e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.9103012084960938, + "num_tokens": 6023618.0, + "step": 3362 + }, + { + "epoch": 0.5445712897741074, + "grad_norm": 30.149311065673828, + "learning_rate": 4.556347150259068e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.925567239522934, + "num_tokens": 6025412.0, + "step": 3363 + }, + { + "epoch": 0.5447332199821877, + "grad_norm": 26.410747528076172, + "learning_rate": 4.554727979274612e-06, + "loss": 0.5687, + "mean_token_accuracy": 0.9087591171264648, + "num_tokens": 6027198.0, + "step": 3364 + }, + { + "epoch": 0.544895150190268, + "grad_norm": 23.280202865600586, + "learning_rate": 4.553108808290156e-06, + "loss": 0.5797, + "mean_token_accuracy": 0.9260977506637573, + "num_tokens": 6028994.0, + "step": 3365 + }, + { + "epoch": 0.5450570803983483, + "grad_norm": 24.00787925720215, + "learning_rate": 4.5514896373057e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.90947225689888, + "num_tokens": 6030793.0, + "step": 3366 + }, + { + "epoch": 0.5452190106064286, + "grad_norm": 33.04943084716797, + "learning_rate": 4.549870466321244e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.9010291695594788, + "num_tokens": 6032588.0, + "step": 3367 + }, + { + "epoch": 0.5453809408145089, + "grad_norm": 19.426136016845703, + "learning_rate": 4.548251295336788e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.9386403858661652, + "num_tokens": 6034377.0, + "step": 3368 + }, + { + "epoch": 0.5455428710225892, + "grad_norm": 33.3045768737793, + "learning_rate": 4.546632124352332e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.9010841548442841, + "num_tokens": 6036162.0, + "step": 3369 + }, + { + "epoch": 0.5457048012306696, + "grad_norm": 22.850669860839844, + "learning_rate": 4.545012953367876e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.9254666268825531, + "num_tokens": 6037956.0, + "step": 3370 + }, + { + "epoch": 0.5458667314387499, + "grad_norm": 34.92522430419922, + "learning_rate": 4.54339378238342e-06, + "loss": 0.652, + "mean_token_accuracy": 0.9084407687187195, + "num_tokens": 6039750.0, + "step": 3371 + }, + { + "epoch": 0.5460286616468302, + "grad_norm": 31.60854148864746, + "learning_rate": 4.541774611398964e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.9003778994083405, + "num_tokens": 6041553.0, + "step": 3372 + }, + { + "epoch": 0.5461905918549105, + "grad_norm": 27.703815460205078, + "learning_rate": 4.540155440414509e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.9090197384357452, + "num_tokens": 6043351.0, + "step": 3373 + }, + { + "epoch": 0.5463525220629909, + "grad_norm": 17.604507446289062, + "learning_rate": 4.538536269430052e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.925038605928421, + "num_tokens": 6045142.0, + "step": 3374 + }, + { + "epoch": 0.5465144522710712, + "grad_norm": 11.845253944396973, + "learning_rate": 4.536917098445597e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.9340918958187103, + "num_tokens": 6046927.0, + "step": 3375 + }, + { + "epoch": 0.5466763824791515, + "grad_norm": 30.648853302001953, + "learning_rate": 4.53529792746114e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.9020787477493286, + "num_tokens": 6048725.0, + "step": 3376 + }, + { + "epoch": 0.5468383126872318, + "grad_norm": 25.15959358215332, + "learning_rate": 4.533678756476685e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.9091269969940186, + "num_tokens": 6050512.0, + "step": 3377 + }, + { + "epoch": 0.5470002428953121, + "grad_norm": 33.98947525024414, + "learning_rate": 4.532059585492228e-06, + "loss": 0.732, + "mean_token_accuracy": 0.8936116695404053, + "num_tokens": 6052306.0, + "step": 3378 + }, + { + "epoch": 0.5471621731033924, + "grad_norm": 32.97720718383789, + "learning_rate": 4.530440414507773e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.9016696214675903, + "num_tokens": 6054103.0, + "step": 3379 + }, + { + "epoch": 0.5473241033114727, + "grad_norm": 25.53755760192871, + "learning_rate": 4.528821243523316e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.9171883165836334, + "num_tokens": 6055892.0, + "step": 3380 + }, + { + "epoch": 0.547486033519553, + "grad_norm": 16.071685791015625, + "learning_rate": 4.527202072538861e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.9271321892738342, + "num_tokens": 6057678.0, + "step": 3381 + }, + { + "epoch": 0.5476479637276334, + "grad_norm": 26.60755729675293, + "learning_rate": 4.525582901554404e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.9217752516269684, + "num_tokens": 6059471.0, + "step": 3382 + }, + { + "epoch": 0.5478098939357137, + "grad_norm": 30.898523330688477, + "learning_rate": 4.523963730569949e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.9077968001365662, + "num_tokens": 6061265.0, + "step": 3383 + }, + { + "epoch": 0.547971824143794, + "grad_norm": 24.920360565185547, + "learning_rate": 4.522344559585492e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.9229063987731934, + "num_tokens": 6063062.0, + "step": 3384 + }, + { + "epoch": 0.5481337543518744, + "grad_norm": 30.06410026550293, + "learning_rate": 4.520725388601037e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.9201333820819855, + "num_tokens": 6064849.0, + "step": 3385 + }, + { + "epoch": 0.5482956845599547, + "grad_norm": 17.98684310913086, + "learning_rate": 4.51910621761658e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 6066635.0, + "step": 3386 + }, + { + "epoch": 0.548457614768035, + "grad_norm": 18.87660789489746, + "learning_rate": 4.517487046632125e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.9224673211574554, + "num_tokens": 6068418.0, + "step": 3387 + }, + { + "epoch": 0.5486195449761153, + "grad_norm": 20.49905776977539, + "learning_rate": 4.515867875647668e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.9176878929138184, + "num_tokens": 6070210.0, + "step": 3388 + }, + { + "epoch": 0.5487814751841956, + "grad_norm": 22.309688568115234, + "learning_rate": 4.514248704663213e-06, + "loss": 0.5803, + "mean_token_accuracy": 0.9176195561885834, + "num_tokens": 6072013.0, + "step": 3389 + }, + { + "epoch": 0.5489434053922759, + "grad_norm": 14.70454216003418, + "learning_rate": 4.512629533678756e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.9347826242446899, + "num_tokens": 6073801.0, + "step": 3390 + }, + { + "epoch": 0.5491053356003562, + "grad_norm": 25.066940307617188, + "learning_rate": 4.511010362694301e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9167752265930176, + "num_tokens": 6075601.0, + "step": 3391 + }, + { + "epoch": 0.5492672658084365, + "grad_norm": 17.751445770263672, + "learning_rate": 4.509391191709845e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.9216715395450592, + "num_tokens": 6077380.0, + "step": 3392 + }, + { + "epoch": 0.5494291960165169, + "grad_norm": 25.87751579284668, + "learning_rate": 4.507772020725389e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.9210945665836334, + "num_tokens": 6079171.0, + "step": 3393 + }, + { + "epoch": 0.5495911262245972, + "grad_norm": 23.470722198486328, + "learning_rate": 4.506152849740933e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.9152742624282837, + "num_tokens": 6080977.0, + "step": 3394 + }, + { + "epoch": 0.5497530564326775, + "grad_norm": 30.11971664428711, + "learning_rate": 4.504533678756477e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.9252873659133911, + "num_tokens": 6082769.0, + "step": 3395 + }, + { + "epoch": 0.5499149866407579, + "grad_norm": 21.674415588378906, + "learning_rate": 4.502914507772021e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.9148935973644257, + "num_tokens": 6084563.0, + "step": 3396 + }, + { + "epoch": 0.5500769168488382, + "grad_norm": 18.486682891845703, + "learning_rate": 4.501295336787565e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9282407462596893, + "num_tokens": 6086354.0, + "step": 3397 + }, + { + "epoch": 0.5502388470569185, + "grad_norm": 26.99074363708496, + "learning_rate": 4.499676165803109e-06, + "loss": 0.5899, + "mean_token_accuracy": 0.9072089195251465, + "num_tokens": 6088147.0, + "step": 3398 + }, + { + "epoch": 0.5504007772649988, + "grad_norm": 29.032562255859375, + "learning_rate": 4.498056994818653e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.9144144356250763, + "num_tokens": 6089939.0, + "step": 3399 + }, + { + "epoch": 0.5505627074730791, + "grad_norm": 20.895023345947266, + "learning_rate": 4.496437823834197e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.9237553477287292, + "num_tokens": 6091728.0, + "step": 3400 + }, + { + "epoch": 0.5507246376811594, + "grad_norm": 19.834373474121094, + "learning_rate": 4.494818652849741e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.9270557165145874, + "num_tokens": 6093528.0, + "step": 3401 + }, + { + "epoch": 0.5508865678892397, + "grad_norm": 29.816694259643555, + "learning_rate": 4.493199481865285e-06, + "loss": 0.792, + "mean_token_accuracy": 0.9021425247192383, + "num_tokens": 6095316.0, + "step": 3402 + }, + { + "epoch": 0.55104849809732, + "grad_norm": 25.919754028320312, + "learning_rate": 4.491580310880829e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.9157004952430725, + "num_tokens": 6097101.0, + "step": 3403 + }, + { + "epoch": 0.5512104283054003, + "grad_norm": 16.491609573364258, + "learning_rate": 4.489961139896373e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.9252786040306091, + "num_tokens": 6098894.0, + "step": 3404 + }, + { + "epoch": 0.5513723585134807, + "grad_norm": 23.422494888305664, + "learning_rate": 4.488341968911917e-06, + "loss": 0.5706, + "mean_token_accuracy": 0.9108880758285522, + "num_tokens": 6100675.0, + "step": 3405 + }, + { + "epoch": 0.551534288721561, + "grad_norm": 16.785755157470703, + "learning_rate": 4.4867227979274614e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.928893506526947, + "num_tokens": 6102468.0, + "step": 3406 + }, + { + "epoch": 0.5516962189296414, + "grad_norm": 16.99287986755371, + "learning_rate": 4.485103626943005e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.9289297759532928, + "num_tokens": 6104261.0, + "step": 3407 + }, + { + "epoch": 0.5518581491377217, + "grad_norm": 22.296308517456055, + "learning_rate": 4.4834844559585495e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.9225463271141052, + "num_tokens": 6106044.0, + "step": 3408 + }, + { + "epoch": 0.552020079345802, + "grad_norm": 29.526416778564453, + "learning_rate": 4.481865284974093e-06, + "loss": 0.5857, + "mean_token_accuracy": 0.9151678681373596, + "num_tokens": 6107839.0, + "step": 3409 + }, + { + "epoch": 0.5521820095538823, + "grad_norm": 23.606788635253906, + "learning_rate": 4.4802461139896375e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.9253092110157013, + "num_tokens": 6109632.0, + "step": 3410 + }, + { + "epoch": 0.5523439397619626, + "grad_norm": 31.241962432861328, + "learning_rate": 4.478626943005182e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.9171842634677887, + "num_tokens": 6111422.0, + "step": 3411 + }, + { + "epoch": 0.5525058699700429, + "grad_norm": 25.66417121887207, + "learning_rate": 4.4770077720207255e-06, + "loss": 0.596, + "mean_token_accuracy": 0.9096779227256775, + "num_tokens": 6113211.0, + "step": 3412 + }, + { + "epoch": 0.5526678001781232, + "grad_norm": 32.516788482666016, + "learning_rate": 4.47538860103627e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.9028212130069733, + "num_tokens": 6115000.0, + "step": 3413 + }, + { + "epoch": 0.5528297303862035, + "grad_norm": 19.48625373840332, + "learning_rate": 4.4737694300518135e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.9206026494503021, + "num_tokens": 6116789.0, + "step": 3414 + }, + { + "epoch": 0.5529916605942838, + "grad_norm": 30.01239013671875, + "learning_rate": 4.472150259067358e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.9178589880466461, + "num_tokens": 6118580.0, + "step": 3415 + }, + { + "epoch": 0.5531535908023641, + "grad_norm": 36.29997253417969, + "learning_rate": 4.4705310880829016e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.8907604217529297, + "num_tokens": 6120385.0, + "step": 3416 + }, + { + "epoch": 0.5533155210104445, + "grad_norm": 28.68462371826172, + "learning_rate": 4.468911917098446e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.9115812182426453, + "num_tokens": 6122179.0, + "step": 3417 + }, + { + "epoch": 0.5534774512185248, + "grad_norm": 24.67974090576172, + "learning_rate": 4.46729274611399e-06, + "loss": 0.6111, + "mean_token_accuracy": 0.9260912537574768, + "num_tokens": 6123975.0, + "step": 3418 + }, + { + "epoch": 0.5536393814266052, + "grad_norm": 28.53451156616211, + "learning_rate": 4.465673575129534e-06, + "loss": 0.69, + "mean_token_accuracy": 0.9137205183506012, + "num_tokens": 6125754.0, + "step": 3419 + }, + { + "epoch": 0.5538013116346855, + "grad_norm": 28.087703704833984, + "learning_rate": 4.464054404145078e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.9165835082530975, + "num_tokens": 6127542.0, + "step": 3420 + }, + { + "epoch": 0.5539632418427658, + "grad_norm": 27.522611618041992, + "learning_rate": 4.462435233160622e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.9197037518024445, + "num_tokens": 6129328.0, + "step": 3421 + }, + { + "epoch": 0.5541251720508461, + "grad_norm": 31.729354858398438, + "learning_rate": 4.460816062176166e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.9142156839370728, + "num_tokens": 6131120.0, + "step": 3422 + }, + { + "epoch": 0.5542871022589264, + "grad_norm": 30.813610076904297, + "learning_rate": 4.45919689119171e-06, + "loss": 0.735, + "mean_token_accuracy": 0.9048126935958862, + "num_tokens": 6132916.0, + "step": 3423 + }, + { + "epoch": 0.5544490324670067, + "grad_norm": 33.84776306152344, + "learning_rate": 4.457577720207254e-06, + "loss": 0.583, + "mean_token_accuracy": 0.9138047397136688, + "num_tokens": 6134706.0, + "step": 3424 + }, + { + "epoch": 0.554610962675087, + "grad_norm": 29.813026428222656, + "learning_rate": 4.455958549222798e-06, + "loss": 0.658, + "mean_token_accuracy": 0.9007028937339783, + "num_tokens": 6136490.0, + "step": 3425 + }, + { + "epoch": 0.5547728928831673, + "grad_norm": 21.390344619750977, + "learning_rate": 4.454339378238342e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 6138278.0, + "step": 3426 + }, + { + "epoch": 0.5549348230912476, + "grad_norm": 35.12249755859375, + "learning_rate": 4.452720207253887e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.9117614924907684, + "num_tokens": 6140061.0, + "step": 3427 + }, + { + "epoch": 0.555096753299328, + "grad_norm": 27.178850173950195, + "learning_rate": 4.451101036269431e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.9130477011203766, + "num_tokens": 6141860.0, + "step": 3428 + }, + { + "epoch": 0.5552586835074083, + "grad_norm": 25.129758834838867, + "learning_rate": 4.449481865284975e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.9172320365905762, + "num_tokens": 6143649.0, + "step": 3429 + }, + { + "epoch": 0.5554206137154887, + "grad_norm": 13.15129280090332, + "learning_rate": 4.447862694300519e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.929921567440033, + "num_tokens": 6145432.0, + "step": 3430 + }, + { + "epoch": 0.555582543923569, + "grad_norm": 35.81571578979492, + "learning_rate": 4.446243523316063e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.8895420432090759, + "num_tokens": 6147224.0, + "step": 3431 + }, + { + "epoch": 0.5557444741316493, + "grad_norm": 24.94693374633789, + "learning_rate": 4.444624352331607e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.9133601486682892, + "num_tokens": 6149025.0, + "step": 3432 + }, + { + "epoch": 0.5559064043397296, + "grad_norm": 33.639076232910156, + "learning_rate": 4.443005181347151e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.9141661822795868, + "num_tokens": 6150817.0, + "step": 3433 + }, + { + "epoch": 0.5560683345478099, + "grad_norm": 26.695627212524414, + "learning_rate": 4.441386010362695e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.9193286001682281, + "num_tokens": 6152614.0, + "step": 3434 + }, + { + "epoch": 0.5562302647558902, + "grad_norm": 22.153039932250977, + "learning_rate": 4.439766839378239e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.9326691031455994, + "num_tokens": 6154408.0, + "step": 3435 + }, + { + "epoch": 0.5563921949639705, + "grad_norm": 34.817508697509766, + "learning_rate": 4.438147668393783e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.8978950083255768, + "num_tokens": 6156194.0, + "step": 3436 + }, + { + "epoch": 0.5565541251720508, + "grad_norm": 30.309364318847656, + "learning_rate": 4.436528497409327e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.9158846139907837, + "num_tokens": 6157990.0, + "step": 3437 + }, + { + "epoch": 0.5567160553801311, + "grad_norm": 30.47202491760254, + "learning_rate": 4.434909326424871e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.8949397504329681, + "num_tokens": 6159783.0, + "step": 3438 + }, + { + "epoch": 0.5568779855882114, + "grad_norm": 22.867259979248047, + "learning_rate": 4.433290155440415e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.9116471707820892, + "num_tokens": 6161578.0, + "step": 3439 + }, + { + "epoch": 0.5570399157962918, + "grad_norm": 23.13840103149414, + "learning_rate": 4.431670984455959e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.9164775907993317, + "num_tokens": 6163375.0, + "step": 3440 + }, + { + "epoch": 0.5572018460043722, + "grad_norm": 16.12798309326172, + "learning_rate": 4.430051813471503e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.9260278642177582, + "num_tokens": 6165171.0, + "step": 3441 + }, + { + "epoch": 0.5573637762124525, + "grad_norm": 33.908668518066406, + "learning_rate": 4.428432642487047e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.8948566019535065, + "num_tokens": 6166978.0, + "step": 3442 + }, + { + "epoch": 0.5575257064205328, + "grad_norm": 35.57543182373047, + "learning_rate": 4.426813471502591e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.9033960998058319, + "num_tokens": 6168780.0, + "step": 3443 + }, + { + "epoch": 0.5576876366286131, + "grad_norm": 26.46402931213379, + "learning_rate": 4.425194300518136e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.922982782125473, + "num_tokens": 6170578.0, + "step": 3444 + }, + { + "epoch": 0.5578495668366934, + "grad_norm": 25.53522491455078, + "learning_rate": 4.423575129533679e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.9198103249073029, + "num_tokens": 6172364.0, + "step": 3445 + }, + { + "epoch": 0.5580114970447737, + "grad_norm": 24.76997184753418, + "learning_rate": 4.421955958549224e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.9176018536090851, + "num_tokens": 6174143.0, + "step": 3446 + }, + { + "epoch": 0.558173427252854, + "grad_norm": 28.09873390197754, + "learning_rate": 4.420336787564767e-06, + "loss": 0.6365, + "mean_token_accuracy": 0.9221028983592987, + "num_tokens": 6175938.0, + "step": 3447 + }, + { + "epoch": 0.5583353574609343, + "grad_norm": 30.89769744873047, + "learning_rate": 4.418717616580312e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.9095440208911896, + "num_tokens": 6177735.0, + "step": 3448 + }, + { + "epoch": 0.5584972876690146, + "grad_norm": 40.70014572143555, + "learning_rate": 4.417098445595855e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.8758874237537384, + "num_tokens": 6179528.0, + "step": 3449 + }, + { + "epoch": 0.5586592178770949, + "grad_norm": 31.241683959960938, + "learning_rate": 4.4154792746114e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.8986429274082184, + "num_tokens": 6181306.0, + "step": 3450 + }, + { + "epoch": 0.5588211480851752, + "grad_norm": 22.433712005615234, + "learning_rate": 4.413860103626943e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.9200661182403564, + "num_tokens": 6183092.0, + "step": 3451 + }, + { + "epoch": 0.5589830782932556, + "grad_norm": 35.89522171020508, + "learning_rate": 4.412240932642488e-06, + "loss": 0.773, + "mean_token_accuracy": 0.9011110067367554, + "num_tokens": 6184877.0, + "step": 3452 + }, + { + "epoch": 0.559145008501336, + "grad_norm": 18.537832260131836, + "learning_rate": 4.410621761658031e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.9179058969020844, + "num_tokens": 6186657.0, + "step": 3453 + }, + { + "epoch": 0.5593069387094163, + "grad_norm": 23.643512725830078, + "learning_rate": 4.409002590673576e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.9207285344600677, + "num_tokens": 6188446.0, + "step": 3454 + }, + { + "epoch": 0.5594688689174966, + "grad_norm": 35.087486267089844, + "learning_rate": 4.407383419689119e-06, + "loss": 0.5701, + "mean_token_accuracy": 0.9265037775039673, + "num_tokens": 6190231.0, + "step": 3455 + }, + { + "epoch": 0.5596307991255769, + "grad_norm": 20.429126739501953, + "learning_rate": 4.405764248704664e-06, + "loss": 0.547, + "mean_token_accuracy": 0.9333432614803314, + "num_tokens": 6192027.0, + "step": 3456 + }, + { + "epoch": 0.5597927293336572, + "grad_norm": 25.923635482788086, + "learning_rate": 4.404145077720207e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.909731537103653, + "num_tokens": 6193828.0, + "step": 3457 + }, + { + "epoch": 0.5599546595417375, + "grad_norm": 37.788047790527344, + "learning_rate": 4.402525906735752e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.8909301459789276, + "num_tokens": 6195625.0, + "step": 3458 + }, + { + "epoch": 0.5601165897498178, + "grad_norm": 16.530229568481445, + "learning_rate": 4.400906735751295e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.9288030862808228, + "num_tokens": 6197418.0, + "step": 3459 + }, + { + "epoch": 0.5602785199578981, + "grad_norm": 29.025163650512695, + "learning_rate": 4.39928756476684e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.9032630920410156, + "num_tokens": 6199209.0, + "step": 3460 + }, + { + "epoch": 0.5604404501659784, + "grad_norm": 28.432897567749023, + "learning_rate": 4.397668393782384e-06, + "loss": 0.625, + "mean_token_accuracy": 0.9061205685138702, + "num_tokens": 6200999.0, + "step": 3461 + }, + { + "epoch": 0.5606023803740587, + "grad_norm": 15.997380256652832, + "learning_rate": 4.396049222797928e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9308949708938599, + "num_tokens": 6202786.0, + "step": 3462 + }, + { + "epoch": 0.560764310582139, + "grad_norm": 34.19835662841797, + "learning_rate": 4.394430051813472e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.8912703394889832, + "num_tokens": 6204575.0, + "step": 3463 + }, + { + "epoch": 0.5609262407902195, + "grad_norm": 20.301677703857422, + "learning_rate": 4.392810880829016e-06, + "loss": 0.5913, + "mean_token_accuracy": 0.9235082268714905, + "num_tokens": 6206362.0, + "step": 3464 + }, + { + "epoch": 0.5610881709982998, + "grad_norm": 15.669766426086426, + "learning_rate": 4.39119170984456e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.9304511249065399, + "num_tokens": 6208147.0, + "step": 3465 + }, + { + "epoch": 0.5612501012063801, + "grad_norm": 26.02204132080078, + "learning_rate": 4.389572538860104e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.9233216643333435, + "num_tokens": 6209946.0, + "step": 3466 + }, + { + "epoch": 0.5614120314144604, + "grad_norm": 24.204301834106445, + "learning_rate": 4.387953367875648e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.9177290201187134, + "num_tokens": 6211737.0, + "step": 3467 + }, + { + "epoch": 0.5615739616225407, + "grad_norm": 28.54127311706543, + "learning_rate": 4.386334196891192e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.9066092073917389, + "num_tokens": 6213526.0, + "step": 3468 + }, + { + "epoch": 0.561735891830621, + "grad_norm": 16.19049072265625, + "learning_rate": 4.384715025906736e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.9327787756919861, + "num_tokens": 6215321.0, + "step": 3469 + }, + { + "epoch": 0.5618978220387013, + "grad_norm": 31.826154708862305, + "learning_rate": 4.38309585492228e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.8938117325305939, + "num_tokens": 6217106.0, + "step": 3470 + }, + { + "epoch": 0.5620597522467816, + "grad_norm": 27.895505905151367, + "learning_rate": 4.381476683937824e-06, + "loss": 0.597, + "mean_token_accuracy": 0.9089610874652863, + "num_tokens": 6218893.0, + "step": 3471 + }, + { + "epoch": 0.5622216824548619, + "grad_norm": 21.718050003051758, + "learning_rate": 4.379857512953368e-06, + "loss": 0.5366, + "mean_token_accuracy": 0.9270833432674408, + "num_tokens": 6220693.0, + "step": 3472 + }, + { + "epoch": 0.5623836126629422, + "grad_norm": 30.873859405517578, + "learning_rate": 4.3782383419689124e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 6222485.0, + "step": 3473 + }, + { + "epoch": 0.5625455428710225, + "grad_norm": 32.97699737548828, + "learning_rate": 4.376619170984456e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.9064815044403076, + "num_tokens": 6224276.0, + "step": 3474 + }, + { + "epoch": 0.562707473079103, + "grad_norm": 25.58026695251465, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.5816, + "mean_token_accuracy": 0.9146021008491516, + "num_tokens": 6226080.0, + "step": 3475 + }, + { + "epoch": 0.5628694032871833, + "grad_norm": 25.022464752197266, + "learning_rate": 4.373380829015544e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.9065589904785156, + "num_tokens": 6227870.0, + "step": 3476 + }, + { + "epoch": 0.5630313334952636, + "grad_norm": 27.34522819519043, + "learning_rate": 4.3717616580310885e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.9079061448574066, + "num_tokens": 6229654.0, + "step": 3477 + }, + { + "epoch": 0.5631932637033439, + "grad_norm": 25.846668243408203, + "learning_rate": 4.370142487046632e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.9098980128765106, + "num_tokens": 6231444.0, + "step": 3478 + }, + { + "epoch": 0.5633551939114242, + "grad_norm": 25.948686599731445, + "learning_rate": 4.3685233160621765e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.9290320873260498, + "num_tokens": 6233239.0, + "step": 3479 + }, + { + "epoch": 0.5635171241195045, + "grad_norm": 26.796560287475586, + "learning_rate": 4.366904145077721e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.9089736044406891, + "num_tokens": 6235035.0, + "step": 3480 + }, + { + "epoch": 0.5636790543275848, + "grad_norm": 24.360271453857422, + "learning_rate": 4.3652849740932645e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.9210199117660522, + "num_tokens": 6236813.0, + "step": 3481 + }, + { + "epoch": 0.5638409845356651, + "grad_norm": 21.627534866333008, + "learning_rate": 4.363665803108809e-06, + "loss": 0.57, + "mean_token_accuracy": 0.9214646518230438, + "num_tokens": 6238592.0, + "step": 3482 + }, + { + "epoch": 0.5640029147437454, + "grad_norm": 39.132568359375, + "learning_rate": 4.3620466321243526e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.8964584767818451, + "num_tokens": 6240384.0, + "step": 3483 + }, + { + "epoch": 0.5641648449518257, + "grad_norm": 24.986648559570312, + "learning_rate": 4.360427461139897e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.9252451062202454, + "num_tokens": 6242176.0, + "step": 3484 + }, + { + "epoch": 0.564326775159906, + "grad_norm": 24.716169357299805, + "learning_rate": 4.358808290155441e-06, + "loss": 0.592, + "mean_token_accuracy": 0.9246070981025696, + "num_tokens": 6243980.0, + "step": 3485 + }, + { + "epoch": 0.5644887053679863, + "grad_norm": 19.94791030883789, + "learning_rate": 4.357189119170985e-06, + "loss": 0.5791, + "mean_token_accuracy": 0.915575385093689, + "num_tokens": 6245776.0, + "step": 3486 + }, + { + "epoch": 0.5646506355760668, + "grad_norm": 20.240619659423828, + "learning_rate": 4.355569948186529e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.9338199496269226, + "num_tokens": 6247560.0, + "step": 3487 + }, + { + "epoch": 0.5648125657841471, + "grad_norm": 29.253957748413086, + "learning_rate": 4.353950777202073e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.8954933881759644, + "num_tokens": 6249350.0, + "step": 3488 + }, + { + "epoch": 0.5649744959922274, + "grad_norm": 19.998720169067383, + "learning_rate": 4.352331606217617e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.9321800470352173, + "num_tokens": 6251142.0, + "step": 3489 + }, + { + "epoch": 0.5651364262003077, + "grad_norm": 13.082011222839355, + "learning_rate": 4.350712435233161e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.9304466843605042, + "num_tokens": 6252927.0, + "step": 3490 + }, + { + "epoch": 0.565298356408388, + "grad_norm": 20.88640785217285, + "learning_rate": 4.349093264248705e-06, + "loss": 0.5497, + "mean_token_accuracy": 0.9270833432674408, + "num_tokens": 6254727.0, + "step": 3491 + }, + { + "epoch": 0.5654602866164683, + "grad_norm": 18.046716690063477, + "learning_rate": 4.347474093264249e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.9285494089126587, + "num_tokens": 6256518.0, + "step": 3492 + }, + { + "epoch": 0.5656222168245486, + "grad_norm": 25.79833984375, + "learning_rate": 4.345854922279793e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.9246582388877869, + "num_tokens": 6258308.0, + "step": 3493 + }, + { + "epoch": 0.5657841470326289, + "grad_norm": 33.047569274902344, + "learning_rate": 4.344235751295337e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.9082609713077545, + "num_tokens": 6260104.0, + "step": 3494 + }, + { + "epoch": 0.5659460772407092, + "grad_norm": 25.285409927368164, + "learning_rate": 4.342616580310881e-06, + "loss": 0.569, + "mean_token_accuracy": 0.9205268919467926, + "num_tokens": 6261893.0, + "step": 3495 + }, + { + "epoch": 0.5661080074487895, + "grad_norm": 27.62537384033203, + "learning_rate": 4.340997409326425e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.9231898188591003, + "num_tokens": 6263691.0, + "step": 3496 + }, + { + "epoch": 0.5662699376568698, + "grad_norm": 31.901121139526367, + "learning_rate": 4.339378238341969e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.8994689583778381, + "num_tokens": 6265479.0, + "step": 3497 + }, + { + "epoch": 0.5664318678649503, + "grad_norm": 28.196399688720703, + "learning_rate": 4.337759067357513e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.9097830355167389, + "num_tokens": 6267268.0, + "step": 3498 + }, + { + "epoch": 0.5665937980730306, + "grad_norm": 18.343788146972656, + "learning_rate": 4.336139896373058e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.929848313331604, + "num_tokens": 6269065.0, + "step": 3499 + }, + { + "epoch": 0.5667557282811109, + "grad_norm": 19.563533782958984, + "learning_rate": 4.334520725388601e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.9246068596839905, + "num_tokens": 6270856.0, + "step": 3500 + }, + { + "epoch": 0.5669176584891912, + "grad_norm": 26.151309967041016, + "learning_rate": 4.332901554404146e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.9213137030601501, + "num_tokens": 6272645.0, + "step": 3501 + }, + { + "epoch": 0.5670795886972715, + "grad_norm": 22.066959381103516, + "learning_rate": 4.331282383419689e-06, + "loss": 0.5423, + "mean_token_accuracy": 0.9283071458339691, + "num_tokens": 6274436.0, + "step": 3502 + }, + { + "epoch": 0.5672415189053518, + "grad_norm": 34.8818473815918, + "learning_rate": 4.329663212435234e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.907052606344223, + "num_tokens": 6276226.0, + "step": 3503 + }, + { + "epoch": 0.5674034491134321, + "grad_norm": 27.884471893310547, + "learning_rate": 4.328044041450777e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.9047606289386749, + "num_tokens": 6278011.0, + "step": 3504 + }, + { + "epoch": 0.5675653793215124, + "grad_norm": 30.48784637451172, + "learning_rate": 4.326424870466322e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.9084407687187195, + "num_tokens": 6279805.0, + "step": 3505 + }, + { + "epoch": 0.5677273095295927, + "grad_norm": 24.371286392211914, + "learning_rate": 4.324805699481865e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.9187375009059906, + "num_tokens": 6281589.0, + "step": 3506 + }, + { + "epoch": 0.567889239737673, + "grad_norm": 19.476707458496094, + "learning_rate": 4.32318652849741e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.9298029541969299, + "num_tokens": 6283386.0, + "step": 3507 + }, + { + "epoch": 0.5680511699457533, + "grad_norm": 29.63750457763672, + "learning_rate": 4.321567357512953e-06, + "loss": 0.69, + "mean_token_accuracy": 0.9135036468505859, + "num_tokens": 6285175.0, + "step": 3508 + }, + { + "epoch": 0.5682131001538338, + "grad_norm": 25.0279483795166, + "learning_rate": 4.319948186528498e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.9130310118198395, + "num_tokens": 6286964.0, + "step": 3509 + }, + { + "epoch": 0.5683750303619141, + "grad_norm": 33.23414993286133, + "learning_rate": 4.318329015544041e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.8847853541374207, + "num_tokens": 6288752.0, + "step": 3510 + }, + { + "epoch": 0.5685369605699944, + "grad_norm": 16.57394027709961, + "learning_rate": 4.316709844559586e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.9356481730937958, + "num_tokens": 6290543.0, + "step": 3511 + }, + { + "epoch": 0.5686988907780747, + "grad_norm": 34.508941650390625, + "learning_rate": 4.315090673575129e-06, + "loss": 0.788, + "mean_token_accuracy": 0.9035409688949585, + "num_tokens": 6292335.0, + "step": 3512 + }, + { + "epoch": 0.568860820986155, + "grad_norm": 20.83152961730957, + "learning_rate": 4.313471502590674e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.9249590635299683, + "num_tokens": 6294127.0, + "step": 3513 + }, + { + "epoch": 0.5690227511942353, + "grad_norm": 25.568204879760742, + "learning_rate": 4.311852331606217e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.9015345275402069, + "num_tokens": 6295913.0, + "step": 3514 + }, + { + "epoch": 0.5691846814023156, + "grad_norm": 26.2885684967041, + "learning_rate": 4.310233160621762e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.9024864137172699, + "num_tokens": 6297703.0, + "step": 3515 + }, + { + "epoch": 0.5693466116103959, + "grad_norm": 26.928274154663086, + "learning_rate": 4.308613989637305e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.9094594419002533, + "num_tokens": 6299503.0, + "step": 3516 + }, + { + "epoch": 0.5695085418184762, + "grad_norm": 17.5095272064209, + "learning_rate": 4.30699481865285e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.930820107460022, + "num_tokens": 6301290.0, + "step": 3517 + }, + { + "epoch": 0.5696704720265565, + "grad_norm": 27.616561889648438, + "learning_rate": 4.305375647668394e-06, + "loss": 0.6353, + "mean_token_accuracy": 0.9178043603897095, + "num_tokens": 6303094.0, + "step": 3518 + }, + { + "epoch": 0.5698324022346368, + "grad_norm": 26.010149002075195, + "learning_rate": 4.303756476683938e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.9111787378787994, + "num_tokens": 6304888.0, + "step": 3519 + }, + { + "epoch": 0.5699943324427172, + "grad_norm": 31.747882843017578, + "learning_rate": 4.302137305699482e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9105128347873688, + "num_tokens": 6306680.0, + "step": 3520 + }, + { + "epoch": 0.5701562626507976, + "grad_norm": 33.44720458984375, + "learning_rate": 4.300518134715026e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.9088472425937653, + "num_tokens": 6308483.0, + "step": 3521 + }, + { + "epoch": 0.5703181928588779, + "grad_norm": 25.704713821411133, + "learning_rate": 4.29889896373057e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.9219229817390442, + "num_tokens": 6310277.0, + "step": 3522 + }, + { + "epoch": 0.5704801230669582, + "grad_norm": 29.204517364501953, + "learning_rate": 4.297279792746114e-06, + "loss": 0.6062, + "mean_token_accuracy": 0.9158229231834412, + "num_tokens": 6312073.0, + "step": 3523 + }, + { + "epoch": 0.5706420532750385, + "grad_norm": 21.058008193969727, + "learning_rate": 4.295660621761658e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.9250925779342651, + "num_tokens": 6313852.0, + "step": 3524 + }, + { + "epoch": 0.5708039834831188, + "grad_norm": 31.2490291595459, + "learning_rate": 4.294041450777203e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.9077968001365662, + "num_tokens": 6315646.0, + "step": 3525 + }, + { + "epoch": 0.5709659136911991, + "grad_norm": 17.613039016723633, + "learning_rate": 4.292422279792746e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.9283527433872223, + "num_tokens": 6317438.0, + "step": 3526 + }, + { + "epoch": 0.5711278438992794, + "grad_norm": 21.228052139282227, + "learning_rate": 4.290803108808291e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.9190140962600708, + "num_tokens": 6319234.0, + "step": 3527 + }, + { + "epoch": 0.5712897741073597, + "grad_norm": 34.676734924316406, + "learning_rate": 4.2891839378238344e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.9099942147731781, + "num_tokens": 6321034.0, + "step": 3528 + }, + { + "epoch": 0.57145170431544, + "grad_norm": 30.754173278808594, + "learning_rate": 4.287564766839379e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.9189201891422272, + "num_tokens": 6322841.0, + "step": 3529 + }, + { + "epoch": 0.5716136345235203, + "grad_norm": 21.248600006103516, + "learning_rate": 4.2859455958549225e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.9264705777168274, + "num_tokens": 6324625.0, + "step": 3530 + }, + { + "epoch": 0.5717755647316006, + "grad_norm": 22.567075729370117, + "learning_rate": 4.284326424870467e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.9266505837440491, + "num_tokens": 6326410.0, + "step": 3531 + }, + { + "epoch": 0.571937494939681, + "grad_norm": 22.986690521240234, + "learning_rate": 4.282707253886011e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.9095434844493866, + "num_tokens": 6328209.0, + "step": 3532 + }, + { + "epoch": 0.5720994251477614, + "grad_norm": 22.414566040039062, + "learning_rate": 4.281088082901555e-06, + "loss": 0.5829, + "mean_token_accuracy": 0.9151932895183563, + "num_tokens": 6330004.0, + "step": 3533 + }, + { + "epoch": 0.5722613553558417, + "grad_norm": 23.611675262451172, + "learning_rate": 4.279468911917099e-06, + "loss": 0.5981, + "mean_token_accuracy": 0.9219858050346375, + "num_tokens": 6331798.0, + "step": 3534 + }, + { + "epoch": 0.572423285563922, + "grad_norm": 16.818462371826172, + "learning_rate": 4.277849740932643e-06, + "loss": 0.491, + "mean_token_accuracy": 0.9310256242752075, + "num_tokens": 6333590.0, + "step": 3535 + }, + { + "epoch": 0.5725852157720023, + "grad_norm": 24.466516494750977, + "learning_rate": 4.276230569948187e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.9213924705982208, + "num_tokens": 6335382.0, + "step": 3536 + }, + { + "epoch": 0.5727471459800826, + "grad_norm": 31.244674682617188, + "learning_rate": 4.274611398963731e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.9117632210254669, + "num_tokens": 6337177.0, + "step": 3537 + }, + { + "epoch": 0.5729090761881629, + "grad_norm": 18.467866897583008, + "learning_rate": 4.272992227979275e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.9163140058517456, + "num_tokens": 6338964.0, + "step": 3538 + }, + { + "epoch": 0.5730710063962432, + "grad_norm": 21.92582130432129, + "learning_rate": 4.271373056994819e-06, + "loss": 0.557, + "mean_token_accuracy": 0.9218527674674988, + "num_tokens": 6340757.0, + "step": 3539 + }, + { + "epoch": 0.5732329366043235, + "grad_norm": 13.40427303314209, + "learning_rate": 4.2697538860103634e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.9328358173370361, + "num_tokens": 6342537.0, + "step": 3540 + }, + { + "epoch": 0.5733948668124038, + "grad_norm": 25.579177856445312, + "learning_rate": 4.268134715025907e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.9215202629566193, + "num_tokens": 6344331.0, + "step": 3541 + }, + { + "epoch": 0.5735567970204841, + "grad_norm": 21.58597755432129, + "learning_rate": 4.2665155440414515e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9265734255313873, + "num_tokens": 6346129.0, + "step": 3542 + }, + { + "epoch": 0.5737187272285645, + "grad_norm": 22.14926528930664, + "learning_rate": 4.264896373056995e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.9255260229110718, + "num_tokens": 6347910.0, + "step": 3543 + }, + { + "epoch": 0.5738806574366448, + "grad_norm": 26.317655563354492, + "learning_rate": 4.2632772020725395e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.913891464471817, + "num_tokens": 6349701.0, + "step": 3544 + }, + { + "epoch": 0.5740425876447252, + "grad_norm": 28.914287567138672, + "learning_rate": 4.261658031088083e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.9022997617721558, + "num_tokens": 6351490.0, + "step": 3545 + }, + { + "epoch": 0.5742045178528055, + "grad_norm": 18.756023406982422, + "learning_rate": 4.2600388601036275e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.9238255023956299, + "num_tokens": 6353276.0, + "step": 3546 + }, + { + "epoch": 0.5743664480608858, + "grad_norm": 32.8216438293457, + "learning_rate": 4.258419689119171e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.9124149680137634, + "num_tokens": 6355075.0, + "step": 3547 + }, + { + "epoch": 0.5745283782689661, + "grad_norm": 22.16446304321289, + "learning_rate": 4.2568005181347155e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.9232880473136902, + "num_tokens": 6356861.0, + "step": 3548 + }, + { + "epoch": 0.5746903084770464, + "grad_norm": 24.55711555480957, + "learning_rate": 4.255181347150259e-06, + "loss": 0.6189, + "mean_token_accuracy": 0.923116147518158, + "num_tokens": 6358646.0, + "step": 3549 + }, + { + "epoch": 0.5748522386851267, + "grad_norm": 29.500802993774414, + "learning_rate": 4.2535621761658036e-06, + "loss": 0.593, + "mean_token_accuracy": 0.9172928631305695, + "num_tokens": 6360437.0, + "step": 3550 + }, + { + "epoch": 0.575014168893207, + "grad_norm": 15.286648750305176, + "learning_rate": 4.251943005181348e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.9306266009807587, + "num_tokens": 6362223.0, + "step": 3551 + }, + { + "epoch": 0.5751760991012873, + "grad_norm": 25.840396881103516, + "learning_rate": 4.250323834196892e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.9208469092845917, + "num_tokens": 6364013.0, + "step": 3552 + }, + { + "epoch": 0.5753380293093676, + "grad_norm": 31.959102630615234, + "learning_rate": 4.248704663212436e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.9154411852359772, + "num_tokens": 6365797.0, + "step": 3553 + }, + { + "epoch": 0.575499959517448, + "grad_norm": 31.157106399536133, + "learning_rate": 4.24708549222798e-06, + "loss": 0.599, + "mean_token_accuracy": 0.9136288166046143, + "num_tokens": 6367587.0, + "step": 3554 + }, + { + "epoch": 0.5756618897255283, + "grad_norm": 24.798954010009766, + "learning_rate": 4.245466321243524e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.9196504652500153, + "num_tokens": 6369373.0, + "step": 3555 + }, + { + "epoch": 0.5758238199336086, + "grad_norm": 37.87710952758789, + "learning_rate": 4.243847150259068e-06, + "loss": 0.775, + "mean_token_accuracy": 0.9021505415439606, + "num_tokens": 6371175.0, + "step": 3556 + }, + { + "epoch": 0.575985750141689, + "grad_norm": 21.68278694152832, + "learning_rate": 4.242227979274612e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.9238015413284302, + "num_tokens": 6372976.0, + "step": 3557 + }, + { + "epoch": 0.5761476803497693, + "grad_norm": 36.19822311401367, + "learning_rate": 4.240608808290156e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.9045666456222534, + "num_tokens": 6374781.0, + "step": 3558 + }, + { + "epoch": 0.5763096105578496, + "grad_norm": 24.155576705932617, + "learning_rate": 4.2389896373057e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.9176002144813538, + "num_tokens": 6376572.0, + "step": 3559 + }, + { + "epoch": 0.5764715407659299, + "grad_norm": 32.75246810913086, + "learning_rate": 4.237370466321244e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.9073200225830078, + "num_tokens": 6378354.0, + "step": 3560 + }, + { + "epoch": 0.5766334709740102, + "grad_norm": 24.438316345214844, + "learning_rate": 4.235751295336788e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.9104297459125519, + "num_tokens": 6380145.0, + "step": 3561 + }, + { + "epoch": 0.5767954011820905, + "grad_norm": 34.70059585571289, + "learning_rate": 4.234132124352332e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.9185907244682312, + "num_tokens": 6381940.0, + "step": 3562 + }, + { + "epoch": 0.5769573313901708, + "grad_norm": 27.871532440185547, + "learning_rate": 4.232512953367876e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.9205244481563568, + "num_tokens": 6383729.0, + "step": 3563 + }, + { + "epoch": 0.5771192615982511, + "grad_norm": 28.184967041015625, + "learning_rate": 4.23089378238342e-06, + "loss": 0.5584, + "mean_token_accuracy": 0.923501193523407, + "num_tokens": 6385515.0, + "step": 3564 + }, + { + "epoch": 0.5772811918063314, + "grad_norm": 24.472591400146484, + "learning_rate": 4.229274611398964e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.9175926148891449, + "num_tokens": 6387306.0, + "step": 3565 + }, + { + "epoch": 0.5774431220144118, + "grad_norm": 40.821266174316406, + "learning_rate": 4.227655440414508e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.8963977694511414, + "num_tokens": 6389098.0, + "step": 3566 + }, + { + "epoch": 0.5776050522224921, + "grad_norm": 24.554664611816406, + "learning_rate": 4.226036269430052e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.9185742437839508, + "num_tokens": 6390892.0, + "step": 3567 + }, + { + "epoch": 0.5777669824305725, + "grad_norm": 30.932598114013672, + "learning_rate": 4.224417098445597e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9335784316062927, + "num_tokens": 6392675.0, + "step": 3568 + }, + { + "epoch": 0.5779289126386528, + "grad_norm": 32.80553436279297, + "learning_rate": 4.22279792746114e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.9106046259403229, + "num_tokens": 6394478.0, + "step": 3569 + }, + { + "epoch": 0.5780908428467331, + "grad_norm": 25.202241897583008, + "learning_rate": 4.221178756476685e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.9150778949260712, + "num_tokens": 6396261.0, + "step": 3570 + }, + { + "epoch": 0.5782527730548134, + "grad_norm": 30.182010650634766, + "learning_rate": 4.219559585492228e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.9107471108436584, + "num_tokens": 6398053.0, + "step": 3571 + }, + { + "epoch": 0.5784147032628937, + "grad_norm": 27.293420791625977, + "learning_rate": 4.217940414507773e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9282702505588531, + "num_tokens": 6399844.0, + "step": 3572 + }, + { + "epoch": 0.578576633470974, + "grad_norm": 22.497621536254883, + "learning_rate": 4.216321243523316e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.9323180317878723, + "num_tokens": 6401635.0, + "step": 3573 + }, + { + "epoch": 0.5787385636790543, + "grad_norm": 37.42566680908203, + "learning_rate": 4.214702072538861e-06, + "loss": 0.541, + "mean_token_accuracy": 0.9222372174263, + "num_tokens": 6403430.0, + "step": 3574 + }, + { + "epoch": 0.5789004938871346, + "grad_norm": 39.466102600097656, + "learning_rate": 4.213082901554404e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.9066676497459412, + "num_tokens": 6405229.0, + "step": 3575 + }, + { + "epoch": 0.5790624240952149, + "grad_norm": 39.57737731933594, + "learning_rate": 4.211463730569949e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.8958951830863953, + "num_tokens": 6407030.0, + "step": 3576 + }, + { + "epoch": 0.5792243543032953, + "grad_norm": 38.141170501708984, + "learning_rate": 4.209844559585492e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.8979581892490387, + "num_tokens": 6408827.0, + "step": 3577 + }, + { + "epoch": 0.5793862845113756, + "grad_norm": 28.011844635009766, + "learning_rate": 4.208225388601037e-06, + "loss": 0.6533, + "mean_token_accuracy": 0.9024396538734436, + "num_tokens": 6410615.0, + "step": 3578 + }, + { + "epoch": 0.579548214719456, + "grad_norm": 23.022579193115234, + "learning_rate": 4.20660621761658e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.9172374606132507, + "num_tokens": 6412417.0, + "step": 3579 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 24.619352340698242, + "learning_rate": 4.204987046632125e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.9265942871570587, + "num_tokens": 6414215.0, + "step": 3580 + }, + { + "epoch": 0.5798720751356166, + "grad_norm": 21.680452346801758, + "learning_rate": 4.203367875647668e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.9310199022293091, + "num_tokens": 6416002.0, + "step": 3581 + }, + { + "epoch": 0.5800340053436969, + "grad_norm": 30.229331970214844, + "learning_rate": 4.201748704663213e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.9115338325500488, + "num_tokens": 6417796.0, + "step": 3582 + }, + { + "epoch": 0.5801959355517772, + "grad_norm": 28.286911010742188, + "learning_rate": 4.200129533678756e-06, + "loss": 0.5877, + "mean_token_accuracy": 0.9098455607891083, + "num_tokens": 6419596.0, + "step": 3583 + }, + { + "epoch": 0.5803578657598575, + "grad_norm": 29.158273696899414, + "learning_rate": 4.198510362694301e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.9200698137283325, + "num_tokens": 6421383.0, + "step": 3584 + }, + { + "epoch": 0.5805197959679378, + "grad_norm": 32.44015884399414, + "learning_rate": 4.1968911917098444e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.9001736044883728, + "num_tokens": 6423167.0, + "step": 3585 + }, + { + "epoch": 0.5806817261760181, + "grad_norm": 23.043378829956055, + "learning_rate": 4.195272020725389e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.9283071458339691, + "num_tokens": 6424958.0, + "step": 3586 + }, + { + "epoch": 0.5808436563840984, + "grad_norm": 26.61720085144043, + "learning_rate": 4.193652849740933e-06, + "loss": 0.6098, + "mean_token_accuracy": 0.9155851304531097, + "num_tokens": 6426742.0, + "step": 3587 + }, + { + "epoch": 0.5810055865921788, + "grad_norm": 29.629486083984375, + "learning_rate": 4.192033678756477e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.9078240692615509, + "num_tokens": 6428535.0, + "step": 3588 + }, + { + "epoch": 0.5811675168002591, + "grad_norm": 23.47311782836914, + "learning_rate": 4.190414507772021e-06, + "loss": 0.5693, + "mean_token_accuracy": 0.915340930223465, + "num_tokens": 6430321.0, + "step": 3589 + }, + { + "epoch": 0.5813294470083394, + "grad_norm": 34.31197738647461, + "learning_rate": 4.188795336787565e-06, + "loss": 0.6128, + "mean_token_accuracy": 0.9081944525241852, + "num_tokens": 6432127.0, + "step": 3590 + }, + { + "epoch": 0.5814913772164197, + "grad_norm": 14.147363662719727, + "learning_rate": 4.187176165803109e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.9336529076099396, + "num_tokens": 6433910.0, + "step": 3591 + }, + { + "epoch": 0.5816533074245, + "grad_norm": 32.367034912109375, + "learning_rate": 4.185556994818653e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.9140287637710571, + "num_tokens": 6435701.0, + "step": 3592 + }, + { + "epoch": 0.5818152376325804, + "grad_norm": 26.745838165283203, + "learning_rate": 4.183937823834197e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.9092592597007751, + "num_tokens": 6437488.0, + "step": 3593 + }, + { + "epoch": 0.5819771678406607, + "grad_norm": 22.89157485961914, + "learning_rate": 4.182318652849741e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9296627044677734, + "num_tokens": 6439284.0, + "step": 3594 + }, + { + "epoch": 0.582139098048741, + "grad_norm": 32.1548957824707, + "learning_rate": 4.180699481865285e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.9018301069736481, + "num_tokens": 6441071.0, + "step": 3595 + }, + { + "epoch": 0.5823010282568213, + "grad_norm": 21.557720184326172, + "learning_rate": 4.179080310880829e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.9258066415786743, + "num_tokens": 6442866.0, + "step": 3596 + }, + { + "epoch": 0.5824629584649016, + "grad_norm": 30.659692764282227, + "learning_rate": 4.1774611398963734e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.9090061187744141, + "num_tokens": 6444652.0, + "step": 3597 + }, + { + "epoch": 0.5826248886729819, + "grad_norm": 26.35416030883789, + "learning_rate": 4.175841968911917e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.9180491268634796, + "num_tokens": 6446433.0, + "step": 3598 + }, + { + "epoch": 0.5827868188810622, + "grad_norm": 22.53962516784668, + "learning_rate": 4.1742227979274615e-06, + "loss": 0.659, + "mean_token_accuracy": 0.9160572290420532, + "num_tokens": 6448219.0, + "step": 3599 + }, + { + "epoch": 0.5829487490891426, + "grad_norm": 17.339033126831055, + "learning_rate": 4.172603626943005e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.9293256103992462, + "num_tokens": 6450000.0, + "step": 3600 + }, + { + "epoch": 0.5831106792972229, + "grad_norm": 24.7391300201416, + "learning_rate": 4.1709844559585495e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.9140287637710571, + "num_tokens": 6451801.0, + "step": 3601 + }, + { + "epoch": 0.5832726095053032, + "grad_norm": 24.247093200683594, + "learning_rate": 4.169365284974093e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.9239353239536285, + "num_tokens": 6453589.0, + "step": 3602 + }, + { + "epoch": 0.5834345397133835, + "grad_norm": 26.65892791748047, + "learning_rate": 4.1677461139896375e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.9120599031448364, + "num_tokens": 6455374.0, + "step": 3603 + }, + { + "epoch": 0.5835964699214639, + "grad_norm": 17.594148635864258, + "learning_rate": 4.166126943005181e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.9306569397449493, + "num_tokens": 6457160.0, + "step": 3604 + }, + { + "epoch": 0.5837584001295442, + "grad_norm": 17.84099769592285, + "learning_rate": 4.1645077720207256e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.9311594367027283, + "num_tokens": 6458948.0, + "step": 3605 + }, + { + "epoch": 0.5839203303376245, + "grad_norm": 32.53583908081055, + "learning_rate": 4.16288860103627e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.8943827748298645, + "num_tokens": 6460743.0, + "step": 3606 + }, + { + "epoch": 0.5840822605457048, + "grad_norm": 12.079717636108398, + "learning_rate": 4.161269430051814e-06, + "loss": 0.5557, + "mean_token_accuracy": 0.9253689646720886, + "num_tokens": 6462523.0, + "step": 3607 + }, + { + "epoch": 0.5842441907537851, + "grad_norm": 23.461336135864258, + "learning_rate": 4.159650259067358e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.9112319052219391, + "num_tokens": 6464305.0, + "step": 3608 + }, + { + "epoch": 0.5844061209618654, + "grad_norm": 25.60858154296875, + "learning_rate": 4.158031088082902e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.905193418264389, + "num_tokens": 6466098.0, + "step": 3609 + }, + { + "epoch": 0.5845680511699457, + "grad_norm": 26.739316940307617, + "learning_rate": 4.156411917098446e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.9072310924530029, + "num_tokens": 6467889.0, + "step": 3610 + }, + { + "epoch": 0.5847299813780261, + "grad_norm": 24.455007553100586, + "learning_rate": 4.15479274611399e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.9025605022907257, + "num_tokens": 6469678.0, + "step": 3611 + }, + { + "epoch": 0.5848919115861064, + "grad_norm": 30.066936492919922, + "learning_rate": 4.153173575129534e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.8987827003002167, + "num_tokens": 6471468.0, + "step": 3612 + }, + { + "epoch": 0.5850538417941867, + "grad_norm": 26.420854568481445, + "learning_rate": 4.151554404145078e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.9057773351669312, + "num_tokens": 6473256.0, + "step": 3613 + }, + { + "epoch": 0.585215772002267, + "grad_norm": 22.476177215576172, + "learning_rate": 4.149935233160622e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.9169968664646149, + "num_tokens": 6475045.0, + "step": 3614 + }, + { + "epoch": 0.5853777022103474, + "grad_norm": 27.663366317749023, + "learning_rate": 4.148316062176166e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.9119718372821808, + "num_tokens": 6476841.0, + "step": 3615 + }, + { + "epoch": 0.5855396324184277, + "grad_norm": 27.441537857055664, + "learning_rate": 4.14669689119171e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.9127962291240692, + "num_tokens": 6478628.0, + "step": 3616 + }, + { + "epoch": 0.585701562626508, + "grad_norm": 16.526952743530273, + "learning_rate": 4.145077720207254e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.929679811000824, + "num_tokens": 6480425.0, + "step": 3617 + }, + { + "epoch": 0.5858634928345883, + "grad_norm": 25.384193420410156, + "learning_rate": 4.143458549222798e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.9155879616737366, + "num_tokens": 6482221.0, + "step": 3618 + }, + { + "epoch": 0.5860254230426686, + "grad_norm": 22.535831451416016, + "learning_rate": 4.141839378238342e-06, + "loss": 0.5873, + "mean_token_accuracy": 0.922939658164978, + "num_tokens": 6484005.0, + "step": 3619 + }, + { + "epoch": 0.5861873532507489, + "grad_norm": 31.863630294799805, + "learning_rate": 4.140220207253887e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.9001617133617401, + "num_tokens": 6485807.0, + "step": 3620 + }, + { + "epoch": 0.5863492834588292, + "grad_norm": 29.700624465942383, + "learning_rate": 4.138601036269431e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.9048641622066498, + "num_tokens": 6487592.0, + "step": 3621 + }, + { + "epoch": 0.5865112136669096, + "grad_norm": 21.672607421875, + "learning_rate": 4.136981865284975e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.920273095369339, + "num_tokens": 6489380.0, + "step": 3622 + }, + { + "epoch": 0.5866731438749899, + "grad_norm": 28.994285583496094, + "learning_rate": 4.135362694300519e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.9087194800376892, + "num_tokens": 6491165.0, + "step": 3623 + }, + { + "epoch": 0.5868350740830702, + "grad_norm": 17.27082633972168, + "learning_rate": 4.133743523316063e-06, + "loss": 0.489, + "mean_token_accuracy": 0.9359357357025146, + "num_tokens": 6492958.0, + "step": 3624 + }, + { + "epoch": 0.5869970042911505, + "grad_norm": 23.989273071289062, + "learning_rate": 4.132124352331607e-06, + "loss": 0.5903, + "mean_token_accuracy": 0.919911116361618, + "num_tokens": 6494745.0, + "step": 3625 + }, + { + "epoch": 0.5871589344992308, + "grad_norm": 33.401512145996094, + "learning_rate": 4.130505181347151e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.9052895903587341, + "num_tokens": 6496542.0, + "step": 3626 + }, + { + "epoch": 0.5873208647073112, + "grad_norm": 15.182997703552246, + "learning_rate": 4.128886010362695e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.9323708117008209, + "num_tokens": 6498335.0, + "step": 3627 + }, + { + "epoch": 0.5874827949153915, + "grad_norm": 35.03440856933594, + "learning_rate": 4.127266839378239e-06, + "loss": 1.001, + "mean_token_accuracy": 0.8903170228004456, + "num_tokens": 6500128.0, + "step": 3628 + }, + { + "epoch": 0.5876447251234718, + "grad_norm": 29.223005294799805, + "learning_rate": 4.125647668393783e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.9097620248794556, + "num_tokens": 6501927.0, + "step": 3629 + }, + { + "epoch": 0.5878066553315521, + "grad_norm": 29.118810653686523, + "learning_rate": 4.124028497409327e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.9126032292842865, + "num_tokens": 6503713.0, + "step": 3630 + }, + { + "epoch": 0.5879685855396324, + "grad_norm": 23.20722770690918, + "learning_rate": 4.122409326424871e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.921798974275589, + "num_tokens": 6505494.0, + "step": 3631 + }, + { + "epoch": 0.5881305157477127, + "grad_norm": 14.8585205078125, + "learning_rate": 4.120790155440415e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.9293177723884583, + "num_tokens": 6507275.0, + "step": 3632 + }, + { + "epoch": 0.5882924459557931, + "grad_norm": 27.070327758789062, + "learning_rate": 4.119170984455959e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.916402131319046, + "num_tokens": 6509062.0, + "step": 3633 + }, + { + "epoch": 0.5884543761638734, + "grad_norm": 23.33502769470215, + "learning_rate": 4.117551813471503e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.9244987666606903, + "num_tokens": 6510851.0, + "step": 3634 + }, + { + "epoch": 0.5886163063719537, + "grad_norm": 27.617918014526367, + "learning_rate": 4.115932642487047e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.9107434451580048, + "num_tokens": 6512643.0, + "step": 3635 + }, + { + "epoch": 0.588778236580034, + "grad_norm": 33.694908142089844, + "learning_rate": 4.114313471502591e-06, + "loss": 0.621, + "mean_token_accuracy": 0.9089947044849396, + "num_tokens": 6514430.0, + "step": 3636 + }, + { + "epoch": 0.5889401667881143, + "grad_norm": 14.677541732788086, + "learning_rate": 4.112694300518135e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.933833509683609, + "num_tokens": 6516229.0, + "step": 3637 + }, + { + "epoch": 0.5891020969961946, + "grad_norm": 25.318374633789062, + "learning_rate": 4.111075129533679e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.9170559346675873, + "num_tokens": 6518006.0, + "step": 3638 + }, + { + "epoch": 0.589264027204275, + "grad_norm": 29.502891540527344, + "learning_rate": 4.109455958549224e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.9103894531726837, + "num_tokens": 6519819.0, + "step": 3639 + }, + { + "epoch": 0.5894259574123553, + "grad_norm": 31.912513732910156, + "learning_rate": 4.107836787564767e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.8946863114833832, + "num_tokens": 6521605.0, + "step": 3640 + }, + { + "epoch": 0.5895878876204356, + "grad_norm": 20.212072372436523, + "learning_rate": 4.106217616580312e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.9313632845878601, + "num_tokens": 6523393.0, + "step": 3641 + }, + { + "epoch": 0.5897498178285159, + "grad_norm": 30.334707260131836, + "learning_rate": 4.104598445595855e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.9060223400592804, + "num_tokens": 6525192.0, + "step": 3642 + }, + { + "epoch": 0.5899117480365962, + "grad_norm": 43.30109405517578, + "learning_rate": 4.1029792746114e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.8868494629859924, + "num_tokens": 6526987.0, + "step": 3643 + }, + { + "epoch": 0.5900736782446765, + "grad_norm": 29.774261474609375, + "learning_rate": 4.101360103626943e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.9091590940952301, + "num_tokens": 6528782.0, + "step": 3644 + }, + { + "epoch": 0.5902356084527569, + "grad_norm": 21.97923469543457, + "learning_rate": 4.099740932642488e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.9142021834850311, + "num_tokens": 6530562.0, + "step": 3645 + }, + { + "epoch": 0.5903975386608372, + "grad_norm": 26.86726188659668, + "learning_rate": 4.098121761658031e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.9290780425071716, + "num_tokens": 6532356.0, + "step": 3646 + }, + { + "epoch": 0.5905594688689175, + "grad_norm": 19.118139266967773, + "learning_rate": 4.096502590673576e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.9302109181880951, + "num_tokens": 6534154.0, + "step": 3647 + }, + { + "epoch": 0.5907213990769978, + "grad_norm": 23.16008949279785, + "learning_rate": 4.094883419689119e-06, + "loss": 0.591, + "mean_token_accuracy": 0.9232684075832367, + "num_tokens": 6535938.0, + "step": 3648 + }, + { + "epoch": 0.5908833292850781, + "grad_norm": 23.8983154296875, + "learning_rate": 4.093264248704664e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.9138889014720917, + "num_tokens": 6537717.0, + "step": 3649 + }, + { + "epoch": 0.5910452594931584, + "grad_norm": 27.30613136291504, + "learning_rate": 4.091645077720207e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.9282497465610504, + "num_tokens": 6539507.0, + "step": 3650 + }, + { + "epoch": 0.5912071897012388, + "grad_norm": 26.831071853637695, + "learning_rate": 4.090025906735752e-06, + "loss": 0.682, + "mean_token_accuracy": 0.905089259147644, + "num_tokens": 6541293.0, + "step": 3651 + }, + { + "epoch": 0.5913691199093191, + "grad_norm": 22.436786651611328, + "learning_rate": 4.0884067357512954e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.9113799929618835, + "num_tokens": 6543076.0, + "step": 3652 + }, + { + "epoch": 0.5915310501173994, + "grad_norm": 27.08148765563965, + "learning_rate": 4.08678756476684e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 6544865.0, + "step": 3653 + }, + { + "epoch": 0.5916929803254797, + "grad_norm": 32.13164138793945, + "learning_rate": 4.0851683937823835e-06, + "loss": 0.6539, + "mean_token_accuracy": 0.9135036468505859, + "num_tokens": 6546654.0, + "step": 3654 + }, + { + "epoch": 0.59185491053356, + "grad_norm": 21.681386947631836, + "learning_rate": 4.083549222797928e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.9292744994163513, + "num_tokens": 6548435.0, + "step": 3655 + }, + { + "epoch": 0.5920168407416404, + "grad_norm": 22.89409065246582, + "learning_rate": 4.0819300518134715e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.9189726412296295, + "num_tokens": 6550218.0, + "step": 3656 + }, + { + "epoch": 0.5921787709497207, + "grad_norm": 21.351774215698242, + "learning_rate": 4.080310880829016e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9235875904560089, + "num_tokens": 6552005.0, + "step": 3657 + }, + { + "epoch": 0.592340701157801, + "grad_norm": 31.21608543395996, + "learning_rate": 4.07869170984456e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.9150060415267944, + "num_tokens": 6553799.0, + "step": 3658 + }, + { + "epoch": 0.5925026313658813, + "grad_norm": 26.250547409057617, + "learning_rate": 4.077072538860104e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.9205673635005951, + "num_tokens": 6555602.0, + "step": 3659 + }, + { + "epoch": 0.5926645615739616, + "grad_norm": 25.410783767700195, + "learning_rate": 4.075453367875648e-06, + "loss": 0.5944, + "mean_token_accuracy": 0.9230892956256866, + "num_tokens": 6557387.0, + "step": 3660 + }, + { + "epoch": 0.5928264917820419, + "grad_norm": 27.4699649810791, + "learning_rate": 4.073834196891192e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.9270073175430298, + "num_tokens": 6559173.0, + "step": 3661 + }, + { + "epoch": 0.5929884219901222, + "grad_norm": 26.440393447875977, + "learning_rate": 4.072215025906736e-06, + "loss": 0.6325, + "mean_token_accuracy": 0.9184104800224304, + "num_tokens": 6560967.0, + "step": 3662 + }, + { + "epoch": 0.5931503521982026, + "grad_norm": 28.47506332397461, + "learning_rate": 4.07059585492228e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.9164723455905914, + "num_tokens": 6562766.0, + "step": 3663 + }, + { + "epoch": 0.5933122824062829, + "grad_norm": 33.27669143676758, + "learning_rate": 4.0689766839378244e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 6564556.0, + "step": 3664 + }, + { + "epoch": 0.5934742126143632, + "grad_norm": 28.420377731323242, + "learning_rate": 4.067357512953368e-06, + "loss": 0.571, + "mean_token_accuracy": 0.9183273017406464, + "num_tokens": 6566350.0, + "step": 3665 + }, + { + "epoch": 0.5936361428224435, + "grad_norm": 29.539932250976562, + "learning_rate": 4.0657383419689125e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.9178382456302643, + "num_tokens": 6568144.0, + "step": 3666 + }, + { + "epoch": 0.5937980730305239, + "grad_norm": 30.816614151000977, + "learning_rate": 4.064119170984456e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.9115451872348785, + "num_tokens": 6569927.0, + "step": 3667 + }, + { + "epoch": 0.5939600032386042, + "grad_norm": 22.580835342407227, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.9306755661964417, + "num_tokens": 6571725.0, + "step": 3668 + }, + { + "epoch": 0.5941219334466845, + "grad_norm": 21.536479949951172, + "learning_rate": 4.060880829015544e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9228707551956177, + "num_tokens": 6573521.0, + "step": 3669 + }, + { + "epoch": 0.5942838636547648, + "grad_norm": 31.76894760131836, + "learning_rate": 4.0592616580310885e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9217016398906708, + "num_tokens": 6575326.0, + "step": 3670 + }, + { + "epoch": 0.5944457938628451, + "grad_norm": 27.422283172607422, + "learning_rate": 4.057642487046632e-06, + "loss": 0.5867, + "mean_token_accuracy": 0.9133181571960449, + "num_tokens": 6577116.0, + "step": 3671 + }, + { + "epoch": 0.5946077240709254, + "grad_norm": 20.023258209228516, + "learning_rate": 4.0560233160621765e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.9366661608219147, + "num_tokens": 6578912.0, + "step": 3672 + }, + { + "epoch": 0.5947696542790057, + "grad_norm": 33.03738784790039, + "learning_rate": 4.05440414507772e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.9132374227046967, + "num_tokens": 6580713.0, + "step": 3673 + }, + { + "epoch": 0.594931584487086, + "grad_norm": 26.31281280517578, + "learning_rate": 4.0527849740932646e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.9239332377910614, + "num_tokens": 6582515.0, + "step": 3674 + }, + { + "epoch": 0.5950935146951664, + "grad_norm": 21.916072845458984, + "learning_rate": 4.051165803108808e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.9145643413066864, + "num_tokens": 6584308.0, + "step": 3675 + }, + { + "epoch": 0.5952554449032467, + "grad_norm": 16.831350326538086, + "learning_rate": 4.049546632124353e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.9287993907928467, + "num_tokens": 6586101.0, + "step": 3676 + }, + { + "epoch": 0.595417375111327, + "grad_norm": 22.027729034423828, + "learning_rate": 4.047927461139897e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.924933671951294, + "num_tokens": 6587893.0, + "step": 3677 + }, + { + "epoch": 0.5955793053194073, + "grad_norm": 24.40011978149414, + "learning_rate": 4.046308290155441e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.918353796005249, + "num_tokens": 6589674.0, + "step": 3678 + }, + { + "epoch": 0.5957412355274877, + "grad_norm": 31.515626907348633, + "learning_rate": 4.044689119170985e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.9161653220653534, + "num_tokens": 6591472.0, + "step": 3679 + }, + { + "epoch": 0.595903165735568, + "grad_norm": 22.767898559570312, + "learning_rate": 4.043069948186529e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.9255845248699188, + "num_tokens": 6593267.0, + "step": 3680 + }, + { + "epoch": 0.5960650959436483, + "grad_norm": 36.727088928222656, + "learning_rate": 4.041450777202073e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.9032207727432251, + "num_tokens": 6595057.0, + "step": 3681 + }, + { + "epoch": 0.5962270261517286, + "grad_norm": 24.581951141357422, + "learning_rate": 4.039831606217617e-06, + "loss": 0.5469, + "mean_token_accuracy": 0.913891464471817, + "num_tokens": 6596848.0, + "step": 3682 + }, + { + "epoch": 0.5963889563598089, + "grad_norm": 40.105064392089844, + "learning_rate": 4.038212435233161e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.9013539850711823, + "num_tokens": 6598644.0, + "step": 3683 + }, + { + "epoch": 0.5965508865678892, + "grad_norm": 19.64065170288086, + "learning_rate": 4.036593264248705e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9225490093231201, + "num_tokens": 6600427.0, + "step": 3684 + }, + { + "epoch": 0.5967128167759695, + "grad_norm": 27.96965217590332, + "learning_rate": 4.034974093264249e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 6602220.0, + "step": 3685 + }, + { + "epoch": 0.5968747469840499, + "grad_norm": 32.57131576538086, + "learning_rate": 4.033354922279793e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.8995442986488342, + "num_tokens": 6604010.0, + "step": 3686 + }, + { + "epoch": 0.5970366771921302, + "grad_norm": 32.82229232788086, + "learning_rate": 4.031735751295337e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.9109405279159546, + "num_tokens": 6605803.0, + "step": 3687 + }, + { + "epoch": 0.5971986074002105, + "grad_norm": 21.351390838623047, + "learning_rate": 4.030116580310881e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.9186631441116333, + "num_tokens": 6607600.0, + "step": 3688 + }, + { + "epoch": 0.5973605376082908, + "grad_norm": 30.04303741455078, + "learning_rate": 4.028497409326425e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.9051948189735413, + "num_tokens": 6609406.0, + "step": 3689 + }, + { + "epoch": 0.5975224678163712, + "grad_norm": 25.815378189086914, + "learning_rate": 4.026878238341969e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.9198676943778992, + "num_tokens": 6611192.0, + "step": 3690 + }, + { + "epoch": 0.5976843980244515, + "grad_norm": 30.091291427612305, + "learning_rate": 4.025259067357513e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.9119961261749268, + "num_tokens": 6612985.0, + "step": 3691 + }, + { + "epoch": 0.5978463282325318, + "grad_norm": 34.82086944580078, + "learning_rate": 4.023639896373057e-06, + "loss": 0.905, + "mean_token_accuracy": 0.8985185027122498, + "num_tokens": 6614782.0, + "step": 3692 + }, + { + "epoch": 0.5980082584406121, + "grad_norm": 36.073421478271484, + "learning_rate": 4.022020725388601e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.9118259847164154, + "num_tokens": 6616588.0, + "step": 3693 + }, + { + "epoch": 0.5981701886486924, + "grad_norm": 32.50791549682617, + "learning_rate": 4.020401554404146e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.9113631248474121, + "num_tokens": 6618394.0, + "step": 3694 + }, + { + "epoch": 0.5983321188567727, + "grad_norm": 28.352083206176758, + "learning_rate": 4.018782383419689e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.9162254929542542, + "num_tokens": 6620192.0, + "step": 3695 + }, + { + "epoch": 0.598494049064853, + "grad_norm": 30.50720977783203, + "learning_rate": 4.017163212435234e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.9082609713077545, + "num_tokens": 6621988.0, + "step": 3696 + }, + { + "epoch": 0.5986559792729333, + "grad_norm": 23.101003646850586, + "learning_rate": 4.015544041450777e-06, + "loss": 0.518, + "mean_token_accuracy": 0.9206465184688568, + "num_tokens": 6623777.0, + "step": 3697 + }, + { + "epoch": 0.5988179094810137, + "grad_norm": 21.81392478942871, + "learning_rate": 4.013924870466322e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.920639842748642, + "num_tokens": 6625567.0, + "step": 3698 + }, + { + "epoch": 0.598979839689094, + "grad_norm": 23.16607666015625, + "learning_rate": 4.012305699481865e-06, + "loss": 0.6186, + "mean_token_accuracy": 0.918313592672348, + "num_tokens": 6627349.0, + "step": 3699 + }, + { + "epoch": 0.5991417698971743, + "grad_norm": 33.53606414794922, + "learning_rate": 4.01068652849741e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.8991917371749878, + "num_tokens": 6629128.0, + "step": 3700 + }, + { + "epoch": 0.5993037001052547, + "grad_norm": 28.027090072631836, + "learning_rate": 4.009067357512953e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.9151678681373596, + "num_tokens": 6630911.0, + "step": 3701 + }, + { + "epoch": 0.599465630313335, + "grad_norm": 15.572281837463379, + "learning_rate": 4.007448186528498e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.9324262738227844, + "num_tokens": 6632704.0, + "step": 3702 + }, + { + "epoch": 0.5996275605214153, + "grad_norm": 36.786128997802734, + "learning_rate": 4.005829015544041e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.9109029471874237, + "num_tokens": 6634496.0, + "step": 3703 + }, + { + "epoch": 0.5997894907294956, + "grad_norm": 27.85297393798828, + "learning_rate": 4.004209844559586e-06, + "loss": 0.5826, + "mean_token_accuracy": 0.9171826541423798, + "num_tokens": 6636296.0, + "step": 3704 + }, + { + "epoch": 0.5999514209375759, + "grad_norm": 14.017465591430664, + "learning_rate": 4.002590673575129e-06, + "loss": 0.473, + "mean_token_accuracy": 0.93160080909729, + "num_tokens": 6638086.0, + "step": 3705 + }, + { + "epoch": 0.6001133511456562, + "grad_norm": 17.05233383178711, + "learning_rate": 4.000971502590674e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9260194301605225, + "num_tokens": 6639868.0, + "step": 3706 + }, + { + "epoch": 0.6002752813537365, + "grad_norm": 33.7879638671875, + "learning_rate": 3.9993523316062174e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.9114553928375244, + "num_tokens": 6641672.0, + "step": 3707 + }, + { + "epoch": 0.6004372115618168, + "grad_norm": 28.628454208374023, + "learning_rate": 3.997733160621762e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.9168866872787476, + "num_tokens": 6643462.0, + "step": 3708 + }, + { + "epoch": 0.6005991417698971, + "grad_norm": 31.869678497314453, + "learning_rate": 3.9961139896373055e-06, + "loss": 0.64, + "mean_token_accuracy": 0.9047606289386749, + "num_tokens": 6645247.0, + "step": 3709 + }, + { + "epoch": 0.6007610719779775, + "grad_norm": 19.07252311706543, + "learning_rate": 3.99449481865285e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.9226754009723663, + "num_tokens": 6647031.0, + "step": 3710 + }, + { + "epoch": 0.6009230021860578, + "grad_norm": 26.679035186767578, + "learning_rate": 3.9928756476683935e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.9185742437839508, + "num_tokens": 6648825.0, + "step": 3711 + }, + { + "epoch": 0.6010849323941381, + "grad_norm": 38.0206413269043, + "learning_rate": 3.991256476683938e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.898194432258606, + "num_tokens": 6650631.0, + "step": 3712 + }, + { + "epoch": 0.6012468626022185, + "grad_norm": 33.41487121582031, + "learning_rate": 3.989637305699482e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.9028554856777191, + "num_tokens": 6652431.0, + "step": 3713 + }, + { + "epoch": 0.6014087928102988, + "grad_norm": 26.38525390625, + "learning_rate": 3.988018134715026e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.9182541370391846, + "num_tokens": 6654224.0, + "step": 3714 + }, + { + "epoch": 0.6015707230183791, + "grad_norm": 25.297746658325195, + "learning_rate": 3.98639896373057e-06, + "loss": 0.6568, + "mean_token_accuracy": 0.9181488752365112, + "num_tokens": 6656006.0, + "step": 3715 + }, + { + "epoch": 0.6017326532264594, + "grad_norm": 37.08112335205078, + "learning_rate": 3.984779792746114e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.9148701429367065, + "num_tokens": 6657810.0, + "step": 3716 + }, + { + "epoch": 0.6018945834345397, + "grad_norm": 24.39055824279785, + "learning_rate": 3.983160621761658e-06, + "loss": 0.587, + "mean_token_accuracy": 0.9113828539848328, + "num_tokens": 6659604.0, + "step": 3717 + }, + { + "epoch": 0.60205651364262, + "grad_norm": 28.963029861450195, + "learning_rate": 3.981541450777203e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.9030910730361938, + "num_tokens": 6661384.0, + "step": 3718 + }, + { + "epoch": 0.6022184438507003, + "grad_norm": 34.459083557128906, + "learning_rate": 3.9799222797927464e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.9078014194965363, + "num_tokens": 6663178.0, + "step": 3719 + }, + { + "epoch": 0.6023803740587806, + "grad_norm": 21.510969161987305, + "learning_rate": 3.978303108808291e-06, + "loss": 0.6008, + "mean_token_accuracy": 0.916793555021286, + "num_tokens": 6664966.0, + "step": 3720 + }, + { + "epoch": 0.602542304266861, + "grad_norm": 24.454227447509766, + "learning_rate": 3.9766839378238345e-06, + "loss": 0.5782, + "mean_token_accuracy": 0.9210526347160339, + "num_tokens": 6666744.0, + "step": 3721 + }, + { + "epoch": 0.6027042344749413, + "grad_norm": 24.702714920043945, + "learning_rate": 3.975064766839379e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.9157062470912933, + "num_tokens": 6668541.0, + "step": 3722 + }, + { + "epoch": 0.6028661646830216, + "grad_norm": 26.115219116210938, + "learning_rate": 3.9734455958549225e-06, + "loss": 0.649, + "mean_token_accuracy": 0.9074974656105042, + "num_tokens": 6670334.0, + "step": 3723 + }, + { + "epoch": 0.603028094891102, + "grad_norm": 33.99589538574219, + "learning_rate": 3.971826424870467e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.90386563539505, + "num_tokens": 6672127.0, + "step": 3724 + }, + { + "epoch": 0.6031900250991823, + "grad_norm": 20.441484451293945, + "learning_rate": 3.9702072538860105e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.9299903213977814, + "num_tokens": 6673912.0, + "step": 3725 + }, + { + "epoch": 0.6033519553072626, + "grad_norm": 36.506874084472656, + "learning_rate": 3.968588082901555e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.8910548090934753, + "num_tokens": 6675699.0, + "step": 3726 + }, + { + "epoch": 0.6035138855153429, + "grad_norm": 26.96748924255371, + "learning_rate": 3.966968911917099e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.907131016254425, + "num_tokens": 6677480.0, + "step": 3727 + }, + { + "epoch": 0.6036758157234232, + "grad_norm": 24.486268997192383, + "learning_rate": 3.965349740932643e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.9217728674411774, + "num_tokens": 6679272.0, + "step": 3728 + }, + { + "epoch": 0.6038377459315035, + "grad_norm": 34.572322845458984, + "learning_rate": 3.963730569948187e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.9017195999622345, + "num_tokens": 6681059.0, + "step": 3729 + }, + { + "epoch": 0.6039996761395838, + "grad_norm": 26.056943893432617, + "learning_rate": 3.962111398963731e-06, + "loss": 0.5463, + "mean_token_accuracy": 0.9211378395557404, + "num_tokens": 6682850.0, + "step": 3730 + }, + { + "epoch": 0.6041616063476641, + "grad_norm": 27.31075668334961, + "learning_rate": 3.9604922279792754e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.9054268598556519, + "num_tokens": 6684637.0, + "step": 3731 + }, + { + "epoch": 0.6043235365557444, + "grad_norm": 32.93107604980469, + "learning_rate": 3.958873056994819e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.9107471108436584, + "num_tokens": 6686429.0, + "step": 3732 + }, + { + "epoch": 0.6044854667638248, + "grad_norm": 29.372608184814453, + "learning_rate": 3.9572538860103635e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.8982490599155426, + "num_tokens": 6688216.0, + "step": 3733 + }, + { + "epoch": 0.6046473969719051, + "grad_norm": 25.47175407409668, + "learning_rate": 3.955634715025907e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.9180261492729187, + "num_tokens": 6690009.0, + "step": 3734 + }, + { + "epoch": 0.6048093271799855, + "grad_norm": 28.501943588256836, + "learning_rate": 3.9540155440414515e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.9119718372821808, + "num_tokens": 6691805.0, + "step": 3735 + }, + { + "epoch": 0.6049712573880658, + "grad_norm": 32.363433837890625, + "learning_rate": 3.952396373056995e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.9058353900909424, + "num_tokens": 6693595.0, + "step": 3736 + }, + { + "epoch": 0.6051331875961461, + "grad_norm": 38.678653717041016, + "learning_rate": 3.9507772020725395e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.9062816500663757, + "num_tokens": 6695395.0, + "step": 3737 + }, + { + "epoch": 0.6052951178042264, + "grad_norm": 26.37191390991211, + "learning_rate": 3.949158031088083e-06, + "loss": 0.603, + "mean_token_accuracy": 0.9134595096111298, + "num_tokens": 6697184.0, + "step": 3738 + }, + { + "epoch": 0.6054570480123067, + "grad_norm": 31.75313377380371, + "learning_rate": 3.9475388601036275e-06, + "loss": 0.742, + "mean_token_accuracy": 0.9120039641857147, + "num_tokens": 6698980.0, + "step": 3739 + }, + { + "epoch": 0.605618978220387, + "grad_norm": 17.165111541748047, + "learning_rate": 3.945919689119171e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.9313608109951019, + "num_tokens": 6700769.0, + "step": 3740 + }, + { + "epoch": 0.6057809084284673, + "grad_norm": 34.97706985473633, + "learning_rate": 3.9443005181347156e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.9084957540035248, + "num_tokens": 6702565.0, + "step": 3741 + }, + { + "epoch": 0.6059428386365476, + "grad_norm": 26.550935745239258, + "learning_rate": 3.942681347150259e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.9218875765800476, + "num_tokens": 6704359.0, + "step": 3742 + }, + { + "epoch": 0.6061047688446279, + "grad_norm": 31.897624969482422, + "learning_rate": 3.941062176165804e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.9066407978534698, + "num_tokens": 6706151.0, + "step": 3743 + }, + { + "epoch": 0.6062666990527082, + "grad_norm": 36.83315658569336, + "learning_rate": 3.939443005181347e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.8851260840892792, + "num_tokens": 6707942.0, + "step": 3744 + }, + { + "epoch": 0.6064286292607886, + "grad_norm": 23.051773071289062, + "learning_rate": 3.937823834196892e-06, + "loss": 0.553, + "mean_token_accuracy": 0.9232993125915527, + "num_tokens": 6709741.0, + "step": 3745 + }, + { + "epoch": 0.606590559468869, + "grad_norm": 28.415992736816406, + "learning_rate": 3.936204663212436e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9119967818260193, + "num_tokens": 6711526.0, + "step": 3746 + }, + { + "epoch": 0.6067524896769493, + "grad_norm": 32.81734848022461, + "learning_rate": 3.93458549222798e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.9024762809276581, + "num_tokens": 6713314.0, + "step": 3747 + }, + { + "epoch": 0.6069144198850296, + "grad_norm": 22.28242301940918, + "learning_rate": 3.932966321243524e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.9215686321258545, + "num_tokens": 6715106.0, + "step": 3748 + }, + { + "epoch": 0.6070763500931099, + "grad_norm": 21.49435043334961, + "learning_rate": 3.931347150259068e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.9236485958099365, + "num_tokens": 6716893.0, + "step": 3749 + }, + { + "epoch": 0.6072382803011902, + "grad_norm": 27.520023345947266, + "learning_rate": 3.929727979274612e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.9122975468635559, + "num_tokens": 6718679.0, + "step": 3750 + }, + { + "epoch": 0.6074002105092705, + "grad_norm": 31.730144500732422, + "learning_rate": 3.928108808290156e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.9118930697441101, + "num_tokens": 6720475.0, + "step": 3751 + }, + { + "epoch": 0.6075621407173508, + "grad_norm": 29.855321884155273, + "learning_rate": 3.9264896373057e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.9167623519897461, + "num_tokens": 6722264.0, + "step": 3752 + }, + { + "epoch": 0.6077240709254311, + "grad_norm": 30.293182373046875, + "learning_rate": 3.924870466321244e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.9075596034526825, + "num_tokens": 6724056.0, + "step": 3753 + }, + { + "epoch": 0.6078860011335114, + "grad_norm": 34.557804107666016, + "learning_rate": 3.923251295336788e-06, + "loss": 0.5937, + "mean_token_accuracy": 0.9066252708435059, + "num_tokens": 6725846.0, + "step": 3754 + }, + { + "epoch": 0.6080479313415917, + "grad_norm": 30.57419204711914, + "learning_rate": 3.921632124352332e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.9142877459526062, + "num_tokens": 6727649.0, + "step": 3755 + }, + { + "epoch": 0.608209861549672, + "grad_norm": 32.914615631103516, + "learning_rate": 3.920012953367876e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.9007092118263245, + "num_tokens": 6729443.0, + "step": 3756 + }, + { + "epoch": 0.6083717917577524, + "grad_norm": 26.21811294555664, + "learning_rate": 3.91839378238342e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.9114106595516205, + "num_tokens": 6731226.0, + "step": 3757 + }, + { + "epoch": 0.6085337219658328, + "grad_norm": 23.061723709106445, + "learning_rate": 3.916774611398964e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.9187144339084625, + "num_tokens": 6733021.0, + "step": 3758 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 30.69961166381836, + "learning_rate": 3.915155440414508e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.9183647632598877, + "num_tokens": 6734814.0, + "step": 3759 + }, + { + "epoch": 0.6088575823819934, + "grad_norm": 25.092193603515625, + "learning_rate": 3.913536269430052e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.9091029465198517, + "num_tokens": 6736601.0, + "step": 3760 + }, + { + "epoch": 0.6090195125900737, + "grad_norm": 29.371593475341797, + "learning_rate": 3.911917098445596e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.9028784930706024, + "num_tokens": 6738412.0, + "step": 3761 + }, + { + "epoch": 0.609181442798154, + "grad_norm": 21.791093826293945, + "learning_rate": 3.91029792746114e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.9190140962600708, + "num_tokens": 6740208.0, + "step": 3762 + }, + { + "epoch": 0.6093433730062343, + "grad_norm": 22.272323608398438, + "learning_rate": 3.908678756476684e-06, + "loss": 0.6172, + "mean_token_accuracy": 0.9109416007995605, + "num_tokens": 6741990.0, + "step": 3763 + }, + { + "epoch": 0.6095053032143146, + "grad_norm": 22.989259719848633, + "learning_rate": 3.907059585492228e-06, + "loss": 0.5625, + "mean_token_accuracy": 0.9178784787654877, + "num_tokens": 6743782.0, + "step": 3764 + }, + { + "epoch": 0.6096672334223949, + "grad_norm": 23.15787696838379, + "learning_rate": 3.905440414507773e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.9171754717826843, + "num_tokens": 6745560.0, + "step": 3765 + }, + { + "epoch": 0.6098291636304752, + "grad_norm": 24.93471908569336, + "learning_rate": 3.903821243523316e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.9097852110862732, + "num_tokens": 6747349.0, + "step": 3766 + }, + { + "epoch": 0.6099910938385555, + "grad_norm": 15.27294921875, + "learning_rate": 3.902202072538861e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.9336365461349487, + "num_tokens": 6749147.0, + "step": 3767 + }, + { + "epoch": 0.6101530240466359, + "grad_norm": 19.41769027709961, + "learning_rate": 3.900582901554404e-06, + "loss": 0.5289, + "mean_token_accuracy": 0.9269450306892395, + "num_tokens": 6750933.0, + "step": 3768 + }, + { + "epoch": 0.6103149542547163, + "grad_norm": 17.94277572631836, + "learning_rate": 3.898963730569949e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.9216800332069397, + "num_tokens": 6752713.0, + "step": 3769 + }, + { + "epoch": 0.6104768844627966, + "grad_norm": 20.330320358276367, + "learning_rate": 3.897344559585492e-06, + "loss": 0.5621, + "mean_token_accuracy": 0.9091245830059052, + "num_tokens": 6754500.0, + "step": 3770 + }, + { + "epoch": 0.6106388146708769, + "grad_norm": 31.550535202026367, + "learning_rate": 3.895725388601037e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.9107434451580048, + "num_tokens": 6756292.0, + "step": 3771 + }, + { + "epoch": 0.6108007448789572, + "grad_norm": 29.74262809753418, + "learning_rate": 3.89410621761658e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.9045474529266357, + "num_tokens": 6758075.0, + "step": 3772 + }, + { + "epoch": 0.6109626750870375, + "grad_norm": 18.766494750976562, + "learning_rate": 3.892487046632125e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.9255318939685822, + "num_tokens": 6759869.0, + "step": 3773 + }, + { + "epoch": 0.6111246052951178, + "grad_norm": 30.482812881469727, + "learning_rate": 3.890867875647668e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.9046145975589752, + "num_tokens": 6761662.0, + "step": 3774 + }, + { + "epoch": 0.6112865355031981, + "grad_norm": 36.873958587646484, + "learning_rate": 3.889248704663213e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.889857143163681, + "num_tokens": 6763455.0, + "step": 3775 + }, + { + "epoch": 0.6114484657112784, + "grad_norm": 19.92529296875, + "learning_rate": 3.8876295336787564e-06, + "loss": 0.5501, + "mean_token_accuracy": 0.9272717833518982, + "num_tokens": 6765242.0, + "step": 3776 + }, + { + "epoch": 0.6116103959193587, + "grad_norm": 22.154050827026367, + "learning_rate": 3.886010362694301e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.924217939376831, + "num_tokens": 6767031.0, + "step": 3777 + }, + { + "epoch": 0.611772326127439, + "grad_norm": 22.282154083251953, + "learning_rate": 3.8843911917098445e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.9162260293960571, + "num_tokens": 6768829.0, + "step": 3778 + }, + { + "epoch": 0.6119342563355193, + "grad_norm": 16.399154663085938, + "learning_rate": 3.882772020725389e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.9401785731315613, + "num_tokens": 6770625.0, + "step": 3779 + }, + { + "epoch": 0.6120961865435998, + "grad_norm": 30.0355281829834, + "learning_rate": 3.8811528497409325e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.9181813895702362, + "num_tokens": 6772430.0, + "step": 3780 + }, + { + "epoch": 0.6122581167516801, + "grad_norm": 21.679079055786133, + "learning_rate": 3.879533678756477e-06, + "loss": 0.5718, + "mean_token_accuracy": 0.9293956160545349, + "num_tokens": 6774225.0, + "step": 3781 + }, + { + "epoch": 0.6124200469597604, + "grad_norm": 40.652687072753906, + "learning_rate": 3.8779145077720205e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.8844460844993591, + "num_tokens": 6776023.0, + "step": 3782 + }, + { + "epoch": 0.6125819771678407, + "grad_norm": 13.871591567993164, + "learning_rate": 3.876295336787565e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.9423837065696716, + "num_tokens": 6777830.0, + "step": 3783 + }, + { + "epoch": 0.612743907375921, + "grad_norm": 22.459936141967773, + "learning_rate": 3.874676165803109e-06, + "loss": 0.5932, + "mean_token_accuracy": 0.9188725650310516, + "num_tokens": 6779613.0, + "step": 3784 + }, + { + "epoch": 0.6129058375840013, + "grad_norm": 31.759794235229492, + "learning_rate": 3.873056994818653e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.9177459180355072, + "num_tokens": 6781404.0, + "step": 3785 + }, + { + "epoch": 0.6130677677920816, + "grad_norm": 21.124162673950195, + "learning_rate": 3.871437823834197e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9199119508266449, + "num_tokens": 6783202.0, + "step": 3786 + }, + { + "epoch": 0.6132296980001619, + "grad_norm": 31.745376586914062, + "learning_rate": 3.869818652849741e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.9053639471530914, + "num_tokens": 6784989.0, + "step": 3787 + }, + { + "epoch": 0.6133916282082422, + "grad_norm": 26.238304138183594, + "learning_rate": 3.8681994818652854e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.9134325683116913, + "num_tokens": 6786788.0, + "step": 3788 + }, + { + "epoch": 0.6135535584163225, + "grad_norm": 32.47900390625, + "learning_rate": 3.866580310880829e-06, + "loss": 0.571, + "mean_token_accuracy": 0.9181869029998779, + "num_tokens": 6788581.0, + "step": 3789 + }, + { + "epoch": 0.6137154886244028, + "grad_norm": 23.9670352935791, + "learning_rate": 3.8649611398963735e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.9187581241130829, + "num_tokens": 6790376.0, + "step": 3790 + }, + { + "epoch": 0.6138774188324831, + "grad_norm": 27.3259220123291, + "learning_rate": 3.863341968911917e-06, + "loss": 0.62, + "mean_token_accuracy": 0.9175078868865967, + "num_tokens": 6792166.0, + "step": 3791 + }, + { + "epoch": 0.6140393490405636, + "grad_norm": 21.81146812438965, + "learning_rate": 3.8617227979274615e-06, + "loss": 0.547, + "mean_token_accuracy": 0.9223888218402863, + "num_tokens": 6793961.0, + "step": 3792 + }, + { + "epoch": 0.6142012792486439, + "grad_norm": 23.613964080810547, + "learning_rate": 3.860103626943005e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.9290209114551544, + "num_tokens": 6795755.0, + "step": 3793 + }, + { + "epoch": 0.6143632094567242, + "grad_norm": 25.392786026000977, + "learning_rate": 3.8584844559585495e-06, + "loss": 0.53, + "mean_token_accuracy": 0.9244921207427979, + "num_tokens": 6797545.0, + "step": 3794 + }, + { + "epoch": 0.6145251396648045, + "grad_norm": 32.05488204956055, + "learning_rate": 3.856865284974093e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9196043908596039, + "num_tokens": 6799343.0, + "step": 3795 + }, + { + "epoch": 0.6146870698728848, + "grad_norm": 20.420400619506836, + "learning_rate": 3.8552461139896376e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.925607442855835, + "num_tokens": 6801138.0, + "step": 3796 + }, + { + "epoch": 0.6148490000809651, + "grad_norm": 30.513477325439453, + "learning_rate": 3.853626943005181e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.9052910208702087, + "num_tokens": 6802925.0, + "step": 3797 + }, + { + "epoch": 0.6150109302890454, + "grad_norm": 28.486637115478516, + "learning_rate": 3.852007772020726e-06, + "loss": 0.603, + "mean_token_accuracy": 0.9122854769229889, + "num_tokens": 6804722.0, + "step": 3798 + }, + { + "epoch": 0.6151728604971257, + "grad_norm": 30.109798431396484, + "learning_rate": 3.850388601036269e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.9060952067375183, + "num_tokens": 6806521.0, + "step": 3799 + }, + { + "epoch": 0.615334790705206, + "grad_norm": 18.956100463867188, + "learning_rate": 3.848769430051814e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9273809492588043, + "num_tokens": 6808308.0, + "step": 3800 + }, + { + "epoch": 0.6154967209132863, + "grad_norm": 26.668197631835938, + "learning_rate": 3.847150259067358e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9249196350574493, + "num_tokens": 6810100.0, + "step": 3801 + }, + { + "epoch": 0.6156586511213666, + "grad_norm": 24.940509796142578, + "learning_rate": 3.845531088082902e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.9209931194782257, + "num_tokens": 6811891.0, + "step": 3802 + }, + { + "epoch": 0.6158205813294471, + "grad_norm": 30.22850799560547, + "learning_rate": 3.843911917098446e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.917723149061203, + "num_tokens": 6813684.0, + "step": 3803 + }, + { + "epoch": 0.6159825115375274, + "grad_norm": 27.589496612548828, + "learning_rate": 3.84229274611399e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.9193262457847595, + "num_tokens": 6815481.0, + "step": 3804 + }, + { + "epoch": 0.6161444417456077, + "grad_norm": 27.943086624145508, + "learning_rate": 3.840673575129534e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.9083566069602966, + "num_tokens": 6817266.0, + "step": 3805 + }, + { + "epoch": 0.616306371953688, + "grad_norm": 19.584903717041016, + "learning_rate": 3.839054404145078e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.9224945604801178, + "num_tokens": 6819049.0, + "step": 3806 + }, + { + "epoch": 0.6164683021617683, + "grad_norm": 20.051246643066406, + "learning_rate": 3.837435233160622e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.9391339719295502, + "num_tokens": 6820841.0, + "step": 3807 + }, + { + "epoch": 0.6166302323698486, + "grad_norm": 15.651829719543457, + "learning_rate": 3.835816062176166e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.9247439801692963, + "num_tokens": 6822619.0, + "step": 3808 + }, + { + "epoch": 0.6167921625779289, + "grad_norm": 30.62786865234375, + "learning_rate": 3.83419689119171e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.9024867117404938, + "num_tokens": 6824408.0, + "step": 3809 + }, + { + "epoch": 0.6169540927860092, + "grad_norm": 28.820493698120117, + "learning_rate": 3.832577720207254e-06, + "loss": 0.573, + "mean_token_accuracy": 0.9138931930065155, + "num_tokens": 6826208.0, + "step": 3810 + }, + { + "epoch": 0.6171160229940895, + "grad_norm": 21.29700469970703, + "learning_rate": 3.830958549222798e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.9277743101119995, + "num_tokens": 6827997.0, + "step": 3811 + }, + { + "epoch": 0.6172779532021698, + "grad_norm": 29.305561065673828, + "learning_rate": 3.829339378238342e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.9260841310024261, + "num_tokens": 6829792.0, + "step": 3812 + }, + { + "epoch": 0.6174398834102501, + "grad_norm": 27.338274002075195, + "learning_rate": 3.827720207253886e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.9142078757286072, + "num_tokens": 6831583.0, + "step": 3813 + }, + { + "epoch": 0.6176018136183306, + "grad_norm": 27.450578689575195, + "learning_rate": 3.826101036269431e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.9161375463008881, + "num_tokens": 6833370.0, + "step": 3814 + }, + { + "epoch": 0.6177637438264109, + "grad_norm": 30.09908103942871, + "learning_rate": 3.824481865284975e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.9142303168773651, + "num_tokens": 6835162.0, + "step": 3815 + }, + { + "epoch": 0.6179256740344912, + "grad_norm": 34.357730865478516, + "learning_rate": 3.822862694300519e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.916083574295044, + "num_tokens": 6836959.0, + "step": 3816 + }, + { + "epoch": 0.6180876042425715, + "grad_norm": 27.420150756835938, + "learning_rate": 3.821243523316063e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.8966230750083923, + "num_tokens": 6838742.0, + "step": 3817 + }, + { + "epoch": 0.6182495344506518, + "grad_norm": 23.396629333496094, + "learning_rate": 3.819624352331607e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.9242894947528839, + "num_tokens": 6840532.0, + "step": 3818 + }, + { + "epoch": 0.6184114646587321, + "grad_norm": 26.299522399902344, + "learning_rate": 3.818005181347151e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.9216232597827911, + "num_tokens": 6842325.0, + "step": 3819 + }, + { + "epoch": 0.6185733948668124, + "grad_norm": 27.156272888183594, + "learning_rate": 3.816386010362695e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.903769850730896, + "num_tokens": 6844128.0, + "step": 3820 + }, + { + "epoch": 0.6187353250748927, + "grad_norm": 22.009973526000977, + "learning_rate": 3.814766839378239e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.9231078922748566, + "num_tokens": 6845913.0, + "step": 3821 + }, + { + "epoch": 0.618897255282973, + "grad_norm": 26.79131507873535, + "learning_rate": 3.8131476683937827e-06, + "loss": 0.63, + "mean_token_accuracy": 0.9200210571289062, + "num_tokens": 6847701.0, + "step": 3822 + }, + { + "epoch": 0.6190591854910533, + "grad_norm": 18.714893341064453, + "learning_rate": 3.8115284974093268e-06, + "loss": 0.5246, + "mean_token_accuracy": 0.9271561801433563, + "num_tokens": 6849488.0, + "step": 3823 + }, + { + "epoch": 0.6192211156991336, + "grad_norm": 33.2025260925293, + "learning_rate": 3.809909326424871e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.9107142686843872, + "num_tokens": 6851280.0, + "step": 3824 + }, + { + "epoch": 0.6193830459072139, + "grad_norm": 11.177903175354004, + "learning_rate": 3.808290155440415e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.9370370507240295, + "num_tokens": 6853062.0, + "step": 3825 + }, + { + "epoch": 0.6195449761152944, + "grad_norm": 37.468692779541016, + "learning_rate": 3.8066709844559592e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.9169968664646149, + "num_tokens": 6854851.0, + "step": 3826 + }, + { + "epoch": 0.6197069063233747, + "grad_norm": 24.48048973083496, + "learning_rate": 3.8050518134715032e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.9166666567325592, + "num_tokens": 6856648.0, + "step": 3827 + }, + { + "epoch": 0.619868836531455, + "grad_norm": 50.5008544921875, + "learning_rate": 3.8034326424870472e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.8976364433765411, + "num_tokens": 6858444.0, + "step": 3828 + }, + { + "epoch": 0.6200307667395353, + "grad_norm": 26.1619815826416, + "learning_rate": 3.8018134715025913e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.925452709197998, + "num_tokens": 6860238.0, + "step": 3829 + }, + { + "epoch": 0.6201926969476156, + "grad_norm": 26.874610900878906, + "learning_rate": 3.8001943005181353e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.9128559231758118, + "num_tokens": 6862025.0, + "step": 3830 + }, + { + "epoch": 0.6203546271556959, + "grad_norm": 30.82623863220215, + "learning_rate": 3.7985751295336793e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.9100041687488556, + "num_tokens": 6863814.0, + "step": 3831 + }, + { + "epoch": 0.6205165573637762, + "grad_norm": 26.21592903137207, + "learning_rate": 3.7969559585492233e-06, + "loss": 0.763, + "mean_token_accuracy": 0.9129201769828796, + "num_tokens": 6865602.0, + "step": 3832 + }, + { + "epoch": 0.6206784875718565, + "grad_norm": 23.0000057220459, + "learning_rate": 3.7953367875647673e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9192083179950714, + "num_tokens": 6867386.0, + "step": 3833 + }, + { + "epoch": 0.6208404177799368, + "grad_norm": 28.99321174621582, + "learning_rate": 3.7937176165803113e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.9027210772037506, + "num_tokens": 6869185.0, + "step": 3834 + }, + { + "epoch": 0.6210023479880171, + "grad_norm": 27.852075576782227, + "learning_rate": 3.7920984455958553e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.9103163778781891, + "num_tokens": 6870965.0, + "step": 3835 + }, + { + "epoch": 0.6211642781960974, + "grad_norm": 27.417694091796875, + "learning_rate": 3.7904792746113993e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.918163388967514, + "num_tokens": 6872758.0, + "step": 3836 + }, + { + "epoch": 0.6213262084041778, + "grad_norm": 32.20460510253906, + "learning_rate": 3.7888601036269434e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.9036674499511719, + "num_tokens": 6874539.0, + "step": 3837 + }, + { + "epoch": 0.6214881386122582, + "grad_norm": 25.17333221435547, + "learning_rate": 3.7872409326424874e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.9170294404029846, + "num_tokens": 6876329.0, + "step": 3838 + }, + { + "epoch": 0.6216500688203385, + "grad_norm": 36.569908142089844, + "learning_rate": 3.7856217616580314e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.9045454561710358, + "num_tokens": 6878124.0, + "step": 3839 + }, + { + "epoch": 0.6218119990284188, + "grad_norm": 25.838119506835938, + "learning_rate": 3.7840025906735754e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.9171971678733826, + "num_tokens": 6879937.0, + "step": 3840 + }, + { + "epoch": 0.6219739292364991, + "grad_norm": 38.13341522216797, + "learning_rate": 3.7823834196891194e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.8968901038169861, + "num_tokens": 6881731.0, + "step": 3841 + }, + { + "epoch": 0.6221358594445794, + "grad_norm": 37.35586166381836, + "learning_rate": 3.7807642487046634e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.8850767314434052, + "num_tokens": 6883530.0, + "step": 3842 + }, + { + "epoch": 0.6222977896526597, + "grad_norm": 26.050010681152344, + "learning_rate": 3.779145077720208e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.9114651679992676, + "num_tokens": 6885313.0, + "step": 3843 + }, + { + "epoch": 0.62245971986074, + "grad_norm": 20.554168701171875, + "learning_rate": 3.777525906735752e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.9287814497947693, + "num_tokens": 6887106.0, + "step": 3844 + }, + { + "epoch": 0.6226216500688203, + "grad_norm": 35.419403076171875, + "learning_rate": 3.775906735751296e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.8914404511451721, + "num_tokens": 6888904.0, + "step": 3845 + }, + { + "epoch": 0.6227835802769006, + "grad_norm": 32.76790237426758, + "learning_rate": 3.77428756476684e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.8984722197055817, + "num_tokens": 6890710.0, + "step": 3846 + }, + { + "epoch": 0.6229455104849809, + "grad_norm": 34.707923889160156, + "learning_rate": 3.772668393782384e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.8991894721984863, + "num_tokens": 6892510.0, + "step": 3847 + }, + { + "epoch": 0.6231074406930613, + "grad_norm": 17.059738159179688, + "learning_rate": 3.771049222797928e-06, + "loss": 0.472, + "mean_token_accuracy": 0.9340969324111938, + "num_tokens": 6894310.0, + "step": 3848 + }, + { + "epoch": 0.6232693709011417, + "grad_norm": 24.240140914916992, + "learning_rate": 3.769430051813472e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.9148893356323242, + "num_tokens": 6896104.0, + "step": 3849 + }, + { + "epoch": 0.623431301109222, + "grad_norm": 27.741924285888672, + "learning_rate": 3.767810880829016e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.9082667231559753, + "num_tokens": 6897899.0, + "step": 3850 + }, + { + "epoch": 0.6235932313173023, + "grad_norm": 36.087921142578125, + "learning_rate": 3.76619170984456e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.9043117463588715, + "num_tokens": 6899694.0, + "step": 3851 + }, + { + "epoch": 0.6237551615253826, + "grad_norm": 34.343379974365234, + "learning_rate": 3.764572538860104e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.905139833688736, + "num_tokens": 6901488.0, + "step": 3852 + }, + { + "epoch": 0.6239170917334629, + "grad_norm": 19.885026931762695, + "learning_rate": 3.762953367875648e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.919117659330368, + "num_tokens": 6903272.0, + "step": 3853 + }, + { + "epoch": 0.6240790219415432, + "grad_norm": 37.40237045288086, + "learning_rate": 3.761334196891192e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.885185182094574, + "num_tokens": 6905063.0, + "step": 3854 + }, + { + "epoch": 0.6242409521496235, + "grad_norm": 17.243078231811523, + "learning_rate": 3.759715025906736e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.9273434281349182, + "num_tokens": 6906850.0, + "step": 3855 + }, + { + "epoch": 0.6244028823577038, + "grad_norm": 28.773216247558594, + "learning_rate": 3.75809585492228e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9146616458892822, + "num_tokens": 6908654.0, + "step": 3856 + }, + { + "epoch": 0.6245648125657841, + "grad_norm": 22.9847354888916, + "learning_rate": 3.756476683937824e-06, + "loss": 0.5481, + "mean_token_accuracy": 0.9183006584644318, + "num_tokens": 6910446.0, + "step": 3857 + }, + { + "epoch": 0.6247267427738644, + "grad_norm": 16.167688369750977, + "learning_rate": 3.754857512953368e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9342130422592163, + "num_tokens": 6912247.0, + "step": 3858 + }, + { + "epoch": 0.6248886729819448, + "grad_norm": 19.28745460510254, + "learning_rate": 3.753238341968912e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.9184104800224304, + "num_tokens": 6914041.0, + "step": 3859 + }, + { + "epoch": 0.6250506031900251, + "grad_norm": 21.27528190612793, + "learning_rate": 3.751619170984456e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9240615367889404, + "num_tokens": 6915830.0, + "step": 3860 + }, + { + "epoch": 0.6252125333981055, + "grad_norm": 22.08339500427246, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.9166505932807922, + "num_tokens": 6917630.0, + "step": 3861 + }, + { + "epoch": 0.6253744636061858, + "grad_norm": 28.899700164794922, + "learning_rate": 3.7483808290155445e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.9172877967357635, + "num_tokens": 6919420.0, + "step": 3862 + }, + { + "epoch": 0.6255363938142661, + "grad_norm": 28.444189071655273, + "learning_rate": 3.7467616580310885e-06, + "loss": 0.5929, + "mean_token_accuracy": 0.9182111024856567, + "num_tokens": 6921214.0, + "step": 3863 + }, + { + "epoch": 0.6256983240223464, + "grad_norm": 33.272071838378906, + "learning_rate": 3.7451424870466326e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.9063766002655029, + "num_tokens": 6923014.0, + "step": 3864 + }, + { + "epoch": 0.6258602542304267, + "grad_norm": 33.116668701171875, + "learning_rate": 3.7435233160621766e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.9050492644309998, + "num_tokens": 6924811.0, + "step": 3865 + }, + { + "epoch": 0.626022184438507, + "grad_norm": 20.595605850219727, + "learning_rate": 3.7419041450777206e-06, + "loss": 0.608, + "mean_token_accuracy": 0.9046897888183594, + "num_tokens": 6926595.0, + "step": 3866 + }, + { + "epoch": 0.6261841146465873, + "grad_norm": 24.082242965698242, + "learning_rate": 3.7402849740932646e-06, + "loss": 0.598, + "mean_token_accuracy": 0.916402131319046, + "num_tokens": 6928382.0, + "step": 3867 + }, + { + "epoch": 0.6263460448546676, + "grad_norm": 29.156993865966797, + "learning_rate": 3.7386658031088086e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.9047702252864838, + "num_tokens": 6930177.0, + "step": 3868 + }, + { + "epoch": 0.6265079750627479, + "grad_norm": 29.050241470336914, + "learning_rate": 3.7370466321243526e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.9064671695232391, + "num_tokens": 6931977.0, + "step": 3869 + }, + { + "epoch": 0.6266699052708282, + "grad_norm": 23.61697769165039, + "learning_rate": 3.7354274611398966e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.9177428483963013, + "num_tokens": 6933769.0, + "step": 3870 + }, + { + "epoch": 0.6268318354789086, + "grad_norm": 24.783767700195312, + "learning_rate": 3.7338082901554406e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.9117632210254669, + "num_tokens": 6935564.0, + "step": 3871 + }, + { + "epoch": 0.626993765686989, + "grad_norm": 24.826244354248047, + "learning_rate": 3.7321891191709847e-06, + "loss": 0.591, + "mean_token_accuracy": 0.9235645532608032, + "num_tokens": 6937363.0, + "step": 3872 + }, + { + "epoch": 0.6271556958950693, + "grad_norm": 30.99048614501953, + "learning_rate": 3.7305699481865287e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.9094326794147491, + "num_tokens": 6939151.0, + "step": 3873 + }, + { + "epoch": 0.6273176261031496, + "grad_norm": 23.033340454101562, + "learning_rate": 3.7289507772020727e-06, + "loss": 0.606, + "mean_token_accuracy": 0.9244449734687805, + "num_tokens": 6940954.0, + "step": 3874 + }, + { + "epoch": 0.6274795563112299, + "grad_norm": 27.47634506225586, + "learning_rate": 3.7273316062176167e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.9157631993293762, + "num_tokens": 6942739.0, + "step": 3875 + }, + { + "epoch": 0.6276414865193102, + "grad_norm": 14.449963569641113, + "learning_rate": 3.7257124352331607e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.9285714328289032, + "num_tokens": 6944517.0, + "step": 3876 + }, + { + "epoch": 0.6278034167273905, + "grad_norm": 23.790767669677734, + "learning_rate": 3.7240932642487047e-06, + "loss": 0.5338, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 6946311.0, + "step": 3877 + }, + { + "epoch": 0.6279653469354708, + "grad_norm": 33.56352996826172, + "learning_rate": 3.7224740932642487e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9181795120239258, + "num_tokens": 6948104.0, + "step": 3878 + }, + { + "epoch": 0.6281272771435511, + "grad_norm": 22.858064651489258, + "learning_rate": 3.7208549222797928e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.9177290201187134, + "num_tokens": 6949895.0, + "step": 3879 + }, + { + "epoch": 0.6282892073516314, + "grad_norm": 15.735260963439941, + "learning_rate": 3.719235751295337e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.9262527227401733, + "num_tokens": 6951678.0, + "step": 3880 + }, + { + "epoch": 0.6284511375597117, + "grad_norm": 30.318920135498047, + "learning_rate": 3.717616580310881e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.9026936292648315, + "num_tokens": 6953468.0, + "step": 3881 + }, + { + "epoch": 0.6286130677677921, + "grad_norm": 30.7633056640625, + "learning_rate": 3.7159974093264252e-06, + "loss": 0.6508, + "mean_token_accuracy": 0.9145810008049011, + "num_tokens": 6955261.0, + "step": 3882 + }, + { + "epoch": 0.6287749979758724, + "grad_norm": 17.587066650390625, + "learning_rate": 3.7143782383419692e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9282888174057007, + "num_tokens": 6957052.0, + "step": 3883 + }, + { + "epoch": 0.6289369281839527, + "grad_norm": 31.08521842956543, + "learning_rate": 3.7127590673575132e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.9106450378894806, + "num_tokens": 6958844.0, + "step": 3884 + }, + { + "epoch": 0.6290988583920331, + "grad_norm": 22.40519142150879, + "learning_rate": 3.7111398963730573e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.9297545254230499, + "num_tokens": 6960641.0, + "step": 3885 + }, + { + "epoch": 0.6292607886001134, + "grad_norm": 18.38348388671875, + "learning_rate": 3.7095207253886013e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.9253731369972229, + "num_tokens": 6962421.0, + "step": 3886 + }, + { + "epoch": 0.6294227188081937, + "grad_norm": 35.30379867553711, + "learning_rate": 3.7079015544041453e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.8851491510868073, + "num_tokens": 6964211.0, + "step": 3887 + }, + { + "epoch": 0.629584649016274, + "grad_norm": 24.300065994262695, + "learning_rate": 3.7062823834196893e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.907021552324295, + "num_tokens": 6966013.0, + "step": 3888 + }, + { + "epoch": 0.6297465792243543, + "grad_norm": 31.942567825317383, + "learning_rate": 3.7046632124352333e-06, + "loss": 0.573, + "mean_token_accuracy": 0.9090595841407776, + "num_tokens": 6967800.0, + "step": 3889 + }, + { + "epoch": 0.6299085094324346, + "grad_norm": 34.48097610473633, + "learning_rate": 3.7030440414507773e-06, + "loss": 0.74, + "mean_token_accuracy": 0.9079427421092987, + "num_tokens": 6969601.0, + "step": 3890 + }, + { + "epoch": 0.6300704396405149, + "grad_norm": 29.454008102416992, + "learning_rate": 3.7014248704663213e-06, + "loss": 0.722, + "mean_token_accuracy": 0.9096866250038147, + "num_tokens": 6971391.0, + "step": 3891 + }, + { + "epoch": 0.6302323698485952, + "grad_norm": 25.57354164123535, + "learning_rate": 3.6998056994818653e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.9094942808151245, + "num_tokens": 6973179.0, + "step": 3892 + }, + { + "epoch": 0.6303943000566756, + "grad_norm": 17.95140838623047, + "learning_rate": 3.6981865284974094e-06, + "loss": 0.507, + "mean_token_accuracy": 0.9291283786296844, + "num_tokens": 6974959.0, + "step": 3893 + }, + { + "epoch": 0.6305562302647559, + "grad_norm": 28.256980895996094, + "learning_rate": 3.6965673575129534e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.9074419438838959, + "num_tokens": 6976741.0, + "step": 3894 + }, + { + "epoch": 0.6307181604728362, + "grad_norm": 31.554920196533203, + "learning_rate": 3.6949481865284974e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.8919501602649689, + "num_tokens": 6978529.0, + "step": 3895 + }, + { + "epoch": 0.6308800906809165, + "grad_norm": 26.896007537841797, + "learning_rate": 3.6933290155440414e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9038753807544708, + "num_tokens": 6980322.0, + "step": 3896 + }, + { + "epoch": 0.6310420208889969, + "grad_norm": 26.34307289123535, + "learning_rate": 3.6917098445595854e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.9217216372489929, + "num_tokens": 6982116.0, + "step": 3897 + }, + { + "epoch": 0.6312039510970772, + "grad_norm": 25.917030334472656, + "learning_rate": 3.6900906735751294e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.9157635569572449, + "num_tokens": 6983913.0, + "step": 3898 + }, + { + "epoch": 0.6313658813051575, + "grad_norm": 28.111858367919922, + "learning_rate": 3.688471502590674e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.9156745970249176, + "num_tokens": 6985709.0, + "step": 3899 + }, + { + "epoch": 0.6315278115132378, + "grad_norm": 20.250635147094727, + "learning_rate": 3.686852331606218e-06, + "loss": 0.5391, + "mean_token_accuracy": 0.9263465404510498, + "num_tokens": 6987493.0, + "step": 3900 + }, + { + "epoch": 0.6316897417213181, + "grad_norm": 18.9285945892334, + "learning_rate": 3.685233160621762e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.9077706634998322, + "num_tokens": 6989286.0, + "step": 3901 + }, + { + "epoch": 0.6318516719293984, + "grad_norm": 21.333555221557617, + "learning_rate": 3.683613989637306e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.9204900860786438, + "num_tokens": 6991075.0, + "step": 3902 + }, + { + "epoch": 0.6320136021374787, + "grad_norm": 21.34879493713379, + "learning_rate": 3.68199481865285e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.9295739829540253, + "num_tokens": 6992871.0, + "step": 3903 + }, + { + "epoch": 0.632175532345559, + "grad_norm": 24.16707420349121, + "learning_rate": 3.680375647668394e-06, + "loss": 0.5636, + "mean_token_accuracy": 0.913907915353775, + "num_tokens": 6994673.0, + "step": 3904 + }, + { + "epoch": 0.6323374625536394, + "grad_norm": 19.35509490966797, + "learning_rate": 3.678756476683938e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.9265811145305634, + "num_tokens": 6996458.0, + "step": 3905 + }, + { + "epoch": 0.6324993927617197, + "grad_norm": 29.268035888671875, + "learning_rate": 3.677137305699482e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.9124059975147247, + "num_tokens": 6998243.0, + "step": 3906 + }, + { + "epoch": 0.6326613229698, + "grad_norm": 28.522531509399414, + "learning_rate": 3.675518134715026e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.9235521256923676, + "num_tokens": 7000043.0, + "step": 3907 + }, + { + "epoch": 0.6328232531778804, + "grad_norm": 24.959781646728516, + "learning_rate": 3.67389896373057e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.9117303192615509, + "num_tokens": 7001836.0, + "step": 3908 + }, + { + "epoch": 0.6329851833859607, + "grad_norm": 30.961679458618164, + "learning_rate": 3.672279792746114e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.9039416313171387, + "num_tokens": 7003629.0, + "step": 3909 + }, + { + "epoch": 0.633147113594041, + "grad_norm": 30.942033767700195, + "learning_rate": 3.6706606217616584e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.9170055389404297, + "num_tokens": 7005418.0, + "step": 3910 + }, + { + "epoch": 0.6333090438021213, + "grad_norm": 29.44158935546875, + "learning_rate": 3.6690414507772024e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.9149852097034454, + "num_tokens": 7007224.0, + "step": 3911 + }, + { + "epoch": 0.6334709740102016, + "grad_norm": 19.448265075683594, + "learning_rate": 3.667422279792747e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.9271235466003418, + "num_tokens": 7009024.0, + "step": 3912 + }, + { + "epoch": 0.6336329042182819, + "grad_norm": 26.805383682250977, + "learning_rate": 3.665803108808291e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.9213188886642456, + "num_tokens": 7010818.0, + "step": 3913 + }, + { + "epoch": 0.6337948344263622, + "grad_norm": 27.27880096435547, + "learning_rate": 3.664183937823835e-06, + "loss": 0.6715, + "mean_token_accuracy": 0.9093508422374725, + "num_tokens": 7012605.0, + "step": 3914 + }, + { + "epoch": 0.6339567646344425, + "grad_norm": 34.75452423095703, + "learning_rate": 3.662564766839379e-06, + "loss": 0.796, + "mean_token_accuracy": 0.9056878387928009, + "num_tokens": 7014392.0, + "step": 3915 + }, + { + "epoch": 0.6341186948425229, + "grad_norm": 41.40286636352539, + "learning_rate": 3.660945595854923e-06, + "loss": 0.84, + "mean_token_accuracy": 0.8972837030887604, + "num_tokens": 7016186.0, + "step": 3916 + }, + { + "epoch": 0.6342806250506032, + "grad_norm": 29.32868766784668, + "learning_rate": 3.659326424870467e-06, + "loss": 0.718, + "mean_token_accuracy": 0.9085586667060852, + "num_tokens": 7017982.0, + "step": 3917 + }, + { + "epoch": 0.6344425552586835, + "grad_norm": 18.295459747314453, + "learning_rate": 3.657707253886011e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.9259873926639557, + "num_tokens": 7019763.0, + "step": 3918 + }, + { + "epoch": 0.6346044854667638, + "grad_norm": 30.088726043701172, + "learning_rate": 3.656088082901555e-06, + "loss": 0.563, + "mean_token_accuracy": 0.9142156839370728, + "num_tokens": 7021555.0, + "step": 3919 + }, + { + "epoch": 0.6347664156748442, + "grad_norm": 34.729549407958984, + "learning_rate": 3.654468911917099e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.9172413945198059, + "num_tokens": 7023357.0, + "step": 3920 + }, + { + "epoch": 0.6349283458829245, + "grad_norm": 36.46147537231445, + "learning_rate": 3.652849740932643e-06, + "loss": 0.857, + "mean_token_accuracy": 0.8920139968395233, + "num_tokens": 7025156.0, + "step": 3921 + }, + { + "epoch": 0.6350902760910048, + "grad_norm": 26.019275665283203, + "learning_rate": 3.651230569948187e-06, + "loss": 0.571, + "mean_token_accuracy": 0.9231078922748566, + "num_tokens": 7026941.0, + "step": 3922 + }, + { + "epoch": 0.6352522062990851, + "grad_norm": 15.746135711669922, + "learning_rate": 3.649611398963731e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9258066415786743, + "num_tokens": 7028736.0, + "step": 3923 + }, + { + "epoch": 0.6354141365071654, + "grad_norm": 24.64112091064453, + "learning_rate": 3.647992227979275e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.9069499373435974, + "num_tokens": 7030518.0, + "step": 3924 + }, + { + "epoch": 0.6355760667152457, + "grad_norm": 29.56614112854004, + "learning_rate": 3.646373056994819e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.9186462461948395, + "num_tokens": 7032310.0, + "step": 3925 + }, + { + "epoch": 0.635737996923326, + "grad_norm": 25.114641189575195, + "learning_rate": 3.644753886010363e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.9129549264907837, + "num_tokens": 7034097.0, + "step": 3926 + }, + { + "epoch": 0.6358999271314064, + "grad_norm": 27.614503860473633, + "learning_rate": 3.643134715025907e-06, + "loss": 0.6117, + "mean_token_accuracy": 0.9077828824520111, + "num_tokens": 7035880.0, + "step": 3927 + }, + { + "epoch": 0.6360618573394867, + "grad_norm": 33.71192169189453, + "learning_rate": 3.641515544041451e-06, + "loss": 0.659, + "mean_token_accuracy": 0.9014345109462738, + "num_tokens": 7037677.0, + "step": 3928 + }, + { + "epoch": 0.636223787547567, + "grad_norm": 37.18674850463867, + "learning_rate": 3.639896373056995e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.9086325168609619, + "num_tokens": 7039471.0, + "step": 3929 + }, + { + "epoch": 0.6363857177556473, + "grad_norm": 27.030799865722656, + "learning_rate": 3.638277202072539e-06, + "loss": 0.5729, + "mean_token_accuracy": 0.9227983057498932, + "num_tokens": 7041267.0, + "step": 3930 + }, + { + "epoch": 0.6365476479637276, + "grad_norm": 34.76551055908203, + "learning_rate": 3.6366580310880836e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.9083834290504456, + "num_tokens": 7043052.0, + "step": 3931 + }, + { + "epoch": 0.636709578171808, + "grad_norm": 28.928834915161133, + "learning_rate": 3.6350388601036276e-06, + "loss": 0.5537, + "mean_token_accuracy": 0.9145744144916534, + "num_tokens": 7044844.0, + "step": 3932 + }, + { + "epoch": 0.6368715083798883, + "grad_norm": 26.747303009033203, + "learning_rate": 3.6334196891191716e-06, + "loss": 0.585, + "mean_token_accuracy": 0.9210945665836334, + "num_tokens": 7046635.0, + "step": 3933 + }, + { + "epoch": 0.6370334385879686, + "grad_norm": 27.86298942565918, + "learning_rate": 3.6318005181347156e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.9195342361927032, + "num_tokens": 7048431.0, + "step": 3934 + }, + { + "epoch": 0.6371953687960489, + "grad_norm": 23.24553680419922, + "learning_rate": 3.6301813471502596e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.9113166928291321, + "num_tokens": 7050223.0, + "step": 3935 + }, + { + "epoch": 0.6373572990041292, + "grad_norm": 28.358461380004883, + "learning_rate": 3.6285621761658036e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.910344123840332, + "num_tokens": 7052013.0, + "step": 3936 + }, + { + "epoch": 0.6375192292122095, + "grad_norm": 25.419719696044922, + "learning_rate": 3.6269430051813476e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.9108070731163025, + "num_tokens": 7053794.0, + "step": 3937 + }, + { + "epoch": 0.6376811594202898, + "grad_norm": 33.608699798583984, + "learning_rate": 3.6253238341968916e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.8931034505367279, + "num_tokens": 7055596.0, + "step": 3938 + }, + { + "epoch": 0.6378430896283702, + "grad_norm": 20.32437515258789, + "learning_rate": 3.6237046632124357e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.9347826242446899, + "num_tokens": 7057384.0, + "step": 3939 + }, + { + "epoch": 0.6380050198364505, + "grad_norm": 25.781030654907227, + "learning_rate": 3.6220854922279797e-06, + "loss": 0.5446, + "mean_token_accuracy": 0.9247430562973022, + "num_tokens": 7059175.0, + "step": 3940 + }, + { + "epoch": 0.6381669500445308, + "grad_norm": 33.92966079711914, + "learning_rate": 3.6204663212435237e-06, + "loss": 0.6046, + "mean_token_accuracy": 0.9137355089187622, + "num_tokens": 7060964.0, + "step": 3941 + }, + { + "epoch": 0.6383288802526111, + "grad_norm": 32.363685607910156, + "learning_rate": 3.6188471502590677e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.9139516949653625, + "num_tokens": 7062755.0, + "step": 3942 + }, + { + "epoch": 0.6384908104606914, + "grad_norm": 25.286544799804688, + "learning_rate": 3.6172279792746117e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.9151683747768402, + "num_tokens": 7064538.0, + "step": 3943 + }, + { + "epoch": 0.6386527406687718, + "grad_norm": 24.947399139404297, + "learning_rate": 3.6156088082901557e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.9258343279361725, + "num_tokens": 7066333.0, + "step": 3944 + }, + { + "epoch": 0.6388146708768521, + "grad_norm": 17.55426597595215, + "learning_rate": 3.6139896373056997e-06, + "loss": 0.5526, + "mean_token_accuracy": 0.9315025210380554, + "num_tokens": 7068121.0, + "step": 3945 + }, + { + "epoch": 0.6389766010849324, + "grad_norm": 17.16888999938965, + "learning_rate": 3.6123704663212437e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.93031245470047, + "num_tokens": 7069906.0, + "step": 3946 + }, + { + "epoch": 0.6391385312930127, + "grad_norm": 26.1005859375, + "learning_rate": 3.6107512953367878e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.9173833131790161, + "num_tokens": 7071709.0, + "step": 3947 + }, + { + "epoch": 0.639300461501093, + "grad_norm": 23.8436336517334, + "learning_rate": 3.6091321243523318e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.9242250323295593, + "num_tokens": 7073498.0, + "step": 3948 + }, + { + "epoch": 0.6394623917091733, + "grad_norm": 20.450000762939453, + "learning_rate": 3.6075129533678758e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.926174134016037, + "num_tokens": 7075281.0, + "step": 3949 + }, + { + "epoch": 0.6396243219172537, + "grad_norm": 25.024497985839844, + "learning_rate": 3.6058937823834202e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.9184607565402985, + "num_tokens": 7077075.0, + "step": 3950 + }, + { + "epoch": 0.639786252125334, + "grad_norm": 27.83124542236328, + "learning_rate": 3.6042746113989642e-06, + "loss": 0.6047, + "mean_token_accuracy": 0.9117646813392639, + "num_tokens": 7078859.0, + "step": 3951 + }, + { + "epoch": 0.6399481823334143, + "grad_norm": 20.56722068786621, + "learning_rate": 3.6026554404145082e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.9266431927680969, + "num_tokens": 7080657.0, + "step": 3952 + }, + { + "epoch": 0.6401101125414946, + "grad_norm": 27.27486228942871, + "learning_rate": 3.6010362694300523e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.9147057235240936, + "num_tokens": 7082449.0, + "step": 3953 + }, + { + "epoch": 0.6402720427495749, + "grad_norm": 33.001556396484375, + "learning_rate": 3.5994170984455963e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.9067221879959106, + "num_tokens": 7084240.0, + "step": 3954 + }, + { + "epoch": 0.6404339729576553, + "grad_norm": 25.626216888427734, + "learning_rate": 3.5977979274611403e-06, + "loss": 0.5506, + "mean_token_accuracy": 0.9249537289142609, + "num_tokens": 7086031.0, + "step": 3955 + }, + { + "epoch": 0.6405959031657356, + "grad_norm": 38.346561431884766, + "learning_rate": 3.5961787564766843e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.9123580157756805, + "num_tokens": 7087829.0, + "step": 3956 + }, + { + "epoch": 0.6407578333738159, + "grad_norm": 30.982160568237305, + "learning_rate": 3.5945595854922283e-06, + "loss": 0.67, + "mean_token_accuracy": 0.91215580701828, + "num_tokens": 7089625.0, + "step": 3957 + }, + { + "epoch": 0.6409197635818962, + "grad_norm": 26.56120491027832, + "learning_rate": 3.5929404145077723e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.9053263068199158, + "num_tokens": 7091401.0, + "step": 3958 + }, + { + "epoch": 0.6410816937899765, + "grad_norm": 32.32514190673828, + "learning_rate": 3.5913212435233163e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.9076152145862579, + "num_tokens": 7093184.0, + "step": 3959 + }, + { + "epoch": 0.6412436239980568, + "grad_norm": 20.913759231567383, + "learning_rate": 3.5897020725388604e-06, + "loss": 0.532, + "mean_token_accuracy": 0.9232880473136902, + "num_tokens": 7094970.0, + "step": 3960 + }, + { + "epoch": 0.6414055542061372, + "grad_norm": 29.14267349243164, + "learning_rate": 3.5880829015544044e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.9181873500347137, + "num_tokens": 7096763.0, + "step": 3961 + }, + { + "epoch": 0.6415674844142175, + "grad_norm": 34.89345169067383, + "learning_rate": 3.5864637305699484e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.8952554762363434, + "num_tokens": 7098552.0, + "step": 3962 + }, + { + "epoch": 0.6417294146222978, + "grad_norm": 33.273502349853516, + "learning_rate": 3.5848445595854924e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.9072793126106262, + "num_tokens": 7100334.0, + "step": 3963 + }, + { + "epoch": 0.6418913448303781, + "grad_norm": 17.590497970581055, + "learning_rate": 3.5832253886010364e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.9314852058887482, + "num_tokens": 7102124.0, + "step": 3964 + }, + { + "epoch": 0.6420532750384584, + "grad_norm": 37.113136291503906, + "learning_rate": 3.5816062176165804e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.9016793072223663, + "num_tokens": 7103921.0, + "step": 3965 + }, + { + "epoch": 0.6422152052465387, + "grad_norm": 37.9775505065918, + "learning_rate": 3.5799870466321244e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.8858951032161713, + "num_tokens": 7105713.0, + "step": 3966 + }, + { + "epoch": 0.642377135454619, + "grad_norm": 28.24148178100586, + "learning_rate": 3.5783678756476684e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.920736163854599, + "num_tokens": 7107502.0, + "step": 3967 + }, + { + "epoch": 0.6425390656626994, + "grad_norm": 23.76460838317871, + "learning_rate": 3.5767487046632125e-06, + "loss": 0.5593, + "mean_token_accuracy": 0.9266505837440491, + "num_tokens": 7109287.0, + "step": 3968 + }, + { + "epoch": 0.6427009958707797, + "grad_norm": 27.81460189819336, + "learning_rate": 3.575129533678757e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.9084613025188446, + "num_tokens": 7111072.0, + "step": 3969 + }, + { + "epoch": 0.64286292607886, + "grad_norm": 18.791671752929688, + "learning_rate": 3.573510362694301e-06, + "loss": 0.5925, + "mean_token_accuracy": 0.9250357449054718, + "num_tokens": 7112864.0, + "step": 3970 + }, + { + "epoch": 0.6430248562869403, + "grad_norm": 28.244186401367188, + "learning_rate": 3.571891191709845e-06, + "loss": 0.63, + "mean_token_accuracy": 0.9150694608688354, + "num_tokens": 7114646.0, + "step": 3971 + }, + { + "epoch": 0.6431867864950207, + "grad_norm": 31.945478439331055, + "learning_rate": 3.570272020725389e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.9060380160808563, + "num_tokens": 7116435.0, + "step": 3972 + }, + { + "epoch": 0.643348716703101, + "grad_norm": 24.323877334594727, + "learning_rate": 3.568652849740933e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.9245142340660095, + "num_tokens": 7118226.0, + "step": 3973 + }, + { + "epoch": 0.6435106469111813, + "grad_norm": 12.806122779846191, + "learning_rate": 3.567033678756477e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.9333845376968384, + "num_tokens": 7120008.0, + "step": 3974 + }, + { + "epoch": 0.6436725771192616, + "grad_norm": 24.855051040649414, + "learning_rate": 3.565414507772021e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.9099896550178528, + "num_tokens": 7121798.0, + "step": 3975 + }, + { + "epoch": 0.6438345073273419, + "grad_norm": 30.180490493774414, + "learning_rate": 3.563795336787565e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.906272828578949, + "num_tokens": 7123587.0, + "step": 3976 + }, + { + "epoch": 0.6439964375354222, + "grad_norm": 21.050554275512695, + "learning_rate": 3.562176165803109e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.9244724810123444, + "num_tokens": 7125378.0, + "step": 3977 + }, + { + "epoch": 0.6441583677435025, + "grad_norm": 26.60388946533203, + "learning_rate": 3.560556994818653e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.9151099026203156, + "num_tokens": 7127173.0, + "step": 3978 + }, + { + "epoch": 0.6443202979515829, + "grad_norm": 17.773040771484375, + "learning_rate": 3.558937823834197e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9279643595218658, + "num_tokens": 7128963.0, + "step": 3979 + }, + { + "epoch": 0.6444822281596632, + "grad_norm": 27.30126953125, + "learning_rate": 3.557318652849741e-06, + "loss": 0.5699, + "mean_token_accuracy": 0.9145637154579163, + "num_tokens": 7130756.0, + "step": 3980 + }, + { + "epoch": 0.6446441583677435, + "grad_norm": 24.299997329711914, + "learning_rate": 3.555699481865285e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.9235875904560089, + "num_tokens": 7132543.0, + "step": 3981 + }, + { + "epoch": 0.6448060885758238, + "grad_norm": 36.111351013183594, + "learning_rate": 3.554080310880829e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.9024215638637543, + "num_tokens": 7134340.0, + "step": 3982 + }, + { + "epoch": 0.6449680187839041, + "grad_norm": 41.428157806396484, + "learning_rate": 3.552461139896373e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.8927203118801117, + "num_tokens": 7136141.0, + "step": 3983 + }, + { + "epoch": 0.6451299489919845, + "grad_norm": 23.4683780670166, + "learning_rate": 3.550841968911917e-06, + "loss": 0.594, + "mean_token_accuracy": 0.9193591773509979, + "num_tokens": 7137926.0, + "step": 3984 + }, + { + "epoch": 0.6452918792000648, + "grad_norm": 18.577917098999023, + "learning_rate": 3.549222797927461e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.9229573309421539, + "num_tokens": 7139725.0, + "step": 3985 + }, + { + "epoch": 0.6454538094081451, + "grad_norm": 22.314523696899414, + "learning_rate": 3.547603626943005e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9335517883300781, + "num_tokens": 7141522.0, + "step": 3986 + }, + { + "epoch": 0.6456157396162254, + "grad_norm": 30.128273010253906, + "learning_rate": 3.5459844559585496e-06, + "loss": 0.5723, + "mean_token_accuracy": 0.9081010818481445, + "num_tokens": 7143317.0, + "step": 3987 + }, + { + "epoch": 0.6457776698243057, + "grad_norm": 22.064788818359375, + "learning_rate": 3.5443652849740936e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9269450306892395, + "num_tokens": 7145103.0, + "step": 3988 + }, + { + "epoch": 0.645939600032386, + "grad_norm": 29.914934158325195, + "learning_rate": 3.5427461139896376e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.9183111488819122, + "num_tokens": 7146898.0, + "step": 3989 + }, + { + "epoch": 0.6461015302404663, + "grad_norm": 27.50421714782715, + "learning_rate": 3.5411269430051816e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.9204936921596527, + "num_tokens": 7148687.0, + "step": 3990 + }, + { + "epoch": 0.6462634604485467, + "grad_norm": 29.823270797729492, + "learning_rate": 3.5395077720207256e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.9050143361091614, + "num_tokens": 7150471.0, + "step": 3991 + }, + { + "epoch": 0.646425390656627, + "grad_norm": 29.062028884887695, + "learning_rate": 3.5378886010362696e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.9156323969364166, + "num_tokens": 7152268.0, + "step": 3992 + }, + { + "epoch": 0.6465873208647073, + "grad_norm": 16.323213577270508, + "learning_rate": 3.5362694300518136e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.9307036101818085, + "num_tokens": 7154054.0, + "step": 3993 + }, + { + "epoch": 0.6467492510727876, + "grad_norm": 25.366973876953125, + "learning_rate": 3.5346502590673576e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.9213786423206329, + "num_tokens": 7155845.0, + "step": 3994 + }, + { + "epoch": 0.646911181280868, + "grad_norm": 20.914785385131836, + "learning_rate": 3.5330310880829017e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.9300562739372253, + "num_tokens": 7157643.0, + "step": 3995 + }, + { + "epoch": 0.6470731114889483, + "grad_norm": 30.377864837646484, + "learning_rate": 3.5314119170984457e-06, + "loss": 0.6476, + "mean_token_accuracy": 0.911697119474411, + "num_tokens": 7159438.0, + "step": 3996 + }, + { + "epoch": 0.6472350416970286, + "grad_norm": 35.38301467895508, + "learning_rate": 3.5297927461139897e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.9117632210254669, + "num_tokens": 7161233.0, + "step": 3997 + }, + { + "epoch": 0.6473969719051089, + "grad_norm": 24.891189575195312, + "learning_rate": 3.5281735751295337e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.9224931597709656, + "num_tokens": 7163031.0, + "step": 3998 + }, + { + "epoch": 0.6475589021131892, + "grad_norm": 27.8299560546875, + "learning_rate": 3.5265544041450777e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.9064423739910126, + "num_tokens": 7164820.0, + "step": 3999 + }, + { + "epoch": 0.6477208323212695, + "grad_norm": 33.034236907958984, + "learning_rate": 3.5249352331606217e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.8992805778980255, + "num_tokens": 7166610.0, + "step": 4000 + }, + { + "epoch": 0.6478827625293498, + "grad_norm": 28.056472778320312, + "learning_rate": 3.5233160621761657e-06, + "loss": 0.5608, + "mean_token_accuracy": 0.9202331602573395, + "num_tokens": 7168398.0, + "step": 4001 + }, + { + "epoch": 0.6480446927374302, + "grad_norm": 23.79387092590332, + "learning_rate": 3.5216968911917097e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.9296690225601196, + "num_tokens": 7170195.0, + "step": 4002 + }, + { + "epoch": 0.6482066229455105, + "grad_norm": 29.284326553344727, + "learning_rate": 3.5200777202072538e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.9033879339694977, + "num_tokens": 7171987.0, + "step": 4003 + }, + { + "epoch": 0.6483685531535908, + "grad_norm": 24.550888061523438, + "learning_rate": 3.5184585492227978e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.9292853772640228, + "num_tokens": 7173782.0, + "step": 4004 + }, + { + "epoch": 0.6485304833616711, + "grad_norm": 32.4036979675293, + "learning_rate": 3.5168393782383418e-06, + "loss": 0.654, + "mean_token_accuracy": 0.9061261713504791, + "num_tokens": 7175571.0, + "step": 4005 + }, + { + "epoch": 0.6486924135697515, + "grad_norm": 22.52859115600586, + "learning_rate": 3.5152202072538866e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9306386113166809, + "num_tokens": 7177371.0, + "step": 4006 + }, + { + "epoch": 0.6488543437778318, + "grad_norm": 21.782211303710938, + "learning_rate": 3.5136010362694307e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.9237439632415771, + "num_tokens": 7179159.0, + "step": 4007 + }, + { + "epoch": 0.6490162739859121, + "grad_norm": 25.6978759765625, + "learning_rate": 3.5119818652849747e-06, + "loss": 0.593, + "mean_token_accuracy": 0.9242961406707764, + "num_tokens": 7180948.0, + "step": 4008 + }, + { + "epoch": 0.6491782041939924, + "grad_norm": 22.900150299072266, + "learning_rate": 3.5103626943005187e-06, + "loss": 0.5799, + "mean_token_accuracy": 0.9264069199562073, + "num_tokens": 7182732.0, + "step": 4009 + }, + { + "epoch": 0.6493401344020727, + "grad_norm": 21.268558502197266, + "learning_rate": 3.5087435233160627e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.9253484010696411, + "num_tokens": 7184524.0, + "step": 4010 + }, + { + "epoch": 0.649502064610153, + "grad_norm": 39.48198699951172, + "learning_rate": 3.5071243523316067e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.8874479532241821, + "num_tokens": 7186327.0, + "step": 4011 + }, + { + "epoch": 0.6496639948182333, + "grad_norm": 31.47050666809082, + "learning_rate": 3.5055051813471507e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.9037947356700897, + "num_tokens": 7188118.0, + "step": 4012 + }, + { + "epoch": 0.6498259250263136, + "grad_norm": 27.42321014404297, + "learning_rate": 3.5038860103626947e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.9099584519863129, + "num_tokens": 7189906.0, + "step": 4013 + }, + { + "epoch": 0.649987855234394, + "grad_norm": 28.187917709350586, + "learning_rate": 3.5022668393782388e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.9074857831001282, + "num_tokens": 7191710.0, + "step": 4014 + }, + { + "epoch": 0.6501497854424743, + "grad_norm": 25.513425827026367, + "learning_rate": 3.5006476683937828e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.9145256876945496, + "num_tokens": 7193492.0, + "step": 4015 + }, + { + "epoch": 0.6503117156505546, + "grad_norm": 23.447725296020508, + "learning_rate": 3.4990284974093268e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.9142748713493347, + "num_tokens": 7195285.0, + "step": 4016 + }, + { + "epoch": 0.6504736458586349, + "grad_norm": 36.816287994384766, + "learning_rate": 3.497409326424871e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.9069208204746246, + "num_tokens": 7197087.0, + "step": 4017 + }, + { + "epoch": 0.6506355760667153, + "grad_norm": 30.002910614013672, + "learning_rate": 3.495790155440415e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.9091558456420898, + "num_tokens": 7198874.0, + "step": 4018 + }, + { + "epoch": 0.6507975062747956, + "grad_norm": 23.461652755737305, + "learning_rate": 3.4941709844559592e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.9281461834907532, + "num_tokens": 7200664.0, + "step": 4019 + }, + { + "epoch": 0.6509594364828759, + "grad_norm": 19.912803649902344, + "learning_rate": 3.4925518134715033e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.9288800358772278, + "num_tokens": 7202443.0, + "step": 4020 + }, + { + "epoch": 0.6511213666909562, + "grad_norm": 31.394453048706055, + "learning_rate": 3.4909326424870473e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.9102316498756409, + "num_tokens": 7204243.0, + "step": 4021 + }, + { + "epoch": 0.6512832968990365, + "grad_norm": 24.90923500061035, + "learning_rate": 3.4893134715025913e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.9149594306945801, + "num_tokens": 7206036.0, + "step": 4022 + }, + { + "epoch": 0.6514452271071168, + "grad_norm": 19.383697509765625, + "learning_rate": 3.4876943005181353e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.9255736768245697, + "num_tokens": 7207830.0, + "step": 4023 + }, + { + "epoch": 0.6516071573151971, + "grad_norm": 28.540752410888672, + "learning_rate": 3.4860751295336793e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.9243475496768951, + "num_tokens": 7209620.0, + "step": 4024 + }, + { + "epoch": 0.6517690875232774, + "grad_norm": 19.40833854675293, + "learning_rate": 3.4844559585492233e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.927194744348526, + "num_tokens": 7211407.0, + "step": 4025 + }, + { + "epoch": 0.6519310177313578, + "grad_norm": 32.680145263671875, + "learning_rate": 3.4828367875647673e-06, + "loss": 0.632, + "mean_token_accuracy": 0.9115099608898163, + "num_tokens": 7213190.0, + "step": 4026 + }, + { + "epoch": 0.6520929479394381, + "grad_norm": 38.18278503417969, + "learning_rate": 3.4812176165803113e-06, + "loss": 0.676, + "mean_token_accuracy": 0.9078000783920288, + "num_tokens": 7214974.0, + "step": 4027 + }, + { + "epoch": 0.6522548781475184, + "grad_norm": 33.32498550415039, + "learning_rate": 3.4795984455958554e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.9106754958629608, + "num_tokens": 7216777.0, + "step": 4028 + }, + { + "epoch": 0.6524168083555988, + "grad_norm": 19.744749069213867, + "learning_rate": 3.4779792746113994e-06, + "loss": 0.522, + "mean_token_accuracy": 0.92356076836586, + "num_tokens": 7218563.0, + "step": 4029 + }, + { + "epoch": 0.6525787385636791, + "grad_norm": 26.144132614135742, + "learning_rate": 3.4763601036269434e-06, + "loss": 0.5688, + "mean_token_accuracy": 0.9269892275333405, + "num_tokens": 7220362.0, + "step": 4030 + }, + { + "epoch": 0.6527406687717594, + "grad_norm": 31.8227481842041, + "learning_rate": 3.4747409326424874e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.9090404212474823, + "num_tokens": 7222149.0, + "step": 4031 + }, + { + "epoch": 0.6529025989798397, + "grad_norm": 12.841814994812012, + "learning_rate": 3.4731217616580314e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.9363886117935181, + "num_tokens": 7223944.0, + "step": 4032 + }, + { + "epoch": 0.65306452918792, + "grad_norm": 31.269855499267578, + "learning_rate": 3.4715025906735754e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.914426326751709, + "num_tokens": 7225734.0, + "step": 4033 + }, + { + "epoch": 0.6532264593960003, + "grad_norm": 23.391603469848633, + "learning_rate": 3.4698834196891194e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.9328988194465637, + "num_tokens": 7227529.0, + "step": 4034 + }, + { + "epoch": 0.6533883896040806, + "grad_norm": 37.719146728515625, + "learning_rate": 3.4682642487046635e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.9092244505882263, + "num_tokens": 7229327.0, + "step": 4035 + }, + { + "epoch": 0.6535503198121609, + "grad_norm": 31.31080436706543, + "learning_rate": 3.4666450777202075e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.9121822416782379, + "num_tokens": 7231121.0, + "step": 4036 + }, + { + "epoch": 0.6537122500202412, + "grad_norm": 26.320165634155273, + "learning_rate": 3.4650259067357515e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.9219858348369598, + "num_tokens": 7232915.0, + "step": 4037 + }, + { + "epoch": 0.6538741802283216, + "grad_norm": 31.74606704711914, + "learning_rate": 3.463406735751296e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.917929470539093, + "num_tokens": 7234707.0, + "step": 4038 + }, + { + "epoch": 0.6540361104364019, + "grad_norm": 27.640714645385742, + "learning_rate": 3.46178756476684e-06, + "loss": 0.5901, + "mean_token_accuracy": 0.9202451109886169, + "num_tokens": 7236505.0, + "step": 4039 + }, + { + "epoch": 0.6541980406444823, + "grad_norm": 38.41910934448242, + "learning_rate": 3.460168393782384e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.9064731001853943, + "num_tokens": 7238305.0, + "step": 4040 + }, + { + "epoch": 0.6543599708525626, + "grad_norm": 30.75510597229004, + "learning_rate": 3.458549222797928e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.9129273593425751, + "num_tokens": 7240104.0, + "step": 4041 + }, + { + "epoch": 0.6545219010606429, + "grad_norm": 22.84029769897461, + "learning_rate": 3.456930051813472e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.920981764793396, + "num_tokens": 7241894.0, + "step": 4042 + }, + { + "epoch": 0.6546838312687232, + "grad_norm": 32.36472702026367, + "learning_rate": 3.455310880829016e-06, + "loss": 0.6209, + "mean_token_accuracy": 0.9137873351573944, + "num_tokens": 7243695.0, + "step": 4043 + }, + { + "epoch": 0.6548457614768035, + "grad_norm": 26.274927139282227, + "learning_rate": 3.45369170984456e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.9136690497398376, + "num_tokens": 7245485.0, + "step": 4044 + }, + { + "epoch": 0.6550076916848838, + "grad_norm": 32.575828552246094, + "learning_rate": 3.452072538860104e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.8991447389125824, + "num_tokens": 7247274.0, + "step": 4045 + }, + { + "epoch": 0.6551696218929641, + "grad_norm": 26.599124908447266, + "learning_rate": 3.450453367875648e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9269305467605591, + "num_tokens": 7249073.0, + "step": 4046 + }, + { + "epoch": 0.6553315521010444, + "grad_norm": 23.48540496826172, + "learning_rate": 3.448834196891192e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.92771115899086, + "num_tokens": 7250862.0, + "step": 4047 + }, + { + "epoch": 0.6554934823091247, + "grad_norm": 19.32307243347168, + "learning_rate": 3.447215025906736e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.9309523701667786, + "num_tokens": 7252649.0, + "step": 4048 + }, + { + "epoch": 0.655655412517205, + "grad_norm": 26.650726318359375, + "learning_rate": 3.44559585492228e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.9114089608192444, + "num_tokens": 7254432.0, + "step": 4049 + }, + { + "epoch": 0.6558173427252854, + "grad_norm": 21.824199676513672, + "learning_rate": 3.443976683937824e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.9318532347679138, + "num_tokens": 7256223.0, + "step": 4050 + }, + { + "epoch": 0.6559792729333657, + "grad_norm": 24.318891525268555, + "learning_rate": 3.442357512953368e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.9205301403999329, + "num_tokens": 7258013.0, + "step": 4051 + }, + { + "epoch": 0.6561412031414461, + "grad_norm": 34.56660461425781, + "learning_rate": 3.440738341968912e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.9117429554462433, + "num_tokens": 7259810.0, + "step": 4052 + }, + { + "epoch": 0.6563031333495264, + "grad_norm": 27.054893493652344, + "learning_rate": 3.439119170984456e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.9218875765800476, + "num_tokens": 7261604.0, + "step": 4053 + }, + { + "epoch": 0.6564650635576067, + "grad_norm": 35.610008239746094, + "learning_rate": 3.4375e-06, + "loss": 0.724, + "mean_token_accuracy": 0.9066002666950226, + "num_tokens": 7263405.0, + "step": 4054 + }, + { + "epoch": 0.656626993765687, + "grad_norm": 32.48411178588867, + "learning_rate": 3.435880829015544e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.9046227037906647, + "num_tokens": 7265200.0, + "step": 4055 + }, + { + "epoch": 0.6567889239737673, + "grad_norm": 27.101337432861328, + "learning_rate": 3.434261658031088e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.9063673317432404, + "num_tokens": 7266989.0, + "step": 4056 + }, + { + "epoch": 0.6569508541818476, + "grad_norm": 22.184051513671875, + "learning_rate": 3.4326424870466326e-06, + "loss": 0.5301, + "mean_token_accuracy": 0.9264705777168274, + "num_tokens": 7268773.0, + "step": 4057 + }, + { + "epoch": 0.6571127843899279, + "grad_norm": 21.04471206665039, + "learning_rate": 3.4310233160621766e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.9271039366722107, + "num_tokens": 7270573.0, + "step": 4058 + }, + { + "epoch": 0.6572747145980082, + "grad_norm": 18.429393768310547, + "learning_rate": 3.4294041450777206e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.9205682873725891, + "num_tokens": 7272362.0, + "step": 4059 + }, + { + "epoch": 0.6574366448060885, + "grad_norm": 28.869848251342773, + "learning_rate": 3.4277849740932646e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.9071428775787354, + "num_tokens": 7274154.0, + "step": 4060 + }, + { + "epoch": 0.6575985750141689, + "grad_norm": 21.936565399169922, + "learning_rate": 3.4261658031088086e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.9192001819610596, + "num_tokens": 7275937.0, + "step": 4061 + }, + { + "epoch": 0.6577605052222492, + "grad_norm": 22.220081329345703, + "learning_rate": 3.4245466321243527e-06, + "loss": 0.5425, + "mean_token_accuracy": 0.9248997569084167, + "num_tokens": 7277728.0, + "step": 4062 + }, + { + "epoch": 0.6579224354303296, + "grad_norm": 20.669706344604492, + "learning_rate": 3.4229274611398967e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.9232880473136902, + "num_tokens": 7279514.0, + "step": 4063 + }, + { + "epoch": 0.6580843656384099, + "grad_norm": 24.72484588623047, + "learning_rate": 3.4213082901554407e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.921973466873169, + "num_tokens": 7281295.0, + "step": 4064 + }, + { + "epoch": 0.6582462958464902, + "grad_norm": 35.96552658081055, + "learning_rate": 3.4196891191709847e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.8867307305335999, + "num_tokens": 7283087.0, + "step": 4065 + }, + { + "epoch": 0.6584082260545705, + "grad_norm": 38.397769927978516, + "learning_rate": 3.4180699481865287e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.9015027582645416, + "num_tokens": 7284883.0, + "step": 4066 + }, + { + "epoch": 0.6585701562626508, + "grad_norm": 30.753400802612305, + "learning_rate": 3.4164507772020727e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.9022058844566345, + "num_tokens": 7286671.0, + "step": 4067 + }, + { + "epoch": 0.6587320864707311, + "grad_norm": 29.82662010192871, + "learning_rate": 3.4148316062176167e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.9106434285640717, + "num_tokens": 7288474.0, + "step": 4068 + }, + { + "epoch": 0.6588940166788114, + "grad_norm": 31.837980270385742, + "learning_rate": 3.4132124352331607e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.9119047820568085, + "num_tokens": 7290270.0, + "step": 4069 + }, + { + "epoch": 0.6590559468868917, + "grad_norm": 24.83180046081543, + "learning_rate": 3.4115932642487048e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.9184402525424957, + "num_tokens": 7292063.0, + "step": 4070 + }, + { + "epoch": 0.659217877094972, + "grad_norm": 30.984506607055664, + "learning_rate": 3.4099740932642488e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.9161776304244995, + "num_tokens": 7293861.0, + "step": 4071 + }, + { + "epoch": 0.6593798073030523, + "grad_norm": 19.947614669799805, + "learning_rate": 3.4083549222797928e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.9264546930789948, + "num_tokens": 7295645.0, + "step": 4072 + }, + { + "epoch": 0.6595417375111327, + "grad_norm": 32.25144958496094, + "learning_rate": 3.406735751295337e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.9150349497795105, + "num_tokens": 7297440.0, + "step": 4073 + }, + { + "epoch": 0.6597036677192131, + "grad_norm": 32.69015884399414, + "learning_rate": 3.405116580310881e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.9015957415103912, + "num_tokens": 7299237.0, + "step": 4074 + }, + { + "epoch": 0.6598655979272934, + "grad_norm": 20.569913864135742, + "learning_rate": 3.403497409326425e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 7301030.0, + "step": 4075 + }, + { + "epoch": 0.6600275281353737, + "grad_norm": 21.782926559448242, + "learning_rate": 3.4018782383419693e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.924717366695404, + "num_tokens": 7302821.0, + "step": 4076 + }, + { + "epoch": 0.660189458343454, + "grad_norm": 28.79715347290039, + "learning_rate": 3.4002590673575133e-06, + "loss": 0.624, + "mean_token_accuracy": 0.9102086126804352, + "num_tokens": 7304611.0, + "step": 4077 + }, + { + "epoch": 0.6603513885515343, + "grad_norm": 21.243640899658203, + "learning_rate": 3.3986398963730573e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.9211202263832092, + "num_tokens": 7306402.0, + "step": 4078 + }, + { + "epoch": 0.6605133187596146, + "grad_norm": 19.101961135864258, + "learning_rate": 3.3970207253886013e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.9341780245304108, + "num_tokens": 7308187.0, + "step": 4079 + }, + { + "epoch": 0.6606752489676949, + "grad_norm": 29.71619415283203, + "learning_rate": 3.3954015544041453e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.9175745248794556, + "num_tokens": 7309978.0, + "step": 4080 + }, + { + "epoch": 0.6608371791757752, + "grad_norm": 20.572898864746094, + "learning_rate": 3.3937823834196893e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.930587500333786, + "num_tokens": 7311779.0, + "step": 4081 + }, + { + "epoch": 0.6609991093838555, + "grad_norm": 31.080326080322266, + "learning_rate": 3.3921632124352333e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.9075351357460022, + "num_tokens": 7313571.0, + "step": 4082 + }, + { + "epoch": 0.6611610395919358, + "grad_norm": 30.32560157775879, + "learning_rate": 3.3905440414507773e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.9190140962600708, + "num_tokens": 7315367.0, + "step": 4083 + }, + { + "epoch": 0.6613229698000161, + "grad_norm": 28.505826950073242, + "learning_rate": 3.3889248704663214e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.9087809026241302, + "num_tokens": 7317153.0, + "step": 4084 + }, + { + "epoch": 0.6614849000080966, + "grad_norm": 24.09868812561035, + "learning_rate": 3.3873056994818654e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.9208633005619049, + "num_tokens": 7318943.0, + "step": 4085 + }, + { + "epoch": 0.6616468302161769, + "grad_norm": 13.648755073547363, + "learning_rate": 3.3856865284974094e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.9312936961650848, + "num_tokens": 7320717.0, + "step": 4086 + }, + { + "epoch": 0.6618087604242572, + "grad_norm": 30.157011032104492, + "learning_rate": 3.3840673575129534e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.9094308614730835, + "num_tokens": 7322516.0, + "step": 4087 + }, + { + "epoch": 0.6619706906323375, + "grad_norm": 29.7043514251709, + "learning_rate": 3.3824481865284974e-06, + "loss": 0.6341, + "mean_token_accuracy": 0.9122862815856934, + "num_tokens": 7324314.0, + "step": 4088 + }, + { + "epoch": 0.6621326208404178, + "grad_norm": 34.41802215576172, + "learning_rate": 3.3808290155440414e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.9134818017482758, + "num_tokens": 7326115.0, + "step": 4089 + }, + { + "epoch": 0.6622945510484981, + "grad_norm": 28.995506286621094, + "learning_rate": 3.3792098445595854e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.9114184975624084, + "num_tokens": 7327909.0, + "step": 4090 + }, + { + "epoch": 0.6624564812565784, + "grad_norm": 20.490962982177734, + "learning_rate": 3.3775906735751295e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.9258565604686737, + "num_tokens": 7329704.0, + "step": 4091 + }, + { + "epoch": 0.6626184114646587, + "grad_norm": 25.37592315673828, + "learning_rate": 3.3759715025906735e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.9228707551956177, + "num_tokens": 7331500.0, + "step": 4092 + }, + { + "epoch": 0.662780341672739, + "grad_norm": 19.97695541381836, + "learning_rate": 3.3743523316062175e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.9295739829540253, + "num_tokens": 7333296.0, + "step": 4093 + }, + { + "epoch": 0.6629422718808193, + "grad_norm": 21.26305389404297, + "learning_rate": 3.372733160621762e-06, + "loss": 0.6249, + "mean_token_accuracy": 0.921380490064621, + "num_tokens": 7335075.0, + "step": 4094 + }, + { + "epoch": 0.6631042020888996, + "grad_norm": 37.45973205566406, + "learning_rate": 3.371113989637306e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.9033972918987274, + "num_tokens": 7336878.0, + "step": 4095 + }, + { + "epoch": 0.66326613229698, + "grad_norm": 37.923309326171875, + "learning_rate": 3.36949481865285e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.8914105594158173, + "num_tokens": 7338666.0, + "step": 4096 + }, + { + "epoch": 0.6634280625050604, + "grad_norm": 16.70155143737793, + "learning_rate": 3.367875647668394e-06, + "loss": 0.5574, + "mean_token_accuracy": 0.9338235259056091, + "num_tokens": 7340450.0, + "step": 4097 + }, + { + "epoch": 0.6635899927131407, + "grad_norm": 34.92170333862305, + "learning_rate": 3.366256476683938e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.902599424123764, + "num_tokens": 7342240.0, + "step": 4098 + }, + { + "epoch": 0.663751922921221, + "grad_norm": 21.28485679626465, + "learning_rate": 3.364637305699482e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.9264125525951385, + "num_tokens": 7344024.0, + "step": 4099 + }, + { + "epoch": 0.6639138531293013, + "grad_norm": 31.46330451965332, + "learning_rate": 3.363018134715026e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.9095588028430939, + "num_tokens": 7345812.0, + "step": 4100 + }, + { + "epoch": 0.6640757833373816, + "grad_norm": 20.872543334960938, + "learning_rate": 3.36139896373057e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.935181736946106, + "num_tokens": 7347617.0, + "step": 4101 + }, + { + "epoch": 0.6642377135454619, + "grad_norm": 29.819311141967773, + "learning_rate": 3.359779792746114e-06, + "loss": 0.63, + "mean_token_accuracy": 0.9222372174263, + "num_tokens": 7349412.0, + "step": 4102 + }, + { + "epoch": 0.6643996437535422, + "grad_norm": 22.996164321899414, + "learning_rate": 3.3581606217616585e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.9154656827449799, + "num_tokens": 7351195.0, + "step": 4103 + }, + { + "epoch": 0.6645615739616225, + "grad_norm": 28.97639274597168, + "learning_rate": 3.3565414507772025e-06, + "loss": 0.591, + "mean_token_accuracy": 0.9189356565475464, + "num_tokens": 7352991.0, + "step": 4104 + }, + { + "epoch": 0.6647235041697028, + "grad_norm": 24.251434326171875, + "learning_rate": 3.3549222797927465e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.9193357825279236, + "num_tokens": 7354788.0, + "step": 4105 + }, + { + "epoch": 0.6648854343777831, + "grad_norm": 23.537084579467773, + "learning_rate": 3.3533031088082905e-06, + "loss": 0.5709, + "mean_token_accuracy": 0.9238370954990387, + "num_tokens": 7356577.0, + "step": 4106 + }, + { + "epoch": 0.6650473645858634, + "grad_norm": 21.477855682373047, + "learning_rate": 3.351683937823835e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.9203788638114929, + "num_tokens": 7358353.0, + "step": 4107 + }, + { + "epoch": 0.6652092947939439, + "grad_norm": 32.9747428894043, + "learning_rate": 3.350064766839379e-06, + "loss": 0.5767, + "mean_token_accuracy": 0.9162943363189697, + "num_tokens": 7360152.0, + "step": 4108 + }, + { + "epoch": 0.6653712250020242, + "grad_norm": 20.007253646850586, + "learning_rate": 3.348445595854923e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.9309831857681274, + "num_tokens": 7361941.0, + "step": 4109 + }, + { + "epoch": 0.6655331552101045, + "grad_norm": 23.31011199951172, + "learning_rate": 3.346826424870467e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.9160980880260468, + "num_tokens": 7363727.0, + "step": 4110 + }, + { + "epoch": 0.6656950854181848, + "grad_norm": 26.979835510253906, + "learning_rate": 3.345207253886011e-06, + "loss": 0.551, + "mean_token_accuracy": 0.9145896732807159, + "num_tokens": 7365520.0, + "step": 4111 + }, + { + "epoch": 0.6658570156262651, + "grad_norm": 25.96749496459961, + "learning_rate": 3.343588082901555e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.9119619429111481, + "num_tokens": 7367304.0, + "step": 4112 + }, + { + "epoch": 0.6660189458343454, + "grad_norm": 22.079469680786133, + "learning_rate": 3.341968911917099e-06, + "loss": 0.556, + "mean_token_accuracy": 0.9103453755378723, + "num_tokens": 7369095.0, + "step": 4113 + }, + { + "epoch": 0.6661808760424257, + "grad_norm": 24.190614700317383, + "learning_rate": 3.340349740932643e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9300176203250885, + "num_tokens": 7370893.0, + "step": 4114 + }, + { + "epoch": 0.666342806250506, + "grad_norm": 30.60780906677246, + "learning_rate": 3.338730569948187e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.9051418602466583, + "num_tokens": 7372678.0, + "step": 4115 + }, + { + "epoch": 0.6665047364585863, + "grad_norm": 27.995006561279297, + "learning_rate": 3.337111398963731e-06, + "loss": 0.6169, + "mean_token_accuracy": 0.9225313663482666, + "num_tokens": 7374474.0, + "step": 4116 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 22.01808738708496, + "learning_rate": 3.335492227979275e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9307036101818085, + "num_tokens": 7376260.0, + "step": 4117 + }, + { + "epoch": 0.6668285968747469, + "grad_norm": 38.67413330078125, + "learning_rate": 3.333873056994819e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.9016009867191315, + "num_tokens": 7378057.0, + "step": 4118 + }, + { + "epoch": 0.6669905270828274, + "grad_norm": 21.717626571655273, + "learning_rate": 3.332253886010363e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9299335479736328, + "num_tokens": 7379840.0, + "step": 4119 + }, + { + "epoch": 0.6671524572909077, + "grad_norm": 21.459293365478516, + "learning_rate": 3.330634715025907e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.9265749752521515, + "num_tokens": 7381637.0, + "step": 4120 + }, + { + "epoch": 0.667314387498988, + "grad_norm": 15.305166244506836, + "learning_rate": 3.329015544041451e-06, + "loss": 0.466, + "mean_token_accuracy": 0.9350432753562927, + "num_tokens": 7383426.0, + "step": 4121 + }, + { + "epoch": 0.6674763177070683, + "grad_norm": 37.940284729003906, + "learning_rate": 3.327396373056995e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.8948412537574768, + "num_tokens": 7385222.0, + "step": 4122 + }, + { + "epoch": 0.6676382479151486, + "grad_norm": 40.06706237792969, + "learning_rate": 3.325777202072539e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.8996402621269226, + "num_tokens": 7387023.0, + "step": 4123 + }, + { + "epoch": 0.6678001781232289, + "grad_norm": 30.599830627441406, + "learning_rate": 3.324158031088083e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.9265443980693817, + "num_tokens": 7388823.0, + "step": 4124 + }, + { + "epoch": 0.6679621083313092, + "grad_norm": 35.77900695800781, + "learning_rate": 3.322538860103627e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.900324672460556, + "num_tokens": 7390607.0, + "step": 4125 + }, + { + "epoch": 0.6681240385393895, + "grad_norm": 28.442115783691406, + "learning_rate": 3.3209196891191716e-06, + "loss": 0.5922, + "mean_token_accuracy": 0.9201680421829224, + "num_tokens": 7392395.0, + "step": 4126 + }, + { + "epoch": 0.6682859687474698, + "grad_norm": 25.203189849853516, + "learning_rate": 3.3193005181347156e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.9123993813991547, + "num_tokens": 7394180.0, + "step": 4127 + }, + { + "epoch": 0.6684478989555501, + "grad_norm": 29.367996215820312, + "learning_rate": 3.3176813471502596e-06, + "loss": 0.6078, + "mean_token_accuracy": 0.9122835099697113, + "num_tokens": 7395966.0, + "step": 4128 + }, + { + "epoch": 0.6686098291636304, + "grad_norm": 32.791717529296875, + "learning_rate": 3.3160621761658036e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.901408463716507, + "num_tokens": 7397762.0, + "step": 4129 + }, + { + "epoch": 0.6687717593717107, + "grad_norm": 31.23275375366211, + "learning_rate": 3.3144430051813477e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.9110547602176666, + "num_tokens": 7399555.0, + "step": 4130 + }, + { + "epoch": 0.6689336895797912, + "grad_norm": 18.428125381469727, + "learning_rate": 3.3128238341968917e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.9250216782093048, + "num_tokens": 7401347.0, + "step": 4131 + }, + { + "epoch": 0.6690956197878715, + "grad_norm": 28.220844268798828, + "learning_rate": 3.3112046632124357e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.9276260435581207, + "num_tokens": 7403135.0, + "step": 4132 + }, + { + "epoch": 0.6692575499959518, + "grad_norm": 25.233781814575195, + "learning_rate": 3.3095854922279797e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.9201986491680145, + "num_tokens": 7404938.0, + "step": 4133 + }, + { + "epoch": 0.6694194802040321, + "grad_norm": 25.159303665161133, + "learning_rate": 3.3079663212435237e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.9166133105754852, + "num_tokens": 7406724.0, + "step": 4134 + }, + { + "epoch": 0.6695814104121124, + "grad_norm": 24.508403778076172, + "learning_rate": 3.3063471502590677e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.9208920300006866, + "num_tokens": 7408513.0, + "step": 4135 + }, + { + "epoch": 0.6697433406201927, + "grad_norm": 33.23428726196289, + "learning_rate": 3.3047279792746117e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.9074468016624451, + "num_tokens": 7410306.0, + "step": 4136 + }, + { + "epoch": 0.669905270828273, + "grad_norm": 27.269845962524414, + "learning_rate": 3.3031088082901557e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.9084325432777405, + "num_tokens": 7412102.0, + "step": 4137 + }, + { + "epoch": 0.6700672010363533, + "grad_norm": 31.97282600402832, + "learning_rate": 3.3014896373056998e-06, + "loss": 0.691, + "mean_token_accuracy": 0.9195847511291504, + "num_tokens": 7413899.0, + "step": 4138 + }, + { + "epoch": 0.6702291312444336, + "grad_norm": 26.36349868774414, + "learning_rate": 3.2998704663212438e-06, + "loss": 0.6079, + "mean_token_accuracy": 0.9260977506637573, + "num_tokens": 7415695.0, + "step": 4139 + }, + { + "epoch": 0.6703910614525139, + "grad_norm": 30.884103775024414, + "learning_rate": 3.2982512953367878e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.9151724278926849, + "num_tokens": 7417502.0, + "step": 4140 + }, + { + "epoch": 0.6705529916605942, + "grad_norm": 20.716890335083008, + "learning_rate": 3.296632124352332e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.9231884181499481, + "num_tokens": 7419287.0, + "step": 4141 + }, + { + "epoch": 0.6707149218686747, + "grad_norm": 37.35674285888672, + "learning_rate": 3.295012953367876e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.9035087823867798, + "num_tokens": 7421079.0, + "step": 4142 + }, + { + "epoch": 0.670876852076755, + "grad_norm": 30.0382080078125, + "learning_rate": 3.29339378238342e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.8996212184429169, + "num_tokens": 7422859.0, + "step": 4143 + }, + { + "epoch": 0.6710387822848353, + "grad_norm": 22.871402740478516, + "learning_rate": 3.291774611398964e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.91094771027565, + "num_tokens": 7424651.0, + "step": 4144 + }, + { + "epoch": 0.6712007124929156, + "grad_norm": 31.28093147277832, + "learning_rate": 3.2901554404145083e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.9203781485557556, + "num_tokens": 7426439.0, + "step": 4145 + }, + { + "epoch": 0.6713626427009959, + "grad_norm": 36.13066101074219, + "learning_rate": 3.2885362694300523e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.9022015631198883, + "num_tokens": 7428237.0, + "step": 4146 + }, + { + "epoch": 0.6715245729090762, + "grad_norm": 31.17274284362793, + "learning_rate": 3.2869170984455963e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.9184534549713135, + "num_tokens": 7430033.0, + "step": 4147 + }, + { + "epoch": 0.6716865031171565, + "grad_norm": 34.50510025024414, + "learning_rate": 3.2852979274611403e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.905601978302002, + "num_tokens": 7431820.0, + "step": 4148 + }, + { + "epoch": 0.6718484333252368, + "grad_norm": 29.901945114135742, + "learning_rate": 3.2836787564766843e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.9154095649719238, + "num_tokens": 7433604.0, + "step": 4149 + }, + { + "epoch": 0.6720103635333171, + "grad_norm": 21.766742706298828, + "learning_rate": 3.2820595854922283e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.928205132484436, + "num_tokens": 7435394.0, + "step": 4150 + }, + { + "epoch": 0.6721722937413974, + "grad_norm": 25.516538619995117, + "learning_rate": 3.2804404145077724e-06, + "loss": 0.6397, + "mean_token_accuracy": 0.9198092520236969, + "num_tokens": 7437192.0, + "step": 4151 + }, + { + "epoch": 0.6723342239494777, + "grad_norm": 41.40788650512695, + "learning_rate": 3.2788212435233164e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.8910714387893677, + "num_tokens": 7438980.0, + "step": 4152 + }, + { + "epoch": 0.6724961541575581, + "grad_norm": 28.43753433227539, + "learning_rate": 3.2772020725388604e-06, + "loss": 0.532, + "mean_token_accuracy": 0.9283088147640228, + "num_tokens": 7440772.0, + "step": 4153 + }, + { + "epoch": 0.6726580843656385, + "grad_norm": 25.482227325439453, + "learning_rate": 3.2755829015544044e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.9138143956661224, + "num_tokens": 7442563.0, + "step": 4154 + }, + { + "epoch": 0.6728200145737188, + "grad_norm": 22.197038650512695, + "learning_rate": 3.2739637305699484e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.9184782803058624, + "num_tokens": 7444357.0, + "step": 4155 + }, + { + "epoch": 0.6729819447817991, + "grad_norm": 22.777578353881836, + "learning_rate": 3.2723445595854924e-06, + "loss": 0.546, + "mean_token_accuracy": 0.9239495694637299, + "num_tokens": 7446145.0, + "step": 4156 + }, + { + "epoch": 0.6731438749898794, + "grad_norm": 35.817501068115234, + "learning_rate": 3.2707253886010364e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.9107017517089844, + "num_tokens": 7447940.0, + "step": 4157 + }, + { + "epoch": 0.6733058051979597, + "grad_norm": 28.14497947692871, + "learning_rate": 3.2691062176165804e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.9149396419525146, + "num_tokens": 7449734.0, + "step": 4158 + }, + { + "epoch": 0.67346773540604, + "grad_norm": 20.760400772094727, + "learning_rate": 3.2674870466321245e-06, + "loss": 0.581, + "mean_token_accuracy": 0.9210199117660522, + "num_tokens": 7451512.0, + "step": 4159 + }, + { + "epoch": 0.6736296656141203, + "grad_norm": 31.46258544921875, + "learning_rate": 3.2658678756476685e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.9079329371452332, + "num_tokens": 7453307.0, + "step": 4160 + }, + { + "epoch": 0.6737915958222006, + "grad_norm": 40.968109130859375, + "learning_rate": 3.2642487046632125e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.8833173513412476, + "num_tokens": 7455108.0, + "step": 4161 + }, + { + "epoch": 0.6739535260302809, + "grad_norm": 28.158157348632812, + "learning_rate": 3.2626295336787565e-06, + "loss": 0.5722, + "mean_token_accuracy": 0.9217686951160431, + "num_tokens": 7456914.0, + "step": 4162 + }, + { + "epoch": 0.6741154562383612, + "grad_norm": 24.687936782836914, + "learning_rate": 3.2610103626943005e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.9168752431869507, + "num_tokens": 7458703.0, + "step": 4163 + }, + { + "epoch": 0.6742773864464415, + "grad_norm": 28.493154525756836, + "learning_rate": 3.259391191709845e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.9087591171264648, + "num_tokens": 7460489.0, + "step": 4164 + }, + { + "epoch": 0.674439316654522, + "grad_norm": 25.452533721923828, + "learning_rate": 3.257772020725389e-06, + "loss": 0.573, + "mean_token_accuracy": 0.9306824803352356, + "num_tokens": 7462288.0, + "step": 4165 + }, + { + "epoch": 0.6746012468626023, + "grad_norm": 23.6773681640625, + "learning_rate": 3.256152849740933e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.9147312343120575, + "num_tokens": 7464081.0, + "step": 4166 + }, + { + "epoch": 0.6747631770706826, + "grad_norm": 19.944725036621094, + "learning_rate": 3.254533678756477e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.9264705777168274, + "num_tokens": 7465865.0, + "step": 4167 + }, + { + "epoch": 0.6749251072787629, + "grad_norm": 31.108671188354492, + "learning_rate": 3.252914507772021e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.9095184504985809, + "num_tokens": 7467663.0, + "step": 4168 + }, + { + "epoch": 0.6750870374868432, + "grad_norm": 19.126888275146484, + "learning_rate": 3.251295336787565e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.9296235740184784, + "num_tokens": 7469446.0, + "step": 4169 + }, + { + "epoch": 0.6752489676949235, + "grad_norm": 17.891489028930664, + "learning_rate": 3.249676165803109e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.9244965612888336, + "num_tokens": 7471236.0, + "step": 4170 + }, + { + "epoch": 0.6754108979030038, + "grad_norm": 24.59279441833496, + "learning_rate": 3.248056994818653e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.9174375832080841, + "num_tokens": 7473027.0, + "step": 4171 + }, + { + "epoch": 0.6755728281110841, + "grad_norm": 26.764751434326172, + "learning_rate": 3.246437823834197e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.9064748287200928, + "num_tokens": 7474817.0, + "step": 4172 + }, + { + "epoch": 0.6757347583191644, + "grad_norm": 29.436477661132812, + "learning_rate": 3.244818652849741e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.9207108914852142, + "num_tokens": 7476594.0, + "step": 4173 + }, + { + "epoch": 0.6758966885272447, + "grad_norm": 28.81219482421875, + "learning_rate": 3.243199481865285e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.9222605228424072, + "num_tokens": 7478389.0, + "step": 4174 + }, + { + "epoch": 0.676058618735325, + "grad_norm": 19.002336502075195, + "learning_rate": 3.241580310880829e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.9302941858768463, + "num_tokens": 7480173.0, + "step": 4175 + }, + { + "epoch": 0.6762205489434054, + "grad_norm": 24.259191513061523, + "learning_rate": 3.239961139896373e-06, + "loss": 0.6041, + "mean_token_accuracy": 0.9243930280208588, + "num_tokens": 7481963.0, + "step": 4176 + }, + { + "epoch": 0.6763824791514857, + "grad_norm": 28.416440963745117, + "learning_rate": 3.238341968911917e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.912962943315506, + "num_tokens": 7483750.0, + "step": 4177 + }, + { + "epoch": 0.6765444093595661, + "grad_norm": 27.9654483795166, + "learning_rate": 3.236722797927461e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.9087591171264648, + "num_tokens": 7485536.0, + "step": 4178 + }, + { + "epoch": 0.6767063395676464, + "grad_norm": 42.680240631103516, + "learning_rate": 3.235103626943005e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.9029786884784698, + "num_tokens": 7487338.0, + "step": 4179 + }, + { + "epoch": 0.6768682697757267, + "grad_norm": 26.36026954650879, + "learning_rate": 3.233484455958549e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.9208772778511047, + "num_tokens": 7489139.0, + "step": 4180 + }, + { + "epoch": 0.677030199983807, + "grad_norm": 29.242258071899414, + "learning_rate": 3.231865284974093e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.9154411852359772, + "num_tokens": 7490923.0, + "step": 4181 + }, + { + "epoch": 0.6771921301918873, + "grad_norm": 29.985437393188477, + "learning_rate": 3.230246113989637e-06, + "loss": 0.5793, + "mean_token_accuracy": 0.9109512269496918, + "num_tokens": 7492715.0, + "step": 4182 + }, + { + "epoch": 0.6773540603999676, + "grad_norm": 27.38640022277832, + "learning_rate": 3.2286269430051816e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.918371319770813, + "num_tokens": 7494496.0, + "step": 4183 + }, + { + "epoch": 0.6775159906080479, + "grad_norm": 29.038549423217773, + "learning_rate": 3.2270077720207256e-06, + "loss": 0.597, + "mean_token_accuracy": 0.906521737575531, + "num_tokens": 7496286.0, + "step": 4184 + }, + { + "epoch": 0.6776779208161282, + "grad_norm": 29.188644409179688, + "learning_rate": 3.2253886010362696e-06, + "loss": 0.6243, + "mean_token_accuracy": 0.9112829566001892, + "num_tokens": 7498079.0, + "step": 4185 + }, + { + "epoch": 0.6778398510242085, + "grad_norm": 29.625587463378906, + "learning_rate": 3.2237694300518137e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.9126662909984589, + "num_tokens": 7499867.0, + "step": 4186 + }, + { + "epoch": 0.6780017812322889, + "grad_norm": 21.250776290893555, + "learning_rate": 3.2221502590673577e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.9211202263832092, + "num_tokens": 7501658.0, + "step": 4187 + }, + { + "epoch": 0.6781637114403692, + "grad_norm": 16.9936580657959, + "learning_rate": 3.2205310880829017e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9263710677623749, + "num_tokens": 7503442.0, + "step": 4188 + }, + { + "epoch": 0.6783256416484496, + "grad_norm": 11.794110298156738, + "learning_rate": 3.2189119170984457e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.9274809062480927, + "num_tokens": 7505216.0, + "step": 4189 + }, + { + "epoch": 0.6784875718565299, + "grad_norm": 21.56904411315918, + "learning_rate": 3.2172927461139897e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.9230731725692749, + "num_tokens": 7507014.0, + "step": 4190 + }, + { + "epoch": 0.6786495020646102, + "grad_norm": 25.205673217773438, + "learning_rate": 3.2156735751295337e-06, + "loss": 0.517, + "mean_token_accuracy": 0.9243339002132416, + "num_tokens": 7508815.0, + "step": 4191 + }, + { + "epoch": 0.6788114322726905, + "grad_norm": 21.605981826782227, + "learning_rate": 3.2140544041450777e-06, + "loss": 0.513, + "mean_token_accuracy": 0.9277989864349365, + "num_tokens": 7510602.0, + "step": 4192 + }, + { + "epoch": 0.6789733624807708, + "grad_norm": 12.93326187133789, + "learning_rate": 3.2124352331606218e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.9395328760147095, + "num_tokens": 7512395.0, + "step": 4193 + }, + { + "epoch": 0.6791352926888511, + "grad_norm": 19.103681564331055, + "learning_rate": 3.2108160621761658e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.925775021314621, + "num_tokens": 7514191.0, + "step": 4194 + }, + { + "epoch": 0.6792972228969314, + "grad_norm": 25.725322723388672, + "learning_rate": 3.2091968911917098e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.9288247227668762, + "num_tokens": 7515984.0, + "step": 4195 + }, + { + "epoch": 0.6794591531050117, + "grad_norm": 31.619583129882812, + "learning_rate": 3.207577720207254e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.9084407687187195, + "num_tokens": 7517778.0, + "step": 4196 + }, + { + "epoch": 0.679621083313092, + "grad_norm": 26.227676391601562, + "learning_rate": 3.205958549222798e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.9131537079811096, + "num_tokens": 7519555.0, + "step": 4197 + }, + { + "epoch": 0.6797830135211724, + "grad_norm": 24.51641082763672, + "learning_rate": 3.204339378238342e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.9210607707500458, + "num_tokens": 7521346.0, + "step": 4198 + }, + { + "epoch": 0.6799449437292527, + "grad_norm": 23.914867401123047, + "learning_rate": 3.2027202072538867e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.9178240597248077, + "num_tokens": 7523137.0, + "step": 4199 + }, + { + "epoch": 0.680106873937333, + "grad_norm": 23.75826072692871, + "learning_rate": 3.2011010362694307e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.9243475496768951, + "num_tokens": 7524927.0, + "step": 4200 + }, + { + "epoch": 0.6802688041454134, + "grad_norm": 36.00688934326172, + "learning_rate": 3.1994818652849747e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.9084208011627197, + "num_tokens": 7526722.0, + "step": 4201 + }, + { + "epoch": 0.6804307343534937, + "grad_norm": 27.803722381591797, + "learning_rate": 3.1978626943005187e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.9178959727287292, + "num_tokens": 7528514.0, + "step": 4202 + }, + { + "epoch": 0.680592664561574, + "grad_norm": 28.25230598449707, + "learning_rate": 3.1962435233160627e-06, + "loss": 0.647, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 7530298.0, + "step": 4203 + }, + { + "epoch": 0.6807545947696543, + "grad_norm": 31.916336059570312, + "learning_rate": 3.1946243523316067e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.8989050984382629, + "num_tokens": 7532087.0, + "step": 4204 + }, + { + "epoch": 0.6809165249777346, + "grad_norm": 28.49579620361328, + "learning_rate": 3.1930051813471508e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.8965517282485962, + "num_tokens": 7533889.0, + "step": 4205 + }, + { + "epoch": 0.6810784551858149, + "grad_norm": 13.097058296203613, + "learning_rate": 3.1913860103626948e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.9332089424133301, + "num_tokens": 7535671.0, + "step": 4206 + }, + { + "epoch": 0.6812403853938952, + "grad_norm": 26.408447265625, + "learning_rate": 3.1897668393782388e-06, + "loss": 0.6082, + "mean_token_accuracy": 0.919334203004837, + "num_tokens": 7537468.0, + "step": 4207 + }, + { + "epoch": 0.6814023156019755, + "grad_norm": 33.12641143798828, + "learning_rate": 3.188147668393783e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.9042984545230865, + "num_tokens": 7539252.0, + "step": 4208 + }, + { + "epoch": 0.6815642458100558, + "grad_norm": 21.051509857177734, + "learning_rate": 3.186528497409327e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9237982928752899, + "num_tokens": 7541040.0, + "step": 4209 + }, + { + "epoch": 0.6817261760181362, + "grad_norm": 35.04060363769531, + "learning_rate": 3.184909326424871e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.9052895903587341, + "num_tokens": 7542837.0, + "step": 4210 + }, + { + "epoch": 0.6818881062262165, + "grad_norm": 31.298126220703125, + "learning_rate": 3.183290155440415e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.9166054725646973, + "num_tokens": 7544635.0, + "step": 4211 + }, + { + "epoch": 0.6820500364342968, + "grad_norm": 18.305156707763672, + "learning_rate": 3.181670984455959e-06, + "loss": 0.566, + "mean_token_accuracy": 0.9269837737083435, + "num_tokens": 7546420.0, + "step": 4212 + }, + { + "epoch": 0.6822119666423772, + "grad_norm": 33.741493225097656, + "learning_rate": 3.180051813471503e-06, + "loss": 0.5805, + "mean_token_accuracy": 0.9083735942840576, + "num_tokens": 7548205.0, + "step": 4213 + }, + { + "epoch": 0.6823738968504575, + "grad_norm": 31.62444496154785, + "learning_rate": 3.178432642487047e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.9177764356136322, + "num_tokens": 7549997.0, + "step": 4214 + }, + { + "epoch": 0.6825358270585378, + "grad_norm": 37.6870002746582, + "learning_rate": 3.1768134715025913e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.9097591638565063, + "num_tokens": 7551786.0, + "step": 4215 + }, + { + "epoch": 0.6826977572666181, + "grad_norm": 37.843257904052734, + "learning_rate": 3.1751943005181353e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.8941354155540466, + "num_tokens": 7553580.0, + "step": 4216 + }, + { + "epoch": 0.6828596874746984, + "grad_norm": 38.16155242919922, + "learning_rate": 3.1735751295336793e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.9055489599704742, + "num_tokens": 7555376.0, + "step": 4217 + }, + { + "epoch": 0.6830216176827787, + "grad_norm": 38.955291748046875, + "learning_rate": 3.1719559585492233e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.9054322242736816, + "num_tokens": 7557172.0, + "step": 4218 + }, + { + "epoch": 0.683183547890859, + "grad_norm": 23.4375, + "learning_rate": 3.1703367875647674e-06, + "loss": 0.565, + "mean_token_accuracy": 0.9145896732807159, + "num_tokens": 7558965.0, + "step": 4219 + }, + { + "epoch": 0.6833454780989393, + "grad_norm": 38.129207611083984, + "learning_rate": 3.1687176165803114e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.9103172123432159, + "num_tokens": 7560766.0, + "step": 4220 + }, + { + "epoch": 0.6835074083070197, + "grad_norm": 26.179931640625, + "learning_rate": 3.1670984455958554e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.9207393527030945, + "num_tokens": 7562543.0, + "step": 4221 + }, + { + "epoch": 0.6836693385151, + "grad_norm": 19.7171688079834, + "learning_rate": 3.1654792746113994e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9257525205612183, + "num_tokens": 7564323.0, + "step": 4222 + }, + { + "epoch": 0.6838312687231803, + "grad_norm": 31.97835350036621, + "learning_rate": 3.1638601036269434e-06, + "loss": 0.6042, + "mean_token_accuracy": 0.9165835082530975, + "num_tokens": 7566111.0, + "step": 4223 + }, + { + "epoch": 0.6839931989312606, + "grad_norm": 24.31029510498047, + "learning_rate": 3.1622409326424874e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.9231970608234406, + "num_tokens": 7567898.0, + "step": 4224 + }, + { + "epoch": 0.684155129139341, + "grad_norm": 28.73821258544922, + "learning_rate": 3.1606217616580314e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.9104229211807251, + "num_tokens": 7569700.0, + "step": 4225 + }, + { + "epoch": 0.6843170593474213, + "grad_norm": 27.546123504638672, + "learning_rate": 3.1590025906735755e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.922649472951889, + "num_tokens": 7571496.0, + "step": 4226 + }, + { + "epoch": 0.6844789895555016, + "grad_norm": 26.646446228027344, + "learning_rate": 3.1573834196891195e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.915268212556839, + "num_tokens": 7573291.0, + "step": 4227 + }, + { + "epoch": 0.6846409197635819, + "grad_norm": 25.911571502685547, + "learning_rate": 3.1557642487046635e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.9187424778938293, + "num_tokens": 7575086.0, + "step": 4228 + }, + { + "epoch": 0.6848028499716622, + "grad_norm": 17.31951904296875, + "learning_rate": 3.1541450777202075e-06, + "loss": 0.476, + "mean_token_accuracy": 0.930771678686142, + "num_tokens": 7576873.0, + "step": 4229 + }, + { + "epoch": 0.6849647801797425, + "grad_norm": 27.885652542114258, + "learning_rate": 3.1525259067357515e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.9126984179019928, + "num_tokens": 7578660.0, + "step": 4230 + }, + { + "epoch": 0.6851267103878228, + "grad_norm": 23.07590675354004, + "learning_rate": 3.1509067357512955e-06, + "loss": 0.5202, + "mean_token_accuracy": 0.9251798987388611, + "num_tokens": 7580453.0, + "step": 4231 + }, + { + "epoch": 0.6852886405959032, + "grad_norm": 28.671772003173828, + "learning_rate": 3.1492875647668395e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.9266331791877747, + "num_tokens": 7582253.0, + "step": 4232 + }, + { + "epoch": 0.6854505708039835, + "grad_norm": 24.947860717773438, + "learning_rate": 3.147668393782384e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.9163933396339417, + "num_tokens": 7584040.0, + "step": 4233 + }, + { + "epoch": 0.6856125010120638, + "grad_norm": 37.60129928588867, + "learning_rate": 3.146049222797928e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.9011110067367554, + "num_tokens": 7585825.0, + "step": 4234 + }, + { + "epoch": 0.6857744312201441, + "grad_norm": 23.55774688720703, + "learning_rate": 3.144430051813472e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.932934045791626, + "num_tokens": 7587622.0, + "step": 4235 + }, + { + "epoch": 0.6859363614282245, + "grad_norm": 40.561241149902344, + "learning_rate": 3.142810880829016e-06, + "loss": 0.695, + "mean_token_accuracy": 0.9062924981117249, + "num_tokens": 7589421.0, + "step": 4236 + }, + { + "epoch": 0.6860982916363048, + "grad_norm": 21.919212341308594, + "learning_rate": 3.14119170984456e-06, + "loss": 0.5963, + "mean_token_accuracy": 0.9260774850845337, + "num_tokens": 7591217.0, + "step": 4237 + }, + { + "epoch": 0.6862602218443851, + "grad_norm": 33.35395050048828, + "learning_rate": 3.139572538860104e-06, + "loss": 0.76, + "mean_token_accuracy": 0.8960212171077728, + "num_tokens": 7593017.0, + "step": 4238 + }, + { + "epoch": 0.6864221520524654, + "grad_norm": 25.844959259033203, + "learning_rate": 3.137953367875648e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.9150778949260712, + "num_tokens": 7594800.0, + "step": 4239 + }, + { + "epoch": 0.6865840822605457, + "grad_norm": 28.68640899658203, + "learning_rate": 3.136334196891192e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.9140820801258087, + "num_tokens": 7596589.0, + "step": 4240 + }, + { + "epoch": 0.686746012468626, + "grad_norm": 30.82794189453125, + "learning_rate": 3.134715025906736e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.9208633303642273, + "num_tokens": 7598379.0, + "step": 4241 + }, + { + "epoch": 0.6869079426767063, + "grad_norm": 24.230995178222656, + "learning_rate": 3.13309585492228e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.9140544533729553, + "num_tokens": 7600170.0, + "step": 4242 + }, + { + "epoch": 0.6870698728847866, + "grad_norm": 31.77448844909668, + "learning_rate": 3.131476683937824e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.9148550927639008, + "num_tokens": 7601964.0, + "step": 4243 + }, + { + "epoch": 0.687231803092867, + "grad_norm": 26.769216537475586, + "learning_rate": 3.129857512953368e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.9269722700119019, + "num_tokens": 7603750.0, + "step": 4244 + }, + { + "epoch": 0.6873937333009473, + "grad_norm": 33.159339904785156, + "learning_rate": 3.128238341968912e-06, + "loss": 0.719, + "mean_token_accuracy": 0.8930845856666565, + "num_tokens": 7605546.0, + "step": 4245 + }, + { + "epoch": 0.6875556635090276, + "grad_norm": 26.52994155883789, + "learning_rate": 3.126619170984456e-06, + "loss": 0.529, + "mean_token_accuracy": 0.9239353239536285, + "num_tokens": 7607334.0, + "step": 4246 + }, + { + "epoch": 0.687717593717108, + "grad_norm": 26.5125789642334, + "learning_rate": 3.125e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.9206432700157166, + "num_tokens": 7609122.0, + "step": 4247 + }, + { + "epoch": 0.6878795239251883, + "grad_norm": 33.758140563964844, + "learning_rate": 3.123380829015544e-06, + "loss": 0.6379, + "mean_token_accuracy": 0.9157638251781464, + "num_tokens": 7610919.0, + "step": 4248 + }, + { + "epoch": 0.6880414541332686, + "grad_norm": 31.36656951904297, + "learning_rate": 3.121761658031088e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.9121997356414795, + "num_tokens": 7612715.0, + "step": 4249 + }, + { + "epoch": 0.6882033843413489, + "grad_norm": 24.905214309692383, + "learning_rate": 3.120142487046632e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.9204588532447815, + "num_tokens": 7614518.0, + "step": 4250 + }, + { + "epoch": 0.6883653145494292, + "grad_norm": 29.66878318786621, + "learning_rate": 3.118523316062176e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.9060952067375183, + "num_tokens": 7616317.0, + "step": 4251 + }, + { + "epoch": 0.6885272447575095, + "grad_norm": 15.72887897491455, + "learning_rate": 3.1169041450777206e-06, + "loss": 0.434, + "mean_token_accuracy": 0.9365669786930084, + "num_tokens": 7618113.0, + "step": 4252 + }, + { + "epoch": 0.6886891749655898, + "grad_norm": 31.513444900512695, + "learning_rate": 3.1152849740932647e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.9004052579402924, + "num_tokens": 7619906.0, + "step": 4253 + }, + { + "epoch": 0.6888511051736701, + "grad_norm": 41.84278869628906, + "learning_rate": 3.1136658031088087e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.9062213599681854, + "num_tokens": 7621706.0, + "step": 4254 + }, + { + "epoch": 0.6890130353817505, + "grad_norm": 42.813560485839844, + "learning_rate": 3.1120466321243527e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.8859060406684875, + "num_tokens": 7623516.0, + "step": 4255 + }, + { + "epoch": 0.6891749655898308, + "grad_norm": 31.858644485473633, + "learning_rate": 3.1104274611398967e-06, + "loss": 0.5804, + "mean_token_accuracy": 0.9147057235240936, + "num_tokens": 7625308.0, + "step": 4256 + }, + { + "epoch": 0.6893368957979111, + "grad_norm": 33.77197265625, + "learning_rate": 3.1088082901554407e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.906272828578949, + "num_tokens": 7627097.0, + "step": 4257 + }, + { + "epoch": 0.6894988260059914, + "grad_norm": 28.079593658447266, + "learning_rate": 3.1071891191709847e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.9194001257419586, + "num_tokens": 7628894.0, + "step": 4258 + }, + { + "epoch": 0.6896607562140717, + "grad_norm": 24.356002807617188, + "learning_rate": 3.1055699481865287e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.9141001403331757, + "num_tokens": 7630697.0, + "step": 4259 + }, + { + "epoch": 0.689822686422152, + "grad_norm": 23.325708389282227, + "learning_rate": 3.1039507772020727e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.9172928631305695, + "num_tokens": 7632488.0, + "step": 4260 + }, + { + "epoch": 0.6899846166302324, + "grad_norm": 27.013219833374023, + "learning_rate": 3.1023316062176168e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.9067800939083099, + "num_tokens": 7634279.0, + "step": 4261 + }, + { + "epoch": 0.6901465468383127, + "grad_norm": 28.254985809326172, + "learning_rate": 3.1007124352331608e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.9079623520374298, + "num_tokens": 7636063.0, + "step": 4262 + }, + { + "epoch": 0.690308477046393, + "grad_norm": 32.78112030029297, + "learning_rate": 3.0990932642487048e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.8981858789920807, + "num_tokens": 7637869.0, + "step": 4263 + }, + { + "epoch": 0.6904704072544733, + "grad_norm": 23.27775001525879, + "learning_rate": 3.097474093264249e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.9230295717716217, + "num_tokens": 7639666.0, + "step": 4264 + }, + { + "epoch": 0.6906323374625536, + "grad_norm": 22.289094924926758, + "learning_rate": 3.095854922279793e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.9306526780128479, + "num_tokens": 7641453.0, + "step": 4265 + }, + { + "epoch": 0.690794267670634, + "grad_norm": 24.208086013793945, + "learning_rate": 3.094235751295337e-06, + "loss": 0.5308, + "mean_token_accuracy": 0.9287993907928467, + "num_tokens": 7643246.0, + "step": 4266 + }, + { + "epoch": 0.6909561978787143, + "grad_norm": 30.610450744628906, + "learning_rate": 3.092616580310881e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.9104995429515839, + "num_tokens": 7645037.0, + "step": 4267 + }, + { + "epoch": 0.6911181280867946, + "grad_norm": 37.16124725341797, + "learning_rate": 3.090997409326425e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.8937198221683502, + "num_tokens": 7646831.0, + "step": 4268 + }, + { + "epoch": 0.6912800582948749, + "grad_norm": 21.48700714111328, + "learning_rate": 3.089378238341969e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.930820107460022, + "num_tokens": 7648618.0, + "step": 4269 + }, + { + "epoch": 0.6914419885029552, + "grad_norm": 26.752824783325195, + "learning_rate": 3.087759067357513e-06, + "loss": 0.5357, + "mean_token_accuracy": 0.9242281913757324, + "num_tokens": 7650407.0, + "step": 4270 + }, + { + "epoch": 0.6916039187110355, + "grad_norm": 21.06236457824707, + "learning_rate": 3.0861398963730573e-06, + "loss": 0.6069, + "mean_token_accuracy": 0.9192849397659302, + "num_tokens": 7652204.0, + "step": 4271 + }, + { + "epoch": 0.6917658489191159, + "grad_norm": 18.28443145751953, + "learning_rate": 3.0845207253886013e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.93425053358078, + "num_tokens": 7653990.0, + "step": 4272 + }, + { + "epoch": 0.6919277791271962, + "grad_norm": 19.159950256347656, + "learning_rate": 3.0829015544041453e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.9350432753562927, + "num_tokens": 7655779.0, + "step": 4273 + }, + { + "epoch": 0.6920897093352765, + "grad_norm": 26.919113159179688, + "learning_rate": 3.0812823834196893e-06, + "loss": 0.5996, + "mean_token_accuracy": 0.9255533218383789, + "num_tokens": 7657573.0, + "step": 4274 + }, + { + "epoch": 0.6922516395433568, + "grad_norm": 21.134170532226562, + "learning_rate": 3.0796632124352334e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.9240978360176086, + "num_tokens": 7659362.0, + "step": 4275 + }, + { + "epoch": 0.6924135697514371, + "grad_norm": 21.6868839263916, + "learning_rate": 3.0780440414507774e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.9319303333759308, + "num_tokens": 7661153.0, + "step": 4276 + }, + { + "epoch": 0.6925754999595174, + "grad_norm": 23.270296096801758, + "learning_rate": 3.0764248704663214e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.9315811395645142, + "num_tokens": 7662942.0, + "step": 4277 + }, + { + "epoch": 0.6927374301675978, + "grad_norm": 26.62202262878418, + "learning_rate": 3.0748056994818654e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.9183098375797272, + "num_tokens": 7664736.0, + "step": 4278 + }, + { + "epoch": 0.6928993603756781, + "grad_norm": 42.976409912109375, + "learning_rate": 3.0731865284974094e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.9002669751644135, + "num_tokens": 7666538.0, + "step": 4279 + }, + { + "epoch": 0.6930612905837584, + "grad_norm": 23.27865219116211, + "learning_rate": 3.0715673575129534e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.9337849617004395, + "num_tokens": 7668337.0, + "step": 4280 + }, + { + "epoch": 0.6932232207918387, + "grad_norm": 36.487632751464844, + "learning_rate": 3.0699481865284974e-06, + "loss": 0.6443, + "mean_token_accuracy": 0.9060479700565338, + "num_tokens": 7670126.0, + "step": 4281 + }, + { + "epoch": 0.693385150999919, + "grad_norm": 41.34493637084961, + "learning_rate": 3.0683290155440415e-06, + "loss": 0.776, + "mean_token_accuracy": 0.8948439657688141, + "num_tokens": 7671915.0, + "step": 4282 + }, + { + "epoch": 0.6935470812079993, + "grad_norm": 25.194103240966797, + "learning_rate": 3.0667098445595855e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 7673703.0, + "step": 4283 + }, + { + "epoch": 0.6937090114160797, + "grad_norm": 15.45263385772705, + "learning_rate": 3.0650906735751295e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.9233576655387878, + "num_tokens": 7675489.0, + "step": 4284 + }, + { + "epoch": 0.69387094162416, + "grad_norm": 25.46445083618164, + "learning_rate": 3.0634715025906735e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.9295215606689453, + "num_tokens": 7677285.0, + "step": 4285 + }, + { + "epoch": 0.6940328718322403, + "grad_norm": 18.48753547668457, + "learning_rate": 3.0618523316062175e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9313940107822418, + "num_tokens": 7679074.0, + "step": 4286 + }, + { + "epoch": 0.6941948020403206, + "grad_norm": 23.47724151611328, + "learning_rate": 3.0602331606217615e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.9191176295280457, + "num_tokens": 7680858.0, + "step": 4287 + }, + { + "epoch": 0.6943567322484009, + "grad_norm": 23.56111717224121, + "learning_rate": 3.0586139896373055e-06, + "loss": 0.5676, + "mean_token_accuracy": 0.9166666567325592, + "num_tokens": 7682645.0, + "step": 4288 + }, + { + "epoch": 0.6945186624564813, + "grad_norm": 26.695964813232422, + "learning_rate": 3.0569948186528495e-06, + "loss": 0.637, + "mean_token_accuracy": 0.9152795374393463, + "num_tokens": 7684431.0, + "step": 4289 + }, + { + "epoch": 0.6946805926645616, + "grad_norm": 24.464126586914062, + "learning_rate": 3.055375647668394e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.9179058969020844, + "num_tokens": 7686211.0, + "step": 4290 + }, + { + "epoch": 0.6948425228726419, + "grad_norm": 29.139787673950195, + "learning_rate": 3.053756476683938e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.9081889390945435, + "num_tokens": 7688005.0, + "step": 4291 + }, + { + "epoch": 0.6950044530807222, + "grad_norm": 29.348642349243164, + "learning_rate": 3.052137305699482e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.9227604269981384, + "num_tokens": 7689802.0, + "step": 4292 + }, + { + "epoch": 0.6951663832888025, + "grad_norm": 21.27211570739746, + "learning_rate": 3.050518134715026e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.9242961406707764, + "num_tokens": 7691591.0, + "step": 4293 + }, + { + "epoch": 0.6953283134968828, + "grad_norm": 26.362018585205078, + "learning_rate": 3.04889896373057e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.9265734255313873, + "num_tokens": 7693389.0, + "step": 4294 + }, + { + "epoch": 0.6954902437049632, + "grad_norm": 31.844682693481445, + "learning_rate": 3.047279792746114e-06, + "loss": 0.5831, + "mean_token_accuracy": 0.918739378452301, + "num_tokens": 7695184.0, + "step": 4295 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 21.85762596130371, + "learning_rate": 3.0456606217616585e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.9192603230476379, + "num_tokens": 7696981.0, + "step": 4296 + }, + { + "epoch": 0.6958141041211238, + "grad_norm": 31.3807315826416, + "learning_rate": 3.0440414507772025e-06, + "loss": 0.5828, + "mean_token_accuracy": 0.9163228571414948, + "num_tokens": 7698768.0, + "step": 4297 + }, + { + "epoch": 0.6959760343292041, + "grad_norm": 26.4704532623291, + "learning_rate": 3.0424222797927465e-06, + "loss": 0.5622, + "mean_token_accuracy": 0.9224869906902313, + "num_tokens": 7700563.0, + "step": 4298 + }, + { + "epoch": 0.6961379645372844, + "grad_norm": 32.80046081542969, + "learning_rate": 3.0408031088082905e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.9093822836875916, + "num_tokens": 7702350.0, + "step": 4299 + }, + { + "epoch": 0.6962998947453648, + "grad_norm": 16.963529586791992, + "learning_rate": 3.0391839378238345e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.9406462609767914, + "num_tokens": 7704149.0, + "step": 4300 + }, + { + "epoch": 0.6964618249534451, + "grad_norm": 29.477937698364258, + "learning_rate": 3.0375647668393785e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.918635904788971, + "num_tokens": 7705942.0, + "step": 4301 + }, + { + "epoch": 0.6966237551615254, + "grad_norm": 27.048709869384766, + "learning_rate": 3.0359455958549226e-06, + "loss": 0.641, + "mean_token_accuracy": 0.9206465184688568, + "num_tokens": 7707731.0, + "step": 4302 + }, + { + "epoch": 0.6967856853696057, + "grad_norm": 28.267881393432617, + "learning_rate": 3.034326424870467e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.9135975241661072, + "num_tokens": 7709521.0, + "step": 4303 + }, + { + "epoch": 0.696947615577686, + "grad_norm": 24.507450103759766, + "learning_rate": 3.032707253886011e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.9230769276618958, + "num_tokens": 7711319.0, + "step": 4304 + }, + { + "epoch": 0.6971095457857663, + "grad_norm": 29.955137252807617, + "learning_rate": 3.031088082901555e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9198475182056427, + "num_tokens": 7713118.0, + "step": 4305 + }, + { + "epoch": 0.6972714759938466, + "grad_norm": 18.128271102905273, + "learning_rate": 3.029468911917099e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9311560094356537, + "num_tokens": 7714907.0, + "step": 4306 + }, + { + "epoch": 0.697433406201927, + "grad_norm": 35.9201545715332, + "learning_rate": 3.027849740932643e-06, + "loss": 0.669, + "mean_token_accuracy": 0.9038960933685303, + "num_tokens": 7716691.0, + "step": 4307 + }, + { + "epoch": 0.6975953364100073, + "grad_norm": 24.874080657958984, + "learning_rate": 3.026230569948187e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9295327067375183, + "num_tokens": 7718488.0, + "step": 4308 + }, + { + "epoch": 0.6977572666180876, + "grad_norm": 30.239694595336914, + "learning_rate": 3.024611398963731e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.9156638383865356, + "num_tokens": 7720285.0, + "step": 4309 + }, + { + "epoch": 0.6979191968261679, + "grad_norm": 33.590797424316406, + "learning_rate": 3.022992227979275e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.9171359837055206, + "num_tokens": 7722074.0, + "step": 4310 + }, + { + "epoch": 0.6980811270342483, + "grad_norm": 19.81442642211914, + "learning_rate": 3.021373056994819e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9251798987388611, + "num_tokens": 7723867.0, + "step": 4311 + }, + { + "epoch": 0.6982430572423286, + "grad_norm": 33.890472412109375, + "learning_rate": 3.019753886010363e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.9172417223453522, + "num_tokens": 7725668.0, + "step": 4312 + }, + { + "epoch": 0.6984049874504089, + "grad_norm": 35.70350646972656, + "learning_rate": 3.018134715025907e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.9070101678371429, + "num_tokens": 7727449.0, + "step": 4313 + }, + { + "epoch": 0.6985669176584892, + "grad_norm": 33.863189697265625, + "learning_rate": 3.016515544041451e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.9046953022480011, + "num_tokens": 7729244.0, + "step": 4314 + }, + { + "epoch": 0.6987288478665695, + "grad_norm": 15.445845603942871, + "learning_rate": 3.014896373056995e-06, + "loss": 0.449, + "mean_token_accuracy": 0.9350871741771698, + "num_tokens": 7731034.0, + "step": 4315 + }, + { + "epoch": 0.6988907780746498, + "grad_norm": 29.000606536865234, + "learning_rate": 3.013277202072539e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.9129368960857391, + "num_tokens": 7732823.0, + "step": 4316 + }, + { + "epoch": 0.6990527082827301, + "grad_norm": 29.32222557067871, + "learning_rate": 3.011658031088083e-06, + "loss": 0.6226, + "mean_token_accuracy": 0.9175084233283997, + "num_tokens": 7734602.0, + "step": 4317 + }, + { + "epoch": 0.6992146384908104, + "grad_norm": 22.666032791137695, + "learning_rate": 3.010038860103627e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.9347517788410187, + "num_tokens": 7736405.0, + "step": 4318 + }, + { + "epoch": 0.6993765686988908, + "grad_norm": 30.00943374633789, + "learning_rate": 3.008419689119171e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.9130684435367584, + "num_tokens": 7738194.0, + "step": 4319 + }, + { + "epoch": 0.6995384989069711, + "grad_norm": 28.176820755004883, + "learning_rate": 3.0068005181347152e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9087075293064117, + "num_tokens": 7739990.0, + "step": 4320 + }, + { + "epoch": 0.6997004291150514, + "grad_norm": 25.183198928833008, + "learning_rate": 3.0051813471502592e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.9227814376354218, + "num_tokens": 7741787.0, + "step": 4321 + }, + { + "epoch": 0.6998623593231317, + "grad_norm": 21.352293014526367, + "learning_rate": 3.0035621761658037e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.9190886914730072, + "num_tokens": 7743584.0, + "step": 4322 + }, + { + "epoch": 0.7000242895312121, + "grad_norm": 36.539398193359375, + "learning_rate": 3.0019430051813477e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.9082512557506561, + "num_tokens": 7745381.0, + "step": 4323 + }, + { + "epoch": 0.7001862197392924, + "grad_norm": 28.199247360229492, + "learning_rate": 3.0003238341968917e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.9255030155181885, + "num_tokens": 7747175.0, + "step": 4324 + }, + { + "epoch": 0.7003481499473727, + "grad_norm": 41.741004943847656, + "learning_rate": 2.9987046632124357e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.9109512269496918, + "num_tokens": 7748967.0, + "step": 4325 + }, + { + "epoch": 0.700510080155453, + "grad_norm": 34.35584259033203, + "learning_rate": 2.9970854922279797e-06, + "loss": 0.5946, + "mean_token_accuracy": 0.9097178876399994, + "num_tokens": 7750767.0, + "step": 4326 + }, + { + "epoch": 0.7006720103635333, + "grad_norm": 32.58032989501953, + "learning_rate": 2.9954663212435237e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.9154135286808014, + "num_tokens": 7752552.0, + "step": 4327 + }, + { + "epoch": 0.7008339405716136, + "grad_norm": 27.898983001708984, + "learning_rate": 2.9938471502590677e-06, + "loss": 0.589, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 7754350.0, + "step": 4328 + }, + { + "epoch": 0.7009958707796939, + "grad_norm": 26.859020233154297, + "learning_rate": 2.9922279792746118e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9262315332889557, + "num_tokens": 7756147.0, + "step": 4329 + }, + { + "epoch": 0.7011578009877742, + "grad_norm": 27.141902923583984, + "learning_rate": 2.9906088082901558e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.9248120486736298, + "num_tokens": 7757925.0, + "step": 4330 + }, + { + "epoch": 0.7013197311958546, + "grad_norm": 17.903593063354492, + "learning_rate": 2.9889896373057e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.9269501268863678, + "num_tokens": 7759711.0, + "step": 4331 + }, + { + "epoch": 0.7014816614039349, + "grad_norm": 29.495555877685547, + "learning_rate": 2.987370466321244e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.9200627207756042, + "num_tokens": 7761511.0, + "step": 4332 + }, + { + "epoch": 0.7016435916120152, + "grad_norm": 18.326393127441406, + "learning_rate": 2.985751295336788e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.932802140712738, + "num_tokens": 7763291.0, + "step": 4333 + }, + { + "epoch": 0.7018055218200956, + "grad_norm": 28.416234970092773, + "learning_rate": 2.984132124352332e-06, + "loss": 0.5807, + "mean_token_accuracy": 0.9195520281791687, + "num_tokens": 7765089.0, + "step": 4334 + }, + { + "epoch": 0.7019674520281759, + "grad_norm": 29.08116340637207, + "learning_rate": 2.982512953367876e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.9219858050346375, + "num_tokens": 7766883.0, + "step": 4335 + }, + { + "epoch": 0.7021293822362562, + "grad_norm": 27.274030685424805, + "learning_rate": 2.98089378238342e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.91094771027565, + "num_tokens": 7768675.0, + "step": 4336 + }, + { + "epoch": 0.7022913124443365, + "grad_norm": 32.33250045776367, + "learning_rate": 2.979274611398964e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.8999382853507996, + "num_tokens": 7770466.0, + "step": 4337 + }, + { + "epoch": 0.7024532426524168, + "grad_norm": 21.898845672607422, + "learning_rate": 2.977655440414508e-06, + "loss": 0.554, + "mean_token_accuracy": 0.9254246950149536, + "num_tokens": 7772246.0, + "step": 4338 + }, + { + "epoch": 0.7026151728604971, + "grad_norm": 33.45396423339844, + "learning_rate": 2.976036269430052e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.9080985188484192, + "num_tokens": 7774041.0, + "step": 4339 + }, + { + "epoch": 0.7027771030685774, + "grad_norm": 21.81068229675293, + "learning_rate": 2.9744170984455963e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.9322995841503143, + "num_tokens": 7775834.0, + "step": 4340 + }, + { + "epoch": 0.7029390332766577, + "grad_norm": 21.69127082824707, + "learning_rate": 2.9727979274611403e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.9313203990459442, + "num_tokens": 7777623.0, + "step": 4341 + }, + { + "epoch": 0.703100963484738, + "grad_norm": 29.01365852355957, + "learning_rate": 2.9711787564766844e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.9241737127304077, + "num_tokens": 7779412.0, + "step": 4342 + }, + { + "epoch": 0.7032628936928184, + "grad_norm": 25.276811599731445, + "learning_rate": 2.9695595854922284e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.9357620477676392, + "num_tokens": 7781204.0, + "step": 4343 + }, + { + "epoch": 0.7034248239008987, + "grad_norm": 15.474091529846191, + "learning_rate": 2.9679404145077724e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.93312007188797, + "num_tokens": 7783000.0, + "step": 4344 + }, + { + "epoch": 0.7035867541089791, + "grad_norm": 38.84894561767578, + "learning_rate": 2.9663212435233164e-06, + "loss": 0.5634, + "mean_token_accuracy": 0.915492981672287, + "num_tokens": 7784796.0, + "step": 4345 + }, + { + "epoch": 0.7037486843170594, + "grad_norm": 29.220617294311523, + "learning_rate": 2.9647020725388604e-06, + "loss": 0.527, + "mean_token_accuracy": 0.9233440160751343, + "num_tokens": 7786595.0, + "step": 4346 + }, + { + "epoch": 0.7039106145251397, + "grad_norm": 36.90061950683594, + "learning_rate": 2.9630829015544044e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.9002314805984497, + "num_tokens": 7788386.0, + "step": 4347 + }, + { + "epoch": 0.70407254473322, + "grad_norm": 36.549171447753906, + "learning_rate": 2.9614637305699484e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.8953166306018829, + "num_tokens": 7790185.0, + "step": 4348 + }, + { + "epoch": 0.7042344749413003, + "grad_norm": 21.962106704711914, + "learning_rate": 2.9598445595854924e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.9254679083824158, + "num_tokens": 7791965.0, + "step": 4349 + }, + { + "epoch": 0.7043964051493806, + "grad_norm": 24.022844314575195, + "learning_rate": 2.9582253886010365e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.9324262738227844, + "num_tokens": 7793758.0, + "step": 4350 + }, + { + "epoch": 0.7045583353574609, + "grad_norm": 43.5592155456543, + "learning_rate": 2.9566062176165805e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.9098112881183624, + "num_tokens": 7795547.0, + "step": 4351 + }, + { + "epoch": 0.7047202655655412, + "grad_norm": 38.72248458862305, + "learning_rate": 2.9549870466321245e-06, + "loss": 0.6066, + "mean_token_accuracy": 0.8943609595298767, + "num_tokens": 7797343.0, + "step": 4352 + }, + { + "epoch": 0.7048821957736215, + "grad_norm": 33.18864059448242, + "learning_rate": 2.9533678756476685e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.9085443913936615, + "num_tokens": 7799140.0, + "step": 4353 + }, + { + "epoch": 0.7050441259817019, + "grad_norm": 19.67289924621582, + "learning_rate": 2.9517487046632125e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.9323961734771729, + "num_tokens": 7800933.0, + "step": 4354 + }, + { + "epoch": 0.7052060561897822, + "grad_norm": 18.349987030029297, + "learning_rate": 2.9501295336787565e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.9336122870445251, + "num_tokens": 7802731.0, + "step": 4355 + }, + { + "epoch": 0.7053679863978625, + "grad_norm": 17.120546340942383, + "learning_rate": 2.9485103626943005e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.935251772403717, + "num_tokens": 7804521.0, + "step": 4356 + }, + { + "epoch": 0.7055299166059429, + "grad_norm": 25.03181266784668, + "learning_rate": 2.9468911917098446e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.9258608222007751, + "num_tokens": 7806303.0, + "step": 4357 + }, + { + "epoch": 0.7056918468140232, + "grad_norm": 23.878915786743164, + "learning_rate": 2.9452720207253886e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.9212084114551544, + "num_tokens": 7808095.0, + "step": 4358 + }, + { + "epoch": 0.7058537770221035, + "grad_norm": 19.38775634765625, + "learning_rate": 2.943652849740933e-06, + "loss": 0.483, + "mean_token_accuracy": 0.9377751350402832, + "num_tokens": 7809880.0, + "step": 4359 + }, + { + "epoch": 0.7060157072301838, + "grad_norm": 28.026166915893555, + "learning_rate": 2.942033678756477e-06, + "loss": 0.609, + "mean_token_accuracy": 0.9195478558540344, + "num_tokens": 7811677.0, + "step": 4360 + }, + { + "epoch": 0.7061776374382641, + "grad_norm": 42.87164306640625, + "learning_rate": 2.940414507772021e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.912478119134903, + "num_tokens": 7813476.0, + "step": 4361 + }, + { + "epoch": 0.7063395676463444, + "grad_norm": 37.81462478637695, + "learning_rate": 2.938795336787565e-06, + "loss": 0.6164, + "mean_token_accuracy": 0.9110561013221741, + "num_tokens": 7815268.0, + "step": 4362 + }, + { + "epoch": 0.7065014978544247, + "grad_norm": 39.03481674194336, + "learning_rate": 2.937176165803109e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.916801244020462, + "num_tokens": 7817066.0, + "step": 4363 + }, + { + "epoch": 0.706663428062505, + "grad_norm": 36.72638702392578, + "learning_rate": 2.935556994818653e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.8996493816375732, + "num_tokens": 7818871.0, + "step": 4364 + }, + { + "epoch": 0.7068253582705853, + "grad_norm": 33.811092376708984, + "learning_rate": 2.933937823834197e-06, + "loss": 0.6034, + "mean_token_accuracy": 0.9201717376708984, + "num_tokens": 7820671.0, + "step": 4365 + }, + { + "epoch": 0.7069872884786657, + "grad_norm": 37.00471115112305, + "learning_rate": 2.932318652849741e-06, + "loss": 0.644, + "mean_token_accuracy": 0.905694991350174, + "num_tokens": 7822471.0, + "step": 4366 + }, + { + "epoch": 0.707149218686746, + "grad_norm": 28.73087501525879, + "learning_rate": 2.930699481865285e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9208633303642273, + "num_tokens": 7824261.0, + "step": 4367 + }, + { + "epoch": 0.7073111488948264, + "grad_norm": 25.49989891052246, + "learning_rate": 2.929080310880829e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9233683049678802, + "num_tokens": 7826048.0, + "step": 4368 + }, + { + "epoch": 0.7074730791029067, + "grad_norm": 24.82110595703125, + "learning_rate": 2.927461139896373e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.9296235740184784, + "num_tokens": 7827844.0, + "step": 4369 + }, + { + "epoch": 0.707635009310987, + "grad_norm": 31.496789932250977, + "learning_rate": 2.925841968911917e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.9057525396347046, + "num_tokens": 7829632.0, + "step": 4370 + }, + { + "epoch": 0.7077969395190673, + "grad_norm": 34.46229934692383, + "learning_rate": 2.924222797927461e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.918371707201004, + "num_tokens": 7831435.0, + "step": 4371 + }, + { + "epoch": 0.7079588697271476, + "grad_norm": 29.72195816040039, + "learning_rate": 2.922603626943005e-06, + "loss": 0.5293, + "mean_token_accuracy": 0.9192849397659302, + "num_tokens": 7833232.0, + "step": 4372 + }, + { + "epoch": 0.7081207999352279, + "grad_norm": 30.362388610839844, + "learning_rate": 2.920984455958549e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.9190140962600708, + "num_tokens": 7835028.0, + "step": 4373 + }, + { + "epoch": 0.7082827301433082, + "grad_norm": 30.46919059753418, + "learning_rate": 2.919365284974093e-06, + "loss": 0.71, + "mean_token_accuracy": 0.9212526381015778, + "num_tokens": 7836832.0, + "step": 4374 + }, + { + "epoch": 0.7084446603513885, + "grad_norm": 27.92829132080078, + "learning_rate": 2.917746113989637e-06, + "loss": 0.5855, + "mean_token_accuracy": 0.9133029878139496, + "num_tokens": 7838633.0, + "step": 4375 + }, + { + "epoch": 0.7086065905594688, + "grad_norm": 38.85553741455078, + "learning_rate": 2.9161269430051812e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.907444030046463, + "num_tokens": 7840415.0, + "step": 4376 + }, + { + "epoch": 0.7087685207675491, + "grad_norm": 35.91222381591797, + "learning_rate": 2.9145077720207252e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.9150060415267944, + "num_tokens": 7842209.0, + "step": 4377 + }, + { + "epoch": 0.7089304509756295, + "grad_norm": 33.09619140625, + "learning_rate": 2.9128886010362697e-06, + "loss": 0.837, + "mean_token_accuracy": 0.8928003907203674, + "num_tokens": 7844014.0, + "step": 4378 + }, + { + "epoch": 0.7090923811837099, + "grad_norm": 35.421180725097656, + "learning_rate": 2.9112694300518137e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.913329154253006, + "num_tokens": 7845803.0, + "step": 4379 + }, + { + "epoch": 0.7092543113917902, + "grad_norm": 26.26021957397461, + "learning_rate": 2.9096502590673577e-06, + "loss": 0.538, + "mean_token_accuracy": 0.9158168733119965, + "num_tokens": 7847588.0, + "step": 4380 + }, + { + "epoch": 0.7094162415998705, + "grad_norm": 26.682254791259766, + "learning_rate": 2.9080310880829017e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.9165966212749481, + "num_tokens": 7849376.0, + "step": 4381 + }, + { + "epoch": 0.7095781718079508, + "grad_norm": 31.46900749206543, + "learning_rate": 2.9064119170984457e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.9051958322525024, + "num_tokens": 7851162.0, + "step": 4382 + }, + { + "epoch": 0.7097401020160311, + "grad_norm": 23.13848114013672, + "learning_rate": 2.9047927461139897e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.9225352108478546, + "num_tokens": 7852958.0, + "step": 4383 + }, + { + "epoch": 0.7099020322241114, + "grad_norm": 31.579944610595703, + "learning_rate": 2.9031735751295338e-06, + "loss": 0.5524, + "mean_token_accuracy": 0.9226190447807312, + "num_tokens": 7854754.0, + "step": 4384 + }, + { + "epoch": 0.7100639624321917, + "grad_norm": 40.66126251220703, + "learning_rate": 2.9015544041450778e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.9120703339576721, + "num_tokens": 7856558.0, + "step": 4385 + }, + { + "epoch": 0.710225892640272, + "grad_norm": 31.179441452026367, + "learning_rate": 2.8999352331606218e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.9107279777526855, + "num_tokens": 7858350.0, + "step": 4386 + }, + { + "epoch": 0.7103878228483523, + "grad_norm": 31.148160934448242, + "learning_rate": 2.898316062176166e-06, + "loss": 0.5199, + "mean_token_accuracy": 0.9174720048904419, + "num_tokens": 7860141.0, + "step": 4387 + }, + { + "epoch": 0.7105497530564326, + "grad_norm": 25.27857208251953, + "learning_rate": 2.89669689119171e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.9199904799461365, + "num_tokens": 7861928.0, + "step": 4388 + }, + { + "epoch": 0.710711683264513, + "grad_norm": 18.702856063842773, + "learning_rate": 2.895077720207254e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.9325181245803833, + "num_tokens": 7863722.0, + "step": 4389 + }, + { + "epoch": 0.7108736134725933, + "grad_norm": 30.728200912475586, + "learning_rate": 2.893458549222798e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.9195520281791687, + "num_tokens": 7865520.0, + "step": 4390 + }, + { + "epoch": 0.7110355436806737, + "grad_norm": 26.364105224609375, + "learning_rate": 2.891839378238342e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.920550525188446, + "num_tokens": 7867309.0, + "step": 4391 + }, + { + "epoch": 0.711197473888754, + "grad_norm": 27.728567123413086, + "learning_rate": 2.8902202072538867e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.9181869029998779, + "num_tokens": 7869102.0, + "step": 4392 + }, + { + "epoch": 0.7113594040968343, + "grad_norm": 27.701045989990234, + "learning_rate": 2.8886010362694307e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.9184782803058624, + "num_tokens": 7870884.0, + "step": 4393 + }, + { + "epoch": 0.7115213343049146, + "grad_norm": 30.105783462524414, + "learning_rate": 2.8869818652849747e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.9230892956256866, + "num_tokens": 7872669.0, + "step": 4394 + }, + { + "epoch": 0.7116832645129949, + "grad_norm": 15.815881729125977, + "learning_rate": 2.8853626943005187e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.9305888414382935, + "num_tokens": 7874455.0, + "step": 4395 + }, + { + "epoch": 0.7118451947210752, + "grad_norm": 25.80306625366211, + "learning_rate": 2.8837435233160628e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.9146499931812286, + "num_tokens": 7876239.0, + "step": 4396 + }, + { + "epoch": 0.7120071249291555, + "grad_norm": 25.850324630737305, + "learning_rate": 2.8821243523316068e-06, + "loss": 0.5609, + "mean_token_accuracy": 0.921527773141861, + "num_tokens": 7878030.0, + "step": 4397 + }, + { + "epoch": 0.7121690551372358, + "grad_norm": 18.85869598388672, + "learning_rate": 2.8805051813471508e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.9351348578929901, + "num_tokens": 7879820.0, + "step": 4398 + }, + { + "epoch": 0.7123309853453161, + "grad_norm": 35.36299514770508, + "learning_rate": 2.878886010362695e-06, + "loss": 0.852, + "mean_token_accuracy": 0.8908450603485107, + "num_tokens": 7881616.0, + "step": 4399 + }, + { + "epoch": 0.7124929155533964, + "grad_norm": 25.404550552368164, + "learning_rate": 2.877266839378239e-06, + "loss": 0.582, + "mean_token_accuracy": 0.9257492423057556, + "num_tokens": 7883411.0, + "step": 4400 + }, + { + "epoch": 0.7126548457614768, + "grad_norm": 27.899307250976562, + "learning_rate": 2.875647668393783e-06, + "loss": 0.5715, + "mean_token_accuracy": 0.9274753034114838, + "num_tokens": 7885199.0, + "step": 4401 + }, + { + "epoch": 0.7128167759695572, + "grad_norm": 39.53618621826172, + "learning_rate": 2.874028497409327e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.8980299532413483, + "num_tokens": 7886987.0, + "step": 4402 + }, + { + "epoch": 0.7129787061776375, + "grad_norm": 26.686351776123047, + "learning_rate": 2.872409326424871e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.9277743101119995, + "num_tokens": 7888776.0, + "step": 4403 + }, + { + "epoch": 0.7131406363857178, + "grad_norm": 24.15049934387207, + "learning_rate": 2.870790155440415e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.9253731369972229, + "num_tokens": 7890556.0, + "step": 4404 + }, + { + "epoch": 0.7133025665937981, + "grad_norm": 31.79214096069336, + "learning_rate": 2.869170984455959e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.919584333896637, + "num_tokens": 7892342.0, + "step": 4405 + }, + { + "epoch": 0.7134644968018784, + "grad_norm": 24.539613723754883, + "learning_rate": 2.867551813471503e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.9186519980430603, + "num_tokens": 7894138.0, + "step": 4406 + }, + { + "epoch": 0.7136264270099587, + "grad_norm": 29.856191635131836, + "learning_rate": 2.865932642487047e-06, + "loss": 0.5708, + "mean_token_accuracy": 0.9257739782333374, + "num_tokens": 7895932.0, + "step": 4407 + }, + { + "epoch": 0.713788357218039, + "grad_norm": 23.091562271118164, + "learning_rate": 2.864313471502591e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.9288030862808228, + "num_tokens": 7897725.0, + "step": 4408 + }, + { + "epoch": 0.7139502874261193, + "grad_norm": 32.099342346191406, + "learning_rate": 2.862694300518135e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.9060677886009216, + "num_tokens": 7899513.0, + "step": 4409 + }, + { + "epoch": 0.7141122176341996, + "grad_norm": 31.167266845703125, + "learning_rate": 2.8610751295336794e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.9065865576267242, + "num_tokens": 7901303.0, + "step": 4410 + }, + { + "epoch": 0.7142741478422799, + "grad_norm": 25.934648513793945, + "learning_rate": 2.8594559585492234e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 7903093.0, + "step": 4411 + }, + { + "epoch": 0.7144360780503602, + "grad_norm": 34.78670883178711, + "learning_rate": 2.8578367875647674e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.8984077274799347, + "num_tokens": 7904880.0, + "step": 4412 + }, + { + "epoch": 0.7145980082584407, + "grad_norm": 37.25510025024414, + "learning_rate": 2.8562176165803114e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.8980726301670074, + "num_tokens": 7906667.0, + "step": 4413 + }, + { + "epoch": 0.714759938466521, + "grad_norm": 16.740596771240234, + "learning_rate": 2.8545984455958554e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.938446968793869, + "num_tokens": 7908455.0, + "step": 4414 + }, + { + "epoch": 0.7149218686746013, + "grad_norm": 32.48001480102539, + "learning_rate": 2.8529792746113994e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.89768186211586, + "num_tokens": 7910250.0, + "step": 4415 + }, + { + "epoch": 0.7150837988826816, + "grad_norm": 25.95272445678711, + "learning_rate": 2.8513601036269434e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.9211378395557404, + "num_tokens": 7912041.0, + "step": 4416 + }, + { + "epoch": 0.7152457290907619, + "grad_norm": 24.422197341918945, + "learning_rate": 2.8497409326424875e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9373228549957275, + "num_tokens": 7913824.0, + "step": 4417 + }, + { + "epoch": 0.7154076592988422, + "grad_norm": 27.63548469543457, + "learning_rate": 2.8481217616580315e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 7915609.0, + "step": 4418 + }, + { + "epoch": 0.7155695895069225, + "grad_norm": 24.415332794189453, + "learning_rate": 2.8465025906735755e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.9291534125804901, + "num_tokens": 7917405.0, + "step": 4419 + }, + { + "epoch": 0.7157315197150028, + "grad_norm": 40.579830169677734, + "learning_rate": 2.8448834196891195e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.9033996760845184, + "num_tokens": 7919195.0, + "step": 4420 + }, + { + "epoch": 0.7158934499230831, + "grad_norm": 32.452816009521484, + "learning_rate": 2.8432642487046635e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.8941283226013184, + "num_tokens": 7920981.0, + "step": 4421 + }, + { + "epoch": 0.7160553801311634, + "grad_norm": 17.314289093017578, + "learning_rate": 2.8416450777202075e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.9340579807758331, + "num_tokens": 7922766.0, + "step": 4422 + }, + { + "epoch": 0.7162173103392437, + "grad_norm": 26.498666763305664, + "learning_rate": 2.8400259067357515e-06, + "loss": 0.5645, + "mean_token_accuracy": 0.9185185432434082, + "num_tokens": 7924548.0, + "step": 4423 + }, + { + "epoch": 0.7163792405473242, + "grad_norm": 25.58248519897461, + "learning_rate": 2.8384067357512955e-06, + "loss": 0.5523, + "mean_token_accuracy": 0.9154388010501862, + "num_tokens": 7926320.0, + "step": 4424 + }, + { + "epoch": 0.7165411707554045, + "grad_norm": 34.241920471191406, + "learning_rate": 2.8367875647668396e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.9182370901107788, + "num_tokens": 7928113.0, + "step": 4425 + }, + { + "epoch": 0.7167031009634848, + "grad_norm": 28.584026336669922, + "learning_rate": 2.8351683937823836e-06, + "loss": 0.649, + "mean_token_accuracy": 0.9256640374660492, + "num_tokens": 7929906.0, + "step": 4426 + }, + { + "epoch": 0.7168650311715651, + "grad_norm": 13.715279579162598, + "learning_rate": 2.8335492227979276e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.9361573457717896, + "num_tokens": 7931700.0, + "step": 4427 + }, + { + "epoch": 0.7170269613796454, + "grad_norm": 30.5115909576416, + "learning_rate": 2.8319300518134716e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.9213517606258392, + "num_tokens": 7933480.0, + "step": 4428 + }, + { + "epoch": 0.7171888915877257, + "grad_norm": 29.673900604248047, + "learning_rate": 2.830310880829016e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9187915623188019, + "num_tokens": 7935263.0, + "step": 4429 + }, + { + "epoch": 0.717350821795806, + "grad_norm": 32.55278015136719, + "learning_rate": 2.82869170984456e-06, + "loss": 0.6214, + "mean_token_accuracy": 0.9115812182426453, + "num_tokens": 7937057.0, + "step": 4430 + }, + { + "epoch": 0.7175127520038863, + "grad_norm": 25.465343475341797, + "learning_rate": 2.827072538860104e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.9177807569503784, + "num_tokens": 7938848.0, + "step": 4431 + }, + { + "epoch": 0.7176746822119666, + "grad_norm": 23.165142059326172, + "learning_rate": 2.825453367875648e-06, + "loss": 0.593, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 7940639.0, + "step": 4432 + }, + { + "epoch": 0.7178366124200469, + "grad_norm": 26.611894607543945, + "learning_rate": 2.823834196891192e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.9219181835651398, + "num_tokens": 7942420.0, + "step": 4433 + }, + { + "epoch": 0.7179985426281272, + "grad_norm": 24.82908821105957, + "learning_rate": 2.822215025906736e-06, + "loss": 0.5788, + "mean_token_accuracy": 0.9200018048286438, + "num_tokens": 7944231.0, + "step": 4434 + }, + { + "epoch": 0.7181604728362075, + "grad_norm": 33.258602142333984, + "learning_rate": 2.82059585492228e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.9182238876819611, + "num_tokens": 7946036.0, + "step": 4435 + }, + { + "epoch": 0.718322403044288, + "grad_norm": 38.56667709350586, + "learning_rate": 2.818976683937824e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.9082205593585968, + "num_tokens": 7947843.0, + "step": 4436 + }, + { + "epoch": 0.7184843332523683, + "grad_norm": 29.333974838256836, + "learning_rate": 2.817357512953368e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.9229983687400818, + "num_tokens": 7949652.0, + "step": 4437 + }, + { + "epoch": 0.7186462634604486, + "grad_norm": 34.60757827758789, + "learning_rate": 2.815738341968912e-06, + "loss": 0.674, + "mean_token_accuracy": 0.9056751430034637, + "num_tokens": 7951459.0, + "step": 4438 + }, + { + "epoch": 0.7188081936685289, + "grad_norm": 33.5838737487793, + "learning_rate": 2.814119170984456e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.9107142686843872, + "num_tokens": 7953251.0, + "step": 4439 + }, + { + "epoch": 0.7189701238766092, + "grad_norm": 28.21237564086914, + "learning_rate": 2.8125e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.9224817752838135, + "num_tokens": 7955047.0, + "step": 4440 + }, + { + "epoch": 0.7191320540846895, + "grad_norm": 24.667551040649414, + "learning_rate": 2.810880829015544e-06, + "loss": 0.5641, + "mean_token_accuracy": 0.9178914129734039, + "num_tokens": 7956839.0, + "step": 4441 + }, + { + "epoch": 0.7192939842927698, + "grad_norm": 20.769487380981445, + "learning_rate": 2.809261658031088e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.9259096682071686, + "num_tokens": 7958621.0, + "step": 4442 + }, + { + "epoch": 0.7194559145008501, + "grad_norm": 24.97512435913086, + "learning_rate": 2.8076424870466322e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.9330339431762695, + "num_tokens": 7960418.0, + "step": 4443 + }, + { + "epoch": 0.7196178447089304, + "grad_norm": 23.201566696166992, + "learning_rate": 2.8060233160621762e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.9287699460983276, + "num_tokens": 7962209.0, + "step": 4444 + }, + { + "epoch": 0.7197797749170107, + "grad_norm": 30.520273208618164, + "learning_rate": 2.8044041450777202e-06, + "loss": 0.6005, + "mean_token_accuracy": 0.9152927696704865, + "num_tokens": 7964004.0, + "step": 4445 + }, + { + "epoch": 0.719941705125091, + "grad_norm": 31.200077056884766, + "learning_rate": 2.8027849740932643e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9131302535533905, + "num_tokens": 7965792.0, + "step": 4446 + }, + { + "epoch": 0.7201036353331715, + "grad_norm": 27.254308700561523, + "learning_rate": 2.8011658031088083e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.9178068339824677, + "num_tokens": 7967585.0, + "step": 4447 + }, + { + "epoch": 0.7202655655412518, + "grad_norm": 28.498950958251953, + "learning_rate": 2.7995466321243527e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.9281594455242157, + "num_tokens": 7969376.0, + "step": 4448 + }, + { + "epoch": 0.7204274957493321, + "grad_norm": 37.301177978515625, + "learning_rate": 2.7979274611398967e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.9143702983856201, + "num_tokens": 7971168.0, + "step": 4449 + }, + { + "epoch": 0.7205894259574124, + "grad_norm": 17.07199478149414, + "learning_rate": 2.7963082901554407e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.9329670369625092, + "num_tokens": 7972963.0, + "step": 4450 + }, + { + "epoch": 0.7207513561654927, + "grad_norm": 28.770893096923828, + "learning_rate": 2.7946891191709847e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.9230273962020874, + "num_tokens": 7974748.0, + "step": 4451 + }, + { + "epoch": 0.720913286373573, + "grad_norm": 23.010498046875, + "learning_rate": 2.7930699481865288e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.9219458401203156, + "num_tokens": 7976529.0, + "step": 4452 + }, + { + "epoch": 0.7210752165816533, + "grad_norm": 20.701244354248047, + "learning_rate": 2.7914507772020728e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.9202856719493866, + "num_tokens": 7978317.0, + "step": 4453 + }, + { + "epoch": 0.7212371467897336, + "grad_norm": 21.25913429260254, + "learning_rate": 2.7898316062176168e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.9330285787582397, + "num_tokens": 7980112.0, + "step": 4454 + }, + { + "epoch": 0.7213990769978139, + "grad_norm": 16.468021392822266, + "learning_rate": 2.788212435233161e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.9295434653759003, + "num_tokens": 7981894.0, + "step": 4455 + }, + { + "epoch": 0.7215610072058942, + "grad_norm": 31.411575317382812, + "learning_rate": 2.786593264248705e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.9133903086185455, + "num_tokens": 7983684.0, + "step": 4456 + }, + { + "epoch": 0.7217229374139745, + "grad_norm": 29.316164016723633, + "learning_rate": 2.784974093264249e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.9164022207260132, + "num_tokens": 7985471.0, + "step": 4457 + }, + { + "epoch": 0.721884867622055, + "grad_norm": 39.83155059814453, + "learning_rate": 2.783354922279793e-06, + "loss": 0.803, + "mean_token_accuracy": 0.898330569267273, + "num_tokens": 7987268.0, + "step": 4458 + }, + { + "epoch": 0.7220467978301353, + "grad_norm": 27.66533851623535, + "learning_rate": 2.781735751295337e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.924761027097702, + "num_tokens": 7989059.0, + "step": 4459 + }, + { + "epoch": 0.7222087280382156, + "grad_norm": 22.771656036376953, + "learning_rate": 2.780116580310881e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.9199460446834564, + "num_tokens": 7990846.0, + "step": 4460 + }, + { + "epoch": 0.7223706582462959, + "grad_norm": 21.277753829956055, + "learning_rate": 2.778497409326425e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.9264546930789948, + "num_tokens": 7992630.0, + "step": 4461 + }, + { + "epoch": 0.7225325884543762, + "grad_norm": 32.62319564819336, + "learning_rate": 2.776878238341969e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.9145377278327942, + "num_tokens": 7994423.0, + "step": 4462 + }, + { + "epoch": 0.7226945186624565, + "grad_norm": 23.01576805114746, + "learning_rate": 2.775259067357513e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9253723621368408, + "num_tokens": 7996216.0, + "step": 4463 + }, + { + "epoch": 0.7228564488705368, + "grad_norm": 22.82634162902832, + "learning_rate": 2.773639896373057e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.9188725650310516, + "num_tokens": 7997999.0, + "step": 4464 + }, + { + "epoch": 0.7230183790786171, + "grad_norm": 24.165660858154297, + "learning_rate": 2.772020725388601e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.9330986142158508, + "num_tokens": 7999795.0, + "step": 4465 + }, + { + "epoch": 0.7231803092866974, + "grad_norm": 30.761465072631836, + "learning_rate": 2.7704015544041454e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.9239272475242615, + "num_tokens": 8001610.0, + "step": 4466 + }, + { + "epoch": 0.7233422394947777, + "grad_norm": 24.10455894470215, + "learning_rate": 2.7687823834196894e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.8966230750083923, + "num_tokens": 8003393.0, + "step": 4467 + }, + { + "epoch": 0.723504169702858, + "grad_norm": 26.58614730834961, + "learning_rate": 2.7671632124352334e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.9163140058517456, + "num_tokens": 8005180.0, + "step": 4468 + }, + { + "epoch": 0.7236660999109383, + "grad_norm": 29.153697967529297, + "learning_rate": 2.7655440414507774e-06, + "loss": 0.5736, + "mean_token_accuracy": 0.9194581210613251, + "num_tokens": 8006977.0, + "step": 4469 + }, + { + "epoch": 0.7238280301190188, + "grad_norm": 28.157649993896484, + "learning_rate": 2.7639248704663214e-06, + "loss": 0.5522, + "mean_token_accuracy": 0.9168067276477814, + "num_tokens": 8008765.0, + "step": 4470 + }, + { + "epoch": 0.7239899603270991, + "grad_norm": 35.398372650146484, + "learning_rate": 2.7623056994818654e-06, + "loss": 0.616, + "mean_token_accuracy": 0.9041857421398163, + "num_tokens": 8010559.0, + "step": 4471 + }, + { + "epoch": 0.7241518905351794, + "grad_norm": 39.47248077392578, + "learning_rate": 2.7606865284974094e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.8973684012889862, + "num_tokens": 8012344.0, + "step": 4472 + }, + { + "epoch": 0.7243138207432597, + "grad_norm": 21.398977279663086, + "learning_rate": 2.7590673575129535e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.9199346303939819, + "num_tokens": 8014145.0, + "step": 4473 + }, + { + "epoch": 0.72447575095134, + "grad_norm": 32.29393768310547, + "learning_rate": 2.7574481865284975e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.9151960909366608, + "num_tokens": 8015928.0, + "step": 4474 + }, + { + "epoch": 0.7246376811594203, + "grad_norm": 24.342317581176758, + "learning_rate": 2.7558290155440415e-06, + "loss": 0.5865, + "mean_token_accuracy": 0.9176767766475677, + "num_tokens": 8017707.0, + "step": 4475 + }, + { + "epoch": 0.7247996113675006, + "grad_norm": 27.52862548828125, + "learning_rate": 2.7542098445595855e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.9293971955776215, + "num_tokens": 8019503.0, + "step": 4476 + }, + { + "epoch": 0.7249615415755809, + "grad_norm": 24.93927001953125, + "learning_rate": 2.7525906735751295e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.9150778949260712, + "num_tokens": 8021286.0, + "step": 4477 + }, + { + "epoch": 0.7251234717836612, + "grad_norm": 31.143369674682617, + "learning_rate": 2.7509715025906735e-06, + "loss": 0.657, + "mean_token_accuracy": 0.9091029465198517, + "num_tokens": 8023073.0, + "step": 4478 + }, + { + "epoch": 0.7252854019917415, + "grad_norm": 23.28335189819336, + "learning_rate": 2.7493523316062175e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.9248132109642029, + "num_tokens": 8024863.0, + "step": 4479 + }, + { + "epoch": 0.7254473321998218, + "grad_norm": 35.27939987182617, + "learning_rate": 2.7477331606217615e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.9028554856777191, + "num_tokens": 8026663.0, + "step": 4480 + }, + { + "epoch": 0.7256092624079022, + "grad_norm": 28.665430068969727, + "learning_rate": 2.7461139896373056e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.9157810211181641, + "num_tokens": 8028448.0, + "step": 4481 + }, + { + "epoch": 0.7257711926159826, + "grad_norm": 30.171079635620117, + "learning_rate": 2.7444948186528496e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.9107106626033783, + "num_tokens": 8030241.0, + "step": 4482 + }, + { + "epoch": 0.7259331228240629, + "grad_norm": 25.86968231201172, + "learning_rate": 2.7428756476683936e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.9139243066310883, + "num_tokens": 8032032.0, + "step": 4483 + }, + { + "epoch": 0.7260950530321432, + "grad_norm": 35.32841491699219, + "learning_rate": 2.7412564766839376e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.9093185067176819, + "num_tokens": 8033831.0, + "step": 4484 + }, + { + "epoch": 0.7262569832402235, + "grad_norm": 20.021224975585938, + "learning_rate": 2.739637305699482e-06, + "loss": 0.534, + "mean_token_accuracy": 0.9287814497947693, + "num_tokens": 8035624.0, + "step": 4485 + }, + { + "epoch": 0.7264189134483038, + "grad_norm": 30.640483856201172, + "learning_rate": 2.738018134715026e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.9145525991916656, + "num_tokens": 8037416.0, + "step": 4486 + }, + { + "epoch": 0.7265808436563841, + "grad_norm": 21.73912239074707, + "learning_rate": 2.73639896373057e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.9213893115520477, + "num_tokens": 8039195.0, + "step": 4487 + }, + { + "epoch": 0.7267427738644644, + "grad_norm": 42.398555755615234, + "learning_rate": 2.734779792746114e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.8822393715381622, + "num_tokens": 8040995.0, + "step": 4488 + }, + { + "epoch": 0.7269047040725447, + "grad_norm": 28.758895874023438, + "learning_rate": 2.7331606217616585e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.9237777590751648, + "num_tokens": 8042783.0, + "step": 4489 + }, + { + "epoch": 0.727066634280625, + "grad_norm": 33.264892578125, + "learning_rate": 2.7315414507772025e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.8986244201660156, + "num_tokens": 8044590.0, + "step": 4490 + }, + { + "epoch": 0.7272285644887053, + "grad_norm": 31.843368530273438, + "learning_rate": 2.7299222797927465e-06, + "loss": 0.619, + "mean_token_accuracy": 0.9055226147174835, + "num_tokens": 8046377.0, + "step": 4491 + }, + { + "epoch": 0.7273904946967857, + "grad_norm": 30.81119155883789, + "learning_rate": 2.7283031088082906e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.9065735042095184, + "num_tokens": 8048167.0, + "step": 4492 + }, + { + "epoch": 0.727552424904866, + "grad_norm": 28.915250778198242, + "learning_rate": 2.7266839378238346e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.9042074978351593, + "num_tokens": 8049959.0, + "step": 4493 + }, + { + "epoch": 0.7277143551129464, + "grad_norm": 21.78362274169922, + "learning_rate": 2.7250647668393786e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9322995841503143, + "num_tokens": 8051752.0, + "step": 4494 + }, + { + "epoch": 0.7278762853210267, + "grad_norm": 30.810924530029297, + "learning_rate": 2.7234455958549226e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.9154135286808014, + "num_tokens": 8053549.0, + "step": 4495 + }, + { + "epoch": 0.728038215529107, + "grad_norm": 32.02534484863281, + "learning_rate": 2.7218264248704666e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.9167470633983612, + "num_tokens": 8055349.0, + "step": 4496 + }, + { + "epoch": 0.7282001457371873, + "grad_norm": 26.702011108398438, + "learning_rate": 2.7202072538860106e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.9259259104728699, + "num_tokens": 8057131.0, + "step": 4497 + }, + { + "epoch": 0.7283620759452676, + "grad_norm": 19.676074981689453, + "learning_rate": 2.718588082901555e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.9384453892707825, + "num_tokens": 8058919.0, + "step": 4498 + }, + { + "epoch": 0.7285240061533479, + "grad_norm": 30.8987979888916, + "learning_rate": 2.716968911917099e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.9179501235485077, + "num_tokens": 8060712.0, + "step": 4499 + }, + { + "epoch": 0.7286859363614282, + "grad_norm": 21.482254028320312, + "learning_rate": 2.715349740932643e-06, + "loss": 0.5554, + "mean_token_accuracy": 0.9185061454772949, + "num_tokens": 8062492.0, + "step": 4500 + }, + { + "epoch": 0.7288478665695085, + "grad_norm": 32.7093620300293, + "learning_rate": 2.713730569948187e-06, + "loss": 0.76, + "mean_token_accuracy": 0.9074721336364746, + "num_tokens": 8064285.0, + "step": 4501 + }, + { + "epoch": 0.7290097967775888, + "grad_norm": 30.86467933654785, + "learning_rate": 2.712111398963731e-06, + "loss": 0.6555, + "mean_token_accuracy": 0.9074627757072449, + "num_tokens": 8066078.0, + "step": 4502 + }, + { + "epoch": 0.7291717269856691, + "grad_norm": 30.52082633972168, + "learning_rate": 2.710492227979275e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.9227604269981384, + "num_tokens": 8067875.0, + "step": 4503 + }, + { + "epoch": 0.7293336571937495, + "grad_norm": 17.34865379333496, + "learning_rate": 2.708873056994819e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.9340579807758331, + "num_tokens": 8069675.0, + "step": 4504 + }, + { + "epoch": 0.7294955874018298, + "grad_norm": 19.844480514526367, + "learning_rate": 2.707253886010363e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.927152156829834, + "num_tokens": 8071475.0, + "step": 4505 + }, + { + "epoch": 0.7296575176099102, + "grad_norm": 31.885820388793945, + "learning_rate": 2.705634715025907e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.9118930697441101, + "num_tokens": 8073271.0, + "step": 4506 + }, + { + "epoch": 0.7298194478179905, + "grad_norm": 25.198123931884766, + "learning_rate": 2.704015544041451e-06, + "loss": 0.5735, + "mean_token_accuracy": 0.9150778949260712, + "num_tokens": 8075054.0, + "step": 4507 + }, + { + "epoch": 0.7299813780260708, + "grad_norm": 29.360118865966797, + "learning_rate": 2.702396373056995e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.9232352077960968, + "num_tokens": 8076852.0, + "step": 4508 + }, + { + "epoch": 0.7301433082341511, + "grad_norm": 35.02790832519531, + "learning_rate": 2.700777202072539e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.9157635569572449, + "num_tokens": 8078649.0, + "step": 4509 + }, + { + "epoch": 0.7303052384422314, + "grad_norm": 23.5816650390625, + "learning_rate": 2.699158031088083e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.9192083179950714, + "num_tokens": 8080433.0, + "step": 4510 + }, + { + "epoch": 0.7304671686503117, + "grad_norm": 17.632251739501953, + "learning_rate": 2.6975388601036272e-06, + "loss": 0.471, + "mean_token_accuracy": 0.9327344298362732, + "num_tokens": 8082212.0, + "step": 4511 + }, + { + "epoch": 0.730629098858392, + "grad_norm": 29.55930519104004, + "learning_rate": 2.6959196891191712e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.913509339094162, + "num_tokens": 8084002.0, + "step": 4512 + }, + { + "epoch": 0.7307910290664723, + "grad_norm": 17.11616325378418, + "learning_rate": 2.6943005181347152e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.9365563690662384, + "num_tokens": 8085797.0, + "step": 4513 + }, + { + "epoch": 0.7309529592745526, + "grad_norm": 29.031143188476562, + "learning_rate": 2.6926813471502593e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.9231182336807251, + "num_tokens": 8087582.0, + "step": 4514 + }, + { + "epoch": 0.731114889482633, + "grad_norm": 25.365745544433594, + "learning_rate": 2.6910621761658033e-06, + "loss": 0.5936, + "mean_token_accuracy": 0.9157062470912933, + "num_tokens": 8089379.0, + "step": 4515 + }, + { + "epoch": 0.7312768196907133, + "grad_norm": 41.268287658691406, + "learning_rate": 2.6894430051813473e-06, + "loss": 0.677, + "mean_token_accuracy": 0.905601978302002, + "num_tokens": 8091166.0, + "step": 4516 + }, + { + "epoch": 0.7314387498987936, + "grad_norm": 27.64450454711914, + "learning_rate": 2.6878238341968917e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.918313592672348, + "num_tokens": 8092948.0, + "step": 4517 + }, + { + "epoch": 0.731600680106874, + "grad_norm": 28.30603790283203, + "learning_rate": 2.6862046632124357e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.9302924573421478, + "num_tokens": 8094747.0, + "step": 4518 + }, + { + "epoch": 0.7317626103149543, + "grad_norm": 34.9337272644043, + "learning_rate": 2.6845854922279798e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.9106077551841736, + "num_tokens": 8096538.0, + "step": 4519 + }, + { + "epoch": 0.7319245405230346, + "grad_norm": 34.09270477294922, + "learning_rate": 2.6829663212435238e-06, + "loss": 0.6642, + "mean_token_accuracy": 0.9173611104488373, + "num_tokens": 8098329.0, + "step": 4520 + }, + { + "epoch": 0.7320864707311149, + "grad_norm": 24.97817611694336, + "learning_rate": 2.6813471502590678e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.9181795120239258, + "num_tokens": 8100122.0, + "step": 4521 + }, + { + "epoch": 0.7322484009391952, + "grad_norm": 25.789073944091797, + "learning_rate": 2.679727979274612e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.9198764860630035, + "num_tokens": 8101921.0, + "step": 4522 + }, + { + "epoch": 0.7324103311472755, + "grad_norm": 32.63774490356445, + "learning_rate": 2.678108808290156e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.9186813235282898, + "num_tokens": 8103716.0, + "step": 4523 + }, + { + "epoch": 0.7325722613553558, + "grad_norm": 38.643985748291016, + "learning_rate": 2.6764896373057e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.9055226147174835, + "num_tokens": 8105503.0, + "step": 4524 + }, + { + "epoch": 0.7327341915634361, + "grad_norm": 26.946096420288086, + "learning_rate": 2.674870466321244e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.9114170372486115, + "num_tokens": 8107309.0, + "step": 4525 + }, + { + "epoch": 0.7328961217715165, + "grad_norm": 43.800350189208984, + "learning_rate": 2.673251295336788e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.8983269035816193, + "num_tokens": 8109116.0, + "step": 4526 + }, + { + "epoch": 0.7330580519795968, + "grad_norm": 21.17057991027832, + "learning_rate": 2.671632124352332e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.9265628457069397, + "num_tokens": 8110900.0, + "step": 4527 + }, + { + "epoch": 0.7332199821876771, + "grad_norm": 23.218595504760742, + "learning_rate": 2.670012953367876e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9279391169548035, + "num_tokens": 8112690.0, + "step": 4528 + }, + { + "epoch": 0.7333819123957575, + "grad_norm": 36.31922912597656, + "learning_rate": 2.66839378238342e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.9055076837539673, + "num_tokens": 8114478.0, + "step": 4529 + }, + { + "epoch": 0.7335438426038378, + "grad_norm": 37.84391784667969, + "learning_rate": 2.666774611398964e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.9069534540176392, + "num_tokens": 8116269.0, + "step": 4530 + }, + { + "epoch": 0.7337057728119181, + "grad_norm": 19.089780807495117, + "learning_rate": 2.665155440414508e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.9347689151763916, + "num_tokens": 8118057.0, + "step": 4531 + }, + { + "epoch": 0.7338677030199984, + "grad_norm": 23.26643180847168, + "learning_rate": 2.663536269430052e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 8119847.0, + "step": 4532 + }, + { + "epoch": 0.7340296332280787, + "grad_norm": 34.35528564453125, + "learning_rate": 2.661917098445596e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.9210858941078186, + "num_tokens": 8121641.0, + "step": 4533 + }, + { + "epoch": 0.734191563436159, + "grad_norm": 26.72820472717285, + "learning_rate": 2.66029792746114e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.9178895652294159, + "num_tokens": 8123431.0, + "step": 4534 + }, + { + "epoch": 0.7343534936442393, + "grad_norm": 20.602807998657227, + "learning_rate": 2.658678756476684e-06, + "loss": 0.461, + "mean_token_accuracy": 0.9303059875965118, + "num_tokens": 8125230.0, + "step": 4535 + }, + { + "epoch": 0.7345154238523196, + "grad_norm": 34.85890197753906, + "learning_rate": 2.6570595854922284e-06, + "loss": 0.5697, + "mean_token_accuracy": 0.9105250835418701, + "num_tokens": 8127033.0, + "step": 4536 + }, + { + "epoch": 0.7346773540604, + "grad_norm": 18.087966918945312, + "learning_rate": 2.6554404145077724e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.9372571706771851, + "num_tokens": 8128832.0, + "step": 4537 + }, + { + "epoch": 0.7348392842684803, + "grad_norm": 23.630273818969727, + "learning_rate": 2.6538212435233164e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.9117899835109711, + "num_tokens": 8130617.0, + "step": 4538 + }, + { + "epoch": 0.7350012144765606, + "grad_norm": 20.222585678100586, + "learning_rate": 2.6522020725388604e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.931367963552475, + "num_tokens": 8132406.0, + "step": 4539 + }, + { + "epoch": 0.735163144684641, + "grad_norm": 22.488021850585938, + "learning_rate": 2.6505829015544044e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.928437352180481, + "num_tokens": 8134199.0, + "step": 4540 + }, + { + "epoch": 0.7353250748927213, + "grad_norm": 39.27302551269531, + "learning_rate": 2.6489637305699485e-06, + "loss": 0.5943, + "mean_token_accuracy": 0.9212057292461395, + "num_tokens": 8136003.0, + "step": 4541 + }, + { + "epoch": 0.7354870051008016, + "grad_norm": 10.930018424987793, + "learning_rate": 2.6473445595854925e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.935300201177597, + "num_tokens": 8137793.0, + "step": 4542 + }, + { + "epoch": 0.7356489353088819, + "grad_norm": 27.647703170776367, + "learning_rate": 2.6457253886010365e-06, + "loss": 0.5296, + "mean_token_accuracy": 0.9278675615787506, + "num_tokens": 8139582.0, + "step": 4543 + }, + { + "epoch": 0.7358108655169622, + "grad_norm": 26.010183334350586, + "learning_rate": 2.6441062176165805e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.9241397082805634, + "num_tokens": 8141371.0, + "step": 4544 + }, + { + "epoch": 0.7359727957250425, + "grad_norm": 18.476205825805664, + "learning_rate": 2.6424870466321245e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.9265206754207611, + "num_tokens": 8143155.0, + "step": 4545 + }, + { + "epoch": 0.7361347259331228, + "grad_norm": 26.902023315429688, + "learning_rate": 2.6408678756476685e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.9239118993282318, + "num_tokens": 8144942.0, + "step": 4546 + }, + { + "epoch": 0.7362966561412031, + "grad_norm": 21.286592483520508, + "learning_rate": 2.6392487046632125e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9273109138011932, + "num_tokens": 8146730.0, + "step": 4547 + }, + { + "epoch": 0.7364585863492834, + "grad_norm": 22.702796936035156, + "learning_rate": 2.6376295336787566e-06, + "loss": 0.53, + "mean_token_accuracy": 0.9247430562973022, + "num_tokens": 8148521.0, + "step": 4548 + }, + { + "epoch": 0.7366205165573638, + "grad_norm": 28.53870391845703, + "learning_rate": 2.6360103626943006e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.9134083986282349, + "num_tokens": 8150310.0, + "step": 4549 + }, + { + "epoch": 0.7367824467654441, + "grad_norm": 33.73854064941406, + "learning_rate": 2.6343911917098446e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.9062696099281311, + "num_tokens": 8152110.0, + "step": 4550 + }, + { + "epoch": 0.7369443769735244, + "grad_norm": 30.758499145507812, + "learning_rate": 2.6327720207253886e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9248366057872772, + "num_tokens": 8153902.0, + "step": 4551 + }, + { + "epoch": 0.7371063071816047, + "grad_norm": 34.09508514404297, + "learning_rate": 2.6311528497409326e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.9076973795890808, + "num_tokens": 8155695.0, + "step": 4552 + }, + { + "epoch": 0.737268237389685, + "grad_norm": 35.46419143676758, + "learning_rate": 2.6295336787564766e-06, + "loss": 0.6093, + "mean_token_accuracy": 0.9127551019191742, + "num_tokens": 8157494.0, + "step": 4553 + }, + { + "epoch": 0.7374301675977654, + "grad_norm": 26.774829864501953, + "learning_rate": 2.6279145077720206e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.9240111112594604, + "num_tokens": 8159280.0, + "step": 4554 + }, + { + "epoch": 0.7375920978058457, + "grad_norm": 28.504465103149414, + "learning_rate": 2.626295336787565e-06, + "loss": 0.561, + "mean_token_accuracy": 0.9186064004898071, + "num_tokens": 8161075.0, + "step": 4555 + }, + { + "epoch": 0.737754028013926, + "grad_norm": 23.045183181762695, + "learning_rate": 2.624676165803109e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.9279979169368744, + "num_tokens": 8162865.0, + "step": 4556 + }, + { + "epoch": 0.7379159582220063, + "grad_norm": 36.4827880859375, + "learning_rate": 2.623056994818653e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.905844658613205, + "num_tokens": 8164653.0, + "step": 4557 + }, + { + "epoch": 0.7380778884300866, + "grad_norm": 34.03557586669922, + "learning_rate": 2.621437823834197e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.9111787378787994, + "num_tokens": 8166447.0, + "step": 4558 + }, + { + "epoch": 0.7382398186381669, + "grad_norm": 21.248807907104492, + "learning_rate": 2.619818652849741e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.931494414806366, + "num_tokens": 8168237.0, + "step": 4559 + }, + { + "epoch": 0.7384017488462473, + "grad_norm": 27.218454360961914, + "learning_rate": 2.618199481865285e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.9199725091457367, + "num_tokens": 8170024.0, + "step": 4560 + }, + { + "epoch": 0.7385636790543276, + "grad_norm": 27.11505699157715, + "learning_rate": 2.616580310880829e-06, + "loss": 0.6031, + "mean_token_accuracy": 0.920443207025528, + "num_tokens": 8171812.0, + "step": 4561 + }, + { + "epoch": 0.7387256092624079, + "grad_norm": 36.43574905395508, + "learning_rate": 2.614961139896373e-06, + "loss": 0.577, + "mean_token_accuracy": 0.905918687582016, + "num_tokens": 8173611.0, + "step": 4562 + }, + { + "epoch": 0.7388875394704882, + "grad_norm": 21.802791595458984, + "learning_rate": 2.613341968911917e-06, + "loss": 0.479, + "mean_token_accuracy": 0.9290744364261627, + "num_tokens": 8175405.0, + "step": 4563 + }, + { + "epoch": 0.7390494696785685, + "grad_norm": 25.826353073120117, + "learning_rate": 2.611722797927461e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.9202898740768433, + "num_tokens": 8177193.0, + "step": 4564 + }, + { + "epoch": 0.7392113998866489, + "grad_norm": 35.67538070678711, + "learning_rate": 2.610103626943005e-06, + "loss": 0.5741, + "mean_token_accuracy": 0.9167016744613647, + "num_tokens": 8178981.0, + "step": 4565 + }, + { + "epoch": 0.7393733300947292, + "grad_norm": 30.726619720458984, + "learning_rate": 2.608484455958549e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.9203213453292847, + "num_tokens": 8180781.0, + "step": 4566 + }, + { + "epoch": 0.7395352603028095, + "grad_norm": 26.228023529052734, + "learning_rate": 2.6068652849740932e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.9026198387145996, + "num_tokens": 8182559.0, + "step": 4567 + }, + { + "epoch": 0.7396971905108898, + "grad_norm": 27.367204666137695, + "learning_rate": 2.6052461139896372e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.9166931509971619, + "num_tokens": 8184346.0, + "step": 4568 + }, + { + "epoch": 0.7398591207189701, + "grad_norm": 20.97516441345215, + "learning_rate": 2.6036269430051813e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.9273166060447693, + "num_tokens": 8186146.0, + "step": 4569 + }, + { + "epoch": 0.7400210509270504, + "grad_norm": 28.53069305419922, + "learning_rate": 2.6020077720207253e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.9271235466003418, + "num_tokens": 8187946.0, + "step": 4570 + }, + { + "epoch": 0.7401829811351308, + "grad_norm": 39.1164665222168, + "learning_rate": 2.6003886010362693e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.9281156361103058, + "num_tokens": 8189750.0, + "step": 4571 + }, + { + "epoch": 0.7403449113432111, + "grad_norm": 35.342411041259766, + "learning_rate": 2.5987694300518133e-06, + "loss": 0.6741, + "mean_token_accuracy": 0.9109818339347839, + "num_tokens": 8191532.0, + "step": 4572 + }, + { + "epoch": 0.7405068415512914, + "grad_norm": 32.37525939941406, + "learning_rate": 2.5971502590673577e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.9048504829406738, + "num_tokens": 8193328.0, + "step": 4573 + }, + { + "epoch": 0.7406687717593717, + "grad_norm": 19.774675369262695, + "learning_rate": 2.5955310880829017e-06, + "loss": 0.469, + "mean_token_accuracy": 0.936292290687561, + "num_tokens": 8195122.0, + "step": 4574 + }, + { + "epoch": 0.740830701967452, + "grad_norm": 35.81763458251953, + "learning_rate": 2.5939119170984458e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.9225564002990723, + "num_tokens": 8196907.0, + "step": 4575 + }, + { + "epoch": 0.7409926321755324, + "grad_norm": 26.13691520690918, + "learning_rate": 2.5922927461139898e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.9167953729629517, + "num_tokens": 8198707.0, + "step": 4576 + }, + { + "epoch": 0.7411545623836127, + "grad_norm": 23.173473358154297, + "learning_rate": 2.5906735751295338e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.923865407705307, + "num_tokens": 8200496.0, + "step": 4577 + }, + { + "epoch": 0.741316492591693, + "grad_norm": 26.636085510253906, + "learning_rate": 2.589054404145078e-06, + "loss": 0.5835, + "mean_token_accuracy": 0.9238302707672119, + "num_tokens": 8202284.0, + "step": 4578 + }, + { + "epoch": 0.7414784227997733, + "grad_norm": 28.24758529663086, + "learning_rate": 2.587435233160622e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.9003292620182037, + "num_tokens": 8204077.0, + "step": 4579 + }, + { + "epoch": 0.7416403530078536, + "grad_norm": 27.013755798339844, + "learning_rate": 2.585816062176166e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 8205861.0, + "step": 4580 + }, + { + "epoch": 0.7418022832159339, + "grad_norm": 22.328622817993164, + "learning_rate": 2.58419689119171e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.9242043793201447, + "num_tokens": 8207635.0, + "step": 4581 + }, + { + "epoch": 0.7419642134240142, + "grad_norm": 21.46522331237793, + "learning_rate": 2.582577720207254e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9261177480220795, + "num_tokens": 8209418.0, + "step": 4582 + }, + { + "epoch": 0.7421261436320946, + "grad_norm": 31.285398483276367, + "learning_rate": 2.580958549222798e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.9101460576057434, + "num_tokens": 8211207.0, + "step": 4583 + }, + { + "epoch": 0.7422880738401749, + "grad_norm": 33.974853515625, + "learning_rate": 2.579339378238342e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.9106192588806152, + "num_tokens": 8213011.0, + "step": 4584 + }, + { + "epoch": 0.7424500040482552, + "grad_norm": 25.935009002685547, + "learning_rate": 2.5777202072538863e-06, + "loss": 0.51, + "mean_token_accuracy": 0.9252786040306091, + "num_tokens": 8214804.0, + "step": 4585 + }, + { + "epoch": 0.7426119342563355, + "grad_norm": 24.187061309814453, + "learning_rate": 2.5761010362694307e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.9250700175762177, + "num_tokens": 8216609.0, + "step": 4586 + }, + { + "epoch": 0.7427738644644158, + "grad_norm": 29.433578491210938, + "learning_rate": 2.5744818652849748e-06, + "loss": 0.6014, + "mean_token_accuracy": 0.9217752516269684, + "num_tokens": 8218402.0, + "step": 4587 + }, + { + "epoch": 0.7429357946724962, + "grad_norm": 32.3162727355957, + "learning_rate": 2.5728626943005188e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 8220193.0, + "step": 4588 + }, + { + "epoch": 0.7430977248805765, + "grad_norm": 32.099700927734375, + "learning_rate": 2.5712435233160628e-06, + "loss": 0.586, + "mean_token_accuracy": 0.9170315861701965, + "num_tokens": 8221982.0, + "step": 4589 + }, + { + "epoch": 0.7432596550886568, + "grad_norm": 29.042383193969727, + "learning_rate": 2.569624352331607e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.9181869029998779, + "num_tokens": 8223775.0, + "step": 4590 + }, + { + "epoch": 0.7434215852967371, + "grad_norm": 27.842449188232422, + "learning_rate": 2.568005181347151e-06, + "loss": 0.725, + "mean_token_accuracy": 0.9128378331661224, + "num_tokens": 8225565.0, + "step": 4591 + }, + { + "epoch": 0.7435835155048174, + "grad_norm": 28.261457443237305, + "learning_rate": 2.566386010362695e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.9254246950149536, + "num_tokens": 8227345.0, + "step": 4592 + }, + { + "epoch": 0.7437454457128977, + "grad_norm": 30.498554229736328, + "learning_rate": 2.564766839378239e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.9200068414211273, + "num_tokens": 8229143.0, + "step": 4593 + }, + { + "epoch": 0.7439073759209781, + "grad_norm": 29.88692283630371, + "learning_rate": 2.563147668393783e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.9191001653671265, + "num_tokens": 8230927.0, + "step": 4594 + }, + { + "epoch": 0.7440693061290584, + "grad_norm": 29.168485641479492, + "learning_rate": 2.561528497409327e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.9114203155040741, + "num_tokens": 8232720.0, + "step": 4595 + }, + { + "epoch": 0.7442312363371387, + "grad_norm": 19.69017791748047, + "learning_rate": 2.559909326424871e-06, + "loss": 0.5573, + "mean_token_accuracy": 0.9266602396965027, + "num_tokens": 8234505.0, + "step": 4596 + }, + { + "epoch": 0.744393166545219, + "grad_norm": 24.87456512451172, + "learning_rate": 2.558290155440415e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.9115867018699646, + "num_tokens": 8236288.0, + "step": 4597 + }, + { + "epoch": 0.7445550967532993, + "grad_norm": 32.47265625, + "learning_rate": 2.556670984455959e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.9211202263832092, + "num_tokens": 8238079.0, + "step": 4598 + }, + { + "epoch": 0.7447170269613796, + "grad_norm": 29.497760772705078, + "learning_rate": 2.555051813471503e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.9106115698814392, + "num_tokens": 8239860.0, + "step": 4599 + }, + { + "epoch": 0.74487895716946, + "grad_norm": 25.900714874267578, + "learning_rate": 2.553432642487047e-06, + "loss": 0.5471, + "mean_token_accuracy": 0.9231597185134888, + "num_tokens": 8241658.0, + "step": 4600 + }, + { + "epoch": 0.7450408873775403, + "grad_norm": 35.44560623168945, + "learning_rate": 2.551813471502591e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.9112118780612946, + "num_tokens": 8243451.0, + "step": 4601 + }, + { + "epoch": 0.7452028175856206, + "grad_norm": 30.347888946533203, + "learning_rate": 2.550194300518135e-06, + "loss": 0.5692, + "mean_token_accuracy": 0.9249999821186066, + "num_tokens": 8245243.0, + "step": 4602 + }, + { + "epoch": 0.7453647477937009, + "grad_norm": 18.550918579101562, + "learning_rate": 2.548575129533679e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.9314507842063904, + "num_tokens": 8247032.0, + "step": 4603 + }, + { + "epoch": 0.7455266780017812, + "grad_norm": 30.45406723022461, + "learning_rate": 2.546955958549223e-06, + "loss": 0.5682, + "mean_token_accuracy": 0.9149391055107117, + "num_tokens": 8248828.0, + "step": 4604 + }, + { + "epoch": 0.7456886082098616, + "grad_norm": 26.664411544799805, + "learning_rate": 2.5453367875647674e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.9168159067630768, + "num_tokens": 8250624.0, + "step": 4605 + }, + { + "epoch": 0.7458505384179419, + "grad_norm": 25.156368255615234, + "learning_rate": 2.5437176165803114e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9264123439788818, + "num_tokens": 8252422.0, + "step": 4606 + }, + { + "epoch": 0.7460124686260222, + "grad_norm": 36.718666076660156, + "learning_rate": 2.5420984455958554e-06, + "loss": 0.6701, + "mean_token_accuracy": 0.91087207198143, + "num_tokens": 8254204.0, + "step": 4607 + }, + { + "epoch": 0.7461743988341025, + "grad_norm": 32.656837463378906, + "learning_rate": 2.5404792746113995e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.9159872233867645, + "num_tokens": 8255990.0, + "step": 4608 + }, + { + "epoch": 0.7463363290421828, + "grad_norm": 27.766300201416016, + "learning_rate": 2.5388601036269435e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.9062694609165192, + "num_tokens": 8257768.0, + "step": 4609 + }, + { + "epoch": 0.7464982592502631, + "grad_norm": 32.55924987792969, + "learning_rate": 2.5372409326424875e-06, + "loss": 0.5781, + "mean_token_accuracy": 0.9174365699291229, + "num_tokens": 8259560.0, + "step": 4610 + }, + { + "epoch": 0.7466601894583434, + "grad_norm": 25.397869110107422, + "learning_rate": 2.5356217616580315e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.920992910861969, + "num_tokens": 8261363.0, + "step": 4611 + }, + { + "epoch": 0.7468221196664238, + "grad_norm": 23.16935157775879, + "learning_rate": 2.5340025906735755e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.9278171956539154, + "num_tokens": 8263151.0, + "step": 4612 + }, + { + "epoch": 0.7469840498745041, + "grad_norm": 19.186124801635742, + "learning_rate": 2.5323834196891195e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.9312020540237427, + "num_tokens": 8264953.0, + "step": 4613 + }, + { + "epoch": 0.7471459800825844, + "grad_norm": 24.751462936401367, + "learning_rate": 2.5307642487046635e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.9265314936637878, + "num_tokens": 8266736.0, + "step": 4614 + }, + { + "epoch": 0.7473079102906647, + "grad_norm": 32.06297302246094, + "learning_rate": 2.5291450777202075e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.9055467844009399, + "num_tokens": 8268533.0, + "step": 4615 + }, + { + "epoch": 0.747469840498745, + "grad_norm": 33.905338287353516, + "learning_rate": 2.5275259067357516e-06, + "loss": 0.6462, + "mean_token_accuracy": 0.9189484119415283, + "num_tokens": 8270329.0, + "step": 4616 + }, + { + "epoch": 0.7476317707068254, + "grad_norm": 42.5034065246582, + "learning_rate": 2.5259067357512956e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.8910945355892181, + "num_tokens": 8272125.0, + "step": 4617 + }, + { + "epoch": 0.7477937009149057, + "grad_norm": 25.228912353515625, + "learning_rate": 2.5242875647668396e-06, + "loss": 0.5443, + "mean_token_accuracy": 0.9131458103656769, + "num_tokens": 8273901.0, + "step": 4618 + }, + { + "epoch": 0.747955631122986, + "grad_norm": 25.824670791625977, + "learning_rate": 2.5226683937823836e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.9333793222904205, + "num_tokens": 8275698.0, + "step": 4619 + }, + { + "epoch": 0.7481175613310663, + "grad_norm": 41.90966033935547, + "learning_rate": 2.5210492227979276e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.8914726674556732, + "num_tokens": 8277495.0, + "step": 4620 + }, + { + "epoch": 0.7482794915391466, + "grad_norm": 43.290367126464844, + "learning_rate": 2.5194300518134716e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.8785714209079742, + "num_tokens": 8279287.0, + "step": 4621 + }, + { + "epoch": 0.7484414217472269, + "grad_norm": 36.1877555847168, + "learning_rate": 2.5178108808290156e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.9169794619083405, + "num_tokens": 8281076.0, + "step": 4622 + }, + { + "epoch": 0.7486033519553073, + "grad_norm": 16.613479614257812, + "learning_rate": 2.5161917098445597e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.9309405386447906, + "num_tokens": 8282863.0, + "step": 4623 + }, + { + "epoch": 0.7487652821633876, + "grad_norm": 26.51201057434082, + "learning_rate": 2.514572538860104e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.9281793832778931, + "num_tokens": 8284653.0, + "step": 4624 + }, + { + "epoch": 0.7489272123714679, + "grad_norm": 37.206119537353516, + "learning_rate": 2.512953367875648e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9136128425598145, + "num_tokens": 8286443.0, + "step": 4625 + }, + { + "epoch": 0.7490891425795482, + "grad_norm": 29.836753845214844, + "learning_rate": 2.511334196891192e-06, + "loss": 0.565, + "mean_token_accuracy": 0.9248905181884766, + "num_tokens": 8288248.0, + "step": 4626 + }, + { + "epoch": 0.7492510727876285, + "grad_norm": 31.03231430053711, + "learning_rate": 2.509715025906736e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.9151099026203156, + "num_tokens": 8290043.0, + "step": 4627 + }, + { + "epoch": 0.7494130029957089, + "grad_norm": 24.713272094726562, + "learning_rate": 2.50809585492228e-06, + "loss": 0.58, + "mean_token_accuracy": 0.9167449176311493, + "num_tokens": 8291832.0, + "step": 4628 + }, + { + "epoch": 0.7495749332037892, + "grad_norm": 22.026756286621094, + "learning_rate": 2.506476683937824e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.9290744364261627, + "num_tokens": 8293626.0, + "step": 4629 + }, + { + "epoch": 0.7497368634118695, + "grad_norm": 33.0578498840332, + "learning_rate": 2.504857512953368e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.9198528230190277, + "num_tokens": 8295412.0, + "step": 4630 + }, + { + "epoch": 0.7498987936199498, + "grad_norm": 19.807037353515625, + "learning_rate": 2.503238341968912e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.9306386113166809, + "num_tokens": 8297212.0, + "step": 4631 + }, + { + "epoch": 0.7500607238280301, + "grad_norm": 33.38854217529297, + "learning_rate": 2.501619170984456e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.9045868515968323, + "num_tokens": 8299007.0, + "step": 4632 + }, + { + "epoch": 0.7502226540361104, + "grad_norm": 23.449975967407227, + "learning_rate": 2.5e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.9205673635005951, + "num_tokens": 8300795.0, + "step": 4633 + }, + { + "epoch": 0.7503845842441907, + "grad_norm": 43.37062072753906, + "learning_rate": 2.4983808290155442e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.9068877696990967, + "num_tokens": 8302606.0, + "step": 4634 + }, + { + "epoch": 0.750546514452271, + "grad_norm": 30.077634811401367, + "learning_rate": 2.4967616580310882e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9325833320617676, + "num_tokens": 8304385.0, + "step": 4635 + }, + { + "epoch": 0.7507084446603514, + "grad_norm": 20.742412567138672, + "learning_rate": 2.4951424870466322e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.929352730512619, + "num_tokens": 8306180.0, + "step": 4636 + }, + { + "epoch": 0.7508703748684317, + "grad_norm": 25.69804573059082, + "learning_rate": 2.4935233160621763e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.9270072877407074, + "num_tokens": 8307966.0, + "step": 4637 + }, + { + "epoch": 0.751032305076512, + "grad_norm": 34.1600227355957, + "learning_rate": 2.4919041450777203e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.91131791472435, + "num_tokens": 8309760.0, + "step": 4638 + }, + { + "epoch": 0.7511942352845924, + "grad_norm": 33.87798309326172, + "learning_rate": 2.4902849740932643e-06, + "loss": 0.5594, + "mean_token_accuracy": 0.9168231785297394, + "num_tokens": 8311549.0, + "step": 4639 + }, + { + "epoch": 0.7513561654926727, + "grad_norm": 32.02516174316406, + "learning_rate": 2.4886658031088083e-06, + "loss": 0.5984, + "mean_token_accuracy": 0.91366907954216, + "num_tokens": 8313339.0, + "step": 4640 + }, + { + "epoch": 0.751518095700753, + "grad_norm": 35.43289566040039, + "learning_rate": 2.4870466321243523e-06, + "loss": 0.557, + "mean_token_accuracy": 0.9243023991584778, + "num_tokens": 8315143.0, + "step": 4641 + }, + { + "epoch": 0.7516800259088333, + "grad_norm": 18.416576385498047, + "learning_rate": 2.4854274611398963e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.9332747459411621, + "num_tokens": 8316925.0, + "step": 4642 + }, + { + "epoch": 0.7518419561169136, + "grad_norm": 23.071779251098633, + "learning_rate": 2.4838082901554408e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.927532434463501, + "num_tokens": 8318713.0, + "step": 4643 + }, + { + "epoch": 0.7520038863249939, + "grad_norm": 26.339641571044922, + "learning_rate": 2.4821891191709848e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.9337091147899628, + "num_tokens": 8320526.0, + "step": 4644 + }, + { + "epoch": 0.7521658165330742, + "grad_norm": 23.68473243713379, + "learning_rate": 2.4805699481865288e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.9181873500347137, + "num_tokens": 8322307.0, + "step": 4645 + }, + { + "epoch": 0.7523277467411545, + "grad_norm": 32.67833709716797, + "learning_rate": 2.478950777202073e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.9097140729427338, + "num_tokens": 8324105.0, + "step": 4646 + }, + { + "epoch": 0.7524896769492349, + "grad_norm": 29.60883140563965, + "learning_rate": 2.477331606217617e-06, + "loss": 0.5686, + "mean_token_accuracy": 0.9205682873725891, + "num_tokens": 8325894.0, + "step": 4647 + }, + { + "epoch": 0.7526516071573152, + "grad_norm": 32.173824310302734, + "learning_rate": 2.475712435233161e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.9168404340744019, + "num_tokens": 8327683.0, + "step": 4648 + }, + { + "epoch": 0.7528135373653955, + "grad_norm": 40.390472412109375, + "learning_rate": 2.474093264248705e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.8936694264411926, + "num_tokens": 8329476.0, + "step": 4649 + }, + { + "epoch": 0.7529754675734759, + "grad_norm": 39.334659576416016, + "learning_rate": 2.472474093264249e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.9029101133346558, + "num_tokens": 8331266.0, + "step": 4650 + }, + { + "epoch": 0.7531373977815562, + "grad_norm": 36.24227523803711, + "learning_rate": 2.470854922279793e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.916788250207901, + "num_tokens": 8333065.0, + "step": 4651 + }, + { + "epoch": 0.7532993279896365, + "grad_norm": 36.09779357910156, + "learning_rate": 2.469235751295337e-06, + "loss": 0.581, + "mean_token_accuracy": 0.9110330045223236, + "num_tokens": 8334847.0, + "step": 4652 + }, + { + "epoch": 0.7534612581977168, + "grad_norm": 35.25068664550781, + "learning_rate": 2.467616580310881e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.9177764356136322, + "num_tokens": 8336639.0, + "step": 4653 + }, + { + "epoch": 0.7536231884057971, + "grad_norm": 38.59444046020508, + "learning_rate": 2.465997409326425e-06, + "loss": 0.795, + "mean_token_accuracy": 0.9070371091365814, + "num_tokens": 8338430.0, + "step": 4654 + }, + { + "epoch": 0.7537851186138774, + "grad_norm": 33.41994094848633, + "learning_rate": 2.464378238341969e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.9203906953334808, + "num_tokens": 8340218.0, + "step": 4655 + }, + { + "epoch": 0.7539470488219577, + "grad_norm": 24.58260154724121, + "learning_rate": 2.462759067357513e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.9277708232402802, + "num_tokens": 8342007.0, + "step": 4656 + }, + { + "epoch": 0.754108979030038, + "grad_norm": 24.311532974243164, + "learning_rate": 2.461139896373057e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.9292216897010803, + "num_tokens": 8343802.0, + "step": 4657 + }, + { + "epoch": 0.7542709092381183, + "grad_norm": 27.534183502197266, + "learning_rate": 2.4595207253886014e-06, + "loss": 0.545, + "mean_token_accuracy": 0.9205517172813416, + "num_tokens": 8345591.0, + "step": 4658 + }, + { + "epoch": 0.7544328394461987, + "grad_norm": 24.61557388305664, + "learning_rate": 2.4579015544041454e-06, + "loss": 0.562, + "mean_token_accuracy": 0.9242710769176483, + "num_tokens": 8347379.0, + "step": 4659 + }, + { + "epoch": 0.754594769654279, + "grad_norm": 12.89478588104248, + "learning_rate": 2.4562823834196894e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.9332089424133301, + "num_tokens": 8349161.0, + "step": 4660 + }, + { + "epoch": 0.7547566998623593, + "grad_norm": 27.645849227905273, + "learning_rate": 2.4546632124352334e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.918974906206131, + "num_tokens": 8350944.0, + "step": 4661 + }, + { + "epoch": 0.7549186300704397, + "grad_norm": 27.31709861755371, + "learning_rate": 2.4530440414507774e-06, + "loss": 0.5532, + "mean_token_accuracy": 0.9161653220653534, + "num_tokens": 8352742.0, + "step": 4662 + }, + { + "epoch": 0.75508056027852, + "grad_norm": 36.87115478515625, + "learning_rate": 2.4514248704663214e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.9064536690711975, + "num_tokens": 8354541.0, + "step": 4663 + }, + { + "epoch": 0.7552424904866003, + "grad_norm": 26.110366821289062, + "learning_rate": 2.4498056994818655e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.9216992855072021, + "num_tokens": 8356334.0, + "step": 4664 + }, + { + "epoch": 0.7554044206946806, + "grad_norm": 45.888031005859375, + "learning_rate": 2.4481865284974095e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.8923611044883728, + "num_tokens": 8358134.0, + "step": 4665 + }, + { + "epoch": 0.7555663509027609, + "grad_norm": 32.6623420715332, + "learning_rate": 2.4465673575129535e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.9144723415374756, + "num_tokens": 8359926.0, + "step": 4666 + }, + { + "epoch": 0.7557282811108412, + "grad_norm": 31.467660903930664, + "learning_rate": 2.4449481865284975e-06, + "loss": 0.606, + "mean_token_accuracy": 0.919433057308197, + "num_tokens": 8361721.0, + "step": 4667 + }, + { + "epoch": 0.7558902113189215, + "grad_norm": 31.99997901916504, + "learning_rate": 2.443329015544042e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.9298729598522186, + "num_tokens": 8363518.0, + "step": 4668 + }, + { + "epoch": 0.7560521415270018, + "grad_norm": 28.23126983642578, + "learning_rate": 2.441709844559586e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.9062043726444244, + "num_tokens": 8365307.0, + "step": 4669 + }, + { + "epoch": 0.7562140717350821, + "grad_norm": 32.10374069213867, + "learning_rate": 2.44009067357513e-06, + "loss": 0.604, + "mean_token_accuracy": 0.9172833859920502, + "num_tokens": 8367098.0, + "step": 4670 + }, + { + "epoch": 0.7563760019431625, + "grad_norm": 40.423118591308594, + "learning_rate": 2.438471502590674e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.8920863270759583, + "num_tokens": 8368888.0, + "step": 4671 + }, + { + "epoch": 0.7565379321512428, + "grad_norm": 24.046968460083008, + "learning_rate": 2.436852331606218e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.9222372174263, + "num_tokens": 8370683.0, + "step": 4672 + }, + { + "epoch": 0.7566998623593232, + "grad_norm": 35.0728645324707, + "learning_rate": 2.435233160621762e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.9087809026241302, + "num_tokens": 8372469.0, + "step": 4673 + }, + { + "epoch": 0.7568617925674035, + "grad_norm": 24.768760681152344, + "learning_rate": 2.433613989637306e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.9199725091457367, + "num_tokens": 8374256.0, + "step": 4674 + }, + { + "epoch": 0.7570237227754838, + "grad_norm": 28.990802764892578, + "learning_rate": 2.43199481865285e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.9111787378787994, + "num_tokens": 8376050.0, + "step": 4675 + }, + { + "epoch": 0.7571856529835641, + "grad_norm": 22.618728637695312, + "learning_rate": 2.430375647668394e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.9192129373550415, + "num_tokens": 8377836.0, + "step": 4676 + }, + { + "epoch": 0.7573475831916444, + "grad_norm": 38.99509048461914, + "learning_rate": 2.428756476683938e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.9091029465198517, + "num_tokens": 8379623.0, + "step": 4677 + }, + { + "epoch": 0.7575095133997247, + "grad_norm": 28.140243530273438, + "learning_rate": 2.427137305699482e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.9213735461235046, + "num_tokens": 8381415.0, + "step": 4678 + }, + { + "epoch": 0.757671443607805, + "grad_norm": 31.008007049560547, + "learning_rate": 2.425518134715026e-06, + "loss": 0.552, + "mean_token_accuracy": 0.9112727642059326, + "num_tokens": 8383198.0, + "step": 4679 + }, + { + "epoch": 0.7578333738158853, + "grad_norm": 37.896514892578125, + "learning_rate": 2.42389896373057e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.902437835931778, + "num_tokens": 8384997.0, + "step": 4680 + }, + { + "epoch": 0.7579953040239656, + "grad_norm": 30.200599670410156, + "learning_rate": 2.422279792746114e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.9153688251972198, + "num_tokens": 8386781.0, + "step": 4681 + }, + { + "epoch": 0.758157234232046, + "grad_norm": 30.259031295776367, + "learning_rate": 2.420660621761658e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.9187537133693695, + "num_tokens": 8388565.0, + "step": 4682 + }, + { + "epoch": 0.7583191644401263, + "grad_norm": 39.184200286865234, + "learning_rate": 2.419041450777202e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.9019865691661835, + "num_tokens": 8390353.0, + "step": 4683 + }, + { + "epoch": 0.7584810946482067, + "grad_norm": 22.71794319152832, + "learning_rate": 2.417422279792746e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.9216834008693695, + "num_tokens": 8392146.0, + "step": 4684 + }, + { + "epoch": 0.758643024856287, + "grad_norm": 31.540409088134766, + "learning_rate": 2.41580310880829e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.9207015037536621, + "num_tokens": 8393944.0, + "step": 4685 + }, + { + "epoch": 0.7588049550643673, + "grad_norm": 27.81080436706543, + "learning_rate": 2.414183937823834e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.918353796005249, + "num_tokens": 8395725.0, + "step": 4686 + }, + { + "epoch": 0.7589668852724476, + "grad_norm": 21.210376739501953, + "learning_rate": 2.4125647668393786e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.9255583882331848, + "num_tokens": 8397506.0, + "step": 4687 + }, + { + "epoch": 0.7591288154805279, + "grad_norm": 13.841824531555176, + "learning_rate": 2.4109455958549226e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9324916005134583, + "num_tokens": 8399285.0, + "step": 4688 + }, + { + "epoch": 0.7592907456886082, + "grad_norm": 30.768701553344727, + "learning_rate": 2.4093264248704666e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.9124899506568909, + "num_tokens": 8401080.0, + "step": 4689 + }, + { + "epoch": 0.7594526758966885, + "grad_norm": 17.460729598999023, + "learning_rate": 2.4077072538860106e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.9330704808235168, + "num_tokens": 8402876.0, + "step": 4690 + }, + { + "epoch": 0.7596146061047688, + "grad_norm": 35.453426361083984, + "learning_rate": 2.4060880829015547e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.9070870876312256, + "num_tokens": 8404668.0, + "step": 4691 + }, + { + "epoch": 0.7597765363128491, + "grad_norm": 24.08220672607422, + "learning_rate": 2.4044689119170987e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 8406459.0, + "step": 4692 + }, + { + "epoch": 0.7599384665209294, + "grad_norm": 24.43300437927246, + "learning_rate": 2.4028497409326427e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.9188774228096008, + "num_tokens": 8408253.0, + "step": 4693 + }, + { + "epoch": 0.7601003967290098, + "grad_norm": 22.149044036865234, + "learning_rate": 2.4012305699481867e-06, + "loss": 0.5507, + "mean_token_accuracy": 0.9208475053310394, + "num_tokens": 8410056.0, + "step": 4694 + }, + { + "epoch": 0.7602623269370901, + "grad_norm": 23.576396942138672, + "learning_rate": 2.3996113989637307e-06, + "loss": 0.6056, + "mean_token_accuracy": 0.9255533218383789, + "num_tokens": 8411850.0, + "step": 4695 + }, + { + "epoch": 0.7604242571451705, + "grad_norm": 25.82002830505371, + "learning_rate": 2.3979922279792747e-06, + "loss": 0.5768, + "mean_token_accuracy": 0.920152485370636, + "num_tokens": 8413637.0, + "step": 4696 + }, + { + "epoch": 0.7605861873532508, + "grad_norm": 29.509443283081055, + "learning_rate": 2.3963730569948187e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.9179607033729553, + "num_tokens": 8415428.0, + "step": 4697 + }, + { + "epoch": 0.7607481175613311, + "grad_norm": 22.199304580688477, + "learning_rate": 2.3947538860103627e-06, + "loss": 0.5581, + "mean_token_accuracy": 0.9225405752658844, + "num_tokens": 8417211.0, + "step": 4698 + }, + { + "epoch": 0.7609100477694114, + "grad_norm": 28.593584060668945, + "learning_rate": 2.3931347150259068e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.9057729840278625, + "num_tokens": 8419009.0, + "step": 4699 + }, + { + "epoch": 0.7610719779774917, + "grad_norm": 34.31404495239258, + "learning_rate": 2.3915155440414508e-06, + "loss": 0.6337, + "mean_token_accuracy": 0.9113828539848328, + "num_tokens": 8420803.0, + "step": 4700 + }, + { + "epoch": 0.761233908185572, + "grad_norm": 26.927459716796875, + "learning_rate": 2.3898963730569948e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.9112305343151093, + "num_tokens": 8422609.0, + "step": 4701 + }, + { + "epoch": 0.7613958383936523, + "grad_norm": 27.589406967163086, + "learning_rate": 2.388277202072539e-06, + "loss": 0.5832, + "mean_token_accuracy": 0.9190182685852051, + "num_tokens": 8424405.0, + "step": 4702 + }, + { + "epoch": 0.7615577686017326, + "grad_norm": 27.552906036376953, + "learning_rate": 2.386658031088083e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.9192849397659302, + "num_tokens": 8426202.0, + "step": 4703 + }, + { + "epoch": 0.7617196988098129, + "grad_norm": 32.81730651855469, + "learning_rate": 2.385038860103627e-06, + "loss": 0.6052, + "mean_token_accuracy": 0.9151932895183563, + "num_tokens": 8427997.0, + "step": 4704 + }, + { + "epoch": 0.7618816290178932, + "grad_norm": 32.94668197631836, + "learning_rate": 2.383419689119171e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.90427565574646, + "num_tokens": 8429791.0, + "step": 4705 + }, + { + "epoch": 0.7620435592259736, + "grad_norm": 38.35142517089844, + "learning_rate": 2.3818005181347153e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.9066639840602875, + "num_tokens": 8431602.0, + "step": 4706 + }, + { + "epoch": 0.762205489434054, + "grad_norm": 26.179454803466797, + "learning_rate": 2.3801813471502593e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.9137202799320221, + "num_tokens": 8433391.0, + "step": 4707 + }, + { + "epoch": 0.7623674196421343, + "grad_norm": 23.248441696166992, + "learning_rate": 2.3785621761658033e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.9233946204185486, + "num_tokens": 8435177.0, + "step": 4708 + }, + { + "epoch": 0.7625293498502146, + "grad_norm": 29.711692810058594, + "learning_rate": 2.3769430051813473e-06, + "loss": 0.5935, + "mean_token_accuracy": 0.9165835082530975, + "num_tokens": 8436965.0, + "step": 4709 + }, + { + "epoch": 0.7626912800582949, + "grad_norm": 30.165103912353516, + "learning_rate": 2.3753238341968913e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.9077596664428711, + "num_tokens": 8438759.0, + "step": 4710 + }, + { + "epoch": 0.7628532102663752, + "grad_norm": 30.472110748291016, + "learning_rate": 2.3737046632124353e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.9259753525257111, + "num_tokens": 8440553.0, + "step": 4711 + }, + { + "epoch": 0.7630151404744555, + "grad_norm": 18.754375457763672, + "learning_rate": 2.3720854922279798e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.9336056709289551, + "num_tokens": 8442336.0, + "step": 4712 + }, + { + "epoch": 0.7631770706825358, + "grad_norm": 26.887163162231445, + "learning_rate": 2.370466321243524e-06, + "loss": 0.6045, + "mean_token_accuracy": 0.9098075330257416, + "num_tokens": 8444128.0, + "step": 4713 + }, + { + "epoch": 0.7633390008906161, + "grad_norm": 37.439884185791016, + "learning_rate": 2.368847150259068e-06, + "loss": 0.5815, + "mean_token_accuracy": 0.9199119508266449, + "num_tokens": 8445926.0, + "step": 4714 + }, + { + "epoch": 0.7635009310986964, + "grad_norm": 16.836505889892578, + "learning_rate": 2.367227979274612e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.9361188113689423, + "num_tokens": 8447720.0, + "step": 4715 + }, + { + "epoch": 0.7636628613067767, + "grad_norm": 28.99469566345215, + "learning_rate": 2.365608808290156e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.9113828539848328, + "num_tokens": 8449514.0, + "step": 4716 + }, + { + "epoch": 0.763824791514857, + "grad_norm": 24.486297607421875, + "learning_rate": 2.3639896373057e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.9205766022205353, + "num_tokens": 8451303.0, + "step": 4717 + }, + { + "epoch": 0.7639867217229375, + "grad_norm": 20.67795181274414, + "learning_rate": 2.362370466321244e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.9270983338356018, + "num_tokens": 8453089.0, + "step": 4718 + }, + { + "epoch": 0.7641486519310178, + "grad_norm": 23.633642196655273, + "learning_rate": 2.360751295336788e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.9193262457847595, + "num_tokens": 8454874.0, + "step": 4719 + }, + { + "epoch": 0.7643105821390981, + "grad_norm": 37.5645866394043, + "learning_rate": 2.359132124352332e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.901867538690567, + "num_tokens": 8456671.0, + "step": 4720 + }, + { + "epoch": 0.7644725123471784, + "grad_norm": 32.31490707397461, + "learning_rate": 2.357512953367876e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.9169971942901611, + "num_tokens": 8458460.0, + "step": 4721 + }, + { + "epoch": 0.7646344425552587, + "grad_norm": 26.569786071777344, + "learning_rate": 2.35589378238342e-06, + "loss": 0.5869, + "mean_token_accuracy": 0.9225463271141052, + "num_tokens": 8460243.0, + "step": 4722 + }, + { + "epoch": 0.764796372763339, + "grad_norm": 29.161724090576172, + "learning_rate": 2.354274611398964e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.9216987490653992, + "num_tokens": 8462035.0, + "step": 4723 + }, + { + "epoch": 0.7649583029714193, + "grad_norm": 28.16563606262207, + "learning_rate": 2.352655440414508e-06, + "loss": 0.5542, + "mean_token_accuracy": 0.9241643846035004, + "num_tokens": 8463823.0, + "step": 4724 + }, + { + "epoch": 0.7651202331794996, + "grad_norm": 30.769689559936523, + "learning_rate": 2.351036269430052e-06, + "loss": 0.5844, + "mean_token_accuracy": 0.9133403301239014, + "num_tokens": 8465611.0, + "step": 4725 + }, + { + "epoch": 0.7652821633875799, + "grad_norm": 33.77940368652344, + "learning_rate": 2.349417098445596e-06, + "loss": 0.576, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 8467399.0, + "step": 4726 + }, + { + "epoch": 0.7654440935956602, + "grad_norm": 26.95114517211914, + "learning_rate": 2.34779792746114e-06, + "loss": 0.572, + "mean_token_accuracy": 0.9237219393253326, + "num_tokens": 8469201.0, + "step": 4727 + }, + { + "epoch": 0.7656060238037405, + "grad_norm": 35.32869338989258, + "learning_rate": 2.346178756476684e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.9130434989929199, + "num_tokens": 8470989.0, + "step": 4728 + }, + { + "epoch": 0.7657679540118209, + "grad_norm": 37.70273208618164, + "learning_rate": 2.344559585492228e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.9060872197151184, + "num_tokens": 8472789.0, + "step": 4729 + }, + { + "epoch": 0.7659298842199013, + "grad_norm": 17.869884490966797, + "learning_rate": 2.342940414507772e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.9311594367027283, + "num_tokens": 8474577.0, + "step": 4730 + }, + { + "epoch": 0.7660918144279816, + "grad_norm": 17.92411994934082, + "learning_rate": 2.3413212435233164e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.9236221313476562, + "num_tokens": 8476364.0, + "step": 4731 + }, + { + "epoch": 0.7662537446360619, + "grad_norm": 32.35110092163086, + "learning_rate": 2.3397020725388605e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.9107346832752228, + "num_tokens": 8478166.0, + "step": 4732 + }, + { + "epoch": 0.7664156748441422, + "grad_norm": 37.07913589477539, + "learning_rate": 2.3380829015544045e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.9038697779178619, + "num_tokens": 8479969.0, + "step": 4733 + }, + { + "epoch": 0.7665776050522225, + "grad_norm": 34.32615661621094, + "learning_rate": 2.3364637305699485e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.900503009557724, + "num_tokens": 8481763.0, + "step": 4734 + }, + { + "epoch": 0.7667395352603028, + "grad_norm": 10.937378883361816, + "learning_rate": 2.3348445595854925e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.9391196966171265, + "num_tokens": 8483554.0, + "step": 4735 + }, + { + "epoch": 0.7669014654683831, + "grad_norm": 36.966796875, + "learning_rate": 2.3332253886010365e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.9075387418270111, + "num_tokens": 8485347.0, + "step": 4736 + }, + { + "epoch": 0.7670633956764634, + "grad_norm": 24.56297492980957, + "learning_rate": 2.3316062176165805e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.9169186651706696, + "num_tokens": 8487136.0, + "step": 4737 + }, + { + "epoch": 0.7672253258845437, + "grad_norm": 31.4870662689209, + "learning_rate": 2.3299870466321245e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.9183647632598877, + "num_tokens": 8488929.0, + "step": 4738 + }, + { + "epoch": 0.767387256092624, + "grad_norm": 33.57414245605469, + "learning_rate": 2.3283678756476686e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.9137547016143799, + "num_tokens": 8490719.0, + "step": 4739 + }, + { + "epoch": 0.7675491863007043, + "grad_norm": 22.86031723022461, + "learning_rate": 2.3267487046632126e-06, + "loss": 0.5663, + "mean_token_accuracy": 0.9298503696918488, + "num_tokens": 8492517.0, + "step": 4740 + }, + { + "epoch": 0.7677111165087848, + "grad_norm": 41.793975830078125, + "learning_rate": 2.3251295336787566e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.8914404511451721, + "num_tokens": 8494315.0, + "step": 4741 + }, + { + "epoch": 0.7678730467168651, + "grad_norm": 27.46380043029785, + "learning_rate": 2.3235103626943006e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.9282702505588531, + "num_tokens": 8496106.0, + "step": 4742 + }, + { + "epoch": 0.7680349769249454, + "grad_norm": 32.72002029418945, + "learning_rate": 2.3218911917098446e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.9085765480995178, + "num_tokens": 8497902.0, + "step": 4743 + }, + { + "epoch": 0.7681969071330257, + "grad_norm": 32.569854736328125, + "learning_rate": 2.3202720207253886e-06, + "loss": 0.703, + "mean_token_accuracy": 0.9122023582458496, + "num_tokens": 8499698.0, + "step": 4744 + }, + { + "epoch": 0.768358837341106, + "grad_norm": 27.73603057861328, + "learning_rate": 2.3186528497409326e-06, + "loss": 0.6119, + "mean_token_accuracy": 0.9216992855072021, + "num_tokens": 8501491.0, + "step": 4745 + }, + { + "epoch": 0.7685207675491863, + "grad_norm": 27.155590057373047, + "learning_rate": 2.3170336787564766e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.9231884181499481, + "num_tokens": 8503276.0, + "step": 4746 + }, + { + "epoch": 0.7686826977572666, + "grad_norm": 23.853473663330078, + "learning_rate": 2.3154145077720207e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.9225198328495026, + "num_tokens": 8505072.0, + "step": 4747 + }, + { + "epoch": 0.7688446279653469, + "grad_norm": 27.608078002929688, + "learning_rate": 2.3137953367875647e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.921171635389328, + "num_tokens": 8506863.0, + "step": 4748 + }, + { + "epoch": 0.7690065581734272, + "grad_norm": 27.092342376708984, + "learning_rate": 2.3121761658031087e-06, + "loss": 0.543, + "mean_token_accuracy": 0.9326961934566498, + "num_tokens": 8508657.0, + "step": 4749 + }, + { + "epoch": 0.7691684883815075, + "grad_norm": 42.29644775390625, + "learning_rate": 2.310556994818653e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.8955715000629425, + "num_tokens": 8510455.0, + "step": 4750 + }, + { + "epoch": 0.7693304185895878, + "grad_norm": 22.910228729248047, + "learning_rate": 2.308937823834197e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.9312728047370911, + "num_tokens": 8512244.0, + "step": 4751 + }, + { + "epoch": 0.7694923487976683, + "grad_norm": 35.55131530761719, + "learning_rate": 2.307318652849741e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.9022064507007599, + "num_tokens": 8514032.0, + "step": 4752 + }, + { + "epoch": 0.7696542790057486, + "grad_norm": 24.13022804260254, + "learning_rate": 2.305699481865285e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.9214245676994324, + "num_tokens": 8515824.0, + "step": 4753 + }, + { + "epoch": 0.7698162092138289, + "grad_norm": 34.95334243774414, + "learning_rate": 2.304080310880829e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.9150264263153076, + "num_tokens": 8517618.0, + "step": 4754 + }, + { + "epoch": 0.7699781394219092, + "grad_norm": 22.432109832763672, + "learning_rate": 2.302461139896373e-06, + "loss": 0.5564, + "mean_token_accuracy": 0.9173611104488373, + "num_tokens": 8519409.0, + "step": 4755 + }, + { + "epoch": 0.7701400696299895, + "grad_norm": 21.526447296142578, + "learning_rate": 2.3008419689119176e-06, + "loss": 0.5283, + "mean_token_accuracy": 0.9362995326519012, + "num_tokens": 8521204.0, + "step": 4756 + }, + { + "epoch": 0.7703019998380698, + "grad_norm": 20.997190475463867, + "learning_rate": 2.2992227979274616e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.9280426800251007, + "num_tokens": 8522994.0, + "step": 4757 + }, + { + "epoch": 0.7704639300461501, + "grad_norm": 16.79971694946289, + "learning_rate": 2.2976036269430056e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.9201631844043732, + "num_tokens": 8524781.0, + "step": 4758 + }, + { + "epoch": 0.7706258602542304, + "grad_norm": 26.214557647705078, + "learning_rate": 2.2959844559585497e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.9293757081031799, + "num_tokens": 8526561.0, + "step": 4759 + }, + { + "epoch": 0.7707877904623107, + "grad_norm": 26.027692794799805, + "learning_rate": 2.2943652849740937e-06, + "loss": 0.5559, + "mean_token_accuracy": 0.9179593324661255, + "num_tokens": 8528355.0, + "step": 4760 + }, + { + "epoch": 0.770949720670391, + "grad_norm": 35.00592803955078, + "learning_rate": 2.2927461139896377e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.9098277688026428, + "num_tokens": 8530155.0, + "step": 4761 + }, + { + "epoch": 0.7711116508784713, + "grad_norm": 24.112192153930664, + "learning_rate": 2.2911269430051817e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.9177893698215485, + "num_tokens": 8531947.0, + "step": 4762 + }, + { + "epoch": 0.7712735810865518, + "grad_norm": 31.94225311279297, + "learning_rate": 2.2895077720207257e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.9119047820568085, + "num_tokens": 8533743.0, + "step": 4763 + }, + { + "epoch": 0.7714355112946321, + "grad_norm": 27.06142234802246, + "learning_rate": 2.2878886010362697e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9188838601112366, + "num_tokens": 8535527.0, + "step": 4764 + }, + { + "epoch": 0.7715974415027124, + "grad_norm": 30.965436935424805, + "learning_rate": 2.2862694300518137e-06, + "loss": 0.5861, + "mean_token_accuracy": 0.9149631559848785, + "num_tokens": 8537333.0, + "step": 4765 + }, + { + "epoch": 0.7717593717107927, + "grad_norm": 35.833438873291016, + "learning_rate": 2.2846502590673578e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.9075655937194824, + "num_tokens": 8539126.0, + "step": 4766 + }, + { + "epoch": 0.771921301918873, + "grad_norm": 23.763675689697266, + "learning_rate": 2.2830310880829018e-06, + "loss": 0.5567, + "mean_token_accuracy": 0.9263465404510498, + "num_tokens": 8540910.0, + "step": 4767 + }, + { + "epoch": 0.7720832321269533, + "grad_norm": 30.188648223876953, + "learning_rate": 2.2814119170984458e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.9117647111415863, + "num_tokens": 8542694.0, + "step": 4768 + }, + { + "epoch": 0.7722451623350336, + "grad_norm": 17.313716888427734, + "learning_rate": 2.27979274611399e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.9344349503517151, + "num_tokens": 8544480.0, + "step": 4769 + }, + { + "epoch": 0.7724070925431139, + "grad_norm": 19.52690315246582, + "learning_rate": 2.278173575129534e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.9340715110301971, + "num_tokens": 8546280.0, + "step": 4770 + }, + { + "epoch": 0.7725690227511942, + "grad_norm": 34.0203742980957, + "learning_rate": 2.276554404145078e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.9107851386070251, + "num_tokens": 8548083.0, + "step": 4771 + }, + { + "epoch": 0.7727309529592745, + "grad_norm": 28.469274520874023, + "learning_rate": 2.274935233160622e-06, + "loss": 0.5651, + "mean_token_accuracy": 0.9120330810546875, + "num_tokens": 8549868.0, + "step": 4772 + }, + { + "epoch": 0.7728928831673548, + "grad_norm": 31.10840606689453, + "learning_rate": 2.273316062176166e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.9148764908313751, + "num_tokens": 8551662.0, + "step": 4773 + }, + { + "epoch": 0.7730548133754351, + "grad_norm": 36.347286224365234, + "learning_rate": 2.27169689119171e-06, + "loss": 0.667, + "mean_token_accuracy": 0.8953522741794586, + "num_tokens": 8553460.0, + "step": 4774 + }, + { + "epoch": 0.7732167435835156, + "grad_norm": 31.08747100830078, + "learning_rate": 2.2700777202072543e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.9216834008693695, + "num_tokens": 8555253.0, + "step": 4775 + }, + { + "epoch": 0.7733786737915959, + "grad_norm": 30.034589767456055, + "learning_rate": 2.2684585492227983e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.9203781485557556, + "num_tokens": 8557041.0, + "step": 4776 + }, + { + "epoch": 0.7735406039996762, + "grad_norm": 28.906774520874023, + "learning_rate": 2.2668393782383423e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 8558833.0, + "step": 4777 + }, + { + "epoch": 0.7737025342077565, + "grad_norm": 29.198413848876953, + "learning_rate": 2.2652202072538863e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.9191037714481354, + "num_tokens": 8560633.0, + "step": 4778 + }, + { + "epoch": 0.7738644644158368, + "grad_norm": 29.853670120239258, + "learning_rate": 2.2636010362694303e-06, + "loss": 0.5517, + "mean_token_accuracy": 0.9243197441101074, + "num_tokens": 8562424.0, + "step": 4779 + }, + { + "epoch": 0.7740263946239171, + "grad_norm": 37.71849060058594, + "learning_rate": 2.2619818652849744e-06, + "loss": 0.6316, + "mean_token_accuracy": 0.9079670310020447, + "num_tokens": 8564219.0, + "step": 4780 + }, + { + "epoch": 0.7741883248319974, + "grad_norm": 29.527971267700195, + "learning_rate": 2.2603626943005184e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.9216992855072021, + "num_tokens": 8566012.0, + "step": 4781 + }, + { + "epoch": 0.7743502550400777, + "grad_norm": 21.974746704101562, + "learning_rate": 2.2587435233160624e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.9302884638309479, + "num_tokens": 8567811.0, + "step": 4782 + }, + { + "epoch": 0.774512185248158, + "grad_norm": 32.31019973754883, + "learning_rate": 2.2571243523316064e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.908225953578949, + "num_tokens": 8569606.0, + "step": 4783 + }, + { + "epoch": 0.7746741154562383, + "grad_norm": 29.321130752563477, + "learning_rate": 2.2555051813471504e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.922252744436264, + "num_tokens": 8571401.0, + "step": 4784 + }, + { + "epoch": 0.7748360456643186, + "grad_norm": 28.01386070251465, + "learning_rate": 2.2538860103626944e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.9248905181884766, + "num_tokens": 8573206.0, + "step": 4785 + }, + { + "epoch": 0.774997975872399, + "grad_norm": 27.028675079345703, + "learning_rate": 2.2522668393782384e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.9160839319229126, + "num_tokens": 8575004.0, + "step": 4786 + }, + { + "epoch": 0.7751599060804794, + "grad_norm": 27.31130027770996, + "learning_rate": 2.2506476683937825e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.9285386204719543, + "num_tokens": 8576796.0, + "step": 4787 + }, + { + "epoch": 0.7753218362885597, + "grad_norm": 22.87717056274414, + "learning_rate": 2.2490284974093265e-06, + "loss": 0.5309, + "mean_token_accuracy": 0.9212121367454529, + "num_tokens": 8578586.0, + "step": 4788 + }, + { + "epoch": 0.77548376649664, + "grad_norm": 31.363008499145508, + "learning_rate": 2.2474093264248705e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.9259920716285706, + "num_tokens": 8580382.0, + "step": 4789 + }, + { + "epoch": 0.7756456967047203, + "grad_norm": 25.953182220458984, + "learning_rate": 2.2457901554404145e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.9233365952968597, + "num_tokens": 8582180.0, + "step": 4790 + }, + { + "epoch": 0.7758076269128006, + "grad_norm": 30.845632553100586, + "learning_rate": 2.2441709844559585e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.9121931195259094, + "num_tokens": 8583963.0, + "step": 4791 + }, + { + "epoch": 0.7759695571208809, + "grad_norm": 25.139423370361328, + "learning_rate": 2.2425518134715025e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.916979968547821, + "num_tokens": 8585740.0, + "step": 4792 + }, + { + "epoch": 0.7761314873289612, + "grad_norm": 33.40688705444336, + "learning_rate": 2.2409326424870465e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.9129201769828796, + "num_tokens": 8587528.0, + "step": 4793 + }, + { + "epoch": 0.7762934175370415, + "grad_norm": 27.23075294494629, + "learning_rate": 2.239313471502591e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.922448992729187, + "num_tokens": 8589322.0, + "step": 4794 + }, + { + "epoch": 0.7764553477451218, + "grad_norm": 37.810176849365234, + "learning_rate": 2.237694300518135e-06, + "loss": 0.693, + "mean_token_accuracy": 0.9033302664756775, + "num_tokens": 8591113.0, + "step": 4795 + }, + { + "epoch": 0.7766172779532021, + "grad_norm": 34.69388198852539, + "learning_rate": 2.236075129533679e-06, + "loss": 0.6295, + "mean_token_accuracy": 0.9183673560619354, + "num_tokens": 8592919.0, + "step": 4796 + }, + { + "epoch": 0.7767792081612825, + "grad_norm": 36.328636169433594, + "learning_rate": 2.234455958549223e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.920410692691803, + "num_tokens": 8594721.0, + "step": 4797 + }, + { + "epoch": 0.7769411383693628, + "grad_norm": 21.276636123657227, + "learning_rate": 2.232836787564767e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.9275849461555481, + "num_tokens": 8596509.0, + "step": 4798 + }, + { + "epoch": 0.7771030685774432, + "grad_norm": 23.949460983276367, + "learning_rate": 2.231217616580311e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.9252279698848724, + "num_tokens": 8598302.0, + "step": 4799 + }, + { + "epoch": 0.7772649987855235, + "grad_norm": 20.94979476928711, + "learning_rate": 2.229598445595855e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9183273017406464, + "num_tokens": 8600096.0, + "step": 4800 + }, + { + "epoch": 0.7774269289936038, + "grad_norm": 35.0922966003418, + "learning_rate": 2.227979274611399e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.9007898569107056, + "num_tokens": 8601890.0, + "step": 4801 + }, + { + "epoch": 0.7775888592016841, + "grad_norm": 33.468990325927734, + "learning_rate": 2.2263601036269435e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 8603682.0, + "step": 4802 + }, + { + "epoch": 0.7777507894097644, + "grad_norm": 35.45664596557617, + "learning_rate": 2.2247409326424875e-06, + "loss": 0.5794, + "mean_token_accuracy": 0.9195031523704529, + "num_tokens": 8605480.0, + "step": 4803 + }, + { + "epoch": 0.7779127196178447, + "grad_norm": 24.298919677734375, + "learning_rate": 2.2231217616580315e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.9256495237350464, + "num_tokens": 8607261.0, + "step": 4804 + }, + { + "epoch": 0.778074649825925, + "grad_norm": 25.70359230041504, + "learning_rate": 2.2215025906735755e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.9264666140079498, + "num_tokens": 8609045.0, + "step": 4805 + }, + { + "epoch": 0.7782365800340053, + "grad_norm": 19.799482345581055, + "learning_rate": 2.2198834196891195e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.9280538260936737, + "num_tokens": 8610835.0, + "step": 4806 + }, + { + "epoch": 0.7783985102420856, + "grad_norm": 32.9798583984375, + "learning_rate": 2.2182642487046636e-06, + "loss": 0.5757, + "mean_token_accuracy": 0.9200254082679749, + "num_tokens": 8612622.0, + "step": 4807 + }, + { + "epoch": 0.7785604404501659, + "grad_norm": 25.638708114624023, + "learning_rate": 2.2166450777202076e-06, + "loss": 0.511, + "mean_token_accuracy": 0.9263465404510498, + "num_tokens": 8614406.0, + "step": 4808 + }, + { + "epoch": 0.7787223706582463, + "grad_norm": 38.800079345703125, + "learning_rate": 2.2150259067357516e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.8971631228923798, + "num_tokens": 8616200.0, + "step": 4809 + }, + { + "epoch": 0.7788843008663267, + "grad_norm": 32.94105911254883, + "learning_rate": 2.2134067357512956e-06, + "loss": 0.649, + "mean_token_accuracy": 0.9181869029998779, + "num_tokens": 8617993.0, + "step": 4810 + }, + { + "epoch": 0.779046231074407, + "grad_norm": 27.18923568725586, + "learning_rate": 2.2117875647668396e-06, + "loss": 0.5818, + "mean_token_accuracy": 0.9149899184703827, + "num_tokens": 8619787.0, + "step": 4811 + }, + { + "epoch": 0.7792081612824873, + "grad_norm": 39.83348083496094, + "learning_rate": 2.2101683937823836e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.9142982661724091, + "num_tokens": 8621601.0, + "step": 4812 + }, + { + "epoch": 0.7793700914905676, + "grad_norm": 28.60740852355957, + "learning_rate": 2.2085492227979276e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.9256678521633148, + "num_tokens": 8623395.0, + "step": 4813 + }, + { + "epoch": 0.7795320216986479, + "grad_norm": 35.63589096069336, + "learning_rate": 2.2069300518134717e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.9226918518543243, + "num_tokens": 8625192.0, + "step": 4814 + }, + { + "epoch": 0.7796939519067282, + "grad_norm": 30.61024284362793, + "learning_rate": 2.2053108808290157e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.930644690990448, + "num_tokens": 8626993.0, + "step": 4815 + }, + { + "epoch": 0.7798558821148085, + "grad_norm": 31.67363929748535, + "learning_rate": 2.2036917098445597e-06, + "loss": 0.5714, + "mean_token_accuracy": 0.9154835939407349, + "num_tokens": 8628790.0, + "step": 4816 + }, + { + "epoch": 0.7800178123228888, + "grad_norm": 31.31595802307129, + "learning_rate": 2.2020725388601037e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.9266505837440491, + "num_tokens": 8630575.0, + "step": 4817 + }, + { + "epoch": 0.7801797425309691, + "grad_norm": 23.93700408935547, + "learning_rate": 2.2004533678756477e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.9220321774482727, + "num_tokens": 8632369.0, + "step": 4818 + }, + { + "epoch": 0.7803416727390494, + "grad_norm": 12.18370246887207, + "learning_rate": 2.198834196891192e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.9328877627849579, + "num_tokens": 8634149.0, + "step": 4819 + }, + { + "epoch": 0.7805036029471298, + "grad_norm": 41.27445983886719, + "learning_rate": 2.197215025906736e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.8943661749362946, + "num_tokens": 8635945.0, + "step": 4820 + }, + { + "epoch": 0.7806655331552101, + "grad_norm": 25.761497497558594, + "learning_rate": 2.19559585492228e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.9316956996917725, + "num_tokens": 8637735.0, + "step": 4821 + }, + { + "epoch": 0.7808274633632905, + "grad_norm": 21.085548400878906, + "learning_rate": 2.193976683937824e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.9350104331970215, + "num_tokens": 8639524.0, + "step": 4822 + }, + { + "epoch": 0.7809893935713708, + "grad_norm": 35.967491149902344, + "learning_rate": 2.192357512953368e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.9107352197170258, + "num_tokens": 8641316.0, + "step": 4823 + }, + { + "epoch": 0.7811513237794511, + "grad_norm": 29.158815383911133, + "learning_rate": 2.190738341968912e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.9137443602085114, + "num_tokens": 8643107.0, + "step": 4824 + }, + { + "epoch": 0.7813132539875314, + "grad_norm": 36.52337646484375, + "learning_rate": 2.1891191709844562e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.9040176272392273, + "num_tokens": 8644900.0, + "step": 4825 + }, + { + "epoch": 0.7814751841956117, + "grad_norm": 12.187175750732422, + "learning_rate": 2.1875000000000002e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.9343030750751495, + "num_tokens": 8646686.0, + "step": 4826 + }, + { + "epoch": 0.781637114403692, + "grad_norm": 33.27248764038086, + "learning_rate": 2.1858808290155442e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.9012077450752258, + "num_tokens": 8648471.0, + "step": 4827 + }, + { + "epoch": 0.7817990446117723, + "grad_norm": 28.864276885986328, + "learning_rate": 2.1842616580310883e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.9133041501045227, + "num_tokens": 8650260.0, + "step": 4828 + }, + { + "epoch": 0.7819609748198526, + "grad_norm": 26.825626373291016, + "learning_rate": 2.1826424870466323e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9299019575119019, + "num_tokens": 8652043.0, + "step": 4829 + }, + { + "epoch": 0.7821229050279329, + "grad_norm": 20.098388671875, + "learning_rate": 2.1810233160621763e-06, + "loss": 0.6166, + "mean_token_accuracy": 0.9106098711490631, + "num_tokens": 8653823.0, + "step": 4830 + }, + { + "epoch": 0.7822848352360133, + "grad_norm": 25.42427635192871, + "learning_rate": 2.1794041450777203e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.9236750304698944, + "num_tokens": 8655610.0, + "step": 4831 + }, + { + "epoch": 0.7824467654440936, + "grad_norm": 32.184326171875, + "learning_rate": 2.1777849740932643e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.91847363114357, + "num_tokens": 8657404.0, + "step": 4832 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 17.487751007080078, + "learning_rate": 2.1761658031088083e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.9345445930957794, + "num_tokens": 8659191.0, + "step": 4833 + }, + { + "epoch": 0.7827706258602543, + "grad_norm": 26.082229614257812, + "learning_rate": 2.1745466321243523e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.9234008491039276, + "num_tokens": 8660977.0, + "step": 4834 + }, + { + "epoch": 0.7829325560683346, + "grad_norm": 38.65194320678711, + "learning_rate": 2.1729274611398963e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.9030129015445709, + "num_tokens": 8662767.0, + "step": 4835 + }, + { + "epoch": 0.7830944862764149, + "grad_norm": 20.90742301940918, + "learning_rate": 2.1713082901554404e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.9310612082481384, + "num_tokens": 8664553.0, + "step": 4836 + }, + { + "epoch": 0.7832564164844952, + "grad_norm": 31.811559677124023, + "learning_rate": 2.1696891191709844e-06, + "loss": 0.572, + "mean_token_accuracy": 0.9241343140602112, + "num_tokens": 8666355.0, + "step": 4837 + }, + { + "epoch": 0.7834183466925755, + "grad_norm": 36.27912521362305, + "learning_rate": 2.168069948186529e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.9035409688949585, + "num_tokens": 8668147.0, + "step": 4838 + }, + { + "epoch": 0.7835802769006558, + "grad_norm": 17.592748641967773, + "learning_rate": 2.166450777202073e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.9352216124534607, + "num_tokens": 8669937.0, + "step": 4839 + }, + { + "epoch": 0.7837422071087361, + "grad_norm": 40.53508758544922, + "learning_rate": 2.164831606217617e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.9002532958984375, + "num_tokens": 8671730.0, + "step": 4840 + }, + { + "epoch": 0.7839041373168164, + "grad_norm": 33.6708984375, + "learning_rate": 2.163212435233161e-06, + "loss": 0.6062, + "mean_token_accuracy": 0.9094492495059967, + "num_tokens": 8673518.0, + "step": 4841 + }, + { + "epoch": 0.7840660675248967, + "grad_norm": 32.10221862792969, + "learning_rate": 2.161593264248705e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.905809611082077, + "num_tokens": 8675315.0, + "step": 4842 + }, + { + "epoch": 0.7842279977329771, + "grad_norm": 45.56889343261719, + "learning_rate": 2.159974093264249e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.891838401556015, + "num_tokens": 8677122.0, + "step": 4843 + }, + { + "epoch": 0.7843899279410574, + "grad_norm": 44.523189544677734, + "learning_rate": 2.158354922279793e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.899463564157486, + "num_tokens": 8678923.0, + "step": 4844 + }, + { + "epoch": 0.7845518581491377, + "grad_norm": 43.56492614746094, + "learning_rate": 2.156735751295337e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.9078092277050018, + "num_tokens": 8680718.0, + "step": 4845 + }, + { + "epoch": 0.7847137883572181, + "grad_norm": 36.994571685791016, + "learning_rate": 2.155116580310881e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.9053634703159332, + "num_tokens": 8682515.0, + "step": 4846 + }, + { + "epoch": 0.7848757185652984, + "grad_norm": 19.518110275268555, + "learning_rate": 2.153497409326425e-06, + "loss": 0.453, + "mean_token_accuracy": 0.9306555390357971, + "num_tokens": 8684316.0, + "step": 4847 + }, + { + "epoch": 0.7850376487733787, + "grad_norm": 32.8856201171875, + "learning_rate": 2.151878238341969e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.9245336949825287, + "num_tokens": 8686118.0, + "step": 4848 + }, + { + "epoch": 0.785199578981459, + "grad_norm": 31.418115615844727, + "learning_rate": 2.150259067357513e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.907657653093338, + "num_tokens": 8687922.0, + "step": 4849 + }, + { + "epoch": 0.7853615091895393, + "grad_norm": 27.000045776367188, + "learning_rate": 2.148639896373057e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.9189908504486084, + "num_tokens": 8689719.0, + "step": 4850 + }, + { + "epoch": 0.7855234393976196, + "grad_norm": 30.59200096130371, + "learning_rate": 2.1470207253886014e-06, + "loss": 0.5563, + "mean_token_accuracy": 0.9228060841560364, + "num_tokens": 8691516.0, + "step": 4851 + }, + { + "epoch": 0.7856853696056999, + "grad_norm": 37.32300567626953, + "learning_rate": 2.1454015544041454e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.9141661822795868, + "num_tokens": 8693308.0, + "step": 4852 + }, + { + "epoch": 0.7858472998137802, + "grad_norm": 39.51362991333008, + "learning_rate": 2.1437823834196894e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.8935782313346863, + "num_tokens": 8695103.0, + "step": 4853 + }, + { + "epoch": 0.7860092300218606, + "grad_norm": 35.616764068603516, + "learning_rate": 2.1421632124352334e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.917391300201416, + "num_tokens": 8696893.0, + "step": 4854 + }, + { + "epoch": 0.7861711602299409, + "grad_norm": 35.1362419128418, + "learning_rate": 2.1405440414507775e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.902877688407898, + "num_tokens": 8698683.0, + "step": 4855 + }, + { + "epoch": 0.7863330904380212, + "grad_norm": 25.9366512298584, + "learning_rate": 2.1389248704663215e-06, + "loss": 0.6074, + "mean_token_accuracy": 0.9285386204719543, + "num_tokens": 8700475.0, + "step": 4856 + }, + { + "epoch": 0.7864950206461016, + "grad_norm": 29.31695556640625, + "learning_rate": 2.1373056994818655e-06, + "loss": 0.582, + "mean_token_accuracy": 0.922835499048233, + "num_tokens": 8702259.0, + "step": 4857 + }, + { + "epoch": 0.7866569508541819, + "grad_norm": 25.394182205200195, + "learning_rate": 2.1356865284974095e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.9307131469249725, + "num_tokens": 8704059.0, + "step": 4858 + }, + { + "epoch": 0.7868188810622622, + "grad_norm": 33.61517333984375, + "learning_rate": 2.1340673575129535e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 8705847.0, + "step": 4859 + }, + { + "epoch": 0.7869808112703425, + "grad_norm": 35.47023010253906, + "learning_rate": 2.1324481865284975e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.9097758233547211, + "num_tokens": 8707636.0, + "step": 4860 + }, + { + "epoch": 0.7871427414784228, + "grad_norm": 37.46030044555664, + "learning_rate": 2.1308290155440415e-06, + "loss": 0.6382, + "mean_token_accuracy": 0.9103462398052216, + "num_tokens": 8709429.0, + "step": 4861 + }, + { + "epoch": 0.7873046716865031, + "grad_norm": 30.227519989013672, + "learning_rate": 2.1292098445595855e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.9176872968673706, + "num_tokens": 8711221.0, + "step": 4862 + }, + { + "epoch": 0.7874666018945834, + "grad_norm": 33.537479400634766, + "learning_rate": 2.1275906735751296e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.9075141549110413, + "num_tokens": 8713012.0, + "step": 4863 + }, + { + "epoch": 0.7876285321026637, + "grad_norm": 33.87030792236328, + "learning_rate": 2.125971502590674e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.9187562465667725, + "num_tokens": 8714807.0, + "step": 4864 + }, + { + "epoch": 0.7877904623107441, + "grad_norm": 35.78220748901367, + "learning_rate": 2.124352331606218e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.9263144731521606, + "num_tokens": 8716605.0, + "step": 4865 + }, + { + "epoch": 0.7879523925188244, + "grad_norm": 19.19643211364746, + "learning_rate": 2.122733160621762e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.9324290156364441, + "num_tokens": 8718398.0, + "step": 4866 + }, + { + "epoch": 0.7881143227269047, + "grad_norm": 32.678890228271484, + "learning_rate": 2.121113989637306e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.9119426608085632, + "num_tokens": 8720194.0, + "step": 4867 + }, + { + "epoch": 0.788276252934985, + "grad_norm": 31.408491134643555, + "learning_rate": 2.11949481865285e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.9159872233867645, + "num_tokens": 8721980.0, + "step": 4868 + }, + { + "epoch": 0.7884381831430654, + "grad_norm": 35.651039123535156, + "learning_rate": 2.117875647668394e-06, + "loss": 0.5737, + "mean_token_accuracy": 0.9194001257419586, + "num_tokens": 8723777.0, + "step": 4869 + }, + { + "epoch": 0.7886001133511457, + "grad_norm": 24.259984970092773, + "learning_rate": 2.116256476683938e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.930820107460022, + "num_tokens": 8725564.0, + "step": 4870 + }, + { + "epoch": 0.788762043559226, + "grad_norm": 31.06159782409668, + "learning_rate": 2.114637305699482e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.919669508934021, + "num_tokens": 8727350.0, + "step": 4871 + }, + { + "epoch": 0.7889239737673063, + "grad_norm": 23.367807388305664, + "learning_rate": 2.113018134715026e-06, + "loss": 0.501, + "mean_token_accuracy": 0.9280426800251007, + "num_tokens": 8729140.0, + "step": 4872 + }, + { + "epoch": 0.7890859039753866, + "grad_norm": 22.92611312866211, + "learning_rate": 2.11139896373057e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.929380863904953, + "num_tokens": 8730921.0, + "step": 4873 + }, + { + "epoch": 0.7892478341834669, + "grad_norm": 37.05588912963867, + "learning_rate": 2.109779792746114e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.911843478679657, + "num_tokens": 8732717.0, + "step": 4874 + }, + { + "epoch": 0.7894097643915472, + "grad_norm": 31.46372413635254, + "learning_rate": 2.108160621761658e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.9193595945835114, + "num_tokens": 8734515.0, + "step": 4875 + }, + { + "epoch": 0.7895716945996276, + "grad_norm": 32.76239013671875, + "learning_rate": 2.106541450777202e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.9167143404483795, + "num_tokens": 8736317.0, + "step": 4876 + }, + { + "epoch": 0.7897336248077079, + "grad_norm": 40.44179153442383, + "learning_rate": 2.104922279792746e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.9103313684463501, + "num_tokens": 8738116.0, + "step": 4877 + }, + { + "epoch": 0.7898955550157882, + "grad_norm": 44.074798583984375, + "learning_rate": 2.10330310880829e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.9016563296318054, + "num_tokens": 8739913.0, + "step": 4878 + }, + { + "epoch": 0.7900574852238685, + "grad_norm": 24.902088165283203, + "learning_rate": 2.101683937823834e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.9304866194725037, + "num_tokens": 8741712.0, + "step": 4879 + }, + { + "epoch": 0.7902194154319488, + "grad_norm": 22.09727668762207, + "learning_rate": 2.100064766839378e-06, + "loss": 0.472, + "mean_token_accuracy": 0.9330623745918274, + "num_tokens": 8743493.0, + "step": 4880 + }, + { + "epoch": 0.7903813456400292, + "grad_norm": 33.19005584716797, + "learning_rate": 2.0984455958549222e-06, + "loss": 0.584, + "mean_token_accuracy": 0.9115451872348785, + "num_tokens": 8745276.0, + "step": 4881 + }, + { + "epoch": 0.7905432758481095, + "grad_norm": 33.45199203491211, + "learning_rate": 2.0968264248704667e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 8747063.0, + "step": 4882 + }, + { + "epoch": 0.7907052060561898, + "grad_norm": 38.517459869384766, + "learning_rate": 2.0952072538860107e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.9150349497795105, + "num_tokens": 8748858.0, + "step": 4883 + }, + { + "epoch": 0.7908671362642701, + "grad_norm": 31.903398513793945, + "learning_rate": 2.0935880829015547e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.918178141117096, + "num_tokens": 8750651.0, + "step": 4884 + }, + { + "epoch": 0.7910290664723504, + "grad_norm": 33.66367721557617, + "learning_rate": 2.0919689119170987e-06, + "loss": 0.5642, + "mean_token_accuracy": 0.9136288166046143, + "num_tokens": 8752441.0, + "step": 4885 + }, + { + "epoch": 0.7911909966804307, + "grad_norm": 32.21567916870117, + "learning_rate": 2.0903497409326427e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.9151678681373596, + "num_tokens": 8754236.0, + "step": 4886 + }, + { + "epoch": 0.791352926888511, + "grad_norm": 37.72034454345703, + "learning_rate": 2.0887305699481867e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.9047702252864838, + "num_tokens": 8756031.0, + "step": 4887 + }, + { + "epoch": 0.7915148570965914, + "grad_norm": 31.206485748291016, + "learning_rate": 2.0871113989637307e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.9037570655345917, + "num_tokens": 8757825.0, + "step": 4888 + }, + { + "epoch": 0.7916767873046717, + "grad_norm": 32.79133224487305, + "learning_rate": 2.0854922279792747e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.9108372926712036, + "num_tokens": 8759617.0, + "step": 4889 + }, + { + "epoch": 0.791838717512752, + "grad_norm": 22.16862678527832, + "learning_rate": 2.0838730569948188e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9345376491546631, + "num_tokens": 8761404.0, + "step": 4890 + }, + { + "epoch": 0.7920006477208323, + "grad_norm": 32.187461853027344, + "learning_rate": 2.0822538860103628e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.9046019315719604, + "num_tokens": 8763190.0, + "step": 4891 + }, + { + "epoch": 0.7921625779289126, + "grad_norm": 25.909048080444336, + "learning_rate": 2.080634715025907e-06, + "loss": 0.575, + "mean_token_accuracy": 0.9232880473136902, + "num_tokens": 8764976.0, + "step": 4892 + }, + { + "epoch": 0.792324508136993, + "grad_norm": 25.27642822265625, + "learning_rate": 2.079015544041451e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.9211102426052094, + "num_tokens": 8766755.0, + "step": 4893 + }, + { + "epoch": 0.7924864383450733, + "grad_norm": 27.0764217376709, + "learning_rate": 2.077396373056995e-06, + "loss": 0.527, + "mean_token_accuracy": 0.9177428483963013, + "num_tokens": 8768547.0, + "step": 4894 + }, + { + "epoch": 0.7926483685531536, + "grad_norm": 19.349010467529297, + "learning_rate": 2.075777202072539e-06, + "loss": 0.488, + "mean_token_accuracy": 0.9340579807758331, + "num_tokens": 8770332.0, + "step": 4895 + }, + { + "epoch": 0.7928102987612339, + "grad_norm": 29.76388168334961, + "learning_rate": 2.074158031088083e-06, + "loss": 0.531, + "mean_token_accuracy": 0.92323437333107, + "num_tokens": 8772129.0, + "step": 4896 + }, + { + "epoch": 0.7929722289693142, + "grad_norm": 23.53697395324707, + "learning_rate": 2.072538860103627e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.918645828962326, + "num_tokens": 8773912.0, + "step": 4897 + }, + { + "epoch": 0.7931341591773945, + "grad_norm": 34.235599517822266, + "learning_rate": 2.070919689119171e-06, + "loss": 0.6104, + "mean_token_accuracy": 0.9162139892578125, + "num_tokens": 8775710.0, + "step": 4898 + }, + { + "epoch": 0.7932960893854749, + "grad_norm": 35.62824249267578, + "learning_rate": 2.0693005181347153e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.9228060841560364, + "num_tokens": 8777507.0, + "step": 4899 + }, + { + "epoch": 0.7934580195935552, + "grad_norm": 36.67351531982422, + "learning_rate": 2.0676813471502593e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.91355299949646, + "num_tokens": 8779285.0, + "step": 4900 + }, + { + "epoch": 0.7936199498016355, + "grad_norm": 27.98644256591797, + "learning_rate": 2.0660621761658033e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.9155879616737366, + "num_tokens": 8781081.0, + "step": 4901 + }, + { + "epoch": 0.7937818800097158, + "grad_norm": 28.42221450805664, + "learning_rate": 2.0644430051813473e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.9132690131664276, + "num_tokens": 8782870.0, + "step": 4902 + }, + { + "epoch": 0.7939438102177961, + "grad_norm": 38.13542175292969, + "learning_rate": 2.0628238341968914e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.9028213322162628, + "num_tokens": 8784670.0, + "step": 4903 + }, + { + "epoch": 0.7941057404258764, + "grad_norm": 35.347747802734375, + "learning_rate": 2.0612046632124354e-06, + "loss": 0.6464, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 8786452.0, + "step": 4904 + }, + { + "epoch": 0.7942676706339568, + "grad_norm": 34.25712203979492, + "learning_rate": 2.0595854922279794e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.9136646091938019, + "num_tokens": 8788242.0, + "step": 4905 + }, + { + "epoch": 0.7944296008420371, + "grad_norm": 26.22393226623535, + "learning_rate": 2.0579663212435234e-06, + "loss": 0.5495, + "mean_token_accuracy": 0.9239244163036346, + "num_tokens": 8790031.0, + "step": 4906 + }, + { + "epoch": 0.7945915310501174, + "grad_norm": 27.886669158935547, + "learning_rate": 2.0563471502590674e-06, + "loss": 0.5987, + "mean_token_accuracy": 0.9204118847846985, + "num_tokens": 8791820.0, + "step": 4907 + }, + { + "epoch": 0.7947534612581977, + "grad_norm": 31.319236755371094, + "learning_rate": 2.054727979274612e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.9242258369922638, + "num_tokens": 8793609.0, + "step": 4908 + }, + { + "epoch": 0.794915391466278, + "grad_norm": 33.08340835571289, + "learning_rate": 2.053108808290156e-06, + "loss": 0.5995, + "mean_token_accuracy": 0.9230731725692749, + "num_tokens": 8795407.0, + "step": 4909 + }, + { + "epoch": 0.7950773216743584, + "grad_norm": 24.904603958129883, + "learning_rate": 2.0514896373057e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.9261710345745087, + "num_tokens": 8797190.0, + "step": 4910 + }, + { + "epoch": 0.7952392518824387, + "grad_norm": 25.797454833984375, + "learning_rate": 2.049870466321244e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.9210539758205414, + "num_tokens": 8798980.0, + "step": 4911 + }, + { + "epoch": 0.795401182090519, + "grad_norm": 35.37569808959961, + "learning_rate": 2.048251295336788e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.9074468016624451, + "num_tokens": 8800783.0, + "step": 4912 + }, + { + "epoch": 0.7955631122985993, + "grad_norm": 36.26353454589844, + "learning_rate": 2.046632124352332e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.9172413945198059, + "num_tokens": 8802585.0, + "step": 4913 + }, + { + "epoch": 0.7957250425066796, + "grad_norm": 33.37550354003906, + "learning_rate": 2.045012953367876e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.9199725091457367, + "num_tokens": 8804372.0, + "step": 4914 + }, + { + "epoch": 0.7958869727147599, + "grad_norm": 18.971160888671875, + "learning_rate": 2.04339378238342e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.9350087344646454, + "num_tokens": 8806163.0, + "step": 4915 + }, + { + "epoch": 0.7960489029228403, + "grad_norm": 16.852298736572266, + "learning_rate": 2.041774611398964e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.9350886344909668, + "num_tokens": 8807952.0, + "step": 4916 + }, + { + "epoch": 0.7962108331309206, + "grad_norm": 30.142396926879883, + "learning_rate": 2.040155440414508e-06, + "loss": 0.5566, + "mean_token_accuracy": 0.9183647632598877, + "num_tokens": 8809745.0, + "step": 4917 + }, + { + "epoch": 0.7963727633390009, + "grad_norm": 30.790508270263672, + "learning_rate": 2.038536269430052e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.9131302535533905, + "num_tokens": 8811533.0, + "step": 4918 + }, + { + "epoch": 0.7965346935470812, + "grad_norm": 27.842195510864258, + "learning_rate": 2.036917098445596e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.9230769276618958, + "num_tokens": 8813331.0, + "step": 4919 + }, + { + "epoch": 0.7966966237551615, + "grad_norm": 29.73750114440918, + "learning_rate": 2.03529792746114e-06, + "loss": 0.582, + "mean_token_accuracy": 0.920550525188446, + "num_tokens": 8815120.0, + "step": 4920 + }, + { + "epoch": 0.7968585539632418, + "grad_norm": 26.64364242553711, + "learning_rate": 2.033678756476684e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.9187643826007843, + "num_tokens": 8816915.0, + "step": 4921 + }, + { + "epoch": 0.7970204841713222, + "grad_norm": 31.05823516845703, + "learning_rate": 2.032059585492228e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.9124758243560791, + "num_tokens": 8818700.0, + "step": 4922 + }, + { + "epoch": 0.7971824143794025, + "grad_norm": 22.934885025024414, + "learning_rate": 2.030440414507772e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9160583913326263, + "num_tokens": 8820486.0, + "step": 4923 + }, + { + "epoch": 0.7973443445874828, + "grad_norm": 37.852237701416016, + "learning_rate": 2.028821243523316e-06, + "loss": 0.6053, + "mean_token_accuracy": 0.9090909361839294, + "num_tokens": 8822284.0, + "step": 4924 + }, + { + "epoch": 0.7975062747955631, + "grad_norm": 31.20152473449707, + "learning_rate": 2.02720207253886e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.9095588028430939, + "num_tokens": 8824072.0, + "step": 4925 + }, + { + "epoch": 0.7976682050036434, + "grad_norm": 24.399518966674805, + "learning_rate": 2.025582901554404e-06, + "loss": 0.6049, + "mean_token_accuracy": 0.9180602133274078, + "num_tokens": 8825852.0, + "step": 4926 + }, + { + "epoch": 0.7978301352117237, + "grad_norm": 27.379980087280273, + "learning_rate": 2.0239637305699485e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.9193868935108185, + "num_tokens": 8827637.0, + "step": 4927 + }, + { + "epoch": 0.797992065419804, + "grad_norm": 24.655641555786133, + "learning_rate": 2.0223445595854925e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.927152156829834, + "num_tokens": 8829437.0, + "step": 4928 + }, + { + "epoch": 0.7981539956278844, + "grad_norm": 33.263370513916016, + "learning_rate": 2.0207253886010365e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.9106077551841736, + "num_tokens": 8831228.0, + "step": 4929 + }, + { + "epoch": 0.7983159258359647, + "grad_norm": 34.86585235595703, + "learning_rate": 2.0191062176165806e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.9195588231086731, + "num_tokens": 8833026.0, + "step": 4930 + }, + { + "epoch": 0.798477856044045, + "grad_norm": 22.551219940185547, + "learning_rate": 2.0174870466321246e-06, + "loss": 0.561, + "mean_token_accuracy": 0.9298475086688995, + "num_tokens": 8834809.0, + "step": 4931 + }, + { + "epoch": 0.7986397862521253, + "grad_norm": 30.917335510253906, + "learning_rate": 2.0158678756476686e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.9185907244682312, + "num_tokens": 8836604.0, + "step": 4932 + }, + { + "epoch": 0.7988017164602057, + "grad_norm": 28.22004508972168, + "learning_rate": 2.0142487046632126e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 8838398.0, + "step": 4933 + }, + { + "epoch": 0.798963646668286, + "grad_norm": 28.48418617248535, + "learning_rate": 2.0126295336787566e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.908056229352951, + "num_tokens": 8840182.0, + "step": 4934 + }, + { + "epoch": 0.7991255768763663, + "grad_norm": 38.426944732666016, + "learning_rate": 2.0110103626943006e-06, + "loss": 0.6018, + "mean_token_accuracy": 0.9211459457874298, + "num_tokens": 8841973.0, + "step": 4935 + }, + { + "epoch": 0.7992875070844466, + "grad_norm": 26.526845932006836, + "learning_rate": 2.0093911917098446e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.9291968643665314, + "num_tokens": 8843767.0, + "step": 4936 + }, + { + "epoch": 0.7994494372925269, + "grad_norm": 28.1953182220459, + "learning_rate": 2.0077720207253886e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.9151570200920105, + "num_tokens": 8845561.0, + "step": 4937 + }, + { + "epoch": 0.7996113675006072, + "grad_norm": 27.99931526184082, + "learning_rate": 2.0061528497409327e-06, + "loss": 0.575, + "mean_token_accuracy": 0.9201068580150604, + "num_tokens": 8847360.0, + "step": 4938 + }, + { + "epoch": 0.7997732977086875, + "grad_norm": 30.18328857421875, + "learning_rate": 2.0045336787564767e-06, + "loss": 0.6622, + "mean_token_accuracy": 0.9166313707828522, + "num_tokens": 8849147.0, + "step": 4939 + }, + { + "epoch": 0.7999352279167679, + "grad_norm": 26.061443328857422, + "learning_rate": 2.0029145077720207e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.9259218573570251, + "num_tokens": 8850929.0, + "step": 4940 + }, + { + "epoch": 0.8000971581248482, + "grad_norm": 12.377771377563477, + "learning_rate": 2.0012953367875647e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.9335711896419525, + "num_tokens": 8852712.0, + "step": 4941 + }, + { + "epoch": 0.8002590883329285, + "grad_norm": 36.25849151611328, + "learning_rate": 1.9996761658031087e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.9162139892578125, + "num_tokens": 8854510.0, + "step": 4942 + }, + { + "epoch": 0.8004210185410088, + "grad_norm": 19.264659881591797, + "learning_rate": 1.9980569948186527e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.928205132484436, + "num_tokens": 8856300.0, + "step": 4943 + }, + { + "epoch": 0.8005829487490892, + "grad_norm": 21.427783966064453, + "learning_rate": 1.9964378238341967e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.9230356216430664, + "num_tokens": 8858085.0, + "step": 4944 + }, + { + "epoch": 0.8007448789571695, + "grad_norm": 27.587329864501953, + "learning_rate": 1.994818652849741e-06, + "loss": 0.5756, + "mean_token_accuracy": 0.9212149381637573, + "num_tokens": 8859876.0, + "step": 4945 + }, + { + "epoch": 0.8009068091652498, + "grad_norm": 26.34791374206543, + "learning_rate": 1.993199481865285e-06, + "loss": 0.539, + "mean_token_accuracy": 0.9259096682071686, + "num_tokens": 8861658.0, + "step": 4946 + }, + { + "epoch": 0.8010687393733301, + "grad_norm": 8.806448936462402, + "learning_rate": 1.991580310880829e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.9402685761451721, + "num_tokens": 8863438.0, + "step": 4947 + }, + { + "epoch": 0.8012306695814104, + "grad_norm": 24.630069732666016, + "learning_rate": 1.9899611398963732e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.9250357449054718, + "num_tokens": 8865230.0, + "step": 4948 + }, + { + "epoch": 0.8013925997894907, + "grad_norm": 22.68179702758789, + "learning_rate": 1.9883419689119172e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 8867016.0, + "step": 4949 + }, + { + "epoch": 0.801554529997571, + "grad_norm": 28.216548919677734, + "learning_rate": 1.9867227979274612e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.9272453188896179, + "num_tokens": 8868803.0, + "step": 4950 + }, + { + "epoch": 0.8017164602056513, + "grad_norm": 30.94228172302246, + "learning_rate": 1.9851036269430053e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.9182596802711487, + "num_tokens": 8870609.0, + "step": 4951 + }, + { + "epoch": 0.8018783904137317, + "grad_norm": 26.164602279663086, + "learning_rate": 1.9834844559585497e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.9185742437839508, + "num_tokens": 8872403.0, + "step": 4952 + }, + { + "epoch": 0.802040320621812, + "grad_norm": 20.47823143005371, + "learning_rate": 1.9818652849740937e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.9307331740856171, + "num_tokens": 8874189.0, + "step": 4953 + }, + { + "epoch": 0.8022022508298923, + "grad_norm": 41.84910202026367, + "learning_rate": 1.9802461139896377e-06, + "loss": 0.693, + "mean_token_accuracy": 0.9099412262439728, + "num_tokens": 8875980.0, + "step": 4954 + }, + { + "epoch": 0.8023641810379726, + "grad_norm": 23.878442764282227, + "learning_rate": 1.9786269430051817e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.9302631616592407, + "num_tokens": 8877765.0, + "step": 4955 + }, + { + "epoch": 0.802526111246053, + "grad_norm": 28.336414337158203, + "learning_rate": 1.9770077720207257e-06, + "loss": 0.6398, + "mean_token_accuracy": 0.9217752516269684, + "num_tokens": 8879558.0, + "step": 4956 + }, + { + "epoch": 0.8026880414541333, + "grad_norm": 34.412994384765625, + "learning_rate": 1.9753886010362698e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9311330616474152, + "num_tokens": 8881361.0, + "step": 4957 + }, + { + "epoch": 0.8028499716622136, + "grad_norm": 22.01692008972168, + "learning_rate": 1.9737694300518138e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9132490456104279, + "num_tokens": 8883138.0, + "step": 4958 + }, + { + "epoch": 0.8030119018702939, + "grad_norm": 38.56572723388672, + "learning_rate": 1.9721502590673578e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.8962166905403137, + "num_tokens": 8884929.0, + "step": 4959 + }, + { + "epoch": 0.8031738320783742, + "grad_norm": 29.538925170898438, + "learning_rate": 1.970531088082902e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.9315811395645142, + "num_tokens": 8886718.0, + "step": 4960 + }, + { + "epoch": 0.8033357622864545, + "grad_norm": 24.75006675720215, + "learning_rate": 1.968911917098446e-06, + "loss": 0.6186, + "mean_token_accuracy": 0.9185185134410858, + "num_tokens": 8888500.0, + "step": 4961 + }, + { + "epoch": 0.8034976924945348, + "grad_norm": 33.16312789916992, + "learning_rate": 1.96729274611399e-06, + "loss": 0.643, + "mean_token_accuracy": 0.9095345139503479, + "num_tokens": 8890289.0, + "step": 4962 + }, + { + "epoch": 0.8036596227026152, + "grad_norm": 24.376237869262695, + "learning_rate": 1.965673575129534e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.9245014190673828, + "num_tokens": 8892079.0, + "step": 4963 + }, + { + "epoch": 0.8038215529106955, + "grad_norm": 22.80145835876465, + "learning_rate": 1.964054404145078e-06, + "loss": 0.483, + "mean_token_accuracy": 0.9297200739383698, + "num_tokens": 8893875.0, + "step": 4964 + }, + { + "epoch": 0.8039834831187758, + "grad_norm": 39.409786224365234, + "learning_rate": 1.962435233160622e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.9011950492858887, + "num_tokens": 8895661.0, + "step": 4965 + }, + { + "epoch": 0.8041454133268561, + "grad_norm": 27.341442108154297, + "learning_rate": 1.960816062176166e-06, + "loss": 0.5924, + "mean_token_accuracy": 0.9181216359138489, + "num_tokens": 8897467.0, + "step": 4966 + }, + { + "epoch": 0.8043073435349365, + "grad_norm": 25.562259674072266, + "learning_rate": 1.95919689119171e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9275019764900208, + "num_tokens": 8899255.0, + "step": 4967 + }, + { + "epoch": 0.8044692737430168, + "grad_norm": 25.12514305114746, + "learning_rate": 1.957577720207254e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.9295327067375183, + "num_tokens": 8901052.0, + "step": 4968 + }, + { + "epoch": 0.8046312039510971, + "grad_norm": 26.805830001831055, + "learning_rate": 1.955958549222798e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.9183908700942993, + "num_tokens": 8902834.0, + "step": 4969 + }, + { + "epoch": 0.8047931341591774, + "grad_norm": 21.42875862121582, + "learning_rate": 1.954339378238342e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.9270073175430298, + "num_tokens": 8904620.0, + "step": 4970 + }, + { + "epoch": 0.8049550643672577, + "grad_norm": 27.879472732543945, + "learning_rate": 1.9527202072538864e-06, + "loss": 0.585, + "mean_token_accuracy": 0.9134846329689026, + "num_tokens": 8906409.0, + "step": 4971 + }, + { + "epoch": 0.805116994575338, + "grad_norm": 24.235980987548828, + "learning_rate": 1.9511010362694304e-06, + "loss": 0.5604, + "mean_token_accuracy": 0.9246819317340851, + "num_tokens": 8908187.0, + "step": 4972 + }, + { + "epoch": 0.8052789247834183, + "grad_norm": 43.45313262939453, + "learning_rate": 1.9494818652849744e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.8996002972126007, + "num_tokens": 8909989.0, + "step": 4973 + }, + { + "epoch": 0.8054408549914986, + "grad_norm": 20.992921829223633, + "learning_rate": 1.9478626943005184e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.930149257183075, + "num_tokens": 8911785.0, + "step": 4974 + }, + { + "epoch": 0.805602785199579, + "grad_norm": 29.420166015625, + "learning_rate": 1.9462435233160624e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.9212501347064972, + "num_tokens": 8913576.0, + "step": 4975 + }, + { + "epoch": 0.8057647154076593, + "grad_norm": 26.890817642211914, + "learning_rate": 1.9446243523316064e-06, + "loss": 0.579, + "mean_token_accuracy": 0.9232778251171112, + "num_tokens": 8915373.0, + "step": 4976 + }, + { + "epoch": 0.8059266456157396, + "grad_norm": 34.62211227416992, + "learning_rate": 1.9430051813471504e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.9217016398906708, + "num_tokens": 8917178.0, + "step": 4977 + }, + { + "epoch": 0.80608857582382, + "grad_norm": 23.213016510009766, + "learning_rate": 1.9413860103626945e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.9294007122516632, + "num_tokens": 8918959.0, + "step": 4978 + }, + { + "epoch": 0.8062505060319003, + "grad_norm": 31.16729736328125, + "learning_rate": 1.9397668393782385e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.9093801379203796, + "num_tokens": 8920758.0, + "step": 4979 + }, + { + "epoch": 0.8064124362399806, + "grad_norm": 24.584293365478516, + "learning_rate": 1.9381476683937825e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.9288030862808228, + "num_tokens": 8922551.0, + "step": 4980 + }, + { + "epoch": 0.8065743664480609, + "grad_norm": 24.814489364624023, + "learning_rate": 1.9365284974093265e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.9219858050346375, + "num_tokens": 8924345.0, + "step": 4981 + }, + { + "epoch": 0.8067362966561412, + "grad_norm": 19.52880859375, + "learning_rate": 1.9349093264248705e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.9311594367027283, + "num_tokens": 8926133.0, + "step": 4982 + }, + { + "epoch": 0.8068982268642215, + "grad_norm": 32.615623474121094, + "learning_rate": 1.9332901554404145e-06, + "loss": 0.5891, + "mean_token_accuracy": 0.906274139881134, + "num_tokens": 8927933.0, + "step": 4983 + }, + { + "epoch": 0.8070601570723018, + "grad_norm": 27.989227294921875, + "learning_rate": 1.9316709844559585e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.9232409298419952, + "num_tokens": 8929719.0, + "step": 4984 + }, + { + "epoch": 0.8072220872803821, + "grad_norm": 33.77449417114258, + "learning_rate": 1.9300518134715025e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.9123667478561401, + "num_tokens": 8931505.0, + "step": 4985 + }, + { + "epoch": 0.8073840174884624, + "grad_norm": 33.327735900878906, + "learning_rate": 1.9284326424870466e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.905802845954895, + "num_tokens": 8933301.0, + "step": 4986 + }, + { + "epoch": 0.8075459476965428, + "grad_norm": 23.688329696655273, + "learning_rate": 1.9268134715025906e-06, + "loss": 0.5632, + "mean_token_accuracy": 0.9284613132476807, + "num_tokens": 8935092.0, + "step": 4987 + }, + { + "epoch": 0.8077078779046231, + "grad_norm": 33.938846588134766, + "learning_rate": 1.9251943005181346e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.9131661653518677, + "num_tokens": 8936881.0, + "step": 4988 + }, + { + "epoch": 0.8078698081127035, + "grad_norm": 30.068599700927734, + "learning_rate": 1.923575129533679e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.9058587849140167, + "num_tokens": 8938679.0, + "step": 4989 + }, + { + "epoch": 0.8080317383207838, + "grad_norm": 38.505531311035156, + "learning_rate": 1.921955958549223e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.9060071706771851, + "num_tokens": 8940478.0, + "step": 4990 + }, + { + "epoch": 0.8081936685288641, + "grad_norm": 12.059277534484863, + "learning_rate": 1.920336787564767e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.9357620477676392, + "num_tokens": 8942270.0, + "step": 4991 + }, + { + "epoch": 0.8083555987369444, + "grad_norm": 30.33990478515625, + "learning_rate": 1.918717616580311e-06, + "loss": 0.5558, + "mean_token_accuracy": 0.9227495193481445, + "num_tokens": 8944068.0, + "step": 4992 + }, + { + "epoch": 0.8085175289450247, + "grad_norm": 35.630008697509766, + "learning_rate": 1.917098445595855e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.90386563539505, + "num_tokens": 8945861.0, + "step": 4993 + }, + { + "epoch": 0.808679459153105, + "grad_norm": 27.060644149780273, + "learning_rate": 1.915479274611399e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.9235956966876984, + "num_tokens": 8947648.0, + "step": 4994 + }, + { + "epoch": 0.8088413893611853, + "grad_norm": 27.75632667541504, + "learning_rate": 1.913860103626943e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.9207249879837036, + "num_tokens": 8949436.0, + "step": 4995 + }, + { + "epoch": 0.8090033195692656, + "grad_norm": 37.02408218383789, + "learning_rate": 1.9122409326424875e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.903947502374649, + "num_tokens": 8951229.0, + "step": 4996 + }, + { + "epoch": 0.8091652497773459, + "grad_norm": 29.825345993041992, + "learning_rate": 1.9106217616580315e-06, + "loss": 0.5626, + "mean_token_accuracy": 0.9220841825008392, + "num_tokens": 8953022.0, + "step": 4997 + }, + { + "epoch": 0.8093271799854262, + "grad_norm": 23.454288482666016, + "learning_rate": 1.9090025906735756e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.9195157885551453, + "num_tokens": 8954820.0, + "step": 4998 + }, + { + "epoch": 0.8094891101935066, + "grad_norm": 16.398780822753906, + "learning_rate": 1.9073834196891196e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.9383763074874878, + "num_tokens": 8956608.0, + "step": 4999 + }, + { + "epoch": 0.8096510404015869, + "grad_norm": 34.983497619628906, + "learning_rate": 1.9057642487046634e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.9228723645210266, + "num_tokens": 8958405.0, + "step": 5000 + }, + { + "epoch": 0.8098129706096673, + "grad_norm": 31.942378997802734, + "learning_rate": 1.9041450777202076e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.925709456205368, + "num_tokens": 8960200.0, + "step": 5001 + }, + { + "epoch": 0.8099749008177476, + "grad_norm": 26.622617721557617, + "learning_rate": 1.9025259067357516e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.9286187887191772, + "num_tokens": 8961992.0, + "step": 5002 + }, + { + "epoch": 0.8101368310258279, + "grad_norm": 24.186302185058594, + "learning_rate": 1.9009067357512956e-06, + "loss": 0.5843, + "mean_token_accuracy": 0.92356076836586, + "num_tokens": 8963778.0, + "step": 5003 + }, + { + "epoch": 0.8102987612339082, + "grad_norm": 43.259578704833984, + "learning_rate": 1.8992875647668396e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.9033996760845184, + "num_tokens": 8965568.0, + "step": 5004 + }, + { + "epoch": 0.8104606914419885, + "grad_norm": 35.650535583496094, + "learning_rate": 1.8976683937823837e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.9090909063816071, + "num_tokens": 8967366.0, + "step": 5005 + }, + { + "epoch": 0.8106226216500688, + "grad_norm": 30.017370223999023, + "learning_rate": 1.8960492227979277e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.9219819009304047, + "num_tokens": 8969160.0, + "step": 5006 + }, + { + "epoch": 0.8107845518581491, + "grad_norm": 26.688739776611328, + "learning_rate": 1.8944300518134717e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.9291816651821136, + "num_tokens": 8970955.0, + "step": 5007 + }, + { + "epoch": 0.8109464820662294, + "grad_norm": 31.082406997680664, + "learning_rate": 1.8928108808290157e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.9210242033004761, + "num_tokens": 8972746.0, + "step": 5008 + }, + { + "epoch": 0.8111084122743097, + "grad_norm": 35.904319763183594, + "learning_rate": 1.8911917098445597e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.9144370257854462, + "num_tokens": 8974539.0, + "step": 5009 + }, + { + "epoch": 0.81127034248239, + "grad_norm": 21.578155517578125, + "learning_rate": 1.889572538860104e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.9282300472259521, + "num_tokens": 8976330.0, + "step": 5010 + }, + { + "epoch": 0.8114322726904704, + "grad_norm": 24.939559936523438, + "learning_rate": 1.887953367875648e-06, + "loss": 0.605, + "mean_token_accuracy": 0.9198097884654999, + "num_tokens": 8978115.0, + "step": 5011 + }, + { + "epoch": 0.8115942028985508, + "grad_norm": 29.3984432220459, + "learning_rate": 1.886334196891192e-06, + "loss": 0.5933, + "mean_token_accuracy": 0.9181357622146606, + "num_tokens": 8979908.0, + "step": 5012 + }, + { + "epoch": 0.8117561331066311, + "grad_norm": 27.354196548461914, + "learning_rate": 1.884715025906736e-06, + "loss": 0.553, + "mean_token_accuracy": 0.9194396734237671, + "num_tokens": 8981693.0, + "step": 5013 + }, + { + "epoch": 0.8119180633147114, + "grad_norm": 23.668514251708984, + "learning_rate": 1.88309585492228e-06, + "loss": 0.6157, + "mean_token_accuracy": 0.918721467256546, + "num_tokens": 8983486.0, + "step": 5014 + }, + { + "epoch": 0.8120799935227917, + "grad_norm": 33.348533630371094, + "learning_rate": 1.881476683937824e-06, + "loss": 0.5884, + "mean_token_accuracy": 0.9122872650623322, + "num_tokens": 8985281.0, + "step": 5015 + }, + { + "epoch": 0.812241923730872, + "grad_norm": 27.45981216430664, + "learning_rate": 1.879857512953368e-06, + "loss": 0.5137, + "mean_token_accuracy": 0.9193357825279236, + "num_tokens": 8987078.0, + "step": 5016 + }, + { + "epoch": 0.8124038539389523, + "grad_norm": 32.7164306640625, + "learning_rate": 1.878238341968912e-06, + "loss": 0.5786, + "mean_token_accuracy": 0.9204114377498627, + "num_tokens": 8988877.0, + "step": 5017 + }, + { + "epoch": 0.8125657841470326, + "grad_norm": 27.328479766845703, + "learning_rate": 1.876619170984456e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.9185314476490021, + "num_tokens": 8990682.0, + "step": 5018 + }, + { + "epoch": 0.8127277143551129, + "grad_norm": 40.90134048461914, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.9035685360431671, + "num_tokens": 8992484.0, + "step": 5019 + }, + { + "epoch": 0.8128896445631932, + "grad_norm": 43.68349075317383, + "learning_rate": 1.8733808290155443e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.9085317552089691, + "num_tokens": 8994280.0, + "step": 5020 + }, + { + "epoch": 0.8130515747712735, + "grad_norm": 19.25556755065918, + "learning_rate": 1.8717616580310883e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.9258071482181549, + "num_tokens": 8996061.0, + "step": 5021 + }, + { + "epoch": 0.8132135049793539, + "grad_norm": 21.8875675201416, + "learning_rate": 1.8701424870466323e-06, + "loss": 0.522, + "mean_token_accuracy": 0.9285391271114349, + "num_tokens": 8997839.0, + "step": 5022 + }, + { + "epoch": 0.8133754351874343, + "grad_norm": 32.607398986816406, + "learning_rate": 1.8685233160621763e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.9147392213344574, + "num_tokens": 8999633.0, + "step": 5023 + }, + { + "epoch": 0.8135373653955146, + "grad_norm": 29.330286026000977, + "learning_rate": 1.8669041450777203e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.917548805475235, + "num_tokens": 9001424.0, + "step": 5024 + }, + { + "epoch": 0.8136992956035949, + "grad_norm": 27.08781623840332, + "learning_rate": 1.8652849740932643e-06, + "loss": 0.5544, + "mean_token_accuracy": 0.9140931963920593, + "num_tokens": 9003216.0, + "step": 5025 + }, + { + "epoch": 0.8138612258116752, + "grad_norm": 22.69425392150879, + "learning_rate": 1.8636658031088084e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9253132939338684, + "num_tokens": 9004996.0, + "step": 5026 + }, + { + "epoch": 0.8140231560197555, + "grad_norm": 29.096426010131836, + "learning_rate": 1.8620466321243524e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.9062924981117249, + "num_tokens": 9006795.0, + "step": 5027 + }, + { + "epoch": 0.8141850862278358, + "grad_norm": 18.942089080810547, + "learning_rate": 1.8604274611398964e-06, + "loss": 0.461, + "mean_token_accuracy": 0.9269501268863678, + "num_tokens": 9008581.0, + "step": 5028 + }, + { + "epoch": 0.8143470164359161, + "grad_norm": 24.701074600219727, + "learning_rate": 1.8588082901554406e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.9173434674739838, + "num_tokens": 9010372.0, + "step": 5029 + }, + { + "epoch": 0.8145089466439964, + "grad_norm": 25.215373992919922, + "learning_rate": 1.8571891191709846e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.9240403473377228, + "num_tokens": 9012160.0, + "step": 5030 + }, + { + "epoch": 0.8146708768520767, + "grad_norm": 31.006916046142578, + "learning_rate": 1.8555699481865286e-06, + "loss": 0.6238, + "mean_token_accuracy": 0.9104573428630829, + "num_tokens": 9013951.0, + "step": 5031 + }, + { + "epoch": 0.814832807060157, + "grad_norm": 35.33442687988281, + "learning_rate": 1.8539507772020726e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.907475471496582, + "num_tokens": 9015743.0, + "step": 5032 + }, + { + "epoch": 0.8149947372682373, + "grad_norm": 28.915390014648438, + "learning_rate": 1.8523316062176167e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.9240546226501465, + "num_tokens": 9017531.0, + "step": 5033 + }, + { + "epoch": 0.8151566674763177, + "grad_norm": 33.211761474609375, + "learning_rate": 1.8507124352331607e-06, + "loss": 0.591, + "mean_token_accuracy": 0.9072712361812592, + "num_tokens": 9019323.0, + "step": 5034 + }, + { + "epoch": 0.8153185976843981, + "grad_norm": 45.33363342285156, + "learning_rate": 1.8490932642487047e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.8881118893623352, + "num_tokens": 9021121.0, + "step": 5035 + }, + { + "epoch": 0.8154805278924784, + "grad_norm": 34.62540054321289, + "learning_rate": 1.8474740932642487e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.9056512117385864, + "num_tokens": 9022920.0, + "step": 5036 + }, + { + "epoch": 0.8156424581005587, + "grad_norm": 32.71051788330078, + "learning_rate": 1.8458549222797927e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.9195804297924042, + "num_tokens": 9024707.0, + "step": 5037 + }, + { + "epoch": 0.815804388308639, + "grad_norm": 29.727039337158203, + "learning_rate": 1.844235751295337e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.916402131319046, + "num_tokens": 9026494.0, + "step": 5038 + }, + { + "epoch": 0.8159663185167193, + "grad_norm": 21.784082412719727, + "learning_rate": 1.842616580310881e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.9304879307746887, + "num_tokens": 9028293.0, + "step": 5039 + }, + { + "epoch": 0.8161282487247996, + "grad_norm": 42.53742218017578, + "learning_rate": 1.840997409326425e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.879334956407547, + "num_tokens": 9030095.0, + "step": 5040 + }, + { + "epoch": 0.8162901789328799, + "grad_norm": 35.216217041015625, + "learning_rate": 1.839378238341969e-06, + "loss": 0.6533, + "mean_token_accuracy": 0.9123152792453766, + "num_tokens": 9031892.0, + "step": 5041 + }, + { + "epoch": 0.8164521091409602, + "grad_norm": 29.873117446899414, + "learning_rate": 1.837759067357513e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.9191147685050964, + "num_tokens": 9033690.0, + "step": 5042 + }, + { + "epoch": 0.8166140393490405, + "grad_norm": 37.184669494628906, + "learning_rate": 1.836139896373057e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.9075706899166107, + "num_tokens": 9035483.0, + "step": 5043 + }, + { + "epoch": 0.8167759695571208, + "grad_norm": 33.99110412597656, + "learning_rate": 1.8345207253886012e-06, + "loss": 0.625, + "mean_token_accuracy": 0.921171635389328, + "num_tokens": 9037274.0, + "step": 5044 + }, + { + "epoch": 0.8169378997652011, + "grad_norm": 38.40345764160156, + "learning_rate": 1.8329015544041454e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.9059343636035919, + "num_tokens": 9039073.0, + "step": 5045 + }, + { + "epoch": 0.8170998299732816, + "grad_norm": 28.11101722717285, + "learning_rate": 1.8312823834196895e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.9241737127304077, + "num_tokens": 9040862.0, + "step": 5046 + }, + { + "epoch": 0.8172617601813619, + "grad_norm": 30.107608795166016, + "learning_rate": 1.8296632124352335e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.9182541370391846, + "num_tokens": 9042655.0, + "step": 5047 + }, + { + "epoch": 0.8174236903894422, + "grad_norm": 26.59319496154785, + "learning_rate": 1.8280440414507775e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.9291816651821136, + "num_tokens": 9044450.0, + "step": 5048 + }, + { + "epoch": 0.8175856205975225, + "grad_norm": 30.14449119567871, + "learning_rate": 1.8264248704663215e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.9236736297607422, + "num_tokens": 9046237.0, + "step": 5049 + }, + { + "epoch": 0.8177475508056028, + "grad_norm": 27.055479049682617, + "learning_rate": 1.8248056994818655e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.9198883175849915, + "num_tokens": 9048036.0, + "step": 5050 + }, + { + "epoch": 0.8179094810136831, + "grad_norm": 27.560745239257812, + "learning_rate": 1.8231865284974095e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.9132340252399445, + "num_tokens": 9049837.0, + "step": 5051 + }, + { + "epoch": 0.8180714112217634, + "grad_norm": 24.66472625732422, + "learning_rate": 1.8215673575129535e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.9180099368095398, + "num_tokens": 9051630.0, + "step": 5052 + }, + { + "epoch": 0.8182333414298437, + "grad_norm": 29.276945114135742, + "learning_rate": 1.8199481865284976e-06, + "loss": 0.5795, + "mean_token_accuracy": 0.9122137427330017, + "num_tokens": 9053404.0, + "step": 5053 + }, + { + "epoch": 0.818395271637924, + "grad_norm": 33.57276153564453, + "learning_rate": 1.8183290155440418e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 9055186.0, + "step": 5054 + }, + { + "epoch": 0.8185572018460043, + "grad_norm": 32.89570999145508, + "learning_rate": 1.8167098445595858e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.9133738577365875, + "num_tokens": 9056986.0, + "step": 5055 + }, + { + "epoch": 0.8187191320540846, + "grad_norm": 29.869258880615234, + "learning_rate": 1.8150906735751298e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.9194128215312958, + "num_tokens": 9058771.0, + "step": 5056 + }, + { + "epoch": 0.8188810622621651, + "grad_norm": 31.697872161865234, + "learning_rate": 1.8134715025906738e-06, + "loss": 0.5747, + "mean_token_accuracy": 0.9231202900409698, + "num_tokens": 9060556.0, + "step": 5057 + }, + { + "epoch": 0.8190429924702454, + "grad_norm": 37.611793518066406, + "learning_rate": 1.8118523316062178e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.9233954548835754, + "num_tokens": 9062353.0, + "step": 5058 + }, + { + "epoch": 0.8192049226783257, + "grad_norm": 27.354713439941406, + "learning_rate": 1.8102331606217618e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9144460260868073, + "num_tokens": 9064134.0, + "step": 5059 + }, + { + "epoch": 0.819366852886406, + "grad_norm": 24.37343406677246, + "learning_rate": 1.8086139896373059e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.9254246950149536, + "num_tokens": 9065914.0, + "step": 5060 + }, + { + "epoch": 0.8195287830944863, + "grad_norm": 40.12774658203125, + "learning_rate": 1.8069948186528499e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.9234335124492645, + "num_tokens": 9067700.0, + "step": 5061 + }, + { + "epoch": 0.8196907133025666, + "grad_norm": 26.485366821289062, + "learning_rate": 1.8053756476683939e-06, + "loss": 0.6, + "mean_token_accuracy": 0.9287814497947693, + "num_tokens": 9069493.0, + "step": 5062 + }, + { + "epoch": 0.8198526435106469, + "grad_norm": 29.425695419311523, + "learning_rate": 1.8037564766839379e-06, + "loss": 0.617, + "mean_token_accuracy": 0.9188909530639648, + "num_tokens": 9071276.0, + "step": 5063 + }, + { + "epoch": 0.8200145737187272, + "grad_norm": 34.081573486328125, + "learning_rate": 1.8021373056994821e-06, + "loss": 0.5677, + "mean_token_accuracy": 0.9157801568508148, + "num_tokens": 9073073.0, + "step": 5064 + }, + { + "epoch": 0.8201765039268075, + "grad_norm": 32.858665466308594, + "learning_rate": 1.8005181347150261e-06, + "loss": 0.54, + "mean_token_accuracy": 0.9168636500835419, + "num_tokens": 9074861.0, + "step": 5065 + }, + { + "epoch": 0.8203384341348878, + "grad_norm": 14.388801574707031, + "learning_rate": 1.7988989637305701e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.9316923022270203, + "num_tokens": 9076651.0, + "step": 5066 + }, + { + "epoch": 0.8205003643429681, + "grad_norm": 19.80535125732422, + "learning_rate": 1.7972797927461142e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.9297581613063812, + "num_tokens": 9078434.0, + "step": 5067 + }, + { + "epoch": 0.8206622945510484, + "grad_norm": 16.693622589111328, + "learning_rate": 1.7956606217616582e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.9354573488235474, + "num_tokens": 9080225.0, + "step": 5068 + }, + { + "epoch": 0.8208242247591289, + "grad_norm": 36.904083251953125, + "learning_rate": 1.7940414507772022e-06, + "loss": 0.735, + "mean_token_accuracy": 0.8950424492359161, + "num_tokens": 9082030.0, + "step": 5069 + }, + { + "epoch": 0.8209861549672092, + "grad_norm": 42.37964630126953, + "learning_rate": 1.7924222797927462e-06, + "loss": 0.6681, + "mean_token_accuracy": 0.90625, + "num_tokens": 9083839.0, + "step": 5070 + }, + { + "epoch": 0.8211480851752895, + "grad_norm": 28.88477325439453, + "learning_rate": 1.7908031088082902e-06, + "loss": 0.5966, + "mean_token_accuracy": 0.9210049211978912, + "num_tokens": 9085629.0, + "step": 5071 + }, + { + "epoch": 0.8213100153833698, + "grad_norm": 31.190231323242188, + "learning_rate": 1.7891839378238342e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.9163140058517456, + "num_tokens": 9087416.0, + "step": 5072 + }, + { + "epoch": 0.8214719455914501, + "grad_norm": 29.80884552001953, + "learning_rate": 1.7875647668393784e-06, + "loss": 0.6244, + "mean_token_accuracy": 0.9169534146785736, + "num_tokens": 9089205.0, + "step": 5073 + }, + { + "epoch": 0.8216338757995304, + "grad_norm": 36.65719223022461, + "learning_rate": 1.7859455958549225e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.9178557693958282, + "num_tokens": 9091008.0, + "step": 5074 + }, + { + "epoch": 0.8217958060076107, + "grad_norm": 27.19016456604004, + "learning_rate": 1.7843264248704665e-06, + "loss": 0.5498, + "mean_token_accuracy": 0.9180395901203156, + "num_tokens": 9092789.0, + "step": 5075 + }, + { + "epoch": 0.821957736215691, + "grad_norm": 27.414865493774414, + "learning_rate": 1.7827072538860105e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.9160506129264832, + "num_tokens": 9094585.0, + "step": 5076 + }, + { + "epoch": 0.8221196664237713, + "grad_norm": 27.482736587524414, + "learning_rate": 1.7810880829015545e-06, + "loss": 0.5436, + "mean_token_accuracy": 0.9298194348812103, + "num_tokens": 9096383.0, + "step": 5077 + }, + { + "epoch": 0.8222815966318516, + "grad_norm": 32.32894515991211, + "learning_rate": 1.7794689119170985e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.9154929518699646, + "num_tokens": 9098179.0, + "step": 5078 + }, + { + "epoch": 0.8224435268399319, + "grad_norm": 26.610849380493164, + "learning_rate": 1.7778497409326425e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.9193262457847595, + "num_tokens": 9099976.0, + "step": 5079 + }, + { + "epoch": 0.8226054570480124, + "grad_norm": 35.28965759277344, + "learning_rate": 1.7762305699481865e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.9151785671710968, + "num_tokens": 9101772.0, + "step": 5080 + }, + { + "epoch": 0.8227673872560927, + "grad_norm": 30.78964614868164, + "learning_rate": 1.7746113989637306e-06, + "loss": 0.618, + "mean_token_accuracy": 0.9125258922576904, + "num_tokens": 9103569.0, + "step": 5081 + }, + { + "epoch": 0.822929317464173, + "grad_norm": 31.36067771911621, + "learning_rate": 1.7729922279792748e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.922252744436264, + "num_tokens": 9105364.0, + "step": 5082 + }, + { + "epoch": 0.8230912476722533, + "grad_norm": 26.089340209960938, + "learning_rate": 1.7713730569948188e-06, + "loss": 0.547, + "mean_token_accuracy": 0.9258646070957184, + "num_tokens": 9107171.0, + "step": 5083 + }, + { + "epoch": 0.8232531778803336, + "grad_norm": 38.50564956665039, + "learning_rate": 1.7697538860103628e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.9099378883838654, + "num_tokens": 9108961.0, + "step": 5084 + }, + { + "epoch": 0.8234151080884139, + "grad_norm": 20.82295799255371, + "learning_rate": 1.7681347150259068e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.9291044771671295, + "num_tokens": 9110741.0, + "step": 5085 + }, + { + "epoch": 0.8235770382964942, + "grad_norm": 30.624006271362305, + "learning_rate": 1.7665155440414508e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.9072911739349365, + "num_tokens": 9112533.0, + "step": 5086 + }, + { + "epoch": 0.8237389685045745, + "grad_norm": 38.582275390625, + "learning_rate": 1.7648963730569948e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.917391300201416, + "num_tokens": 9114323.0, + "step": 5087 + }, + { + "epoch": 0.8239008987126548, + "grad_norm": 21.064800262451172, + "learning_rate": 1.7632772020725389e-06, + "loss": 0.502, + "mean_token_accuracy": 0.9282866418361664, + "num_tokens": 9116100.0, + "step": 5088 + }, + { + "epoch": 0.8240628289207351, + "grad_norm": 32.7095947265625, + "learning_rate": 1.7616580310880829e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.9054268598556519, + "num_tokens": 9117887.0, + "step": 5089 + }, + { + "epoch": 0.8242247591288154, + "grad_norm": 37.57026290893555, + "learning_rate": 1.7600388601036269e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 9119675.0, + "step": 5090 + }, + { + "epoch": 0.8243866893368959, + "grad_norm": 27.784326553344727, + "learning_rate": 1.7584196891191709e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9209979176521301, + "num_tokens": 9121465.0, + "step": 5091 + }, + { + "epoch": 0.8245486195449762, + "grad_norm": 16.347412109375, + "learning_rate": 1.7568005181347153e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.9335060119628906, + "num_tokens": 9123248.0, + "step": 5092 + }, + { + "epoch": 0.8247105497530565, + "grad_norm": 31.57411766052246, + "learning_rate": 1.7551813471502593e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.9114651679992676, + "num_tokens": 9125031.0, + "step": 5093 + }, + { + "epoch": 0.8248724799611368, + "grad_norm": 23.453144073486328, + "learning_rate": 1.7535621761658034e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.9272640645503998, + "num_tokens": 9126818.0, + "step": 5094 + }, + { + "epoch": 0.8250344101692171, + "grad_norm": 22.057931900024414, + "learning_rate": 1.7519430051813474e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.9317004978656769, + "num_tokens": 9128608.0, + "step": 5095 + }, + { + "epoch": 0.8251963403772974, + "grad_norm": 28.360332489013672, + "learning_rate": 1.7503238341968914e-06, + "loss": 0.5578, + "mean_token_accuracy": 0.9209504723548889, + "num_tokens": 9130398.0, + "step": 5096 + }, + { + "epoch": 0.8253582705853777, + "grad_norm": 34.65985107421875, + "learning_rate": 1.7487046632124354e-06, + "loss": 0.642, + "mean_token_accuracy": 0.9083965718746185, + "num_tokens": 9132194.0, + "step": 5097 + }, + { + "epoch": 0.825520200793458, + "grad_norm": 31.360326766967773, + "learning_rate": 1.7470854922279796e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.9146948456764221, + "num_tokens": 9133975.0, + "step": 5098 + }, + { + "epoch": 0.8256821310015383, + "grad_norm": 22.05139923095703, + "learning_rate": 1.7454663212435236e-06, + "loss": 0.5859, + "mean_token_accuracy": 0.9194042086601257, + "num_tokens": 9135760.0, + "step": 5099 + }, + { + "epoch": 0.8258440612096186, + "grad_norm": 43.60239791870117, + "learning_rate": 1.7438471502590676e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.8967578411102295, + "num_tokens": 9137553.0, + "step": 5100 + }, + { + "epoch": 0.8260059914176989, + "grad_norm": 38.74407196044922, + "learning_rate": 1.7422279792746117e-06, + "loss": 0.5897, + "mean_token_accuracy": 0.919247567653656, + "num_tokens": 9139349.0, + "step": 5101 + }, + { + "epoch": 0.8261679216257793, + "grad_norm": 31.658662796020508, + "learning_rate": 1.7406088082901557e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.9111062288284302, + "num_tokens": 9141131.0, + "step": 5102 + }, + { + "epoch": 0.8263298518338597, + "grad_norm": 28.7454833984375, + "learning_rate": 1.7389896373056997e-06, + "loss": 0.5259, + "mean_token_accuracy": 0.9229462146759033, + "num_tokens": 9142928.0, + "step": 5103 + }, + { + "epoch": 0.82649178204194, + "grad_norm": 20.431854248046875, + "learning_rate": 1.7373704663212437e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.9321882426738739, + "num_tokens": 9144720.0, + "step": 5104 + }, + { + "epoch": 0.8266537122500203, + "grad_norm": 19.881669998168945, + "learning_rate": 1.7357512953367877e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.9272982180118561, + "num_tokens": 9146507.0, + "step": 5105 + }, + { + "epoch": 0.8268156424581006, + "grad_norm": 34.45378875732422, + "learning_rate": 1.7341321243523317e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.9045882225036621, + "num_tokens": 9148301.0, + "step": 5106 + }, + { + "epoch": 0.8269775726661809, + "grad_norm": 36.82697677612305, + "learning_rate": 1.7325129533678757e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.9122673273086548, + "num_tokens": 9150098.0, + "step": 5107 + }, + { + "epoch": 0.8271395028742612, + "grad_norm": 26.04399299621582, + "learning_rate": 1.73089378238342e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.9233269393444061, + "num_tokens": 9151884.0, + "step": 5108 + }, + { + "epoch": 0.8273014330823415, + "grad_norm": 25.937597274780273, + "learning_rate": 1.729274611398964e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.9165484607219696, + "num_tokens": 9153672.0, + "step": 5109 + }, + { + "epoch": 0.8274633632904218, + "grad_norm": 39.61464309692383, + "learning_rate": 1.727655440414508e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.9186058044433594, + "num_tokens": 9155468.0, + "step": 5110 + }, + { + "epoch": 0.8276252934985021, + "grad_norm": 36.036861419677734, + "learning_rate": 1.726036269430052e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.9117965996265411, + "num_tokens": 9157265.0, + "step": 5111 + }, + { + "epoch": 0.8277872237065824, + "grad_norm": 35.327327728271484, + "learning_rate": 1.724417098445596e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.9148550927639008, + "num_tokens": 9159047.0, + "step": 5112 + }, + { + "epoch": 0.8279491539146627, + "grad_norm": 17.89873504638672, + "learning_rate": 1.72279792746114e-06, + "loss": 0.5344, + "mean_token_accuracy": 0.9304511249065399, + "num_tokens": 9160832.0, + "step": 5113 + }, + { + "epoch": 0.8281110841227431, + "grad_norm": 38.50513458251953, + "learning_rate": 1.721178756476684e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.9002834260463715, + "num_tokens": 9162635.0, + "step": 5114 + }, + { + "epoch": 0.8282730143308235, + "grad_norm": 30.08908462524414, + "learning_rate": 1.719559585492228e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 9164434.0, + "step": 5115 + }, + { + "epoch": 0.8284349445389038, + "grad_norm": 37.07180404663086, + "learning_rate": 1.717940414507772e-06, + "loss": 0.5952, + "mean_token_accuracy": 0.9064598381519318, + "num_tokens": 9166245.0, + "step": 5116 + }, + { + "epoch": 0.8285968747469841, + "grad_norm": 30.928010940551758, + "learning_rate": 1.7163212435233163e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.9178914129734039, + "num_tokens": 9168037.0, + "step": 5117 + }, + { + "epoch": 0.8287588049550644, + "grad_norm": 28.24135398864746, + "learning_rate": 1.7147020725388603e-06, + "loss": 0.5354, + "mean_token_accuracy": 0.9126707017421722, + "num_tokens": 9169824.0, + "step": 5118 + }, + { + "epoch": 0.8289207351631447, + "grad_norm": 23.04939079284668, + "learning_rate": 1.7130829015544043e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.93058642745018, + "num_tokens": 9171611.0, + "step": 5119 + }, + { + "epoch": 0.829082665371225, + "grad_norm": 27.069425582885742, + "learning_rate": 1.7114637305699483e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.9143135845661163, + "num_tokens": 9173391.0, + "step": 5120 + }, + { + "epoch": 0.8292445955793053, + "grad_norm": 24.500484466552734, + "learning_rate": 1.7098445595854923e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.9274434447288513, + "num_tokens": 9175177.0, + "step": 5121 + }, + { + "epoch": 0.8294065257873856, + "grad_norm": 27.171897888183594, + "learning_rate": 1.7082253886010364e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.9146149754524231, + "num_tokens": 9176970.0, + "step": 5122 + }, + { + "epoch": 0.8295684559954659, + "grad_norm": 22.192378997802734, + "learning_rate": 1.7066062176165804e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.924199789762497, + "num_tokens": 9178759.0, + "step": 5123 + }, + { + "epoch": 0.8297303862035462, + "grad_norm": 30.80027961730957, + "learning_rate": 1.7049870466321244e-06, + "loss": 0.676, + "mean_token_accuracy": 0.9167623519897461, + "num_tokens": 9180548.0, + "step": 5124 + }, + { + "epoch": 0.8298923164116266, + "grad_norm": 20.733373641967773, + "learning_rate": 1.7033678756476684e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.9320779740810394, + "num_tokens": 9182340.0, + "step": 5125 + }, + { + "epoch": 0.830054246619707, + "grad_norm": 31.85842514038086, + "learning_rate": 1.7017487046632124e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.9159872233867645, + "num_tokens": 9184126.0, + "step": 5126 + }, + { + "epoch": 0.8302161768277873, + "grad_norm": 34.65639114379883, + "learning_rate": 1.7001295336787566e-06, + "loss": 0.716, + "mean_token_accuracy": 0.9073200225830078, + "num_tokens": 9185908.0, + "step": 5127 + }, + { + "epoch": 0.8303781070358676, + "grad_norm": 43.541866302490234, + "learning_rate": 1.6985103626943006e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.8877550959587097, + "num_tokens": 9187714.0, + "step": 5128 + }, + { + "epoch": 0.8305400372439479, + "grad_norm": 34.468994140625, + "learning_rate": 1.6968911917098447e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.9151678681373596, + "num_tokens": 9189509.0, + "step": 5129 + }, + { + "epoch": 0.8307019674520282, + "grad_norm": 27.8601016998291, + "learning_rate": 1.6952720207253887e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.9220841825008392, + "num_tokens": 9191302.0, + "step": 5130 + }, + { + "epoch": 0.8308638976601085, + "grad_norm": 23.061010360717773, + "learning_rate": 1.6936528497409327e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.9340918958187103, + "num_tokens": 9193087.0, + "step": 5131 + }, + { + "epoch": 0.8310258278681888, + "grad_norm": 37.76912307739258, + "learning_rate": 1.6920336787564767e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.9142682254314423, + "num_tokens": 9194879.0, + "step": 5132 + }, + { + "epoch": 0.8311877580762691, + "grad_norm": 21.804033279418945, + "learning_rate": 1.6904145077720207e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.9270403385162354, + "num_tokens": 9196666.0, + "step": 5133 + }, + { + "epoch": 0.8313496882843494, + "grad_norm": 17.48240089416504, + "learning_rate": 1.6887953367875647e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.9376190304756165, + "num_tokens": 9198468.0, + "step": 5134 + }, + { + "epoch": 0.8315116184924297, + "grad_norm": 23.484766006469727, + "learning_rate": 1.6871761658031087e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.9239541888237, + "num_tokens": 9200256.0, + "step": 5135 + }, + { + "epoch": 0.8316735487005101, + "grad_norm": 29.64613914489746, + "learning_rate": 1.685556994818653e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.9287356436252594, + "num_tokens": 9202063.0, + "step": 5136 + }, + { + "epoch": 0.8318354789085904, + "grad_norm": 27.142866134643555, + "learning_rate": 1.683937823834197e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.9251237213611603, + "num_tokens": 9203855.0, + "step": 5137 + }, + { + "epoch": 0.8319974091166707, + "grad_norm": 25.860971450805664, + "learning_rate": 1.682318652849741e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.9209773242473602, + "num_tokens": 9205647.0, + "step": 5138 + }, + { + "epoch": 0.8321593393247511, + "grad_norm": 33.77476119995117, + "learning_rate": 1.680699481865285e-06, + "loss": 0.5666, + "mean_token_accuracy": 0.9138603508472443, + "num_tokens": 9207449.0, + "step": 5139 + }, + { + "epoch": 0.8323212695328314, + "grad_norm": 29.214630126953125, + "learning_rate": 1.6790803108808292e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.9210607707500458, + "num_tokens": 9209240.0, + "step": 5140 + }, + { + "epoch": 0.8324831997409117, + "grad_norm": 32.52553176879883, + "learning_rate": 1.6774611398963732e-06, + "loss": 0.6186, + "mean_token_accuracy": 0.9179638922214508, + "num_tokens": 9211030.0, + "step": 5141 + }, + { + "epoch": 0.832645129948992, + "grad_norm": 30.272579193115234, + "learning_rate": 1.6758419689119175e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.9156434237957001, + "num_tokens": 9212814.0, + "step": 5142 + }, + { + "epoch": 0.8328070601570723, + "grad_norm": 23.379135131835938, + "learning_rate": 1.6742227979274615e-06, + "loss": 0.5444, + "mean_token_accuracy": 0.9275362491607666, + "num_tokens": 9214602.0, + "step": 5143 + }, + { + "epoch": 0.8329689903651526, + "grad_norm": 33.608604431152344, + "learning_rate": 1.6726036269430055e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.9148170650005341, + "num_tokens": 9216395.0, + "step": 5144 + }, + { + "epoch": 0.8331309205732329, + "grad_norm": 29.665849685668945, + "learning_rate": 1.6709844559585495e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.9248120188713074, + "num_tokens": 9218199.0, + "step": 5145 + }, + { + "epoch": 0.8332928507813132, + "grad_norm": 25.669754028320312, + "learning_rate": 1.6693652849740935e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.9223214387893677, + "num_tokens": 9219995.0, + "step": 5146 + }, + { + "epoch": 0.8334547809893935, + "grad_norm": 27.32154655456543, + "learning_rate": 1.6677461139896375e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.9213924705982208, + "num_tokens": 9221787.0, + "step": 5147 + }, + { + "epoch": 0.8336167111974739, + "grad_norm": 30.10114097595215, + "learning_rate": 1.6661269430051815e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.9244702756404877, + "num_tokens": 9223575.0, + "step": 5148 + }, + { + "epoch": 0.8337786414055542, + "grad_norm": 30.388446807861328, + "learning_rate": 1.6645077720207256e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.9330845773220062, + "num_tokens": 9225356.0, + "step": 5149 + }, + { + "epoch": 0.8339405716136346, + "grad_norm": 35.90901184082031, + "learning_rate": 1.6628886010362696e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.907696932554245, + "num_tokens": 9227139.0, + "step": 5150 + }, + { + "epoch": 0.8341025018217149, + "grad_norm": 24.924636840820312, + "learning_rate": 1.6612694300518136e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.9301470518112183, + "num_tokens": 9228923.0, + "step": 5151 + }, + { + "epoch": 0.8342644320297952, + "grad_norm": 29.92999839782715, + "learning_rate": 1.6596502590673578e-06, + "loss": 0.5449, + "mean_token_accuracy": 0.9247430562973022, + "num_tokens": 9230714.0, + "step": 5152 + }, + { + "epoch": 0.8344263622378755, + "grad_norm": 32.451297760009766, + "learning_rate": 1.6580310880829018e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.9251377880573273, + "num_tokens": 9232506.0, + "step": 5153 + }, + { + "epoch": 0.8345882924459558, + "grad_norm": 36.252906799316406, + "learning_rate": 1.6564119170984458e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.9087617993354797, + "num_tokens": 9234303.0, + "step": 5154 + }, + { + "epoch": 0.8347502226540361, + "grad_norm": 33.95320510864258, + "learning_rate": 1.6547927461139898e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.9160937964916229, + "num_tokens": 9236089.0, + "step": 5155 + }, + { + "epoch": 0.8349121528621164, + "grad_norm": 36.22539138793945, + "learning_rate": 1.6531735751295339e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.9133562743663788, + "num_tokens": 9237878.0, + "step": 5156 + }, + { + "epoch": 0.8350740830701967, + "grad_norm": 40.29890060424805, + "learning_rate": 1.6515544041450779e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.9049762189388275, + "num_tokens": 9239683.0, + "step": 5157 + }, + { + "epoch": 0.835236013278277, + "grad_norm": 25.09396743774414, + "learning_rate": 1.6499352331606219e-06, + "loss": 0.527, + "mean_token_accuracy": 0.9343563616275787, + "num_tokens": 9241469.0, + "step": 5158 + }, + { + "epoch": 0.8353979434863574, + "grad_norm": 21.579505920410156, + "learning_rate": 1.648316062176166e-06, + "loss": 0.484, + "mean_token_accuracy": 0.9228307008743286, + "num_tokens": 9243266.0, + "step": 5159 + }, + { + "epoch": 0.8355598736944377, + "grad_norm": 29.61525535583496, + "learning_rate": 1.64669689119171e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.9149396419525146, + "num_tokens": 9245060.0, + "step": 5160 + }, + { + "epoch": 0.835721803902518, + "grad_norm": 17.483936309814453, + "learning_rate": 1.6450777202072541e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.9319444596767426, + "num_tokens": 9246851.0, + "step": 5161 + }, + { + "epoch": 0.8358837341105984, + "grad_norm": 34.57814407348633, + "learning_rate": 1.6434585492227982e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.9102630317211151, + "num_tokens": 9248641.0, + "step": 5162 + }, + { + "epoch": 0.8360456643186787, + "grad_norm": 35.656150817871094, + "learning_rate": 1.6418393782383422e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.9138981103897095, + "num_tokens": 9250422.0, + "step": 5163 + }, + { + "epoch": 0.836207594526759, + "grad_norm": 27.35439682006836, + "learning_rate": 1.6402202072538862e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.9219858348369598, + "num_tokens": 9252216.0, + "step": 5164 + }, + { + "epoch": 0.8363695247348393, + "grad_norm": 39.7840576171875, + "learning_rate": 1.6386010362694302e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.908483624458313, + "num_tokens": 9254021.0, + "step": 5165 + }, + { + "epoch": 0.8365314549429196, + "grad_norm": 30.98016929626465, + "learning_rate": 1.6369818652849742e-06, + "loss": 0.5912, + "mean_token_accuracy": 0.9146729707717896, + "num_tokens": 9255803.0, + "step": 5166 + }, + { + "epoch": 0.8366933851509999, + "grad_norm": 43.43905258178711, + "learning_rate": 1.6353626943005182e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.891643226146698, + "num_tokens": 9257592.0, + "step": 5167 + }, + { + "epoch": 0.8368553153590802, + "grad_norm": 24.704357147216797, + "learning_rate": 1.6337435233160622e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.9299019575119019, + "num_tokens": 9259375.0, + "step": 5168 + }, + { + "epoch": 0.8370172455671605, + "grad_norm": 50.093650817871094, + "learning_rate": 1.6321243523316062e-06, + "loss": 0.682, + "mean_token_accuracy": 0.9022791683673859, + "num_tokens": 9261171.0, + "step": 5169 + }, + { + "epoch": 0.8371791757752409, + "grad_norm": 29.147891998291016, + "learning_rate": 1.6305051813471503e-06, + "loss": 0.5846, + "mean_token_accuracy": 0.9264666140079498, + "num_tokens": 9262955.0, + "step": 5170 + }, + { + "epoch": 0.8373411059833212, + "grad_norm": 30.28306007385254, + "learning_rate": 1.6288860103626945e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.9194042086601257, + "num_tokens": 9264740.0, + "step": 5171 + }, + { + "epoch": 0.8375030361914015, + "grad_norm": 28.574092864990234, + "learning_rate": 1.6272668393782385e-06, + "loss": 0.5502, + "mean_token_accuracy": 0.9181795120239258, + "num_tokens": 9266533.0, + "step": 5172 + }, + { + "epoch": 0.8376649663994818, + "grad_norm": 20.729867935180664, + "learning_rate": 1.6256476683937825e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.934440553188324, + "num_tokens": 9268320.0, + "step": 5173 + }, + { + "epoch": 0.8378268966075622, + "grad_norm": 29.25957489013672, + "learning_rate": 1.6240284974093265e-06, + "loss": 0.5372, + "mean_token_accuracy": 0.9268495738506317, + "num_tokens": 9270105.0, + "step": 5174 + }, + { + "epoch": 0.8379888268156425, + "grad_norm": 37.63037109375, + "learning_rate": 1.6224093264248705e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.9069159030914307, + "num_tokens": 9271907.0, + "step": 5175 + }, + { + "epoch": 0.8381507570237228, + "grad_norm": 22.6409854888916, + "learning_rate": 1.6207901554404145e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.9326458871364594, + "num_tokens": 9273701.0, + "step": 5176 + }, + { + "epoch": 0.8383126872318031, + "grad_norm": 26.478984832763672, + "learning_rate": 1.6191709844559586e-06, + "loss": 0.542, + "mean_token_accuracy": 0.9210858643054962, + "num_tokens": 9275489.0, + "step": 5177 + }, + { + "epoch": 0.8384746174398834, + "grad_norm": 29.775672912597656, + "learning_rate": 1.6175518134715026e-06, + "loss": 0.596, + "mean_token_accuracy": 0.9166586995124817, + "num_tokens": 9277290.0, + "step": 5178 + }, + { + "epoch": 0.8386365476479637, + "grad_norm": 32.43235778808594, + "learning_rate": 1.6159326424870466e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.9064536690711975, + "num_tokens": 9279089.0, + "step": 5179 + }, + { + "epoch": 0.838798477856044, + "grad_norm": 40.449771881103516, + "learning_rate": 1.6143134715025908e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.9030129015445709, + "num_tokens": 9280879.0, + "step": 5180 + }, + { + "epoch": 0.8389604080641243, + "grad_norm": 22.111553192138672, + "learning_rate": 1.6126943005181348e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.9339599609375, + "num_tokens": 9282678.0, + "step": 5181 + }, + { + "epoch": 0.8391223382722047, + "grad_norm": 32.93363952636719, + "learning_rate": 1.6110751295336788e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.9233197569847107, + "num_tokens": 9284477.0, + "step": 5182 + }, + { + "epoch": 0.839284268480285, + "grad_norm": 25.053237915039062, + "learning_rate": 1.6094559585492228e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.9318676292896271, + "num_tokens": 9286283.0, + "step": 5183 + }, + { + "epoch": 0.8394461986883653, + "grad_norm": 35.9697380065918, + "learning_rate": 1.6078367875647669e-06, + "loss": 0.639, + "mean_token_accuracy": 0.9150429666042328, + "num_tokens": 9288078.0, + "step": 5184 + }, + { + "epoch": 0.8396081288964456, + "grad_norm": 25.6292724609375, + "learning_rate": 1.6062176165803109e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.9239495694637299, + "num_tokens": 9289866.0, + "step": 5185 + }, + { + "epoch": 0.839770059104526, + "grad_norm": 23.238449096679688, + "learning_rate": 1.6045984455958549e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.9361238181591034, + "num_tokens": 9291659.0, + "step": 5186 + }, + { + "epoch": 0.8399319893126063, + "grad_norm": 21.21307945251465, + "learning_rate": 1.602979274611399e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.9322535693645477, + "num_tokens": 9293450.0, + "step": 5187 + }, + { + "epoch": 0.8400939195206866, + "grad_norm": 30.97864532470703, + "learning_rate": 1.6013601036269433e-06, + "loss": 0.6244, + "mean_token_accuracy": 0.9097330868244171, + "num_tokens": 9295239.0, + "step": 5188 + }, + { + "epoch": 0.8402558497287669, + "grad_norm": 25.986616134643555, + "learning_rate": 1.5997409326424874e-06, + "loss": 0.5671, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 9297018.0, + "step": 5189 + }, + { + "epoch": 0.8404177799368472, + "grad_norm": 36.92472457885742, + "learning_rate": 1.5981217616580314e-06, + "loss": 0.681, + "mean_token_accuracy": 0.9088054895401001, + "num_tokens": 9298816.0, + "step": 5190 + }, + { + "epoch": 0.8405797101449275, + "grad_norm": 38.010169982910156, + "learning_rate": 1.5965025906735754e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.9239118993282318, + "num_tokens": 9300603.0, + "step": 5191 + }, + { + "epoch": 0.8407416403530078, + "grad_norm": 31.113197326660156, + "learning_rate": 1.5948834196891194e-06, + "loss": 0.5856, + "mean_token_accuracy": 0.9186238050460815, + "num_tokens": 9302385.0, + "step": 5192 + }, + { + "epoch": 0.8409035705610882, + "grad_norm": 16.064685821533203, + "learning_rate": 1.5932642487046634e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.9298747181892395, + "num_tokens": 9304168.0, + "step": 5193 + }, + { + "epoch": 0.8410655007691685, + "grad_norm": 21.1790771484375, + "learning_rate": 1.5916450777202074e-06, + "loss": 0.5633, + "mean_token_accuracy": 0.9303885698318481, + "num_tokens": 9305953.0, + "step": 5194 + }, + { + "epoch": 0.8412274309772488, + "grad_norm": 26.771747589111328, + "learning_rate": 1.5900259067357514e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.920152485370636, + "num_tokens": 9307740.0, + "step": 5195 + }, + { + "epoch": 0.8413893611853291, + "grad_norm": 32.2213134765625, + "learning_rate": 1.5884067357512957e-06, + "loss": 0.566, + "mean_token_accuracy": 0.9192460477352142, + "num_tokens": 9309536.0, + "step": 5196 + }, + { + "epoch": 0.8415512913934095, + "grad_norm": 26.875646591186523, + "learning_rate": 1.5867875647668397e-06, + "loss": 0.5918, + "mean_token_accuracy": 0.9231202900409698, + "num_tokens": 9311321.0, + "step": 5197 + }, + { + "epoch": 0.8417132216014898, + "grad_norm": 37.39484405517578, + "learning_rate": 1.5851683937823837e-06, + "loss": 0.575, + "mean_token_accuracy": 0.9157534241676331, + "num_tokens": 9313119.0, + "step": 5198 + }, + { + "epoch": 0.8418751518095701, + "grad_norm": 34.829490661621094, + "learning_rate": 1.5835492227979277e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.9170750975608826, + "num_tokens": 9314908.0, + "step": 5199 + }, + { + "epoch": 0.8420370820176504, + "grad_norm": 33.96983337402344, + "learning_rate": 1.5819300518134717e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.9201591610908508, + "num_tokens": 9316708.0, + "step": 5200 + }, + { + "epoch": 0.8421990122257307, + "grad_norm": 45.317203521728516, + "learning_rate": 1.5803108808290157e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.9023555219173431, + "num_tokens": 9318507.0, + "step": 5201 + }, + { + "epoch": 0.842360942433811, + "grad_norm": 19.666397094726562, + "learning_rate": 1.5786917098445597e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.9231141209602356, + "num_tokens": 9320292.0, + "step": 5202 + }, + { + "epoch": 0.8425228726418913, + "grad_norm": 35.46826934814453, + "learning_rate": 1.5770725388601037e-06, + "loss": 0.6058, + "mean_token_accuracy": 0.9114635288715363, + "num_tokens": 9322087.0, + "step": 5203 + }, + { + "epoch": 0.8426848028499717, + "grad_norm": 17.691511154174805, + "learning_rate": 1.5754533678756478e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.929598331451416, + "num_tokens": 9323869.0, + "step": 5204 + }, + { + "epoch": 0.842846733058052, + "grad_norm": 15.61081314086914, + "learning_rate": 1.573834196891192e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.9430540204048157, + "num_tokens": 9325662.0, + "step": 5205 + }, + { + "epoch": 0.8430086632661323, + "grad_norm": 37.852821350097656, + "learning_rate": 1.572215025906736e-06, + "loss": 0.5868, + "mean_token_accuracy": 0.9144531786441803, + "num_tokens": 9327465.0, + "step": 5206 + }, + { + "epoch": 0.8431705934742126, + "grad_norm": 27.265666961669922, + "learning_rate": 1.57059585492228e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.9189557433128357, + "num_tokens": 9329248.0, + "step": 5207 + }, + { + "epoch": 0.843332523682293, + "grad_norm": 35.064300537109375, + "learning_rate": 1.568976683937824e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.9145896732807159, + "num_tokens": 9331041.0, + "step": 5208 + }, + { + "epoch": 0.8434944538903733, + "grad_norm": 20.65400505065918, + "learning_rate": 1.567357512953368e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.9275849461555481, + "num_tokens": 9332829.0, + "step": 5209 + }, + { + "epoch": 0.8436563840984536, + "grad_norm": 33.832725524902344, + "learning_rate": 1.565738341968912e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.9134169816970825, + "num_tokens": 9334629.0, + "step": 5210 + }, + { + "epoch": 0.8438183143065339, + "grad_norm": 19.647703170776367, + "learning_rate": 1.564119170984456e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.9269837737083435, + "num_tokens": 9336414.0, + "step": 5211 + }, + { + "epoch": 0.8439802445146142, + "grad_norm": 36.529197692871094, + "learning_rate": 1.5625e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.9046049118041992, + "num_tokens": 9338209.0, + "step": 5212 + }, + { + "epoch": 0.8441421747226945, + "grad_norm": 32.02098083496094, + "learning_rate": 1.560880829015544e-06, + "loss": 0.742, + "mean_token_accuracy": 0.9062597751617432, + "num_tokens": 9339998.0, + "step": 5213 + }, + { + "epoch": 0.8443041049307748, + "grad_norm": 25.5738525390625, + "learning_rate": 1.559261658031088e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.9282407462596893, + "num_tokens": 9341789.0, + "step": 5214 + }, + { + "epoch": 0.8444660351388552, + "grad_norm": 30.551128387451172, + "learning_rate": 1.5576424870466323e-06, + "loss": 0.5717, + "mean_token_accuracy": 0.9190176129341125, + "num_tokens": 9343584.0, + "step": 5215 + }, + { + "epoch": 0.8446279653469355, + "grad_norm": 23.832063674926758, + "learning_rate": 1.5560233160621763e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.927143782377243, + "num_tokens": 9345369.0, + "step": 5216 + }, + { + "epoch": 0.8447898955550158, + "grad_norm": 25.7690486907959, + "learning_rate": 1.5544041450777204e-06, + "loss": 0.5562, + "mean_token_accuracy": 0.9275223016738892, + "num_tokens": 9347143.0, + "step": 5217 + }, + { + "epoch": 0.8449518257630961, + "grad_norm": 28.362577438354492, + "learning_rate": 1.5527849740932644e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.923882782459259, + "num_tokens": 9348931.0, + "step": 5218 + }, + { + "epoch": 0.8451137559711764, + "grad_norm": 22.581287384033203, + "learning_rate": 1.5511658031088084e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.9196909368038177, + "num_tokens": 9350717.0, + "step": 5219 + }, + { + "epoch": 0.8452756861792567, + "grad_norm": 36.679969787597656, + "learning_rate": 1.5495466321243524e-06, + "loss": 0.6566, + "mean_token_accuracy": 0.9045893847942352, + "num_tokens": 9352511.0, + "step": 5220 + }, + { + "epoch": 0.845437616387337, + "grad_norm": 23.944931030273438, + "learning_rate": 1.5479274611398964e-06, + "loss": 0.5596, + "mean_token_accuracy": 0.9231429696083069, + "num_tokens": 9354296.0, + "step": 5221 + }, + { + "epoch": 0.8455995465954174, + "grad_norm": 32.104888916015625, + "learning_rate": 1.5463082901554404e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.9145392179489136, + "num_tokens": 9356086.0, + "step": 5222 + }, + { + "epoch": 0.8457614768034977, + "grad_norm": 25.463022232055664, + "learning_rate": 1.5446891191709844e-06, + "loss": 0.5227, + "mean_token_accuracy": 0.9313608109951019, + "num_tokens": 9357875.0, + "step": 5223 + }, + { + "epoch": 0.845923407011578, + "grad_norm": 22.11979866027832, + "learning_rate": 1.5430699481865287e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.924761027097702, + "num_tokens": 9359666.0, + "step": 5224 + }, + { + "epoch": 0.8460853372196583, + "grad_norm": 19.9564151763916, + "learning_rate": 1.5414507772020727e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.9281638562679291, + "num_tokens": 9361457.0, + "step": 5225 + }, + { + "epoch": 0.8462472674277386, + "grad_norm": 31.90955352783203, + "learning_rate": 1.5398316062176167e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.9158540070056915, + "num_tokens": 9363254.0, + "step": 5226 + }, + { + "epoch": 0.846409197635819, + "grad_norm": 33.45148468017578, + "learning_rate": 1.5382124352331607e-06, + "loss": 0.5661, + "mean_token_accuracy": 0.9033444821834564, + "num_tokens": 9365034.0, + "step": 5227 + }, + { + "epoch": 0.8465711278438993, + "grad_norm": 17.83403968811035, + "learning_rate": 1.5365932642487047e-06, + "loss": 0.463, + "mean_token_accuracy": 0.9355916678905487, + "num_tokens": 9366825.0, + "step": 5228 + }, + { + "epoch": 0.8467330580519796, + "grad_norm": 23.559110641479492, + "learning_rate": 1.5349740932642487e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.9296627044677734, + "num_tokens": 9368621.0, + "step": 5229 + }, + { + "epoch": 0.8468949882600599, + "grad_norm": 26.534774780273438, + "learning_rate": 1.5333549222797927e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.9227233529090881, + "num_tokens": 9370405.0, + "step": 5230 + }, + { + "epoch": 0.8470569184681402, + "grad_norm": 23.205982208251953, + "learning_rate": 1.5317357512953367e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.9272717833518982, + "num_tokens": 9372192.0, + "step": 5231 + }, + { + "epoch": 0.8472188486762205, + "grad_norm": 20.43933868408203, + "learning_rate": 1.5301165803108808e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.931197464466095, + "num_tokens": 9373980.0, + "step": 5232 + }, + { + "epoch": 0.8473807788843009, + "grad_norm": 24.96548080444336, + "learning_rate": 1.5284974093264248e-06, + "loss": 0.5504, + "mean_token_accuracy": 0.924717366695404, + "num_tokens": 9375771.0, + "step": 5233 + }, + { + "epoch": 0.8475427090923812, + "grad_norm": 23.594953536987305, + "learning_rate": 1.526878238341969e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9256001710891724, + "num_tokens": 9377552.0, + "step": 5234 + }, + { + "epoch": 0.8477046393004615, + "grad_norm": 42.07714080810547, + "learning_rate": 1.525259067357513e-06, + "loss": 0.804, + "mean_token_accuracy": 0.9071239233016968, + "num_tokens": 9379344.0, + "step": 5235 + }, + { + "epoch": 0.8478665695085418, + "grad_norm": 14.972844123840332, + "learning_rate": 1.523639896373057e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.9334944188594818, + "num_tokens": 9381126.0, + "step": 5236 + }, + { + "epoch": 0.8480284997166221, + "grad_norm": 28.37084197998047, + "learning_rate": 1.5220207253886012e-06, + "loss": 0.5771, + "mean_token_accuracy": 0.9172877967357635, + "num_tokens": 9382916.0, + "step": 5237 + }, + { + "epoch": 0.8481904299247025, + "grad_norm": 30.509963989257812, + "learning_rate": 1.5204015544041453e-06, + "loss": 0.5485, + "mean_token_accuracy": 0.9153555035591125, + "num_tokens": 9384700.0, + "step": 5238 + }, + { + "epoch": 0.8483523601327828, + "grad_norm": 31.961938858032227, + "learning_rate": 1.5187823834196893e-06, + "loss": 0.5363, + "mean_token_accuracy": 0.9149473309516907, + "num_tokens": 9386482.0, + "step": 5239 + }, + { + "epoch": 0.8485142903408631, + "grad_norm": 34.837135314941406, + "learning_rate": 1.5171632124352335e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.9156671762466431, + "num_tokens": 9388277.0, + "step": 5240 + }, + { + "epoch": 0.8486762205489434, + "grad_norm": 41.47296142578125, + "learning_rate": 1.5155440414507775e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.8975524306297302, + "num_tokens": 9390072.0, + "step": 5241 + }, + { + "epoch": 0.8488381507570237, + "grad_norm": 14.563606262207031, + "learning_rate": 1.5139248704663215e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.9341780245304108, + "num_tokens": 9391857.0, + "step": 5242 + }, + { + "epoch": 0.849000080965104, + "grad_norm": 17.507963180541992, + "learning_rate": 1.5123056994818655e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.9395670592784882, + "num_tokens": 9393651.0, + "step": 5243 + }, + { + "epoch": 0.8491620111731844, + "grad_norm": 20.93804931640625, + "learning_rate": 1.5106865284974096e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.9236750304698944, + "num_tokens": 9395438.0, + "step": 5244 + }, + { + "epoch": 0.8493239413812647, + "grad_norm": 38.6978759765625, + "learning_rate": 1.5090673575129536e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.909768134355545, + "num_tokens": 9397239.0, + "step": 5245 + }, + { + "epoch": 0.849485871589345, + "grad_norm": 22.95217514038086, + "learning_rate": 1.5074481865284976e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.9285568594932556, + "num_tokens": 9399031.0, + "step": 5246 + }, + { + "epoch": 0.8496478017974253, + "grad_norm": 29.574277877807617, + "learning_rate": 1.5058290155440416e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.9162346720695496, + "num_tokens": 9400818.0, + "step": 5247 + }, + { + "epoch": 0.8498097320055056, + "grad_norm": 35.608184814453125, + "learning_rate": 1.5042098445595856e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.9149899184703827, + "num_tokens": 9402612.0, + "step": 5248 + }, + { + "epoch": 0.849971662213586, + "grad_norm": 38.882118225097656, + "learning_rate": 1.5025906735751296e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.9241134822368622, + "num_tokens": 9404400.0, + "step": 5249 + }, + { + "epoch": 0.8501335924216663, + "grad_norm": 19.722766876220703, + "learning_rate": 1.5009715025906738e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.9305421710014343, + "num_tokens": 9406200.0, + "step": 5250 + }, + { + "epoch": 0.8502955226297466, + "grad_norm": 18.36075210571289, + "learning_rate": 1.4993523316062179e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.934779167175293, + "num_tokens": 9407988.0, + "step": 5251 + }, + { + "epoch": 0.8504574528378269, + "grad_norm": 49.62141418457031, + "learning_rate": 1.4977331606217619e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.9069773554801941, + "num_tokens": 9409777.0, + "step": 5252 + }, + { + "epoch": 0.8506193830459072, + "grad_norm": 35.11046600341797, + "learning_rate": 1.4961139896373059e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.9067688584327698, + "num_tokens": 9411569.0, + "step": 5253 + }, + { + "epoch": 0.8507813132539875, + "grad_norm": 22.82764434814453, + "learning_rate": 1.49449481865285e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9363758862018585, + "num_tokens": 9413364.0, + "step": 5254 + }, + { + "epoch": 0.8509432434620678, + "grad_norm": 37.2925910949707, + "learning_rate": 1.492875647668394e-06, + "loss": 0.6002, + "mean_token_accuracy": 0.9128788113594055, + "num_tokens": 9415151.0, + "step": 5255 + }, + { + "epoch": 0.8511051736701482, + "grad_norm": 32.3619384765625, + "learning_rate": 1.491256476683938e-06, + "loss": 0.572, + "mean_token_accuracy": 0.9295774698257446, + "num_tokens": 9416947.0, + "step": 5256 + }, + { + "epoch": 0.8512671038782285, + "grad_norm": 36.41777420043945, + "learning_rate": 1.489637305699482e-06, + "loss": 0.577, + "mean_token_accuracy": 0.9193286001682281, + "num_tokens": 9418744.0, + "step": 5257 + }, + { + "epoch": 0.8514290340863088, + "grad_norm": 29.06987762451172, + "learning_rate": 1.488018134715026e-06, + "loss": 0.5833, + "mean_token_accuracy": 0.9214285910129547, + "num_tokens": 9420536.0, + "step": 5258 + }, + { + "epoch": 0.8515909642943891, + "grad_norm": 17.234357833862305, + "learning_rate": 1.4863989637305702e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.93778195977211, + "num_tokens": 9422321.0, + "step": 5259 + }, + { + "epoch": 0.8517528945024694, + "grad_norm": 22.571989059448242, + "learning_rate": 1.4847797927461142e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.936292290687561, + "num_tokens": 9424115.0, + "step": 5260 + }, + { + "epoch": 0.8519148247105498, + "grad_norm": 24.668811798095703, + "learning_rate": 1.4831606217616582e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.9315451383590698, + "num_tokens": 9425905.0, + "step": 5261 + }, + { + "epoch": 0.8520767549186301, + "grad_norm": 29.632442474365234, + "learning_rate": 1.4815414507772022e-06, + "loss": 0.5122, + "mean_token_accuracy": 0.9184397161006927, + "num_tokens": 9427699.0, + "step": 5262 + }, + { + "epoch": 0.8522386851267104, + "grad_norm": 37.060890197753906, + "learning_rate": 1.4799222797927462e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.9139516949653625, + "num_tokens": 9429490.0, + "step": 5263 + }, + { + "epoch": 0.8524006153347907, + "grad_norm": 32.838233947753906, + "learning_rate": 1.4783031088082902e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.8965953886508942, + "num_tokens": 9431283.0, + "step": 5264 + }, + { + "epoch": 0.852562545542871, + "grad_norm": 23.7088623046875, + "learning_rate": 1.4766839378238342e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.9229343831539154, + "num_tokens": 9433066.0, + "step": 5265 + }, + { + "epoch": 0.8527244757509513, + "grad_norm": 35.90534210205078, + "learning_rate": 1.4750647668393783e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.9177428483963013, + "num_tokens": 9434858.0, + "step": 5266 + }, + { + "epoch": 0.8528864059590316, + "grad_norm": 45.42204284667969, + "learning_rate": 1.4734455958549223e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.9087147414684296, + "num_tokens": 9436652.0, + "step": 5267 + }, + { + "epoch": 0.853048336167112, + "grad_norm": 41.98497009277344, + "learning_rate": 1.4718264248704665e-06, + "loss": 0.5543, + "mean_token_accuracy": 0.9231898188591003, + "num_tokens": 9438450.0, + "step": 5268 + }, + { + "epoch": 0.8532102663751923, + "grad_norm": 29.44783592224121, + "learning_rate": 1.4702072538860105e-06, + "loss": 0.6051, + "mean_token_accuracy": 0.9165343940258026, + "num_tokens": 9440237.0, + "step": 5269 + }, + { + "epoch": 0.8533721965832726, + "grad_norm": 27.19260025024414, + "learning_rate": 1.4685880829015545e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.9265811145305634, + "num_tokens": 9442022.0, + "step": 5270 + }, + { + "epoch": 0.8535341267913529, + "grad_norm": 36.47469711303711, + "learning_rate": 1.4669689119170985e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.9087591171264648, + "num_tokens": 9443808.0, + "step": 5271 + }, + { + "epoch": 0.8536960569994333, + "grad_norm": 29.44383430480957, + "learning_rate": 1.4653497409326426e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.9209109842777252, + "num_tokens": 9445598.0, + "step": 5272 + }, + { + "epoch": 0.8538579872075136, + "grad_norm": 41.81792449951172, + "learning_rate": 1.4637305699481866e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.9034347832202911, + "num_tokens": 9447389.0, + "step": 5273 + }, + { + "epoch": 0.8540199174155939, + "grad_norm": 38.32749938964844, + "learning_rate": 1.4621113989637306e-06, + "loss": 0.5778, + "mean_token_accuracy": 0.9225809872150421, + "num_tokens": 9449185.0, + "step": 5274 + }, + { + "epoch": 0.8541818476236742, + "grad_norm": 27.269559860229492, + "learning_rate": 1.4604922279792746e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.9247555434703827, + "num_tokens": 9450976.0, + "step": 5275 + }, + { + "epoch": 0.8543437778317545, + "grad_norm": 27.9717960357666, + "learning_rate": 1.4588730569948186e-06, + "loss": 0.5513, + "mean_token_accuracy": 0.9266248941421509, + "num_tokens": 9452775.0, + "step": 5276 + }, + { + "epoch": 0.8545057080398348, + "grad_norm": 18.759340286254883, + "learning_rate": 1.4572538860103626e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.9354764223098755, + "num_tokens": 9454566.0, + "step": 5277 + }, + { + "epoch": 0.8546676382479151, + "grad_norm": 34.24370193481445, + "learning_rate": 1.4556347150259068e-06, + "loss": 0.579, + "mean_token_accuracy": 0.9276260435581207, + "num_tokens": 9456354.0, + "step": 5278 + }, + { + "epoch": 0.8548295684559954, + "grad_norm": 50.75994110107422, + "learning_rate": 1.4540155440414509e-06, + "loss": 0.666, + "mean_token_accuracy": 0.9055653214454651, + "num_tokens": 9458152.0, + "step": 5279 + }, + { + "epoch": 0.8549914986640758, + "grad_norm": 27.73817253112793, + "learning_rate": 1.4523963730569949e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.9202228486537933, + "num_tokens": 9459940.0, + "step": 5280 + }, + { + "epoch": 0.8551534288721561, + "grad_norm": 30.87932777404785, + "learning_rate": 1.4507772020725389e-06, + "loss": 0.5533, + "mean_token_accuracy": 0.9202381074428558, + "num_tokens": 9461727.0, + "step": 5281 + }, + { + "epoch": 0.8553153590802364, + "grad_norm": 22.789653778076172, + "learning_rate": 1.449158031088083e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.933610588312149, + "num_tokens": 9463525.0, + "step": 5282 + }, + { + "epoch": 0.8554772892883168, + "grad_norm": 36.28479766845703, + "learning_rate": 1.447538860103627e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.9053024351596832, + "num_tokens": 9465311.0, + "step": 5283 + }, + { + "epoch": 0.8556392194963971, + "grad_norm": 36.82135009765625, + "learning_rate": 1.445919689119171e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.902877688407898, + "num_tokens": 9467101.0, + "step": 5284 + }, + { + "epoch": 0.8558011497044774, + "grad_norm": 34.84588623046875, + "learning_rate": 1.4443005181347154e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.9129759073257446, + "num_tokens": 9468900.0, + "step": 5285 + }, + { + "epoch": 0.8559630799125577, + "grad_norm": 33.86688995361328, + "learning_rate": 1.4426813471502594e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.9186871349811554, + "num_tokens": 9470682.0, + "step": 5286 + }, + { + "epoch": 0.856125010120638, + "grad_norm": 43.037452697753906, + "learning_rate": 1.4410621761658034e-06, + "loss": 0.5541, + "mean_token_accuracy": 0.9121672511100769, + "num_tokens": 9472467.0, + "step": 5287 + }, + { + "epoch": 0.8562869403287183, + "grad_norm": 26.84910774230957, + "learning_rate": 1.4394430051813474e-06, + "loss": 0.5614, + "mean_token_accuracy": 0.9165966212749481, + "num_tokens": 9474255.0, + "step": 5288 + }, + { + "epoch": 0.8564488705367986, + "grad_norm": 35.926666259765625, + "learning_rate": 1.4378238341968914e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.9180253744125366, + "num_tokens": 9476049.0, + "step": 5289 + }, + { + "epoch": 0.8566108007448789, + "grad_norm": 24.452714920043945, + "learning_rate": 1.4362046632124354e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.9376903474330902, + "num_tokens": 9477849.0, + "step": 5290 + }, + { + "epoch": 0.8567727309529592, + "grad_norm": 30.211950302124023, + "learning_rate": 1.4345854922279794e-06, + "loss": 0.5629, + "mean_token_accuracy": 0.923116147518158, + "num_tokens": 9479634.0, + "step": 5291 + }, + { + "epoch": 0.8569346611610396, + "grad_norm": 33.4118537902832, + "learning_rate": 1.4329663212435234e-06, + "loss": 0.5949, + "mean_token_accuracy": 0.9261437952518463, + "num_tokens": 9481417.0, + "step": 5292 + }, + { + "epoch": 0.8570965913691199, + "grad_norm": 32.84480285644531, + "learning_rate": 1.4313471502590675e-06, + "loss": 0.597, + "mean_token_accuracy": 0.9277969002723694, + "num_tokens": 9483206.0, + "step": 5293 + }, + { + "epoch": 0.8572585215772002, + "grad_norm": 27.25927734375, + "learning_rate": 1.4297279792746117e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.9275445640087128, + "num_tokens": 9485007.0, + "step": 5294 + }, + { + "epoch": 0.8574204517852806, + "grad_norm": 18.003103256225586, + "learning_rate": 1.4281088082901557e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.931636780500412, + "num_tokens": 9486782.0, + "step": 5295 + }, + { + "epoch": 0.8575823819933609, + "grad_norm": 44.89909744262695, + "learning_rate": 1.4264896373056997e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.9070360660552979, + "num_tokens": 9488574.0, + "step": 5296 + }, + { + "epoch": 0.8577443122014412, + "grad_norm": 36.857364654541016, + "learning_rate": 1.4248704663212437e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.9054232835769653, + "num_tokens": 9490361.0, + "step": 5297 + }, + { + "epoch": 0.8579062424095215, + "grad_norm": 29.76721954345703, + "learning_rate": 1.4232512953367877e-06, + "loss": 0.5381, + "mean_token_accuracy": 0.9216417968273163, + "num_tokens": 9492141.0, + "step": 5298 + }, + { + "epoch": 0.8580681726176018, + "grad_norm": 31.824663162231445, + "learning_rate": 1.4216321243523318e-06, + "loss": 0.5538, + "mean_token_accuracy": 0.9156932830810547, + "num_tokens": 9493925.0, + "step": 5299 + }, + { + "epoch": 0.8582301028256821, + "grad_norm": 39.13085174560547, + "learning_rate": 1.4200129533678758e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9127168655395508, + "num_tokens": 9495712.0, + "step": 5300 + }, + { + "epoch": 0.8583920330337624, + "grad_norm": 24.199796676635742, + "learning_rate": 1.4183937823834198e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.936141312122345, + "num_tokens": 9497506.0, + "step": 5301 + }, + { + "epoch": 0.8585539632418427, + "grad_norm": 36.09194564819336, + "learning_rate": 1.4167746113989638e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.8959660828113556, + "num_tokens": 9499297.0, + "step": 5302 + }, + { + "epoch": 0.858715893449923, + "grad_norm": 26.725933074951172, + "learning_rate": 1.415155440414508e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.9176878929138184, + "num_tokens": 9501089.0, + "step": 5303 + }, + { + "epoch": 0.8588778236580034, + "grad_norm": 24.072927474975586, + "learning_rate": 1.413536269430052e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.9240499436855316, + "num_tokens": 9502876.0, + "step": 5304 + }, + { + "epoch": 0.8590397538660837, + "grad_norm": 23.584243774414062, + "learning_rate": 1.411917098445596e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9252917170524597, + "num_tokens": 9504669.0, + "step": 5305 + }, + { + "epoch": 0.8592016840741641, + "grad_norm": 27.123111724853516, + "learning_rate": 1.41029792746114e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.925000011920929, + "num_tokens": 9506461.0, + "step": 5306 + }, + { + "epoch": 0.8593636142822444, + "grad_norm": 30.561246871948242, + "learning_rate": 1.408678756476684e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.9201650023460388, + "num_tokens": 9508250.0, + "step": 5307 + }, + { + "epoch": 0.8595255444903247, + "grad_norm": 39.82710647583008, + "learning_rate": 1.407059585492228e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.913473516702652, + "num_tokens": 9510040.0, + "step": 5308 + }, + { + "epoch": 0.859687474698405, + "grad_norm": 47.155033111572266, + "learning_rate": 1.405440414507772e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.887859046459198, + "num_tokens": 9511828.0, + "step": 5309 + }, + { + "epoch": 0.8598494049064853, + "grad_norm": 42.68217468261719, + "learning_rate": 1.4038212435233161e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.9014605581760406, + "num_tokens": 9513623.0, + "step": 5310 + }, + { + "epoch": 0.8600113351145656, + "grad_norm": 38.64399337768555, + "learning_rate": 1.4022020725388601e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.9169971942901611, + "num_tokens": 9515412.0, + "step": 5311 + }, + { + "epoch": 0.8601732653226459, + "grad_norm": 30.379274368286133, + "learning_rate": 1.4005829015544041e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.9295739829540253, + "num_tokens": 9517208.0, + "step": 5312 + }, + { + "epoch": 0.8603351955307262, + "grad_norm": 38.085323333740234, + "learning_rate": 1.3989637305699484e-06, + "loss": 0.5439, + "mean_token_accuracy": 0.9214891493320465, + "num_tokens": 9519013.0, + "step": 5313 + }, + { + "epoch": 0.8604971257388065, + "grad_norm": 42.7342643737793, + "learning_rate": 1.3973445595854924e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.9008310735225677, + "num_tokens": 9520806.0, + "step": 5314 + }, + { + "epoch": 0.8606590559468869, + "grad_norm": 32.91374588012695, + "learning_rate": 1.3957253886010364e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.9209633767604828, + "num_tokens": 9522606.0, + "step": 5315 + }, + { + "epoch": 0.8608209861549672, + "grad_norm": 24.890933990478516, + "learning_rate": 1.3941062176165804e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.9266248941421509, + "num_tokens": 9524405.0, + "step": 5316 + }, + { + "epoch": 0.8609829163630476, + "grad_norm": 35.44380569458008, + "learning_rate": 1.3924870466321244e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.9170057475566864, + "num_tokens": 9526194.0, + "step": 5317 + }, + { + "epoch": 0.8611448465711279, + "grad_norm": 36.12889862060547, + "learning_rate": 1.3908678756476684e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 9527986.0, + "step": 5318 + }, + { + "epoch": 0.8613067767792082, + "grad_norm": 36.112369537353516, + "learning_rate": 1.3892487046632124e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.9140466749668121, + "num_tokens": 9529779.0, + "step": 5319 + }, + { + "epoch": 0.8614687069872885, + "grad_norm": 27.888492584228516, + "learning_rate": 1.3876295336787565e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.9170750975608826, + "num_tokens": 9531568.0, + "step": 5320 + }, + { + "epoch": 0.8616306371953688, + "grad_norm": 34.0111198425293, + "learning_rate": 1.3860103626943005e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.9171499609947205, + "num_tokens": 9533358.0, + "step": 5321 + }, + { + "epoch": 0.8617925674034491, + "grad_norm": 38.50695037841797, + "learning_rate": 1.3843911917098447e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.9128755629062653, + "num_tokens": 9535145.0, + "step": 5322 + }, + { + "epoch": 0.8619544976115294, + "grad_norm": 27.167665481567383, + "learning_rate": 1.3827720207253887e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.9292457401752472, + "num_tokens": 9536940.0, + "step": 5323 + }, + { + "epoch": 0.8621164278196097, + "grad_norm": 27.86199951171875, + "learning_rate": 1.3811528497409327e-06, + "loss": 0.5591, + "mean_token_accuracy": 0.9208633303642273, + "num_tokens": 9538730.0, + "step": 5324 + }, + { + "epoch": 0.86227835802769, + "grad_norm": 24.35065460205078, + "learning_rate": 1.3795336787564767e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.9201333820819855, + "num_tokens": 9540517.0, + "step": 5325 + }, + { + "epoch": 0.8624402882357703, + "grad_norm": 37.47370529174805, + "learning_rate": 1.3779145077720207e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.9202331602573395, + "num_tokens": 9542305.0, + "step": 5326 + }, + { + "epoch": 0.8626022184438507, + "grad_norm": 31.073057174682617, + "learning_rate": 1.3762953367875648e-06, + "loss": 0.5546, + "mean_token_accuracy": 0.918313592672348, + "num_tokens": 9544098.0, + "step": 5327 + }, + { + "epoch": 0.8627641486519311, + "grad_norm": 30.671916961669922, + "learning_rate": 1.3746761658031088e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.9140793681144714, + "num_tokens": 9545889.0, + "step": 5328 + }, + { + "epoch": 0.8629260788600114, + "grad_norm": 22.734460830688477, + "learning_rate": 1.3730569948186528e-06, + "loss": 0.592, + "mean_token_accuracy": 0.9173451066017151, + "num_tokens": 9547667.0, + "step": 5329 + }, + { + "epoch": 0.8630880090680917, + "grad_norm": 25.46052360534668, + "learning_rate": 1.3714378238341968e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.9366315007209778, + "num_tokens": 9549464.0, + "step": 5330 + }, + { + "epoch": 0.863249939276172, + "grad_norm": 24.857929229736328, + "learning_rate": 1.369818652849741e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.919113278388977, + "num_tokens": 9551248.0, + "step": 5331 + }, + { + "epoch": 0.8634118694842523, + "grad_norm": 29.277265548706055, + "learning_rate": 1.368199481865285e-06, + "loss": 0.5568, + "mean_token_accuracy": 0.9205766022205353, + "num_tokens": 9553037.0, + "step": 5332 + }, + { + "epoch": 0.8635737996923326, + "grad_norm": 33.98667526245117, + "learning_rate": 1.3665803108808293e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.9170559346675873, + "num_tokens": 9554814.0, + "step": 5333 + }, + { + "epoch": 0.8637357299004129, + "grad_norm": 24.756214141845703, + "learning_rate": 1.3649611398963733e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.9255319237709045, + "num_tokens": 9556608.0, + "step": 5334 + }, + { + "epoch": 0.8638976601084932, + "grad_norm": 28.062997817993164, + "learning_rate": 1.3633419689119173e-06, + "loss": 0.537, + "mean_token_accuracy": 0.9251798987388611, + "num_tokens": 9558401.0, + "step": 5335 + }, + { + "epoch": 0.8640595903165735, + "grad_norm": 36.446346282958984, + "learning_rate": 1.3617227979274613e-06, + "loss": 0.503, + "mean_token_accuracy": 0.9258498251438141, + "num_tokens": 9560194.0, + "step": 5336 + }, + { + "epoch": 0.8642215205246538, + "grad_norm": 27.742956161499023, + "learning_rate": 1.3601036269430053e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.9208531081676483, + "num_tokens": 9561985.0, + "step": 5337 + }, + { + "epoch": 0.8643834507327341, + "grad_norm": 33.15011215209961, + "learning_rate": 1.3584844559585495e-06, + "loss": 0.566, + "mean_token_accuracy": 0.9153173565864563, + "num_tokens": 9563780.0, + "step": 5338 + }, + { + "epoch": 0.8645453809408145, + "grad_norm": 43.914527893066406, + "learning_rate": 1.3568652849740935e-06, + "loss": 0.829, + "mean_token_accuracy": 0.8969465494155884, + "num_tokens": 9565554.0, + "step": 5339 + }, + { + "epoch": 0.8647073111488949, + "grad_norm": 33.37308883666992, + "learning_rate": 1.3552461139896376e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.9087617993354797, + "num_tokens": 9567351.0, + "step": 5340 + }, + { + "epoch": 0.8648692413569752, + "grad_norm": 27.9453067779541, + "learning_rate": 1.3536269430051816e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.928024023771286, + "num_tokens": 9569141.0, + "step": 5341 + }, + { + "epoch": 0.8650311715650555, + "grad_norm": 35.81235122680664, + "learning_rate": 1.3520077720207256e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.9141156673431396, + "num_tokens": 9570944.0, + "step": 5342 + }, + { + "epoch": 0.8651931017731358, + "grad_norm": 28.17653465270996, + "learning_rate": 1.3503886010362696e-06, + "loss": 0.5776, + "mean_token_accuracy": 0.9258343279361725, + "num_tokens": 9572739.0, + "step": 5343 + }, + { + "epoch": 0.8653550319812161, + "grad_norm": 26.94464874267578, + "learning_rate": 1.3487694300518136e-06, + "loss": 0.572, + "mean_token_accuracy": 0.9279641211032867, + "num_tokens": 9574544.0, + "step": 5344 + }, + { + "epoch": 0.8655169621892964, + "grad_norm": 33.66481018066406, + "learning_rate": 1.3471502590673576e-06, + "loss": 0.5675, + "mean_token_accuracy": 0.9205682873725891, + "num_tokens": 9576333.0, + "step": 5345 + }, + { + "epoch": 0.8656788923973767, + "grad_norm": 17.2431697845459, + "learning_rate": 1.3455310880829016e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.9284547865390778, + "num_tokens": 9578125.0, + "step": 5346 + }, + { + "epoch": 0.865840822605457, + "grad_norm": 25.35887908935547, + "learning_rate": 1.3439119170984459e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.9295460283756256, + "num_tokens": 9579921.0, + "step": 5347 + }, + { + "epoch": 0.8660027528135373, + "grad_norm": 26.909486770629883, + "learning_rate": 1.3422927461139899e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.9326230585575104, + "num_tokens": 9581716.0, + "step": 5348 + }, + { + "epoch": 0.8661646830216176, + "grad_norm": 31.3704833984375, + "learning_rate": 1.3406735751295339e-06, + "loss": 0.583, + "mean_token_accuracy": 0.9169946312904358, + "num_tokens": 9583506.0, + "step": 5349 + }, + { + "epoch": 0.866326613229698, + "grad_norm": 20.884910583496094, + "learning_rate": 1.339054404145078e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.9309569299221039, + "num_tokens": 9585293.0, + "step": 5350 + }, + { + "epoch": 0.8664885434377784, + "grad_norm": 35.53430938720703, + "learning_rate": 1.337435233160622e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 9587080.0, + "step": 5351 + }, + { + "epoch": 0.8666504736458587, + "grad_norm": 37.16301345825195, + "learning_rate": 1.335816062176166e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.9137637317180634, + "num_tokens": 9588883.0, + "step": 5352 + }, + { + "epoch": 0.866812403853939, + "grad_norm": 35.00413131713867, + "learning_rate": 1.33419689119171e-06, + "loss": 0.5561, + "mean_token_accuracy": 0.9220143854618073, + "num_tokens": 9590676.0, + "step": 5353 + }, + { + "epoch": 0.8669743340620193, + "grad_norm": 25.990755081176758, + "learning_rate": 1.332577720207254e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.9266828000545502, + "num_tokens": 9592473.0, + "step": 5354 + }, + { + "epoch": 0.8671362642700996, + "grad_norm": 33.43516159057617, + "learning_rate": 1.330958549222798e-06, + "loss": 0.6081, + "mean_token_accuracy": 0.9130252003669739, + "num_tokens": 9594261.0, + "step": 5355 + }, + { + "epoch": 0.8672981944781799, + "grad_norm": 28.51774024963379, + "learning_rate": 1.329339378238342e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.9271240532398224, + "num_tokens": 9596061.0, + "step": 5356 + }, + { + "epoch": 0.8674601246862602, + "grad_norm": 31.21385383605957, + "learning_rate": 1.3277202072538862e-06, + "loss": 0.5907, + "mean_token_accuracy": 0.9199481308460236, + "num_tokens": 9597848.0, + "step": 5357 + }, + { + "epoch": 0.8676220548943405, + "grad_norm": 21.149391174316406, + "learning_rate": 1.3261010362694302e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.9268495738506317, + "num_tokens": 9599633.0, + "step": 5358 + }, + { + "epoch": 0.8677839851024208, + "grad_norm": 25.976675033569336, + "learning_rate": 1.3244818652849742e-06, + "loss": 0.554, + "mean_token_accuracy": 0.9257242381572723, + "num_tokens": 9601414.0, + "step": 5359 + }, + { + "epoch": 0.8679459153105011, + "grad_norm": 28.256155014038086, + "learning_rate": 1.3228626943005182e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.9248135983943939, + "num_tokens": 9603204.0, + "step": 5360 + }, + { + "epoch": 0.8681078455185814, + "grad_norm": 35.25698471069336, + "learning_rate": 1.3212435233160623e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.9152521193027496, + "num_tokens": 9604999.0, + "step": 5361 + }, + { + "epoch": 0.8682697757266619, + "grad_norm": 48.168243408203125, + "learning_rate": 1.3196243523316063e-06, + "loss": 0.644, + "mean_token_accuracy": 0.9091709554195404, + "num_tokens": 9606797.0, + "step": 5362 + }, + { + "epoch": 0.8684317059347422, + "grad_norm": 25.43707275390625, + "learning_rate": 1.3180051813471503e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.9239267706871033, + "num_tokens": 9608585.0, + "step": 5363 + }, + { + "epoch": 0.8685936361428225, + "grad_norm": 42.56907272338867, + "learning_rate": 1.3163860103626943e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.89665287733078, + "num_tokens": 9610377.0, + "step": 5364 + }, + { + "epoch": 0.8687555663509028, + "grad_norm": 30.153684616088867, + "learning_rate": 1.3147668393782383e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9293177723884583, + "num_tokens": 9612158.0, + "step": 5365 + }, + { + "epoch": 0.8689174965589831, + "grad_norm": 36.382179260253906, + "learning_rate": 1.3131476683937825e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.9216992855072021, + "num_tokens": 9613951.0, + "step": 5366 + }, + { + "epoch": 0.8690794267670634, + "grad_norm": 40.38933181762695, + "learning_rate": 1.3115284974093265e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.9060838520526886, + "num_tokens": 9615740.0, + "step": 5367 + }, + { + "epoch": 0.8692413569751437, + "grad_norm": 36.16415786743164, + "learning_rate": 1.3099093264248706e-06, + "loss": 0.5337, + "mean_token_accuracy": 0.9251798987388611, + "num_tokens": 9617533.0, + "step": 5368 + }, + { + "epoch": 0.869403287183224, + "grad_norm": 41.457305908203125, + "learning_rate": 1.3082901554404146e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.9205517172813416, + "num_tokens": 9619322.0, + "step": 5369 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 28.34734535217285, + "learning_rate": 1.3066709844559586e-06, + "loss": 0.496, + "mean_token_accuracy": 0.9258503317832947, + "num_tokens": 9621131.0, + "step": 5370 + }, + { + "epoch": 0.8697271475993846, + "grad_norm": 32.29867172241211, + "learning_rate": 1.3050518134715026e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.9258370995521545, + "num_tokens": 9622926.0, + "step": 5371 + }, + { + "epoch": 0.8698890778074649, + "grad_norm": 43.4794807434082, + "learning_rate": 1.3034326424870466e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.902512788772583, + "num_tokens": 9624715.0, + "step": 5372 + }, + { + "epoch": 0.8700510080155452, + "grad_norm": 27.954435348510742, + "learning_rate": 1.3018134715025906e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.9271929860115051, + "num_tokens": 9626514.0, + "step": 5373 + }, + { + "epoch": 0.8702129382236257, + "grad_norm": 12.004798889160156, + "learning_rate": 1.3001943005181346e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.9381933808326721, + "num_tokens": 9628302.0, + "step": 5374 + }, + { + "epoch": 0.870374868431706, + "grad_norm": 28.274578094482422, + "learning_rate": 1.2985751295336789e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.9133472442626953, + "num_tokens": 9630091.0, + "step": 5375 + }, + { + "epoch": 0.8705367986397863, + "grad_norm": 21.06334114074707, + "learning_rate": 1.2969559585492229e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.9327763915061951, + "num_tokens": 9631871.0, + "step": 5376 + }, + { + "epoch": 0.8706987288478666, + "grad_norm": 58.294342041015625, + "learning_rate": 1.2953367875647669e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.8911684453487396, + "num_tokens": 9633667.0, + "step": 5377 + }, + { + "epoch": 0.8708606590559469, + "grad_norm": 35.823368072509766, + "learning_rate": 1.293717616580311e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.91847363114357, + "num_tokens": 9635461.0, + "step": 5378 + }, + { + "epoch": 0.8710225892640272, + "grad_norm": 22.113191604614258, + "learning_rate": 1.292098445595855e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.934779167175293, + "num_tokens": 9637249.0, + "step": 5379 + }, + { + "epoch": 0.8711845194721075, + "grad_norm": 38.277122497558594, + "learning_rate": 1.290479274611399e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.9134254455566406, + "num_tokens": 9639038.0, + "step": 5380 + }, + { + "epoch": 0.8713464496801878, + "grad_norm": 29.207454681396484, + "learning_rate": 1.2888601036269432e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.9193262457847595, + "num_tokens": 9640835.0, + "step": 5381 + }, + { + "epoch": 0.8715083798882681, + "grad_norm": 39.02018737792969, + "learning_rate": 1.2872409326424874e-06, + "loss": 0.5852, + "mean_token_accuracy": 0.9146403074264526, + "num_tokens": 9642628.0, + "step": 5382 + }, + { + "epoch": 0.8716703100963484, + "grad_norm": 24.970739364624023, + "learning_rate": 1.2856217616580314e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.9326691031455994, + "num_tokens": 9644422.0, + "step": 5383 + }, + { + "epoch": 0.8718322403044287, + "grad_norm": 26.072452545166016, + "learning_rate": 1.2840025906735754e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.9227917194366455, + "num_tokens": 9646205.0, + "step": 5384 + }, + { + "epoch": 0.8719941705125092, + "grad_norm": 19.36985206604004, + "learning_rate": 1.2823834196891194e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 9647991.0, + "step": 5385 + }, + { + "epoch": 0.8721561007205895, + "grad_norm": 22.55524253845215, + "learning_rate": 1.2807642487046634e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.9282300472259521, + "num_tokens": 9649782.0, + "step": 5386 + }, + { + "epoch": 0.8723180309286698, + "grad_norm": 42.381439208984375, + "learning_rate": 1.2791450777202074e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.908076137304306, + "num_tokens": 9651577.0, + "step": 5387 + }, + { + "epoch": 0.8724799611367501, + "grad_norm": 26.439218521118164, + "learning_rate": 1.2775259067357515e-06, + "loss": 0.5311, + "mean_token_accuracy": 0.9259259402751923, + "num_tokens": 9653359.0, + "step": 5388 + }, + { + "epoch": 0.8726418913448304, + "grad_norm": 36.559120178222656, + "learning_rate": 1.2759067357512955e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.9083333313465118, + "num_tokens": 9655143.0, + "step": 5389 + }, + { + "epoch": 0.8728038215529107, + "grad_norm": 25.096086502075195, + "learning_rate": 1.2742875647668395e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.9220143854618073, + "num_tokens": 9656936.0, + "step": 5390 + }, + { + "epoch": 0.872965751760991, + "grad_norm": 25.132068634033203, + "learning_rate": 1.2726683937823837e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.9248885214328766, + "num_tokens": 9658742.0, + "step": 5391 + }, + { + "epoch": 0.8731276819690713, + "grad_norm": 31.604137420654297, + "learning_rate": 1.2710492227979277e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.9112793803215027, + "num_tokens": 9660536.0, + "step": 5392 + }, + { + "epoch": 0.8732896121771516, + "grad_norm": 25.80408477783203, + "learning_rate": 1.2694300518134717e-06, + "loss": 0.5838, + "mean_token_accuracy": 0.9157401323318481, + "num_tokens": 9662319.0, + "step": 5393 + }, + { + "epoch": 0.8734515423852319, + "grad_norm": 32.02791213989258, + "learning_rate": 1.2678108808290157e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.9275862276554108, + "num_tokens": 9664121.0, + "step": 5394 + }, + { + "epoch": 0.8736134725933122, + "grad_norm": 21.863733291625977, + "learning_rate": 1.2661917098445598e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.9280020892620087, + "num_tokens": 9665911.0, + "step": 5395 + }, + { + "epoch": 0.8737754028013927, + "grad_norm": 36.63125991821289, + "learning_rate": 1.2645725388601038e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.9142682254314423, + "num_tokens": 9667703.0, + "step": 5396 + }, + { + "epoch": 0.873937333009473, + "grad_norm": 33.44142532348633, + "learning_rate": 1.2629533678756478e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.9208920300006866, + "num_tokens": 9669507.0, + "step": 5397 + }, + { + "epoch": 0.8740992632175533, + "grad_norm": 34.996124267578125, + "learning_rate": 1.2613341968911918e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.9180955290794373, + "num_tokens": 9671299.0, + "step": 5398 + }, + { + "epoch": 0.8742611934256336, + "grad_norm": 31.619152069091797, + "learning_rate": 1.2597150259067358e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.9276795983314514, + "num_tokens": 9673100.0, + "step": 5399 + }, + { + "epoch": 0.8744231236337139, + "grad_norm": 17.50958824157715, + "learning_rate": 1.2580958549222798e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.9314461648464203, + "num_tokens": 9674889.0, + "step": 5400 + }, + { + "epoch": 0.8745850538417942, + "grad_norm": 34.72541809082031, + "learning_rate": 1.256476683937824e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.9213040471076965, + "num_tokens": 9676694.0, + "step": 5401 + }, + { + "epoch": 0.8747469840498745, + "grad_norm": 30.25588035583496, + "learning_rate": 1.254857512953368e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.9248889684677124, + "num_tokens": 9678486.0, + "step": 5402 + }, + { + "epoch": 0.8749089142579548, + "grad_norm": 35.3316764831543, + "learning_rate": 1.253238341968912e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.9195241630077362, + "num_tokens": 9680284.0, + "step": 5403 + }, + { + "epoch": 0.8750708444660351, + "grad_norm": 39.66303253173828, + "learning_rate": 1.251619170984456e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.917792797088623, + "num_tokens": 9682088.0, + "step": 5404 + }, + { + "epoch": 0.8752327746741154, + "grad_norm": 46.51969909667969, + "learning_rate": 1.25e-06, + "loss": 1.1844, + "mean_token_accuracy": 0.8842163383960724, + "num_tokens": 9683882.0, + "step": 5405 + }, + { + "epoch": 0.8753947048821957, + "grad_norm": 37.89459228515625, + "learning_rate": 1.2483808290155441e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.9179335236549377, + "num_tokens": 9685675.0, + "step": 5406 + }, + { + "epoch": 0.875556635090276, + "grad_norm": 21.040109634399414, + "learning_rate": 1.2467616580310881e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.9314432144165039, + "num_tokens": 9687464.0, + "step": 5407 + }, + { + "epoch": 0.8757185652983565, + "grad_norm": 28.99443817138672, + "learning_rate": 1.2451424870466321e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.9277016520500183, + "num_tokens": 9689266.0, + "step": 5408 + }, + { + "epoch": 0.8758804955064368, + "grad_norm": 15.494927406311035, + "learning_rate": 1.2435233160621762e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.937282145023346, + "num_tokens": 9691049.0, + "step": 5409 + }, + { + "epoch": 0.8760424257145171, + "grad_norm": 25.255373001098633, + "learning_rate": 1.2419041450777204e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.9268123805522919, + "num_tokens": 9692835.0, + "step": 5410 + }, + { + "epoch": 0.8762043559225974, + "grad_norm": 38.62944030761719, + "learning_rate": 1.2402849740932644e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.9086863696575165, + "num_tokens": 9694621.0, + "step": 5411 + }, + { + "epoch": 0.8763662861306777, + "grad_norm": 33.97884750366211, + "learning_rate": 1.2386658031088084e-06, + "loss": 0.5772, + "mean_token_accuracy": 0.9258066415786743, + "num_tokens": 9696416.0, + "step": 5412 + }, + { + "epoch": 0.876528216338758, + "grad_norm": 27.318815231323242, + "learning_rate": 1.2370466321243524e-06, + "loss": 0.5754, + "mean_token_accuracy": 0.9300699234008789, + "num_tokens": 9698214.0, + "step": 5413 + }, + { + "epoch": 0.8766901465468383, + "grad_norm": 16.050477981567383, + "learning_rate": 1.2354274611398964e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.9397663176059723, + "num_tokens": 9700008.0, + "step": 5414 + }, + { + "epoch": 0.8768520767549186, + "grad_norm": 28.83686637878418, + "learning_rate": 1.2338082901554404e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.9285677969455719, + "num_tokens": 9701800.0, + "step": 5415 + }, + { + "epoch": 0.8770140069629989, + "grad_norm": 41.68056869506836, + "learning_rate": 1.2321891191709845e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.9078092277050018, + "num_tokens": 9703595.0, + "step": 5416 + }, + { + "epoch": 0.8771759371710792, + "grad_norm": 49.28692626953125, + "learning_rate": 1.2305699481865285e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.9088148176670074, + "num_tokens": 9705403.0, + "step": 5417 + }, + { + "epoch": 0.8773378673791595, + "grad_norm": 25.735937118530273, + "learning_rate": 1.2289507772020727e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.9228169620037079, + "num_tokens": 9707187.0, + "step": 5418 + }, + { + "epoch": 0.87749979758724, + "grad_norm": 36.187679290771484, + "learning_rate": 1.2273316062176167e-06, + "loss": 0.6171, + "mean_token_accuracy": 0.9142682254314423, + "num_tokens": 9708979.0, + "step": 5419 + }, + { + "epoch": 0.8776617277953203, + "grad_norm": 30.78145408630371, + "learning_rate": 1.2257124352331607e-06, + "loss": 0.5827, + "mean_token_accuracy": 0.9225936532020569, + "num_tokens": 9710788.0, + "step": 5420 + }, + { + "epoch": 0.8778236580034006, + "grad_norm": 35.40193557739258, + "learning_rate": 1.2240932642487047e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.9213040471076965, + "num_tokens": 9712593.0, + "step": 5421 + }, + { + "epoch": 0.8779855882114809, + "grad_norm": 33.412254333496094, + "learning_rate": 1.2224740932642487e-06, + "loss": 0.5743, + "mean_token_accuracy": 0.9235592186450958, + "num_tokens": 9714393.0, + "step": 5422 + }, + { + "epoch": 0.8781475184195612, + "grad_norm": 30.46841812133789, + "learning_rate": 1.220854922279793e-06, + "loss": 0.6017, + "mean_token_accuracy": 0.9124864637851715, + "num_tokens": 9716190.0, + "step": 5423 + }, + { + "epoch": 0.8783094486276415, + "grad_norm": 15.910722732543945, + "learning_rate": 1.219235751295337e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.9379962682723999, + "num_tokens": 9717976.0, + "step": 5424 + }, + { + "epoch": 0.8784713788357218, + "grad_norm": 33.84764099121094, + "learning_rate": 1.217616580310881e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.9226473271846771, + "num_tokens": 9719774.0, + "step": 5425 + }, + { + "epoch": 0.8786333090438021, + "grad_norm": 28.132081985473633, + "learning_rate": 1.215997409326425e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.9157897531986237, + "num_tokens": 9721559.0, + "step": 5426 + }, + { + "epoch": 0.8787952392518824, + "grad_norm": 21.49597930908203, + "learning_rate": 1.214378238341969e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.9350104331970215, + "num_tokens": 9723348.0, + "step": 5427 + }, + { + "epoch": 0.8789571694599627, + "grad_norm": 35.330177307128906, + "learning_rate": 1.212759067357513e-06, + "loss": 0.6067, + "mean_token_accuracy": 0.9175586998462677, + "num_tokens": 9725152.0, + "step": 5428 + }, + { + "epoch": 0.879119099668043, + "grad_norm": 23.184669494628906, + "learning_rate": 1.211139896373057e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.9258315861225128, + "num_tokens": 9726947.0, + "step": 5429 + }, + { + "epoch": 0.8792810298761234, + "grad_norm": 50.213504791259766, + "learning_rate": 1.209520725388601e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.9148170650005341, + "num_tokens": 9728740.0, + "step": 5430 + }, + { + "epoch": 0.8794429600842038, + "grad_norm": 15.217734336853027, + "learning_rate": 1.207901554404145e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.9375236630439758, + "num_tokens": 9730524.0, + "step": 5431 + }, + { + "epoch": 0.8796048902922841, + "grad_norm": 37.054935455322266, + "learning_rate": 1.2062823834196893e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.9176258742809296, + "num_tokens": 9732325.0, + "step": 5432 + }, + { + "epoch": 0.8797668205003644, + "grad_norm": 27.339685440063477, + "learning_rate": 1.2046632124352333e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.9255028665065765, + "num_tokens": 9734105.0, + "step": 5433 + }, + { + "epoch": 0.8799287507084447, + "grad_norm": 27.949697494506836, + "learning_rate": 1.2030440414507773e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.9264184534549713, + "num_tokens": 9735902.0, + "step": 5434 + }, + { + "epoch": 0.880090680916525, + "grad_norm": 39.175819396972656, + "learning_rate": 1.2014248704663213e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.9151424467563629, + "num_tokens": 9737697.0, + "step": 5435 + }, + { + "epoch": 0.8802526111246053, + "grad_norm": 21.478050231933594, + "learning_rate": 1.1998056994818654e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9289954304695129, + "num_tokens": 9739490.0, + "step": 5436 + }, + { + "epoch": 0.8804145413326856, + "grad_norm": 30.33591651916504, + "learning_rate": 1.1981865284974094e-06, + "loss": 0.5978, + "mean_token_accuracy": 0.9098258912563324, + "num_tokens": 9741280.0, + "step": 5437 + }, + { + "epoch": 0.8805764715407659, + "grad_norm": 21.779216766357422, + "learning_rate": 1.1965673575129534e-06, + "loss": 0.5521, + "mean_token_accuracy": 0.9285677969455719, + "num_tokens": 9743072.0, + "step": 5438 + }, + { + "epoch": 0.8807384017488462, + "grad_norm": 23.569303512573242, + "learning_rate": 1.1949481865284974e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.9283842146396637, + "num_tokens": 9744863.0, + "step": 5439 + }, + { + "epoch": 0.8809003319569265, + "grad_norm": 42.27528762817383, + "learning_rate": 1.1933290155440414e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.9139501452445984, + "num_tokens": 9746664.0, + "step": 5440 + }, + { + "epoch": 0.8810622621650069, + "grad_norm": 36.370704650878906, + "learning_rate": 1.1917098445595854e-06, + "loss": 0.5712, + "mean_token_accuracy": 0.9200586974620819, + "num_tokens": 9748462.0, + "step": 5441 + }, + { + "epoch": 0.8812241923730872, + "grad_norm": 44.77403259277344, + "learning_rate": 1.1900906735751296e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.91131791472435, + "num_tokens": 9750256.0, + "step": 5442 + }, + { + "epoch": 0.8813861225811676, + "grad_norm": 20.939626693725586, + "learning_rate": 1.1884715025906737e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.9349493682384491, + "num_tokens": 9752045.0, + "step": 5443 + }, + { + "epoch": 0.8815480527892479, + "grad_norm": 41.33281707763672, + "learning_rate": 1.1868523316062177e-06, + "loss": 0.6113, + "mean_token_accuracy": 0.9201333820819855, + "num_tokens": 9753832.0, + "step": 5444 + }, + { + "epoch": 0.8817099829973282, + "grad_norm": 49.69961166381836, + "learning_rate": 1.185233160621762e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.8946863114833832, + "num_tokens": 9755629.0, + "step": 5445 + }, + { + "epoch": 0.8818719132054085, + "grad_norm": 38.220726013183594, + "learning_rate": 1.183613989637306e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.9111111164093018, + "num_tokens": 9757420.0, + "step": 5446 + }, + { + "epoch": 0.8820338434134888, + "grad_norm": 41.787593841552734, + "learning_rate": 1.18199481865285e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.9041208624839783, + "num_tokens": 9759202.0, + "step": 5447 + }, + { + "epoch": 0.8821957736215691, + "grad_norm": 33.63442611694336, + "learning_rate": 1.180375647668394e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.922222226858139, + "num_tokens": 9760999.0, + "step": 5448 + }, + { + "epoch": 0.8823577038296494, + "grad_norm": 40.562564849853516, + "learning_rate": 1.178756476683938e-06, + "loss": 0.745, + "mean_token_accuracy": 0.903522789478302, + "num_tokens": 9762790.0, + "step": 5449 + }, + { + "epoch": 0.8825196340377297, + "grad_norm": 29.548768997192383, + "learning_rate": 1.177137305699482e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.9309645891189575, + "num_tokens": 9764576.0, + "step": 5450 + }, + { + "epoch": 0.88268156424581, + "grad_norm": 37.60084915161133, + "learning_rate": 1.175518134715026e-06, + "loss": 0.5499, + "mean_token_accuracy": 0.9195803999900818, + "num_tokens": 9766374.0, + "step": 5451 + }, + { + "epoch": 0.8828434944538903, + "grad_norm": 31.79119110107422, + "learning_rate": 1.17389896373057e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.9222524166107178, + "num_tokens": 9768168.0, + "step": 5452 + }, + { + "epoch": 0.8830054246619707, + "grad_norm": 31.390846252441406, + "learning_rate": 1.172279792746114e-06, + "loss": 0.6112, + "mean_token_accuracy": 0.912058413028717, + "num_tokens": 9769953.0, + "step": 5453 + }, + { + "epoch": 0.883167354870051, + "grad_norm": 30.057666778564453, + "learning_rate": 1.1706606217616582e-06, + "loss": 0.5655, + "mean_token_accuracy": 0.9232736527919769, + "num_tokens": 9771739.0, + "step": 5454 + }, + { + "epoch": 0.8833292850781314, + "grad_norm": 36.96970748901367, + "learning_rate": 1.1690414507772022e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.9174208045005798, + "num_tokens": 9773530.0, + "step": 5455 + }, + { + "epoch": 0.8834912152862117, + "grad_norm": 23.222259521484375, + "learning_rate": 1.1674222797927463e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.9334284663200378, + "num_tokens": 9775312.0, + "step": 5456 + }, + { + "epoch": 0.883653145494292, + "grad_norm": 52.42688751220703, + "learning_rate": 1.1658031088082903e-06, + "loss": 0.5837, + "mean_token_accuracy": 0.9202856719493866, + "num_tokens": 9777100.0, + "step": 5457 + }, + { + "epoch": 0.8838150757023723, + "grad_norm": 40.5949821472168, + "learning_rate": 1.1641839378238343e-06, + "loss": 0.5956, + "mean_token_accuracy": 0.916930079460144, + "num_tokens": 9778901.0, + "step": 5458 + }, + { + "epoch": 0.8839770059104526, + "grad_norm": 29.494794845581055, + "learning_rate": 1.1625647668393783e-06, + "loss": 0.57, + "mean_token_accuracy": 0.9202425479888916, + "num_tokens": 9780691.0, + "step": 5459 + }, + { + "epoch": 0.8841389361185329, + "grad_norm": 37.74163055419922, + "learning_rate": 1.1609455958549223e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.924199789762497, + "num_tokens": 9782480.0, + "step": 5460 + }, + { + "epoch": 0.8843008663266132, + "grad_norm": 31.0628662109375, + "learning_rate": 1.1593264248704663e-06, + "loss": 0.5406, + "mean_token_accuracy": 0.9258000552654266, + "num_tokens": 9784276.0, + "step": 5461 + }, + { + "epoch": 0.8844627965346935, + "grad_norm": 27.933813095092773, + "learning_rate": 1.1577072538860103e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.9289631247520447, + "num_tokens": 9786070.0, + "step": 5462 + }, + { + "epoch": 0.8846247267427738, + "grad_norm": 43.04594039916992, + "learning_rate": 1.1560880829015543e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.9126865565776825, + "num_tokens": 9787856.0, + "step": 5463 + }, + { + "epoch": 0.8847866569508542, + "grad_norm": 19.215984344482422, + "learning_rate": 1.1544689119170986e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.9377415180206299, + "num_tokens": 9789641.0, + "step": 5464 + }, + { + "epoch": 0.8849485871589345, + "grad_norm": 35.10929489135742, + "learning_rate": 1.1528497409326426e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.9192118346691132, + "num_tokens": 9791438.0, + "step": 5465 + }, + { + "epoch": 0.8851105173670148, + "grad_norm": 29.689105987548828, + "learning_rate": 1.1512305699481866e-06, + "loss": 0.5648, + "mean_token_accuracy": 0.9204521775245667, + "num_tokens": 9793227.0, + "step": 5466 + }, + { + "epoch": 0.8852724475750952, + "grad_norm": 33.17251968383789, + "learning_rate": 1.1496113989637308e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.9064182341098785, + "num_tokens": 9795017.0, + "step": 5467 + }, + { + "epoch": 0.8854343777831755, + "grad_norm": 29.482418060302734, + "learning_rate": 1.1479922279792748e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.9093915224075317, + "num_tokens": 9796804.0, + "step": 5468 + }, + { + "epoch": 0.8855963079912558, + "grad_norm": 21.892906188964844, + "learning_rate": 1.1463730569948188e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.9308949708938599, + "num_tokens": 9798591.0, + "step": 5469 + }, + { + "epoch": 0.8857582381993361, + "grad_norm": 37.91713333129883, + "learning_rate": 1.1447538860103629e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.9072786867618561, + "num_tokens": 9800383.0, + "step": 5470 + }, + { + "epoch": 0.8859201684074164, + "grad_norm": 38.04140853881836, + "learning_rate": 1.1431347150259069e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.9178571403026581, + "num_tokens": 9802175.0, + "step": 5471 + }, + { + "epoch": 0.8860820986154967, + "grad_norm": 55.2664680480957, + "learning_rate": 1.1415155440414509e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.9011915028095245, + "num_tokens": 9803960.0, + "step": 5472 + }, + { + "epoch": 0.886244028823577, + "grad_norm": 39.57736587524414, + "learning_rate": 1.139896373056995e-06, + "loss": 0.5623, + "mean_token_accuracy": 0.9201765656471252, + "num_tokens": 9805760.0, + "step": 5473 + }, + { + "epoch": 0.8864059590316573, + "grad_norm": 30.44786834716797, + "learning_rate": 1.138277202072539e-06, + "loss": 0.592, + "mean_token_accuracy": 0.9146301448345184, + "num_tokens": 9807542.0, + "step": 5474 + }, + { + "epoch": 0.8865678892397377, + "grad_norm": 34.01777267456055, + "learning_rate": 1.136658031088083e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.9155383706092834, + "num_tokens": 9809327.0, + "step": 5475 + }, + { + "epoch": 0.886729819447818, + "grad_norm": 43.928245544433594, + "learning_rate": 1.1350388601036271e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.8968085050582886, + "num_tokens": 9811120.0, + "step": 5476 + }, + { + "epoch": 0.8868917496558983, + "grad_norm": 27.02503204345703, + "learning_rate": 1.1334196891191712e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.920409768819809, + "num_tokens": 9812908.0, + "step": 5477 + }, + { + "epoch": 0.8870536798639787, + "grad_norm": 17.587738037109375, + "learning_rate": 1.1318005181347152e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.9365914762020111, + "num_tokens": 9814688.0, + "step": 5478 + }, + { + "epoch": 0.887215610072059, + "grad_norm": 32.40117263793945, + "learning_rate": 1.1301813471502592e-06, + "loss": 0.659, + "mean_token_accuracy": 0.9170294404029846, + "num_tokens": 9816478.0, + "step": 5479 + }, + { + "epoch": 0.8873775402801393, + "grad_norm": 28.365272521972656, + "learning_rate": 1.1285621761658032e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.9309440553188324, + "num_tokens": 9818265.0, + "step": 5480 + }, + { + "epoch": 0.8875394704882196, + "grad_norm": 33.94539260864258, + "learning_rate": 1.1269430051813472e-06, + "loss": 0.5904, + "mean_token_accuracy": 0.9171883165836334, + "num_tokens": 9820054.0, + "step": 5481 + }, + { + "epoch": 0.8877014006962999, + "grad_norm": 43.25166702270508, + "learning_rate": 1.1253238341968912e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9142191112041473, + "num_tokens": 9821859.0, + "step": 5482 + }, + { + "epoch": 0.8878633309043802, + "grad_norm": 43.0670051574707, + "learning_rate": 1.1237046632124352e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.9249196350574493, + "num_tokens": 9823651.0, + "step": 5483 + }, + { + "epoch": 0.8880252611124605, + "grad_norm": 38.24034118652344, + "learning_rate": 1.1220854922279793e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.9070218503475189, + "num_tokens": 9825443.0, + "step": 5484 + }, + { + "epoch": 0.8881871913205408, + "grad_norm": 37.42241287231445, + "learning_rate": 1.1204663212435233e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.9157422184944153, + "num_tokens": 9827237.0, + "step": 5485 + }, + { + "epoch": 0.8883491215286211, + "grad_norm": 36.45575714111328, + "learning_rate": 1.1188471502590675e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.9177807569503784, + "num_tokens": 9829028.0, + "step": 5486 + }, + { + "epoch": 0.8885110517367015, + "grad_norm": 32.521209716796875, + "learning_rate": 1.1172279792746115e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.919047623872757, + "num_tokens": 9830824.0, + "step": 5487 + }, + { + "epoch": 0.8886729819447818, + "grad_norm": 31.005590438842773, + "learning_rate": 1.1156088082901555e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.9232736527919769, + "num_tokens": 9832610.0, + "step": 5488 + }, + { + "epoch": 0.8888349121528621, + "grad_norm": 27.770160675048828, + "learning_rate": 1.1139896373056995e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.9220707416534424, + "num_tokens": 9834404.0, + "step": 5489 + }, + { + "epoch": 0.8889968423609425, + "grad_norm": 15.018160820007324, + "learning_rate": 1.1123704663212438e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.9379370510578156, + "num_tokens": 9836191.0, + "step": 5490 + }, + { + "epoch": 0.8891587725690228, + "grad_norm": 33.26008987426758, + "learning_rate": 1.1107512953367878e-06, + "loss": 0.6464, + "mean_token_accuracy": 0.921881914138794, + "num_tokens": 9837972.0, + "step": 5491 + }, + { + "epoch": 0.8893207027771031, + "grad_norm": 41.01582336425781, + "learning_rate": 1.1091321243523318e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.9051518738269806, + "num_tokens": 9839768.0, + "step": 5492 + }, + { + "epoch": 0.8894826329851834, + "grad_norm": 38.1275520324707, + "learning_rate": 1.1075129533678758e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.917723149061203, + "num_tokens": 9841561.0, + "step": 5493 + }, + { + "epoch": 0.8896445631932637, + "grad_norm": 26.14552116394043, + "learning_rate": 1.1058937823834198e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.9280426800251007, + "num_tokens": 9843351.0, + "step": 5494 + }, + { + "epoch": 0.889806493401344, + "grad_norm": 39.142913818359375, + "learning_rate": 1.1042746113989638e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.9058857858181, + "num_tokens": 9845150.0, + "step": 5495 + }, + { + "epoch": 0.8899684236094243, + "grad_norm": 37.10499954223633, + "learning_rate": 1.1026554404145078e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.9139243066310883, + "num_tokens": 9846941.0, + "step": 5496 + }, + { + "epoch": 0.8901303538175046, + "grad_norm": 31.25570297241211, + "learning_rate": 1.1010362694300518e-06, + "loss": 0.5447, + "mean_token_accuracy": 0.9201439619064331, + "num_tokens": 9848740.0, + "step": 5497 + }, + { + "epoch": 0.890292284025585, + "grad_norm": 26.704837799072266, + "learning_rate": 1.099417098445596e-06, + "loss": 0.5572, + "mean_token_accuracy": 0.9177764356136322, + "num_tokens": 9850532.0, + "step": 5498 + }, + { + "epoch": 0.8904542142336653, + "grad_norm": 20.3362979888916, + "learning_rate": 1.09779792746114e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.9323454797267914, + "num_tokens": 9852325.0, + "step": 5499 + }, + { + "epoch": 0.8906161444417456, + "grad_norm": 39.48313522338867, + "learning_rate": 1.096178756476684e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.910526305437088, + "num_tokens": 9854129.0, + "step": 5500 + }, + { + "epoch": 0.890778074649826, + "grad_norm": 32.83544158935547, + "learning_rate": 1.0945595854922281e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.9306486546993256, + "num_tokens": 9855929.0, + "step": 5501 + }, + { + "epoch": 0.8909400048579063, + "grad_norm": 40.115447998046875, + "learning_rate": 1.0929404145077721e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.9080671668052673, + "num_tokens": 9857721.0, + "step": 5502 + }, + { + "epoch": 0.8911019350659866, + "grad_norm": 41.007667541503906, + "learning_rate": 1.0913212435233161e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.9038208425045013, + "num_tokens": 9859514.0, + "step": 5503 + }, + { + "epoch": 0.8912638652740669, + "grad_norm": 31.94751739501953, + "learning_rate": 1.0897020725388601e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.9216166734695435, + "num_tokens": 9861306.0, + "step": 5504 + }, + { + "epoch": 0.8914257954821472, + "grad_norm": 49.957332611083984, + "learning_rate": 1.0880829015544042e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.9133029878139496, + "num_tokens": 9863107.0, + "step": 5505 + }, + { + "epoch": 0.8915877256902275, + "grad_norm": 37.95313262939453, + "learning_rate": 1.0864637305699482e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.9104166626930237, + "num_tokens": 9864898.0, + "step": 5506 + }, + { + "epoch": 0.8917496558983078, + "grad_norm": 23.988075256347656, + "learning_rate": 1.0848445595854922e-06, + "loss": 0.5343, + "mean_token_accuracy": 0.9312728047370911, + "num_tokens": 9866687.0, + "step": 5507 + }, + { + "epoch": 0.8919115861063881, + "grad_norm": 32.6848030090332, + "learning_rate": 1.0832253886010364e-06, + "loss": 0.6078, + "mean_token_accuracy": 0.9140145480632782, + "num_tokens": 9868489.0, + "step": 5508 + }, + { + "epoch": 0.8920735163144685, + "grad_norm": 38.78463363647461, + "learning_rate": 1.0816062176165804e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.9045549929141998, + "num_tokens": 9870284.0, + "step": 5509 + }, + { + "epoch": 0.8922354465225488, + "grad_norm": 26.64596939086914, + "learning_rate": 1.0799870466321244e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.915575385093689, + "num_tokens": 9872080.0, + "step": 5510 + }, + { + "epoch": 0.8923973767306291, + "grad_norm": 45.724063873291016, + "learning_rate": 1.0783678756476685e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.8937133550643921, + "num_tokens": 9873884.0, + "step": 5511 + }, + { + "epoch": 0.8925593069387094, + "grad_norm": 36.1963005065918, + "learning_rate": 1.0767487046632125e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.9106331765651703, + "num_tokens": 9875676.0, + "step": 5512 + }, + { + "epoch": 0.8927212371467897, + "grad_norm": 28.377832412719727, + "learning_rate": 1.0751295336787565e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.9254679083824158, + "num_tokens": 9877456.0, + "step": 5513 + }, + { + "epoch": 0.89288316735487, + "grad_norm": 32.506141662597656, + "learning_rate": 1.0735103626943007e-06, + "loss": 0.5575, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 9879248.0, + "step": 5514 + }, + { + "epoch": 0.8930450975629504, + "grad_norm": 31.269174575805664, + "learning_rate": 1.0718911917098447e-06, + "loss": 0.5461, + "mean_token_accuracy": 0.9178921580314636, + "num_tokens": 9881028.0, + "step": 5515 + }, + { + "epoch": 0.8932070277710307, + "grad_norm": 39.83466720581055, + "learning_rate": 1.0702720207253887e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.9078677594661713, + "num_tokens": 9882833.0, + "step": 5516 + }, + { + "epoch": 0.893368957979111, + "grad_norm": 35.14051818847656, + "learning_rate": 1.0686528497409327e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.9247555434703827, + "num_tokens": 9884624.0, + "step": 5517 + }, + { + "epoch": 0.8935308881871913, + "grad_norm": 36.77198791503906, + "learning_rate": 1.0670336787564768e-06, + "loss": 0.642, + "mean_token_accuracy": 0.9180261492729187, + "num_tokens": 9886417.0, + "step": 5518 + }, + { + "epoch": 0.8936928183952716, + "grad_norm": 41.0582389831543, + "learning_rate": 1.0654145077720208e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.9088725745677948, + "num_tokens": 9888215.0, + "step": 5519 + }, + { + "epoch": 0.8938547486033519, + "grad_norm": 19.575010299682617, + "learning_rate": 1.0637953367875648e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.9345238208770752, + "num_tokens": 9890002.0, + "step": 5520 + }, + { + "epoch": 0.8940166788114323, + "grad_norm": 44.42578125, + "learning_rate": 1.062176165803109e-06, + "loss": 0.64, + "mean_token_accuracy": 0.9094496071338654, + "num_tokens": 9891799.0, + "step": 5521 + }, + { + "epoch": 0.8941786090195126, + "grad_norm": 35.70947265625, + "learning_rate": 1.060556994818653e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.9081260561943054, + "num_tokens": 9893594.0, + "step": 5522 + }, + { + "epoch": 0.8943405392275929, + "grad_norm": 33.810394287109375, + "learning_rate": 1.058937823834197e-06, + "loss": 0.5674, + "mean_token_accuracy": 0.9229840040206909, + "num_tokens": 9895379.0, + "step": 5523 + }, + { + "epoch": 0.8945024694356732, + "grad_norm": 35.11822509765625, + "learning_rate": 1.057318652849741e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.9127053916454315, + "num_tokens": 9897177.0, + "step": 5524 + }, + { + "epoch": 0.8946643996437535, + "grad_norm": 34.517784118652344, + "learning_rate": 1.055699481865285e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.9124087691307068, + "num_tokens": 9898963.0, + "step": 5525 + }, + { + "epoch": 0.8948263298518339, + "grad_norm": 34.195335388183594, + "learning_rate": 1.054080310880829e-06, + "loss": 0.5762, + "mean_token_accuracy": 0.9208633005619049, + "num_tokens": 9900753.0, + "step": 5526 + }, + { + "epoch": 0.8949882600599142, + "grad_norm": 22.482501983642578, + "learning_rate": 1.052461139896373e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.9306569397449493, + "num_tokens": 9902539.0, + "step": 5527 + }, + { + "epoch": 0.8951501902679945, + "grad_norm": 27.418664932250977, + "learning_rate": 1.050841968911917e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.9242281913757324, + "num_tokens": 9904328.0, + "step": 5528 + }, + { + "epoch": 0.8953121204760748, + "grad_norm": 22.196041107177734, + "learning_rate": 1.0492227979274611e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.9369458258152008, + "num_tokens": 9906125.0, + "step": 5529 + }, + { + "epoch": 0.8954740506841551, + "grad_norm": 33.088775634765625, + "learning_rate": 1.0476036269430053e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.915304571390152, + "num_tokens": 9907908.0, + "step": 5530 + }, + { + "epoch": 0.8956359808922354, + "grad_norm": 38.18043518066406, + "learning_rate": 1.0459844559585493e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.9059343636035919, + "num_tokens": 9909707.0, + "step": 5531 + }, + { + "epoch": 0.8957979111003158, + "grad_norm": 22.550262451171875, + "learning_rate": 1.0443652849740934e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.9303542673587799, + "num_tokens": 9911492.0, + "step": 5532 + }, + { + "epoch": 0.8959598413083961, + "grad_norm": 18.326772689819336, + "learning_rate": 1.0427461139896374e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.9306266009807587, + "num_tokens": 9913278.0, + "step": 5533 + }, + { + "epoch": 0.8961217715164764, + "grad_norm": 36.65408706665039, + "learning_rate": 1.0411269430051814e-06, + "loss": 0.5197, + "mean_token_accuracy": 0.9194630980491638, + "num_tokens": 9915088.0, + "step": 5534 + }, + { + "epoch": 0.8962837017245567, + "grad_norm": 32.820743560791016, + "learning_rate": 1.0395077720207254e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.9140287637710571, + "num_tokens": 9916879.0, + "step": 5535 + }, + { + "epoch": 0.896445631932637, + "grad_norm": 23.256084442138672, + "learning_rate": 1.0378886010362694e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.9277215301990509, + "num_tokens": 9918668.0, + "step": 5536 + }, + { + "epoch": 0.8966075621407174, + "grad_norm": 40.22314453125, + "learning_rate": 1.0362694300518134e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.9107346832752228, + "num_tokens": 9920470.0, + "step": 5537 + }, + { + "epoch": 0.8967694923487977, + "grad_norm": 26.55768394470215, + "learning_rate": 1.0346502590673577e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.9210265278816223, + "num_tokens": 9922259.0, + "step": 5538 + }, + { + "epoch": 0.896931422556878, + "grad_norm": 26.869733810424805, + "learning_rate": 1.0330310880829017e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.9202073812484741, + "num_tokens": 9924059.0, + "step": 5539 + }, + { + "epoch": 0.8970933527649583, + "grad_norm": 33.09824752807617, + "learning_rate": 1.0314119170984457e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.9156827032566071, + "num_tokens": 9925844.0, + "step": 5540 + }, + { + "epoch": 0.8972552829730386, + "grad_norm": 20.166194915771484, + "learning_rate": 1.0297927461139897e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.9291283786296844, + "num_tokens": 9927624.0, + "step": 5541 + }, + { + "epoch": 0.8974172131811189, + "grad_norm": 33.16535568237305, + "learning_rate": 1.0281735751295337e-06, + "loss": 0.5728, + "mean_token_accuracy": 0.919654130935669, + "num_tokens": 9929423.0, + "step": 5542 + }, + { + "epoch": 0.8975791433891993, + "grad_norm": 44.94796371459961, + "learning_rate": 1.026554404145078e-06, + "loss": 0.735, + "mean_token_accuracy": 0.902492344379425, + "num_tokens": 9931224.0, + "step": 5543 + }, + { + "epoch": 0.8977410735972796, + "grad_norm": 16.171478271484375, + "learning_rate": 1.024935233160622e-06, + "loss": 0.467, + "mean_token_accuracy": 0.9362794756889343, + "num_tokens": 9933003.0, + "step": 5544 + }, + { + "epoch": 0.8979030038053599, + "grad_norm": 35.25217819213867, + "learning_rate": 1.023316062176166e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.923706978559494, + "num_tokens": 9934802.0, + "step": 5545 + }, + { + "epoch": 0.8980649340134402, + "grad_norm": 34.0245475769043, + "learning_rate": 1.02169689119171e-06, + "loss": 0.6254, + "mean_token_accuracy": 0.9211711883544922, + "num_tokens": 9936606.0, + "step": 5546 + }, + { + "epoch": 0.8982268642215205, + "grad_norm": 30.119239807128906, + "learning_rate": 1.020077720207254e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.9306798875331879, + "num_tokens": 9938392.0, + "step": 5547 + }, + { + "epoch": 0.8983887944296008, + "grad_norm": 42.19300079345703, + "learning_rate": 1.018458549222798e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.9049295783042908, + "num_tokens": 9940188.0, + "step": 5548 + }, + { + "epoch": 0.8985507246376812, + "grad_norm": 39.933815002441406, + "learning_rate": 1.016839378238342e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.9065695703029633, + "num_tokens": 9941988.0, + "step": 5549 + }, + { + "epoch": 0.8987126548457615, + "grad_norm": 40.42913055419922, + "learning_rate": 1.015220207253886e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.8923467397689819, + "num_tokens": 9943778.0, + "step": 5550 + }, + { + "epoch": 0.8988745850538418, + "grad_norm": 42.780517578125, + "learning_rate": 1.01360103626943e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.9238984882831573, + "num_tokens": 9945579.0, + "step": 5551 + }, + { + "epoch": 0.8990365152619221, + "grad_norm": 42.6334114074707, + "learning_rate": 1.0119818652849743e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.9129156172275543, + "num_tokens": 9947378.0, + "step": 5552 + }, + { + "epoch": 0.8991984454700024, + "grad_norm": 33.08951187133789, + "learning_rate": 1.0103626943005183e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.9253065884113312, + "num_tokens": 9949171.0, + "step": 5553 + }, + { + "epoch": 0.8993603756780828, + "grad_norm": 21.853233337402344, + "learning_rate": 1.0087435233160623e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.9364425539970398, + "num_tokens": 9950967.0, + "step": 5554 + }, + { + "epoch": 0.8995223058861631, + "grad_norm": 34.43678283691406, + "learning_rate": 1.0071243523316063e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.9160805642604828, + "num_tokens": 9952753.0, + "step": 5555 + }, + { + "epoch": 0.8996842360942434, + "grad_norm": 29.670330047607422, + "learning_rate": 1.0055051813471503e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.9125639498233795, + "num_tokens": 9954539.0, + "step": 5556 + }, + { + "epoch": 0.8998461663023237, + "grad_norm": 39.295562744140625, + "learning_rate": 1.0038860103626943e-06, + "loss": 0.5462, + "mean_token_accuracy": 0.9163995683193207, + "num_tokens": 9956338.0, + "step": 5557 + }, + { + "epoch": 0.900008096510404, + "grad_norm": 35.4046630859375, + "learning_rate": 1.0022668393782383e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.9139003157615662, + "num_tokens": 9958129.0, + "step": 5558 + }, + { + "epoch": 0.9001700267184843, + "grad_norm": 33.43301010131836, + "learning_rate": 1.0006476683937823e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.9188452959060669, + "num_tokens": 9959912.0, + "step": 5559 + }, + { + "epoch": 0.9003319569265646, + "grad_norm": 26.200008392333984, + "learning_rate": 9.990284974093264e-07, + "loss": 0.5716, + "mean_token_accuracy": 0.926546722650528, + "num_tokens": 9961698.0, + "step": 5560 + }, + { + "epoch": 0.900493887134645, + "grad_norm": 31.46231460571289, + "learning_rate": 9.974093264248706e-07, + "loss": 0.5552, + "mean_token_accuracy": 0.9246582388877869, + "num_tokens": 9963488.0, + "step": 5561 + }, + { + "epoch": 0.9006558173427253, + "grad_norm": 36.524810791015625, + "learning_rate": 9.957901554404146e-07, + "loss": 0.6253, + "mean_token_accuracy": 0.9042443931102753, + "num_tokens": 9965272.0, + "step": 5562 + }, + { + "epoch": 0.9008177475508056, + "grad_norm": 35.576995849609375, + "learning_rate": 9.941709844559586e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.9190490543842316, + "num_tokens": 9967068.0, + "step": 5563 + }, + { + "epoch": 0.9009796777588859, + "grad_norm": 35.891422271728516, + "learning_rate": 9.925518134715026e-07, + "loss": 0.5842, + "mean_token_accuracy": 0.9223912060260773, + "num_tokens": 9968862.0, + "step": 5564 + }, + { + "epoch": 0.9011416079669662, + "grad_norm": 38.83442306518555, + "learning_rate": 9.909326424870469e-07, + "loss": 0.8795, + "mean_token_accuracy": 0.8858012855052948, + "num_tokens": 9970664.0, + "step": 5565 + }, + { + "epoch": 0.9013035381750466, + "grad_norm": 19.877239227294922, + "learning_rate": 9.893134715025909e-07, + "loss": 0.445, + "mean_token_accuracy": 0.9428542256355286, + "num_tokens": 9972456.0, + "step": 5566 + }, + { + "epoch": 0.9014654683831269, + "grad_norm": 28.527721405029297, + "learning_rate": 9.876943005181349e-07, + "loss": 0.568, + "mean_token_accuracy": 0.9239444732666016, + "num_tokens": 9974231.0, + "step": 5567 + }, + { + "epoch": 0.9016273985912072, + "grad_norm": 35.573429107666016, + "learning_rate": 9.860751295336789e-07, + "loss": 0.6037, + "mean_token_accuracy": 0.9158718585968018, + "num_tokens": 9976029.0, + "step": 5568 + }, + { + "epoch": 0.9017893287992875, + "grad_norm": 42.55729675292969, + "learning_rate": 9.84455958549223e-07, + "loss": 0.7589, + "mean_token_accuracy": 0.9001418650150299, + "num_tokens": 9977832.0, + "step": 5569 + }, + { + "epoch": 0.9019512590073678, + "grad_norm": 31.62367057800293, + "learning_rate": 9.82836787564767e-07, + "loss": 0.5984, + "mean_token_accuracy": 0.9091842770576477, + "num_tokens": 9979630.0, + "step": 5570 + }, + { + "epoch": 0.9021131892154481, + "grad_norm": 39.88440704345703, + "learning_rate": 9.81217616580311e-07, + "loss": 0.6495, + "mean_token_accuracy": 0.9078014194965363, + "num_tokens": 9981424.0, + "step": 5571 + }, + { + "epoch": 0.9022751194235284, + "grad_norm": 31.60286521911621, + "learning_rate": 9.79598445595855e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.9173451066017151, + "num_tokens": 9983202.0, + "step": 5572 + }, + { + "epoch": 0.9024370496316088, + "grad_norm": 18.23585319519043, + "learning_rate": 9.77979274611399e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.9340406954288483, + "num_tokens": 9985002.0, + "step": 5573 + }, + { + "epoch": 0.9025989798396891, + "grad_norm": 31.150304794311523, + "learning_rate": 9.763601036269432e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.9270210564136505, + "num_tokens": 9986802.0, + "step": 5574 + }, + { + "epoch": 0.9027609100477694, + "grad_norm": 24.057653427124023, + "learning_rate": 9.747409326424872e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.9246257245540619, + "num_tokens": 9988593.0, + "step": 5575 + }, + { + "epoch": 0.9029228402558497, + "grad_norm": 35.47649383544922, + "learning_rate": 9.731217616580312e-07, + "loss": 0.7049, + "mean_token_accuracy": 0.9016874730587006, + "num_tokens": 9990380.0, + "step": 5576 + }, + { + "epoch": 0.9030847704639301, + "grad_norm": 21.994081497192383, + "learning_rate": 9.715025906735752e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9292778074741364, + "num_tokens": 9992175.0, + "step": 5577 + }, + { + "epoch": 0.9032467006720104, + "grad_norm": 25.741830825805664, + "learning_rate": 9.698834196891192e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.93315589427948, + "num_tokens": 9993972.0, + "step": 5578 + }, + { + "epoch": 0.9034086308800907, + "grad_norm": 32.7096061706543, + "learning_rate": 9.682642487046632e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.9143994748592377, + "num_tokens": 9995764.0, + "step": 5579 + }, + { + "epoch": 0.903570561088171, + "grad_norm": 22.142864227294922, + "learning_rate": 9.666450777202073e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.926271915435791, + "num_tokens": 9997547.0, + "step": 5580 + }, + { + "epoch": 0.9037324912962513, + "grad_norm": 25.86760139465332, + "learning_rate": 9.650259067357513e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.9226506948471069, + "num_tokens": 9999344.0, + "step": 5581 + }, + { + "epoch": 0.9038944215043316, + "grad_norm": 32.678104400634766, + "learning_rate": 9.634067357512953e-07, + "loss": 0.5989, + "mean_token_accuracy": 0.9224154949188232, + "num_tokens": 10001141.0, + "step": 5582 + }, + { + "epoch": 0.9040563517124119, + "grad_norm": 26.3841609954834, + "learning_rate": 9.617875647668395e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.9201717376708984, + "num_tokens": 10002941.0, + "step": 5583 + }, + { + "epoch": 0.9042182819204923, + "grad_norm": 32.73980712890625, + "learning_rate": 9.601683937823835e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.9127261340618134, + "num_tokens": 10004728.0, + "step": 5584 + }, + { + "epoch": 0.9043802121285726, + "grad_norm": 44.728477478027344, + "learning_rate": 9.585492227979275e-07, + "loss": 0.6862, + "mean_token_accuracy": 0.9037828147411346, + "num_tokens": 10006519.0, + "step": 5585 + }, + { + "epoch": 0.9045421423366529, + "grad_norm": 34.9400749206543, + "learning_rate": 9.569300518134715e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.9162631630897522, + "num_tokens": 10008317.0, + "step": 5586 + }, + { + "epoch": 0.9047040725447332, + "grad_norm": 30.03016471862793, + "learning_rate": 9.553108808290158e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.9222355484962463, + "num_tokens": 10010112.0, + "step": 5587 + }, + { + "epoch": 0.9048660027528136, + "grad_norm": 43.839599609375, + "learning_rate": 9.536917098445598e-07, + "loss": 0.6521, + "mean_token_accuracy": 0.9141042828559875, + "num_tokens": 10011903.0, + "step": 5588 + }, + { + "epoch": 0.9050279329608939, + "grad_norm": 33.1029052734375, + "learning_rate": 9.520725388601038e-07, + "loss": 0.6207, + "mean_token_accuracy": 0.914814829826355, + "num_tokens": 10013685.0, + "step": 5589 + }, + { + "epoch": 0.9051898631689742, + "grad_norm": 24.754907608032227, + "learning_rate": 9.504533678756478e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.9328171610832214, + "num_tokens": 10015480.0, + "step": 5590 + }, + { + "epoch": 0.9053517933770545, + "grad_norm": 29.743974685668945, + "learning_rate": 9.488341968911918e-07, + "loss": 0.6505, + "mean_token_accuracy": 0.9197651445865631, + "num_tokens": 10017278.0, + "step": 5591 + }, + { + "epoch": 0.9055137235851348, + "grad_norm": 27.809846878051758, + "learning_rate": 9.472150259067358e-07, + "loss": 0.6011, + "mean_token_accuracy": 0.9170576632022858, + "num_tokens": 10019067.0, + "step": 5592 + }, + { + "epoch": 0.9056756537932151, + "grad_norm": 35.40176010131836, + "learning_rate": 9.455958549222799e-07, + "loss": 0.7072, + "mean_token_accuracy": 0.913382351398468, + "num_tokens": 10020856.0, + "step": 5593 + }, + { + "epoch": 0.9058375840012954, + "grad_norm": 25.61206817626953, + "learning_rate": 9.43976683937824e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.9251377880573273, + "num_tokens": 10022648.0, + "step": 5594 + }, + { + "epoch": 0.9059995142093757, + "grad_norm": 38.32581329345703, + "learning_rate": 9.42357512953368e-07, + "loss": 0.6433, + "mean_token_accuracy": 0.9216595590114594, + "num_tokens": 10024441.0, + "step": 5595 + }, + { + "epoch": 0.906161444417456, + "grad_norm": 31.303855895996094, + "learning_rate": 9.40738341968912e-07, + "loss": 0.6156, + "mean_token_accuracy": 0.9184311330318451, + "num_tokens": 10026246.0, + "step": 5596 + }, + { + "epoch": 0.9063233746255364, + "grad_norm": 33.21104049682617, + "learning_rate": 9.39119170984456e-07, + "loss": 0.5997, + "mean_token_accuracy": 0.9181873500347137, + "num_tokens": 10028027.0, + "step": 5597 + }, + { + "epoch": 0.9064853048336167, + "grad_norm": 34.308135986328125, + "learning_rate": 9.375000000000001e-07, + "loss": 0.6465, + "mean_token_accuracy": 0.9123152792453766, + "num_tokens": 10029824.0, + "step": 5598 + }, + { + "epoch": 0.906647235041697, + "grad_norm": 27.1445255279541, + "learning_rate": 9.358808290155441e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.9279046654701233, + "num_tokens": 10031613.0, + "step": 5599 + }, + { + "epoch": 0.9068091652497774, + "grad_norm": 13.958855628967285, + "learning_rate": 9.342616580310882e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.9290329813957214, + "num_tokens": 10033393.0, + "step": 5600 + }, + { + "epoch": 0.9069710954578577, + "grad_norm": 25.900976181030273, + "learning_rate": 9.326424870466322e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.9319444596767426, + "num_tokens": 10035199.0, + "step": 5601 + }, + { + "epoch": 0.907133025665938, + "grad_norm": 28.463598251342773, + "learning_rate": 9.310233160621762e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.9299546480178833, + "num_tokens": 10036996.0, + "step": 5602 + }, + { + "epoch": 0.9072949558740183, + "grad_norm": 27.228099822998047, + "learning_rate": 9.294041450777203e-07, + "loss": 0.6496, + "mean_token_accuracy": 0.9178799688816071, + "num_tokens": 10038788.0, + "step": 5603 + }, + { + "epoch": 0.9074568860820986, + "grad_norm": 35.022727966308594, + "learning_rate": 9.277849740932643e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.9098696708679199, + "num_tokens": 10040576.0, + "step": 5604 + }, + { + "epoch": 0.9076188162901789, + "grad_norm": 29.726205825805664, + "learning_rate": 9.261658031088083e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.9121578335762024, + "num_tokens": 10042361.0, + "step": 5605 + }, + { + "epoch": 0.9077807464982592, + "grad_norm": 27.568334579467773, + "learning_rate": 9.245466321243523e-07, + "loss": 0.5778, + "mean_token_accuracy": 0.9315451383590698, + "num_tokens": 10044151.0, + "step": 5606 + }, + { + "epoch": 0.9079426767063395, + "grad_norm": 31.959720611572266, + "learning_rate": 9.229274611398964e-07, + "loss": 0.6138, + "mean_token_accuracy": 0.919584333896637, + "num_tokens": 10045937.0, + "step": 5607 + }, + { + "epoch": 0.9081046069144199, + "grad_norm": 23.01288604736328, + "learning_rate": 9.213082901554405e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.925253301858902, + "num_tokens": 10047730.0, + "step": 5608 + }, + { + "epoch": 0.9082665371225002, + "grad_norm": 30.675859451293945, + "learning_rate": 9.196891191709845e-07, + "loss": 0.5674, + "mean_token_accuracy": 0.9190844893455505, + "num_tokens": 10049526.0, + "step": 5609 + }, + { + "epoch": 0.9084284673305805, + "grad_norm": 47.50618362426758, + "learning_rate": 9.180699481865285e-07, + "loss": 0.626, + "mean_token_accuracy": 0.9221827685832977, + "num_tokens": 10051321.0, + "step": 5610 + }, + { + "epoch": 0.9085903975386609, + "grad_norm": 28.85889434814453, + "learning_rate": 9.164507772020727e-07, + "loss": 0.576, + "mean_token_accuracy": 0.9235875904560089, + "num_tokens": 10053108.0, + "step": 5611 + }, + { + "epoch": 0.9087523277467412, + "grad_norm": 31.189441680908203, + "learning_rate": 9.148316062176167e-07, + "loss": 0.627, + "mean_token_accuracy": 0.9221123158931732, + "num_tokens": 10054903.0, + "step": 5612 + }, + { + "epoch": 0.9089142579548215, + "grad_norm": 27.5843448638916, + "learning_rate": 9.132124352331607e-07, + "loss": 0.546, + "mean_token_accuracy": 0.924397736787796, + "num_tokens": 10056706.0, + "step": 5613 + }, + { + "epoch": 0.9090761881629018, + "grad_norm": 25.00461769104004, + "learning_rate": 9.115932642487048e-07, + "loss": 0.5639, + "mean_token_accuracy": 0.9233333468437195, + "num_tokens": 10058503.0, + "step": 5614 + }, + { + "epoch": 0.9092381183709821, + "grad_norm": 41.570770263671875, + "learning_rate": 9.099740932642488e-07, + "loss": 0.6861, + "mean_token_accuracy": 0.9107142984867096, + "num_tokens": 10060295.0, + "step": 5615 + }, + { + "epoch": 0.9094000485790624, + "grad_norm": 24.20473289489746, + "learning_rate": 9.083549222797929e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.9264347851276398, + "num_tokens": 10062079.0, + "step": 5616 + }, + { + "epoch": 0.9095619787871427, + "grad_norm": 25.2628231048584, + "learning_rate": 9.067357512953369e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.9245341718196869, + "num_tokens": 10063869.0, + "step": 5617 + }, + { + "epoch": 0.909723908995223, + "grad_norm": 36.49765396118164, + "learning_rate": 9.051165803108809e-07, + "loss": 0.7258, + "mean_token_accuracy": 0.9176114499568939, + "num_tokens": 10065659.0, + "step": 5618 + }, + { + "epoch": 0.9098858392033033, + "grad_norm": 23.01960563659668, + "learning_rate": 9.034974093264249e-07, + "loss": 0.5524, + "mean_token_accuracy": 0.9188311696052551, + "num_tokens": 10067454.0, + "step": 5619 + }, + { + "epoch": 0.9100477694113837, + "grad_norm": 48.52146530151367, + "learning_rate": 9.018782383419689e-07, + "loss": 0.6845, + "mean_token_accuracy": 0.9037317037582397, + "num_tokens": 10069257.0, + "step": 5620 + }, + { + "epoch": 0.910209699619464, + "grad_norm": 31.42661476135254, + "learning_rate": 9.002590673575131e-07, + "loss": 0.6713, + "mean_token_accuracy": 0.9149396419525146, + "num_tokens": 10071051.0, + "step": 5621 + }, + { + "epoch": 0.9103716298275444, + "grad_norm": 28.61754608154297, + "learning_rate": 8.986398963730571e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.9224390387535095, + "num_tokens": 10072847.0, + "step": 5622 + }, + { + "epoch": 0.9105335600356247, + "grad_norm": 13.720559120178223, + "learning_rate": 8.970207253886011e-07, + "loss": 0.441, + "mean_token_accuracy": 0.941980242729187, + "num_tokens": 10074635.0, + "step": 5623 + }, + { + "epoch": 0.910695490243705, + "grad_norm": 29.109756469726562, + "learning_rate": 8.954015544041451e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.9231220781803131, + "num_tokens": 10076421.0, + "step": 5624 + }, + { + "epoch": 0.9108574204517853, + "grad_norm": 38.76572036743164, + "learning_rate": 8.937823834196892e-07, + "loss": 0.6592, + "mean_token_accuracy": 0.9203285872936249, + "num_tokens": 10078220.0, + "step": 5625 + }, + { + "epoch": 0.9110193506598656, + "grad_norm": 32.222843170166016, + "learning_rate": 8.921632124352332e-07, + "loss": 0.6332, + "mean_token_accuracy": 0.9201058149337769, + "num_tokens": 10080007.0, + "step": 5626 + }, + { + "epoch": 0.9111812808679459, + "grad_norm": 24.84505271911621, + "learning_rate": 8.905440414507772e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.9236596822738647, + "num_tokens": 10081794.0, + "step": 5627 + }, + { + "epoch": 0.9113432110760262, + "grad_norm": 27.645280838012695, + "learning_rate": 8.889248704663213e-07, + "loss": 0.54, + "mean_token_accuracy": 0.9226305782794952, + "num_tokens": 10083590.0, + "step": 5628 + }, + { + "epoch": 0.9115051412841065, + "grad_norm": 37.0853385925293, + "learning_rate": 8.873056994818653e-07, + "loss": 0.5899, + "mean_token_accuracy": 0.9148657023906708, + "num_tokens": 10085392.0, + "step": 5629 + }, + { + "epoch": 0.9116670714921868, + "grad_norm": 37.30296325683594, + "learning_rate": 8.856865284974094e-07, + "loss": 0.7585, + "mean_token_accuracy": 0.9079285264015198, + "num_tokens": 10087186.0, + "step": 5630 + }, + { + "epoch": 0.9118290017002671, + "grad_norm": 31.483062744140625, + "learning_rate": 8.840673575129534e-07, + "loss": 0.8331, + "mean_token_accuracy": 0.9114106595516205, + "num_tokens": 10088969.0, + "step": 5631 + }, + { + "epoch": 0.9119909319083475, + "grad_norm": 41.978206634521484, + "learning_rate": 8.824481865284974e-07, + "loss": 0.6265, + "mean_token_accuracy": 0.9154929518699646, + "num_tokens": 10090765.0, + "step": 5632 + }, + { + "epoch": 0.9121528621164278, + "grad_norm": 34.29523468017578, + "learning_rate": 8.808290155440414e-07, + "loss": 0.6251, + "mean_token_accuracy": 0.9159002006053925, + "num_tokens": 10092563.0, + "step": 5633 + }, + { + "epoch": 0.9123147923245082, + "grad_norm": 34.834869384765625, + "learning_rate": 8.792098445595854e-07, + "loss": 0.6144, + "mean_token_accuracy": 0.9056878387928009, + "num_tokens": 10094350.0, + "step": 5634 + }, + { + "epoch": 0.9124767225325885, + "grad_norm": 30.736188888549805, + "learning_rate": 8.775906735751297e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.9170315861701965, + "num_tokens": 10096139.0, + "step": 5635 + }, + { + "epoch": 0.9126386527406688, + "grad_norm": 22.47742462158203, + "learning_rate": 8.759715025906737e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.9296235740184784, + "num_tokens": 10097935.0, + "step": 5636 + }, + { + "epoch": 0.9128005829487491, + "grad_norm": 17.95374298095703, + "learning_rate": 8.743523316062177e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.9295592904090881, + "num_tokens": 10099717.0, + "step": 5637 + }, + { + "epoch": 0.9129625131568294, + "grad_norm": 30.614118576049805, + "learning_rate": 8.727331606217618e-07, + "loss": 0.7216, + "mean_token_accuracy": 0.9087953269481659, + "num_tokens": 10101503.0, + "step": 5638 + }, + { + "epoch": 0.9131244433649097, + "grad_norm": 30.87086296081543, + "learning_rate": 8.711139896373058e-07, + "loss": 0.5369, + "mean_token_accuracy": 0.9161937236785889, + "num_tokens": 10103301.0, + "step": 5639 + }, + { + "epoch": 0.91328637357299, + "grad_norm": 23.20081329345703, + "learning_rate": 8.694948186528498e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.9295125305652618, + "num_tokens": 10105083.0, + "step": 5640 + }, + { + "epoch": 0.9134483037810703, + "grad_norm": 35.932891845703125, + "learning_rate": 8.678756476683939e-07, + "loss": 0.6153, + "mean_token_accuracy": 0.9159381687641144, + "num_tokens": 10106869.0, + "step": 5641 + }, + { + "epoch": 0.9136102339891506, + "grad_norm": 27.949153900146484, + "learning_rate": 8.662564766839379e-07, + "loss": 0.5626, + "mean_token_accuracy": 0.923882782459259, + "num_tokens": 10108657.0, + "step": 5642 + }, + { + "epoch": 0.913772164197231, + "grad_norm": 26.4757080078125, + "learning_rate": 8.64637305699482e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.9294602870941162, + "num_tokens": 10110452.0, + "step": 5643 + }, + { + "epoch": 0.9139340944053113, + "grad_norm": 35.90068817138672, + "learning_rate": 8.63018134715026e-07, + "loss": 0.6087, + "mean_token_accuracy": 0.9055944085121155, + "num_tokens": 10112239.0, + "step": 5644 + }, + { + "epoch": 0.9140960246133917, + "grad_norm": 37.963050842285156, + "learning_rate": 8.6139896373057e-07, + "loss": 0.6104, + "mean_token_accuracy": 0.9178784787654877, + "num_tokens": 10114031.0, + "step": 5645 + }, + { + "epoch": 0.914257954821472, + "grad_norm": 33.743892669677734, + "learning_rate": 8.59779792746114e-07, + "loss": 0.5575, + "mean_token_accuracy": 0.9151099026203156, + "num_tokens": 10115826.0, + "step": 5646 + }, + { + "epoch": 0.9144198850295523, + "grad_norm": 43.81098556518555, + "learning_rate": 8.581606217616581e-07, + "loss": 0.7211, + "mean_token_accuracy": 0.9140456318855286, + "num_tokens": 10117617.0, + "step": 5647 + }, + { + "epoch": 0.9145818152376326, + "grad_norm": 30.712921142578125, + "learning_rate": 8.565414507772022e-07, + "loss": 0.6105, + "mean_token_accuracy": 0.9220825135707855, + "num_tokens": 10119411.0, + "step": 5648 + }, + { + "epoch": 0.9147437454457129, + "grad_norm": 37.56822204589844, + "learning_rate": 8.549222797927462e-07, + "loss": 0.6529, + "mean_token_accuracy": 0.8962404131889343, + "num_tokens": 10121212.0, + "step": 5649 + }, + { + "epoch": 0.9149056756537932, + "grad_norm": 37.450218200683594, + "learning_rate": 8.533031088082902e-07, + "loss": 0.6117, + "mean_token_accuracy": 0.9177117049694061, + "num_tokens": 10123016.0, + "step": 5650 + }, + { + "epoch": 0.9150676058618735, + "grad_norm": 36.97028350830078, + "learning_rate": 8.516839378238342e-07, + "loss": 0.6267, + "mean_token_accuracy": 0.89896559715271, + "num_tokens": 10124815.0, + "step": 5651 + }, + { + "epoch": 0.9152295360699538, + "grad_norm": 35.0447883605957, + "learning_rate": 8.500647668393783e-07, + "loss": 0.6177, + "mean_token_accuracy": 0.9137841761112213, + "num_tokens": 10126605.0, + "step": 5652 + }, + { + "epoch": 0.9153914662780341, + "grad_norm": 31.010713577270508, + "learning_rate": 8.484455958549223e-07, + "loss": 0.6752, + "mean_token_accuracy": 0.9100414216518402, + "num_tokens": 10128395.0, + "step": 5653 + }, + { + "epoch": 0.9155533964861144, + "grad_norm": 35.68456268310547, + "learning_rate": 8.468264248704663e-07, + "loss": 0.546, + "mean_token_accuracy": 0.9210144877433777, + "num_tokens": 10130185.0, + "step": 5654 + }, + { + "epoch": 0.9157153266941948, + "grad_norm": 25.924238204956055, + "learning_rate": 8.452072538860104e-07, + "loss": 0.537, + "mean_token_accuracy": 0.9246916770935059, + "num_tokens": 10131976.0, + "step": 5655 + }, + { + "epoch": 0.9158772569022752, + "grad_norm": 41.085655212402344, + "learning_rate": 8.435880829015544e-07, + "loss": 0.628, + "mean_token_accuracy": 0.9146403074264526, + "num_tokens": 10133769.0, + "step": 5656 + }, + { + "epoch": 0.9160391871103555, + "grad_norm": 26.742158889770508, + "learning_rate": 8.419689119170985e-07, + "loss": 0.547, + "mean_token_accuracy": 0.9243585765361786, + "num_tokens": 10135558.0, + "step": 5657 + }, + { + "epoch": 0.9162011173184358, + "grad_norm": 28.290122985839844, + "learning_rate": 8.403497409326425e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.9261270761489868, + "num_tokens": 10137354.0, + "step": 5658 + }, + { + "epoch": 0.9163630475265161, + "grad_norm": 28.697431564331055, + "learning_rate": 8.387305699481866e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.9147413671016693, + "num_tokens": 10139146.0, + "step": 5659 + }, + { + "epoch": 0.9165249777345964, + "grad_norm": 24.759017944335938, + "learning_rate": 8.371113989637307e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.9188180863857269, + "num_tokens": 10140929.0, + "step": 5660 + }, + { + "epoch": 0.9166869079426767, + "grad_norm": 23.902603149414062, + "learning_rate": 8.354922279792748e-07, + "loss": 0.6295, + "mean_token_accuracy": 0.9216485619544983, + "num_tokens": 10142707.0, + "step": 5661 + }, + { + "epoch": 0.916848838150757, + "grad_norm": 27.962326049804688, + "learning_rate": 8.338730569948188e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.9301154017448425, + "num_tokens": 10144505.0, + "step": 5662 + }, + { + "epoch": 0.9170107683588373, + "grad_norm": 31.878097534179688, + "learning_rate": 8.322538860103628e-07, + "loss": 0.5757, + "mean_token_accuracy": 0.9202381074428558, + "num_tokens": 10146304.0, + "step": 5663 + }, + { + "epoch": 0.9171726985669176, + "grad_norm": 32.4810791015625, + "learning_rate": 8.306347150259068e-07, + "loss": 0.6764, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 10148092.0, + "step": 5664 + }, + { + "epoch": 0.9173346287749979, + "grad_norm": 24.931467056274414, + "learning_rate": 8.290155440414509e-07, + "loss": 0.5577, + "mean_token_accuracy": 0.9205157458782196, + "num_tokens": 10149882.0, + "step": 5665 + }, + { + "epoch": 0.9174965589830782, + "grad_norm": 27.32235336303711, + "learning_rate": 8.273963730569949e-07, + "loss": 0.5457, + "mean_token_accuracy": 0.9245841801166534, + "num_tokens": 10151659.0, + "step": 5666 + }, + { + "epoch": 0.9176584891911587, + "grad_norm": 32.354312896728516, + "learning_rate": 8.257772020725389e-07, + "loss": 0.6513, + "mean_token_accuracy": 0.9060316979885101, + "num_tokens": 10153448.0, + "step": 5667 + }, + { + "epoch": 0.917820419399239, + "grad_norm": 46.82957077026367, + "learning_rate": 8.24158031088083e-07, + "loss": 0.7634, + "mean_token_accuracy": 0.9009303748607635, + "num_tokens": 10155232.0, + "step": 5668 + }, + { + "epoch": 0.9179823496073193, + "grad_norm": 19.23516845703125, + "learning_rate": 8.225388601036271e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.9261291921138763, + "num_tokens": 10157014.0, + "step": 5669 + }, + { + "epoch": 0.9181442798153996, + "grad_norm": 43.904720306396484, + "learning_rate": 8.209196891191711e-07, + "loss": 0.7039, + "mean_token_accuracy": 0.9056650102138519, + "num_tokens": 10158811.0, + "step": 5670 + }, + { + "epoch": 0.9183062100234799, + "grad_norm": 35.308448791503906, + "learning_rate": 8.193005181347151e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.9209109842777252, + "num_tokens": 10160601.0, + "step": 5671 + }, + { + "epoch": 0.9184681402315602, + "grad_norm": 21.94005012512207, + "learning_rate": 8.176813471502591e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.932189553976059, + "num_tokens": 10162393.0, + "step": 5672 + }, + { + "epoch": 0.9186300704396405, + "grad_norm": 35.60520935058594, + "learning_rate": 8.160621761658031e-07, + "loss": 0.7035, + "mean_token_accuracy": 0.9185185134410858, + "num_tokens": 10164175.0, + "step": 5673 + }, + { + "epoch": 0.9187920006477208, + "grad_norm": 18.40984535217285, + "learning_rate": 8.144430051813472e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.9313203990459442, + "num_tokens": 10165964.0, + "step": 5674 + }, + { + "epoch": 0.9189539308558011, + "grad_norm": 26.56291389465332, + "learning_rate": 8.128238341968913e-07, + "loss": 0.523, + "mean_token_accuracy": 0.923154354095459, + "num_tokens": 10167765.0, + "step": 5675 + }, + { + "epoch": 0.9191158610638814, + "grad_norm": 32.96531677246094, + "learning_rate": 8.112046632124353e-07, + "loss": 0.6833, + "mean_token_accuracy": 0.9144968390464783, + "num_tokens": 10169569.0, + "step": 5676 + }, + { + "epoch": 0.9192777912719617, + "grad_norm": 31.745628356933594, + "learning_rate": 8.095854922279793e-07, + "loss": 0.537, + "mean_token_accuracy": 0.9137841761112213, + "num_tokens": 10171359.0, + "step": 5677 + }, + { + "epoch": 0.919439721480042, + "grad_norm": 35.57515335083008, + "learning_rate": 8.079663212435233e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.9209109842777252, + "num_tokens": 10173149.0, + "step": 5678 + }, + { + "epoch": 0.9196016516881225, + "grad_norm": 46.098636627197266, + "learning_rate": 8.063471502590674e-07, + "loss": 0.6672, + "mean_token_accuracy": 0.9124832451343536, + "num_tokens": 10174945.0, + "step": 5679 + }, + { + "epoch": 0.9197635818962028, + "grad_norm": 35.43976593017578, + "learning_rate": 8.047279792746114e-07, + "loss": 0.6219, + "mean_token_accuracy": 0.9116374850273132, + "num_tokens": 10176729.0, + "step": 5680 + }, + { + "epoch": 0.9199255121042831, + "grad_norm": 37.70087814331055, + "learning_rate": 8.031088082901554e-07, + "loss": 0.6327, + "mean_token_accuracy": 0.9091293811798096, + "num_tokens": 10178516.0, + "step": 5681 + }, + { + "epoch": 0.9200874423123634, + "grad_norm": 33.48616409301758, + "learning_rate": 8.014896373056995e-07, + "loss": 0.6085, + "mean_token_accuracy": 0.9150060415267944, + "num_tokens": 10180310.0, + "step": 5682 + }, + { + "epoch": 0.9202493725204437, + "grad_norm": 36.87192153930664, + "learning_rate": 7.998704663212437e-07, + "loss": 0.6487, + "mean_token_accuracy": 0.9119458198547363, + "num_tokens": 10182107.0, + "step": 5683 + }, + { + "epoch": 0.920411302728524, + "grad_norm": 27.395814895629883, + "learning_rate": 7.982512953367877e-07, + "loss": 0.5926, + "mean_token_accuracy": 0.9119922816753387, + "num_tokens": 10183892.0, + "step": 5684 + }, + { + "epoch": 0.9205732329366043, + "grad_norm": 32.98408889770508, + "learning_rate": 7.966321243523317e-07, + "loss": 0.6257, + "mean_token_accuracy": 0.9139703512191772, + "num_tokens": 10185681.0, + "step": 5685 + }, + { + "epoch": 0.9207351631446846, + "grad_norm": 43.61354446411133, + "learning_rate": 7.950129533678757e-07, + "loss": 0.6622, + "mean_token_accuracy": 0.9034912288188934, + "num_tokens": 10187483.0, + "step": 5686 + }, + { + "epoch": 0.9208970933527649, + "grad_norm": 31.741113662719727, + "learning_rate": 7.933937823834198e-07, + "loss": 0.5667, + "mean_token_accuracy": 0.9224945604801178, + "num_tokens": 10189266.0, + "step": 5687 + }, + { + "epoch": 0.9210590235608452, + "grad_norm": 39.15793991088867, + "learning_rate": 7.917746113989638e-07, + "loss": 0.5785, + "mean_token_accuracy": 0.9186292290687561, + "num_tokens": 10191060.0, + "step": 5688 + }, + { + "epoch": 0.9212209537689255, + "grad_norm": 25.637378692626953, + "learning_rate": 7.901554404145079e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.9165966212749481, + "num_tokens": 10192848.0, + "step": 5689 + }, + { + "epoch": 0.921382883977006, + "grad_norm": 34.98750305175781, + "learning_rate": 7.885362694300519e-07, + "loss": 0.7024, + "mean_token_accuracy": 0.9069086015224457, + "num_tokens": 10194639.0, + "step": 5690 + }, + { + "epoch": 0.9215448141850863, + "grad_norm": 36.5889778137207, + "learning_rate": 7.86917098445596e-07, + "loss": 0.6802, + "mean_token_accuracy": 0.9096163213253021, + "num_tokens": 10196438.0, + "step": 5691 + }, + { + "epoch": 0.9217067443931666, + "grad_norm": 30.991409301757812, + "learning_rate": 7.8529792746114e-07, + "loss": 0.6048, + "mean_token_accuracy": 0.9179058969020844, + "num_tokens": 10198218.0, + "step": 5692 + }, + { + "epoch": 0.9218686746012469, + "grad_norm": 38.29185104370117, + "learning_rate": 7.83678756476684e-07, + "loss": 0.6094, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 10200006.0, + "step": 5693 + }, + { + "epoch": 0.9220306048093272, + "grad_norm": 25.256343841552734, + "learning_rate": 7.82059585492228e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.9328320920467377, + "num_tokens": 10201786.0, + "step": 5694 + }, + { + "epoch": 0.9221925350174075, + "grad_norm": 44.55492401123047, + "learning_rate": 7.80440414507772e-07, + "loss": 0.8545, + "mean_token_accuracy": 0.9045868515968323, + "num_tokens": 10203581.0, + "step": 5695 + }, + { + "epoch": 0.9223544652254878, + "grad_norm": 32.62047576904297, + "learning_rate": 7.788212435233162e-07, + "loss": 0.7024, + "mean_token_accuracy": 0.9137443602085114, + "num_tokens": 10205359.0, + "step": 5696 + }, + { + "epoch": 0.9225163954335681, + "grad_norm": 42.59144973754883, + "learning_rate": 7.772020725388602e-07, + "loss": 0.6492, + "mean_token_accuracy": 0.9049725234508514, + "num_tokens": 10207154.0, + "step": 5697 + }, + { + "epoch": 0.9226783256416484, + "grad_norm": 34.90169906616211, + "learning_rate": 7.755829015544042e-07, + "loss": 0.6237, + "mean_token_accuracy": 0.9083519577980042, + "num_tokens": 10208947.0, + "step": 5698 + }, + { + "epoch": 0.9228402558497287, + "grad_norm": 33.17928695678711, + "learning_rate": 7.739637305699482e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.9179480075836182, + "num_tokens": 10210751.0, + "step": 5699 + }, + { + "epoch": 0.923002186057809, + "grad_norm": 37.96687698364258, + "learning_rate": 7.723445595854922e-07, + "loss": 0.7671, + "mean_token_accuracy": 0.9024621248245239, + "num_tokens": 10212550.0, + "step": 5700 + }, + { + "epoch": 0.9231641162658895, + "grad_norm": 24.348209381103516, + "learning_rate": 7.707253886010363e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.9197278916835785, + "num_tokens": 10214349.0, + "step": 5701 + }, + { + "epoch": 0.9233260464739698, + "grad_norm": 27.91250991821289, + "learning_rate": 7.691062176165803e-07, + "loss": 0.595, + "mean_token_accuracy": 0.9114651679992676, + "num_tokens": 10216132.0, + "step": 5702 + }, + { + "epoch": 0.9234879766820501, + "grad_norm": 17.55530548095703, + "learning_rate": 7.674870466321244e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.9358378946781158, + "num_tokens": 10217924.0, + "step": 5703 + }, + { + "epoch": 0.9236499068901304, + "grad_norm": 39.04444122314453, + "learning_rate": 7.658678756476684e-07, + "loss": 0.6716, + "mean_token_accuracy": 0.911374568939209, + "num_tokens": 10219708.0, + "step": 5704 + }, + { + "epoch": 0.9238118370982107, + "grad_norm": 32.61463928222656, + "learning_rate": 7.642487046632124e-07, + "loss": 0.5467, + "mean_token_accuracy": 0.9208469092845917, + "num_tokens": 10221498.0, + "step": 5705 + }, + { + "epoch": 0.923973767306291, + "grad_norm": 29.0821475982666, + "learning_rate": 7.626295336787565e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.9200184643268585, + "num_tokens": 10223297.0, + "step": 5706 + }, + { + "epoch": 0.9241356975143713, + "grad_norm": 39.21720886230469, + "learning_rate": 7.610103626943006e-07, + "loss": 0.6529, + "mean_token_accuracy": 0.9014098644256592, + "num_tokens": 10225092.0, + "step": 5707 + }, + { + "epoch": 0.9242976277224516, + "grad_norm": 41.244686126708984, + "learning_rate": 7.593911917098446e-07, + "loss": 0.8263, + "mean_token_accuracy": 0.9001617133617401, + "num_tokens": 10226894.0, + "step": 5708 + }, + { + "epoch": 0.9244595579305319, + "grad_norm": 35.275299072265625, + "learning_rate": 7.577720207253888e-07, + "loss": 0.626, + "mean_token_accuracy": 0.9007092118263245, + "num_tokens": 10228688.0, + "step": 5709 + }, + { + "epoch": 0.9246214881386122, + "grad_norm": 37.34368896484375, + "learning_rate": 7.561528497409328e-07, + "loss": 0.6226, + "mean_token_accuracy": 0.9140456318855286, + "num_tokens": 10230479.0, + "step": 5710 + }, + { + "epoch": 0.9247834183466925, + "grad_norm": 23.267709732055664, + "learning_rate": 7.545336787564768e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.9282923936843872, + "num_tokens": 10232270.0, + "step": 5711 + }, + { + "epoch": 0.9249453485547728, + "grad_norm": 18.114795684814453, + "learning_rate": 7.529145077720208e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.935844749212265, + "num_tokens": 10234063.0, + "step": 5712 + }, + { + "epoch": 0.9251072787628533, + "grad_norm": 35.68361282348633, + "learning_rate": 7.512953367875648e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.9169460833072662, + "num_tokens": 10235864.0, + "step": 5713 + }, + { + "epoch": 0.9252692089709336, + "grad_norm": 34.319976806640625, + "learning_rate": 7.496761658031089e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.9205268919467926, + "num_tokens": 10237653.0, + "step": 5714 + }, + { + "epoch": 0.9254311391790139, + "grad_norm": 38.801002502441406, + "learning_rate": 7.480569948186529e-07, + "loss": 0.6453, + "mean_token_accuracy": 0.9113413393497467, + "num_tokens": 10239436.0, + "step": 5715 + }, + { + "epoch": 0.9255930693870942, + "grad_norm": 26.99283790588379, + "learning_rate": 7.46437823834197e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.9309567511081696, + "num_tokens": 10241240.0, + "step": 5716 + }, + { + "epoch": 0.9257549995951745, + "grad_norm": 28.322425842285156, + "learning_rate": 7.44818652849741e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.9305039942264557, + "num_tokens": 10243040.0, + "step": 5717 + }, + { + "epoch": 0.9259169298032548, + "grad_norm": 31.51887321472168, + "learning_rate": 7.431994818652851e-07, + "loss": 0.6125, + "mean_token_accuracy": 0.9218875765800476, + "num_tokens": 10244834.0, + "step": 5718 + }, + { + "epoch": 0.9260788600113351, + "grad_norm": 40.880550384521484, + "learning_rate": 7.415803108808291e-07, + "loss": 0.7141, + "mean_token_accuracy": 0.903620183467865, + "num_tokens": 10246636.0, + "step": 5719 + }, + { + "epoch": 0.9262407902194154, + "grad_norm": 25.18109893798828, + "learning_rate": 7.399611398963731e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.9209183752536774, + "num_tokens": 10248439.0, + "step": 5720 + }, + { + "epoch": 0.9264027204274957, + "grad_norm": 17.177614212036133, + "learning_rate": 7.383419689119171e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.933318704366684, + "num_tokens": 10250221.0, + "step": 5721 + }, + { + "epoch": 0.926564650635576, + "grad_norm": 31.88681983947754, + "learning_rate": 7.367227979274611e-07, + "loss": 0.6099, + "mean_token_accuracy": 0.9260573983192444, + "num_tokens": 10252016.0, + "step": 5722 + }, + { + "epoch": 0.9267265808436563, + "grad_norm": 31.584835052490234, + "learning_rate": 7.351036269430053e-07, + "loss": 0.5624, + "mean_token_accuracy": 0.9157884418964386, + "num_tokens": 10253813.0, + "step": 5723 + }, + { + "epoch": 0.9268885110517368, + "grad_norm": 23.47626304626465, + "learning_rate": 7.334844559585493e-07, + "loss": 0.504, + "mean_token_accuracy": 0.9362173080444336, + "num_tokens": 10255607.0, + "step": 5724 + }, + { + "epoch": 0.9270504412598171, + "grad_norm": 25.658193588256836, + "learning_rate": 7.318652849740933e-07, + "loss": 0.555, + "mean_token_accuracy": 0.9304879307746887, + "num_tokens": 10257406.0, + "step": 5725 + }, + { + "epoch": 0.9272123714678974, + "grad_norm": 43.04893493652344, + "learning_rate": 7.302461139896373e-07, + "loss": 0.7422, + "mean_token_accuracy": 0.9074468016624451, + "num_tokens": 10259199.0, + "step": 5726 + }, + { + "epoch": 0.9273743016759777, + "grad_norm": 40.37648010253906, + "learning_rate": 7.286269430051813e-07, + "loss": 0.7065, + "mean_token_accuracy": 0.9010588228702545, + "num_tokens": 10260994.0, + "step": 5727 + }, + { + "epoch": 0.927536231884058, + "grad_norm": 23.8811092376709, + "learning_rate": 7.270077720207254e-07, + "loss": 0.514, + "mean_token_accuracy": 0.927003413438797, + "num_tokens": 10262780.0, + "step": 5728 + }, + { + "epoch": 0.9276981620921383, + "grad_norm": 31.99302864074707, + "learning_rate": 7.253886010362694e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.919961154460907, + "num_tokens": 10264579.0, + "step": 5729 + }, + { + "epoch": 0.9278600923002186, + "grad_norm": 37.32364273071289, + "learning_rate": 7.237694300518135e-07, + "loss": 0.7003, + "mean_token_accuracy": 0.9114923775196075, + "num_tokens": 10266362.0, + "step": 5730 + }, + { + "epoch": 0.9280220225082989, + "grad_norm": 38.94670104980469, + "learning_rate": 7.221502590673577e-07, + "loss": 0.7412, + "mean_token_accuracy": 0.9000253677368164, + "num_tokens": 10268155.0, + "step": 5731 + }, + { + "epoch": 0.9281839527163792, + "grad_norm": 32.523738861083984, + "learning_rate": 7.205310880829017e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.9232374131679535, + "num_tokens": 10269956.0, + "step": 5732 + }, + { + "epoch": 0.9283458829244595, + "grad_norm": 30.830522537231445, + "learning_rate": 7.189119170984457e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.9295215606689453, + "num_tokens": 10271752.0, + "step": 5733 + }, + { + "epoch": 0.9285078131325398, + "grad_norm": 27.818565368652344, + "learning_rate": 7.172927461139897e-07, + "loss": 0.533, + "mean_token_accuracy": 0.9264125525951385, + "num_tokens": 10273536.0, + "step": 5734 + }, + { + "epoch": 0.9286697433406202, + "grad_norm": 39.40623474121094, + "learning_rate": 7.156735751295337e-07, + "loss": 0.6366, + "mean_token_accuracy": 0.9203781485557556, + "num_tokens": 10275324.0, + "step": 5735 + }, + { + "epoch": 0.9288316735487006, + "grad_norm": 33.7591552734375, + "learning_rate": 7.140544041450779e-07, + "loss": 0.7824, + "mean_token_accuracy": 0.8988374769687653, + "num_tokens": 10277113.0, + "step": 5736 + }, + { + "epoch": 0.9289936037567809, + "grad_norm": 19.77239990234375, + "learning_rate": 7.124352331606219e-07, + "loss": 0.478, + "mean_token_accuracy": 0.9317810237407684, + "num_tokens": 10278905.0, + "step": 5737 + }, + { + "epoch": 0.9291555339648612, + "grad_norm": 23.44733428955078, + "learning_rate": 7.108160621761659e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.9296627044677734, + "num_tokens": 10280701.0, + "step": 5738 + }, + { + "epoch": 0.9293174641729415, + "grad_norm": 27.864002227783203, + "learning_rate": 7.091968911917099e-07, + "loss": 0.5723, + "mean_token_accuracy": 0.9202898740768433, + "num_tokens": 10282489.0, + "step": 5739 + }, + { + "epoch": 0.9294793943810218, + "grad_norm": 22.53125, + "learning_rate": 7.07577720207254e-07, + "loss": 0.5625, + "mean_token_accuracy": 0.9229468703269958, + "num_tokens": 10284274.0, + "step": 5740 + }, + { + "epoch": 0.9296413245891021, + "grad_norm": 41.4619140625, + "learning_rate": 7.05958549222798e-07, + "loss": 0.7821, + "mean_token_accuracy": 0.9032891094684601, + "num_tokens": 10286055.0, + "step": 5741 + }, + { + "epoch": 0.9298032547971824, + "grad_norm": 47.84886932373047, + "learning_rate": 7.04339378238342e-07, + "loss": 0.839, + "mean_token_accuracy": 0.8907374143600464, + "num_tokens": 10287842.0, + "step": 5742 + }, + { + "epoch": 0.9299651850052627, + "grad_norm": 35.309696197509766, + "learning_rate": 7.02720207253886e-07, + "loss": 0.7763, + "mean_token_accuracy": 0.9215071499347687, + "num_tokens": 10289633.0, + "step": 5743 + }, + { + "epoch": 0.930127115213343, + "grad_norm": 24.76080322265625, + "learning_rate": 7.011010362694301e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.9190051257610321, + "num_tokens": 10291417.0, + "step": 5744 + }, + { + "epoch": 0.9302890454214233, + "grad_norm": 30.902511596679688, + "learning_rate": 6.994818652849742e-07, + "loss": 0.6837, + "mean_token_accuracy": 0.9091127216815948, + "num_tokens": 10293203.0, + "step": 5745 + }, + { + "epoch": 0.9304509756295036, + "grad_norm": 34.811954498291016, + "learning_rate": 6.978626943005182e-07, + "loss": 0.6562, + "mean_token_accuracy": 0.9221293330192566, + "num_tokens": 10294983.0, + "step": 5746 + }, + { + "epoch": 0.930612905837584, + "grad_norm": 29.93048667907715, + "learning_rate": 6.962435233160622e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.9260977506637573, + "num_tokens": 10296779.0, + "step": 5747 + }, + { + "epoch": 0.9307748360456644, + "grad_norm": 34.49272155761719, + "learning_rate": 6.946243523316062e-07, + "loss": 0.6162, + "mean_token_accuracy": 0.9203381836414337, + "num_tokens": 10298567.0, + "step": 5748 + }, + { + "epoch": 0.9309367662537447, + "grad_norm": 39.570648193359375, + "learning_rate": 6.930051813471502e-07, + "loss": 0.6321, + "mean_token_accuracy": 0.9064182341098785, + "num_tokens": 10300357.0, + "step": 5749 + }, + { + "epoch": 0.931098696461825, + "grad_norm": 33.147220611572266, + "learning_rate": 6.913860103626944e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.9224033951759338, + "num_tokens": 10302151.0, + "step": 5750 + }, + { + "epoch": 0.9312606266699053, + "grad_norm": 32.000308990478516, + "learning_rate": 6.897668393782384e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.9275209903717041, + "num_tokens": 10303939.0, + "step": 5751 + }, + { + "epoch": 0.9314225568779856, + "grad_norm": 31.271921157836914, + "learning_rate": 6.881476683937824e-07, + "loss": 0.6159, + "mean_token_accuracy": 0.9061065912246704, + "num_tokens": 10305728.0, + "step": 5752 + }, + { + "epoch": 0.9315844870860659, + "grad_norm": 34.152748107910156, + "learning_rate": 6.865284974093264e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.9229753613471985, + "num_tokens": 10307526.0, + "step": 5753 + }, + { + "epoch": 0.9317464172941462, + "grad_norm": 31.071353912353516, + "learning_rate": 6.849093264248705e-07, + "loss": 0.6629, + "mean_token_accuracy": 0.9213735461235046, + "num_tokens": 10309318.0, + "step": 5754 + }, + { + "epoch": 0.9319083475022265, + "grad_norm": 36.599456787109375, + "learning_rate": 6.832901554404146e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.9183647632598877, + "num_tokens": 10311111.0, + "step": 5755 + }, + { + "epoch": 0.9320702777103068, + "grad_norm": 12.201830863952637, + "learning_rate": 6.816709844559586e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.935803234577179, + "num_tokens": 10312903.0, + "step": 5756 + }, + { + "epoch": 0.9322322079183871, + "grad_norm": 35.14945602416992, + "learning_rate": 6.800518134715027e-07, + "loss": 0.7068, + "mean_token_accuracy": 0.9058971703052521, + "num_tokens": 10314691.0, + "step": 5757 + }, + { + "epoch": 0.9323941381264675, + "grad_norm": 29.290454864501953, + "learning_rate": 6.784326424870468e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.9232524335384369, + "num_tokens": 10316476.0, + "step": 5758 + }, + { + "epoch": 0.9325560683345478, + "grad_norm": 28.59797477722168, + "learning_rate": 6.768134715025908e-07, + "loss": 0.6128, + "mean_token_accuracy": 0.920354425907135, + "num_tokens": 10318277.0, + "step": 5759 + }, + { + "epoch": 0.9327179985426282, + "grad_norm": 30.71377944946289, + "learning_rate": 6.751943005181348e-07, + "loss": 0.5534, + "mean_token_accuracy": 0.9224945604801178, + "num_tokens": 10320060.0, + "step": 5760 + }, + { + "epoch": 0.9328799287507085, + "grad_norm": 21.75169563293457, + "learning_rate": 6.735751295336788e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.930170327425003, + "num_tokens": 10321844.0, + "step": 5761 + }, + { + "epoch": 0.9330418589587888, + "grad_norm": 29.982439041137695, + "learning_rate": 6.719559585492229e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.924717366695404, + "num_tokens": 10323635.0, + "step": 5762 + }, + { + "epoch": 0.9332037891668691, + "grad_norm": 29.308612823486328, + "learning_rate": 6.703367875647669e-07, + "loss": 0.5382, + "mean_token_accuracy": 0.923739492893219, + "num_tokens": 10325423.0, + "step": 5763 + }, + { + "epoch": 0.9333657193749494, + "grad_norm": 34.78892135620117, + "learning_rate": 6.68717616580311e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.9242281913757324, + "num_tokens": 10327212.0, + "step": 5764 + }, + { + "epoch": 0.9335276495830297, + "grad_norm": 29.851804733276367, + "learning_rate": 6.67098445595855e-07, + "loss": 0.581, + "mean_token_accuracy": 0.9195082187652588, + "num_tokens": 10329009.0, + "step": 5765 + }, + { + "epoch": 0.93368957979111, + "grad_norm": 31.255325317382812, + "learning_rate": 6.65479274611399e-07, + "loss": 0.6777, + "mean_token_accuracy": 0.9130389094352722, + "num_tokens": 10330797.0, + "step": 5766 + }, + { + "epoch": 0.9338515099991903, + "grad_norm": 23.130157470703125, + "learning_rate": 6.638601036269431e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.9293027520179749, + "num_tokens": 10332592.0, + "step": 5767 + }, + { + "epoch": 0.9340134402072706, + "grad_norm": 30.275386810302734, + "learning_rate": 6.622409326424871e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.9170751571655273, + "num_tokens": 10334393.0, + "step": 5768 + }, + { + "epoch": 0.934175370415351, + "grad_norm": 30.390335083007812, + "learning_rate": 6.606217616580311e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.9172143638134003, + "num_tokens": 10336181.0, + "step": 5769 + }, + { + "epoch": 0.9343373006234313, + "grad_norm": 31.979816436767578, + "learning_rate": 6.590025906735751e-07, + "loss": 0.5714, + "mean_token_accuracy": 0.926702469587326, + "num_tokens": 10337978.0, + "step": 5770 + }, + { + "epoch": 0.9344992308315117, + "grad_norm": 25.236547470092773, + "learning_rate": 6.573834196891192e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.929921567440033, + "num_tokens": 10339761.0, + "step": 5771 + }, + { + "epoch": 0.934661161039592, + "grad_norm": 29.61126136779785, + "learning_rate": 6.557642487046633e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.9196723699569702, + "num_tokens": 10341560.0, + "step": 5772 + }, + { + "epoch": 0.9348230912476723, + "grad_norm": 28.452234268188477, + "learning_rate": 6.541450777202073e-07, + "loss": 0.5122, + "mean_token_accuracy": 0.922222226858139, + "num_tokens": 10343342.0, + "step": 5773 + }, + { + "epoch": 0.9349850214557526, + "grad_norm": 35.27674102783203, + "learning_rate": 6.525259067357513e-07, + "loss": 0.6448, + "mean_token_accuracy": 0.9084461629390717, + "num_tokens": 10345138.0, + "step": 5774 + }, + { + "epoch": 0.9351469516638329, + "grad_norm": 18.735252380371094, + "learning_rate": 6.509067357512953e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.9319420158863068, + "num_tokens": 10346929.0, + "step": 5775 + }, + { + "epoch": 0.9353088818719132, + "grad_norm": 48.40019607543945, + "learning_rate": 6.492875647668394e-07, + "loss": 0.8597, + "mean_token_accuracy": 0.8916084170341492, + "num_tokens": 10348727.0, + "step": 5776 + }, + { + "epoch": 0.9354708120799935, + "grad_norm": 24.044849395751953, + "learning_rate": 6.476683937823834e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.9247358441352844, + "num_tokens": 10350519.0, + "step": 5777 + }, + { + "epoch": 0.9356327422880738, + "grad_norm": 42.28438949584961, + "learning_rate": 6.460492227979275e-07, + "loss": 0.6881, + "mean_token_accuracy": 0.9107666015625, + "num_tokens": 10352322.0, + "step": 5778 + }, + { + "epoch": 0.9357946724961541, + "grad_norm": 25.993921279907227, + "learning_rate": 6.444300518134716e-07, + "loss": 0.5825, + "mean_token_accuracy": 0.9184591770172119, + "num_tokens": 10354104.0, + "step": 5779 + }, + { + "epoch": 0.9359566027042345, + "grad_norm": 35.94279098510742, + "learning_rate": 6.428108808290157e-07, + "loss": 0.7603, + "mean_token_accuracy": 0.9134947061538696, + "num_tokens": 10355893.0, + "step": 5780 + }, + { + "epoch": 0.9361185329123148, + "grad_norm": 29.93583869934082, + "learning_rate": 6.411917098445597e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.9125367701053619, + "num_tokens": 10357679.0, + "step": 5781 + }, + { + "epoch": 0.9362804631203951, + "grad_norm": 38.4170036315918, + "learning_rate": 6.395725388601037e-07, + "loss": 0.6057, + "mean_token_accuracy": 0.9129201769828796, + "num_tokens": 10359467.0, + "step": 5782 + }, + { + "epoch": 0.9364423933284755, + "grad_norm": 22.595632553100586, + "learning_rate": 6.379533678756477e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.9251206517219543, + "num_tokens": 10361246.0, + "step": 5783 + }, + { + "epoch": 0.9366043235365558, + "grad_norm": 33.575931549072266, + "learning_rate": 6.363341968911919e-07, + "loss": 0.6209, + "mean_token_accuracy": 0.9157900214195251, + "num_tokens": 10363031.0, + "step": 5784 + }, + { + "epoch": 0.9367662537446361, + "grad_norm": 42.81550979614258, + "learning_rate": 6.347150259067359e-07, + "loss": 0.6505, + "mean_token_accuracy": 0.9125213921070099, + "num_tokens": 10364829.0, + "step": 5785 + }, + { + "epoch": 0.9369281839527164, + "grad_norm": 27.857851028442383, + "learning_rate": 6.330958549222799e-07, + "loss": 0.538, + "mean_token_accuracy": 0.9274108111858368, + "num_tokens": 10366616.0, + "step": 5786 + }, + { + "epoch": 0.9370901141607967, + "grad_norm": 25.998018264770508, + "learning_rate": 6.314766839378239e-07, + "loss": 0.552, + "mean_token_accuracy": 0.9184688925743103, + "num_tokens": 10368409.0, + "step": 5787 + }, + { + "epoch": 0.937252044368877, + "grad_norm": 27.31692886352539, + "learning_rate": 6.298575129533679e-07, + "loss": 0.629, + "mean_token_accuracy": 0.9177893698215485, + "num_tokens": 10370201.0, + "step": 5788 + }, + { + "epoch": 0.9374139745769573, + "grad_norm": 27.609037399291992, + "learning_rate": 6.28238341968912e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.9202718138694763, + "num_tokens": 10371976.0, + "step": 5789 + }, + { + "epoch": 0.9375759047850376, + "grad_norm": 33.85944366455078, + "learning_rate": 6.26619170984456e-07, + "loss": 0.6097, + "mean_token_accuracy": 0.920534074306488, + "num_tokens": 10373778.0, + "step": 5790 + }, + { + "epoch": 0.9377378349931179, + "grad_norm": 21.128190994262695, + "learning_rate": 6.25e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.9393243491649628, + "num_tokens": 10375570.0, + "step": 5791 + }, + { + "epoch": 0.9378997652011983, + "grad_norm": 29.066722869873047, + "learning_rate": 6.233808290155441e-07, + "loss": 0.5859, + "mean_token_accuracy": 0.9271873235702515, + "num_tokens": 10377356.0, + "step": 5792 + }, + { + "epoch": 0.9380616954092786, + "grad_norm": 30.46184730529785, + "learning_rate": 6.217616580310881e-07, + "loss": 0.5964, + "mean_token_accuracy": 0.9239901900291443, + "num_tokens": 10379145.0, + "step": 5793 + }, + { + "epoch": 0.938223625617359, + "grad_norm": 22.316194534301758, + "learning_rate": 6.201424870466322e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.9348739385604858, + "num_tokens": 10380933.0, + "step": 5794 + }, + { + "epoch": 0.9383855558254393, + "grad_norm": 29.259885787963867, + "learning_rate": 6.185233160621762e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.9182733595371246, + "num_tokens": 10382727.0, + "step": 5795 + }, + { + "epoch": 0.9385474860335196, + "grad_norm": 28.33346939086914, + "learning_rate": 6.169041450777202e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.9255319237709045, + "num_tokens": 10384521.0, + "step": 5796 + }, + { + "epoch": 0.9387094162415999, + "grad_norm": 29.44464683532715, + "learning_rate": 6.152849740932642e-07, + "loss": 0.6447, + "mean_token_accuracy": 0.9214891493320465, + "num_tokens": 10386326.0, + "step": 5797 + }, + { + "epoch": 0.9388713464496802, + "grad_norm": 35.01515197753906, + "learning_rate": 6.136658031088084e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.928385466337204, + "num_tokens": 10388118.0, + "step": 5798 + }, + { + "epoch": 0.9390332766577605, + "grad_norm": 36.888492584228516, + "learning_rate": 6.120466321243524e-07, + "loss": 0.5813, + "mean_token_accuracy": 0.9263347685337067, + "num_tokens": 10389919.0, + "step": 5799 + }, + { + "epoch": 0.9391952068658408, + "grad_norm": 34.86924362182617, + "learning_rate": 6.104274611398965e-07, + "loss": 0.6326, + "mean_token_accuracy": 0.9076797366142273, + "num_tokens": 10391711.0, + "step": 5800 + }, + { + "epoch": 0.9393571370739211, + "grad_norm": 25.177936553955078, + "learning_rate": 6.088082901554405e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.9267785847187042, + "num_tokens": 10393497.0, + "step": 5801 + }, + { + "epoch": 0.9395190672820014, + "grad_norm": 20.73735237121582, + "learning_rate": 6.071891191709845e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.9345906674861908, + "num_tokens": 10395298.0, + "step": 5802 + }, + { + "epoch": 0.9396809974900818, + "grad_norm": 26.91309356689453, + "learning_rate": 6.055699481865285e-07, + "loss": 0.5542, + "mean_token_accuracy": 0.9189356565475464, + "num_tokens": 10397094.0, + "step": 5803 + }, + { + "epoch": 0.9398429276981621, + "grad_norm": 25.612842559814453, + "learning_rate": 6.039507772020725e-07, + "loss": 0.536, + "mean_token_accuracy": 0.9233269393444061, + "num_tokens": 10398880.0, + "step": 5804 + }, + { + "epoch": 0.9400048579062424, + "grad_norm": 18.32732391357422, + "learning_rate": 6.023316062176167e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.9314168691635132, + "num_tokens": 10400668.0, + "step": 5805 + }, + { + "epoch": 0.9401667881143227, + "grad_norm": 30.667888641357422, + "learning_rate": 6.007124352331607e-07, + "loss": 0.5826, + "mean_token_accuracy": 0.9214285910129547, + "num_tokens": 10402460.0, + "step": 5806 + }, + { + "epoch": 0.9403287183224031, + "grad_norm": 30.782594680786133, + "learning_rate": 5.990932642487047e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.9192833304405212, + "num_tokens": 10404268.0, + "step": 5807 + }, + { + "epoch": 0.9404906485304834, + "grad_norm": 36.90355682373047, + "learning_rate": 5.974740932642487e-07, + "loss": 0.6794, + "mean_token_accuracy": 0.9246916770935059, + "num_tokens": 10406059.0, + "step": 5808 + }, + { + "epoch": 0.9406525787385637, + "grad_norm": 12.799569129943848, + "learning_rate": 5.958549222797927e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.9330569505691528, + "num_tokens": 10407840.0, + "step": 5809 + }, + { + "epoch": 0.940814508946644, + "grad_norm": 37.637847900390625, + "learning_rate": 5.942357512953368e-07, + "loss": 0.6563, + "mean_token_accuracy": 0.9167989492416382, + "num_tokens": 10409627.0, + "step": 5810 + }, + { + "epoch": 0.9409764391547243, + "grad_norm": 35.13724136352539, + "learning_rate": 5.92616580310881e-07, + "loss": 0.653, + "mean_token_accuracy": 0.9030934274196625, + "num_tokens": 10411415.0, + "step": 5811 + }, + { + "epoch": 0.9411383693628046, + "grad_norm": 29.806884765625, + "learning_rate": 5.90997409326425e-07, + "loss": 0.5742, + "mean_token_accuracy": 0.9264705777168274, + "num_tokens": 10413199.0, + "step": 5812 + }, + { + "epoch": 0.9413002995708849, + "grad_norm": 26.640439987182617, + "learning_rate": 5.89378238341969e-07, + "loss": 0.5137, + "mean_token_accuracy": 0.9260017573833466, + "num_tokens": 10414996.0, + "step": 5813 + }, + { + "epoch": 0.9414622297789653, + "grad_norm": 35.512027740478516, + "learning_rate": 5.87759067357513e-07, + "loss": 0.6984, + "mean_token_accuracy": 0.9146570265293121, + "num_tokens": 10416789.0, + "step": 5814 + }, + { + "epoch": 0.9416241599870456, + "grad_norm": 34.473106384277344, + "learning_rate": 5.86139896373057e-07, + "loss": 0.5895, + "mean_token_accuracy": 0.9121578335762024, + "num_tokens": 10418574.0, + "step": 5815 + }, + { + "epoch": 0.9417860901951259, + "grad_norm": 29.649921417236328, + "learning_rate": 5.845207253886011e-07, + "loss": 0.6216, + "mean_token_accuracy": 0.9138047397136688, + "num_tokens": 10420364.0, + "step": 5816 + }, + { + "epoch": 0.9419480204032062, + "grad_norm": 36.92958068847656, + "learning_rate": 5.829015544041451e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.9215603470802307, + "num_tokens": 10422168.0, + "step": 5817 + }, + { + "epoch": 0.9421099506112866, + "grad_norm": 23.391632080078125, + "learning_rate": 5.812823834196891e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.9199735522270203, + "num_tokens": 10423955.0, + "step": 5818 + }, + { + "epoch": 0.9422718808193669, + "grad_norm": 31.84099006652832, + "learning_rate": 5.796632124352332e-07, + "loss": 0.5901, + "mean_token_accuracy": 0.9063178300857544, + "num_tokens": 10425755.0, + "step": 5819 + }, + { + "epoch": 0.9424338110274472, + "grad_norm": 27.196680068969727, + "learning_rate": 5.780440414507772e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.9129368960857391, + "num_tokens": 10427544.0, + "step": 5820 + }, + { + "epoch": 0.9425957412355275, + "grad_norm": 19.640888214111328, + "learning_rate": 5.764248704663213e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.9330845773220062, + "num_tokens": 10429340.0, + "step": 5821 + }, + { + "epoch": 0.9427576714436078, + "grad_norm": 50.8370246887207, + "learning_rate": 5.748056994818654e-07, + "loss": 0.7706, + "mean_token_accuracy": 0.8990526497364044, + "num_tokens": 10431130.0, + "step": 5822 + }, + { + "epoch": 0.9429196016516881, + "grad_norm": 27.670997619628906, + "learning_rate": 5.731865284974094e-07, + "loss": 0.6112, + "mean_token_accuracy": 0.9309405386447906, + "num_tokens": 10432917.0, + "step": 5823 + }, + { + "epoch": 0.9430815318597684, + "grad_norm": 25.704076766967773, + "learning_rate": 5.715673575129534e-07, + "loss": 0.51, + "mean_token_accuracy": 0.9206287264823914, + "num_tokens": 10434706.0, + "step": 5824 + }, + { + "epoch": 0.9432434620678487, + "grad_norm": 26.186450958251953, + "learning_rate": 5.699481865284974e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.9260194301605225, + "num_tokens": 10436488.0, + "step": 5825 + }, + { + "epoch": 0.9434053922759291, + "grad_norm": 17.411903381347656, + "learning_rate": 5.683290155440415e-07, + "loss": 0.447, + "mean_token_accuracy": 0.9361573457717896, + "num_tokens": 10438282.0, + "step": 5826 + }, + { + "epoch": 0.9435673224840094, + "grad_norm": 35.3471794128418, + "learning_rate": 5.667098445595856e-07, + "loss": 0.6162, + "mean_token_accuracy": 0.9028058052062988, + "num_tokens": 10440072.0, + "step": 5827 + }, + { + "epoch": 0.9437292526920897, + "grad_norm": 27.19516372680664, + "learning_rate": 5.650906735751296e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.9184841811656952, + "num_tokens": 10441855.0, + "step": 5828 + }, + { + "epoch": 0.94389118290017, + "grad_norm": 36.242149353027344, + "learning_rate": 5.634715025906736e-07, + "loss": 0.6853, + "mean_token_accuracy": 0.9169794619083405, + "num_tokens": 10443644.0, + "step": 5829 + }, + { + "epoch": 0.9440531131082504, + "grad_norm": 30.05845832824707, + "learning_rate": 5.618523316062176e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.931231677532196, + "num_tokens": 10445447.0, + "step": 5830 + }, + { + "epoch": 0.9442150433163307, + "grad_norm": 25.900711059570312, + "learning_rate": 5.602331606217616e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.9367478787899017, + "num_tokens": 10447244.0, + "step": 5831 + }, + { + "epoch": 0.944376973524411, + "grad_norm": 33.00969314575195, + "learning_rate": 5.586139896373058e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.9230356216430664, + "num_tokens": 10449029.0, + "step": 5832 + }, + { + "epoch": 0.9445389037324913, + "grad_norm": 28.1531925201416, + "learning_rate": 5.569948186528498e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.9274159669876099, + "num_tokens": 10450817.0, + "step": 5833 + }, + { + "epoch": 0.9447008339405716, + "grad_norm": 30.398107528686523, + "learning_rate": 5.553756476683939e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.9264705777168274, + "num_tokens": 10452601.0, + "step": 5834 + }, + { + "epoch": 0.9448627641486519, + "grad_norm": 34.01579666137695, + "learning_rate": 5.537564766839379e-07, + "loss": 0.6909, + "mean_token_accuracy": 0.9025560617446899, + "num_tokens": 10454390.0, + "step": 5835 + }, + { + "epoch": 0.9450246943567322, + "grad_norm": 34.65859603881836, + "learning_rate": 5.521373056994819e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.9172320365905762, + "num_tokens": 10456179.0, + "step": 5836 + }, + { + "epoch": 0.9451866245648126, + "grad_norm": 28.093215942382812, + "learning_rate": 5.505181347150259e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.9314554035663605, + "num_tokens": 10457968.0, + "step": 5837 + }, + { + "epoch": 0.9453485547728929, + "grad_norm": 41.703670501708984, + "learning_rate": 5.4889896373057e-07, + "loss": 0.6117, + "mean_token_accuracy": 0.9221778213977814, + "num_tokens": 10459763.0, + "step": 5838 + }, + { + "epoch": 0.9455104849809732, + "grad_norm": 31.17424964904785, + "learning_rate": 5.472797927461141e-07, + "loss": 0.6505, + "mean_token_accuracy": 0.9208592176437378, + "num_tokens": 10461553.0, + "step": 5839 + }, + { + "epoch": 0.9456724151890535, + "grad_norm": 26.08766746520996, + "learning_rate": 5.456606217616581e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.9295634925365448, + "num_tokens": 10463349.0, + "step": 5840 + }, + { + "epoch": 0.9458343453971338, + "grad_norm": 38.63079071044922, + "learning_rate": 5.440414507772021e-07, + "loss": 0.6373, + "mean_token_accuracy": 0.9077271521091461, + "num_tokens": 10465143.0, + "step": 5841 + }, + { + "epoch": 0.9459962756052142, + "grad_norm": 35.97124099731445, + "learning_rate": 5.424222797927461e-07, + "loss": 0.6741, + "mean_token_accuracy": 0.9083965718746185, + "num_tokens": 10466939.0, + "step": 5842 + }, + { + "epoch": 0.9461582058132945, + "grad_norm": 30.48563003540039, + "learning_rate": 5.408031088082902e-07, + "loss": 0.5578, + "mean_token_accuracy": 0.9144777357578278, + "num_tokens": 10468719.0, + "step": 5843 + }, + { + "epoch": 0.9463201360213748, + "grad_norm": 37.033348083496094, + "learning_rate": 5.391839378238342e-07, + "loss": 0.564, + "mean_token_accuracy": 0.9245690703392029, + "num_tokens": 10470523.0, + "step": 5844 + }, + { + "epoch": 0.9464820662294551, + "grad_norm": 25.39202308654785, + "learning_rate": 5.375647668393782e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.9207701981067657, + "num_tokens": 10472311.0, + "step": 5845 + }, + { + "epoch": 0.9466439964375354, + "grad_norm": 27.598430633544922, + "learning_rate": 5.359455958549224e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.9278229773044586, + "num_tokens": 10474100.0, + "step": 5846 + }, + { + "epoch": 0.9468059266456157, + "grad_norm": 36.57701110839844, + "learning_rate": 5.343264248704664e-07, + "loss": 0.5869, + "mean_token_accuracy": 0.916979968547821, + "num_tokens": 10475877.0, + "step": 5847 + }, + { + "epoch": 0.9469678568536961, + "grad_norm": 22.122133255004883, + "learning_rate": 5.327072538860104e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.9289623200893402, + "num_tokens": 10477668.0, + "step": 5848 + }, + { + "epoch": 0.9471297870617764, + "grad_norm": 23.43597984313965, + "learning_rate": 5.310880829015545e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.9350432753562927, + "num_tokens": 10479457.0, + "step": 5849 + }, + { + "epoch": 0.9472917172698567, + "grad_norm": 37.66163635253906, + "learning_rate": 5.294689119170985e-07, + "loss": 0.776, + "mean_token_accuracy": 0.8942778706550598, + "num_tokens": 10481252.0, + "step": 5850 + }, + { + "epoch": 0.947453647477937, + "grad_norm": 26.512353897094727, + "learning_rate": 5.278497409326425e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.9245029091835022, + "num_tokens": 10483042.0, + "step": 5851 + }, + { + "epoch": 0.9476155776860173, + "grad_norm": 23.671476364135742, + "learning_rate": 5.262305699481865e-07, + "loss": 0.6051, + "mean_token_accuracy": 0.913717657327652, + "num_tokens": 10484844.0, + "step": 5852 + }, + { + "epoch": 0.9477775078940976, + "grad_norm": 23.282915115356445, + "learning_rate": 5.246113989637306e-07, + "loss": 0.5693, + "mean_token_accuracy": 0.9245370626449585, + "num_tokens": 10486635.0, + "step": 5853 + }, + { + "epoch": 0.947939438102178, + "grad_norm": 28.932723999023438, + "learning_rate": 5.229922279792747e-07, + "loss": 0.5824, + "mean_token_accuracy": 0.9268666207790375, + "num_tokens": 10488434.0, + "step": 5854 + }, + { + "epoch": 0.9481013683102583, + "grad_norm": 27.782014846801758, + "learning_rate": 5.213730569948187e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.9244604408740997, + "num_tokens": 10490224.0, + "step": 5855 + }, + { + "epoch": 0.9482632985183386, + "grad_norm": 25.03302764892578, + "learning_rate": 5.197538860103627e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.92771115899086, + "num_tokens": 10492013.0, + "step": 5856 + }, + { + "epoch": 0.9484252287264189, + "grad_norm": 35.20737838745117, + "learning_rate": 5.181347150259067e-07, + "loss": 0.7127, + "mean_token_accuracy": 0.9011110067367554, + "num_tokens": 10493798.0, + "step": 5857 + }, + { + "epoch": 0.9485871589344992, + "grad_norm": 20.374496459960938, + "learning_rate": 5.165155440414508e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.9309523701667786, + "num_tokens": 10495585.0, + "step": 5858 + }, + { + "epoch": 0.9487490891425795, + "grad_norm": 32.45448303222656, + "learning_rate": 5.148963730569948e-07, + "loss": 0.5677, + "mean_token_accuracy": 0.9218812882900238, + "num_tokens": 10497379.0, + "step": 5859 + }, + { + "epoch": 0.9489110193506599, + "grad_norm": 33.16579055786133, + "learning_rate": 5.13277202072539e-07, + "loss": 0.584, + "mean_token_accuracy": 0.9181104600429535, + "num_tokens": 10499172.0, + "step": 5860 + }, + { + "epoch": 0.9490729495587402, + "grad_norm": 29.40290069580078, + "learning_rate": 5.11658031088083e-07, + "loss": 0.5612, + "mean_token_accuracy": 0.9170315861701965, + "num_tokens": 10500961.0, + "step": 5861 + }, + { + "epoch": 0.9492348797668205, + "grad_norm": 28.405311584472656, + "learning_rate": 5.10038860103627e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.9284502267837524, + "num_tokens": 10502752.0, + "step": 5862 + }, + { + "epoch": 0.9493968099749008, + "grad_norm": 29.962665557861328, + "learning_rate": 5.08419689119171e-07, + "loss": 0.5613, + "mean_token_accuracy": 0.9136690497398376, + "num_tokens": 10504542.0, + "step": 5863 + }, + { + "epoch": 0.9495587401829811, + "grad_norm": 26.08298110961914, + "learning_rate": 5.06800518134715e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.9237206876277924, + "num_tokens": 10506328.0, + "step": 5864 + }, + { + "epoch": 0.9497206703910615, + "grad_norm": 35.55826950073242, + "learning_rate": 5.051813471502591e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.9112173020839691, + "num_tokens": 10508122.0, + "step": 5865 + }, + { + "epoch": 0.9498826005991418, + "grad_norm": 32.004852294921875, + "learning_rate": 5.035621761658031e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.9157631993293762, + "num_tokens": 10509907.0, + "step": 5866 + }, + { + "epoch": 0.9500445308072221, + "grad_norm": 21.698143005371094, + "learning_rate": 5.019430051813472e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.9259259402751923, + "num_tokens": 10511689.0, + "step": 5867 + }, + { + "epoch": 0.9502064610153024, + "grad_norm": 30.377098083496094, + "learning_rate": 5.003238341968912e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.9244965612888336, + "num_tokens": 10513479.0, + "step": 5868 + }, + { + "epoch": 0.9503683912233827, + "grad_norm": 23.717273712158203, + "learning_rate": 4.987046632124353e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.9337513446807861, + "num_tokens": 10515262.0, + "step": 5869 + }, + { + "epoch": 0.950530321431463, + "grad_norm": 19.434326171875, + "learning_rate": 4.970854922279793e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.9277893602848053, + "num_tokens": 10517051.0, + "step": 5870 + }, + { + "epoch": 0.9506922516395434, + "grad_norm": 36.024112701416016, + "learning_rate": 4.954663212435234e-07, + "loss": 0.8456, + "mean_token_accuracy": 0.90591099858284, + "num_tokens": 10518846.0, + "step": 5871 + }, + { + "epoch": 0.9508541818476237, + "grad_norm": 43.867584228515625, + "learning_rate": 4.938471502590674e-07, + "loss": 0.6232, + "mean_token_accuracy": 0.9055784940719604, + "num_tokens": 10520633.0, + "step": 5872 + }, + { + "epoch": 0.951016112055704, + "grad_norm": 39.61911392211914, + "learning_rate": 4.922279792746115e-07, + "loss": 0.6962, + "mean_token_accuracy": 0.9130696952342987, + "num_tokens": 10522433.0, + "step": 5873 + }, + { + "epoch": 0.9511780422637843, + "grad_norm": 17.039655685424805, + "learning_rate": 4.906088082901555e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.9372208416461945, + "num_tokens": 10524216.0, + "step": 5874 + }, + { + "epoch": 0.9513399724718646, + "grad_norm": 46.72130584716797, + "learning_rate": 4.889896373056995e-07, + "loss": 0.748, + "mean_token_accuracy": 0.9018268287181854, + "num_tokens": 10526011.0, + "step": 5875 + }, + { + "epoch": 0.9515019026799449, + "grad_norm": 29.14239501953125, + "learning_rate": 4.873704663212436e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 10527801.0, + "step": 5876 + }, + { + "epoch": 0.9516638328880253, + "grad_norm": 20.54410171508789, + "learning_rate": 4.857512953367876e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.9325833320617676, + "num_tokens": 10529580.0, + "step": 5877 + }, + { + "epoch": 0.9518257630961056, + "grad_norm": 34.51727294921875, + "learning_rate": 4.841321243523316e-07, + "loss": 0.6738, + "mean_token_accuracy": 0.9103787243366241, + "num_tokens": 10531381.0, + "step": 5878 + }, + { + "epoch": 0.9519876933041859, + "grad_norm": 40.768619537353516, + "learning_rate": 4.825129533678756e-07, + "loss": 0.6622, + "mean_token_accuracy": 0.9035272896289825, + "num_tokens": 10533185.0, + "step": 5879 + }, + { + "epoch": 0.9521496235122662, + "grad_norm": 38.52097702026367, + "learning_rate": 4.808937823834198e-07, + "loss": 0.6334, + "mean_token_accuracy": 0.913214385509491, + "num_tokens": 10534985.0, + "step": 5880 + }, + { + "epoch": 0.9523115537203465, + "grad_norm": 31.541362762451172, + "learning_rate": 4.792746113989638e-07, + "loss": 0.5962, + "mean_token_accuracy": 0.9225879609584808, + "num_tokens": 10536782.0, + "step": 5881 + }, + { + "epoch": 0.9524734839284269, + "grad_norm": 20.68697166442871, + "learning_rate": 4.776554404145079e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.9324290156364441, + "num_tokens": 10538575.0, + "step": 5882 + }, + { + "epoch": 0.9526354141365072, + "grad_norm": 39.71917724609375, + "learning_rate": 4.760362694300519e-07, + "loss": 0.6931, + "mean_token_accuracy": 0.9010036587715149, + "num_tokens": 10540368.0, + "step": 5883 + }, + { + "epoch": 0.9527973443445875, + "grad_norm": 21.238994598388672, + "learning_rate": 4.744170984455959e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.9281055927276611, + "num_tokens": 10542158.0, + "step": 5884 + }, + { + "epoch": 0.9529592745526678, + "grad_norm": 43.01884460449219, + "learning_rate": 4.727979274611399e-07, + "loss": 0.6987, + "mean_token_accuracy": 0.9073708951473236, + "num_tokens": 10543962.0, + "step": 5885 + }, + { + "epoch": 0.9531212047607481, + "grad_norm": 17.031818389892578, + "learning_rate": 4.71178756476684e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.9368039667606354, + "num_tokens": 10545759.0, + "step": 5886 + }, + { + "epoch": 0.9532831349688284, + "grad_norm": 23.41274642944336, + "learning_rate": 4.69559585492228e-07, + "loss": 0.5103, + "mean_token_accuracy": 0.9263271987438202, + "num_tokens": 10547556.0, + "step": 5887 + }, + { + "epoch": 0.9534450651769087, + "grad_norm": 27.527408599853516, + "learning_rate": 4.6794041450777207e-07, + "loss": 0.5717, + "mean_token_accuracy": 0.9177459180355072, + "num_tokens": 10549347.0, + "step": 5888 + }, + { + "epoch": 0.953606995384989, + "grad_norm": 25.210742950439453, + "learning_rate": 4.663212435233161e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.9309873878955841, + "num_tokens": 10551135.0, + "step": 5889 + }, + { + "epoch": 0.9537689255930694, + "grad_norm": 32.522613525390625, + "learning_rate": 4.6470207253886015e-07, + "loss": 0.7416, + "mean_token_accuracy": 0.9124854505062103, + "num_tokens": 10552922.0, + "step": 5890 + }, + { + "epoch": 0.9539308558011497, + "grad_norm": 37.279945373535156, + "learning_rate": 4.6308290155440416e-07, + "loss": 0.6317, + "mean_token_accuracy": 0.9061627388000488, + "num_tokens": 10554720.0, + "step": 5891 + }, + { + "epoch": 0.95409278600923, + "grad_norm": 29.628122329711914, + "learning_rate": 4.614637305699482e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.9250216782093048, + "num_tokens": 10556512.0, + "step": 5892 + }, + { + "epoch": 0.9542547162173104, + "grad_norm": 36.75865173339844, + "learning_rate": 4.5984455958549224e-07, + "loss": 0.5909, + "mean_token_accuracy": 0.9177224338054657, + "num_tokens": 10558313.0, + "step": 5893 + }, + { + "epoch": 0.9544166464253907, + "grad_norm": 38.11921310424805, + "learning_rate": 4.5822538860103636e-07, + "loss": 0.6103, + "mean_token_accuracy": 0.9054532945156097, + "num_tokens": 10560100.0, + "step": 5894 + }, + { + "epoch": 0.954578576633471, + "grad_norm": 33.460697174072266, + "learning_rate": 4.566062176165804e-07, + "loss": 0.62, + "mean_token_accuracy": 0.9171499609947205, + "num_tokens": 10561890.0, + "step": 5895 + }, + { + "epoch": 0.9547405068415513, + "grad_norm": 30.896888732910156, + "learning_rate": 4.549870466321244e-07, + "loss": 0.553, + "mean_token_accuracy": 0.9255921542644501, + "num_tokens": 10563686.0, + "step": 5896 + }, + { + "epoch": 0.9549024370496316, + "grad_norm": 23.847209930419922, + "learning_rate": 4.5336787564766845e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.9306978285312653, + "num_tokens": 10565472.0, + "step": 5897 + }, + { + "epoch": 0.9550643672577119, + "grad_norm": 31.429126739501953, + "learning_rate": 4.5174870466321247e-07, + "loss": 0.6237, + "mean_token_accuracy": 0.9162161946296692, + "num_tokens": 10567272.0, + "step": 5898 + }, + { + "epoch": 0.9552262974657922, + "grad_norm": 27.474557876586914, + "learning_rate": 4.5012953367875653e-07, + "loss": 0.5835, + "mean_token_accuracy": 0.9202898740768433, + "num_tokens": 10569060.0, + "step": 5899 + }, + { + "epoch": 0.9553882276738725, + "grad_norm": 52.21749496459961, + "learning_rate": 4.4851036269430055e-07, + "loss": 1.0172, + "mean_token_accuracy": 0.8912636637687683, + "num_tokens": 10570857.0, + "step": 5900 + }, + { + "epoch": 0.9555501578819529, + "grad_norm": 33.61493682861328, + "learning_rate": 4.468911917098446e-07, + "loss": 0.6756, + "mean_token_accuracy": 0.9161654114723206, + "num_tokens": 10572642.0, + "step": 5901 + }, + { + "epoch": 0.9557120880900332, + "grad_norm": 16.410654067993164, + "learning_rate": 4.452720207253886e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.9313203990459442, + "num_tokens": 10574431.0, + "step": 5902 + }, + { + "epoch": 0.9558740182981135, + "grad_norm": 37.07381057739258, + "learning_rate": 4.4365284974093264e-07, + "loss": 0.6728, + "mean_token_accuracy": 0.907142847776413, + "num_tokens": 10576233.0, + "step": 5903 + }, + { + "epoch": 0.9560359485061938, + "grad_norm": 32.26778030395508, + "learning_rate": 4.420336787564767e-07, + "loss": 0.5842, + "mean_token_accuracy": 0.920152485370636, + "num_tokens": 10578020.0, + "step": 5904 + }, + { + "epoch": 0.9561978787142742, + "grad_norm": 36.27422332763672, + "learning_rate": 4.404145077720207e-07, + "loss": 0.7774, + "mean_token_accuracy": 0.9026908576488495, + "num_tokens": 10579809.0, + "step": 5905 + }, + { + "epoch": 0.9563598089223545, + "grad_norm": 28.081642150878906, + "learning_rate": 4.3879533678756484e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.9258669912815094, + "num_tokens": 10581591.0, + "step": 5906 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 23.881322860717773, + "learning_rate": 4.3717616580310885e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.9328738451004028, + "num_tokens": 10583386.0, + "step": 5907 + }, + { + "epoch": 0.9566836693385151, + "grad_norm": 42.368778228759766, + "learning_rate": 4.355569948186529e-07, + "loss": 0.826, + "mean_token_accuracy": 0.8999671041965485, + "num_tokens": 10585168.0, + "step": 5908 + }, + { + "epoch": 0.9568455995465954, + "grad_norm": 27.772842407226562, + "learning_rate": 4.3393782383419693e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.9305555522441864, + "num_tokens": 10586968.0, + "step": 5909 + }, + { + "epoch": 0.9570075297546757, + "grad_norm": 29.969303131103516, + "learning_rate": 4.32318652849741e-07, + "loss": 0.525, + "mean_token_accuracy": 0.9265023469924927, + "num_tokens": 10588766.0, + "step": 5910 + }, + { + "epoch": 0.957169459962756, + "grad_norm": 26.879058837890625, + "learning_rate": 4.30699481865285e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.92127725481987, + "num_tokens": 10590557.0, + "step": 5911 + }, + { + "epoch": 0.9573313901708363, + "grad_norm": 29.942100524902344, + "learning_rate": 4.2908031088082907e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.9240978360176086, + "num_tokens": 10592346.0, + "step": 5912 + }, + { + "epoch": 0.9574933203789167, + "grad_norm": 29.039840698242188, + "learning_rate": 4.274611398963731e-07, + "loss": 0.603, + "mean_token_accuracy": 0.9223621189594269, + "num_tokens": 10594141.0, + "step": 5913 + }, + { + "epoch": 0.957655250586997, + "grad_norm": 37.86997985839844, + "learning_rate": 4.258419689119171e-07, + "loss": 0.6288, + "mean_token_accuracy": 0.9067164361476898, + "num_tokens": 10595921.0, + "step": 5914 + }, + { + "epoch": 0.9578171807950773, + "grad_norm": 23.851858139038086, + "learning_rate": 4.2422279792746117e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.920639842748642, + "num_tokens": 10597711.0, + "step": 5915 + }, + { + "epoch": 0.9579791110031577, + "grad_norm": 42.675838470458984, + "learning_rate": 4.226036269430052e-07, + "loss": 0.7095, + "mean_token_accuracy": 0.9021739363670349, + "num_tokens": 10599499.0, + "step": 5916 + }, + { + "epoch": 0.958141041211238, + "grad_norm": 54.55388641357422, + "learning_rate": 4.2098445595854924e-07, + "loss": 0.8528, + "mean_token_accuracy": 0.887012243270874, + "num_tokens": 10601301.0, + "step": 5917 + }, + { + "epoch": 0.9583029714193183, + "grad_norm": 38.37525939941406, + "learning_rate": 4.193652849740933e-07, + "loss": 0.5969, + "mean_token_accuracy": 0.9152597486972809, + "num_tokens": 10603096.0, + "step": 5918 + }, + { + "epoch": 0.9584649016273986, + "grad_norm": 33.2430305480957, + "learning_rate": 4.177461139896374e-07, + "loss": 0.526, + "mean_token_accuracy": 0.9164961278438568, + "num_tokens": 10604895.0, + "step": 5919 + }, + { + "epoch": 0.9586268318354789, + "grad_norm": 24.482637405395508, + "learning_rate": 4.161269430051814e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.9333201944828033, + "num_tokens": 10606692.0, + "step": 5920 + }, + { + "epoch": 0.9587887620435592, + "grad_norm": 32.0526237487793, + "learning_rate": 4.1450777202072546e-07, + "loss": 0.579, + "mean_token_accuracy": 0.9181353747844696, + "num_tokens": 10608484.0, + "step": 5921 + }, + { + "epoch": 0.9589506922516395, + "grad_norm": 28.71051597595215, + "learning_rate": 4.1288860103626947e-07, + "loss": 0.5547, + "mean_token_accuracy": 0.9195210933685303, + "num_tokens": 10610269.0, + "step": 5922 + }, + { + "epoch": 0.9591126224597198, + "grad_norm": 23.614892959594727, + "learning_rate": 4.1126943005181353e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.9240615367889404, + "num_tokens": 10612058.0, + "step": 5923 + }, + { + "epoch": 0.9592745526678002, + "grad_norm": 32.31985092163086, + "learning_rate": 4.0965025906735755e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.9255583882331848, + "num_tokens": 10613839.0, + "step": 5924 + }, + { + "epoch": 0.9594364828758805, + "grad_norm": 41.85490417480469, + "learning_rate": 4.0803108808290156e-07, + "loss": 0.589, + "mean_token_accuracy": 0.9192523658275604, + "num_tokens": 10615636.0, + "step": 5925 + }, + { + "epoch": 0.9595984130839608, + "grad_norm": 49.366825103759766, + "learning_rate": 4.0641191709844563e-07, + "loss": 0.9697, + "mean_token_accuracy": 0.8929684460163116, + "num_tokens": 10617435.0, + "step": 5926 + }, + { + "epoch": 0.9597603432920412, + "grad_norm": 23.694772720336914, + "learning_rate": 4.0479274611398964e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.9331349432468414, + "num_tokens": 10619231.0, + "step": 5927 + }, + { + "epoch": 0.9599222735001215, + "grad_norm": 27.595884323120117, + "learning_rate": 4.031735751295337e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.9216987490653992, + "num_tokens": 10621023.0, + "step": 5928 + }, + { + "epoch": 0.9600842037082018, + "grad_norm": 39.44364547729492, + "learning_rate": 4.015544041450777e-07, + "loss": 0.6418, + "mean_token_accuracy": 0.9181104600429535, + "num_tokens": 10622816.0, + "step": 5929 + }, + { + "epoch": 0.9602461339162821, + "grad_norm": 31.61448860168457, + "learning_rate": 3.9993523316062184e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.9178589880466461, + "num_tokens": 10624607.0, + "step": 5930 + }, + { + "epoch": 0.9604080641243624, + "grad_norm": 24.970609664916992, + "learning_rate": 3.9831606217616585e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.9246819317340851, + "num_tokens": 10626385.0, + "step": 5931 + }, + { + "epoch": 0.9605699943324427, + "grad_norm": 33.35499572753906, + "learning_rate": 3.966968911917099e-07, + "loss": 0.5944, + "mean_token_accuracy": 0.9234962463378906, + "num_tokens": 10628170.0, + "step": 5932 + }, + { + "epoch": 0.960731924540523, + "grad_norm": 20.065067291259766, + "learning_rate": 3.9507772020725393e-07, + "loss": 0.459, + "mean_token_accuracy": 0.9368659555912018, + "num_tokens": 10629967.0, + "step": 5933 + }, + { + "epoch": 0.9608938547486033, + "grad_norm": 26.598369598388672, + "learning_rate": 3.93458549222798e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.9282888174057007, + "num_tokens": 10631758.0, + "step": 5934 + }, + { + "epoch": 0.9610557849566836, + "grad_norm": 32.36192321777344, + "learning_rate": 3.91839378238342e-07, + "loss": 0.6004, + "mean_token_accuracy": 0.9073112308979034, + "num_tokens": 10633559.0, + "step": 5935 + }, + { + "epoch": 0.961217715164764, + "grad_norm": 28.539447784423828, + "learning_rate": 3.90220207253886e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.9290241301059723, + "num_tokens": 10635353.0, + "step": 5936 + }, + { + "epoch": 0.9613796453728443, + "grad_norm": 36.23398208618164, + "learning_rate": 3.886010362694301e-07, + "loss": 0.5983, + "mean_token_accuracy": 0.9195847511291504, + "num_tokens": 10637150.0, + "step": 5937 + }, + { + "epoch": 0.9615415755809246, + "grad_norm": 33.15549087524414, + "learning_rate": 3.869818652849741e-07, + "loss": 0.6119, + "mean_token_accuracy": 0.924199789762497, + "num_tokens": 10638939.0, + "step": 5938 + }, + { + "epoch": 0.961703505789005, + "grad_norm": 26.716331481933594, + "learning_rate": 3.8536269430051817e-07, + "loss": 0.6122, + "mean_token_accuracy": 0.9200627207756042, + "num_tokens": 10640728.0, + "step": 5939 + }, + { + "epoch": 0.9618654359970853, + "grad_norm": 15.944022178649902, + "learning_rate": 3.837435233160622e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.9378464818000793, + "num_tokens": 10642514.0, + "step": 5940 + }, + { + "epoch": 0.9620273662051656, + "grad_norm": 43.929901123046875, + "learning_rate": 3.821243523316062e-07, + "loss": 0.6901, + "mean_token_accuracy": 0.9004509150981903, + "num_tokens": 10644318.0, + "step": 5941 + }, + { + "epoch": 0.9621892964132459, + "grad_norm": 20.468719482421875, + "learning_rate": 3.805051813471503e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.9345710575580597, + "num_tokens": 10646105.0, + "step": 5942 + }, + { + "epoch": 0.9623512266213262, + "grad_norm": 26.64945411682129, + "learning_rate": 3.788860103626944e-07, + "loss": 0.5983, + "mean_token_accuracy": 0.9306386113166809, + "num_tokens": 10647905.0, + "step": 5943 + }, + { + "epoch": 0.9625131568294065, + "grad_norm": 41.12892532348633, + "learning_rate": 3.772668393782384e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.9156040847301483, + "num_tokens": 10649701.0, + "step": 5944 + }, + { + "epoch": 0.9626750870374868, + "grad_norm": 37.25678253173828, + "learning_rate": 3.756476683937824e-07, + "loss": 0.6089, + "mean_token_accuracy": 0.9087215662002563, + "num_tokens": 10651498.0, + "step": 5945 + }, + { + "epoch": 0.9628370172455671, + "grad_norm": 42.789878845214844, + "learning_rate": 3.7402849740932647e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.9198232591152191, + "num_tokens": 10653297.0, + "step": 5946 + }, + { + "epoch": 0.9629989474536474, + "grad_norm": 36.14419937133789, + "learning_rate": 3.724093264248705e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.9062924981117249, + "num_tokens": 10655096.0, + "step": 5947 + }, + { + "epoch": 0.9631608776617278, + "grad_norm": 29.066207885742188, + "learning_rate": 3.7079015544041455e-07, + "loss": 0.496, + "mean_token_accuracy": 0.9302737712860107, + "num_tokens": 10656881.0, + "step": 5948 + }, + { + "epoch": 0.9633228078698081, + "grad_norm": 16.833707809448242, + "learning_rate": 3.6917098445595856e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.9347826242446899, + "num_tokens": 10658669.0, + "step": 5949 + }, + { + "epoch": 0.9634847380778885, + "grad_norm": 40.5849723815918, + "learning_rate": 3.6755181347150263e-07, + "loss": 0.8734, + "mean_token_accuracy": 0.8952994644641876, + "num_tokens": 10660459.0, + "step": 5950 + }, + { + "epoch": 0.9636466682859688, + "grad_norm": 30.560302734375, + "learning_rate": 3.6593264248704664e-07, + "loss": 0.6522, + "mean_token_accuracy": 0.9181357622146606, + "num_tokens": 10662252.0, + "step": 5951 + }, + { + "epoch": 0.9638085984940491, + "grad_norm": 33.43016815185547, + "learning_rate": 3.6431347150259065e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.9249196350574493, + "num_tokens": 10664044.0, + "step": 5952 + }, + { + "epoch": 0.9639705287021294, + "grad_norm": 28.4686279296875, + "learning_rate": 3.626943005181347e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.9294314384460449, + "num_tokens": 10665839.0, + "step": 5953 + }, + { + "epoch": 0.9641324589102097, + "grad_norm": 19.430391311645508, + "learning_rate": 3.6107512953367884e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.9301540851593018, + "num_tokens": 10667637.0, + "step": 5954 + }, + { + "epoch": 0.96429438911829, + "grad_norm": 31.777631759643555, + "learning_rate": 3.5945595854922285e-07, + "loss": 0.6552, + "mean_token_accuracy": 0.9137681126594543, + "num_tokens": 10669427.0, + "step": 5955 + }, + { + "epoch": 0.9644563193263703, + "grad_norm": 40.9117317199707, + "learning_rate": 3.5783678756476687e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.916402131319046, + "num_tokens": 10671214.0, + "step": 5956 + }, + { + "epoch": 0.9646182495344506, + "grad_norm": 37.1632194519043, + "learning_rate": 3.5621761658031093e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.9131466746330261, + "num_tokens": 10673002.0, + "step": 5957 + }, + { + "epoch": 0.9647801797425309, + "grad_norm": 21.992605209350586, + "learning_rate": 3.5459844559585494e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.937282145023346, + "num_tokens": 10674785.0, + "step": 5958 + }, + { + "epoch": 0.9649421099506112, + "grad_norm": 36.02892303466797, + "learning_rate": 3.52979274611399e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.9150150120258331, + "num_tokens": 10676580.0, + "step": 5959 + }, + { + "epoch": 0.9651040401586916, + "grad_norm": 27.73234748840332, + "learning_rate": 3.51360103626943e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.9242115616798401, + "num_tokens": 10678369.0, + "step": 5960 + }, + { + "epoch": 0.965265970366772, + "grad_norm": 25.672147750854492, + "learning_rate": 3.497409326424871e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.9201631844043732, + "num_tokens": 10680156.0, + "step": 5961 + }, + { + "epoch": 0.9654279005748523, + "grad_norm": 25.759294509887695, + "learning_rate": 3.481217616580311e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.9189557433128357, + "num_tokens": 10681939.0, + "step": 5962 + }, + { + "epoch": 0.9655898307829326, + "grad_norm": 38.69147491455078, + "learning_rate": 3.465025906735751e-07, + "loss": 0.55, + "mean_token_accuracy": 0.9253824651241302, + "num_tokens": 10683734.0, + "step": 5963 + }, + { + "epoch": 0.9657517609910129, + "grad_norm": 36.372276306152344, + "learning_rate": 3.448834196891192e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.9181021451950073, + "num_tokens": 10685527.0, + "step": 5964 + }, + { + "epoch": 0.9659136911990932, + "grad_norm": 27.32954978942871, + "learning_rate": 3.432642487046632e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.9296296238899231, + "num_tokens": 10687309.0, + "step": 5965 + }, + { + "epoch": 0.9660756214071735, + "grad_norm": 35.11532211303711, + "learning_rate": 3.416450777202073e-07, + "loss": 0.617, + "mean_token_accuracy": 0.9107518196105957, + "num_tokens": 10689090.0, + "step": 5966 + }, + { + "epoch": 0.9662375516152538, + "grad_norm": 31.95028305053711, + "learning_rate": 3.4002590673575133e-07, + "loss": 0.5947, + "mean_token_accuracy": 0.914434403181076, + "num_tokens": 10690882.0, + "step": 5967 + }, + { + "epoch": 0.9663994818233341, + "grad_norm": 40.09518814086914, + "learning_rate": 3.384067357512954e-07, + "loss": 0.5634, + "mean_token_accuracy": 0.9173708856105804, + "num_tokens": 10692671.0, + "step": 5968 + }, + { + "epoch": 0.9665614120314144, + "grad_norm": 34.51797866821289, + "learning_rate": 3.367875647668394e-07, + "loss": 0.619, + "mean_token_accuracy": 0.9096527099609375, + "num_tokens": 10694460.0, + "step": 5969 + }, + { + "epoch": 0.9667233422394947, + "grad_norm": 26.454313278198242, + "learning_rate": 3.3516839378238347e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.9162003695964813, + "num_tokens": 10696246.0, + "step": 5970 + }, + { + "epoch": 0.966885272447575, + "grad_norm": 41.99522399902344, + "learning_rate": 3.335492227979275e-07, + "loss": 0.672, + "mean_token_accuracy": 0.9021767675876617, + "num_tokens": 10698034.0, + "step": 5971 + }, + { + "epoch": 0.9670472026556554, + "grad_norm": 36.688777923583984, + "learning_rate": 3.3193005181347155e-07, + "loss": 0.6112, + "mean_token_accuracy": 0.9166666865348816, + "num_tokens": 10699821.0, + "step": 5972 + }, + { + "epoch": 0.9672091328637358, + "grad_norm": 21.46832847595215, + "learning_rate": 3.3031088082901556e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.925168365240097, + "num_tokens": 10701600.0, + "step": 5973 + }, + { + "epoch": 0.9673710630718161, + "grad_norm": 40.13090896606445, + "learning_rate": 3.286917098445596e-07, + "loss": 0.6789, + "mean_token_accuracy": 0.9142034649848938, + "num_tokens": 10703402.0, + "step": 5974 + }, + { + "epoch": 0.9675329932798964, + "grad_norm": 22.817853927612305, + "learning_rate": 3.2707253886010364e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.9265454113483429, + "num_tokens": 10705200.0, + "step": 5975 + }, + { + "epoch": 0.9676949234879767, + "grad_norm": 26.60140037536621, + "learning_rate": 3.2545336787564766e-07, + "loss": 0.5816, + "mean_token_accuracy": 0.9187650084495544, + "num_tokens": 10706983.0, + "step": 5976 + }, + { + "epoch": 0.967856853696057, + "grad_norm": 37.0765380859375, + "learning_rate": 3.238341968911917e-07, + "loss": 0.6895, + "mean_token_accuracy": 0.9037989974021912, + "num_tokens": 10708775.0, + "step": 5977 + }, + { + "epoch": 0.9680187839041373, + "grad_norm": 29.61202621459961, + "learning_rate": 3.222150259067358e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.9323671460151672, + "num_tokens": 10710569.0, + "step": 5978 + }, + { + "epoch": 0.9681807141122176, + "grad_norm": 32.91376876831055, + "learning_rate": 3.2059585492227985e-07, + "loss": 0.6344, + "mean_token_accuracy": 0.9078812301158905, + "num_tokens": 10712374.0, + "step": 5979 + }, + { + "epoch": 0.9683426443202979, + "grad_norm": 32.294612884521484, + "learning_rate": 3.1897668393782387e-07, + "loss": 0.6169, + "mean_token_accuracy": 0.9115384519100189, + "num_tokens": 10714179.0, + "step": 5980 + }, + { + "epoch": 0.9685045745283782, + "grad_norm": 21.905338287353516, + "learning_rate": 3.1735751295336793e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.9309440553188324, + "num_tokens": 10715966.0, + "step": 5981 + }, + { + "epoch": 0.9686665047364585, + "grad_norm": 40.02385711669922, + "learning_rate": 3.1573834196891195e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.9155176877975464, + "num_tokens": 10717750.0, + "step": 5982 + }, + { + "epoch": 0.9688284349445389, + "grad_norm": 37.48805618286133, + "learning_rate": 3.14119170984456e-07, + "loss": 0.6857, + "mean_token_accuracy": 0.9033192992210388, + "num_tokens": 10719538.0, + "step": 5983 + }, + { + "epoch": 0.9689903651526193, + "grad_norm": 31.978900909423828, + "learning_rate": 3.125e-07, + "loss": 0.6735, + "mean_token_accuracy": 0.9110100269317627, + "num_tokens": 10721330.0, + "step": 5984 + }, + { + "epoch": 0.9691522953606996, + "grad_norm": 30.921436309814453, + "learning_rate": 3.1088082901554404e-07, + "loss": 0.5255, + "mean_token_accuracy": 0.9191147685050964, + "num_tokens": 10723128.0, + "step": 5985 + }, + { + "epoch": 0.9693142255687799, + "grad_norm": 31.98086929321289, + "learning_rate": 3.092616580310881e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.924838125705719, + "num_tokens": 10724919.0, + "step": 5986 + }, + { + "epoch": 0.9694761557768602, + "grad_norm": 42.07522201538086, + "learning_rate": 3.076424870466321e-07, + "loss": 0.6669, + "mean_token_accuracy": 0.914031058549881, + "num_tokens": 10726720.0, + "step": 5987 + }, + { + "epoch": 0.9696380859849405, + "grad_norm": 50.51162338256836, + "learning_rate": 3.060233160621762e-07, + "loss": 0.7937, + "mean_token_accuracy": 0.9051860868930817, + "num_tokens": 10728525.0, + "step": 5988 + }, + { + "epoch": 0.9698000161930208, + "grad_norm": 32.02886962890625, + "learning_rate": 3.0440414507772025e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.9227379560470581, + "num_tokens": 10730323.0, + "step": 5989 + }, + { + "epoch": 0.9699619464011011, + "grad_norm": 22.279531478881836, + "learning_rate": 3.0278497409326426e-07, + "loss": 0.483, + "mean_token_accuracy": 0.9307331740856171, + "num_tokens": 10732109.0, + "step": 5990 + }, + { + "epoch": 0.9701238766091814, + "grad_norm": 39.8419303894043, + "learning_rate": 3.0116580310880833e-07, + "loss": 0.5983, + "mean_token_accuracy": 0.9150777161121368, + "num_tokens": 10733903.0, + "step": 5991 + }, + { + "epoch": 0.9702858068172617, + "grad_norm": 25.502399444580078, + "learning_rate": 2.9954663212435234e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.9306824803352356, + "num_tokens": 10735702.0, + "step": 5992 + }, + { + "epoch": 0.970447737025342, + "grad_norm": 28.54367446899414, + "learning_rate": 2.9792746113989635e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.926376461982727, + "num_tokens": 10737499.0, + "step": 5993 + }, + { + "epoch": 0.9706096672334223, + "grad_norm": 22.437623977661133, + "learning_rate": 2.963082901554405e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.9173604846000671, + "num_tokens": 10739290.0, + "step": 5994 + }, + { + "epoch": 0.9707715974415028, + "grad_norm": 23.476728439331055, + "learning_rate": 2.946891191709845e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.9349094033241272, + "num_tokens": 10741078.0, + "step": 5995 + }, + { + "epoch": 0.9709335276495831, + "grad_norm": 43.632408142089844, + "learning_rate": 2.930699481865285e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.9119221568107605, + "num_tokens": 10742862.0, + "step": 5996 + }, + { + "epoch": 0.9710954578576634, + "grad_norm": 31.835832595825195, + "learning_rate": 2.9145077720207257e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.927134245634079, + "num_tokens": 10744663.0, + "step": 5997 + }, + { + "epoch": 0.9712573880657437, + "grad_norm": 29.21527671813965, + "learning_rate": 2.898316062176166e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.9215284287929535, + "num_tokens": 10746468.0, + "step": 5998 + }, + { + "epoch": 0.971419318273824, + "grad_norm": 41.16551208496094, + "learning_rate": 2.8821243523316065e-07, + "loss": 0.7077, + "mean_token_accuracy": 0.9138242900371552, + "num_tokens": 10748270.0, + "step": 5999 + }, + { + "epoch": 0.9715812484819043, + "grad_norm": 17.152545928955078, + "learning_rate": 2.865932642487047e-07, + "loss": 0.455, + "mean_token_accuracy": 0.9388888776302338, + "num_tokens": 10750061.0, + "step": 6000 + }, + { + "epoch": 0.9717431786899846, + "grad_norm": 39.27569580078125, + "learning_rate": 2.849740932642487e-07, + "loss": 0.741, + "mean_token_accuracy": 0.9058971703052521, + "num_tokens": 10751849.0, + "step": 6001 + }, + { + "epoch": 0.9719051088980649, + "grad_norm": 31.563203811645508, + "learning_rate": 2.833549222797928e-07, + "loss": 0.6029, + "mean_token_accuracy": 0.9245029091835022, + "num_tokens": 10753639.0, + "step": 6002 + }, + { + "epoch": 0.9720670391061452, + "grad_norm": 17.81201171875, + "learning_rate": 2.817357512953368e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.9269099533557892, + "num_tokens": 10755425.0, + "step": 6003 + }, + { + "epoch": 0.9722289693142255, + "grad_norm": 30.614816665649414, + "learning_rate": 2.801165803108808e-07, + "loss": 0.5448, + "mean_token_accuracy": 0.9175745248794556, + "num_tokens": 10757216.0, + "step": 6004 + }, + { + "epoch": 0.9723908995223058, + "grad_norm": 28.59065055847168, + "learning_rate": 2.784974093264249e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.9221905171871185, + "num_tokens": 10758998.0, + "step": 6005 + }, + { + "epoch": 0.9725528297303863, + "grad_norm": 18.304765701293945, + "learning_rate": 2.7687823834196895e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.93180251121521, + "num_tokens": 10760774.0, + "step": 6006 + }, + { + "epoch": 0.9727147599384666, + "grad_norm": 27.235986709594727, + "learning_rate": 2.7525906735751296e-07, + "loss": 0.6238, + "mean_token_accuracy": 0.9137681126594543, + "num_tokens": 10762564.0, + "step": 6007 + }, + { + "epoch": 0.9728766901465469, + "grad_norm": 30.85948944091797, + "learning_rate": 2.7363989637305703e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.9283658862113953, + "num_tokens": 10764355.0, + "step": 6008 + }, + { + "epoch": 0.9730386203546272, + "grad_norm": 21.221689224243164, + "learning_rate": 2.7202072538860104e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.9296703040599823, + "num_tokens": 10766137.0, + "step": 6009 + }, + { + "epoch": 0.9732005505627075, + "grad_norm": 40.840728759765625, + "learning_rate": 2.704015544041451e-07, + "loss": 0.7529, + "mean_token_accuracy": 0.8986394703388214, + "num_tokens": 10767936.0, + "step": 6010 + }, + { + "epoch": 0.9733624807707878, + "grad_norm": 37.72563934326172, + "learning_rate": 2.687823834196891e-07, + "loss": 0.7232, + "mean_token_accuracy": 0.9042553305625916, + "num_tokens": 10769730.0, + "step": 6011 + }, + { + "epoch": 0.9735244109788681, + "grad_norm": 37.85989761352539, + "learning_rate": 2.671632124352332e-07, + "loss": 0.586, + "mean_token_accuracy": 0.9097758233547211, + "num_tokens": 10771519.0, + "step": 6012 + }, + { + "epoch": 0.9736863411869484, + "grad_norm": 25.049800872802734, + "learning_rate": 2.6554404145077725e-07, + "loss": 0.52, + "mean_token_accuracy": 0.9276439249515533, + "num_tokens": 10773308.0, + "step": 6013 + }, + { + "epoch": 0.9738482713950287, + "grad_norm": 19.83831214904785, + "learning_rate": 2.6392487046632126e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.9333629012107849, + "num_tokens": 10775105.0, + "step": 6014 + }, + { + "epoch": 0.974010201603109, + "grad_norm": 27.396163940429688, + "learning_rate": 2.623056994818653e-07, + "loss": 0.518, + "mean_token_accuracy": 0.9186064004898071, + "num_tokens": 10776900.0, + "step": 6015 + }, + { + "epoch": 0.9741721318111893, + "grad_norm": 19.901174545288086, + "learning_rate": 2.6068652849740934e-07, + "loss": 0.5469, + "mean_token_accuracy": 0.9259218573570251, + "num_tokens": 10778682.0, + "step": 6016 + }, + { + "epoch": 0.9743340620192696, + "grad_norm": 40.09729766845703, + "learning_rate": 2.5906735751295336e-07, + "loss": 0.6807, + "mean_token_accuracy": 0.9069696962833405, + "num_tokens": 10780476.0, + "step": 6017 + }, + { + "epoch": 0.9744959922273501, + "grad_norm": 16.281892776489258, + "learning_rate": 2.574481865284974e-07, + "loss": 0.464, + "mean_token_accuracy": 0.9371484518051147, + "num_tokens": 10782274.0, + "step": 6018 + }, + { + "epoch": 0.9746579224354304, + "grad_norm": 13.015487670898438, + "learning_rate": 2.558290155440415e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.9383876323699951, + "num_tokens": 10784063.0, + "step": 6019 + }, + { + "epoch": 0.9748198526435107, + "grad_norm": 29.84139633178711, + "learning_rate": 2.542098445595855e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.9300492703914642, + "num_tokens": 10785860.0, + "step": 6020 + }, + { + "epoch": 0.974981782851591, + "grad_norm": 15.85504150390625, + "learning_rate": 2.5259067357512957e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.9379370510578156, + "num_tokens": 10787647.0, + "step": 6021 + }, + { + "epoch": 0.9751437130596713, + "grad_norm": 28.816621780395508, + "learning_rate": 2.509715025906736e-07, + "loss": 0.5851, + "mean_token_accuracy": 0.9251377880573273, + "num_tokens": 10789439.0, + "step": 6022 + }, + { + "epoch": 0.9753056432677516, + "grad_norm": 17.971837997436523, + "learning_rate": 2.4935233160621765e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.9307036101818085, + "num_tokens": 10791225.0, + "step": 6023 + }, + { + "epoch": 0.9754675734758319, + "grad_norm": 35.113929748535156, + "learning_rate": 2.477331606217617e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.9269722700119019, + "num_tokens": 10793011.0, + "step": 6024 + }, + { + "epoch": 0.9756295036839122, + "grad_norm": 42.85952377319336, + "learning_rate": 2.461139896373057e-07, + "loss": 0.6255, + "mean_token_accuracy": 0.9185110628604889, + "num_tokens": 10794805.0, + "step": 6025 + }, + { + "epoch": 0.9757914338919925, + "grad_norm": 51.56884002685547, + "learning_rate": 2.4449481865284974e-07, + "loss": 1.0414, + "mean_token_accuracy": 0.8857594728469849, + "num_tokens": 10796597.0, + "step": 6026 + }, + { + "epoch": 0.9759533641000728, + "grad_norm": 17.89520835876465, + "learning_rate": 2.428756476683938e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.9281793832778931, + "num_tokens": 10798387.0, + "step": 6027 + }, + { + "epoch": 0.9761152943081531, + "grad_norm": 30.426298141479492, + "learning_rate": 2.412564766839378e-07, + "loss": 0.5773, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 10800174.0, + "step": 6028 + }, + { + "epoch": 0.9762772245162336, + "grad_norm": 23.444368362426758, + "learning_rate": 2.396373056994819e-07, + "loss": 0.529, + "mean_token_accuracy": 0.9178959727287292, + "num_tokens": 10801966.0, + "step": 6029 + }, + { + "epoch": 0.9764391547243139, + "grad_norm": 35.45286178588867, + "learning_rate": 2.3801813471502595e-07, + "loss": 0.5536, + "mean_token_accuracy": 0.9255319237709045, + "num_tokens": 10803760.0, + "step": 6030 + }, + { + "epoch": 0.9766010849323942, + "grad_norm": 32.57078552246094, + "learning_rate": 2.3639896373056996e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.9166023135185242, + "num_tokens": 10805560.0, + "step": 6031 + }, + { + "epoch": 0.9767630151404745, + "grad_norm": 30.741939544677734, + "learning_rate": 2.34779792746114e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.9180491268634796, + "num_tokens": 10807352.0, + "step": 6032 + }, + { + "epoch": 0.9769249453485548, + "grad_norm": 23.60361099243164, + "learning_rate": 2.3316062176165804e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.9252451062202454, + "num_tokens": 10809132.0, + "step": 6033 + }, + { + "epoch": 0.9770868755566351, + "grad_norm": 18.486042022705078, + "learning_rate": 2.3154145077720208e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.9397663176059723, + "num_tokens": 10810926.0, + "step": 6034 + }, + { + "epoch": 0.9772488057647154, + "grad_norm": 27.600706100463867, + "learning_rate": 2.2992227979274612e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.9242961406707764, + "num_tokens": 10812715.0, + "step": 6035 + }, + { + "epoch": 0.9774107359727957, + "grad_norm": 29.841510772705078, + "learning_rate": 2.283031088082902e-07, + "loss": 0.6309, + "mean_token_accuracy": 0.9229629635810852, + "num_tokens": 10814512.0, + "step": 6036 + }, + { + "epoch": 0.977572666180876, + "grad_norm": 35.488861083984375, + "learning_rate": 2.2668393782383423e-07, + "loss": 0.6175, + "mean_token_accuracy": 0.9221839308738708, + "num_tokens": 10816319.0, + "step": 6037 + }, + { + "epoch": 0.9777345963889563, + "grad_norm": 22.250490188598633, + "learning_rate": 2.2506476683937827e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.9390225112438202, + "num_tokens": 10818110.0, + "step": 6038 + }, + { + "epoch": 0.9778965265970366, + "grad_norm": 31.22648048400879, + "learning_rate": 2.234455958549223e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.9159740209579468, + "num_tokens": 10819896.0, + "step": 6039 + }, + { + "epoch": 0.978058456805117, + "grad_norm": 26.290372848510742, + "learning_rate": 2.2182642487046632e-07, + "loss": 0.6016, + "mean_token_accuracy": 0.9218370020389557, + "num_tokens": 10821677.0, + "step": 6040 + }, + { + "epoch": 0.9782203870131974, + "grad_norm": 35.564205169677734, + "learning_rate": 2.2020725388601036e-07, + "loss": 0.6527, + "mean_token_accuracy": 0.908225953578949, + "num_tokens": 10823472.0, + "step": 6041 + }, + { + "epoch": 0.9783823172212777, + "grad_norm": 35.654117584228516, + "learning_rate": 2.1858808290155442e-07, + "loss": 0.6623, + "mean_token_accuracy": 0.9147413671016693, + "num_tokens": 10825264.0, + "step": 6042 + }, + { + "epoch": 0.978544247429358, + "grad_norm": 23.416078567504883, + "learning_rate": 2.1696891191709846e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.9244604408740997, + "num_tokens": 10827054.0, + "step": 6043 + }, + { + "epoch": 0.9787061776374383, + "grad_norm": 46.12065887451172, + "learning_rate": 2.153497409326425e-07, + "loss": 0.7212, + "mean_token_accuracy": 0.9029177725315094, + "num_tokens": 10828854.0, + "step": 6044 + }, + { + "epoch": 0.9788681078455186, + "grad_norm": 28.441999435424805, + "learning_rate": 2.1373056994818654e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.9282888174057007, + "num_tokens": 10830645.0, + "step": 6045 + }, + { + "epoch": 0.9790300380535989, + "grad_norm": 48.80912780761719, + "learning_rate": 2.1211139896373058e-07, + "loss": 0.7984, + "mean_token_accuracy": 0.9010533392429352, + "num_tokens": 10832450.0, + "step": 6046 + }, + { + "epoch": 0.9791919682616792, + "grad_norm": 27.271411895751953, + "learning_rate": 2.1049222797927462e-07, + "loss": 0.5874, + "mean_token_accuracy": 0.9176878929138184, + "num_tokens": 10834242.0, + "step": 6047 + }, + { + "epoch": 0.9793538984697595, + "grad_norm": 20.118410110473633, + "learning_rate": 2.088730569948187e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.931756854057312, + "num_tokens": 10836018.0, + "step": 6048 + }, + { + "epoch": 0.9795158286778398, + "grad_norm": 20.52791404724121, + "learning_rate": 2.0725388601036273e-07, + "loss": 0.49, + "mean_token_accuracy": 0.9365914165973663, + "num_tokens": 10837814.0, + "step": 6049 + }, + { + "epoch": 0.9796777588859201, + "grad_norm": 37.942100524902344, + "learning_rate": 2.0563471502590677e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.9179934859275818, + "num_tokens": 10839606.0, + "step": 6050 + }, + { + "epoch": 0.9798396890940004, + "grad_norm": 25.589406967163086, + "learning_rate": 2.0401554404145078e-07, + "loss": 0.497, + "mean_token_accuracy": 0.9297619163990021, + "num_tokens": 10841390.0, + "step": 6051 + }, + { + "epoch": 0.9800016193020809, + "grad_norm": 28.715259552001953, + "learning_rate": 2.0239637305699482e-07, + "loss": 0.6608, + "mean_token_accuracy": 0.9190140962600708, + "num_tokens": 10843186.0, + "step": 6052 + }, + { + "epoch": 0.9801635495101612, + "grad_norm": 35.99466323852539, + "learning_rate": 2.0077720207253886e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.9206710755825043, + "num_tokens": 10844976.0, + "step": 6053 + }, + { + "epoch": 0.9803254797182415, + "grad_norm": 23.725095748901367, + "learning_rate": 1.9915803108808293e-07, + "loss": 0.5711, + "mean_token_accuracy": 0.9247512519359589, + "num_tokens": 10846754.0, + "step": 6054 + }, + { + "epoch": 0.9804874099263218, + "grad_norm": 21.931045532226562, + "learning_rate": 1.9753886010362696e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.9264666140079498, + "num_tokens": 10848538.0, + "step": 6055 + }, + { + "epoch": 0.9806493401344021, + "grad_norm": 21.672197341918945, + "learning_rate": 1.95919689119171e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.9357143044471741, + "num_tokens": 10850330.0, + "step": 6056 + }, + { + "epoch": 0.9808112703424824, + "grad_norm": 35.66658020019531, + "learning_rate": 1.9430051813471504e-07, + "loss": 0.6408, + "mean_token_accuracy": 0.9014598429203033, + "num_tokens": 10852116.0, + "step": 6057 + }, + { + "epoch": 0.9809732005505627, + "grad_norm": 32.96878433227539, + "learning_rate": 1.9268134715025908e-07, + "loss": 0.5789, + "mean_token_accuracy": 0.9253712594509125, + "num_tokens": 10853910.0, + "step": 6058 + }, + { + "epoch": 0.981135130758643, + "grad_norm": 30.29815101623535, + "learning_rate": 1.910621761658031e-07, + "loss": 0.545, + "mean_token_accuracy": 0.9165872633457184, + "num_tokens": 10855697.0, + "step": 6059 + }, + { + "epoch": 0.9812970609667233, + "grad_norm": 27.359214782714844, + "learning_rate": 1.894430051813472e-07, + "loss": 0.5873, + "mean_token_accuracy": 0.9195342361927032, + "num_tokens": 10857493.0, + "step": 6060 + }, + { + "epoch": 0.9814589911748036, + "grad_norm": 37.08452224731445, + "learning_rate": 1.878238341968912e-07, + "loss": 0.5888, + "mean_token_accuracy": 0.9253042638301849, + "num_tokens": 10859286.0, + "step": 6061 + }, + { + "epoch": 0.9816209213828839, + "grad_norm": 36.1778564453125, + "learning_rate": 1.8620466321243524e-07, + "loss": 0.6158, + "mean_token_accuracy": 0.9154887795448303, + "num_tokens": 10861082.0, + "step": 6062 + }, + { + "epoch": 0.9817828515909643, + "grad_norm": 40.09469985961914, + "learning_rate": 1.8458549222797928e-07, + "loss": 0.7144, + "mean_token_accuracy": 0.9097559750080109, + "num_tokens": 10862881.0, + "step": 6063 + }, + { + "epoch": 0.9819447817990447, + "grad_norm": 26.364931106567383, + "learning_rate": 1.8296632124352332e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.9285494089126587, + "num_tokens": 10864672.0, + "step": 6064 + }, + { + "epoch": 0.982106712007125, + "grad_norm": 39.00387954711914, + "learning_rate": 1.8134715025906736e-07, + "loss": 0.6389, + "mean_token_accuracy": 0.9003799557685852, + "num_tokens": 10866465.0, + "step": 6065 + }, + { + "epoch": 0.9822686422152053, + "grad_norm": 36.239017486572266, + "learning_rate": 1.7972797927461143e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.924430638551712, + "num_tokens": 10868255.0, + "step": 6066 + }, + { + "epoch": 0.9824305724232856, + "grad_norm": 27.12883186340332, + "learning_rate": 1.7810880829015547e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.9236111044883728, + "num_tokens": 10870055.0, + "step": 6067 + }, + { + "epoch": 0.9825925026313659, + "grad_norm": 28.723773956298828, + "learning_rate": 1.764896373056995e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.9240368902683258, + "num_tokens": 10871845.0, + "step": 6068 + }, + { + "epoch": 0.9827544328394462, + "grad_norm": 28.85513687133789, + "learning_rate": 1.7487046632124354e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.9230356216430664, + "num_tokens": 10873630.0, + "step": 6069 + }, + { + "epoch": 0.9829163630475265, + "grad_norm": 34.46381378173828, + "learning_rate": 1.7325129533678756e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.9206026494503021, + "num_tokens": 10875419.0, + "step": 6070 + }, + { + "epoch": 0.9830782932556068, + "grad_norm": 42.50251007080078, + "learning_rate": 1.716321243523316e-07, + "loss": 0.7293, + "mean_token_accuracy": 0.9172661900520325, + "num_tokens": 10877209.0, + "step": 6071 + }, + { + "epoch": 0.9832402234636871, + "grad_norm": 29.35532569885254, + "learning_rate": 1.7001295336787566e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.9300562739372253, + "num_tokens": 10879007.0, + "step": 6072 + }, + { + "epoch": 0.9834021536717674, + "grad_norm": 31.294803619384766, + "learning_rate": 1.683937823834197e-07, + "loss": 0.6497, + "mean_token_accuracy": 0.9120863080024719, + "num_tokens": 10880804.0, + "step": 6073 + }, + { + "epoch": 0.9835640838798478, + "grad_norm": 37.88172149658203, + "learning_rate": 1.6677461139896374e-07, + "loss": 0.613, + "mean_token_accuracy": 0.9165484607219696, + "num_tokens": 10882592.0, + "step": 6074 + }, + { + "epoch": 0.9837260140879281, + "grad_norm": 41.460655212402344, + "learning_rate": 1.6515544041450778e-07, + "loss": 0.6881, + "mean_token_accuracy": 0.911738395690918, + "num_tokens": 10884388.0, + "step": 6075 + }, + { + "epoch": 0.9838879442960085, + "grad_norm": 31.226266860961914, + "learning_rate": 1.6353626943005182e-07, + "loss": 0.5525, + "mean_token_accuracy": 0.9224588871002197, + "num_tokens": 10886171.0, + "step": 6076 + }, + { + "epoch": 0.9840498745040888, + "grad_norm": 21.181459426879883, + "learning_rate": 1.6191709844559586e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.9240615367889404, + "num_tokens": 10887960.0, + "step": 6077 + }, + { + "epoch": 0.9842118047121691, + "grad_norm": 34.684906005859375, + "learning_rate": 1.6029792746113993e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.9217275083065033, + "num_tokens": 10889764.0, + "step": 6078 + }, + { + "epoch": 0.9843737349202494, + "grad_norm": 34.19395065307617, + "learning_rate": 1.5867875647668397e-07, + "loss": 0.588, + "mean_token_accuracy": 0.9210144877433777, + "num_tokens": 10891554.0, + "step": 6079 + }, + { + "epoch": 0.9845356651283297, + "grad_norm": 31.684844970703125, + "learning_rate": 1.57059585492228e-07, + "loss": 0.543, + "mean_token_accuracy": 0.9258241653442383, + "num_tokens": 10893349.0, + "step": 6080 + }, + { + "epoch": 0.98469759533641, + "grad_norm": 26.12248992919922, + "learning_rate": 1.5544041450777202e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.9227941036224365, + "num_tokens": 10895133.0, + "step": 6081 + }, + { + "epoch": 0.9848595255444903, + "grad_norm": 27.49913787841797, + "learning_rate": 1.5382124352331606e-07, + "loss": 0.8488, + "mean_token_accuracy": 0.9158163368701935, + "num_tokens": 10896932.0, + "step": 6082 + }, + { + "epoch": 0.9850214557525706, + "grad_norm": 42.77002716064453, + "learning_rate": 1.5220207253886012e-07, + "loss": 0.6557, + "mean_token_accuracy": 0.9087461829185486, + "num_tokens": 10898729.0, + "step": 6083 + }, + { + "epoch": 0.9851833859606509, + "grad_norm": 26.817346572875977, + "learning_rate": 1.5058290155440416e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.9306978285312653, + "num_tokens": 10900515.0, + "step": 6084 + }, + { + "epoch": 0.9853453161687312, + "grad_norm": 36.96622848510742, + "learning_rate": 1.4896373056994818e-07, + "loss": 0.5652, + "mean_token_accuracy": 0.9170940220355988, + "num_tokens": 10902305.0, + "step": 6085 + }, + { + "epoch": 0.9855072463768116, + "grad_norm": 35.64229965209961, + "learning_rate": 1.4734455958549224e-07, + "loss": 0.565, + "mean_token_accuracy": 0.9087215662002563, + "num_tokens": 10904102.0, + "step": 6086 + }, + { + "epoch": 0.985669176584892, + "grad_norm": 19.56130027770996, + "learning_rate": 1.4572538860103628e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.9355087280273438, + "num_tokens": 10905893.0, + "step": 6087 + }, + { + "epoch": 0.9858311067929723, + "grad_norm": 23.261049270629883, + "learning_rate": 1.4410621761658032e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.9330357015132904, + "num_tokens": 10907689.0, + "step": 6088 + }, + { + "epoch": 0.9859930370010526, + "grad_norm": 28.457412719726562, + "learning_rate": 1.4248704663212436e-07, + "loss": 0.5121, + "mean_token_accuracy": 0.9225198328495026, + "num_tokens": 10909485.0, + "step": 6089 + }, + { + "epoch": 0.9861549672091329, + "grad_norm": 40.233192443847656, + "learning_rate": 1.408678756476684e-07, + "loss": 0.5921, + "mean_token_accuracy": 0.9188180863857269, + "num_tokens": 10911268.0, + "step": 6090 + }, + { + "epoch": 0.9863168974172132, + "grad_norm": 24.293006896972656, + "learning_rate": 1.3924870466321244e-07, + "loss": 0.539, + "mean_token_accuracy": 0.9285391271114349, + "num_tokens": 10913046.0, + "step": 6091 + }, + { + "epoch": 0.9864788276252935, + "grad_norm": 21.613971710205078, + "learning_rate": 1.3762953367875648e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.9321029782295227, + "num_tokens": 10914823.0, + "step": 6092 + }, + { + "epoch": 0.9866407578333738, + "grad_norm": 41.04182434082031, + "learning_rate": 1.3601036269430052e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.9084957540035248, + "num_tokens": 10916619.0, + "step": 6093 + }, + { + "epoch": 0.9868026880414541, + "grad_norm": 28.33021354675293, + "learning_rate": 1.3439119170984456e-07, + "loss": 0.6061, + "mean_token_accuracy": 0.9226754009723663, + "num_tokens": 10918403.0, + "step": 6094 + }, + { + "epoch": 0.9869646182495344, + "grad_norm": 23.87122344970703, + "learning_rate": 1.3277202072538863e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.9175904989242554, + "num_tokens": 10920181.0, + "step": 6095 + }, + { + "epoch": 0.9871265484576147, + "grad_norm": 32.19245529174805, + "learning_rate": 1.3115284974093264e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.9201388955116272, + "num_tokens": 10921981.0, + "step": 6096 + }, + { + "epoch": 0.9872884786656951, + "grad_norm": 28.99053382873535, + "learning_rate": 1.2953367875647668e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.921686053276062, + "num_tokens": 10923761.0, + "step": 6097 + }, + { + "epoch": 0.9874504088737754, + "grad_norm": 17.995075225830078, + "learning_rate": 1.2791450777202074e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.9311594367027283, + "num_tokens": 10925549.0, + "step": 6098 + }, + { + "epoch": 0.9876123390818558, + "grad_norm": 29.703542709350586, + "learning_rate": 1.2629533678756478e-07, + "loss": 0.5394, + "mean_token_accuracy": 0.9282581508159637, + "num_tokens": 10927326.0, + "step": 6099 + }, + { + "epoch": 0.9877742692899361, + "grad_norm": 38.95078659057617, + "learning_rate": 1.2467616580310882e-07, + "loss": 0.6264, + "mean_token_accuracy": 0.9168460965156555, + "num_tokens": 10929114.0, + "step": 6100 + }, + { + "epoch": 0.9879361994980164, + "grad_norm": 27.35525894165039, + "learning_rate": 1.2305699481865286e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.9236221313476562, + "num_tokens": 10930901.0, + "step": 6101 + }, + { + "epoch": 0.9880981297060967, + "grad_norm": 26.736326217651367, + "learning_rate": 1.214378238341969e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.9283658862113953, + "num_tokens": 10932692.0, + "step": 6102 + }, + { + "epoch": 0.988260059914177, + "grad_norm": 41.304874420166016, + "learning_rate": 1.1981865284974094e-07, + "loss": 0.7467, + "mean_token_accuracy": 0.9007352888584137, + "num_tokens": 10934484.0, + "step": 6103 + }, + { + "epoch": 0.9884219901222573, + "grad_norm": 35.604061126708984, + "learning_rate": 1.1819948186528498e-07, + "loss": 0.6182, + "mean_token_accuracy": 0.9174606502056122, + "num_tokens": 10936274.0, + "step": 6104 + }, + { + "epoch": 0.9885839203303376, + "grad_norm": 37.450592041015625, + "learning_rate": 1.1658031088082902e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.9177156090736389, + "num_tokens": 10938064.0, + "step": 6105 + }, + { + "epoch": 0.9887458505384179, + "grad_norm": 30.13355255126953, + "learning_rate": 1.1496113989637306e-07, + "loss": 0.5315, + "mean_token_accuracy": 0.9208469092845917, + "num_tokens": 10939854.0, + "step": 6106 + }, + { + "epoch": 0.9889077807464982, + "grad_norm": 21.570594787597656, + "learning_rate": 1.1334196891191711e-07, + "loss": 0.5805, + "mean_token_accuracy": 0.9233802258968353, + "num_tokens": 10941640.0, + "step": 6107 + }, + { + "epoch": 0.9890697109545786, + "grad_norm": 30.5667724609375, + "learning_rate": 1.1172279792746115e-07, + "loss": 0.651, + "mean_token_accuracy": 0.9154411554336548, + "num_tokens": 10943424.0, + "step": 6108 + }, + { + "epoch": 0.9892316411626589, + "grad_norm": 24.86794090270996, + "learning_rate": 1.1010362694300518e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.9242705702781677, + "num_tokens": 10945226.0, + "step": 6109 + }, + { + "epoch": 0.9893935713707392, + "grad_norm": 30.092241287231445, + "learning_rate": 1.0848445595854923e-07, + "loss": 0.6199, + "mean_token_accuracy": 0.914893627166748, + "num_tokens": 10947020.0, + "step": 6110 + }, + { + "epoch": 0.9895555015788196, + "grad_norm": 30.275177001953125, + "learning_rate": 1.0686528497409327e-07, + "loss": 0.5777, + "mean_token_accuracy": 0.9218100905418396, + "num_tokens": 10948813.0, + "step": 6111 + }, + { + "epoch": 0.9897174317868999, + "grad_norm": 37.1219596862793, + "learning_rate": 1.0524611398963731e-07, + "loss": 0.66, + "mean_token_accuracy": 0.9192523658275604, + "num_tokens": 10950610.0, + "step": 6112 + }, + { + "epoch": 0.9898793619949802, + "grad_norm": 42.09202194213867, + "learning_rate": 1.0362694300518136e-07, + "loss": 0.8004, + "mean_token_accuracy": 0.9107498526573181, + "num_tokens": 10952402.0, + "step": 6113 + }, + { + "epoch": 0.9900412922030605, + "grad_norm": 22.387855529785156, + "learning_rate": 1.0200777202072539e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.9309523701667786, + "num_tokens": 10954189.0, + "step": 6114 + }, + { + "epoch": 0.9902032224111408, + "grad_norm": 22.554393768310547, + "learning_rate": 1.0038860103626943e-07, + "loss": 0.5868, + "mean_token_accuracy": 0.9241362810134888, + "num_tokens": 10955988.0, + "step": 6115 + }, + { + "epoch": 0.9903651526192211, + "grad_norm": 35.55607986450195, + "learning_rate": 9.876943005181348e-08, + "loss": 0.5389, + "mean_token_accuracy": 0.917129635810852, + "num_tokens": 10957779.0, + "step": 6116 + }, + { + "epoch": 0.9905270828273014, + "grad_norm": 28.683509826660156, + "learning_rate": 9.715025906735752e-08, + "loss": 0.5162, + "mean_token_accuracy": 0.922762930393219, + "num_tokens": 10959563.0, + "step": 6117 + }, + { + "epoch": 0.9906890130353817, + "grad_norm": 31.29978370666504, + "learning_rate": 9.553108808290155e-08, + "loss": 0.6304, + "mean_token_accuracy": 0.9152413010597229, + "num_tokens": 10961346.0, + "step": 6118 + }, + { + "epoch": 0.9908509432434621, + "grad_norm": 31.566434860229492, + "learning_rate": 9.39119170984456e-08, + "loss": 0.5446, + "mean_token_accuracy": 0.9244025647640228, + "num_tokens": 10963122.0, + "step": 6119 + }, + { + "epoch": 0.9910128734515424, + "grad_norm": 28.78155517578125, + "learning_rate": 9.229274611398964e-08, + "loss": 0.534, + "mean_token_accuracy": 0.928385466337204, + "num_tokens": 10964914.0, + "step": 6120 + }, + { + "epoch": 0.9911748036596227, + "grad_norm": 28.31410789489746, + "learning_rate": 9.067357512953368e-08, + "loss": 0.5834, + "mean_token_accuracy": 0.9239130616188049, + "num_tokens": 10966702.0, + "step": 6121 + }, + { + "epoch": 0.991336733867703, + "grad_norm": 29.469974517822266, + "learning_rate": 8.905440414507773e-08, + "loss": 0.5646, + "mean_token_accuracy": 0.9210539758205414, + "num_tokens": 10968492.0, + "step": 6122 + }, + { + "epoch": 0.9914986640757834, + "grad_norm": 28.789026260375977, + "learning_rate": 8.743523316062177e-08, + "loss": 0.6883, + "mean_token_accuracy": 0.9196135997772217, + "num_tokens": 10970290.0, + "step": 6123 + }, + { + "epoch": 0.9916605942838637, + "grad_norm": 35.08652877807617, + "learning_rate": 8.58160621761658e-08, + "loss": 0.5894, + "mean_token_accuracy": 0.9113209545612335, + "num_tokens": 10972085.0, + "step": 6124 + }, + { + "epoch": 0.991822524491944, + "grad_norm": 27.712631225585938, + "learning_rate": 8.419689119170985e-08, + "loss": 0.5539, + "mean_token_accuracy": 0.923882782459259, + "num_tokens": 10973873.0, + "step": 6125 + }, + { + "epoch": 0.9919844547000243, + "grad_norm": 25.96645164489746, + "learning_rate": 8.257772020725389e-08, + "loss": 0.6109, + "mean_token_accuracy": 0.9154387712478638, + "num_tokens": 10975657.0, + "step": 6126 + }, + { + "epoch": 0.9921463849081046, + "grad_norm": 35.20242691040039, + "learning_rate": 8.095854922279793e-08, + "loss": 0.563, + "mean_token_accuracy": 0.9211711883544922, + "num_tokens": 10977461.0, + "step": 6127 + }, + { + "epoch": 0.9923083151161849, + "grad_norm": 25.255678176879883, + "learning_rate": 7.933937823834198e-08, + "loss": 0.5379, + "mean_token_accuracy": 0.9321728944778442, + "num_tokens": 10979254.0, + "step": 6128 + }, + { + "epoch": 0.9924702453242652, + "grad_norm": 33.57099151611328, + "learning_rate": 7.772020725388601e-08, + "loss": 0.4965, + "mean_token_accuracy": 0.9245297908782959, + "num_tokens": 10981043.0, + "step": 6129 + }, + { + "epoch": 0.9926321755323455, + "grad_norm": 41.490814208984375, + "learning_rate": 7.610103626943006e-08, + "loss": 0.8067, + "mean_token_accuracy": 0.9038345515727997, + "num_tokens": 10982837.0, + "step": 6130 + }, + { + "epoch": 0.9927941057404259, + "grad_norm": 39.52449417114258, + "learning_rate": 7.448186528497409e-08, + "loss": 0.7137, + "mean_token_accuracy": 0.9032630920410156, + "num_tokens": 10984628.0, + "step": 6131 + }, + { + "epoch": 0.9929560359485062, + "grad_norm": 38.833099365234375, + "learning_rate": 7.286269430051814e-08, + "loss": 0.6062, + "mean_token_accuracy": 0.9128719866275787, + "num_tokens": 10986415.0, + "step": 6132 + }, + { + "epoch": 0.9931179661565865, + "grad_norm": 41.01188659667969, + "learning_rate": 7.124352331606218e-08, + "loss": 0.6664, + "mean_token_accuracy": 0.8998928666114807, + "num_tokens": 10988207.0, + "step": 6133 + }, + { + "epoch": 0.9932798963646668, + "grad_norm": 24.844482421875, + "learning_rate": 6.962435233160622e-08, + "loss": 0.5086, + "mean_token_accuracy": 0.9241379499435425, + "num_tokens": 10990009.0, + "step": 6134 + }, + { + "epoch": 0.9934418265727472, + "grad_norm": 36.196598052978516, + "learning_rate": 6.800518134715026e-08, + "loss": 0.6314, + "mean_token_accuracy": 0.9134441316127777, + "num_tokens": 10991799.0, + "step": 6135 + }, + { + "epoch": 0.9936037567808275, + "grad_norm": 33.09366226196289, + "learning_rate": 6.638601036269431e-08, + "loss": 0.5003, + "mean_token_accuracy": 0.9190376698970795, + "num_tokens": 10993595.0, + "step": 6136 + }, + { + "epoch": 0.9937656869889078, + "grad_norm": 20.64581298828125, + "learning_rate": 6.476683937823834e-08, + "loss": 0.5398, + "mean_token_accuracy": 0.9324262738227844, + "num_tokens": 10995388.0, + "step": 6137 + }, + { + "epoch": 0.9939276171969881, + "grad_norm": 34.03934860229492, + "learning_rate": 6.314766839378239e-08, + "loss": 0.5506, + "mean_token_accuracy": 0.9146149754524231, + "num_tokens": 10997181.0, + "step": 6138 + }, + { + "epoch": 0.9940895474050684, + "grad_norm": 23.238513946533203, + "learning_rate": 6.152849740932643e-08, + "loss": 0.5709, + "mean_token_accuracy": 0.9254483878612518, + "num_tokens": 10998974.0, + "step": 6139 + }, + { + "epoch": 0.9942514776131487, + "grad_norm": 32.46528244018555, + "learning_rate": 5.990932642487047e-08, + "loss": 0.6204, + "mean_token_accuracy": 0.9181353747844696, + "num_tokens": 11000766.0, + "step": 6140 + }, + { + "epoch": 0.994413407821229, + "grad_norm": 44.0908088684082, + "learning_rate": 5.829015544041451e-08, + "loss": 0.6499, + "mean_token_accuracy": 0.9096594452857971, + "num_tokens": 11002554.0, + "step": 6141 + }, + { + "epoch": 0.9945753380293094, + "grad_norm": 32.787559509277344, + "learning_rate": 5.667098445595856e-08, + "loss": 0.6001, + "mean_token_accuracy": 0.9146570265293121, + "num_tokens": 11004347.0, + "step": 6142 + }, + { + "epoch": 0.9947372682373897, + "grad_norm": 14.359176635742188, + "learning_rate": 5.505181347150259e-08, + "loss": 0.4428, + "mean_token_accuracy": 0.9351591169834137, + "num_tokens": 11006136.0, + "step": 6143 + }, + { + "epoch": 0.99489919844547, + "grad_norm": 38.899715423583984, + "learning_rate": 5.3432642487046636e-08, + "loss": 0.6727, + "mean_token_accuracy": 0.9116721749305725, + "num_tokens": 11007931.0, + "step": 6144 + }, + { + "epoch": 0.9950611286535503, + "grad_norm": 29.351993560791016, + "learning_rate": 5.181347150259068e-08, + "loss": 0.5363, + "mean_token_accuracy": 0.9275849461555481, + "num_tokens": 11009719.0, + "step": 6145 + }, + { + "epoch": 0.9952230588616306, + "grad_norm": 25.75142478942871, + "learning_rate": 5.0194300518134715e-08, + "loss": 0.5221, + "mean_token_accuracy": 0.9304347932338715, + "num_tokens": 11011504.0, + "step": 6146 + }, + { + "epoch": 0.995384989069711, + "grad_norm": 30.866397857666016, + "learning_rate": 4.857512953367876e-08, + "loss": 0.601, + "mean_token_accuracy": 0.9249081313610077, + "num_tokens": 11013282.0, + "step": 6147 + }, + { + "epoch": 0.9955469192777913, + "grad_norm": 33.82006072998047, + "learning_rate": 4.69559585492228e-08, + "loss": 0.618, + "mean_token_accuracy": 0.9103545546531677, + "num_tokens": 11015073.0, + "step": 6148 + }, + { + "epoch": 0.9957088494858716, + "grad_norm": 35.07512664794922, + "learning_rate": 4.533678756476684e-08, + "loss": 0.6008, + "mean_token_accuracy": 0.9192603230476379, + "num_tokens": 11016870.0, + "step": 6149 + }, + { + "epoch": 0.9958707796939519, + "grad_norm": 34.10052490234375, + "learning_rate": 4.3717616580310886e-08, + "loss": 0.6477, + "mean_token_accuracy": 0.9154769778251648, + "num_tokens": 11018654.0, + "step": 6150 + }, + { + "epoch": 0.9960327099020322, + "grad_norm": 37.87874221801758, + "learning_rate": 4.2098445595854926e-08, + "loss": 0.6204, + "mean_token_accuracy": 0.9128997325897217, + "num_tokens": 11020451.0, + "step": 6151 + }, + { + "epoch": 0.9961946401101125, + "grad_norm": 24.359533309936523, + "learning_rate": 4.0479274611398965e-08, + "loss": 0.5329, + "mean_token_accuracy": 0.9236669540405273, + "num_tokens": 11022238.0, + "step": 6152 + }, + { + "epoch": 0.9963565703181929, + "grad_norm": 31.06570816040039, + "learning_rate": 3.8860103626943005e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.9280538260936737, + "num_tokens": 11024028.0, + "step": 6153 + }, + { + "epoch": 0.9965185005262732, + "grad_norm": 36.55165100097656, + "learning_rate": 3.7240932642487044e-08, + "loss": 0.7102, + "mean_token_accuracy": 0.9087276458740234, + "num_tokens": 11025814.0, + "step": 6154 + }, + { + "epoch": 0.9966804307343535, + "grad_norm": 40.653141021728516, + "learning_rate": 3.562176165803109e-08, + "loss": 0.6687, + "mean_token_accuracy": 0.9028468132019043, + "num_tokens": 11027604.0, + "step": 6155 + }, + { + "epoch": 0.9968423609424338, + "grad_norm": 30.16074562072754, + "learning_rate": 3.400259067357513e-08, + "loss": 0.5826, + "mean_token_accuracy": 0.9290780127048492, + "num_tokens": 11029398.0, + "step": 6156 + }, + { + "epoch": 0.9970042911505141, + "grad_norm": 40.1210823059082, + "learning_rate": 3.238341968911917e-08, + "loss": 0.5933, + "mean_token_accuracy": 0.920409768819809, + "num_tokens": 11031186.0, + "step": 6157 + }, + { + "epoch": 0.9971662213585945, + "grad_norm": 16.119884490966797, + "learning_rate": 3.0764248704663216e-08, + "loss": 0.4468, + "mean_token_accuracy": 0.935238391160965, + "num_tokens": 11032976.0, + "step": 6158 + }, + { + "epoch": 0.9973281515666748, + "grad_norm": 35.559635162353516, + "learning_rate": 2.9145077720207255e-08, + "loss": 0.6321, + "mean_token_accuracy": 0.9090500473976135, + "num_tokens": 11034763.0, + "step": 6159 + }, + { + "epoch": 0.9974900817747551, + "grad_norm": 21.047258377075195, + "learning_rate": 2.7525906735751295e-08, + "loss": 0.511, + "mean_token_accuracy": 0.929934561252594, + "num_tokens": 11036546.0, + "step": 6160 + }, + { + "epoch": 0.9976520119828354, + "grad_norm": 26.26460075378418, + "learning_rate": 2.590673575129534e-08, + "loss": 0.5131, + "mean_token_accuracy": 0.931506872177124, + "num_tokens": 11038350.0, + "step": 6161 + }, + { + "epoch": 0.9978139421909157, + "grad_norm": 29.879308700561523, + "learning_rate": 2.428756476683938e-08, + "loss": 0.5796, + "mean_token_accuracy": 0.9318859279155731, + "num_tokens": 11040141.0, + "step": 6162 + }, + { + "epoch": 0.997975872398996, + "grad_norm": 15.822606086730957, + "learning_rate": 2.266839378238342e-08, + "loss": 0.4507, + "mean_token_accuracy": 0.9370780885219574, + "num_tokens": 11041923.0, + "step": 6163 + }, + { + "epoch": 0.9981378026070763, + "grad_norm": 30.000288009643555, + "learning_rate": 2.1049222797927463e-08, + "loss": 0.5289, + "mean_token_accuracy": 0.9241134822368622, + "num_tokens": 11043711.0, + "step": 6164 + }, + { + "epoch": 0.9982997328151567, + "grad_norm": 41.298301696777344, + "learning_rate": 1.9430051813471502e-08, + "loss": 0.6972, + "mean_token_accuracy": 0.9019809365272522, + "num_tokens": 11045509.0, + "step": 6165 + }, + { + "epoch": 0.998461663023237, + "grad_norm": 25.58466911315918, + "learning_rate": 1.7810880829015545e-08, + "loss": 0.5316, + "mean_token_accuracy": 0.9219752252101898, + "num_tokens": 11047290.0, + "step": 6166 + }, + { + "epoch": 0.9986235932313173, + "grad_norm": 34.52656555175781, + "learning_rate": 1.6191709844559585e-08, + "loss": 0.5587, + "mean_token_accuracy": 0.9244749844074249, + "num_tokens": 11049080.0, + "step": 6167 + }, + { + "epoch": 0.9987855234393976, + "grad_norm": 27.237966537475586, + "learning_rate": 1.4572538860103628e-08, + "loss": 0.5112, + "mean_token_accuracy": 0.9234431385993958, + "num_tokens": 11050879.0, + "step": 6168 + }, + { + "epoch": 0.998947453647478, + "grad_norm": 28.404184341430664, + "learning_rate": 1.295336787564767e-08, + "loss": 0.5139, + "mean_token_accuracy": 0.9289588332176208, + "num_tokens": 11052672.0, + "step": 6169 + }, + { + "epoch": 0.9991093838555583, + "grad_norm": 36.872406005859375, + "learning_rate": 1.133419689119171e-08, + "loss": 0.5901, + "mean_token_accuracy": 0.9189852774143219, + "num_tokens": 11054468.0, + "step": 6170 + }, + { + "epoch": 0.9992713140636386, + "grad_norm": 23.31491470336914, + "learning_rate": 9.715025906735751e-09, + "loss": 0.6014, + "mean_token_accuracy": 0.925579309463501, + "num_tokens": 11056248.0, + "step": 6171 + }, + { + "epoch": 0.9994332442717189, + "grad_norm": 39.50399398803711, + "learning_rate": 8.095854922279792e-09, + "loss": 0.6999, + "mean_token_accuracy": 0.9109838902950287, + "num_tokens": 11058041.0, + "step": 6172 + }, + { + "epoch": 0.9995951744797992, + "grad_norm": 27.043882369995117, + "learning_rate": 6.476683937823835e-09, + "loss": 0.5004, + "mean_token_accuracy": 0.926991730928421, + "num_tokens": 11059827.0, + "step": 6173 + }, + { + "epoch": 0.9997571046878795, + "grad_norm": 31.093772888183594, + "learning_rate": 4.857512953367876e-09, + "loss": 0.5152, + "mean_token_accuracy": 0.925220400094986, + "num_tokens": 11061620.0, + "step": 6174 + }, + { + "epoch": 0.9999190348959598, + "grad_norm": 31.358657836914062, + "learning_rate": 3.2383419689119176e-09, + "loss": 0.681, + "mean_token_accuracy": 0.9112877249717712, + "num_tokens": 11063405.0, + "step": 6175 + }, + { + "epoch": 1.0, + "grad_norm": 53.96744918823242, + "learning_rate": 1.6191709844559588e-09, + "loss": 0.3714, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 11063853.0, + "step": 6176 + } + ], + "logging_steps": 1, + "max_steps": 6176, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.4054739140955386e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}