{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9998892948079265,
  "eval_steps": 500,
  "global_step": 2258,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002214103841470165,
      "grad_norm": 42.3470458984375,
      "learning_rate": 0.0002,
      "loss": 15.5484,
      "mean_token_accuracy": 0.4835517302155495,
      "num_tokens": 11000.0,
      "step": 5
    },
    {
      "epoch": 0.00442820768294033,
      "grad_norm": 14.259652137756348,
      "learning_rate": 0.0002,
      "loss": 6.8519,
      "mean_token_accuracy": 0.691154733300209,
      "num_tokens": 22942.0,
      "step": 10
    },
    {
      "epoch": 0.006642311524410495,
      "grad_norm": 11.712563514709473,
      "learning_rate": 0.0002,
      "loss": 4.074,
      "mean_token_accuracy": 0.8152767598628998,
      "num_tokens": 35335.0,
      "step": 15
    },
    {
      "epoch": 0.00885641536588066,
      "grad_norm": 9.237317085266113,
      "learning_rate": 0.0002,
      "loss": 3.2515,
      "mean_token_accuracy": 0.8487898230552673,
      "num_tokens": 46080.0,
      "step": 20
    },
    {
      "epoch": 0.011070519207350825,
      "grad_norm": 7.927119731903076,
      "learning_rate": 0.0002,
      "loss": 2.6725,
      "mean_token_accuracy": 0.8698692440986633,
      "num_tokens": 57931.0,
      "step": 25
    },
    {
      "epoch": 0.01328462304882099,
      "grad_norm": 12.773114204406738,
      "learning_rate": 0.0002,
      "loss": 2.3487,
      "mean_token_accuracy": 0.8942261308431625,
      "num_tokens": 70831.0,
      "step": 30
    },
    {
      "epoch": 0.015498726890291154,
      "grad_norm": 13.333335876464844,
      "learning_rate": 0.0002,
      "loss": 2.4343,
      "mean_token_accuracy": 0.8810921609401703,
      "num_tokens": 82909.0,
      "step": 35
    },
    {
      "epoch": 0.01771283073176132,
      "grad_norm": 28.373117446899414,
      "learning_rate": 0.0002,
      "loss": 1.7865,
      "mean_token_accuracy": 0.896348387002945,
      "num_tokens": 96654.0,
      "step": 40
    },
    {
      "epoch": 0.019926934573231483,
      "grad_norm": 5.377050876617432,
      "learning_rate": 0.0002,
      "loss": 1.3522,
      "mean_token_accuracy": 0.9119709521532059,
      "num_tokens": 110664.0,
      "step": 45
    },
    {
      "epoch": 0.02214103841470165,
      "grad_norm": 9.99971866607666,
      "learning_rate": 0.0002,
      "loss": 1.487,
      "mean_token_accuracy": 0.9074823081493377,
      "num_tokens": 124077.0,
      "step": 50
    },
    {
      "epoch": 0.024355142256171816,
      "grad_norm": 5.345480918884277,
      "learning_rate": 0.0002,
      "loss": 1.8425,
      "mean_token_accuracy": 0.8920609176158905,
      "num_tokens": 137134.0,
      "step": 55
    },
    {
      "epoch": 0.02656924609764198,
      "grad_norm": 7.6641950607299805,
      "learning_rate": 0.0002,
      "loss": 1.6796,
      "mean_token_accuracy": 0.9035607397556304,
      "num_tokens": 149388.0,
      "step": 60
    },
    {
      "epoch": 0.028783349939112145,
      "grad_norm": 11.887953758239746,
      "learning_rate": 0.0002,
      "loss": 1.7672,
      "mean_token_accuracy": 0.8967141926288604,
      "num_tokens": 160071.0,
      "step": 65
    },
    {
      "epoch": 0.03099745378058231,
      "grad_norm": 5.832238674163818,
      "learning_rate": 0.0002,
      "loss": 1.2079,
      "mean_token_accuracy": 0.9185251384973526,
      "num_tokens": 173268.0,
      "step": 70
    },
    {
      "epoch": 0.033211557622052475,
      "grad_norm": 8.69938850402832,
      "learning_rate": 0.0002,
      "loss": 1.5593,
      "mean_token_accuracy": 0.9084058552980423,
      "num_tokens": 186169.0,
      "step": 75
    },
    {
      "epoch": 0.03542566146352264,
      "grad_norm": 4.20233678817749,
      "learning_rate": 0.0002,
      "loss": 1.193,
      "mean_token_accuracy": 0.9241156429052353,
      "num_tokens": 201336.0,
      "step": 80
    },
    {
      "epoch": 0.03763976530499281,
      "grad_norm": 5.642890453338623,
      "learning_rate": 0.0002,
      "loss": 1.2642,
      "mean_token_accuracy": 0.9184715151786804,
      "num_tokens": 213213.0,
      "step": 85
    },
    {
      "epoch": 0.03985386914646297,
      "grad_norm": 5.765429973602295,
      "learning_rate": 0.0002,
      "loss": 1.4019,
      "mean_token_accuracy": 0.9147763669490814,
      "num_tokens": 224027.0,
      "step": 90
    },
    {
      "epoch": 0.04206797298793313,
      "grad_norm": 5.6816630363464355,
      "learning_rate": 0.0002,
      "loss": 1.3954,
      "mean_token_accuracy": 0.9195666879415512,
      "num_tokens": 235465.0,
      "step": 95
    },
    {
      "epoch": 0.0442820768294033,
      "grad_norm": 6.504826068878174,
      "learning_rate": 0.0002,
      "loss": 1.0328,
      "mean_token_accuracy": 0.925902035832405,
      "num_tokens": 248735.0,
      "step": 100
    },
    {
      "epoch": 0.046496180670873466,
      "grad_norm": 5.916987895965576,
      "learning_rate": 0.0002,
      "loss": 1.3818,
      "mean_token_accuracy": 0.910975980758667,
      "num_tokens": 260910.0,
      "step": 105
    },
    {
      "epoch": 0.04871028451234363,
      "grad_norm": 4.899371147155762,
      "learning_rate": 0.0002,
      "loss": 1.3134,
      "mean_token_accuracy": 0.911156702041626,
      "num_tokens": 272818.0,
      "step": 110
    },
    {
      "epoch": 0.05092438835381379,
      "grad_norm": 4.977072238922119,
      "learning_rate": 0.0002,
      "loss": 1.0713,
      "mean_token_accuracy": 0.9260460823774338,
      "num_tokens": 285231.0,
      "step": 115
    },
    {
      "epoch": 0.05313849219528396,
      "grad_norm": 5.722350120544434,
      "learning_rate": 0.0002,
      "loss": 1.5063,
      "mean_token_accuracy": 0.9071427851915359,
      "num_tokens": 295970.0,
      "step": 120
    },
    {
      "epoch": 0.055352596036754124,
      "grad_norm": 6.670350551605225,
      "learning_rate": 0.0002,
      "loss": 1.48,
      "mean_token_accuracy": 0.9137888938188553,
      "num_tokens": 307700.0,
      "step": 125
    },
    {
      "epoch": 0.05756669987822429,
      "grad_norm": 4.701105117797852,
      "learning_rate": 0.0002,
      "loss": 0.9216,
      "mean_token_accuracy": 0.9317790538072586,
      "num_tokens": 321924.0,
      "step": 130
    },
    {
      "epoch": 0.05978080371969445,
      "grad_norm": 6.084046840667725,
      "learning_rate": 0.0002,
      "loss": 1.1839,
      "mean_token_accuracy": 0.9231065452098847,
      "num_tokens": 336808.0,
      "step": 135
    },
    {
      "epoch": 0.06199490756116462,
      "grad_norm": 4.739227294921875,
      "learning_rate": 0.0002,
      "loss": 1.4177,
      "mean_token_accuracy": 0.9183769166469574,
      "num_tokens": 349784.0,
      "step": 140
    },
    {
      "epoch": 0.06420901140263478,
      "grad_norm": 5.750140190124512,
      "learning_rate": 0.0002,
      "loss": 1.57,
      "mean_token_accuracy": 0.9093784838914871,
      "num_tokens": 359809.0,
      "step": 145
    },
    {
      "epoch": 0.06642311524410495,
      "grad_norm": 5.502381324768066,
      "learning_rate": 0.0002,
      "loss": 1.0926,
      "mean_token_accuracy": 0.9278808891773224,
      "num_tokens": 373540.0,
      "step": 150
    },
    {
      "epoch": 0.06863721908557512,
      "grad_norm": 3.8856916427612305,
      "learning_rate": 0.0002,
      "loss": 1.0794,
      "mean_token_accuracy": 0.9323706835508346,
      "num_tokens": 385610.0,
      "step": 155
    },
    {
      "epoch": 0.07085132292704528,
      "grad_norm": 5.632940769195557,
      "learning_rate": 0.0002,
      "loss": 1.5044,
      "mean_token_accuracy": 0.904329365491867,
      "num_tokens": 396616.0,
      "step": 160
    },
    {
      "epoch": 0.07306542676851545,
      "grad_norm": 5.6375203132629395,
      "learning_rate": 0.0002,
      "loss": 1.4283,
      "mean_token_accuracy": 0.9134370684623718,
      "num_tokens": 408011.0,
      "step": 165
    },
    {
      "epoch": 0.07527953060998561,
      "grad_norm": 6.19705057144165,
      "learning_rate": 0.0002,
      "loss": 1.529,
      "mean_token_accuracy": 0.9081434190273285,
      "num_tokens": 417320.0,
      "step": 170
    },
    {
      "epoch": 0.07749363445145577,
      "grad_norm": 5.687817573547363,
      "learning_rate": 0.0002,
      "loss": 1.0551,
      "mean_token_accuracy": 0.9281114786863327,
      "num_tokens": 428148.0,
      "step": 175
    },
    {
      "epoch": 0.07970773829292593,
      "grad_norm": 4.656964302062988,
      "learning_rate": 0.0002,
      "loss": 1.0233,
      "mean_token_accuracy": 0.9278254687786103,
      "num_tokens": 440931.0,
      "step": 180
    },
    {
      "epoch": 0.0819218421343961,
      "grad_norm": 5.18228816986084,
      "learning_rate": 0.0002,
      "loss": 1.5104,
      "mean_token_accuracy": 0.9057249039411545,
      "num_tokens": 451561.0,
      "step": 185
    },
    {
      "epoch": 0.08413594597586627,
      "grad_norm": 4.148241996765137,
      "learning_rate": 0.0002,
      "loss": 0.9641,
      "mean_token_accuracy": 0.9276774257421494,
      "num_tokens": 464562.0,
      "step": 190
    },
    {
      "epoch": 0.08635004981733643,
      "grad_norm": 5.226706027984619,
      "learning_rate": 0.0002,
      "loss": 1.3431,
      "mean_token_accuracy": 0.9116898536682129,
      "num_tokens": 476950.0,
      "step": 195
    },
    {
      "epoch": 0.0885641536588066,
      "grad_norm": 4.673404693603516,
      "learning_rate": 0.0002,
      "loss": 1.1031,
      "mean_token_accuracy": 0.924088642001152,
      "num_tokens": 490346.0,
      "step": 200
    },
    {
      "epoch": 0.09077825750027677,
      "grad_norm": 4.617090225219727,
      "learning_rate": 0.0002,
      "loss": 1.013,
      "mean_token_accuracy": 0.922841164469719,
      "num_tokens": 501766.0,
      "step": 205
    },
    {
      "epoch": 0.09299236134174693,
      "grad_norm": 3.0853629112243652,
      "learning_rate": 0.0002,
      "loss": 1.0736,
      "mean_token_accuracy": 0.9237724006175995,
      "num_tokens": 514665.0,
      "step": 210
    },
    {
      "epoch": 0.0952064651832171,
      "grad_norm": 4.994758129119873,
      "learning_rate": 0.0002,
      "loss": 1.202,
      "mean_token_accuracy": 0.9217059254646301,
      "num_tokens": 526702.0,
      "step": 215
    },
    {
      "epoch": 0.09742056902468726,
      "grad_norm": 5.424100875854492,
      "learning_rate": 0.0002,
      "loss": 1.0885,
      "mean_token_accuracy": 0.9214400589466095,
      "num_tokens": 538211.0,
      "step": 220
    },
    {
      "epoch": 0.09963467286615742,
      "grad_norm": 6.007224082946777,
      "learning_rate": 0.0002,
      "loss": 1.2519,
      "mean_token_accuracy": 0.9193921983242035,
      "num_tokens": 549294.0,
      "step": 225
    },
    {
      "epoch": 0.10184877670762758,
      "grad_norm": 4.146254062652588,
      "learning_rate": 0.0002,
      "loss": 1.0478,
      "mean_token_accuracy": 0.9264948040246963,
      "num_tokens": 561455.0,
      "step": 230
    },
    {
      "epoch": 0.10406288054909775,
      "grad_norm": 5.269349575042725,
      "learning_rate": 0.0002,
      "loss": 1.0395,
      "mean_token_accuracy": 0.9255057454109192,
      "num_tokens": 571769.0,
      "step": 235
    },
    {
      "epoch": 0.10627698439056792,
      "grad_norm": 3.518718957901001,
      "learning_rate": 0.0002,
      "loss": 1.103,
      "mean_token_accuracy": 0.9197809398174286,
      "num_tokens": 583866.0,
      "step": 240
    },
    {
      "epoch": 0.10849108823203808,
      "grad_norm": 4.272643566131592,
      "learning_rate": 0.0002,
      "loss": 1.5091,
      "mean_token_accuracy": 0.9108827739953995,
      "num_tokens": 593617.0,
      "step": 245
    },
    {
      "epoch": 0.11070519207350825,
      "grad_norm": 4.085506439208984,
      "learning_rate": 0.0002,
      "loss": 1.3358,
      "mean_token_accuracy": 0.9100114196538925,
      "num_tokens": 605999.0,
      "step": 250
    },
    {
      "epoch": 0.11291929591497842,
      "grad_norm": 3.2901113033294678,
      "learning_rate": 0.0002,
      "loss": 1.0284,
      "mean_token_accuracy": 0.9262891203165055,
      "num_tokens": 618331.0,
      "step": 255
    },
    {
      "epoch": 0.11513339975644858,
      "grad_norm": 4.289281368255615,
      "learning_rate": 0.0002,
      "loss": 1.2362,
      "mean_token_accuracy": 0.9141905009746552,
      "num_tokens": 630021.0,
      "step": 260
    },
    {
      "epoch": 0.11734750359791875,
      "grad_norm": 4.200511932373047,
      "learning_rate": 0.0002,
      "loss": 1.4356,
      "mean_token_accuracy": 0.9113897413015366,
      "num_tokens": 641232.0,
      "step": 265
    },
    {
      "epoch": 0.1195616074393889,
      "grad_norm": 3.7031190395355225,
      "learning_rate": 0.0002,
      "loss": 1.0592,
      "mean_token_accuracy": 0.9252003788948059,
      "num_tokens": 655240.0,
      "step": 270
    },
    {
      "epoch": 0.12177571128085907,
      "grad_norm": 3.439293622970581,
      "learning_rate": 0.0002,
      "loss": 1.1109,
      "mean_token_accuracy": 0.923374080657959,
      "num_tokens": 667642.0,
      "step": 275
    },
    {
      "epoch": 0.12398981512232923,
      "grad_norm": 4.368510723114014,
      "learning_rate": 0.0002,
      "loss": 0.976,
      "mean_token_accuracy": 0.9249310046434402,
      "num_tokens": 679262.0,
      "step": 280
    },
    {
      "epoch": 0.1262039189637994,
      "grad_norm": 3.9658987522125244,
      "learning_rate": 0.0002,
      "loss": 0.9673,
      "mean_token_accuracy": 0.9295818597078324,
      "num_tokens": 691910.0,
      "step": 285
    },
    {
      "epoch": 0.12841802280526957,
      "grad_norm": 4.339285850524902,
      "learning_rate": 0.0002,
      "loss": 1.0667,
      "mean_token_accuracy": 0.9245558708906174,
      "num_tokens": 703552.0,
      "step": 290
    },
    {
      "epoch": 0.13063212664673973,
      "grad_norm": 4.079878330230713,
      "learning_rate": 0.0002,
      "loss": 0.8804,
      "mean_token_accuracy": 0.9306642562150955,
      "num_tokens": 716599.0,
      "step": 295
    },
    {
      "epoch": 0.1328462304882099,
      "grad_norm": 3.9294116497039795,
      "learning_rate": 0.0002,
      "loss": 1.1058,
      "mean_token_accuracy": 0.9195070207118988,
      "num_tokens": 729446.0,
      "step": 300
    },
    {
      "epoch": 0.13506033432968007,
      "grad_norm": 4.728193759918213,
      "learning_rate": 0.0002,
      "loss": 1.132,
      "mean_token_accuracy": 0.9184233337640763,
      "num_tokens": 740855.0,
      "step": 305
    },
    {
      "epoch": 0.13727443817115023,
      "grad_norm": 4.081950664520264,
      "learning_rate": 0.0002,
      "loss": 0.9906,
      "mean_token_accuracy": 0.9285815507173538,
      "num_tokens": 753698.0,
      "step": 310
    },
    {
      "epoch": 0.1394885420126204,
      "grad_norm": 5.046234130859375,
      "learning_rate": 0.0002,
      "loss": 1.385,
      "mean_token_accuracy": 0.914243558049202,
      "num_tokens": 764462.0,
      "step": 315
    },
    {
      "epoch": 0.14170264585409056,
      "grad_norm": 4.2347002029418945,
      "learning_rate": 0.0002,
      "loss": 0.8938,
      "mean_token_accuracy": 0.9304849207401276,
      "num_tokens": 777047.0,
      "step": 320
    },
    {
      "epoch": 0.14391674969556073,
      "grad_norm": 4.928355693817139,
      "learning_rate": 0.0002,
      "loss": 1.0848,
      "mean_token_accuracy": 0.9283693462610245,
      "num_tokens": 790880.0,
      "step": 325
    },
    {
      "epoch": 0.1461308535370309,
      "grad_norm": 4.760014057159424,
      "learning_rate": 0.0002,
      "loss": 1.076,
      "mean_token_accuracy": 0.9280766844749451,
      "num_tokens": 804314.0,
      "step": 330
    },
    {
      "epoch": 0.14834495737850106,
      "grad_norm": 3.9877500534057617,
      "learning_rate": 0.0002,
      "loss": 1.2699,
      "mean_token_accuracy": 0.9156712800264358,
      "num_tokens": 815261.0,
      "step": 335
    },
    {
      "epoch": 0.15055906121997123,
      "grad_norm": 4.126375198364258,
      "learning_rate": 0.0002,
      "loss": 0.9963,
      "mean_token_accuracy": 0.9256706595420837,
      "num_tokens": 829528.0,
      "step": 340
    },
    {
      "epoch": 0.15277316506144137,
      "grad_norm": 3.9972124099731445,
      "learning_rate": 0.0002,
      "loss": 1.3455,
      "mean_token_accuracy": 0.9122998714447021,
      "num_tokens": 841276.0,
      "step": 345
    },
    {
      "epoch": 0.15498726890291153,
      "grad_norm": 5.413994312286377,
      "learning_rate": 0.0002,
      "loss": 1.101,
      "mean_token_accuracy": 0.9225263863801956,
      "num_tokens": 854080.0,
      "step": 350
    },
    {
      "epoch": 0.1572013727443817,
      "grad_norm": 3.65535306930542,
      "learning_rate": 0.0002,
      "loss": 1.2503,
      "mean_token_accuracy": 0.9237045079469681,
      "num_tokens": 866984.0,
      "step": 355
    },
    {
      "epoch": 0.15941547658585187,
      "grad_norm": 4.747088432312012,
      "learning_rate": 0.0002,
      "loss": 1.3619,
      "mean_token_accuracy": 0.9102030217647552,
      "num_tokens": 878494.0,
      "step": 360
    },
    {
      "epoch": 0.16162958042732203,
      "grad_norm": 4.269980430603027,
      "learning_rate": 0.0002,
      "loss": 1.1624,
      "mean_token_accuracy": 0.9170969694852829,
      "num_tokens": 889350.0,
      "step": 365
    },
    {
      "epoch": 0.1638436842687922,
      "grad_norm": 3.83870530128479,
      "learning_rate": 0.0002,
      "loss": 1.0977,
      "mean_token_accuracy": 0.9311458617448807,
      "num_tokens": 900460.0,
      "step": 370
    },
    {
      "epoch": 0.16605778811026237,
      "grad_norm": 4.76396369934082,
      "learning_rate": 0.0002,
      "loss": 1.1958,
      "mean_token_accuracy": 0.9178014636039734,
      "num_tokens": 911749.0,
      "step": 375
    },
    {
      "epoch": 0.16827189195173253,
      "grad_norm": 4.328544616699219,
      "learning_rate": 0.0002,
      "loss": 1.0498,
      "mean_token_accuracy": 0.9265041768550872,
      "num_tokens": 924982.0,
      "step": 380
    },
    {
      "epoch": 0.1704859957932027,
      "grad_norm": 4.840917110443115,
      "learning_rate": 0.0002,
      "loss": 0.8995,
      "mean_token_accuracy": 0.9299014925956726,
      "num_tokens": 938464.0,
      "step": 385
    },
    {
      "epoch": 0.17270009963467287,
      "grad_norm": 3.3656206130981445,
      "learning_rate": 0.0002,
      "loss": 0.9166,
      "mean_token_accuracy": 0.9308703899383545,
      "num_tokens": 950480.0,
      "step": 390
    },
    {
      "epoch": 0.17491420347614303,
      "grad_norm": 4.094184398651123,
      "learning_rate": 0.0002,
      "loss": 0.9128,
      "mean_token_accuracy": 0.9329801052808762,
      "num_tokens": 962975.0,
      "step": 395
    },
    {
      "epoch": 0.1771283073176132,
      "grad_norm": 4.07610559463501,
      "learning_rate": 0.0002,
      "loss": 0.9266,
      "mean_token_accuracy": 0.9289239794015884,
      "num_tokens": 976843.0,
      "step": 400
    },
    {
      "epoch": 0.17934241115908336,
      "grad_norm": 3.3643202781677246,
      "learning_rate": 0.0002,
      "loss": 1.0849,
      "mean_token_accuracy": 0.9257559090852737,
      "num_tokens": 988427.0,
      "step": 405
    },
    {
      "epoch": 0.18155651500055353,
      "grad_norm": 4.841256141662598,
      "learning_rate": 0.0002,
      "loss": 1.2045,
      "mean_token_accuracy": 0.9165349155664444,
      "num_tokens": 999829.0,
      "step": 410
    },
    {
      "epoch": 0.1837706188420237,
      "grad_norm": 3.4727354049682617,
      "learning_rate": 0.0002,
      "loss": 1.0157,
      "mean_token_accuracy": 0.9250179201364517,
      "num_tokens": 1010929.0,
      "step": 415
    },
    {
      "epoch": 0.18598472268349386,
      "grad_norm": 3.2475087642669678,
      "learning_rate": 0.0002,
      "loss": 1.1052,
      "mean_token_accuracy": 0.9202539622783661,
      "num_tokens": 1022824.0,
      "step": 420
    },
    {
      "epoch": 0.18819882652496403,
      "grad_norm": 3.1745808124542236,
      "learning_rate": 0.0002,
      "loss": 0.7683,
      "mean_token_accuracy": 0.9444057643413544,
      "num_tokens": 1036553.0,
      "step": 425
    },
    {
      "epoch": 0.1904129303664342,
      "grad_norm": 3.939055919647217,
      "learning_rate": 0.0002,
      "loss": 0.7823,
      "mean_token_accuracy": 0.9350433677434922,
      "num_tokens": 1048393.0,
      "step": 430
    },
    {
      "epoch": 0.19262703420790436,
      "grad_norm": 3.397245407104492,
      "learning_rate": 0.0002,
      "loss": 1.1069,
      "mean_token_accuracy": 0.9203731089830398,
      "num_tokens": 1060606.0,
      "step": 435
    },
    {
      "epoch": 0.19484113804937453,
      "grad_norm": 3.675420045852661,
      "learning_rate": 0.0002,
      "loss": 1.2814,
      "mean_token_accuracy": 0.9185589760541916,
      "num_tokens": 1073102.0,
      "step": 440
    },
    {
      "epoch": 0.19705524189084467,
      "grad_norm": 2.8432865142822266,
      "learning_rate": 0.0002,
      "loss": 0.9715,
      "mean_token_accuracy": 0.9279666066169738,
      "num_tokens": 1083869.0,
      "step": 445
    },
    {
      "epoch": 0.19926934573231483,
      "grad_norm": 3.5305676460266113,
      "learning_rate": 0.0002,
      "loss": 1.0675,
      "mean_token_accuracy": 0.9236095041036606,
      "num_tokens": 1094722.0,
      "step": 450
    },
    {
      "epoch": 0.201483449573785,
      "grad_norm": 4.308902263641357,
      "learning_rate": 0.0002,
      "loss": 0.7586,
      "mean_token_accuracy": 0.9407237708568573,
      "num_tokens": 1109517.0,
      "step": 455
    },
    {
      "epoch": 0.20369755341525517,
      "grad_norm": 3.947713613510132,
      "learning_rate": 0.0002,
      "loss": 0.8975,
      "mean_token_accuracy": 0.9331141859292984,
      "num_tokens": 1122116.0,
      "step": 460
    },
    {
      "epoch": 0.20591165725672533,
      "grad_norm": 2.7408392429351807,
      "learning_rate": 0.0002,
      "loss": 0.9984,
      "mean_token_accuracy": 0.9276015996932984,
      "num_tokens": 1133434.0,
      "step": 465
    },
    {
      "epoch": 0.2081257610981955,
      "grad_norm": 4.87424373626709,
      "learning_rate": 0.0002,
      "loss": 0.9096,
      "mean_token_accuracy": 0.9369789361953735,
      "num_tokens": 1147039.0,
      "step": 470
    },
    {
      "epoch": 0.21033986493966567,
      "grad_norm": 3.165412425994873,
      "learning_rate": 0.0002,
      "loss": 1.1,
      "mean_token_accuracy": 0.9302688419818879,
      "num_tokens": 1160845.0,
      "step": 475
    },
    {
      "epoch": 0.21255396878113583,
      "grad_norm": 3.668769121170044,
      "learning_rate": 0.0002,
      "loss": 1.0275,
      "mean_token_accuracy": 0.9271714627742768,
      "num_tokens": 1172244.0,
      "step": 480
    },
    {
      "epoch": 0.214768072622606,
      "grad_norm": 3.2549095153808594,
      "learning_rate": 0.0002,
      "loss": 0.9565,
      "mean_token_accuracy": 0.9336414575576782,
      "num_tokens": 1185395.0,
      "step": 485
    },
    {
      "epoch": 0.21698217646407617,
      "grad_norm": 3.9204583168029785,
      "learning_rate": 0.0002,
      "loss": 1.0073,
      "mean_token_accuracy": 0.9254105865955353,
      "num_tokens": 1197385.0,
      "step": 490
    },
    {
      "epoch": 0.21919628030554633,
      "grad_norm": 4.254587650299072,
      "learning_rate": 0.0002,
      "loss": 1.2027,
      "mean_token_accuracy": 0.9190206825733185,
      "num_tokens": 1207777.0,
      "step": 495
    },
    {
      "epoch": 0.2214103841470165,
      "grad_norm": 3.455690622329712,
      "learning_rate": 0.0002,
      "loss": 1.335,
      "mean_token_accuracy": 0.9205510348081589,
      "num_tokens": 1218537.0,
      "step": 500
    },
    {
      "epoch": 0.22362448798848666,
      "grad_norm": 3.2426981925964355,
      "learning_rate": 0.0002,
      "loss": 1.1186,
      "mean_token_accuracy": 0.9233917683362961,
      "num_tokens": 1229882.0,
      "step": 505
    },
    {
      "epoch": 0.22583859182995683,
      "grad_norm": 3.5431432723999023,
      "learning_rate": 0.0002,
      "loss": 1.1298,
      "mean_token_accuracy": 0.9185365289449692,
      "num_tokens": 1241357.0,
      "step": 510
    },
    {
      "epoch": 0.228052695671427,
      "grad_norm": 3.3408544063568115,
      "learning_rate": 0.0002,
      "loss": 1.0158,
      "mean_token_accuracy": 0.931389006972313,
      "num_tokens": 1254158.0,
      "step": 515
    },
    {
      "epoch": 0.23026679951289716,
      "grad_norm": 3.5583953857421875,
      "learning_rate": 0.0002,
      "loss": 0.985,
      "mean_token_accuracy": 0.9288320362567901,
      "num_tokens": 1265454.0,
      "step": 520
    },
    {
      "epoch": 0.23248090335436733,
      "grad_norm": 3.7565269470214844,
      "learning_rate": 0.0002,
      "loss": 1.1319,
      "mean_token_accuracy": 0.9232024788856507,
      "num_tokens": 1276256.0,
      "step": 525
    },
    {
      "epoch": 0.2346950071958375,
      "grad_norm": 3.4486448764801025,
      "learning_rate": 0.0002,
      "loss": 0.9273,
      "mean_token_accuracy": 0.9286326110363007,
      "num_tokens": 1289049.0,
      "step": 530
    },
    {
      "epoch": 0.23690911103730766,
      "grad_norm": 3.341252565383911,
      "learning_rate": 0.0002,
      "loss": 0.9678,
      "mean_token_accuracy": 0.9301002591848373,
      "num_tokens": 1301964.0,
      "step": 535
    },
    {
      "epoch": 0.2391232148787778,
      "grad_norm": 3.2227766513824463,
      "learning_rate": 0.0002,
      "loss": 0.9778,
      "mean_token_accuracy": 0.9308173507452011,
      "num_tokens": 1316098.0,
      "step": 540
    },
    {
      "epoch": 0.24133731872024797,
      "grad_norm": 4.225726127624512,
      "learning_rate": 0.0002,
      "loss": 1.0343,
      "mean_token_accuracy": 0.9257538586854934,
      "num_tokens": 1327919.0,
      "step": 545
    },
    {
      "epoch": 0.24355142256171813,
      "grad_norm": 3.6367788314819336,
      "learning_rate": 0.0002,
      "loss": 1.0479,
      "mean_token_accuracy": 0.9220652222633362,
      "num_tokens": 1341243.0,
      "step": 550
    },
    {
      "epoch": 0.2457655264031883,
      "grad_norm": 2.7346153259277344,
      "learning_rate": 0.0002,
      "loss": 0.9359,
      "mean_token_accuracy": 0.928188094496727,
      "num_tokens": 1353419.0,
      "step": 555
    },
    {
      "epoch": 0.24797963024465847,
      "grad_norm": 3.3693747520446777,
      "learning_rate": 0.0002,
      "loss": 1.0423,
      "mean_token_accuracy": 0.9239319413900375,
      "num_tokens": 1364006.0,
      "step": 560
    },
    {
      "epoch": 0.25019373408612866,
      "grad_norm": 4.45697546005249,
      "learning_rate": 0.0002,
      "loss": 1.0615,
      "mean_token_accuracy": 0.9232381820678711,
      "num_tokens": 1375497.0,
      "step": 565
    },
    {
      "epoch": 0.2524078379275988,
      "grad_norm": 3.7682595252990723,
      "learning_rate": 0.0002,
      "loss": 1.0123,
      "mean_token_accuracy": 0.9235726416110992,
      "num_tokens": 1388604.0,
      "step": 570
    },
    {
      "epoch": 0.254621941769069,
      "grad_norm": 2.5660793781280518,
      "learning_rate": 0.0002,
      "loss": 0.7591,
      "mean_token_accuracy": 0.9384708911180496,
      "num_tokens": 1402952.0,
      "step": 575
    },
    {
      "epoch": 0.25683604561053913,
      "grad_norm": 3.347537040710449,
      "learning_rate": 0.0002,
      "loss": 0.8635,
      "mean_token_accuracy": 0.9377257645130157,
      "num_tokens": 1416588.0,
      "step": 580
    },
    {
      "epoch": 0.2590501494520093,
      "grad_norm": 11.165135383605957,
      "learning_rate": 0.0002,
      "loss": 1.1461,
      "mean_token_accuracy": 0.9261222094297409,
      "num_tokens": 1429142.0,
      "step": 585
    },
    {
      "epoch": 0.26126425329347946,
      "grad_norm": 3.1489033699035645,
      "learning_rate": 0.0002,
      "loss": 1.2318,
      "mean_token_accuracy": 0.9255981892347336,
      "num_tokens": 1440345.0,
      "step": 590
    },
    {
      "epoch": 0.2634783571349496,
      "grad_norm": 3.541116952896118,
      "learning_rate": 0.0002,
      "loss": 0.8121,
      "mean_token_accuracy": 0.9322708487510681,
      "num_tokens": 1455224.0,
      "step": 595
    },
    {
      "epoch": 0.2656924609764198,
      "grad_norm": 4.341325283050537,
      "learning_rate": 0.0002,
      "loss": 1.1886,
      "mean_token_accuracy": 0.920156580209732,
      "num_tokens": 1468143.0,
      "step": 600
    },
    {
      "epoch": 0.26790656481788994,
      "grad_norm": 3.9943504333496094,
      "learning_rate": 0.0002,
      "loss": 1.0343,
      "mean_token_accuracy": 0.9286263018846512,
      "num_tokens": 1482762.0,
      "step": 605
    },
    {
      "epoch": 0.27012066865936013,
      "grad_norm": 3.0633606910705566,
      "learning_rate": 0.0002,
      "loss": 1.162,
      "mean_token_accuracy": 0.9224074572324753,
      "num_tokens": 1494120.0,
      "step": 610
    },
    {
      "epoch": 0.27233477250083027,
      "grad_norm": 3.8182456493377686,
      "learning_rate": 0.0002,
      "loss": 1.1397,
      "mean_token_accuracy": 0.9227182388305664,
      "num_tokens": 1505877.0,
      "step": 615
    },
    {
      "epoch": 0.27454887634230046,
      "grad_norm": 4.838993549346924,
      "learning_rate": 0.0002,
      "loss": 0.9525,
      "mean_token_accuracy": 0.934059739112854,
      "num_tokens": 1519437.0,
      "step": 620
    },
    {
      "epoch": 0.2767629801837706,
      "grad_norm": 3.2925949096679688,
      "learning_rate": 0.0002,
      "loss": 0.7975,
      "mean_token_accuracy": 0.936807957291603,
      "num_tokens": 1533986.0,
      "step": 625
    },
    {
      "epoch": 0.2789770840252408,
      "grad_norm": 3.633017063140869,
      "learning_rate": 0.0002,
      "loss": 1.022,
      "mean_token_accuracy": 0.9283811062574386,
      "num_tokens": 1545664.0,
      "step": 630
    },
    {
      "epoch": 0.28119118786671093,
      "grad_norm": 3.6399173736572266,
      "learning_rate": 0.0002,
      "loss": 0.844,
      "mean_token_accuracy": 0.9315326452255249,
      "num_tokens": 1558398.0,
      "step": 635
    },
    {
      "epoch": 0.28340529170818113,
      "grad_norm": 4.304896831512451,
      "learning_rate": 0.0002,
      "loss": 1.1099,
      "mean_token_accuracy": 0.9182446151971817,
      "num_tokens": 1569769.0,
      "step": 640
    },
    {
      "epoch": 0.28561939554965127,
      "grad_norm": 3.875694513320923,
      "learning_rate": 0.0002,
      "loss": 1.1155,
      "mean_token_accuracy": 0.923322680592537,
      "num_tokens": 1581951.0,
      "step": 645
    },
    {
      "epoch": 0.28783349939112146,
      "grad_norm": 2.719801187515259,
      "learning_rate": 0.0002,
      "loss": 1.0578,
      "mean_token_accuracy": 0.9241158574819565,
      "num_tokens": 1594802.0,
      "step": 650
    },
    {
      "epoch": 0.2900476032325916,
      "grad_norm": 2.3830995559692383,
      "learning_rate": 0.0002,
      "loss": 0.9112,
      "mean_token_accuracy": 0.9314200520515442,
      "num_tokens": 1608969.0,
      "step": 655
    },
    {
      "epoch": 0.2922617070740618,
      "grad_norm": 2.6518445014953613,
      "learning_rate": 0.0002,
      "loss": 0.7838,
      "mean_token_accuracy": 0.9370227992534638,
      "num_tokens": 1622121.0,
      "step": 660
    },
    {
      "epoch": 0.29447581091553193,
      "grad_norm": 3.3631813526153564,
      "learning_rate": 0.0002,
      "loss": 1.1264,
      "mean_token_accuracy": 0.9195171415805816,
      "num_tokens": 1635382.0,
      "step": 665
    },
    {
      "epoch": 0.2966899147570021,
      "grad_norm": 2.3228812217712402,
      "learning_rate": 0.0002,
      "loss": 0.8994,
      "mean_token_accuracy": 0.933735242486,
      "num_tokens": 1647948.0,
      "step": 670
    },
    {
      "epoch": 0.29890401859847227,
      "grad_norm": 4.091598033905029,
      "learning_rate": 0.0002,
      "loss": 1.1017,
      "mean_token_accuracy": 0.9218682497739792,
      "num_tokens": 1659267.0,
      "step": 675
    },
    {
      "epoch": 0.30111812243994246,
      "grad_norm": 3.764561414718628,
      "learning_rate": 0.0002,
      "loss": 0.7148,
      "mean_token_accuracy": 0.9434669315814972,
      "num_tokens": 1672417.0,
      "step": 680
    },
    {
      "epoch": 0.3033322262814126,
      "grad_norm": 3.185284376144409,
      "learning_rate": 0.0002,
      "loss": 0.8594,
      "mean_token_accuracy": 0.9352281510829925,
      "num_tokens": 1686121.0,
      "step": 685
    },
    {
      "epoch": 0.30554633012288274,
      "grad_norm": 3.7809314727783203,
      "learning_rate": 0.0002,
      "loss": 1.0109,
      "mean_token_accuracy": 0.9245012730360032,
      "num_tokens": 1697450.0,
      "step": 690
    },
    {
      "epoch": 0.30776043396435293,
      "grad_norm": 3.160498857498169,
      "learning_rate": 0.0002,
      "loss": 0.9013,
      "mean_token_accuracy": 0.9293900519609452,
      "num_tokens": 1708446.0,
      "step": 695
    },
    {
      "epoch": 0.30997453780582307,
      "grad_norm": 3.9027180671691895,
      "learning_rate": 0.0002,
      "loss": 0.9421,
      "mean_token_accuracy": 0.9305406659841537,
      "num_tokens": 1721554.0,
      "step": 700
    },
    {
      "epoch": 0.31218864164729326,
      "grad_norm": 4.140758514404297,
      "learning_rate": 0.0002,
      "loss": 1.1733,
      "mean_token_accuracy": 0.9222266197204589,
      "num_tokens": 1732417.0,
      "step": 705
    },
    {
      "epoch": 0.3144027454887634,
      "grad_norm": 3.0247952938079834,
      "learning_rate": 0.0002,
      "loss": 0.9215,
      "mean_token_accuracy": 0.933441498875618,
      "num_tokens": 1745534.0,
      "step": 710
    },
    {
      "epoch": 0.3166168493302336,
      "grad_norm": 3.145435094833374,
      "learning_rate": 0.0002,
      "loss": 1.1067,
      "mean_token_accuracy": 0.9226934105157852,
      "num_tokens": 1757200.0,
      "step": 715
    },
    {
      "epoch": 0.31883095317170373,
      "grad_norm": 2.747141122817993,
      "learning_rate": 0.0002,
      "loss": 1.1384,
      "mean_token_accuracy": 0.9188572406768799,
      "num_tokens": 1768726.0,
      "step": 720
    },
    {
      "epoch": 0.32104505701317393,
      "grad_norm": 3.0959110260009766,
      "learning_rate": 0.0002,
      "loss": 0.9543,
      "mean_token_accuracy": 0.930645814538002,
      "num_tokens": 1779198.0,
      "step": 725
    },
    {
      "epoch": 0.32325916085464407,
      "grad_norm": 3.135096788406372,
      "learning_rate": 0.0002,
      "loss": 0.9297,
      "mean_token_accuracy": 0.9337345957756042,
      "num_tokens": 1792258.0,
      "step": 730
    },
    {
      "epoch": 0.32547326469611426,
      "grad_norm": 2.297475814819336,
      "learning_rate": 0.0002,
      "loss": 0.6964,
      "mean_token_accuracy": 0.9390480488538742,
      "num_tokens": 1804579.0,
      "step": 735
    },
    {
      "epoch": 0.3276873685375844,
      "grad_norm": 2.391242265701294,
      "learning_rate": 0.0002,
      "loss": 0.877,
      "mean_token_accuracy": 0.9332740783691407,
      "num_tokens": 1817721.0,
      "step": 740
    },
    {
      "epoch": 0.3299014723790546,
      "grad_norm": 2.637448787689209,
      "learning_rate": 0.0002,
      "loss": 0.9144,
      "mean_token_accuracy": 0.9269635200500488,
      "num_tokens": 1829841.0,
      "step": 745
    },
    {
      "epoch": 0.33211557622052473,
      "grad_norm": 2.4021666049957275,
      "learning_rate": 0.0002,
      "loss": 0.7773,
      "mean_token_accuracy": 0.9371457427740097,
      "num_tokens": 1843795.0,
      "step": 750
    },
    {
      "epoch": 0.3343296800619949,
      "grad_norm": 3.012258768081665,
      "learning_rate": 0.0002,
      "loss": 1.2914,
      "mean_token_accuracy": 0.9153674453496933,
      "num_tokens": 1854609.0,
      "step": 755
    },
    {
      "epoch": 0.33654378390346507,
      "grad_norm": 3.001725912094116,
      "learning_rate": 0.0002,
      "loss": 0.9337,
      "mean_token_accuracy": 0.929664534330368,
      "num_tokens": 1867176.0,
      "step": 760
    },
    {
      "epoch": 0.33875788774493526,
      "grad_norm": 2.9781148433685303,
      "learning_rate": 0.0002,
      "loss": 0.8391,
      "mean_token_accuracy": 0.9307692885398865,
      "num_tokens": 1879001.0,
      "step": 765
    },
    {
      "epoch": 0.3409719915864054,
      "grad_norm": 3.3859033584594727,
      "learning_rate": 0.0002,
      "loss": 0.9095,
      "mean_token_accuracy": 0.9333516269922256,
      "num_tokens": 1891502.0,
      "step": 770
    },
    {
      "epoch": 0.3431860954278756,
      "grad_norm": 2.874831199645996,
      "learning_rate": 0.0002,
      "loss": 0.7998,
      "mean_token_accuracy": 0.9348277896642685,
      "num_tokens": 1903279.0,
      "step": 775
    },
    {
      "epoch": 0.34540019926934573,
      "grad_norm": 3.1517276763916016,
      "learning_rate": 0.0002,
      "loss": 0.9995,
      "mean_token_accuracy": 0.9329377114772797,
      "num_tokens": 1915655.0,
      "step": 780
    },
    {
      "epoch": 0.34761430311081587,
      "grad_norm": 3.497373342514038,
      "learning_rate": 0.0002,
      "loss": 0.964,
      "mean_token_accuracy": 0.9265813857316971,
      "num_tokens": 1928510.0,
      "step": 785
    },
    {
      "epoch": 0.34982840695228606,
      "grad_norm": 2.791043996810913,
      "learning_rate": 0.0002,
      "loss": 0.9767,
      "mean_token_accuracy": 0.927400279045105,
      "num_tokens": 1940056.0,
      "step": 790
    },
    {
      "epoch": 0.3520425107937562,
      "grad_norm": 2.9507498741149902,
      "learning_rate": 0.0002,
      "loss": 0.818,
      "mean_token_accuracy": 0.9325792044401169,
      "num_tokens": 1953063.0,
      "step": 795
    },
    {
      "epoch": 0.3542566146352264,
      "grad_norm": 3.5697410106658936,
      "learning_rate": 0.0002,
      "loss": 1.1487,
      "mean_token_accuracy": 0.9165920346975327,
      "num_tokens": 1962616.0,
      "step": 800
    },
    {
      "epoch": 0.35647071847669654,
      "grad_norm": 2.9599061012268066,
      "learning_rate": 0.0002,
      "loss": 0.9402,
      "mean_token_accuracy": 0.9306722432374954,
      "num_tokens": 1974475.0,
      "step": 805
    },
    {
      "epoch": 0.35868482231816673,
      "grad_norm": 2.6589152812957764,
      "learning_rate": 0.0002,
      "loss": 1.0531,
      "mean_token_accuracy": 0.9262530177831649,
      "num_tokens": 1986985.0,
      "step": 810
    },
    {
      "epoch": 0.36089892615963687,
      "grad_norm": 2.572406530380249,
      "learning_rate": 0.0002,
      "loss": 0.9497,
      "mean_token_accuracy": 0.9344472140073776,
      "num_tokens": 1998976.0,
      "step": 815
    },
    {
      "epoch": 0.36311303000110706,
      "grad_norm": 3.164608955383301,
      "learning_rate": 0.0002,
      "loss": 0.8457,
      "mean_token_accuracy": 0.9335892468690872,
      "num_tokens": 2011484.0,
      "step": 820
    },
    {
      "epoch": 0.3653271338425772,
      "grad_norm": 2.683702230453491,
      "learning_rate": 0.0002,
      "loss": 0.9487,
      "mean_token_accuracy": 0.9289698421955108,
      "num_tokens": 2023211.0,
      "step": 825
    },
    {
      "epoch": 0.3675412376840474,
      "grad_norm": 3.9282374382019043,
      "learning_rate": 0.0002,
      "loss": 0.852,
      "mean_token_accuracy": 0.9424950003623962,
      "num_tokens": 2034942.0,
      "step": 830
    },
    {
      "epoch": 0.36975534152551753,
      "grad_norm": 3.512605905532837,
      "learning_rate": 0.0002,
      "loss": 1.0478,
      "mean_token_accuracy": 0.9244632363319397,
      "num_tokens": 2047139.0,
      "step": 835
    },
    {
      "epoch": 0.3719694453669877,
      "grad_norm": 2.8640499114990234,
      "learning_rate": 0.0002,
      "loss": 0.9073,
      "mean_token_accuracy": 0.9334672391414642,
      "num_tokens": 2060706.0,
      "step": 840
    },
    {
      "epoch": 0.37418354920845787,
      "grad_norm": 2.45273756980896,
      "learning_rate": 0.0002,
      "loss": 1.1191,
      "mean_token_accuracy": 0.9188451081514358,
      "num_tokens": 2071960.0,
      "step": 845
    },
    {
      "epoch": 0.37639765304992806,
      "grad_norm": 3.4304628372192383,
      "learning_rate": 0.0002,
      "loss": 1.0851,
      "mean_token_accuracy": 0.9217288702726364,
      "num_tokens": 2083408.0,
      "step": 850
    },
    {
      "epoch": 0.3786117568913982,
      "grad_norm": 2.3865091800689697,
      "learning_rate": 0.0002,
      "loss": 0.7252,
      "mean_token_accuracy": 0.9369473248720169,
      "num_tokens": 2096307.0,
      "step": 855
    },
    {
      "epoch": 0.3808258607328684,
      "grad_norm": 3.7749905586242676,
      "learning_rate": 0.0002,
      "loss": 0.8653,
      "mean_token_accuracy": 0.9342508226633072,
      "num_tokens": 2109585.0,
      "step": 860
    },
    {
      "epoch": 0.38303996457433853,
      "grad_norm": 2.718156099319458,
      "learning_rate": 0.0002,
      "loss": 0.7336,
      "mean_token_accuracy": 0.9394059836864471,
      "num_tokens": 2124731.0,
      "step": 865
    },
    {
      "epoch": 0.3852540684158087,
      "grad_norm": 2.924015522003174,
      "learning_rate": 0.0002,
      "loss": 0.8086,
      "mean_token_accuracy": 0.9387334406375885,
      "num_tokens": 2138127.0,
      "step": 870
    },
    {
      "epoch": 0.38746817225727886,
      "grad_norm": 3.366246461868286,
      "learning_rate": 0.0002,
      "loss": 0.814,
      "mean_token_accuracy": 0.9380306899547577,
      "num_tokens": 2148305.0,
      "step": 875
    },
    {
      "epoch": 0.38968227609874906,
      "grad_norm": 3.231900930404663,
      "learning_rate": 0.0002,
      "loss": 0.9297,
      "mean_token_accuracy": 0.927461439371109,
      "num_tokens": 2160722.0,
      "step": 880
    },
    {
      "epoch": 0.3918963799402192,
      "grad_norm": 2.826343059539795,
      "learning_rate": 0.0002,
      "loss": 1.1812,
      "mean_token_accuracy": 0.9228267341852188,
      "num_tokens": 2171118.0,
      "step": 885
    },
    {
      "epoch": 0.39411048378168934,
      "grad_norm": 3.783430814743042,
      "learning_rate": 0.0002,
      "loss": 0.9572,
      "mean_token_accuracy": 0.9258956193923951,
      "num_tokens": 2182445.0,
      "step": 890
    },
    {
      "epoch": 0.39632458762315953,
      "grad_norm": 3.7381107807159424,
      "learning_rate": 0.0002,
      "loss": 1.0978,
      "mean_token_accuracy": 0.9256362348794938,
      "num_tokens": 2195042.0,
      "step": 895
    },
    {
      "epoch": 0.39853869146462967,
      "grad_norm": 3.0702905654907227,
      "learning_rate": 0.0002,
      "loss": 0.7616,
      "mean_token_accuracy": 0.9401916921138763,
      "num_tokens": 2210503.0,
      "step": 900
    },
    {
      "epoch": 0.40075279530609986,
      "grad_norm": 3.0583152770996094,
      "learning_rate": 0.0002,
      "loss": 0.9957,
      "mean_token_accuracy": 0.9271980673074722,
      "num_tokens": 2222075.0,
      "step": 905
    },
    {
      "epoch": 0.40296689914757,
      "grad_norm": 2.851524591445923,
      "learning_rate": 0.0002,
      "loss": 0.9718,
      "mean_token_accuracy": 0.9335306733846664,
      "num_tokens": 2234666.0,
      "step": 910
    },
    {
      "epoch": 0.4051810029890402,
      "grad_norm": 3.090538740158081,
      "learning_rate": 0.0002,
      "loss": 0.887,
      "mean_token_accuracy": 0.9297337353229522,
      "num_tokens": 2245402.0,
      "step": 915
    },
    {
      "epoch": 0.40739510683051033,
      "grad_norm": 2.5975914001464844,
      "learning_rate": 0.0002,
      "loss": 0.7172,
      "mean_token_accuracy": 0.9442079395055771,
      "num_tokens": 2258066.0,
      "step": 920
    },
    {
      "epoch": 0.40960921067198053,
      "grad_norm": 2.861872911453247,
      "learning_rate": 0.0002,
      "loss": 0.8477,
      "mean_token_accuracy": 0.9340190798044204,
      "num_tokens": 2269745.0,
      "step": 925
    },
    {
      "epoch": 0.41182331451345067,
      "grad_norm": 3.162109136581421,
      "learning_rate": 0.0002,
      "loss": 0.9025,
      "mean_token_accuracy": 0.9301832497119904,
      "num_tokens": 2282483.0,
      "step": 930
    },
    {
      "epoch": 0.41403741835492086,
      "grad_norm": 2.9675636291503906,
      "learning_rate": 0.0002,
      "loss": 0.8154,
      "mean_token_accuracy": 0.9318990021944046,
      "num_tokens": 2294175.0,
      "step": 935
    },
    {
      "epoch": 0.416251522196391,
      "grad_norm": 3.2092788219451904,
      "learning_rate": 0.0002,
      "loss": 1.2112,
      "mean_token_accuracy": 0.9157642692327499,
      "num_tokens": 2305822.0,
      "step": 940
    },
    {
      "epoch": 0.4184656260378612,
      "grad_norm": 3.086061954498291,
      "learning_rate": 0.0002,
      "loss": 0.8472,
      "mean_token_accuracy": 0.9323055207729339,
      "num_tokens": 2319852.0,
      "step": 945
    },
    {
      "epoch": 0.42067972987933133,
      "grad_norm": 2.875953435897827,
      "learning_rate": 0.0002,
      "loss": 0.8917,
      "mean_token_accuracy": 0.9304117172956466,
      "num_tokens": 2330826.0,
      "step": 950
    },
    {
      "epoch": 0.4228938337208015,
      "grad_norm": 3.364098072052002,
      "learning_rate": 0.0002,
      "loss": 1.1513,
      "mean_token_accuracy": 0.9273996829986573,
      "num_tokens": 2340121.0,
      "step": 955
    },
    {
      "epoch": 0.42510793756227166,
      "grad_norm": 2.3155770301818848,
      "learning_rate": 0.0002,
      "loss": 0.8481,
      "mean_token_accuracy": 0.9310476630926132,
      "num_tokens": 2353874.0,
      "step": 960
    },
    {
      "epoch": 0.42732204140374186,
      "grad_norm": 2.873863458633423,
      "learning_rate": 0.0002,
      "loss": 1.0553,
      "mean_token_accuracy": 0.9190466612577438,
      "num_tokens": 2364722.0,
      "step": 965
    },
    {
      "epoch": 0.429536145245212,
      "grad_norm": 3.088542938232422,
      "learning_rate": 0.0002,
      "loss": 0.7824,
      "mean_token_accuracy": 0.9386604636907577,
      "num_tokens": 2377983.0,
      "step": 970
    },
    {
      "epoch": 0.4317502490866822,
      "grad_norm": 3.2161245346069336,
      "learning_rate": 0.0002,
      "loss": 0.9667,
      "mean_token_accuracy": 0.9316904872655869,
      "num_tokens": 2388214.0,
      "step": 975
    },
    {
      "epoch": 0.43396435292815233,
      "grad_norm": 3.019383192062378,
      "learning_rate": 0.0002,
      "loss": 0.8876,
      "mean_token_accuracy": 0.9336837440729141,
      "num_tokens": 2402069.0,
      "step": 980
    },
    {
      "epoch": 0.43617845676962247,
      "grad_norm": 3.164597988128662,
      "learning_rate": 0.0002,
      "loss": 0.9017,
      "mean_token_accuracy": 0.9319613158702851,
      "num_tokens": 2415773.0,
      "step": 985
    },
    {
      "epoch": 0.43839256061109266,
      "grad_norm": 3.070587635040283,
      "learning_rate": 0.0002,
      "loss": 0.812,
      "mean_token_accuracy": 0.9365138530731201,
      "num_tokens": 2428468.0,
      "step": 990
    },
    {
      "epoch": 0.4406066644525628,
      "grad_norm": 2.605221748352051,
      "learning_rate": 0.0002,
      "loss": 0.8997,
      "mean_token_accuracy": 0.9311312526464463,
      "num_tokens": 2439495.0,
      "step": 995
    },
    {
      "epoch": 0.442820768294033,
      "grad_norm": 3.4609057903289795,
      "learning_rate": 0.0002,
      "loss": 0.9947,
      "mean_token_accuracy": 0.9287879914045334,
      "num_tokens": 2449133.0,
      "step": 1000
    },
    {
      "epoch": 0.44503487213550313,
      "grad_norm": 3.2428348064422607,
      "learning_rate": 0.0002,
      "loss": 0.9216,
      "mean_token_accuracy": 0.9293368250131607,
      "num_tokens": 2460987.0,
      "step": 1005
    },
    {
      "epoch": 0.44724897597697333,
      "grad_norm": 2.541088342666626,
      "learning_rate": 0.0002,
      "loss": 1.1029,
      "mean_token_accuracy": 0.9256097286939621,
      "num_tokens": 2472360.0,
      "step": 1010
    },
    {
      "epoch": 0.44946307981844347,
      "grad_norm": 3.3123016357421875,
      "learning_rate": 0.0002,
      "loss": 1.0771,
      "mean_token_accuracy": 0.9271711260080338,
      "num_tokens": 2482790.0,
      "step": 1015
    },
    {
      "epoch": 0.45167718365991366,
      "grad_norm": 2.808271884918213,
      "learning_rate": 0.0002,
      "loss": 0.9692,
      "mean_token_accuracy": 0.9322762846946716,
      "num_tokens": 2494158.0,
      "step": 1020
    },
    {
      "epoch": 0.4538912875013838,
      "grad_norm": 3.2123773097991943,
      "learning_rate": 0.0002,
      "loss": 1.2038,
      "mean_token_accuracy": 0.9229989141225815,
      "num_tokens": 2504659.0,
      "step": 1025
    },
    {
      "epoch": 0.456105391342854,
      "grad_norm": 3.9224977493286133,
      "learning_rate": 0.0002,
      "loss": 0.9177,
      "mean_token_accuracy": 0.9301177680492401,
      "num_tokens": 2515947.0,
      "step": 1030
    },
    {
      "epoch": 0.45831949518432413,
      "grad_norm": 3.159163236618042,
      "learning_rate": 0.0002,
      "loss": 1.1681,
      "mean_token_accuracy": 0.9179033428430557,
      "num_tokens": 2527377.0,
      "step": 1035
    },
    {
      "epoch": 0.4605335990257943,
      "grad_norm": 2.5653562545776367,
      "learning_rate": 0.0002,
      "loss": 1.054,
      "mean_token_accuracy": 0.9257000118494034,
      "num_tokens": 2539474.0,
      "step": 1040
    },
    {
      "epoch": 0.46274770286726447,
      "grad_norm": 2.449213981628418,
      "learning_rate": 0.0002,
      "loss": 0.9338,
      "mean_token_accuracy": 0.9319339364767074,
      "num_tokens": 2553545.0,
      "step": 1045
    },
    {
      "epoch": 0.46496180670873466,
      "grad_norm": 3.5044524669647217,
      "learning_rate": 0.0002,
      "loss": 0.9448,
      "mean_token_accuracy": 0.9295208871364593,
      "num_tokens": 2564811.0,
      "step": 1050
    },
    {
      "epoch": 0.4671759105502048,
      "grad_norm": 3.5413150787353516,
      "learning_rate": 0.0002,
      "loss": 0.9424,
      "mean_token_accuracy": 0.9302045583724976,
      "num_tokens": 2576751.0,
      "step": 1055
    },
    {
      "epoch": 0.469390014391675,
      "grad_norm": 3.0598480701446533,
      "learning_rate": 0.0002,
      "loss": 0.9126,
      "mean_token_accuracy": 0.9285120725631714,
      "num_tokens": 2588358.0,
      "step": 1060
    },
    {
      "epoch": 0.47160411823314513,
      "grad_norm": 2.6380608081817627,
      "learning_rate": 0.0002,
      "loss": 0.9495,
      "mean_token_accuracy": 0.9323398619890213,
      "num_tokens": 2599877.0,
      "step": 1065
    },
    {
      "epoch": 0.4738182220746153,
      "grad_norm": 3.1709139347076416,
      "learning_rate": 0.0002,
      "loss": 0.9925,
      "mean_token_accuracy": 0.9324059277772904,
      "num_tokens": 2610770.0,
      "step": 1070
    },
    {
      "epoch": 0.47603232591608546,
      "grad_norm": 2.9764163494110107,
      "learning_rate": 0.0002,
      "loss": 0.782,
      "mean_token_accuracy": 0.9320799469947815,
      "num_tokens": 2621941.0,
      "step": 1075
    },
    {
      "epoch": 0.4782464297575556,
      "grad_norm": 2.2882895469665527,
      "learning_rate": 0.0002,
      "loss": 1.0986,
      "mean_token_accuracy": 0.9247247219085694,
      "num_tokens": 2631475.0,
      "step": 1080
    },
    {
      "epoch": 0.4804605335990258,
      "grad_norm": 3.075330972671509,
      "learning_rate": 0.0002,
      "loss": 1.0145,
      "mean_token_accuracy": 0.9294690877199173,
      "num_tokens": 2645344.0,
      "step": 1085
    },
    {
      "epoch": 0.48267463744049593,
      "grad_norm": 3.256373167037964,
      "learning_rate": 0.0002,
      "loss": 0.8895,
      "mean_token_accuracy": 0.9273504942655564,
      "num_tokens": 2655935.0,
      "step": 1090
    },
    {
      "epoch": 0.48488874128196613,
      "grad_norm": 3.454824924468994,
      "learning_rate": 0.0002,
      "loss": 0.8622,
      "mean_token_accuracy": 0.9389674246311188,
      "num_tokens": 2667932.0,
      "step": 1095
    },
    {
      "epoch": 0.48710284512343627,
      "grad_norm": 3.4632182121276855,
      "learning_rate": 0.0002,
      "loss": 1.0757,
      "mean_token_accuracy": 0.9280873537063599,
      "num_tokens": 2679252.0,
      "step": 1100
    },
    {
      "epoch": 0.48931694896490646,
      "grad_norm": 2.618551254272461,
      "learning_rate": 0.0002,
      "loss": 0.7642,
      "mean_token_accuracy": 0.9406112372875214,
      "num_tokens": 2694935.0,
      "step": 1105
    },
    {
      "epoch": 0.4915310528063766,
      "grad_norm": 3.4649856090545654,
      "learning_rate": 0.0002,
      "loss": 1.1715,
      "mean_token_accuracy": 0.9203990876674653,
      "num_tokens": 2705892.0,
      "step": 1110
    },
    {
      "epoch": 0.4937451566478468,
      "grad_norm": 1.9820575714111328,
      "learning_rate": 0.0002,
      "loss": 0.7926,
      "mean_token_accuracy": 0.9387197762727737,
      "num_tokens": 2717836.0,
      "step": 1115
    },
    {
      "epoch": 0.49595926048931693,
      "grad_norm": 3.768416404724121,
      "learning_rate": 0.0002,
      "loss": 1.1023,
      "mean_token_accuracy": 0.9265471220016479,
      "num_tokens": 2728995.0,
      "step": 1120
    },
    {
      "epoch": 0.4981733643307871,
      "grad_norm": 2.721743106842041,
      "learning_rate": 0.0002,
      "loss": 0.7971,
      "mean_token_accuracy": 0.9334240764379501,
      "num_tokens": 2741977.0,
      "step": 1125
    },
    {
      "epoch": 0.5003874681722573,
      "grad_norm": 2.004788398742676,
      "learning_rate": 0.0002,
      "loss": 1.2163,
      "mean_token_accuracy": 0.9211069196462631,
      "num_tokens": 2755116.0,
      "step": 1130
    },
    {
      "epoch": 0.5026015720137275,
      "grad_norm": 3.46217942237854,
      "learning_rate": 0.0002,
      "loss": 1.0581,
      "mean_token_accuracy": 0.9227796822786332,
      "num_tokens": 2766455.0,
      "step": 1135
    },
    {
      "epoch": 0.5048156758551976,
      "grad_norm": 2.795225143432617,
      "learning_rate": 0.0002,
      "loss": 1.0172,
      "mean_token_accuracy": 0.9251609027385712,
      "num_tokens": 2780252.0,
      "step": 1140
    },
    {
      "epoch": 0.5070297796966677,
      "grad_norm": 2.5644898414611816,
      "learning_rate": 0.0002,
      "loss": 0.8366,
      "mean_token_accuracy": 0.9360420912504196,
      "num_tokens": 2793158.0,
      "step": 1145
    },
    {
      "epoch": 0.509243883538138,
      "grad_norm": 2.85178279876709,
      "learning_rate": 0.0002,
      "loss": 0.8914,
      "mean_token_accuracy": 0.9331649392843246,
      "num_tokens": 2803304.0,
      "step": 1150
    },
    {
      "epoch": 0.5114579873796081,
      "grad_norm": 4.3757429122924805,
      "learning_rate": 0.0002,
      "loss": 1.3006,
      "mean_token_accuracy": 0.912799459695816,
      "num_tokens": 2813346.0,
      "step": 1155
    },
    {
      "epoch": 0.5136720912210783,
      "grad_norm": 3.0821921825408936,
      "learning_rate": 0.0002,
      "loss": 0.8931,
      "mean_token_accuracy": 0.9292008608579636,
      "num_tokens": 2825150.0,
      "step": 1160
    },
    {
      "epoch": 0.5158861950625484,
      "grad_norm": 2.4634435176849365,
      "learning_rate": 0.0002,
      "loss": 0.7139,
      "mean_token_accuracy": 0.9386299520730972,
      "num_tokens": 2840010.0,
      "step": 1165
    },
    {
      "epoch": 0.5181002989040187,
      "grad_norm": 3.792142391204834,
      "learning_rate": 0.0002,
      "loss": 1.019,
      "mean_token_accuracy": 0.9267308801412583,
      "num_tokens": 2851060.0,
      "step": 1170
    },
    {
      "epoch": 0.5203144027454888,
      "grad_norm": 3.2142715454101562,
      "learning_rate": 0.0002,
      "loss": 0.8126,
      "mean_token_accuracy": 0.936069804430008,
      "num_tokens": 2863848.0,
      "step": 1175
    },
    {
      "epoch": 0.5225285065869589,
      "grad_norm": 3.059175729751587,
      "learning_rate": 0.0002,
      "loss": 0.936,
      "mean_token_accuracy": 0.9262942969799042,
      "num_tokens": 2875045.0,
      "step": 1180
    },
    {
      "epoch": 0.5247426104284291,
      "grad_norm": 2.397021532058716,
      "learning_rate": 0.0002,
      "loss": 0.7636,
      "mean_token_accuracy": 0.9363964319229126,
      "num_tokens": 2888765.0,
      "step": 1185
    },
    {
      "epoch": 0.5269567142698992,
      "grad_norm": 2.311615228652954,
      "learning_rate": 0.0002,
      "loss": 1.1155,
      "mean_token_accuracy": 0.9244213849306107,
      "num_tokens": 2898522.0,
      "step": 1190
    },
    {
      "epoch": 0.5291708181113695,
      "grad_norm": 2.686445713043213,
      "learning_rate": 0.0002,
      "loss": 1.0209,
      "mean_token_accuracy": 0.9271868228912353,
      "num_tokens": 2910294.0,
      "step": 1195
    },
    {
      "epoch": 0.5313849219528396,
      "grad_norm": 3.382634401321411,
      "learning_rate": 0.0002,
      "loss": 1.2553,
      "mean_token_accuracy": 0.9195928305387497,
      "num_tokens": 2920375.0,
      "step": 1200
    },
    {
      "epoch": 0.5335990257943097,
      "grad_norm": 2.7273964881896973,
      "learning_rate": 0.0002,
      "loss": 0.7366,
      "mean_token_accuracy": 0.940141350030899,
      "num_tokens": 2935314.0,
      "step": 1205
    },
    {
      "epoch": 0.5358131296357799,
      "grad_norm": 3.521521806716919,
      "learning_rate": 0.0002,
      "loss": 1.1562,
      "mean_token_accuracy": 0.9220536708831787,
      "num_tokens": 2947131.0,
      "step": 1210
    },
    {
      "epoch": 0.5380272334772501,
      "grad_norm": 3.4570305347442627,
      "learning_rate": 0.0002,
      "loss": 0.7671,
      "mean_token_accuracy": 0.9346885770559311,
      "num_tokens": 2958100.0,
      "step": 1215
    },
    {
      "epoch": 0.5402413373187203,
      "grad_norm": 3.236543655395508,
      "learning_rate": 0.0002,
      "loss": 0.9714,
      "mean_token_accuracy": 0.9265434801578522,
      "num_tokens": 2968906.0,
      "step": 1220
    },
    {
      "epoch": 0.5424554411601904,
      "grad_norm": 2.8082947731018066,
      "learning_rate": 0.0002,
      "loss": 0.8898,
      "mean_token_accuracy": 0.9322567820549011,
      "num_tokens": 2979901.0,
      "step": 1225
    },
    {
      "epoch": 0.5446695450016605,
      "grad_norm": 3.805554151535034,
      "learning_rate": 0.0002,
      "loss": 1.208,
      "mean_token_accuracy": 0.9178589969873429,
      "num_tokens": 2989620.0,
      "step": 1230
    },
    {
      "epoch": 0.5468836488431308,
      "grad_norm": 2.371670961380005,
      "learning_rate": 0.0002,
      "loss": 0.7449,
      "mean_token_accuracy": 0.9355430036783219,
      "num_tokens": 3001716.0,
      "step": 1235
    },
    {
      "epoch": 0.5490977526846009,
      "grad_norm": 3.121859550476074,
      "learning_rate": 0.0002,
      "loss": 0.8731,
      "mean_token_accuracy": 0.9309169679880143,
      "num_tokens": 3014315.0,
      "step": 1240
    },
    {
      "epoch": 0.5513118565260711,
      "grad_norm": 3.560624599456787,
      "learning_rate": 0.0002,
      "loss": 1.1319,
      "mean_token_accuracy": 0.9211807966232299,
      "num_tokens": 3025741.0,
      "step": 1245
    },
    {
      "epoch": 0.5535259603675412,
| "grad_norm": 2.2103476524353027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.785, | |
| "mean_token_accuracy": 0.936233428120613, | |
| "num_tokens": 3039131.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.5557400642090115, | |
| "grad_norm": 2.7845640182495117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.91, | |
| "mean_token_accuracy": 0.931680291891098, | |
| "num_tokens": 3051046.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.5579541680504816, | |
| "grad_norm": 2.8331215381622314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7777, | |
| "mean_token_accuracy": 0.9354436278343201, | |
| "num_tokens": 3063282.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.5601682718919517, | |
| "grad_norm": 4.3590312004089355, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0973, | |
| "mean_token_accuracy": 0.9222743719816208, | |
| "num_tokens": 3074994.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.5623823757334219, | |
| "grad_norm": 2.716376543045044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8278, | |
| "mean_token_accuracy": 0.9338112890720367, | |
| "num_tokens": 3086166.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.564596479574892, | |
| "grad_norm": 2.791226625442505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9836, | |
| "mean_token_accuracy": 0.9340397655963898, | |
| "num_tokens": 3098425.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.5668105834163623, | |
| "grad_norm": 2.7921218872070312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7225, | |
| "mean_token_accuracy": 0.9339357107877732, | |
| "num_tokens": 3111129.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.5690246872578324, | |
| "grad_norm": 2.764394998550415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9283, | |
| "mean_token_accuracy": 0.9268820822238922, | |
| "num_tokens": 3123270.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.5712387910993025, | |
| "grad_norm": 2.195909261703491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6484, | |
| "mean_token_accuracy": 0.9442890018224717, | |
| "num_tokens": 3137978.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.5734528949407727, | |
| "grad_norm": 3.223241090774536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7461, | |
| "mean_token_accuracy": 0.9427239447832108, | |
| "num_tokens": 3151026.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.5756669987822429, | |
| "grad_norm": 1.9516724348068237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6012, | |
| "mean_token_accuracy": 0.9473742932081223, | |
| "num_tokens": 3165781.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5778811026237131, | |
| "grad_norm": 2.265829563140869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7933, | |
| "mean_token_accuracy": 0.9362419694662094, | |
| "num_tokens": 3179325.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.5800952064651832, | |
| "grad_norm": 2.4466440677642822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7285, | |
| "mean_token_accuracy": 0.9399940431118011, | |
| "num_tokens": 3193006.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.5823093103066533, | |
| "grad_norm": 4.094124794006348, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2629, | |
| "mean_token_accuracy": 0.9156520456075669, | |
| "num_tokens": 3203008.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.5845234141481236, | |
| "grad_norm": 3.08402681350708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8664, | |
| "mean_token_accuracy": 0.9337212562561035, | |
| "num_tokens": 3213539.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.5867375179895937, | |
| "grad_norm": 2.6603167057037354, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0287, | |
| "mean_token_accuracy": 0.9329268485307693, | |
| "num_tokens": 3225145.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.5889516218310639, | |
| "grad_norm": 3.6709961891174316, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1247, | |
| "mean_token_accuracy": 0.9213701337575912, | |
| "num_tokens": 3235894.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.591165725672534, | |
| "grad_norm": 2.449747323989868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8486, | |
| "mean_token_accuracy": 0.9324274808168411, | |
| "num_tokens": 3250311.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.5933798295140043, | |
| "grad_norm": 5.862588882446289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9213, | |
| "mean_token_accuracy": 0.930136987566948, | |
| "num_tokens": 3262778.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.5955939333554744, | |
| "grad_norm": 2.749333620071411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6496, | |
| "mean_token_accuracy": 0.9448672115802765, | |
| "num_tokens": 3275900.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.5978080371969445, | |
| "grad_norm": 3.583944320678711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9604, | |
| "mean_token_accuracy": 0.9305095732212066, | |
| "num_tokens": 3288867.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.6000221410384147, | |
| "grad_norm": 2.9229469299316406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9985, | |
| "mean_token_accuracy": 0.9300340205430985, | |
| "num_tokens": 3300908.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.6022362448798849, | |
| "grad_norm": 2.5269253253936768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8679, | |
| "mean_token_accuracy": 0.9382039904594421, | |
| "num_tokens": 3311877.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.6044503487213551, | |
| "grad_norm": 2.8036136627197266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9413, | |
| "mean_token_accuracy": 0.9259363144636155, | |
| "num_tokens": 3323855.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.6066644525628252, | |
| "grad_norm": 2.521695137023926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8644, | |
| "mean_token_accuracy": 0.9354902893304825, | |
| "num_tokens": 3336303.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.6088785564042953, | |
| "grad_norm": 1.684542179107666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7695, | |
| "mean_token_accuracy": 0.9394852668046951, | |
| "num_tokens": 3348744.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.6110926602457655, | |
| "grad_norm": 2.3662304878234863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.861, | |
| "mean_token_accuracy": 0.931562864780426, | |
| "num_tokens": 3360929.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.6133067640872357, | |
| "grad_norm": 2.780378580093384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7188, | |
| "mean_token_accuracy": 0.9389736771583557, | |
| "num_tokens": 3373991.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.6155208679287059, | |
| "grad_norm": 3.235178232192993, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0112, | |
| "mean_token_accuracy": 0.9271218568086624, | |
| "num_tokens": 3386416.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.617734971770176, | |
| "grad_norm": 2.309201240539551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.698, | |
| "mean_token_accuracy": 0.9409903854131698, | |
| "num_tokens": 3401240.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.6199490756116461, | |
| "grad_norm": 3.5225324630737305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.89, | |
| "mean_token_accuracy": 0.9343404263257981, | |
| "num_tokens": 3412430.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6221631794531164, | |
| "grad_norm": 2.0347211360931396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7872, | |
| "mean_token_accuracy": 0.9413310676813126, | |
| "num_tokens": 3427316.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.6243772832945865, | |
| "grad_norm": 3.274460554122925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8903, | |
| "mean_token_accuracy": 0.9350662767887116, | |
| "num_tokens": 3438501.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.6265913871360567, | |
| "grad_norm": 3.090731620788574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9255, | |
| "mean_token_accuracy": 0.9328742384910583, | |
| "num_tokens": 3451424.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.6288054909775268, | |
| "grad_norm": 3.9713704586029053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7117, | |
| "mean_token_accuracy": 0.9369514465332032, | |
| "num_tokens": 3465061.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.631019594818997, | |
| "grad_norm": 2.745424747467041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6563, | |
| "mean_token_accuracy": 0.9424190640449523, | |
| "num_tokens": 3479516.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.6332336986604672, | |
| "grad_norm": 2.7794830799102783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6805, | |
| "mean_token_accuracy": 0.9403312534093857, | |
| "num_tokens": 3491631.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.6354478025019373, | |
| "grad_norm": 3.3232924938201904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8676, | |
| "mean_token_accuracy": 0.9338378489017487, | |
| "num_tokens": 3502686.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.6376619063434075, | |
| "grad_norm": 3.140780210494995, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1598, | |
| "mean_token_accuracy": 0.9237967163324357, | |
| "num_tokens": 3512469.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.6398760101848777, | |
| "grad_norm": 3.4723212718963623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9209, | |
| "mean_token_accuracy": 0.9337589800357818, | |
| "num_tokens": 3522936.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.6420901140263479, | |
| "grad_norm": 2.442565441131592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9235, | |
| "mean_token_accuracy": 0.9316159158945083, | |
| "num_tokens": 3533969.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.644304217867818, | |
| "grad_norm": 2.524017572402954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7758, | |
| "mean_token_accuracy": 0.9413342833518982, | |
| "num_tokens": 3547959.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.6465183217092881, | |
| "grad_norm": 2.5085105895996094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9613, | |
| "mean_token_accuracy": 0.9296864479780197, | |
| "num_tokens": 3560649.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.6487324255507584, | |
| "grad_norm": 3.036599636077881, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0325, | |
| "mean_token_accuracy": 0.9275905907154083, | |
| "num_tokens": 3572059.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.6509465293922285, | |
| "grad_norm": 3.0383479595184326, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0154, | |
| "mean_token_accuracy": 0.9247897952795029, | |
| "num_tokens": 3585441.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.6531606332336987, | |
| "grad_norm": 2.8595175743103027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6949, | |
| "mean_token_accuracy": 0.9423891752958298, | |
| "num_tokens": 3598995.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.6553747370751688, | |
| "grad_norm": 2.770921230316162, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1307, | |
| "mean_token_accuracy": 0.9203568994998932, | |
| "num_tokens": 3610296.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.6575888409166389, | |
| "grad_norm": 3.6914687156677246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7949, | |
| "mean_token_accuracy": 0.9348386436700821, | |
| "num_tokens": 3624724.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.6598029447581092, | |
| "grad_norm": 2.433919668197632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8382, | |
| "mean_token_accuracy": 0.9350390166044236, | |
| "num_tokens": 3635917.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.6620170485995793, | |
| "grad_norm": 2.6883230209350586, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0004, | |
| "mean_token_accuracy": 0.9251396596431732, | |
| "num_tokens": 3647238.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.6642311524410495, | |
| "grad_norm": 2.9668235778808594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9062, | |
| "mean_token_accuracy": 0.9337601840496064, | |
| "num_tokens": 3658527.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.6664452562825196, | |
| "grad_norm": 3.1381282806396484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9015, | |
| "mean_token_accuracy": 0.9375191539525985, | |
| "num_tokens": 3670617.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.6686593601239899, | |
| "grad_norm": 2.365852117538452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8156, | |
| "mean_token_accuracy": 0.9330903559923172, | |
| "num_tokens": 3685340.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.67087346396546, | |
| "grad_norm": 2.9032535552978516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9033, | |
| "mean_token_accuracy": 0.9322319328784943, | |
| "num_tokens": 3697378.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.6730875678069301, | |
| "grad_norm": 2.333289861679077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9788, | |
| "mean_token_accuracy": 0.9274383842945099, | |
| "num_tokens": 3711043.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.6753016716484003, | |
| "grad_norm": 2.7803232669830322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.9308006018400192, | |
| "num_tokens": 3723413.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.6775157754898705, | |
| "grad_norm": 2.559749126434326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.9328515976667404, | |
| "num_tokens": 3738329.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.6797298793313407, | |
| "grad_norm": 2.448359489440918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8662, | |
| "mean_token_accuracy": 0.9386651337146759, | |
| "num_tokens": 3750510.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.6819439831728108, | |
| "grad_norm": 2.5929195880889893, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0518, | |
| "mean_token_accuracy": 0.9254045516252518, | |
| "num_tokens": 3761821.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.6841580870142809, | |
| "grad_norm": 2.6473214626312256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6234, | |
| "mean_token_accuracy": 0.9468908250331879, | |
| "num_tokens": 3774252.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.6863721908557512, | |
| "grad_norm": 3.025092840194702, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1481, | |
| "mean_token_accuracy": 0.9196543127298356, | |
| "num_tokens": 3786710.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.6885862946972213, | |
| "grad_norm": 2.7005512714385986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9283, | |
| "mean_token_accuracy": 0.9315001249313355, | |
| "num_tokens": 3797617.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.6908003985386915, | |
| "grad_norm": 2.3178861141204834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8469, | |
| "mean_token_accuracy": 0.932272481918335, | |
| "num_tokens": 3811013.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.6930145023801616, | |
| "grad_norm": 3.0088205337524414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9886, | |
| "mean_token_accuracy": 0.9232943028211593, | |
| "num_tokens": 3822926.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.6952286062216317, | |
| "grad_norm": 2.413239002227783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6553, | |
| "mean_token_accuracy": 0.9435460805892945, | |
| "num_tokens": 3836364.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.697442710063102, | |
| "grad_norm": 2.6605615615844727, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0324, | |
| "mean_token_accuracy": 0.9254931479692459, | |
| "num_tokens": 3848950.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.6996568139045721, | |
| "grad_norm": 2.1413521766662598, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6661, | |
| "mean_token_accuracy": 0.9437528550624847, | |
| "num_tokens": 3863924.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.7018709177460423, | |
| "grad_norm": 2.496495485305786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8324, | |
| "mean_token_accuracy": 0.936608812212944, | |
| "num_tokens": 3876219.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.7040850215875124, | |
| "grad_norm": 3.7890663146972656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9902, | |
| "mean_token_accuracy": 0.9294484287500382, | |
| "num_tokens": 3886540.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.7062991254289827, | |
| "grad_norm": 2.942206621170044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8423, | |
| "mean_token_accuracy": 0.9380270838737488, | |
| "num_tokens": 3899354.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.7085132292704528, | |
| "grad_norm": 3.0008063316345215, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0699, | |
| "mean_token_accuracy": 0.9203650772571563, | |
| "num_tokens": 3911622.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7107273331119229, | |
| "grad_norm": 2.285707950592041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8683, | |
| "mean_token_accuracy": 0.931014335155487, | |
| "num_tokens": 3923438.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.7129414369533931, | |
| "grad_norm": 2.3685543537139893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8168, | |
| "mean_token_accuracy": 0.9353397488594055, | |
| "num_tokens": 3935444.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.7151555407948633, | |
| "grad_norm": 3.0847818851470947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9466, | |
| "mean_token_accuracy": 0.929938405752182, | |
| "num_tokens": 3946607.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.7173696446363335, | |
| "grad_norm": 3.0750293731689453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.914, | |
| "mean_token_accuracy": 0.9286134451627731, | |
| "num_tokens": 3959199.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.7195837484778036, | |
| "grad_norm": 3.4493777751922607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.977, | |
| "mean_token_accuracy": 0.919402825832367, | |
| "num_tokens": 3970979.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.7217978523192737, | |
| "grad_norm": 3.124067783355713, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0367, | |
| "mean_token_accuracy": 0.9251971215009689, | |
| "num_tokens": 3981702.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.724011956160744, | |
| "grad_norm": 2.45589017868042, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8534, | |
| "mean_token_accuracy": 0.9356551617383957, | |
| "num_tokens": 3994027.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.7262260600022141, | |
| "grad_norm": 3.4078500270843506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.965, | |
| "mean_token_accuracy": 0.9347460746765137, | |
| "num_tokens": 4005820.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.7284401638436843, | |
| "grad_norm": 2.2892725467681885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592, | |
| "mean_token_accuracy": 0.9513197064399719, | |
| "num_tokens": 4018352.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.7306542676851544, | |
| "grad_norm": 1.8147987127304077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6609, | |
| "mean_token_accuracy": 0.941572979092598, | |
| "num_tokens": 4031874.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.7328683715266247, | |
| "grad_norm": 3.63505220413208, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0721, | |
| "mean_token_accuracy": 0.9192070156335831, | |
| "num_tokens": 4042614.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.7350824753680948, | |
| "grad_norm": 2.3137118816375732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7545, | |
| "mean_token_accuracy": 0.9381762742996216, | |
| "num_tokens": 4054608.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.7372965792095649, | |
| "grad_norm": 3.7039380073547363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9273, | |
| "mean_token_accuracy": 0.9306770205497742, | |
| "num_tokens": 4065861.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.7395106830510351, | |
| "grad_norm": 2.4405832290649414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8476, | |
| "mean_token_accuracy": 0.9362583935260773, | |
| "num_tokens": 4078523.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.7417247868925052, | |
| "grad_norm": 2.31562876701355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7321, | |
| "mean_token_accuracy": 0.9406876623630523, | |
| "num_tokens": 4092330.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.7439388907339755, | |
| "grad_norm": 2.7616567611694336, | |
| "learning_rate": 0.0002, | |
| "loss": 1.018, | |
| "mean_token_accuracy": 0.9298185467720032, | |
| "num_tokens": 4103159.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.7461529945754456, | |
| "grad_norm": 2.3408303260803223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8177, | |
| "mean_token_accuracy": 0.9400125861167907, | |
| "num_tokens": 4116163.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.7483670984169157, | |
| "grad_norm": 1.8495256900787354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7702, | |
| "mean_token_accuracy": 0.9391000926494598, | |
| "num_tokens": 4130911.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.7505812022583859, | |
| "grad_norm": 2.5655927658081055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.757, | |
| "mean_token_accuracy": 0.9395717918872833, | |
| "num_tokens": 4143931.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.7527953060998561, | |
| "grad_norm": 3.2286360263824463, | |
| "learning_rate": 0.0002, | |
| "loss": 1.004, | |
| "mean_token_accuracy": 0.927115085721016, | |
| "num_tokens": 4155020.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.7550094099413263, | |
| "grad_norm": 2.1214611530303955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9058, | |
| "mean_token_accuracy": 0.938083803653717, | |
| "num_tokens": 4167190.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.7572235137827964, | |
| "grad_norm": 2.1883342266082764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9033, | |
| "mean_token_accuracy": 0.9318992733955384, | |
| "num_tokens": 4177955.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.7594376176242665, | |
| "grad_norm": 2.774677038192749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8573, | |
| "mean_token_accuracy": 0.9322766721248626, | |
| "num_tokens": 4188804.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.7616517214657368, | |
| "grad_norm": 2.4907023906707764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.904, | |
| "mean_token_accuracy": 0.931341353058815, | |
| "num_tokens": 4201503.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.7638658253072069, | |
| "grad_norm": 2.5578067302703857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7897, | |
| "mean_token_accuracy": 0.937814000248909, | |
| "num_tokens": 4212954.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.7660799291486771, | |
| "grad_norm": 3.0754973888397217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9264, | |
| "mean_token_accuracy": 0.9291175544261933, | |
| "num_tokens": 4224571.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.7682940329901472, | |
| "grad_norm": 2.0344362258911133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9195, | |
| "mean_token_accuracy": 0.9382949858903885, | |
| "num_tokens": 4240166.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.7705081368316175, | |
| "grad_norm": 2.706178903579712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8708, | |
| "mean_token_accuracy": 0.9349207997322082, | |
| "num_tokens": 4252583.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.7727222406730876, | |
| "grad_norm": 2.2786762714385986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8403, | |
| "mean_token_accuracy": 0.926530522108078, | |
| "num_tokens": 4263446.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.7749363445145577, | |
| "grad_norm": 2.330183506011963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7518, | |
| "mean_token_accuracy": 0.9388490498065949, | |
| "num_tokens": 4275714.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.7771504483560279, | |
| "grad_norm": 2.4576809406280518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7374, | |
| "mean_token_accuracy": 0.9381652891635894, | |
| "num_tokens": 4289168.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.7793645521974981, | |
| "grad_norm": 2.1529836654663086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9973, | |
| "mean_token_accuracy": 0.9245061188936233, | |
| "num_tokens": 4301644.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.7815786560389683, | |
| "grad_norm": 2.188100576400757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8327, | |
| "mean_token_accuracy": 0.9336588770151139, | |
| "num_tokens": 4313516.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.7837927598804384, | |
| "grad_norm": 2.1842052936553955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.734, | |
| "mean_token_accuracy": 0.9438644349575043, | |
| "num_tokens": 4325557.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.7860068637219085, | |
| "grad_norm": 2.2100729942321777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8849, | |
| "mean_token_accuracy": 0.9334080815315247, | |
| "num_tokens": 4338260.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.7882209675633787, | |
| "grad_norm": 2.2355990409851074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8919, | |
| "mean_token_accuracy": 0.9319656908512115, | |
| "num_tokens": 4352238.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.7904350714048489, | |
| "grad_norm": 2.0506389141082764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8144, | |
| "mean_token_accuracy": 0.9333805292844772, | |
| "num_tokens": 4365565.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.7926491752463191, | |
| "grad_norm": 2.5267720222473145, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0332, | |
| "mean_token_accuracy": 0.9245569318532944, | |
| "num_tokens": 4376973.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.7948632790877892, | |
| "grad_norm": 2.453788995742798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7972, | |
| "mean_token_accuracy": 0.9332748234272004, | |
| "num_tokens": 4390102.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.7970773829292593, | |
| "grad_norm": 2.7728281021118164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9427, | |
| "mean_token_accuracy": 0.9322525978088378, | |
| "num_tokens": 4401357.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7992914867707296, | |
| "grad_norm": 3.114647388458252, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0359, | |
| "mean_token_accuracy": 0.9259240895509719, | |
| "num_tokens": 4411905.0, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.8015055906121997, | |
| "grad_norm": 3.4858386516571045, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0477, | |
| "mean_token_accuracy": 0.9230340659618378, | |
| "num_tokens": 4422838.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.8037196944536699, | |
| "grad_norm": 2.627652645111084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9524, | |
| "mean_token_accuracy": 0.9306756138801575, | |
| "num_tokens": 4435022.0, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.80593379829514, | |
| "grad_norm": 3.016364336013794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7447, | |
| "mean_token_accuracy": 0.9370788335800171, | |
| "num_tokens": 4447750.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.8081479021366103, | |
| "grad_norm": 3.563826560974121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8516, | |
| "mean_token_accuracy": 0.9333288490772247, | |
| "num_tokens": 4460185.0, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.8103620059780804, | |
| "grad_norm": 3.726036310195923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9291, | |
| "mean_token_accuracy": 0.930818784236908, | |
| "num_tokens": 4472630.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.8125761098195505, | |
| "grad_norm": 2.23213267326355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.913, | |
| "mean_token_accuracy": 0.9309399574995041, | |
| "num_tokens": 4485328.0, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.8147902136610207, | |
| "grad_norm": 3.263636827468872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9267, | |
| "mean_token_accuracy": 0.9323295533657074, | |
| "num_tokens": 4496151.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.8170043175024909, | |
| "grad_norm": 1.8619623184204102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7147, | |
| "mean_token_accuracy": 0.9434016287326813, | |
| "num_tokens": 4509519.0, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.8192184213439611, | |
| "grad_norm": 3.046086072921753, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0037, | |
| "mean_token_accuracy": 0.9272881835699082, | |
| "num_tokens": 4521094.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.8214325251854312, | |
| "grad_norm": 2.7041449546813965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8851, | |
| "mean_token_accuracy": 0.93080253303051, | |
| "num_tokens": 4533460.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.8236466290269013, | |
| "grad_norm": 2.374342679977417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8168, | |
| "mean_token_accuracy": 0.9408663511276245, | |
| "num_tokens": 4545453.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.8258607328683715, | |
| "grad_norm": 2.5304906368255615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7315, | |
| "mean_token_accuracy": 0.9390550851821899, | |
| "num_tokens": 4558017.0, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.8280748367098417, | |
| "grad_norm": 3.1711394786834717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8645, | |
| "mean_token_accuracy": 0.9297717779874801, | |
| "num_tokens": 4569514.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.8302889405513119, | |
| "grad_norm": 3.0447299480438232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8618, | |
| "mean_token_accuracy": 0.9313441842794419, | |
| "num_tokens": 4579542.0, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.832503044392782, | |
| "grad_norm": 2.770129680633545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8299, | |
| "mean_token_accuracy": 0.9362069517374039, | |
| "num_tokens": 4592482.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.8347171482342521, | |
| "grad_norm": 2.4876534938812256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8511, | |
| "mean_token_accuracy": 0.9381607830524444, | |
| "num_tokens": 4605153.0, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.8369312520757224, | |
| "grad_norm": 1.9146308898925781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8834, | |
| "mean_token_accuracy": 0.938035500049591, | |
| "num_tokens": 4617506.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.8391453559171925, | |
| "grad_norm": 2.0652332305908203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7992, | |
| "mean_token_accuracy": 0.935004535317421, | |
| "num_tokens": 4632287.0, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.8413594597586627, | |
| "grad_norm": 2.6872732639312744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7629, | |
| "mean_token_accuracy": 0.9380116105079651, | |
| "num_tokens": 4645193.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8435735636001328, | |
| "grad_norm": 2.857466220855713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8405, | |
| "mean_token_accuracy": 0.934279152750969, | |
| "num_tokens": 4656636.0, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.845787667441603, | |
| "grad_norm": 2.7037603855133057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8591, | |
| "mean_token_accuracy": 0.9366025865077973, | |
| "num_tokens": 4669629.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.8480017712830732, | |
| "grad_norm": 2.5019657611846924, | |
| "learning_rate": 0.0002, | |
| "loss": 1.054, | |
| "mean_token_accuracy": 0.9281669825315475, | |
| "num_tokens": 4682395.0, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.8502158751245433, | |
| "grad_norm": 2.6266391277313232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8131, | |
| "mean_token_accuracy": 0.9354126363992691, | |
| "num_tokens": 4695518.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.8524299789660135, | |
| "grad_norm": 2.138951301574707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8369, | |
| "mean_token_accuracy": 0.9329825401306152, | |
| "num_tokens": 4708226.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.8546440828074837, | |
| "grad_norm": 2.910318374633789, | |
| "learning_rate": 0.0002, | |
| "loss": 0.928, | |
| "mean_token_accuracy": 0.9311651080846787, | |
| "num_tokens": 4718331.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.8568581866489539, | |
| "grad_norm": 3.454087734222412, | |
| "learning_rate": 0.0002, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.9374223858118057, | |
| "num_tokens": 4731233.0, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.859072290490424, | |
| "grad_norm": 2.537177085876465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7976, | |
| "mean_token_accuracy": 0.9375020027160644, | |
| "num_tokens": 4743112.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.8612863943318941, | |
| "grad_norm": 2.521338701248169, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8091, | |
| "mean_token_accuracy": 0.938329017162323, | |
| "num_tokens": 4755570.0, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.8635004981733644, | |
| "grad_norm": 2.104426622390747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9058, | |
| "mean_token_accuracy": 0.9326849579811096, | |
| "num_tokens": 4767535.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.8657146020148345, | |
| "grad_norm": 2.7374699115753174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8472, | |
| "mean_token_accuracy": 0.9329976707696914, | |
| "num_tokens": 4777888.0, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.8679287058563047, | |
| "grad_norm": 3.3029234409332275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8242, | |
| "mean_token_accuracy": 0.9354960173368454, | |
| "num_tokens": 4791838.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.8701428096977748, | |
| "grad_norm": 2.2211055755615234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.879, | |
| "mean_token_accuracy": 0.9325517565011978, | |
| "num_tokens": 4803626.0, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.8723569135392449, | |
| "grad_norm": 2.8065617084503174, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0319, | |
| "mean_token_accuracy": 0.9292825996875763, | |
| "num_tokens": 4817090.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.8745710173807152, | |
| "grad_norm": 2.932598352432251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8184, | |
| "mean_token_accuracy": 0.9353422105312348, | |
| "num_tokens": 4831134.0, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.8767851212221853, | |
| "grad_norm": 2.5619053840637207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6391, | |
| "mean_token_accuracy": 0.9429128289222717, | |
| "num_tokens": 4843766.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.8789992250636555, | |
| "grad_norm": 2.2597715854644775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8963, | |
| "mean_token_accuracy": 0.9304134607315063, | |
| "num_tokens": 4856458.0, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.8812133289051256, | |
| "grad_norm": 1.9793012142181396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7251, | |
| "mean_token_accuracy": 0.939424803853035, | |
| "num_tokens": 4868615.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.8834274327465959, | |
| "grad_norm": 2.075303554534912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8271, | |
| "mean_token_accuracy": 0.9326154798269272, | |
| "num_tokens": 4880572.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.885641536588066, | |
| "grad_norm": 3.116805076599121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9112, | |
| "mean_token_accuracy": 0.9324250787496566, | |
| "num_tokens": 4893492.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8878556404295361, | |
| "grad_norm": 2.752161741256714, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0983, | |
| "mean_token_accuracy": 0.9237021476030349, | |
| "num_tokens": 4904509.0, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.8900697442710063, | |
| "grad_norm": 2.598949670791626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8562, | |
| "mean_token_accuracy": 0.9328458935022355, | |
| "num_tokens": 4918130.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.8922838481124765, | |
| "grad_norm": 2.5332608222961426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8527, | |
| "mean_token_accuracy": 0.9309366434812546, | |
| "num_tokens": 4931211.0, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.8944979519539467, | |
| "grad_norm": 2.683284044265747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9261, | |
| "mean_token_accuracy": 0.9301638215780258, | |
| "num_tokens": 4943604.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.8967120557954168, | |
| "grad_norm": 2.730400323867798, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0182, | |
| "mean_token_accuracy": 0.9268926858901978, | |
| "num_tokens": 4954734.0, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.8989261596368869, | |
| "grad_norm": 3.0371506214141846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8396, | |
| "mean_token_accuracy": 0.9345870792865754, | |
| "num_tokens": 4967331.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.9011402634783572, | |
| "grad_norm": 2.1169416904449463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7571, | |
| "mean_token_accuracy": 0.937484648823738, | |
| "num_tokens": 4980777.0, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.9033543673198273, | |
| "grad_norm": 2.2946715354919434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6903, | |
| "mean_token_accuracy": 0.9432729125022888, | |
| "num_tokens": 4993590.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.9055684711612975, | |
| "grad_norm": 2.3784878253936768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8577, | |
| "mean_token_accuracy": 0.9391710489988327, | |
| "num_tokens": 5006396.0, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.9077825750027676, | |
| "grad_norm": 2.185091495513916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9413, | |
| "mean_token_accuracy": 0.928019043803215, | |
| "num_tokens": 5018793.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.9099966788442378, | |
| "grad_norm": 2.835411787033081, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8637, | |
| "mean_token_accuracy": 0.934612587094307, | |
| "num_tokens": 5029806.0, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.912210782685708, | |
| "grad_norm": 2.276442766189575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8431, | |
| "mean_token_accuracy": 0.9366135746240616, | |
| "num_tokens": 5042248.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.9144248865271781, | |
| "grad_norm": 1.937154769897461, | |
| "learning_rate": 0.0002, | |
| "loss": 1.23, | |
| "mean_token_accuracy": 0.918472969532013, | |
| "num_tokens": 5053886.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.9166389903686483, | |
| "grad_norm": 2.3872339725494385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8882, | |
| "mean_token_accuracy": 0.9267417550086975, | |
| "num_tokens": 5063900.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.9188530942101184, | |
| "grad_norm": 2.7894115447998047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7592, | |
| "mean_token_accuracy": 0.9337449461221695, | |
| "num_tokens": 5076904.0, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.9210671980515887, | |
| "grad_norm": 1.7611744403839111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414, | |
| "mean_token_accuracy": 0.9498718023300171, | |
| "num_tokens": 5092582.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.9232813018930588, | |
| "grad_norm": 1.8192365169525146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6292, | |
| "mean_token_accuracy": 0.9412597447633744, | |
| "num_tokens": 5105494.0, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.9254954057345289, | |
| "grad_norm": 2.696876049041748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7313, | |
| "mean_token_accuracy": 0.9369488745927811, | |
| "num_tokens": 5117554.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.9277095095759991, | |
| "grad_norm": 2.514218330383301, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7942, | |
| "mean_token_accuracy": 0.9363553553819657, | |
| "num_tokens": 5129486.0, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.9299236134174693, | |
| "grad_norm": 2.5048184394836426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.887, | |
| "mean_token_accuracy": 0.9345066547393799, | |
| "num_tokens": 5141296.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9321377172589395, | |
| "grad_norm": 2.470578193664551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7653, | |
| "mean_token_accuracy": 0.941747522354126, | |
| "num_tokens": 5153435.0, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.9343518211004096, | |
| "grad_norm": 4.520934581756592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6472, | |
| "mean_token_accuracy": 0.9441186487674713, | |
| "num_tokens": 5168123.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.9365659249418797, | |
| "grad_norm": 2.874882936477661, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9228, | |
| "mean_token_accuracy": 0.9303671330213547, | |
| "num_tokens": 5180846.0, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.93878002878335, | |
| "grad_norm": 2.4952664375305176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6372, | |
| "mean_token_accuracy": 0.9446426779031754, | |
| "num_tokens": 5194790.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.9409941326248201, | |
| "grad_norm": 2.4000353813171387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9386, | |
| "mean_token_accuracy": 0.9289501368999481, | |
| "num_tokens": 5206868.0, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.9432082364662903, | |
| "grad_norm": 3.2110652923583984, | |
| "learning_rate": 0.0002, | |
| "loss": 1.029, | |
| "mean_token_accuracy": 0.924631980061531, | |
| "num_tokens": 5218375.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.9454223403077604, | |
| "grad_norm": 2.351478099822998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8956, | |
| "mean_token_accuracy": 0.9299973398447037, | |
| "num_tokens": 5230679.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.9476364441492307, | |
| "grad_norm": 2.0615413188934326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8124, | |
| "mean_token_accuracy": 0.9377152562141419, | |
| "num_tokens": 5243273.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.9498505479907008, | |
| "grad_norm": 2.804684638977051, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8106, | |
| "mean_token_accuracy": 0.9386685341596603, | |
| "num_tokens": 5256283.0, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.9520646518321709, | |
| "grad_norm": 2.3394579887390137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7158, | |
| "mean_token_accuracy": 0.9425580680370331, | |
| "num_tokens": 5268819.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.9542787556736411, | |
| "grad_norm": 3.8045785427093506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8811, | |
| "mean_token_accuracy": 0.9341193944215774, | |
| "num_tokens": 5280796.0, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.9564928595151112, | |
| "grad_norm": 3.4269163608551025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8996, | |
| "mean_token_accuracy": 0.9318024456501007, | |
| "num_tokens": 5293545.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.9587069633565815, | |
| "grad_norm": 2.0108461380004883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6059, | |
| "mean_token_accuracy": 0.9463658452033996, | |
| "num_tokens": 5307387.0, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.9609210671980516, | |
| "grad_norm": 2.5608201026916504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6557, | |
| "mean_token_accuracy": 0.9420038640499115, | |
| "num_tokens": 5320215.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.9631351710395217, | |
| "grad_norm": 3.3502604961395264, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0185, | |
| "mean_token_accuracy": 0.9242115944623948, | |
| "num_tokens": 5330521.0, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.9653492748809919, | |
| "grad_norm": 2.2960116863250732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7987, | |
| "mean_token_accuracy": 0.9346065133810043, | |
| "num_tokens": 5341552.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.9675633787224621, | |
| "grad_norm": 2.166372060775757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427, | |
| "mean_token_accuracy": 0.9495539724826813, | |
| "num_tokens": 5355653.0, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.9697774825639323, | |
| "grad_norm": 1.9363880157470703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8866, | |
| "mean_token_accuracy": 0.9348477244377136, | |
| "num_tokens": 5370025.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.9719915864054024, | |
| "grad_norm": 2.699810028076172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9583, | |
| "mean_token_accuracy": 0.9283025532960891, | |
| "num_tokens": 5380549.0, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.9742056902468725, | |
| "grad_norm": 2.2296714782714844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9482, | |
| "mean_token_accuracy": 0.9271619528532028, | |
| "num_tokens": 5393080.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9764197940883428, | |
| "grad_norm": 2.4104833602905273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8761, | |
| "mean_token_accuracy": 0.9381742179393768, | |
| "num_tokens": 5404547.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.9786338979298129, | |
| "grad_norm": 2.5453920364379883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8772, | |
| "mean_token_accuracy": 0.9323698520660401, | |
| "num_tokens": 5416684.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.9808480017712831, | |
| "grad_norm": 2.8525002002716064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8593, | |
| "mean_token_accuracy": 0.9280822277069092, | |
| "num_tokens": 5427876.0, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.9830621056127532, | |
| "grad_norm": 2.7001919746398926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7549, | |
| "mean_token_accuracy": 0.9385748893022537, | |
| "num_tokens": 5438257.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.9852762094542235, | |
| "grad_norm": 3.1060454845428467, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1347, | |
| "mean_token_accuracy": 0.918604564666748, | |
| "num_tokens": 5449833.0, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.9874903132956936, | |
| "grad_norm": 2.4557158946990967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7848, | |
| "mean_token_accuracy": 0.9370519310235977, | |
| "num_tokens": 5463070.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.9897044171371637, | |
| "grad_norm": 3.7874040603637695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9189, | |
| "mean_token_accuracy": 0.929209041595459, | |
| "num_tokens": 5474371.0, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.9919185209786339, | |
| "grad_norm": 2.745861053466797, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0186, | |
| "mean_token_accuracy": 0.9253447771072387, | |
| "num_tokens": 5487549.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.9941326248201041, | |
| "grad_norm": 2.013324022293091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7117, | |
| "mean_token_accuracy": 0.9431297659873963, | |
| "num_tokens": 5502064.0, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.9963467286615743, | |
| "grad_norm": 2.179727792739868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7638, | |
| "mean_token_accuracy": 0.9363548696041107, | |
| "num_tokens": 5514455.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.9985608325030444, | |
| "grad_norm": 1.9565762281417847, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0294, | |
| "mean_token_accuracy": 0.9255888283252716, | |
| "num_tokens": 5526347.0, | |
| "step": 2255 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2258, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.430437089762216e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
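
The JSON above is the `Trainer` state saved alongside a checkpoint: `log_history` holds one record per logging event (every 5 steps here, per `logging_steps`), and the trailing fields record run-level settings such as `max_steps` and `save_steps`. Below is a minimal sketch of how one might load this file and plot the logged loss and token-accuracy curves; the input path `trainer_state.json` and the output file name are assumptions, not part of the checkpoint itself.

```python
# Minimal sketch: load a trainer_state.json and plot its log_history.
# Assumes the file sits in the current directory and matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # assumed path to this file
    state = json.load(f)

# Each log_history entry is one logging event; keep only records that
# actually carry training metrics (eval records, if any, would differ).
records = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in records]
losses = [e["loss"] for e in records]
accs = [e["mean_token_accuracy"] for e in records]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(steps, losses)
ax1.set_xlabel("step")
ax1.set_ylabel("training loss")
ax2.plot(steps, accs)
ax2.set_xlabel("step")
ax2.set_ylabel("mean token accuracy")
fig.tight_layout()
fig.savefig("training_curves.png")  # assumed output file name
```

With this run's data, the plot would show the loss settling from its early double-digit values into the roughly 0.6–1.3 band seen over the last thousand steps, with token accuracy hovering around 0.92–0.95.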