{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998892948079265,
"eval_steps": 500,
"global_step": 2258,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002214103841470165,
"grad_norm": 42.3470458984375,
"learning_rate": 0.0002,
"loss": 15.5484,
"mean_token_accuracy": 0.4835517302155495,
"num_tokens": 11000.0,
"step": 5
},
{
"epoch": 0.00442820768294033,
"grad_norm": 14.259652137756348,
"learning_rate": 0.0002,
"loss": 6.8519,
"mean_token_accuracy": 0.691154733300209,
"num_tokens": 22942.0,
"step": 10
},
{
"epoch": 0.006642311524410495,
"grad_norm": 11.712563514709473,
"learning_rate": 0.0002,
"loss": 4.074,
"mean_token_accuracy": 0.8152767598628998,
"num_tokens": 35335.0,
"step": 15
},
{
"epoch": 0.00885641536588066,
"grad_norm": 9.237317085266113,
"learning_rate": 0.0002,
"loss": 3.2515,
"mean_token_accuracy": 0.8487898230552673,
"num_tokens": 46080.0,
"step": 20
},
{
"epoch": 0.011070519207350825,
"grad_norm": 7.927119731903076,
"learning_rate": 0.0002,
"loss": 2.6725,
"mean_token_accuracy": 0.8698692440986633,
"num_tokens": 57931.0,
"step": 25
},
{
"epoch": 0.01328462304882099,
"grad_norm": 12.773114204406738,
"learning_rate": 0.0002,
"loss": 2.3487,
"mean_token_accuracy": 0.8942261308431625,
"num_tokens": 70831.0,
"step": 30
},
{
"epoch": 0.015498726890291154,
"grad_norm": 13.333335876464844,
"learning_rate": 0.0002,
"loss": 2.4343,
"mean_token_accuracy": 0.8810921609401703,
"num_tokens": 82909.0,
"step": 35
},
{
"epoch": 0.01771283073176132,
"grad_norm": 28.373117446899414,
"learning_rate": 0.0002,
"loss": 1.7865,
"mean_token_accuracy": 0.896348387002945,
"num_tokens": 96654.0,
"step": 40
},
{
"epoch": 0.019926934573231483,
"grad_norm": 5.377050876617432,
"learning_rate": 0.0002,
"loss": 1.3522,
"mean_token_accuracy": 0.9119709521532059,
"num_tokens": 110664.0,
"step": 45
},
{
"epoch": 0.02214103841470165,
"grad_norm": 9.99971866607666,
"learning_rate": 0.0002,
"loss": 1.487,
"mean_token_accuracy": 0.9074823081493377,
"num_tokens": 124077.0,
"step": 50
},
{
"epoch": 0.024355142256171816,
"grad_norm": 5.345480918884277,
"learning_rate": 0.0002,
"loss": 1.8425,
"mean_token_accuracy": 0.8920609176158905,
"num_tokens": 137134.0,
"step": 55
},
{
"epoch": 0.02656924609764198,
"grad_norm": 7.6641950607299805,
"learning_rate": 0.0002,
"loss": 1.6796,
"mean_token_accuracy": 0.9035607397556304,
"num_tokens": 149388.0,
"step": 60
},
{
"epoch": 0.028783349939112145,
"grad_norm": 11.887953758239746,
"learning_rate": 0.0002,
"loss": 1.7672,
"mean_token_accuracy": 0.8967141926288604,
"num_tokens": 160071.0,
"step": 65
},
{
"epoch": 0.03099745378058231,
"grad_norm": 5.832238674163818,
"learning_rate": 0.0002,
"loss": 1.2079,
"mean_token_accuracy": 0.9185251384973526,
"num_tokens": 173268.0,
"step": 70
},
{
"epoch": 0.033211557622052475,
"grad_norm": 8.69938850402832,
"learning_rate": 0.0002,
"loss": 1.5593,
"mean_token_accuracy": 0.9084058552980423,
"num_tokens": 186169.0,
"step": 75
},
{
"epoch": 0.03542566146352264,
"grad_norm": 4.20233678817749,
"learning_rate": 0.0002,
"loss": 1.193,
"mean_token_accuracy": 0.9241156429052353,
"num_tokens": 201336.0,
"step": 80
},
{
"epoch": 0.03763976530499281,
"grad_norm": 5.642890453338623,
"learning_rate": 0.0002,
"loss": 1.2642,
"mean_token_accuracy": 0.9184715151786804,
"num_tokens": 213213.0,
"step": 85
},
{
"epoch": 0.03985386914646297,
"grad_norm": 5.765429973602295,
"learning_rate": 0.0002,
"loss": 1.4019,
"mean_token_accuracy": 0.9147763669490814,
"num_tokens": 224027.0,
"step": 90
},
{
"epoch": 0.04206797298793313,
"grad_norm": 5.6816630363464355,
"learning_rate": 0.0002,
"loss": 1.3954,
"mean_token_accuracy": 0.9195666879415512,
"num_tokens": 235465.0,
"step": 95
},
{
"epoch": 0.0442820768294033,
"grad_norm": 6.504826068878174,
"learning_rate": 0.0002,
"loss": 1.0328,
"mean_token_accuracy": 0.925902035832405,
"num_tokens": 248735.0,
"step": 100
},
{
"epoch": 0.046496180670873466,
"grad_norm": 5.916987895965576,
"learning_rate": 0.0002,
"loss": 1.3818,
"mean_token_accuracy": 0.910975980758667,
"num_tokens": 260910.0,
"step": 105
},
{
"epoch": 0.04871028451234363,
"grad_norm": 4.899371147155762,
"learning_rate": 0.0002,
"loss": 1.3134,
"mean_token_accuracy": 0.911156702041626,
"num_tokens": 272818.0,
"step": 110
},
{
"epoch": 0.05092438835381379,
"grad_norm": 4.977072238922119,
"learning_rate": 0.0002,
"loss": 1.0713,
"mean_token_accuracy": 0.9260460823774338,
"num_tokens": 285231.0,
"step": 115
},
{
"epoch": 0.05313849219528396,
"grad_norm": 5.722350120544434,
"learning_rate": 0.0002,
"loss": 1.5063,
"mean_token_accuracy": 0.9071427851915359,
"num_tokens": 295970.0,
"step": 120
},
{
"epoch": 0.055352596036754124,
"grad_norm": 6.670350551605225,
"learning_rate": 0.0002,
"loss": 1.48,
"mean_token_accuracy": 0.9137888938188553,
"num_tokens": 307700.0,
"step": 125
},
{
"epoch": 0.05756669987822429,
"grad_norm": 4.701105117797852,
"learning_rate": 0.0002,
"loss": 0.9216,
"mean_token_accuracy": 0.9317790538072586,
"num_tokens": 321924.0,
"step": 130
},
{
"epoch": 0.05978080371969445,
"grad_norm": 6.084046840667725,
"learning_rate": 0.0002,
"loss": 1.1839,
"mean_token_accuracy": 0.9231065452098847,
"num_tokens": 336808.0,
"step": 135
},
{
"epoch": 0.06199490756116462,
"grad_norm": 4.739227294921875,
"learning_rate": 0.0002,
"loss": 1.4177,
"mean_token_accuracy": 0.9183769166469574,
"num_tokens": 349784.0,
"step": 140
},
{
"epoch": 0.06420901140263478,
"grad_norm": 5.750140190124512,
"learning_rate": 0.0002,
"loss": 1.57,
"mean_token_accuracy": 0.9093784838914871,
"num_tokens": 359809.0,
"step": 145
},
{
"epoch": 0.06642311524410495,
"grad_norm": 5.502381324768066,
"learning_rate": 0.0002,
"loss": 1.0926,
"mean_token_accuracy": 0.9278808891773224,
"num_tokens": 373540.0,
"step": 150
},
{
"epoch": 0.06863721908557512,
"grad_norm": 3.8856916427612305,
"learning_rate": 0.0002,
"loss": 1.0794,
"mean_token_accuracy": 0.9323706835508346,
"num_tokens": 385610.0,
"step": 155
},
{
"epoch": 0.07085132292704528,
"grad_norm": 5.632940769195557,
"learning_rate": 0.0002,
"loss": 1.5044,
"mean_token_accuracy": 0.904329365491867,
"num_tokens": 396616.0,
"step": 160
},
{
"epoch": 0.07306542676851545,
"grad_norm": 5.6375203132629395,
"learning_rate": 0.0002,
"loss": 1.4283,
"mean_token_accuracy": 0.9134370684623718,
"num_tokens": 408011.0,
"step": 165
},
{
"epoch": 0.07527953060998561,
"grad_norm": 6.19705057144165,
"learning_rate": 0.0002,
"loss": 1.529,
"mean_token_accuracy": 0.9081434190273285,
"num_tokens": 417320.0,
"step": 170
},
{
"epoch": 0.07749363445145577,
"grad_norm": 5.687817573547363,
"learning_rate": 0.0002,
"loss": 1.0551,
"mean_token_accuracy": 0.9281114786863327,
"num_tokens": 428148.0,
"step": 175
},
{
"epoch": 0.07970773829292593,
"grad_norm": 4.656964302062988,
"learning_rate": 0.0002,
"loss": 1.0233,
"mean_token_accuracy": 0.9278254687786103,
"num_tokens": 440931.0,
"step": 180
},
{
"epoch": 0.0819218421343961,
"grad_norm": 5.18228816986084,
"learning_rate": 0.0002,
"loss": 1.5104,
"mean_token_accuracy": 0.9057249039411545,
"num_tokens": 451561.0,
"step": 185
},
{
"epoch": 0.08413594597586627,
"grad_norm": 4.148241996765137,
"learning_rate": 0.0002,
"loss": 0.9641,
"mean_token_accuracy": 0.9276774257421494,
"num_tokens": 464562.0,
"step": 190
},
{
"epoch": 0.08635004981733643,
"grad_norm": 5.226706027984619,
"learning_rate": 0.0002,
"loss": 1.3431,
"mean_token_accuracy": 0.9116898536682129,
"num_tokens": 476950.0,
"step": 195
},
{
"epoch": 0.0885641536588066,
"grad_norm": 4.673404693603516,
"learning_rate": 0.0002,
"loss": 1.1031,
"mean_token_accuracy": 0.924088642001152,
"num_tokens": 490346.0,
"step": 200
},
{
"epoch": 0.09077825750027677,
"grad_norm": 4.617090225219727,
"learning_rate": 0.0002,
"loss": 1.013,
"mean_token_accuracy": 0.922841164469719,
"num_tokens": 501766.0,
"step": 205
},
{
"epoch": 0.09299236134174693,
"grad_norm": 3.0853629112243652,
"learning_rate": 0.0002,
"loss": 1.0736,
"mean_token_accuracy": 0.9237724006175995,
"num_tokens": 514665.0,
"step": 210
},
{
"epoch": 0.0952064651832171,
"grad_norm": 4.994758129119873,
"learning_rate": 0.0002,
"loss": 1.202,
"mean_token_accuracy": 0.9217059254646301,
"num_tokens": 526702.0,
"step": 215
},
{
"epoch": 0.09742056902468726,
"grad_norm": 5.424100875854492,
"learning_rate": 0.0002,
"loss": 1.0885,
"mean_token_accuracy": 0.9214400589466095,
"num_tokens": 538211.0,
"step": 220
},
{
"epoch": 0.09963467286615742,
"grad_norm": 6.007224082946777,
"learning_rate": 0.0002,
"loss": 1.2519,
"mean_token_accuracy": 0.9193921983242035,
"num_tokens": 549294.0,
"step": 225
},
{
"epoch": 0.10184877670762758,
"grad_norm": 4.146254062652588,
"learning_rate": 0.0002,
"loss": 1.0478,
"mean_token_accuracy": 0.9264948040246963,
"num_tokens": 561455.0,
"step": 230
},
{
"epoch": 0.10406288054909775,
"grad_norm": 5.269349575042725,
"learning_rate": 0.0002,
"loss": 1.0395,
"mean_token_accuracy": 0.9255057454109192,
"num_tokens": 571769.0,
"step": 235
},
{
"epoch": 0.10627698439056792,
"grad_norm": 3.518718957901001,
"learning_rate": 0.0002,
"loss": 1.103,
"mean_token_accuracy": 0.9197809398174286,
"num_tokens": 583866.0,
"step": 240
},
{
"epoch": 0.10849108823203808,
"grad_norm": 4.272643566131592,
"learning_rate": 0.0002,
"loss": 1.5091,
"mean_token_accuracy": 0.9108827739953995,
"num_tokens": 593617.0,
"step": 245
},
{
"epoch": 0.11070519207350825,
"grad_norm": 4.085506439208984,
"learning_rate": 0.0002,
"loss": 1.3358,
"mean_token_accuracy": 0.9100114196538925,
"num_tokens": 605999.0,
"step": 250
},
{
"epoch": 0.11291929591497842,
"grad_norm": 3.2901113033294678,
"learning_rate": 0.0002,
"loss": 1.0284,
"mean_token_accuracy": 0.9262891203165055,
"num_tokens": 618331.0,
"step": 255
},
{
"epoch": 0.11513339975644858,
"grad_norm": 4.289281368255615,
"learning_rate": 0.0002,
"loss": 1.2362,
"mean_token_accuracy": 0.9141905009746552,
"num_tokens": 630021.0,
"step": 260
},
{
"epoch": 0.11734750359791875,
"grad_norm": 4.200511932373047,
"learning_rate": 0.0002,
"loss": 1.4356,
"mean_token_accuracy": 0.9113897413015366,
"num_tokens": 641232.0,
"step": 265
},
{
"epoch": 0.1195616074393889,
"grad_norm": 3.7031190395355225,
"learning_rate": 0.0002,
"loss": 1.0592,
"mean_token_accuracy": 0.9252003788948059,
"num_tokens": 655240.0,
"step": 270
},
{
"epoch": 0.12177571128085907,
"grad_norm": 3.439293622970581,
"learning_rate": 0.0002,
"loss": 1.1109,
"mean_token_accuracy": 0.923374080657959,
"num_tokens": 667642.0,
"step": 275
},
{
"epoch": 0.12398981512232923,
"grad_norm": 4.368510723114014,
"learning_rate": 0.0002,
"loss": 0.976,
"mean_token_accuracy": 0.9249310046434402,
"num_tokens": 679262.0,
"step": 280
},
{
"epoch": 0.1262039189637994,
"grad_norm": 3.9658987522125244,
"learning_rate": 0.0002,
"loss": 0.9673,
"mean_token_accuracy": 0.9295818597078324,
"num_tokens": 691910.0,
"step": 285
},
{
"epoch": 0.12841802280526957,
"grad_norm": 4.339285850524902,
"learning_rate": 0.0002,
"loss": 1.0667,
"mean_token_accuracy": 0.9245558708906174,
"num_tokens": 703552.0,
"step": 290
},
{
"epoch": 0.13063212664673973,
"grad_norm": 4.079878330230713,
"learning_rate": 0.0002,
"loss": 0.8804,
"mean_token_accuracy": 0.9306642562150955,
"num_tokens": 716599.0,
"step": 295
},
{
"epoch": 0.1328462304882099,
"grad_norm": 3.9294116497039795,
"learning_rate": 0.0002,
"loss": 1.1058,
"mean_token_accuracy": 0.9195070207118988,
"num_tokens": 729446.0,
"step": 300
},
{
"epoch": 0.13506033432968007,
"grad_norm": 4.728193759918213,
"learning_rate": 0.0002,
"loss": 1.132,
"mean_token_accuracy": 0.9184233337640763,
"num_tokens": 740855.0,
"step": 305
},
{
"epoch": 0.13727443817115023,
"grad_norm": 4.081950664520264,
"learning_rate": 0.0002,
"loss": 0.9906,
"mean_token_accuracy": 0.9285815507173538,
"num_tokens": 753698.0,
"step": 310
},
{
"epoch": 0.1394885420126204,
"grad_norm": 5.046234130859375,
"learning_rate": 0.0002,
"loss": 1.385,
"mean_token_accuracy": 0.914243558049202,
"num_tokens": 764462.0,
"step": 315
},
{
"epoch": 0.14170264585409056,
"grad_norm": 4.2347002029418945,
"learning_rate": 0.0002,
"loss": 0.8938,
"mean_token_accuracy": 0.9304849207401276,
"num_tokens": 777047.0,
"step": 320
},
{
"epoch": 0.14391674969556073,
"grad_norm": 4.928355693817139,
"learning_rate": 0.0002,
"loss": 1.0848,
"mean_token_accuracy": 0.9283693462610245,
"num_tokens": 790880.0,
"step": 325
},
{
"epoch": 0.1461308535370309,
"grad_norm": 4.760014057159424,
"learning_rate": 0.0002,
"loss": 1.076,
"mean_token_accuracy": 0.9280766844749451,
"num_tokens": 804314.0,
"step": 330
},
{
"epoch": 0.14834495737850106,
"grad_norm": 3.9877500534057617,
"learning_rate": 0.0002,
"loss": 1.2699,
"mean_token_accuracy": 0.9156712800264358,
"num_tokens": 815261.0,
"step": 335
},
{
"epoch": 0.15055906121997123,
"grad_norm": 4.126375198364258,
"learning_rate": 0.0002,
"loss": 0.9963,
"mean_token_accuracy": 0.9256706595420837,
"num_tokens": 829528.0,
"step": 340
},
{
"epoch": 0.15277316506144137,
"grad_norm": 3.9972124099731445,
"learning_rate": 0.0002,
"loss": 1.3455,
"mean_token_accuracy": 0.9122998714447021,
"num_tokens": 841276.0,
"step": 345
},
{
"epoch": 0.15498726890291153,
"grad_norm": 5.413994312286377,
"learning_rate": 0.0002,
"loss": 1.101,
"mean_token_accuracy": 0.9225263863801956,
"num_tokens": 854080.0,
"step": 350
},
{
"epoch": 0.1572013727443817,
"grad_norm": 3.65535306930542,
"learning_rate": 0.0002,
"loss": 1.2503,
"mean_token_accuracy": 0.9237045079469681,
"num_tokens": 866984.0,
"step": 355
},
{
"epoch": 0.15941547658585187,
"grad_norm": 4.747088432312012,
"learning_rate": 0.0002,
"loss": 1.3619,
"mean_token_accuracy": 0.9102030217647552,
"num_tokens": 878494.0,
"step": 360
},
{
"epoch": 0.16162958042732203,
"grad_norm": 4.269980430603027,
"learning_rate": 0.0002,
"loss": 1.1624,
"mean_token_accuracy": 0.9170969694852829,
"num_tokens": 889350.0,
"step": 365
},
{
"epoch": 0.1638436842687922,
"grad_norm": 3.83870530128479,
"learning_rate": 0.0002,
"loss": 1.0977,
"mean_token_accuracy": 0.9311458617448807,
"num_tokens": 900460.0,
"step": 370
},
{
"epoch": 0.16605778811026237,
"grad_norm": 4.76396369934082,
"learning_rate": 0.0002,
"loss": 1.1958,
"mean_token_accuracy": 0.9178014636039734,
"num_tokens": 911749.0,
"step": 375
},
{
"epoch": 0.16827189195173253,
"grad_norm": 4.328544616699219,
"learning_rate": 0.0002,
"loss": 1.0498,
"mean_token_accuracy": 0.9265041768550872,
"num_tokens": 924982.0,
"step": 380
},
{
"epoch": 0.1704859957932027,
"grad_norm": 4.840917110443115,
"learning_rate": 0.0002,
"loss": 0.8995,
"mean_token_accuracy": 0.9299014925956726,
"num_tokens": 938464.0,
"step": 385
},
{
"epoch": 0.17270009963467287,
"grad_norm": 3.3656206130981445,
"learning_rate": 0.0002,
"loss": 0.9166,
"mean_token_accuracy": 0.9308703899383545,
"num_tokens": 950480.0,
"step": 390
},
{
"epoch": 0.17491420347614303,
"grad_norm": 4.094184398651123,
"learning_rate": 0.0002,
"loss": 0.9128,
"mean_token_accuracy": 0.9329801052808762,
"num_tokens": 962975.0,
"step": 395
},
{
"epoch": 0.1771283073176132,
"grad_norm": 4.07610559463501,
"learning_rate": 0.0002,
"loss": 0.9266,
"mean_token_accuracy": 0.9289239794015884,
"num_tokens": 976843.0,
"step": 400
},
{
"epoch": 0.17934241115908336,
"grad_norm": 3.3643202781677246,
"learning_rate": 0.0002,
"loss": 1.0849,
"mean_token_accuracy": 0.9257559090852737,
"num_tokens": 988427.0,
"step": 405
},
{
"epoch": 0.18155651500055353,
"grad_norm": 4.841256141662598,
"learning_rate": 0.0002,
"loss": 1.2045,
"mean_token_accuracy": 0.9165349155664444,
"num_tokens": 999829.0,
"step": 410
},
{
"epoch": 0.1837706188420237,
"grad_norm": 3.4727354049682617,
"learning_rate": 0.0002,
"loss": 1.0157,
"mean_token_accuracy": 0.9250179201364517,
"num_tokens": 1010929.0,
"step": 415
},
{
"epoch": 0.18598472268349386,
"grad_norm": 3.2475087642669678,
"learning_rate": 0.0002,
"loss": 1.1052,
"mean_token_accuracy": 0.9202539622783661,
"num_tokens": 1022824.0,
"step": 420
},
{
"epoch": 0.18819882652496403,
"grad_norm": 3.1745808124542236,
"learning_rate": 0.0002,
"loss": 0.7683,
"mean_token_accuracy": 0.9444057643413544,
"num_tokens": 1036553.0,
"step": 425
},
{
"epoch": 0.1904129303664342,
"grad_norm": 3.939055919647217,
"learning_rate": 0.0002,
"loss": 0.7823,
"mean_token_accuracy": 0.9350433677434922,
"num_tokens": 1048393.0,
"step": 430
},
{
"epoch": 0.19262703420790436,
"grad_norm": 3.397245407104492,
"learning_rate": 0.0002,
"loss": 1.1069,
"mean_token_accuracy": 0.9203731089830398,
"num_tokens": 1060606.0,
"step": 435
},
{
"epoch": 0.19484113804937453,
"grad_norm": 3.675420045852661,
"learning_rate": 0.0002,
"loss": 1.2814,
"mean_token_accuracy": 0.9185589760541916,
"num_tokens": 1073102.0,
"step": 440
},
{
"epoch": 0.19705524189084467,
"grad_norm": 2.8432865142822266,
"learning_rate": 0.0002,
"loss": 0.9715,
"mean_token_accuracy": 0.9279666066169738,
"num_tokens": 1083869.0,
"step": 445
},
{
"epoch": 0.19926934573231483,
"grad_norm": 3.5305676460266113,
"learning_rate": 0.0002,
"loss": 1.0675,
"mean_token_accuracy": 0.9236095041036606,
"num_tokens": 1094722.0,
"step": 450
},
{
"epoch": 0.201483449573785,
"grad_norm": 4.308902263641357,
"learning_rate": 0.0002,
"loss": 0.7586,
"mean_token_accuracy": 0.9407237708568573,
"num_tokens": 1109517.0,
"step": 455
},
{
"epoch": 0.20369755341525517,
"grad_norm": 3.947713613510132,
"learning_rate": 0.0002,
"loss": 0.8975,
"mean_token_accuracy": 0.9331141859292984,
"num_tokens": 1122116.0,
"step": 460
},
{
"epoch": 0.20591165725672533,
"grad_norm": 2.7408392429351807,
"learning_rate": 0.0002,
"loss": 0.9984,
"mean_token_accuracy": 0.9276015996932984,
"num_tokens": 1133434.0,
"step": 465
},
{
"epoch": 0.2081257610981955,
"grad_norm": 4.87424373626709,
"learning_rate": 0.0002,
"loss": 0.9096,
"mean_token_accuracy": 0.9369789361953735,
"num_tokens": 1147039.0,
"step": 470
},
{
"epoch": 0.21033986493966567,
"grad_norm": 3.165412425994873,
"learning_rate": 0.0002,
"loss": 1.1,
"mean_token_accuracy": 0.9302688419818879,
"num_tokens": 1160845.0,
"step": 475
},
{
"epoch": 0.21255396878113583,
"grad_norm": 3.668769121170044,
"learning_rate": 0.0002,
"loss": 1.0275,
"mean_token_accuracy": 0.9271714627742768,
"num_tokens": 1172244.0,
"step": 480
},
{
"epoch": 0.214768072622606,
"grad_norm": 3.2549095153808594,
"learning_rate": 0.0002,
"loss": 0.9565,
"mean_token_accuracy": 0.9336414575576782,
"num_tokens": 1185395.0,
"step": 485
},
{
"epoch": 0.21698217646407617,
"grad_norm": 3.9204583168029785,
"learning_rate": 0.0002,
"loss": 1.0073,
"mean_token_accuracy": 0.9254105865955353,
"num_tokens": 1197385.0,
"step": 490
},
{
"epoch": 0.21919628030554633,
"grad_norm": 4.254587650299072,
"learning_rate": 0.0002,
"loss": 1.2027,
"mean_token_accuracy": 0.9190206825733185,
"num_tokens": 1207777.0,
"step": 495
},
{
"epoch": 0.2214103841470165,
"grad_norm": 3.455690622329712,
"learning_rate": 0.0002,
"loss": 1.335,
"mean_token_accuracy": 0.9205510348081589,
"num_tokens": 1218537.0,
"step": 500
},
{
"epoch": 0.22362448798848666,
"grad_norm": 3.2426981925964355,
"learning_rate": 0.0002,
"loss": 1.1186,
"mean_token_accuracy": 0.9233917683362961,
"num_tokens": 1229882.0,
"step": 505
},
{
"epoch": 0.22583859182995683,
"grad_norm": 3.5431432723999023,
"learning_rate": 0.0002,
"loss": 1.1298,
"mean_token_accuracy": 0.9185365289449692,
"num_tokens": 1241357.0,
"step": 510
},
{
"epoch": 0.228052695671427,
"grad_norm": 3.3408544063568115,
"learning_rate": 0.0002,
"loss": 1.0158,
"mean_token_accuracy": 0.931389006972313,
"num_tokens": 1254158.0,
"step": 515
},
{
"epoch": 0.23026679951289716,
"grad_norm": 3.5583953857421875,
"learning_rate": 0.0002,
"loss": 0.985,
"mean_token_accuracy": 0.9288320362567901,
"num_tokens": 1265454.0,
"step": 520
},
{
"epoch": 0.23248090335436733,
"grad_norm": 3.7565269470214844,
"learning_rate": 0.0002,
"loss": 1.1319,
"mean_token_accuracy": 0.9232024788856507,
"num_tokens": 1276256.0,
"step": 525
},
{
"epoch": 0.2346950071958375,
"grad_norm": 3.4486448764801025,
"learning_rate": 0.0002,
"loss": 0.9273,
"mean_token_accuracy": 0.9286326110363007,
"num_tokens": 1289049.0,
"step": 530
},
{
"epoch": 0.23690911103730766,
"grad_norm": 3.341252565383911,
"learning_rate": 0.0002,
"loss": 0.9678,
"mean_token_accuracy": 0.9301002591848373,
"num_tokens": 1301964.0,
"step": 535
},
{
"epoch": 0.2391232148787778,
"grad_norm": 3.2227766513824463,
"learning_rate": 0.0002,
"loss": 0.9778,
"mean_token_accuracy": 0.9308173507452011,
"num_tokens": 1316098.0,
"step": 540
},
{
"epoch": 0.24133731872024797,
"grad_norm": 4.225726127624512,
"learning_rate": 0.0002,
"loss": 1.0343,
"mean_token_accuracy": 0.9257538586854934,
"num_tokens": 1327919.0,
"step": 545
},
{
"epoch": 0.24355142256171813,
"grad_norm": 3.6367788314819336,
"learning_rate": 0.0002,
"loss": 1.0479,
"mean_token_accuracy": 0.9220652222633362,
"num_tokens": 1341243.0,
"step": 550
},
{
"epoch": 0.2457655264031883,
"grad_norm": 2.7346153259277344,
"learning_rate": 0.0002,
"loss": 0.9359,
"mean_token_accuracy": 0.928188094496727,
"num_tokens": 1353419.0,
"step": 555
},
{
"epoch": 0.24797963024465847,
"grad_norm": 3.3693747520446777,
"learning_rate": 0.0002,
"loss": 1.0423,
"mean_token_accuracy": 0.9239319413900375,
"num_tokens": 1364006.0,
"step": 560
},
{
"epoch": 0.25019373408612866,
"grad_norm": 4.45697546005249,
"learning_rate": 0.0002,
"loss": 1.0615,
"mean_token_accuracy": 0.9232381820678711,
"num_tokens": 1375497.0,
"step": 565
},
{
"epoch": 0.2524078379275988,
"grad_norm": 3.7682595252990723,
"learning_rate": 0.0002,
"loss": 1.0123,
"mean_token_accuracy": 0.9235726416110992,
"num_tokens": 1388604.0,
"step": 570
},
{
"epoch": 0.254621941769069,
"grad_norm": 2.5660793781280518,
"learning_rate": 0.0002,
"loss": 0.7591,
"mean_token_accuracy": 0.9384708911180496,
"num_tokens": 1402952.0,
"step": 575
},
{
"epoch": 0.25683604561053913,
"grad_norm": 3.347537040710449,
"learning_rate": 0.0002,
"loss": 0.8635,
"mean_token_accuracy": 0.9377257645130157,
"num_tokens": 1416588.0,
"step": 580
},
{
"epoch": 0.2590501494520093,
"grad_norm": 11.165135383605957,
"learning_rate": 0.0002,
"loss": 1.1461,
"mean_token_accuracy": 0.9261222094297409,
"num_tokens": 1429142.0,
"step": 585
},
{
"epoch": 0.26126425329347946,
"grad_norm": 3.1489033699035645,
"learning_rate": 0.0002,
"loss": 1.2318,
"mean_token_accuracy": 0.9255981892347336,
"num_tokens": 1440345.0,
"step": 590
},
{
"epoch": 0.2634783571349496,
"grad_norm": 3.541116952896118,
"learning_rate": 0.0002,
"loss": 0.8121,
"mean_token_accuracy": 0.9322708487510681,
"num_tokens": 1455224.0,
"step": 595
},
{
"epoch": 0.2656924609764198,
"grad_norm": 4.341325283050537,
"learning_rate": 0.0002,
"loss": 1.1886,
"mean_token_accuracy": 0.920156580209732,
"num_tokens": 1468143.0,
"step": 600
},
{
"epoch": 0.26790656481788994,
"grad_norm": 3.9943504333496094,
"learning_rate": 0.0002,
"loss": 1.0343,
"mean_token_accuracy": 0.9286263018846512,
"num_tokens": 1482762.0,
"step": 605
},
{
"epoch": 0.27012066865936013,
"grad_norm": 3.0633606910705566,
"learning_rate": 0.0002,
"loss": 1.162,
"mean_token_accuracy": 0.9224074572324753,
"num_tokens": 1494120.0,
"step": 610
},
{
"epoch": 0.27233477250083027,
"grad_norm": 3.8182456493377686,
"learning_rate": 0.0002,
"loss": 1.1397,
"mean_token_accuracy": 0.9227182388305664,
"num_tokens": 1505877.0,
"step": 615
},
{
"epoch": 0.27454887634230046,
"grad_norm": 4.838993549346924,
"learning_rate": 0.0002,
"loss": 0.9525,
"mean_token_accuracy": 0.934059739112854,
"num_tokens": 1519437.0,
"step": 620
},
{
"epoch": 0.2767629801837706,
"grad_norm": 3.2925949096679688,
"learning_rate": 0.0002,
"loss": 0.7975,
"mean_token_accuracy": 0.936807957291603,
"num_tokens": 1533986.0,
"step": 625
},
{
"epoch": 0.2789770840252408,
"grad_norm": 3.633017063140869,
"learning_rate": 0.0002,
"loss": 1.022,
"mean_token_accuracy": 0.9283811062574386,
"num_tokens": 1545664.0,
"step": 630
},
{
"epoch": 0.28119118786671093,
"grad_norm": 3.6399173736572266,
"learning_rate": 0.0002,
"loss": 0.844,
"mean_token_accuracy": 0.9315326452255249,
"num_tokens": 1558398.0,
"step": 635
},
{
"epoch": 0.28340529170818113,
"grad_norm": 4.304896831512451,
"learning_rate": 0.0002,
"loss": 1.1099,
"mean_token_accuracy": 0.9182446151971817,
"num_tokens": 1569769.0,
"step": 640
},
{
"epoch": 0.28561939554965127,
"grad_norm": 3.875694513320923,
"learning_rate": 0.0002,
"loss": 1.1155,
"mean_token_accuracy": 0.923322680592537,
"num_tokens": 1581951.0,
"step": 645
},
{
"epoch": 0.28783349939112146,
"grad_norm": 2.719801187515259,
"learning_rate": 0.0002,
"loss": 1.0578,
"mean_token_accuracy": 0.9241158574819565,
"num_tokens": 1594802.0,
"step": 650
},
{
"epoch": 0.2900476032325916,
"grad_norm": 2.3830995559692383,
"learning_rate": 0.0002,
"loss": 0.9112,
"mean_token_accuracy": 0.9314200520515442,
"num_tokens": 1608969.0,
"step": 655
},
{
"epoch": 0.2922617070740618,
"grad_norm": 2.6518445014953613,
"learning_rate": 0.0002,
"loss": 0.7838,
"mean_token_accuracy": 0.9370227992534638,
"num_tokens": 1622121.0,
"step": 660
},
{
"epoch": 0.29447581091553193,
"grad_norm": 3.3631813526153564,
"learning_rate": 0.0002,
"loss": 1.1264,
"mean_token_accuracy": 0.9195171415805816,
"num_tokens": 1635382.0,
"step": 665
},
{
"epoch": 0.2966899147570021,
"grad_norm": 2.3228812217712402,
"learning_rate": 0.0002,
"loss": 0.8994,
"mean_token_accuracy": 0.933735242486,
"num_tokens": 1647948.0,
"step": 670
},
{
"epoch": 0.29890401859847227,
"grad_norm": 4.091598033905029,
"learning_rate": 0.0002,
"loss": 1.1017,
"mean_token_accuracy": 0.9218682497739792,
"num_tokens": 1659267.0,
"step": 675
},
{
"epoch": 0.30111812243994246,
"grad_norm": 3.764561414718628,
"learning_rate": 0.0002,
"loss": 0.7148,
"mean_token_accuracy": 0.9434669315814972,
"num_tokens": 1672417.0,
"step": 680
},
{
"epoch": 0.3033322262814126,
"grad_norm": 3.185284376144409,
"learning_rate": 0.0002,
"loss": 0.8594,
"mean_token_accuracy": 0.9352281510829925,
"num_tokens": 1686121.0,
"step": 685
},
{
"epoch": 0.30554633012288274,
"grad_norm": 3.7809314727783203,
"learning_rate": 0.0002,
"loss": 1.0109,
"mean_token_accuracy": 0.9245012730360032,
"num_tokens": 1697450.0,
"step": 690
},
{
"epoch": 0.30776043396435293,
"grad_norm": 3.160498857498169,
"learning_rate": 0.0002,
"loss": 0.9013,
"mean_token_accuracy": 0.9293900519609452,
"num_tokens": 1708446.0,
"step": 695
},
{
"epoch": 0.30997453780582307,
"grad_norm": 3.9027180671691895,
"learning_rate": 0.0002,
"loss": 0.9421,
"mean_token_accuracy": 0.9305406659841537,
"num_tokens": 1721554.0,
"step": 700
},
{
"epoch": 0.31218864164729326,
"grad_norm": 4.140758514404297,
"learning_rate": 0.0002,
"loss": 1.1733,
"mean_token_accuracy": 0.9222266197204589,
"num_tokens": 1732417.0,
"step": 705
},
{
"epoch": 0.3144027454887634,
"grad_norm": 3.0247952938079834,
"learning_rate": 0.0002,
"loss": 0.9215,
"mean_token_accuracy": 0.933441498875618,
"num_tokens": 1745534.0,
"step": 710
},
{
"epoch": 0.3166168493302336,
"grad_norm": 3.145435094833374,
"learning_rate": 0.0002,
"loss": 1.1067,
"mean_token_accuracy": 0.9226934105157852,
"num_tokens": 1757200.0,
"step": 715
},
{
"epoch": 0.31883095317170373,
"grad_norm": 2.747141122817993,
"learning_rate": 0.0002,
"loss": 1.1384,
"mean_token_accuracy": 0.9188572406768799,
"num_tokens": 1768726.0,
"step": 720
},
{
"epoch": 0.32104505701317393,
"grad_norm": 3.0959110260009766,
"learning_rate": 0.0002,
"loss": 0.9543,
"mean_token_accuracy": 0.930645814538002,
"num_tokens": 1779198.0,
"step": 725
},
{
"epoch": 0.32325916085464407,
"grad_norm": 3.135096788406372,
"learning_rate": 0.0002,
"loss": 0.9297,
"mean_token_accuracy": 0.9337345957756042,
"num_tokens": 1792258.0,
"step": 730
},
{
"epoch": 0.32547326469611426,
"grad_norm": 2.297475814819336,
"learning_rate": 0.0002,
"loss": 0.6964,
"mean_token_accuracy": 0.9390480488538742,
"num_tokens": 1804579.0,
"step": 735
},
{
"epoch": 0.3276873685375844,
"grad_norm": 2.391242265701294,
"learning_rate": 0.0002,
"loss": 0.877,
"mean_token_accuracy": 0.9332740783691407,
"num_tokens": 1817721.0,
"step": 740
},
{
"epoch": 0.3299014723790546,
"grad_norm": 2.637448787689209,
"learning_rate": 0.0002,
"loss": 0.9144,
"mean_token_accuracy": 0.9269635200500488,
"num_tokens": 1829841.0,
"step": 745
},
{
"epoch": 0.33211557622052473,
"grad_norm": 2.4021666049957275,
"learning_rate": 0.0002,
"loss": 0.7773,
"mean_token_accuracy": 0.9371457427740097,
"num_tokens": 1843795.0,
"step": 750
},
{
"epoch": 0.3343296800619949,
"grad_norm": 3.012258768081665,
"learning_rate": 0.0002,
"loss": 1.2914,
"mean_token_accuracy": 0.9153674453496933,
"num_tokens": 1854609.0,
"step": 755
},
{
"epoch": 0.33654378390346507,
"grad_norm": 3.001725912094116,
"learning_rate": 0.0002,
"loss": 0.9337,
"mean_token_accuracy": 0.929664534330368,
"num_tokens": 1867176.0,
"step": 760
},
{
"epoch": 0.33875788774493526,
"grad_norm": 2.9781148433685303,
"learning_rate": 0.0002,
"loss": 0.8391,
"mean_token_accuracy": 0.9307692885398865,
"num_tokens": 1879001.0,
"step": 765
},
{
"epoch": 0.3409719915864054,
"grad_norm": 3.3859033584594727,
"learning_rate": 0.0002,
"loss": 0.9095,
"mean_token_accuracy": 0.9333516269922256,
"num_tokens": 1891502.0,
"step": 770
},
{
"epoch": 0.3431860954278756,
"grad_norm": 2.874831199645996,
"learning_rate": 0.0002,
"loss": 0.7998,
"mean_token_accuracy": 0.9348277896642685,
"num_tokens": 1903279.0,
"step": 775
},
{
"epoch": 0.34540019926934573,
"grad_norm": 3.1517276763916016,
"learning_rate": 0.0002,
"loss": 0.9995,
"mean_token_accuracy": 0.9329377114772797,
"num_tokens": 1915655.0,
"step": 780
},
{
"epoch": 0.34761430311081587,
"grad_norm": 3.497373342514038,
"learning_rate": 0.0002,
"loss": 0.964,
"mean_token_accuracy": 0.9265813857316971,
"num_tokens": 1928510.0,
"step": 785
},
{
"epoch": 0.34982840695228606,
"grad_norm": 2.791043996810913,
"learning_rate": 0.0002,
"loss": 0.9767,
"mean_token_accuracy": 0.927400279045105,
"num_tokens": 1940056.0,
"step": 790
},
{
"epoch": 0.3520425107937562,
"grad_norm": 2.9507498741149902,
"learning_rate": 0.0002,
"loss": 0.818,
"mean_token_accuracy": 0.9325792044401169,
"num_tokens": 1953063.0,
"step": 795
},
{
"epoch": 0.3542566146352264,
"grad_norm": 3.5697410106658936,
"learning_rate": 0.0002,
"loss": 1.1487,
"mean_token_accuracy": 0.9165920346975327,
"num_tokens": 1962616.0,
"step": 800
},
{
"epoch": 0.35647071847669654,
"grad_norm": 2.9599061012268066,
"learning_rate": 0.0002,
"loss": 0.9402,
"mean_token_accuracy": 0.9306722432374954,
"num_tokens": 1974475.0,
"step": 805
},
{
"epoch": 0.35868482231816673,
"grad_norm": 2.6589152812957764,
"learning_rate": 0.0002,
"loss": 1.0531,
"mean_token_accuracy": 0.9262530177831649,
"num_tokens": 1986985.0,
"step": 810
},
{
"epoch": 0.36089892615963687,
"grad_norm": 2.572406530380249,
"learning_rate": 0.0002,
"loss": 0.9497,
"mean_token_accuracy": 0.9344472140073776,
"num_tokens": 1998976.0,
"step": 815
},
{
"epoch": 0.36311303000110706,
"grad_norm": 3.164608955383301,
"learning_rate": 0.0002,
"loss": 0.8457,
"mean_token_accuracy": 0.9335892468690872,
"num_tokens": 2011484.0,
"step": 820
},
{
"epoch": 0.3653271338425772,
"grad_norm": 2.683702230453491,
"learning_rate": 0.0002,
"loss": 0.9487,
"mean_token_accuracy": 0.9289698421955108,
"num_tokens": 2023211.0,
"step": 825
},
{
"epoch": 0.3675412376840474,
"grad_norm": 3.9282374382019043,
"learning_rate": 0.0002,
"loss": 0.852,
"mean_token_accuracy": 0.9424950003623962,
"num_tokens": 2034942.0,
"step": 830
},
{
"epoch": 0.36975534152551753,
"grad_norm": 3.512605905532837,
"learning_rate": 0.0002,
"loss": 1.0478,
"mean_token_accuracy": 0.9244632363319397,
"num_tokens": 2047139.0,
"step": 835
},
{
"epoch": 0.3719694453669877,
"grad_norm": 2.8640499114990234,
"learning_rate": 0.0002,
"loss": 0.9073,
"mean_token_accuracy": 0.9334672391414642,
"num_tokens": 2060706.0,
"step": 840
},
{
"epoch": 0.37418354920845787,
"grad_norm": 2.45273756980896,
"learning_rate": 0.0002,
"loss": 1.1191,
"mean_token_accuracy": 0.9188451081514358,
"num_tokens": 2071960.0,
"step": 845
},
{
"epoch": 0.37639765304992806,
"grad_norm": 3.4304628372192383,
"learning_rate": 0.0002,
"loss": 1.0851,
"mean_token_accuracy": 0.9217288702726364,
"num_tokens": 2083408.0,
"step": 850
},
{
"epoch": 0.3786117568913982,
"grad_norm": 2.3865091800689697,
"learning_rate": 0.0002,
"loss": 0.7252,
"mean_token_accuracy": 0.9369473248720169,
"num_tokens": 2096307.0,
"step": 855
},
{
"epoch": 0.3808258607328684,
"grad_norm": 3.7749905586242676,
"learning_rate": 0.0002,
"loss": 0.8653,
"mean_token_accuracy": 0.9342508226633072,
"num_tokens": 2109585.0,
"step": 860
},
{
"epoch": 0.38303996457433853,
"grad_norm": 2.718156099319458,
"learning_rate": 0.0002,
"loss": 0.7336,
"mean_token_accuracy": 0.9394059836864471,
"num_tokens": 2124731.0,
"step": 865
},
{
"epoch": 0.3852540684158087,
"grad_norm": 2.924015522003174,
"learning_rate": 0.0002,
"loss": 0.8086,
"mean_token_accuracy": 0.9387334406375885,
"num_tokens": 2138127.0,
"step": 870
},
{
"epoch": 0.38746817225727886,
"grad_norm": 3.366246461868286,
"learning_rate": 0.0002,
"loss": 0.814,
"mean_token_accuracy": 0.9380306899547577,
"num_tokens": 2148305.0,
"step": 875
},
{
"epoch": 0.38968227609874906,
"grad_norm": 3.231900930404663,
"learning_rate": 0.0002,
"loss": 0.9297,
"mean_token_accuracy": 0.927461439371109,
"num_tokens": 2160722.0,
"step": 880
},
{
"epoch": 0.3918963799402192,
"grad_norm": 2.826343059539795,
"learning_rate": 0.0002,
"loss": 1.1812,
"mean_token_accuracy": 0.9228267341852188,
"num_tokens": 2171118.0,
"step": 885
},
{
"epoch": 0.39411048378168934,
"grad_norm": 3.783430814743042,
"learning_rate": 0.0002,
"loss": 0.9572,
"mean_token_accuracy": 0.9258956193923951,
"num_tokens": 2182445.0,
"step": 890
},
{
"epoch": 0.39632458762315953,
"grad_norm": 3.7381107807159424,
"learning_rate": 0.0002,
"loss": 1.0978,
"mean_token_accuracy": 0.9256362348794938,
"num_tokens": 2195042.0,
"step": 895
},
{
"epoch": 0.39853869146462967,
"grad_norm": 3.0702905654907227,
"learning_rate": 0.0002,
"loss": 0.7616,
"mean_token_accuracy": 0.9401916921138763,
"num_tokens": 2210503.0,
"step": 900
},
{
"epoch": 0.40075279530609986,
"grad_norm": 3.0583152770996094,
"learning_rate": 0.0002,
"loss": 0.9957,
"mean_token_accuracy": 0.9271980673074722,
"num_tokens": 2222075.0,
"step": 905
},
{
"epoch": 0.40296689914757,
"grad_norm": 2.851524591445923,
"learning_rate": 0.0002,
"loss": 0.9718,
"mean_token_accuracy": 0.9335306733846664,
"num_tokens": 2234666.0,
"step": 910
},
{
"epoch": 0.4051810029890402,
"grad_norm": 3.090538740158081,
"learning_rate": 0.0002,
"loss": 0.887,
"mean_token_accuracy": 0.9297337353229522,
"num_tokens": 2245402.0,
"step": 915
},
{
"epoch": 0.40739510683051033,
"grad_norm": 2.5975914001464844,
"learning_rate": 0.0002,
"loss": 0.7172,
"mean_token_accuracy": 0.9442079395055771,
"num_tokens": 2258066.0,
"step": 920
},
{
"epoch": 0.40960921067198053,
"grad_norm": 2.861872911453247,
"learning_rate": 0.0002,
"loss": 0.8477,
"mean_token_accuracy": 0.9340190798044204,
"num_tokens": 2269745.0,
"step": 925
},
{
"epoch": 0.41182331451345067,
"grad_norm": 3.162109136581421,
"learning_rate": 0.0002,
"loss": 0.9025,
"mean_token_accuracy": 0.9301832497119904,
"num_tokens": 2282483.0,
"step": 930
},
{
"epoch": 0.41403741835492086,
"grad_norm": 2.9675636291503906,
"learning_rate": 0.0002,
"loss": 0.8154,
"mean_token_accuracy": 0.9318990021944046,
"num_tokens": 2294175.0,
"step": 935
},
{
"epoch": 0.416251522196391,
"grad_norm": 3.2092788219451904,
"learning_rate": 0.0002,
"loss": 1.2112,
"mean_token_accuracy": 0.9157642692327499,
"num_tokens": 2305822.0,
"step": 940
},
{
"epoch": 0.4184656260378612,
"grad_norm": 3.086061954498291,
"learning_rate": 0.0002,
"loss": 0.8472,
"mean_token_accuracy": 0.9323055207729339,
"num_tokens": 2319852.0,
"step": 945
},
{
"epoch": 0.42067972987933133,
"grad_norm": 2.875953435897827,
"learning_rate": 0.0002,
"loss": 0.8917,
"mean_token_accuracy": 0.9304117172956466,
"num_tokens": 2330826.0,
"step": 950
},
{
"epoch": 0.4228938337208015,
"grad_norm": 3.364098072052002,
"learning_rate": 0.0002,
"loss": 1.1513,
"mean_token_accuracy": 0.9273996829986573,
"num_tokens": 2340121.0,
"step": 955
},
{
"epoch": 0.42510793756227166,
"grad_norm": 2.3155770301818848,
"learning_rate": 0.0002,
"loss": 0.8481,
"mean_token_accuracy": 0.9310476630926132,
"num_tokens": 2353874.0,
"step": 960
},
{
"epoch": 0.42732204140374186,
"grad_norm": 2.873863458633423,
"learning_rate": 0.0002,
"loss": 1.0553,
"mean_token_accuracy": 0.9190466612577438,
"num_tokens": 2364722.0,
"step": 965
},
{
"epoch": 0.429536145245212,
"grad_norm": 3.088542938232422,
"learning_rate": 0.0002,
"loss": 0.7824,
"mean_token_accuracy": 0.9386604636907577,
"num_tokens": 2377983.0,
"step": 970
},
{
"epoch": 0.4317502490866822,
"grad_norm": 3.2161245346069336,
"learning_rate": 0.0002,
"loss": 0.9667,
"mean_token_accuracy": 0.9316904872655869,
"num_tokens": 2388214.0,
"step": 975
},
{
"epoch": 0.43396435292815233,
"grad_norm": 3.019383192062378,
"learning_rate": 0.0002,
"loss": 0.8876,
"mean_token_accuracy": 0.9336837440729141,
"num_tokens": 2402069.0,
"step": 980
},
{
"epoch": 0.43617845676962247,
"grad_norm": 3.164597988128662,
"learning_rate": 0.0002,
"loss": 0.9017,
"mean_token_accuracy": 0.9319613158702851,
"num_tokens": 2415773.0,
"step": 985
},
{
"epoch": 0.43839256061109266,
"grad_norm": 3.070587635040283,
"learning_rate": 0.0002,
"loss": 0.812,
"mean_token_accuracy": 0.9365138530731201,
"num_tokens": 2428468.0,
"step": 990
},
{
"epoch": 0.4406066644525628,
"grad_norm": 2.605221748352051,
"learning_rate": 0.0002,
"loss": 0.8997,
"mean_token_accuracy": 0.9311312526464463,
"num_tokens": 2439495.0,
"step": 995
},
{
"epoch": 0.442820768294033,
"grad_norm": 3.4609057903289795,
"learning_rate": 0.0002,
"loss": 0.9947,
"mean_token_accuracy": 0.9287879914045334,
"num_tokens": 2449133.0,
"step": 1000
},
{
"epoch": 0.44503487213550313,
"grad_norm": 3.2428348064422607,
"learning_rate": 0.0002,
"loss": 0.9216,
"mean_token_accuracy": 0.9293368250131607,
"num_tokens": 2460987.0,
"step": 1005
},
{
"epoch": 0.44724897597697333,
"grad_norm": 2.541088342666626,
"learning_rate": 0.0002,
"loss": 1.1029,
"mean_token_accuracy": 0.9256097286939621,
"num_tokens": 2472360.0,
"step": 1010
},
{
"epoch": 0.44946307981844347,
"grad_norm": 3.3123016357421875,
"learning_rate": 0.0002,
"loss": 1.0771,
"mean_token_accuracy": 0.9271711260080338,
"num_tokens": 2482790.0,
"step": 1015
},
{
"epoch": 0.45167718365991366,
"grad_norm": 2.808271884918213,
"learning_rate": 0.0002,
"loss": 0.9692,
"mean_token_accuracy": 0.9322762846946716,
"num_tokens": 2494158.0,
"step": 1020
},
{
"epoch": 0.4538912875013838,
"grad_norm": 3.2123773097991943,
"learning_rate": 0.0002,
"loss": 1.2038,
"mean_token_accuracy": 0.9229989141225815,
"num_tokens": 2504659.0,
"step": 1025
},
{
"epoch": 0.456105391342854,
"grad_norm": 3.9224977493286133,
"learning_rate": 0.0002,
"loss": 0.9177,
"mean_token_accuracy": 0.9301177680492401,
"num_tokens": 2515947.0,
"step": 1030
},
{
"epoch": 0.45831949518432413,
"grad_norm": 3.159163236618042,
"learning_rate": 0.0002,
"loss": 1.1681,
"mean_token_accuracy": 0.9179033428430557,
"num_tokens": 2527377.0,
"step": 1035
},
{
"epoch": 0.4605335990257943,
"grad_norm": 2.5653562545776367,
"learning_rate": 0.0002,
"loss": 1.054,
"mean_token_accuracy": 0.9257000118494034,
"num_tokens": 2539474.0,
"step": 1040
},
{
"epoch": 0.46274770286726447,
"grad_norm": 2.449213981628418,
"learning_rate": 0.0002,
"loss": 0.9338,
"mean_token_accuracy": 0.9319339364767074,
"num_tokens": 2553545.0,
"step": 1045
},
{
"epoch": 0.46496180670873466,
"grad_norm": 3.5044524669647217,
"learning_rate": 0.0002,
"loss": 0.9448,
"mean_token_accuracy": 0.9295208871364593,
"num_tokens": 2564811.0,
"step": 1050
},
{
"epoch": 0.4671759105502048,
"grad_norm": 3.5413150787353516,
"learning_rate": 0.0002,
"loss": 0.9424,
"mean_token_accuracy": 0.9302045583724976,
"num_tokens": 2576751.0,
"step": 1055
},
{
"epoch": 0.469390014391675,
"grad_norm": 3.0598480701446533,
"learning_rate": 0.0002,
"loss": 0.9126,
"mean_token_accuracy": 0.9285120725631714,
"num_tokens": 2588358.0,
"step": 1060
},
{
"epoch": 0.47160411823314513,
"grad_norm": 2.6380608081817627,
"learning_rate": 0.0002,
"loss": 0.9495,
"mean_token_accuracy": 0.9323398619890213,
"num_tokens": 2599877.0,
"step": 1065
},
{
"epoch": 0.4738182220746153,
"grad_norm": 3.1709139347076416,
"learning_rate": 0.0002,
"loss": 0.9925,
"mean_token_accuracy": 0.9324059277772904,
"num_tokens": 2610770.0,
"step": 1070
},
{
"epoch": 0.47603232591608546,
"grad_norm": 2.9764163494110107,
"learning_rate": 0.0002,
"loss": 0.782,
"mean_token_accuracy": 0.9320799469947815,
"num_tokens": 2621941.0,
"step": 1075
},
{
"epoch": 0.4782464297575556,
"grad_norm": 2.2882895469665527,
"learning_rate": 0.0002,
"loss": 1.0986,
"mean_token_accuracy": 0.9247247219085694,
"num_tokens": 2631475.0,
"step": 1080
},
{
"epoch": 0.4804605335990258,
"grad_norm": 3.075330972671509,
"learning_rate": 0.0002,
"loss": 1.0145,
"mean_token_accuracy": 0.9294690877199173,
"num_tokens": 2645344.0,
"step": 1085
},
{
"epoch": 0.48267463744049593,
"grad_norm": 3.256373167037964,
"learning_rate": 0.0002,
"loss": 0.8895,
"mean_token_accuracy": 0.9273504942655564,
"num_tokens": 2655935.0,
"step": 1090
},
{
"epoch": 0.48488874128196613,
"grad_norm": 3.454824924468994,
"learning_rate": 0.0002,
"loss": 0.8622,
"mean_token_accuracy": 0.9389674246311188,
"num_tokens": 2667932.0,
"step": 1095
},
{
"epoch": 0.48710284512343627,
"grad_norm": 3.4632182121276855,
"learning_rate": 0.0002,
"loss": 1.0757,
"mean_token_accuracy": 0.9280873537063599,
"num_tokens": 2679252.0,
"step": 1100
},
{
"epoch": 0.48931694896490646,
"grad_norm": 2.618551254272461,
"learning_rate": 0.0002,
"loss": 0.7642,
"mean_token_accuracy": 0.9406112372875214,
"num_tokens": 2694935.0,
"step": 1105
},
{
"epoch": 0.4915310528063766,
"grad_norm": 3.4649856090545654,
"learning_rate": 0.0002,
"loss": 1.1715,
"mean_token_accuracy": 0.9203990876674653,
"num_tokens": 2705892.0,
"step": 1110
},
{
"epoch": 0.4937451566478468,
"grad_norm": 1.9820575714111328,
"learning_rate": 0.0002,
"loss": 0.7926,
"mean_token_accuracy": 0.9387197762727737,
"num_tokens": 2717836.0,
"step": 1115
},
{
"epoch": 0.49595926048931693,
"grad_norm": 3.768416404724121,
"learning_rate": 0.0002,
"loss": 1.1023,
"mean_token_accuracy": 0.9265471220016479,
"num_tokens": 2728995.0,
"step": 1120
},
{
"epoch": 0.4981733643307871,
"grad_norm": 2.721743106842041,
"learning_rate": 0.0002,
"loss": 0.7971,
"mean_token_accuracy": 0.9334240764379501,
"num_tokens": 2741977.0,
"step": 1125
},
{
"epoch": 0.5003874681722573,
"grad_norm": 2.004788398742676,
"learning_rate": 0.0002,
"loss": 1.2163,
"mean_token_accuracy": 0.9211069196462631,
"num_tokens": 2755116.0,
"step": 1130
},
{
"epoch": 0.5026015720137275,
"grad_norm": 3.46217942237854,
"learning_rate": 0.0002,
"loss": 1.0581,
"mean_token_accuracy": 0.9227796822786332,
"num_tokens": 2766455.0,
"step": 1135
},
{
"epoch": 0.5048156758551976,
"grad_norm": 2.795225143432617,
"learning_rate": 0.0002,
"loss": 1.0172,
"mean_token_accuracy": 0.9251609027385712,
"num_tokens": 2780252.0,
"step": 1140
},
{
"epoch": 0.5070297796966677,
"grad_norm": 2.5644898414611816,
"learning_rate": 0.0002,
"loss": 0.8366,
"mean_token_accuracy": 0.9360420912504196,
"num_tokens": 2793158.0,
"step": 1145
},
{
"epoch": 0.509243883538138,
"grad_norm": 2.85178279876709,
"learning_rate": 0.0002,
"loss": 0.8914,
"mean_token_accuracy": 0.9331649392843246,
"num_tokens": 2803304.0,
"step": 1150
},
{
"epoch": 0.5114579873796081,
"grad_norm": 4.3757429122924805,
"learning_rate": 0.0002,
"loss": 1.3006,
"mean_token_accuracy": 0.912799459695816,
"num_tokens": 2813346.0,
"step": 1155
},
{
"epoch": 0.5136720912210783,
"grad_norm": 3.0821921825408936,
"learning_rate": 0.0002,
"loss": 0.8931,
"mean_token_accuracy": 0.9292008608579636,
"num_tokens": 2825150.0,
"step": 1160
},
{
"epoch": 0.5158861950625484,
"grad_norm": 2.4634435176849365,
"learning_rate": 0.0002,
"loss": 0.7139,
"mean_token_accuracy": 0.9386299520730972,
"num_tokens": 2840010.0,
"step": 1165
},
{
"epoch": 0.5181002989040187,
"grad_norm": 3.792142391204834,
"learning_rate": 0.0002,
"loss": 1.019,
"mean_token_accuracy": 0.9267308801412583,
"num_tokens": 2851060.0,
"step": 1170
},
{
"epoch": 0.5203144027454888,
"grad_norm": 3.2142715454101562,
"learning_rate": 0.0002,
"loss": 0.8126,
"mean_token_accuracy": 0.936069804430008,
"num_tokens": 2863848.0,
"step": 1175
},
{
"epoch": 0.5225285065869589,
"grad_norm": 3.059175729751587,
"learning_rate": 0.0002,
"loss": 0.936,
"mean_token_accuracy": 0.9262942969799042,
"num_tokens": 2875045.0,
"step": 1180
},
{
"epoch": 0.5247426104284291,
"grad_norm": 2.397021532058716,
"learning_rate": 0.0002,
"loss": 0.7636,
"mean_token_accuracy": 0.9363964319229126,
"num_tokens": 2888765.0,
"step": 1185
},
{
"epoch": 0.5269567142698992,
"grad_norm": 2.311615228652954,
"learning_rate": 0.0002,
"loss": 1.1155,
"mean_token_accuracy": 0.9244213849306107,
"num_tokens": 2898522.0,
"step": 1190
},
{
"epoch": 0.5291708181113695,
"grad_norm": 2.686445713043213,
"learning_rate": 0.0002,
"loss": 1.0209,
"mean_token_accuracy": 0.9271868228912353,
"num_tokens": 2910294.0,
"step": 1195
},
{
"epoch": 0.5313849219528396,
"grad_norm": 3.382634401321411,
"learning_rate": 0.0002,
"loss": 1.2553,
"mean_token_accuracy": 0.9195928305387497,
"num_tokens": 2920375.0,
"step": 1200
},
{
"epoch": 0.5335990257943097,
"grad_norm": 2.7273964881896973,
"learning_rate": 0.0002,
"loss": 0.7366,
"mean_token_accuracy": 0.940141350030899,
"num_tokens": 2935314.0,
"step": 1205
},
{
"epoch": 0.5358131296357799,
"grad_norm": 3.521521806716919,
"learning_rate": 0.0002,
"loss": 1.1562,
"mean_token_accuracy": 0.9220536708831787,
"num_tokens": 2947131.0,
"step": 1210
},
{
"epoch": 0.5380272334772501,
"grad_norm": 3.4570305347442627,
"learning_rate": 0.0002,
"loss": 0.7671,
"mean_token_accuracy": 0.9346885770559311,
"num_tokens": 2958100.0,
"step": 1215
},
{
"epoch": 0.5402413373187203,
"grad_norm": 3.236543655395508,
"learning_rate": 0.0002,
"loss": 0.9714,
"mean_token_accuracy": 0.9265434801578522,
"num_tokens": 2968906.0,
"step": 1220
},
{
"epoch": 0.5424554411601904,
"grad_norm": 2.8082947731018066,
"learning_rate": 0.0002,
"loss": 0.8898,
"mean_token_accuracy": 0.9322567820549011,
"num_tokens": 2979901.0,
"step": 1225
},
{
"epoch": 0.5446695450016605,
"grad_norm": 3.805554151535034,
"learning_rate": 0.0002,
"loss": 1.208,
"mean_token_accuracy": 0.9178589969873429,
"num_tokens": 2989620.0,
"step": 1230
},
{
"epoch": 0.5468836488431308,
"grad_norm": 2.371670961380005,
"learning_rate": 0.0002,
"loss": 0.7449,
"mean_token_accuracy": 0.9355430036783219,
"num_tokens": 3001716.0,
"step": 1235
},
{
"epoch": 0.5490977526846009,
"grad_norm": 3.121859550476074,
"learning_rate": 0.0002,
"loss": 0.8731,
"mean_token_accuracy": 0.9309169679880143,
"num_tokens": 3014315.0,
"step": 1240
},
{
"epoch": 0.5513118565260711,
"grad_norm": 3.560624599456787,
"learning_rate": 0.0002,
"loss": 1.1319,
"mean_token_accuracy": 0.9211807966232299,
"num_tokens": 3025741.0,
"step": 1245
},
{
"epoch": 0.5535259603675412,
"grad_norm": 2.2103476524353027,
"learning_rate": 0.0002,
"loss": 0.785,
"mean_token_accuracy": 0.936233428120613,
"num_tokens": 3039131.0,
"step": 1250
},
{
"epoch": 0.5557400642090115,
"grad_norm": 2.7845640182495117,
"learning_rate": 0.0002,
"loss": 0.91,
"mean_token_accuracy": 0.931680291891098,
"num_tokens": 3051046.0,
"step": 1255
},
{
"epoch": 0.5579541680504816,
"grad_norm": 2.8331215381622314,
"learning_rate": 0.0002,
"loss": 0.7777,
"mean_token_accuracy": 0.9354436278343201,
"num_tokens": 3063282.0,
"step": 1260
},
{
"epoch": 0.5601682718919517,
"grad_norm": 4.3590312004089355,
"learning_rate": 0.0002,
"loss": 1.0973,
"mean_token_accuracy": 0.9222743719816208,
"num_tokens": 3074994.0,
"step": 1265
},
{
"epoch": 0.5623823757334219,
"grad_norm": 2.716376543045044,
"learning_rate": 0.0002,
"loss": 0.8278,
"mean_token_accuracy": 0.9338112890720367,
"num_tokens": 3086166.0,
"step": 1270
},
{
"epoch": 0.564596479574892,
"grad_norm": 2.791226625442505,
"learning_rate": 0.0002,
"loss": 0.9836,
"mean_token_accuracy": 0.9340397655963898,
"num_tokens": 3098425.0,
"step": 1275
},
{
"epoch": 0.5668105834163623,
"grad_norm": 2.7921218872070312,
"learning_rate": 0.0002,
"loss": 0.7225,
"mean_token_accuracy": 0.9339357107877732,
"num_tokens": 3111129.0,
"step": 1280
},
{
"epoch": 0.5690246872578324,
"grad_norm": 2.764394998550415,
"learning_rate": 0.0002,
"loss": 0.9283,
"mean_token_accuracy": 0.9268820822238922,
"num_tokens": 3123270.0,
"step": 1285
},
{
"epoch": 0.5712387910993025,
"grad_norm": 2.195909261703491,
"learning_rate": 0.0002,
"loss": 0.6484,
"mean_token_accuracy": 0.9442890018224717,
"num_tokens": 3137978.0,
"step": 1290
},
{
"epoch": 0.5734528949407727,
"grad_norm": 3.223241090774536,
"learning_rate": 0.0002,
"loss": 0.7461,
"mean_token_accuracy": 0.9427239447832108,
"num_tokens": 3151026.0,
"step": 1295
},
{
"epoch": 0.5756669987822429,
"grad_norm": 1.9516724348068237,
"learning_rate": 0.0002,
"loss": 0.6012,
"mean_token_accuracy": 0.9473742932081223,
"num_tokens": 3165781.0,
"step": 1300
},
{
"epoch": 0.5778811026237131,
"grad_norm": 2.265829563140869,
"learning_rate": 0.0002,
"loss": 0.7933,
"mean_token_accuracy": 0.9362419694662094,
"num_tokens": 3179325.0,
"step": 1305
},
{
"epoch": 0.5800952064651832,
"grad_norm": 2.4466440677642822,
"learning_rate": 0.0002,
"loss": 0.7285,
"mean_token_accuracy": 0.9399940431118011,
"num_tokens": 3193006.0,
"step": 1310
},
{
"epoch": 0.5823093103066533,
"grad_norm": 4.094124794006348,
"learning_rate": 0.0002,
"loss": 1.2629,
"mean_token_accuracy": 0.9156520456075669,
"num_tokens": 3203008.0,
"step": 1315
},
{
"epoch": 0.5845234141481236,
"grad_norm": 3.08402681350708,
"learning_rate": 0.0002,
"loss": 0.8664,
"mean_token_accuracy": 0.9337212562561035,
"num_tokens": 3213539.0,
"step": 1320
},
{
"epoch": 0.5867375179895937,
"grad_norm": 2.6603167057037354,
"learning_rate": 0.0002,
"loss": 1.0287,
"mean_token_accuracy": 0.9329268485307693,
"num_tokens": 3225145.0,
"step": 1325
},
{
"epoch": 0.5889516218310639,
"grad_norm": 3.6709961891174316,
"learning_rate": 0.0002,
"loss": 1.1247,
"mean_token_accuracy": 0.9213701337575912,
"num_tokens": 3235894.0,
"step": 1330
},
{
"epoch": 0.591165725672534,
"grad_norm": 2.449747323989868,
"learning_rate": 0.0002,
"loss": 0.8486,
"mean_token_accuracy": 0.9324274808168411,
"num_tokens": 3250311.0,
"step": 1335
},
{
"epoch": 0.5933798295140043,
"grad_norm": 5.862588882446289,
"learning_rate": 0.0002,
"loss": 0.9213,
"mean_token_accuracy": 0.930136987566948,
"num_tokens": 3262778.0,
"step": 1340
},
{
"epoch": 0.5955939333554744,
"grad_norm": 2.749333620071411,
"learning_rate": 0.0002,
"loss": 0.6496,
"mean_token_accuracy": 0.9448672115802765,
"num_tokens": 3275900.0,
"step": 1345
},
{
"epoch": 0.5978080371969445,
"grad_norm": 3.583944320678711,
"learning_rate": 0.0002,
"loss": 0.9604,
"mean_token_accuracy": 0.9305095732212066,
"num_tokens": 3288867.0,
"step": 1350
},
{
"epoch": 0.6000221410384147,
"grad_norm": 2.9229469299316406,
"learning_rate": 0.0002,
"loss": 0.9985,
"mean_token_accuracy": 0.9300340205430985,
"num_tokens": 3300908.0,
"step": 1355
},
{
"epoch": 0.6022362448798849,
"grad_norm": 2.5269253253936768,
"learning_rate": 0.0002,
"loss": 0.8679,
"mean_token_accuracy": 0.9382039904594421,
"num_tokens": 3311877.0,
"step": 1360
},
{
"epoch": 0.6044503487213551,
"grad_norm": 2.8036136627197266,
"learning_rate": 0.0002,
"loss": 0.9413,
"mean_token_accuracy": 0.9259363144636155,
"num_tokens": 3323855.0,
"step": 1365
},
{
"epoch": 0.6066644525628252,
"grad_norm": 2.521695137023926,
"learning_rate": 0.0002,
"loss": 0.8644,
"mean_token_accuracy": 0.9354902893304825,
"num_tokens": 3336303.0,
"step": 1370
},
{
"epoch": 0.6088785564042953,
"grad_norm": 1.684542179107666,
"learning_rate": 0.0002,
"loss": 0.7695,
"mean_token_accuracy": 0.9394852668046951,
"num_tokens": 3348744.0,
"step": 1375
},
{
"epoch": 0.6110926602457655,
"grad_norm": 2.3662304878234863,
"learning_rate": 0.0002,
"loss": 0.861,
"mean_token_accuracy": 0.931562864780426,
"num_tokens": 3360929.0,
"step": 1380
},
{
"epoch": 0.6133067640872357,
"grad_norm": 2.780378580093384,
"learning_rate": 0.0002,
"loss": 0.7188,
"mean_token_accuracy": 0.9389736771583557,
"num_tokens": 3373991.0,
"step": 1385
},
{
"epoch": 0.6155208679287059,
"grad_norm": 3.235178232192993,
"learning_rate": 0.0002,
"loss": 1.0112,
"mean_token_accuracy": 0.9271218568086624,
"num_tokens": 3386416.0,
"step": 1390
},
{
"epoch": 0.617734971770176,
"grad_norm": 2.309201240539551,
"learning_rate": 0.0002,
"loss": 0.698,
"mean_token_accuracy": 0.9409903854131698,
"num_tokens": 3401240.0,
"step": 1395
},
{
"epoch": 0.6199490756116461,
"grad_norm": 3.5225324630737305,
"learning_rate": 0.0002,
"loss": 0.89,
"mean_token_accuracy": 0.9343404263257981,
"num_tokens": 3412430.0,
"step": 1400
},
{
"epoch": 0.6221631794531164,
"grad_norm": 2.0347211360931396,
"learning_rate": 0.0002,
"loss": 0.7872,
"mean_token_accuracy": 0.9413310676813126,
"num_tokens": 3427316.0,
"step": 1405
},
{
"epoch": 0.6243772832945865,
"grad_norm": 3.274460554122925,
"learning_rate": 0.0002,
"loss": 0.8903,
"mean_token_accuracy": 0.9350662767887116,
"num_tokens": 3438501.0,
"step": 1410
},
{
"epoch": 0.6265913871360567,
"grad_norm": 3.090731620788574,
"learning_rate": 0.0002,
"loss": 0.9255,
"mean_token_accuracy": 0.9328742384910583,
"num_tokens": 3451424.0,
"step": 1415
},
{
"epoch": 0.6288054909775268,
"grad_norm": 3.9713704586029053,
"learning_rate": 0.0002,
"loss": 0.7117,
"mean_token_accuracy": 0.9369514465332032,
"num_tokens": 3465061.0,
"step": 1420
},
{
"epoch": 0.631019594818997,
"grad_norm": 2.745424747467041,
"learning_rate": 0.0002,
"loss": 0.6563,
"mean_token_accuracy": 0.9424190640449523,
"num_tokens": 3479516.0,
"step": 1425
},
{
"epoch": 0.6332336986604672,
"grad_norm": 2.7794830799102783,
"learning_rate": 0.0002,
"loss": 0.6805,
"mean_token_accuracy": 0.9403312534093857,
"num_tokens": 3491631.0,
"step": 1430
},
{
"epoch": 0.6354478025019373,
"grad_norm": 3.3232924938201904,
"learning_rate": 0.0002,
"loss": 0.8676,
"mean_token_accuracy": 0.9338378489017487,
"num_tokens": 3502686.0,
"step": 1435
},
{
"epoch": 0.6376619063434075,
"grad_norm": 3.140780210494995,
"learning_rate": 0.0002,
"loss": 1.1598,
"mean_token_accuracy": 0.9237967163324357,
"num_tokens": 3512469.0,
"step": 1440
},
{
"epoch": 0.6398760101848777,
"grad_norm": 3.4723212718963623,
"learning_rate": 0.0002,
"loss": 0.9209,
"mean_token_accuracy": 0.9337589800357818,
"num_tokens": 3522936.0,
"step": 1445
},
{
"epoch": 0.6420901140263479,
"grad_norm": 2.442565441131592,
"learning_rate": 0.0002,
"loss": 0.9235,
"mean_token_accuracy": 0.9316159158945083,
"num_tokens": 3533969.0,
"step": 1450
},
{
"epoch": 0.644304217867818,
"grad_norm": 2.524017572402954,
"learning_rate": 0.0002,
"loss": 0.7758,
"mean_token_accuracy": 0.9413342833518982,
"num_tokens": 3547959.0,
"step": 1455
},
{
"epoch": 0.6465183217092881,
"grad_norm": 2.5085105895996094,
"learning_rate": 0.0002,
"loss": 0.9613,
"mean_token_accuracy": 0.9296864479780197,
"num_tokens": 3560649.0,
"step": 1460
},
{
"epoch": 0.6487324255507584,
"grad_norm": 3.036599636077881,
"learning_rate": 0.0002,
"loss": 1.0325,
"mean_token_accuracy": 0.9275905907154083,
"num_tokens": 3572059.0,
"step": 1465
},
{
"epoch": 0.6509465293922285,
"grad_norm": 3.0383479595184326,
"learning_rate": 0.0002,
"loss": 1.0154,
"mean_token_accuracy": 0.9247897952795029,
"num_tokens": 3585441.0,
"step": 1470
},
{
"epoch": 0.6531606332336987,
"grad_norm": 2.8595175743103027,
"learning_rate": 0.0002,
"loss": 0.6949,
"mean_token_accuracy": 0.9423891752958298,
"num_tokens": 3598995.0,
"step": 1475
},
{
"epoch": 0.6553747370751688,
"grad_norm": 2.770921230316162,
"learning_rate": 0.0002,
"loss": 1.1307,
"mean_token_accuracy": 0.9203568994998932,
"num_tokens": 3610296.0,
"step": 1480
},
{
"epoch": 0.6575888409166389,
"grad_norm": 3.6914687156677246,
"learning_rate": 0.0002,
"loss": 0.7949,
"mean_token_accuracy": 0.9348386436700821,
"num_tokens": 3624724.0,
"step": 1485
},
{
"epoch": 0.6598029447581092,
"grad_norm": 2.433919668197632,
"learning_rate": 0.0002,
"loss": 0.8382,
"mean_token_accuracy": 0.9350390166044236,
"num_tokens": 3635917.0,
"step": 1490
},
{
"epoch": 0.6620170485995793,
"grad_norm": 2.6883230209350586,
"learning_rate": 0.0002,
"loss": 1.0004,
"mean_token_accuracy": 0.9251396596431732,
"num_tokens": 3647238.0,
"step": 1495
},
{
"epoch": 0.6642311524410495,
"grad_norm": 2.9668235778808594,
"learning_rate": 0.0002,
"loss": 0.9062,
"mean_token_accuracy": 0.9337601840496064,
"num_tokens": 3658527.0,
"step": 1500
},
{
"epoch": 0.6664452562825196,
"grad_norm": 3.1381282806396484,
"learning_rate": 0.0002,
"loss": 0.9015,
"mean_token_accuracy": 0.9375191539525985,
"num_tokens": 3670617.0,
"step": 1505
},
{
"epoch": 0.6686593601239899,
"grad_norm": 2.365852117538452,
"learning_rate": 0.0002,
"loss": 0.8156,
"mean_token_accuracy": 0.9330903559923172,
"num_tokens": 3685340.0,
"step": 1510
},
{
"epoch": 0.67087346396546,
"grad_norm": 2.9032535552978516,
"learning_rate": 0.0002,
"loss": 0.9033,
"mean_token_accuracy": 0.9322319328784943,
"num_tokens": 3697378.0,
"step": 1515
},
{
"epoch": 0.6730875678069301,
"grad_norm": 2.333289861679077,
"learning_rate": 0.0002,
"loss": 0.9788,
"mean_token_accuracy": 0.9274383842945099,
"num_tokens": 3711043.0,
"step": 1520
},
{
"epoch": 0.6753016716484003,
"grad_norm": 2.7803232669830322,
"learning_rate": 0.0002,
"loss": 0.878,
"mean_token_accuracy": 0.9308006018400192,
"num_tokens": 3723413.0,
"step": 1525
},
{
"epoch": 0.6775157754898705,
"grad_norm": 2.559749126434326,
"learning_rate": 0.0002,
"loss": 0.9522,
"mean_token_accuracy": 0.9328515976667404,
"num_tokens": 3738329.0,
"step": 1530
},
{
"epoch": 0.6797298793313407,
"grad_norm": 2.448359489440918,
"learning_rate": 0.0002,
"loss": 0.8662,
"mean_token_accuracy": 0.9386651337146759,
"num_tokens": 3750510.0,
"step": 1535
},
{
"epoch": 0.6819439831728108,
"grad_norm": 2.5929195880889893,
"learning_rate": 0.0002,
"loss": 1.0518,
"mean_token_accuracy": 0.9254045516252518,
"num_tokens": 3761821.0,
"step": 1540
},
{
"epoch": 0.6841580870142809,
"grad_norm": 2.6473214626312256,
"learning_rate": 0.0002,
"loss": 0.6234,
"mean_token_accuracy": 0.9468908250331879,
"num_tokens": 3774252.0,
"step": 1545
},
{
"epoch": 0.6863721908557512,
"grad_norm": 3.025092840194702,
"learning_rate": 0.0002,
"loss": 1.1481,
"mean_token_accuracy": 0.9196543127298356,
"num_tokens": 3786710.0,
"step": 1550
},
{
"epoch": 0.6885862946972213,
"grad_norm": 2.7005512714385986,
"learning_rate": 0.0002,
"loss": 0.9283,
"mean_token_accuracy": 0.9315001249313355,
"num_tokens": 3797617.0,
"step": 1555
},
{
"epoch": 0.6908003985386915,
"grad_norm": 2.3178861141204834,
"learning_rate": 0.0002,
"loss": 0.8469,
"mean_token_accuracy": 0.932272481918335,
"num_tokens": 3811013.0,
"step": 1560
},
{
"epoch": 0.6930145023801616,
"grad_norm": 3.0088205337524414,
"learning_rate": 0.0002,
"loss": 0.9886,
"mean_token_accuracy": 0.9232943028211593,
"num_tokens": 3822926.0,
"step": 1565
},
{
"epoch": 0.6952286062216317,
"grad_norm": 2.413239002227783,
"learning_rate": 0.0002,
"loss": 0.6553,
"mean_token_accuracy": 0.9435460805892945,
"num_tokens": 3836364.0,
"step": 1570
},
{
"epoch": 0.697442710063102,
"grad_norm": 2.6605615615844727,
"learning_rate": 0.0002,
"loss": 1.0324,
"mean_token_accuracy": 0.9254931479692459,
"num_tokens": 3848950.0,
"step": 1575
},
{
"epoch": 0.6996568139045721,
"grad_norm": 2.1413521766662598,
"learning_rate": 0.0002,
"loss": 0.6661,
"mean_token_accuracy": 0.9437528550624847,
"num_tokens": 3863924.0,
"step": 1580
},
{
"epoch": 0.7018709177460423,
"grad_norm": 2.496495485305786,
"learning_rate": 0.0002,
"loss": 0.8324,
"mean_token_accuracy": 0.936608812212944,
"num_tokens": 3876219.0,
"step": 1585
},
{
"epoch": 0.7040850215875124,
"grad_norm": 3.7890663146972656,
"learning_rate": 0.0002,
"loss": 0.9902,
"mean_token_accuracy": 0.9294484287500382,
"num_tokens": 3886540.0,
"step": 1590
},
{
"epoch": 0.7062991254289827,
"grad_norm": 2.942206621170044,
"learning_rate": 0.0002,
"loss": 0.8423,
"mean_token_accuracy": 0.9380270838737488,
"num_tokens": 3899354.0,
"step": 1595
},
{
"epoch": 0.7085132292704528,
"grad_norm": 3.0008063316345215,
"learning_rate": 0.0002,
"loss": 1.0699,
"mean_token_accuracy": 0.9203650772571563,
"num_tokens": 3911622.0,
"step": 1600
},
{
"epoch": 0.7107273331119229,
"grad_norm": 2.285707950592041,
"learning_rate": 0.0002,
"loss": 0.8683,
"mean_token_accuracy": 0.931014335155487,
"num_tokens": 3923438.0,
"step": 1605
},
{
"epoch": 0.7129414369533931,
"grad_norm": 2.3685543537139893,
"learning_rate": 0.0002,
"loss": 0.8168,
"mean_token_accuracy": 0.9353397488594055,
"num_tokens": 3935444.0,
"step": 1610
},
{
"epoch": 0.7151555407948633,
"grad_norm": 3.0847818851470947,
"learning_rate": 0.0002,
"loss": 0.9466,
"mean_token_accuracy": 0.929938405752182,
"num_tokens": 3946607.0,
"step": 1615
},
{
"epoch": 0.7173696446363335,
"grad_norm": 3.0750293731689453,
"learning_rate": 0.0002,
"loss": 0.914,
"mean_token_accuracy": 0.9286134451627731,
"num_tokens": 3959199.0,
"step": 1620
},
{
"epoch": 0.7195837484778036,
"grad_norm": 3.4493777751922607,
"learning_rate": 0.0002,
"loss": 0.977,
"mean_token_accuracy": 0.919402825832367,
"num_tokens": 3970979.0,
"step": 1625
},
{
"epoch": 0.7217978523192737,
"grad_norm": 3.124067783355713,
"learning_rate": 0.0002,
"loss": 1.0367,
"mean_token_accuracy": 0.9251971215009689,
"num_tokens": 3981702.0,
"step": 1630
},
{
"epoch": 0.724011956160744,
"grad_norm": 2.45589017868042,
"learning_rate": 0.0002,
"loss": 0.8534,
"mean_token_accuracy": 0.9356551617383957,
"num_tokens": 3994027.0,
"step": 1635
},
{
"epoch": 0.7262260600022141,
"grad_norm": 3.4078500270843506,
"learning_rate": 0.0002,
"loss": 0.965,
"mean_token_accuracy": 0.9347460746765137,
"num_tokens": 4005820.0,
"step": 1640
},
{
"epoch": 0.7284401638436843,
"grad_norm": 2.2892725467681885,
"learning_rate": 0.0002,
"loss": 0.5592,
"mean_token_accuracy": 0.9513197064399719,
"num_tokens": 4018352.0,
"step": 1645
},
{
"epoch": 0.7306542676851544,
"grad_norm": 1.8147987127304077,
"learning_rate": 0.0002,
"loss": 0.6609,
"mean_token_accuracy": 0.941572979092598,
"num_tokens": 4031874.0,
"step": 1650
},
{
"epoch": 0.7328683715266247,
"grad_norm": 3.63505220413208,
"learning_rate": 0.0002,
"loss": 1.0721,
"mean_token_accuracy": 0.9192070156335831,
"num_tokens": 4042614.0,
"step": 1655
},
{
"epoch": 0.7350824753680948,
"grad_norm": 2.3137118816375732,
"learning_rate": 0.0002,
"loss": 0.7545,
"mean_token_accuracy": 0.9381762742996216,
"num_tokens": 4054608.0,
"step": 1660
},
{
"epoch": 0.7372965792095649,
"grad_norm": 3.7039380073547363,
"learning_rate": 0.0002,
"loss": 0.9273,
"mean_token_accuracy": 0.9306770205497742,
"num_tokens": 4065861.0,
"step": 1665
},
{
"epoch": 0.7395106830510351,
"grad_norm": 2.4405832290649414,
"learning_rate": 0.0002,
"loss": 0.8476,
"mean_token_accuracy": 0.9362583935260773,
"num_tokens": 4078523.0,
"step": 1670
},
{
"epoch": 0.7417247868925052,
"grad_norm": 2.31562876701355,
"learning_rate": 0.0002,
"loss": 0.7321,
"mean_token_accuracy": 0.9406876623630523,
"num_tokens": 4092330.0,
"step": 1675
},
{
"epoch": 0.7439388907339755,
"grad_norm": 2.7616567611694336,
"learning_rate": 0.0002,
"loss": 1.018,
"mean_token_accuracy": 0.9298185467720032,
"num_tokens": 4103159.0,
"step": 1680
},
{
"epoch": 0.7461529945754456,
"grad_norm": 2.3408303260803223,
"learning_rate": 0.0002,
"loss": 0.8177,
"mean_token_accuracy": 0.9400125861167907,
"num_tokens": 4116163.0,
"step": 1685
},
{
"epoch": 0.7483670984169157,
"grad_norm": 1.8495256900787354,
"learning_rate": 0.0002,
"loss": 0.7702,
"mean_token_accuracy": 0.9391000926494598,
"num_tokens": 4130911.0,
"step": 1690
},
{
"epoch": 0.7505812022583859,
"grad_norm": 2.5655927658081055,
"learning_rate": 0.0002,
"loss": 0.757,
"mean_token_accuracy": 0.9395717918872833,
"num_tokens": 4143931.0,
"step": 1695
},
{
"epoch": 0.7527953060998561,
"grad_norm": 3.2286360263824463,
"learning_rate": 0.0002,
"loss": 1.004,
"mean_token_accuracy": 0.927115085721016,
"num_tokens": 4155020.0,
"step": 1700
},
{
"epoch": 0.7550094099413263,
"grad_norm": 2.1214611530303955,
"learning_rate": 0.0002,
"loss": 0.9058,
"mean_token_accuracy": 0.938083803653717,
"num_tokens": 4167190.0,
"step": 1705
},
{
"epoch": 0.7572235137827964,
"grad_norm": 2.1883342266082764,
"learning_rate": 0.0002,
"loss": 0.9033,
"mean_token_accuracy": 0.9318992733955384,
"num_tokens": 4177955.0,
"step": 1710
},
{
"epoch": 0.7594376176242665,
"grad_norm": 2.774677038192749,
"learning_rate": 0.0002,
"loss": 0.8573,
"mean_token_accuracy": 0.9322766721248626,
"num_tokens": 4188804.0,
"step": 1715
},
{
"epoch": 0.7616517214657368,
"grad_norm": 2.4907023906707764,
"learning_rate": 0.0002,
"loss": 0.904,
"mean_token_accuracy": 0.931341353058815,
"num_tokens": 4201503.0,
"step": 1720
},
{
"epoch": 0.7638658253072069,
"grad_norm": 2.5578067302703857,
"learning_rate": 0.0002,
"loss": 0.7897,
"mean_token_accuracy": 0.937814000248909,
"num_tokens": 4212954.0,
"step": 1725
},
{
"epoch": 0.7660799291486771,
"grad_norm": 3.0754973888397217,
"learning_rate": 0.0002,
"loss": 0.9264,
"mean_token_accuracy": 0.9291175544261933,
"num_tokens": 4224571.0,
"step": 1730
},
{
"epoch": 0.7682940329901472,
"grad_norm": 2.0344362258911133,
"learning_rate": 0.0002,
"loss": 0.9195,
"mean_token_accuracy": 0.9382949858903885,
"num_tokens": 4240166.0,
"step": 1735
},
{
"epoch": 0.7705081368316175,
"grad_norm": 2.706178903579712,
"learning_rate": 0.0002,
"loss": 0.8708,
"mean_token_accuracy": 0.9349207997322082,
"num_tokens": 4252583.0,
"step": 1740
},
{
"epoch": 0.7727222406730876,
"grad_norm": 2.2786762714385986,
"learning_rate": 0.0002,
"loss": 0.8403,
"mean_token_accuracy": 0.926530522108078,
"num_tokens": 4263446.0,
"step": 1745
},
{
"epoch": 0.7749363445145577,
"grad_norm": 2.330183506011963,
"learning_rate": 0.0002,
"loss": 0.7518,
"mean_token_accuracy": 0.9388490498065949,
"num_tokens": 4275714.0,
"step": 1750
},
{
"epoch": 0.7771504483560279,
"grad_norm": 2.4576809406280518,
"learning_rate": 0.0002,
"loss": 0.7374,
"mean_token_accuracy": 0.9381652891635894,
"num_tokens": 4289168.0,
"step": 1755
},
{
"epoch": 0.7793645521974981,
"grad_norm": 2.1529836654663086,
"learning_rate": 0.0002,
"loss": 0.9973,
"mean_token_accuracy": 0.9245061188936233,
"num_tokens": 4301644.0,
"step": 1760
},
{
"epoch": 0.7815786560389683,
"grad_norm": 2.188100576400757,
"learning_rate": 0.0002,
"loss": 0.8327,
"mean_token_accuracy": 0.9336588770151139,
"num_tokens": 4313516.0,
"step": 1765
},
{
"epoch": 0.7837927598804384,
"grad_norm": 2.1842052936553955,
"learning_rate": 0.0002,
"loss": 0.734,
"mean_token_accuracy": 0.9438644349575043,
"num_tokens": 4325557.0,
"step": 1770
},
{
"epoch": 0.7860068637219085,
"grad_norm": 2.2100729942321777,
"learning_rate": 0.0002,
"loss": 0.8849,
"mean_token_accuracy": 0.9334080815315247,
"num_tokens": 4338260.0,
"step": 1775
},
{
"epoch": 0.7882209675633787,
"grad_norm": 2.2355990409851074,
"learning_rate": 0.0002,
"loss": 0.8919,
"mean_token_accuracy": 0.9319656908512115,
"num_tokens": 4352238.0,
"step": 1780
},
{
"epoch": 0.7904350714048489,
"grad_norm": 2.0506389141082764,
"learning_rate": 0.0002,
"loss": 0.8144,
"mean_token_accuracy": 0.9333805292844772,
"num_tokens": 4365565.0,
"step": 1785
},
{
"epoch": 0.7926491752463191,
"grad_norm": 2.5267720222473145,
"learning_rate": 0.0002,
"loss": 1.0332,
"mean_token_accuracy": 0.9245569318532944,
"num_tokens": 4376973.0,
"step": 1790
},
{
"epoch": 0.7948632790877892,
"grad_norm": 2.453788995742798,
"learning_rate": 0.0002,
"loss": 0.7972,
"mean_token_accuracy": 0.9332748234272004,
"num_tokens": 4390102.0,
"step": 1795
},
{
"epoch": 0.7970773829292593,
"grad_norm": 2.7728281021118164,
"learning_rate": 0.0002,
"loss": 0.9427,
"mean_token_accuracy": 0.9322525978088378,
"num_tokens": 4401357.0,
"step": 1800
},
{
"epoch": 0.7992914867707296,
"grad_norm": 3.114647388458252,
"learning_rate": 0.0002,
"loss": 1.0359,
"mean_token_accuracy": 0.9259240895509719,
"num_tokens": 4411905.0,
"step": 1805
},
{
"epoch": 0.8015055906121997,
"grad_norm": 3.4858386516571045,
"learning_rate": 0.0002,
"loss": 1.0477,
"mean_token_accuracy": 0.9230340659618378,
"num_tokens": 4422838.0,
"step": 1810
},
{
"epoch": 0.8037196944536699,
"grad_norm": 2.627652645111084,
"learning_rate": 0.0002,
"loss": 0.9524,
"mean_token_accuracy": 0.9306756138801575,
"num_tokens": 4435022.0,
"step": 1815
},
{
"epoch": 0.80593379829514,
"grad_norm": 3.016364336013794,
"learning_rate": 0.0002,
"loss": 0.7447,
"mean_token_accuracy": 0.9370788335800171,
"num_tokens": 4447750.0,
"step": 1820
},
{
"epoch": 0.8081479021366103,
"grad_norm": 3.563826560974121,
"learning_rate": 0.0002,
"loss": 0.8516,
"mean_token_accuracy": 0.9333288490772247,
"num_tokens": 4460185.0,
"step": 1825
},
{
"epoch": 0.8103620059780804,
"grad_norm": 3.726036310195923,
"learning_rate": 0.0002,
"loss": 0.9291,
"mean_token_accuracy": 0.930818784236908,
"num_tokens": 4472630.0,
"step": 1830
},
{
"epoch": 0.8125761098195505,
"grad_norm": 2.23213267326355,
"learning_rate": 0.0002,
"loss": 0.913,
"mean_token_accuracy": 0.9309399574995041,
"num_tokens": 4485328.0,
"step": 1835
},
{
"epoch": 0.8147902136610207,
"grad_norm": 3.263636827468872,
"learning_rate": 0.0002,
"loss": 0.9267,
"mean_token_accuracy": 0.9323295533657074,
"num_tokens": 4496151.0,
"step": 1840
},
{
"epoch": 0.8170043175024909,
"grad_norm": 1.8619623184204102,
"learning_rate": 0.0002,
"loss": 0.7147,
"mean_token_accuracy": 0.9434016287326813,
"num_tokens": 4509519.0,
"step": 1845
},
{
"epoch": 0.8192184213439611,
"grad_norm": 3.046086072921753,
"learning_rate": 0.0002,
"loss": 1.0037,
"mean_token_accuracy": 0.9272881835699082,
"num_tokens": 4521094.0,
"step": 1850
},
{
"epoch": 0.8214325251854312,
"grad_norm": 2.7041449546813965,
"learning_rate": 0.0002,
"loss": 0.8851,
"mean_token_accuracy": 0.93080253303051,
"num_tokens": 4533460.0,
"step": 1855
},
{
"epoch": 0.8236466290269013,
"grad_norm": 2.374342679977417,
"learning_rate": 0.0002,
"loss": 0.8168,
"mean_token_accuracy": 0.9408663511276245,
"num_tokens": 4545453.0,
"step": 1860
},
{
"epoch": 0.8258607328683715,
"grad_norm": 2.5304906368255615,
"learning_rate": 0.0002,
"loss": 0.7315,
"mean_token_accuracy": 0.9390550851821899,
"num_tokens": 4558017.0,
"step": 1865
},
{
"epoch": 0.8280748367098417,
"grad_norm": 3.1711394786834717,
"learning_rate": 0.0002,
"loss": 0.8645,
"mean_token_accuracy": 0.9297717779874801,
"num_tokens": 4569514.0,
"step": 1870
},
{
"epoch": 0.8302889405513119,
"grad_norm": 3.0447299480438232,
"learning_rate": 0.0002,
"loss": 0.8618,
"mean_token_accuracy": 0.9313441842794419,
"num_tokens": 4579542.0,
"step": 1875
},
{
"epoch": 0.832503044392782,
"grad_norm": 2.770129680633545,
"learning_rate": 0.0002,
"loss": 0.8299,
"mean_token_accuracy": 0.9362069517374039,
"num_tokens": 4592482.0,
"step": 1880
},
{
"epoch": 0.8347171482342521,
"grad_norm": 2.4876534938812256,
"learning_rate": 0.0002,
"loss": 0.8511,
"mean_token_accuracy": 0.9381607830524444,
"num_tokens": 4605153.0,
"step": 1885
},
{
"epoch": 0.8369312520757224,
"grad_norm": 1.9146308898925781,
"learning_rate": 0.0002,
"loss": 0.8834,
"mean_token_accuracy": 0.938035500049591,
"num_tokens": 4617506.0,
"step": 1890
},
{
"epoch": 0.8391453559171925,
"grad_norm": 2.0652332305908203,
"learning_rate": 0.0002,
"loss": 0.7992,
"mean_token_accuracy": 0.935004535317421,
"num_tokens": 4632287.0,
"step": 1895
},
{
"epoch": 0.8413594597586627,
"grad_norm": 2.6872732639312744,
"learning_rate": 0.0002,
"loss": 0.7629,
"mean_token_accuracy": 0.9380116105079651,
"num_tokens": 4645193.0,
"step": 1900
},
{
"epoch": 0.8435735636001328,
"grad_norm": 2.857466220855713,
"learning_rate": 0.0002,
"loss": 0.8405,
"mean_token_accuracy": 0.934279152750969,
"num_tokens": 4656636.0,
"step": 1905
},
{
"epoch": 0.845787667441603,
"grad_norm": 2.7037603855133057,
"learning_rate": 0.0002,
"loss": 0.8591,
"mean_token_accuracy": 0.9366025865077973,
"num_tokens": 4669629.0,
"step": 1910
},
{
"epoch": 0.8480017712830732,
"grad_norm": 2.5019657611846924,
"learning_rate": 0.0002,
"loss": 1.054,
"mean_token_accuracy": 0.9281669825315475,
"num_tokens": 4682395.0,
"step": 1915
},
{
"epoch": 0.8502158751245433,
"grad_norm": 2.6266391277313232,
"learning_rate": 0.0002,
"loss": 0.8131,
"mean_token_accuracy": 0.9354126363992691,
"num_tokens": 4695518.0,
"step": 1920
},
{
"epoch": 0.8524299789660135,
"grad_norm": 2.138951301574707,
"learning_rate": 0.0002,
"loss": 0.8369,
"mean_token_accuracy": 0.9329825401306152,
"num_tokens": 4708226.0,
"step": 1925
},
{
"epoch": 0.8546440828074837,
"grad_norm": 2.910318374633789,
"learning_rate": 0.0002,
"loss": 0.928,
"mean_token_accuracy": 0.9311651080846787,
"num_tokens": 4718331.0,
"step": 1930
},
{
"epoch": 0.8568581866489539,
"grad_norm": 3.454087734222412,
"learning_rate": 0.0002,
"loss": 0.878,
"mean_token_accuracy": 0.9374223858118057,
"num_tokens": 4731233.0,
"step": 1935
},
{
"epoch": 0.859072290490424,
"grad_norm": 2.537177085876465,
"learning_rate": 0.0002,
"loss": 0.7976,
"mean_token_accuracy": 0.9375020027160644,
"num_tokens": 4743112.0,
"step": 1940
},
{
"epoch": 0.8612863943318941,
"grad_norm": 2.521338701248169,
"learning_rate": 0.0002,
"loss": 0.8091,
"mean_token_accuracy": 0.938329017162323,
"num_tokens": 4755570.0,
"step": 1945
},
{
"epoch": 0.8635004981733644,
"grad_norm": 2.104426622390747,
"learning_rate": 0.0002,
"loss": 0.9058,
"mean_token_accuracy": 0.9326849579811096,
"num_tokens": 4767535.0,
"step": 1950
},
{
"epoch": 0.8657146020148345,
"grad_norm": 2.7374699115753174,
"learning_rate": 0.0002,
"loss": 0.8472,
"mean_token_accuracy": 0.9329976707696914,
"num_tokens": 4777888.0,
"step": 1955
},
{
"epoch": 0.8679287058563047,
"grad_norm": 3.3029234409332275,
"learning_rate": 0.0002,
"loss": 0.8242,
"mean_token_accuracy": 0.9354960173368454,
"num_tokens": 4791838.0,
"step": 1960
},
{
"epoch": 0.8701428096977748,
"grad_norm": 2.2211055755615234,
"learning_rate": 0.0002,
"loss": 0.879,
"mean_token_accuracy": 0.9325517565011978,
"num_tokens": 4803626.0,
"step": 1965
},
{
"epoch": 0.8723569135392449,
"grad_norm": 2.8065617084503174,
"learning_rate": 0.0002,
"loss": 1.0319,
"mean_token_accuracy": 0.9292825996875763,
"num_tokens": 4817090.0,
"step": 1970
},
{
"epoch": 0.8745710173807152,
"grad_norm": 2.932598352432251,
"learning_rate": 0.0002,
"loss": 0.8184,
"mean_token_accuracy": 0.9353422105312348,
"num_tokens": 4831134.0,
"step": 1975
},
{
"epoch": 0.8767851212221853,
"grad_norm": 2.5619053840637207,
"learning_rate": 0.0002,
"loss": 0.6391,
"mean_token_accuracy": 0.9429128289222717,
"num_tokens": 4843766.0,
"step": 1980
},
{
"epoch": 0.8789992250636555,
"grad_norm": 2.2597715854644775,
"learning_rate": 0.0002,
"loss": 0.8963,
"mean_token_accuracy": 0.9304134607315063,
"num_tokens": 4856458.0,
"step": 1985
},
{
"epoch": 0.8812133289051256,
"grad_norm": 1.9793012142181396,
"learning_rate": 0.0002,
"loss": 0.7251,
"mean_token_accuracy": 0.939424803853035,
"num_tokens": 4868615.0,
"step": 1990
},
{
"epoch": 0.8834274327465959,
"grad_norm": 2.075303554534912,
"learning_rate": 0.0002,
"loss": 0.8271,
"mean_token_accuracy": 0.9326154798269272,
"num_tokens": 4880572.0,
"step": 1995
},
{
"epoch": 0.885641536588066,
"grad_norm": 3.116805076599121,
"learning_rate": 0.0002,
"loss": 0.9112,
"mean_token_accuracy": 0.9324250787496566,
"num_tokens": 4893492.0,
"step": 2000
},
{
"epoch": 0.8878556404295361,
"grad_norm": 2.752161741256714,
"learning_rate": 0.0002,
"loss": 1.0983,
"mean_token_accuracy": 0.9237021476030349,
"num_tokens": 4904509.0,
"step": 2005
},
{
"epoch": 0.8900697442710063,
"grad_norm": 2.598949670791626,
"learning_rate": 0.0002,
"loss": 0.8562,
"mean_token_accuracy": 0.9328458935022355,
"num_tokens": 4918130.0,
"step": 2010
},
{
"epoch": 0.8922838481124765,
"grad_norm": 2.5332608222961426,
"learning_rate": 0.0002,
"loss": 0.8527,
"mean_token_accuracy": 0.9309366434812546,
"num_tokens": 4931211.0,
"step": 2015
},
{
"epoch": 0.8944979519539467,
"grad_norm": 2.683284044265747,
"learning_rate": 0.0002,
"loss": 0.9261,
"mean_token_accuracy": 0.9301638215780258,
"num_tokens": 4943604.0,
"step": 2020
},
{
"epoch": 0.8967120557954168,
"grad_norm": 2.730400323867798,
"learning_rate": 0.0002,
"loss": 1.0182,
"mean_token_accuracy": 0.9268926858901978,
"num_tokens": 4954734.0,
"step": 2025
},
{
"epoch": 0.8989261596368869,
"grad_norm": 3.0371506214141846,
"learning_rate": 0.0002,
"loss": 0.8396,
"mean_token_accuracy": 0.9345870792865754,
"num_tokens": 4967331.0,
"step": 2030
},
{
"epoch": 0.9011402634783572,
"grad_norm": 2.1169416904449463,
"learning_rate": 0.0002,
"loss": 0.7571,
"mean_token_accuracy": 0.937484648823738,
"num_tokens": 4980777.0,
"step": 2035
},
{
"epoch": 0.9033543673198273,
"grad_norm": 2.2946715354919434,
"learning_rate": 0.0002,
"loss": 0.6903,
"mean_token_accuracy": 0.9432729125022888,
"num_tokens": 4993590.0,
"step": 2040
},
{
"epoch": 0.9055684711612975,
"grad_norm": 2.3784878253936768,
"learning_rate": 0.0002,
"loss": 0.8577,
"mean_token_accuracy": 0.9391710489988327,
"num_tokens": 5006396.0,
"step": 2045
},
{
"epoch": 0.9077825750027676,
"grad_norm": 2.185091495513916,
"learning_rate": 0.0002,
"loss": 0.9413,
"mean_token_accuracy": 0.928019043803215,
"num_tokens": 5018793.0,
"step": 2050
},
{
"epoch": 0.9099966788442378,
"grad_norm": 2.835411787033081,
"learning_rate": 0.0002,
"loss": 0.8637,
"mean_token_accuracy": 0.934612587094307,
"num_tokens": 5029806.0,
"step": 2055
},
{
"epoch": 0.912210782685708,
"grad_norm": 2.276442766189575,
"learning_rate": 0.0002,
"loss": 0.8431,
"mean_token_accuracy": 0.9366135746240616,
"num_tokens": 5042248.0,
"step": 2060
},
{
"epoch": 0.9144248865271781,
"grad_norm": 1.937154769897461,
"learning_rate": 0.0002,
"loss": 1.23,
"mean_token_accuracy": 0.918472969532013,
"num_tokens": 5053886.0,
"step": 2065
},
{
"epoch": 0.9166389903686483,
"grad_norm": 2.3872339725494385,
"learning_rate": 0.0002,
"loss": 0.8882,
"mean_token_accuracy": 0.9267417550086975,
"num_tokens": 5063900.0,
"step": 2070
},
{
"epoch": 0.9188530942101184,
"grad_norm": 2.7894115447998047,
"learning_rate": 0.0002,
"loss": 0.7592,
"mean_token_accuracy": 0.9337449461221695,
"num_tokens": 5076904.0,
"step": 2075
},
{
"epoch": 0.9210671980515887,
"grad_norm": 1.7611744403839111,
"learning_rate": 0.0002,
"loss": 0.5414,
"mean_token_accuracy": 0.9498718023300171,
"num_tokens": 5092582.0,
"step": 2080
},
{
"epoch": 0.9232813018930588,
"grad_norm": 1.8192365169525146,
"learning_rate": 0.0002,
"loss": 0.6292,
"mean_token_accuracy": 0.9412597447633744,
"num_tokens": 5105494.0,
"step": 2085
},
{
"epoch": 0.9254954057345289,
"grad_norm": 2.696876049041748,
"learning_rate": 0.0002,
"loss": 0.7313,
"mean_token_accuracy": 0.9369488745927811,
"num_tokens": 5117554.0,
"step": 2090
},
{
"epoch": 0.9277095095759991,
"grad_norm": 2.514218330383301,
"learning_rate": 0.0002,
"loss": 0.7942,
"mean_token_accuracy": 0.9363553553819657,
"num_tokens": 5129486.0,
"step": 2095
},
{
"epoch": 0.9299236134174693,
"grad_norm": 2.5048184394836426,
"learning_rate": 0.0002,
"loss": 0.887,
"mean_token_accuracy": 0.9345066547393799,
"num_tokens": 5141296.0,
"step": 2100
},
{
"epoch": 0.9321377172589395,
"grad_norm": 2.470578193664551,
"learning_rate": 0.0002,
"loss": 0.7653,
"mean_token_accuracy": 0.941747522354126,
"num_tokens": 5153435.0,
"step": 2105
},
{
"epoch": 0.9343518211004096,
"grad_norm": 4.520934581756592,
"learning_rate": 0.0002,
"loss": 0.6472,
"mean_token_accuracy": 0.9441186487674713,
"num_tokens": 5168123.0,
"step": 2110
},
{
"epoch": 0.9365659249418797,
"grad_norm": 2.874882936477661,
"learning_rate": 0.0002,
"loss": 0.9228,
"mean_token_accuracy": 0.9303671330213547,
"num_tokens": 5180846.0,
"step": 2115
},
{
"epoch": 0.93878002878335,
"grad_norm": 2.4952664375305176,
"learning_rate": 0.0002,
"loss": 0.6372,
"mean_token_accuracy": 0.9446426779031754,
"num_tokens": 5194790.0,
"step": 2120
},
{
"epoch": 0.9409941326248201,
"grad_norm": 2.4000353813171387,
"learning_rate": 0.0002,
"loss": 0.9386,
"mean_token_accuracy": 0.9289501368999481,
"num_tokens": 5206868.0,
"step": 2125
},
{
"epoch": 0.9432082364662903,
"grad_norm": 3.2110652923583984,
"learning_rate": 0.0002,
"loss": 1.029,
"mean_token_accuracy": 0.924631980061531,
"num_tokens": 5218375.0,
"step": 2130
},
{
"epoch": 0.9454223403077604,
"grad_norm": 2.351478099822998,
"learning_rate": 0.0002,
"loss": 0.8956,
"mean_token_accuracy": 0.9299973398447037,
"num_tokens": 5230679.0,
"step": 2135
},
{
"epoch": 0.9476364441492307,
"grad_norm": 2.0615413188934326,
"learning_rate": 0.0002,
"loss": 0.8124,
"mean_token_accuracy": 0.9377152562141419,
"num_tokens": 5243273.0,
"step": 2140
},
{
"epoch": 0.9498505479907008,
"grad_norm": 2.804684638977051,
"learning_rate": 0.0002,
"loss": 0.8106,
"mean_token_accuracy": 0.9386685341596603,
"num_tokens": 5256283.0,
"step": 2145
},
{
"epoch": 0.9520646518321709,
"grad_norm": 2.3394579887390137,
"learning_rate": 0.0002,
"loss": 0.7158,
"mean_token_accuracy": 0.9425580680370331,
"num_tokens": 5268819.0,
"step": 2150
},
{
"epoch": 0.9542787556736411,
"grad_norm": 3.8045785427093506,
"learning_rate": 0.0002,
"loss": 0.8811,
"mean_token_accuracy": 0.9341193944215774,
"num_tokens": 5280796.0,
"step": 2155
},
{
"epoch": 0.9564928595151112,
"grad_norm": 3.4269163608551025,
"learning_rate": 0.0002,
"loss": 0.8996,
"mean_token_accuracy": 0.9318024456501007,
"num_tokens": 5293545.0,
"step": 2160
},
{
"epoch": 0.9587069633565815,
"grad_norm": 2.0108461380004883,
"learning_rate": 0.0002,
"loss": 0.6059,
"mean_token_accuracy": 0.9463658452033996,
"num_tokens": 5307387.0,
"step": 2165
},
{
"epoch": 0.9609210671980516,
"grad_norm": 2.5608201026916504,
"learning_rate": 0.0002,
"loss": 0.6557,
"mean_token_accuracy": 0.9420038640499115,
"num_tokens": 5320215.0,
"step": 2170
},
{
"epoch": 0.9631351710395217,
"grad_norm": 3.3502604961395264,
"learning_rate": 0.0002,
"loss": 1.0185,
"mean_token_accuracy": 0.9242115944623948,
"num_tokens": 5330521.0,
"step": 2175
},
{
"epoch": 0.9653492748809919,
"grad_norm": 2.2960116863250732,
"learning_rate": 0.0002,
"loss": 0.7987,
"mean_token_accuracy": 0.9346065133810043,
"num_tokens": 5341552.0,
"step": 2180
},
{
"epoch": 0.9675633787224621,
"grad_norm": 2.166372060775757,
"learning_rate": 0.0002,
"loss": 0.5427,
"mean_token_accuracy": 0.9495539724826813,
"num_tokens": 5355653.0,
"step": 2185
},
{
"epoch": 0.9697774825639323,
"grad_norm": 1.9363880157470703,
"learning_rate": 0.0002,
"loss": 0.8866,
"mean_token_accuracy": 0.9348477244377136,
"num_tokens": 5370025.0,
"step": 2190
},
{
"epoch": 0.9719915864054024,
"grad_norm": 2.699810028076172,
"learning_rate": 0.0002,
"loss": 0.9583,
"mean_token_accuracy": 0.9283025532960891,
"num_tokens": 5380549.0,
"step": 2195
},
{
"epoch": 0.9742056902468725,
"grad_norm": 2.2296714782714844,
"learning_rate": 0.0002,
"loss": 0.9482,
"mean_token_accuracy": 0.9271619528532028,
"num_tokens": 5393080.0,
"step": 2200
},
{
"epoch": 0.9764197940883428,
"grad_norm": 2.4104833602905273,
"learning_rate": 0.0002,
"loss": 0.8761,
"mean_token_accuracy": 0.9381742179393768,
"num_tokens": 5404547.0,
"step": 2205
},
{
"epoch": 0.9786338979298129,
"grad_norm": 2.5453920364379883,
"learning_rate": 0.0002,
"loss": 0.8772,
"mean_token_accuracy": 0.9323698520660401,
"num_tokens": 5416684.0,
"step": 2210
},
{
"epoch": 0.9808480017712831,
"grad_norm": 2.8525002002716064,
"learning_rate": 0.0002,
"loss": 0.8593,
"mean_token_accuracy": 0.9280822277069092,
"num_tokens": 5427876.0,
"step": 2215
},
{
"epoch": 0.9830621056127532,
"grad_norm": 2.7001919746398926,
"learning_rate": 0.0002,
"loss": 0.7549,
"mean_token_accuracy": 0.9385748893022537,
"num_tokens": 5438257.0,
"step": 2220
},
{
"epoch": 0.9852762094542235,
"grad_norm": 3.1060454845428467,
"learning_rate": 0.0002,
"loss": 1.1347,
"mean_token_accuracy": 0.918604564666748,
"num_tokens": 5449833.0,
"step": 2225
},
{
"epoch": 0.9874903132956936,
"grad_norm": 2.4557158946990967,
"learning_rate": 0.0002,
"loss": 0.7848,
"mean_token_accuracy": 0.9370519310235977,
"num_tokens": 5463070.0,
"step": 2230
},
{
"epoch": 0.9897044171371637,
"grad_norm": 3.7874040603637695,
"learning_rate": 0.0002,
"loss": 0.9189,
"mean_token_accuracy": 0.929209041595459,
"num_tokens": 5474371.0,
"step": 2235
},
{
"epoch": 0.9919185209786339,
"grad_norm": 2.745861053466797,
"learning_rate": 0.0002,
"loss": 1.0186,
"mean_token_accuracy": 0.9253447771072387,
"num_tokens": 5487549.0,
"step": 2240
},
{
"epoch": 0.9941326248201041,
"grad_norm": 2.013324022293091,
"learning_rate": 0.0002,
"loss": 0.7117,
"mean_token_accuracy": 0.9431297659873963,
"num_tokens": 5502064.0,
"step": 2245
},
{
"epoch": 0.9963467286615743,
"grad_norm": 2.179727792739868,
"learning_rate": 0.0002,
"loss": 0.7638,
"mean_token_accuracy": 0.9363548696041107,
"num_tokens": 5514455.0,
"step": 2250
},
{
"epoch": 0.9985608325030444,
"grad_norm": 1.9565762281417847,
"learning_rate": 0.0002,
"loss": 1.0294,
"mean_token_accuracy": 0.9255888283252716,
"num_tokens": 5526347.0,
"step": 2255
}
],
"logging_steps": 5,
"max_steps": 2258,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.430437089762216e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
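
The sketch below is not part of the trainer state itself; it is a minimal, illustrative Python snippet showing how the log_history entries above (step, loss, mean_token_accuracy, logged every 5 steps) could be loaded and plotted. The filename "trainer_state.json" and the output path "loss_curve.png" are assumptions for the example, not values taken from this file.

# Minimal sketch, assuming this file is saved locally as "trainer_state.json".
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (every logged step does here).
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("Adapter fine-tune loss (logging_steps=5)")
plt.savefig("loss_curve.png")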