| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.287925696594428, | |
| "eval_steps": 500, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.030959752321981424, | |
| "grad_norm": 9.773098945617676, | |
| "learning_rate": 2.991640866873065e-05, | |
| "loss": 9.245, | |
| "mean_token_accuracy": 0.2123243510723114, | |
| "num_tokens": 5327.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06191950464396285, | |
| "grad_norm": 3.128091335296631, | |
| "learning_rate": 2.9823529411764707e-05, | |
| "loss": 6.1602, | |
| "mean_token_accuracy": 0.25044268518686297, | |
| "num_tokens": 10795.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09287925696594428, | |
| "grad_norm": 4.203906536102295, | |
| "learning_rate": 2.973065015479876e-05, | |
| "loss": 5.6948, | |
| "mean_token_accuracy": 0.262071692943573, | |
| "num_tokens": 16240.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1238390092879257, | |
| "grad_norm": 4.233511924743652, | |
| "learning_rate": 2.9637770897832817e-05, | |
| "loss": 5.2733, | |
| "mean_token_accuracy": 0.27527774721384046, | |
| "num_tokens": 21582.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.15479876160990713, | |
| "grad_norm": 7.080459117889404, | |
| "learning_rate": 2.9544891640866874e-05, | |
| "loss": 4.9304, | |
| "mean_token_accuracy": 0.2877007365226746, | |
| "num_tokens": 27142.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.18575851393188855, | |
| "grad_norm": 7.273204326629639, | |
| "learning_rate": 2.945201238390093e-05, | |
| "loss": 4.689, | |
| "mean_token_accuracy": 0.28890604972839357, | |
| "num_tokens": 32801.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.21671826625386997, | |
| "grad_norm": 2.2185206413269043, | |
| "learning_rate": 2.9359133126934984e-05, | |
| "loss": 4.3965, | |
| "mean_token_accuracy": 0.28117197155952456, | |
| "num_tokens": 38472.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2476780185758514, | |
| "grad_norm": 2.0464794635772705, | |
| "learning_rate": 2.926625386996904e-05, | |
| "loss": 4.062, | |
| "mean_token_accuracy": 0.299494668841362, | |
| "num_tokens": 43743.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2786377708978328, | |
| "grad_norm": 1.633155345916748, | |
| "learning_rate": 2.9173374613003097e-05, | |
| "loss": 4.0378, | |
| "mean_token_accuracy": 0.30819864571094513, | |
| "num_tokens": 49087.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.30959752321981426, | |
| "grad_norm": 1.4594247341156006, | |
| "learning_rate": 2.908049535603715e-05, | |
| "loss": 3.8513, | |
| "mean_token_accuracy": 0.3258361428976059, | |
| "num_tokens": 54433.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.34055727554179566, | |
| "grad_norm": 1.5312635898590088, | |
| "learning_rate": 2.898761609907121e-05, | |
| "loss": 3.9162, | |
| "mean_token_accuracy": 0.32140363454818727, | |
| "num_tokens": 59629.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3715170278637771, | |
| "grad_norm": 1.3190491199493408, | |
| "learning_rate": 2.8894736842105263e-05, | |
| "loss": 3.902, | |
| "mean_token_accuracy": 0.3103078156709671, | |
| "num_tokens": 65326.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4024767801857585, | |
| "grad_norm": 1.6095689535140991, | |
| "learning_rate": 2.880185758513932e-05, | |
| "loss": 3.7107, | |
| "mean_token_accuracy": 0.3353793561458588, | |
| "num_tokens": 70440.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.43343653250773995, | |
| "grad_norm": 1.6634972095489502, | |
| "learning_rate": 2.8708978328173377e-05, | |
| "loss": 3.7747, | |
| "mean_token_accuracy": 0.3298566401004791, | |
| "num_tokens": 75712.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.46439628482972134, | |
| "grad_norm": 1.3906605243682861, | |
| "learning_rate": 2.861609907120743e-05, | |
| "loss": 3.7344, | |
| "mean_token_accuracy": 0.34043932259082793, | |
| "num_tokens": 81272.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4953560371517028, | |
| "grad_norm": 1.6273926496505737, | |
| "learning_rate": 2.8523219814241487e-05, | |
| "loss": 3.6722, | |
| "mean_token_accuracy": 0.33802524507045745, | |
| "num_tokens": 86836.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 1.595566987991333, | |
| "learning_rate": 2.8430340557275543e-05, | |
| "loss": 3.5486, | |
| "mean_token_accuracy": 0.36929037272930143, | |
| "num_tokens": 91622.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5572755417956656, | |
| "grad_norm": 1.9571454524993896, | |
| "learning_rate": 2.83374613003096e-05, | |
| "loss": 3.6849, | |
| "mean_token_accuracy": 0.3387055486440659, | |
| "num_tokens": 97019.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 1.6203333139419556, | |
| "learning_rate": 2.8244582043343653e-05, | |
| "loss": 3.5592, | |
| "mean_token_accuracy": 0.36260710954666137, | |
| "num_tokens": 102273.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6191950464396285, | |
| "grad_norm": 1.8625439405441284, | |
| "learning_rate": 2.815170278637771e-05, | |
| "loss": 3.4542, | |
| "mean_token_accuracy": 0.3554231733083725, | |
| "num_tokens": 107847.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6501547987616099, | |
| "grad_norm": 1.5171610116958618, | |
| "learning_rate": 2.8058823529411766e-05, | |
| "loss": 3.6914, | |
| "mean_token_accuracy": 0.3506886214017868, | |
| "num_tokens": 113499.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6811145510835913, | |
| "grad_norm": 1.465408205986023, | |
| "learning_rate": 2.796594427244582e-05, | |
| "loss": 3.6008, | |
| "mean_token_accuracy": 0.3558589071035385, | |
| "num_tokens": 119014.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7120743034055728, | |
| "grad_norm": 1.5382874011993408, | |
| "learning_rate": 2.787306501547988e-05, | |
| "loss": 3.5375, | |
| "mean_token_accuracy": 0.3548148155212402, | |
| "num_tokens": 124170.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7430340557275542, | |
| "grad_norm": 1.773881196975708, | |
| "learning_rate": 2.7780185758513933e-05, | |
| "loss": 3.573, | |
| "mean_token_accuracy": 0.3465736091136932, | |
| "num_tokens": 129487.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7739938080495357, | |
| "grad_norm": 1.7652744054794312, | |
| "learning_rate": 2.7687306501547986e-05, | |
| "loss": 3.6811, | |
| "mean_token_accuracy": 0.33623204231262205, | |
| "num_tokens": 135007.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.804953560371517, | |
| "grad_norm": 1.7662419080734253, | |
| "learning_rate": 2.7594427244582046e-05, | |
| "loss": 3.505, | |
| "mean_token_accuracy": 0.3567329585552216, | |
| "num_tokens": 140143.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8359133126934984, | |
| "grad_norm": 1.9441474676132202, | |
| "learning_rate": 2.75015479876161e-05, | |
| "loss": 3.4804, | |
| "mean_token_accuracy": 0.36218210160732267, | |
| "num_tokens": 145363.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8668730650154799, | |
| "grad_norm": 1.745896816253662, | |
| "learning_rate": 2.7408668730650156e-05, | |
| "loss": 3.6519, | |
| "mean_token_accuracy": 0.34941086173057556, | |
| "num_tokens": 150840.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8978328173374613, | |
| "grad_norm": 1.928284764289856, | |
| "learning_rate": 2.7315789473684213e-05, | |
| "loss": 3.6138, | |
| "mean_token_accuracy": 0.34826839864254, | |
| "num_tokens": 156077.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9287925696594427, | |
| "grad_norm": 2.177100896835327, | |
| "learning_rate": 2.722291021671827e-05, | |
| "loss": 3.4666, | |
| "mean_token_accuracy": 0.36537405848503113, | |
| "num_tokens": 160953.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9597523219814241, | |
| "grad_norm": 2.203282594680786, | |
| "learning_rate": 2.7130030959752322e-05, | |
| "loss": 3.5842, | |
| "mean_token_accuracy": 0.34776660799980164, | |
| "num_tokens": 166286.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.9907120743034056, | |
| "grad_norm": 1.724373459815979, | |
| "learning_rate": 2.7037151702786376e-05, | |
| "loss": 3.4961, | |
| "mean_token_accuracy": 0.35577190220355986, | |
| "num_tokens": 171629.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.021671826625387, | |
| "grad_norm": 1.7433867454528809, | |
| "learning_rate": 2.6944272445820436e-05, | |
| "loss": 3.4088, | |
| "mean_token_accuracy": 0.36975419521331787, | |
| "num_tokens": 176968.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 2.0577471256256104, | |
| "learning_rate": 2.685139318885449e-05, | |
| "loss": 3.5364, | |
| "mean_token_accuracy": 0.35527182221412656, | |
| "num_tokens": 182390.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.08359133126935, | |
| "grad_norm": 1.7357635498046875, | |
| "learning_rate": 2.6758513931888546e-05, | |
| "loss": 3.4495, | |
| "mean_token_accuracy": 0.3565235286951065, | |
| "num_tokens": 188181.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.1145510835913313, | |
| "grad_norm": 2.024507761001587, | |
| "learning_rate": 2.6665634674922602e-05, | |
| "loss": 3.3766, | |
| "mean_token_accuracy": 0.3696540713310242, | |
| "num_tokens": 193427.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.1455108359133126, | |
| "grad_norm": 2.1170592308044434, | |
| "learning_rate": 2.6572755417956655e-05, | |
| "loss": 3.4759, | |
| "mean_token_accuracy": 0.35803447663784027, | |
| "num_tokens": 199046.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.1764705882352942, | |
| "grad_norm": 2.118878126144409, | |
| "learning_rate": 2.6479876160990712e-05, | |
| "loss": 3.4685, | |
| "mean_token_accuracy": 0.3570233076810837, | |
| "num_tokens": 204089.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.2074303405572755, | |
| "grad_norm": 2.2280914783477783, | |
| "learning_rate": 2.638699690402477e-05, | |
| "loss": 3.4822, | |
| "mean_token_accuracy": 0.36153341829776764, | |
| "num_tokens": 209656.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.238390092879257, | |
| "grad_norm": 2.444979667663574, | |
| "learning_rate": 2.6294117647058825e-05, | |
| "loss": 3.3666, | |
| "mean_token_accuracy": 0.3748770415782928, | |
| "num_tokens": 214465.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2693498452012384, | |
| "grad_norm": 1.9609659910202026, | |
| "learning_rate": 2.620123839009288e-05, | |
| "loss": 3.4161, | |
| "mean_token_accuracy": 0.3594685852527618, | |
| "num_tokens": 219728.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.3003095975232197, | |
| "grad_norm": 1.9759095907211304, | |
| "learning_rate": 2.6108359133126935e-05, | |
| "loss": 3.405, | |
| "mean_token_accuracy": 0.3668099522590637, | |
| "num_tokens": 224992.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.3312693498452013, | |
| "grad_norm": 2.1737940311431885, | |
| "learning_rate": 2.6015479876160992e-05, | |
| "loss": 3.3809, | |
| "mean_token_accuracy": 0.37462269365787504, | |
| "num_tokens": 230431.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.3622291021671826, | |
| "grad_norm": 2.475351333618164, | |
| "learning_rate": 2.5922600619195045e-05, | |
| "loss": 3.3454, | |
| "mean_token_accuracy": 0.36902076900005343, | |
| "num_tokens": 235868.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.3931888544891642, | |
| "grad_norm": 2.1027772426605225, | |
| "learning_rate": 2.5829721362229105e-05, | |
| "loss": 3.3967, | |
| "mean_token_accuracy": 0.3779816538095474, | |
| "num_tokens": 241149.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4241486068111455, | |
| "grad_norm": 2.613186836242676, | |
| "learning_rate": 2.5736842105263158e-05, | |
| "loss": 3.3383, | |
| "mean_token_accuracy": 0.37455591559410095, | |
| "num_tokens": 246200.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.4551083591331269, | |
| "grad_norm": 2.1689629554748535, | |
| "learning_rate": 2.5643962848297215e-05, | |
| "loss": 3.4927, | |
| "mean_token_accuracy": 0.3641968876123428, | |
| "num_tokens": 251354.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.4860681114551084, | |
| "grad_norm": 1.9075849056243896, | |
| "learning_rate": 2.555108359133127e-05, | |
| "loss": 3.4211, | |
| "mean_token_accuracy": 0.3670921057462692, | |
| "num_tokens": 257201.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.5170278637770898, | |
| "grad_norm": 2.128737211227417, | |
| "learning_rate": 2.5458204334365325e-05, | |
| "loss": 3.3306, | |
| "mean_token_accuracy": 0.3747966349124908, | |
| "num_tokens": 262640.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.5479876160990713, | |
| "grad_norm": 1.9061874151229858, | |
| "learning_rate": 2.536532507739938e-05, | |
| "loss": 3.3874, | |
| "mean_token_accuracy": 0.3706284284591675, | |
| "num_tokens": 268129.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 1.8868329524993896, | |
| "learning_rate": 2.5272445820433438e-05, | |
| "loss": 3.3185, | |
| "mean_token_accuracy": 0.37699449956417086, | |
| "num_tokens": 273489.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.609907120743034, | |
| "grad_norm": 1.8507658243179321, | |
| "learning_rate": 2.5179566563467495e-05, | |
| "loss": 3.2901, | |
| "mean_token_accuracy": 0.3797824054956436, | |
| "num_tokens": 279142.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.6408668730650153, | |
| "grad_norm": 2.396951198577881, | |
| "learning_rate": 2.5086687306501548e-05, | |
| "loss": 3.3503, | |
| "mean_token_accuracy": 0.37934728860855105, | |
| "num_tokens": 284049.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.671826625386997, | |
| "grad_norm": 2.256753921508789, | |
| "learning_rate": 2.4993808049535605e-05, | |
| "loss": 3.3706, | |
| "mean_token_accuracy": 0.38219387233257296, | |
| "num_tokens": 289514.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.7027863777089784, | |
| "grad_norm": 1.9369879961013794, | |
| "learning_rate": 2.490092879256966e-05, | |
| "loss": 3.3615, | |
| "mean_token_accuracy": 0.37476457953453063, | |
| "num_tokens": 295091.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.7337461300309598, | |
| "grad_norm": 2.6628825664520264, | |
| "learning_rate": 2.4808049535603714e-05, | |
| "loss": 3.3121, | |
| "mean_token_accuracy": 0.3830788493156433, | |
| "num_tokens": 300102.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.7647058823529411, | |
| "grad_norm": 2.6188621520996094, | |
| "learning_rate": 2.4715170278637774e-05, | |
| "loss": 3.4005, | |
| "mean_token_accuracy": 0.3631168991327286, | |
| "num_tokens": 305652.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.7956656346749225, | |
| "grad_norm": 2.093585968017578, | |
| "learning_rate": 2.4622291021671828e-05, | |
| "loss": 3.4483, | |
| "mean_token_accuracy": 0.3639900177717209, | |
| "num_tokens": 311333.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.826625386996904, | |
| "grad_norm": 2.355714797973633, | |
| "learning_rate": 2.452941176470588e-05, | |
| "loss": 3.2946, | |
| "mean_token_accuracy": 0.3784641414880753, | |
| "num_tokens": 316699.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.8575851393188856, | |
| "grad_norm": 2.5051403045654297, | |
| "learning_rate": 2.4436532507739938e-05, | |
| "loss": 3.2612, | |
| "mean_token_accuracy": 0.3874821364879608, | |
| "num_tokens": 321718.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.888544891640867, | |
| "grad_norm": 2.4884731769561768, | |
| "learning_rate": 2.4343653250773994e-05, | |
| "loss": 3.3375, | |
| "mean_token_accuracy": 0.37331779301166534, | |
| "num_tokens": 327347.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.9195046439628483, | |
| "grad_norm": 2.7246131896972656, | |
| "learning_rate": 2.425077399380805e-05, | |
| "loss": 3.3235, | |
| "mean_token_accuracy": 0.38473727405071256, | |
| "num_tokens": 332563.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.9504643962848296, | |
| "grad_norm": 2.420604705810547, | |
| "learning_rate": 2.4157894736842104e-05, | |
| "loss": 3.4027, | |
| "mean_token_accuracy": 0.36371313631534574, | |
| "num_tokens": 337880.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.9814241486068112, | |
| "grad_norm": 2.1465680599212646, | |
| "learning_rate": 2.4065015479876164e-05, | |
| "loss": 3.3508, | |
| "mean_token_accuracy": 0.3747645616531372, | |
| "num_tokens": 343193.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.0123839009287927, | |
| "grad_norm": 2.0795834064483643, | |
| "learning_rate": 2.3972136222910217e-05, | |
| "loss": 3.3529, | |
| "mean_token_accuracy": 0.3744541972875595, | |
| "num_tokens": 348460.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.043343653250774, | |
| "grad_norm": 2.4051778316497803, | |
| "learning_rate": 2.387925696594427e-05, | |
| "loss": 3.2404, | |
| "mean_token_accuracy": 0.38319246768951415, | |
| "num_tokens": 353701.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.0743034055727554, | |
| "grad_norm": 2.401045322418213, | |
| "learning_rate": 2.378637770897833e-05, | |
| "loss": 3.2801, | |
| "mean_token_accuracy": 0.3773155301809311, | |
| "num_tokens": 359147.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 2.554138422012329, | |
| "learning_rate": 2.3693498452012384e-05, | |
| "loss": 3.2227, | |
| "mean_token_accuracy": 0.37754152715206146, | |
| "num_tokens": 364696.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.136222910216718, | |
| "grad_norm": 2.4874625205993652, | |
| "learning_rate": 2.360061919504644e-05, | |
| "loss": 3.2534, | |
| "mean_token_accuracy": 0.3920708328485489, | |
| "num_tokens": 370207.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.1671826625387, | |
| "grad_norm": 2.638068675994873, | |
| "learning_rate": 2.3507739938080497e-05, | |
| "loss": 3.2635, | |
| "mean_token_accuracy": 0.3835586577653885, | |
| "num_tokens": 375977.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.198142414860681, | |
| "grad_norm": 2.5179686546325684, | |
| "learning_rate": 2.341486068111455e-05, | |
| "loss": 3.1599, | |
| "mean_token_accuracy": 0.39867666363716125, | |
| "num_tokens": 381398.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.2291021671826625, | |
| "grad_norm": 2.9102959632873535, | |
| "learning_rate": 2.3321981424148607e-05, | |
| "loss": 3.2686, | |
| "mean_token_accuracy": 0.3878729552030563, | |
| "num_tokens": 386460.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.260061919504644, | |
| "grad_norm": 2.641160726547241, | |
| "learning_rate": 2.3229102167182663e-05, | |
| "loss": 3.2218, | |
| "mean_token_accuracy": 0.38672482669353486, | |
| "num_tokens": 391538.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.291021671826625, | |
| "grad_norm": 2.947000026702881, | |
| "learning_rate": 2.313622291021672e-05, | |
| "loss": 3.2215, | |
| "mean_token_accuracy": 0.38650966584682467, | |
| "num_tokens": 396881.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.321981424148607, | |
| "grad_norm": 2.3614673614501953, | |
| "learning_rate": 2.3043343653250773e-05, | |
| "loss": 3.2233, | |
| "mean_token_accuracy": 0.388895845413208, | |
| "num_tokens": 402418.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.3529411764705883, | |
| "grad_norm": 2.492814302444458, | |
| "learning_rate": 2.295046439628483e-05, | |
| "loss": 3.2005, | |
| "mean_token_accuracy": 0.382270821928978, | |
| "num_tokens": 407733.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.3839009287925697, | |
| "grad_norm": 2.646655321121216, | |
| "learning_rate": 2.2857585139318887e-05, | |
| "loss": 3.1642, | |
| "mean_token_accuracy": 0.388735693693161, | |
| "num_tokens": 413101.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.414860681114551, | |
| "grad_norm": 2.782440662384033, | |
| "learning_rate": 2.276470588235294e-05, | |
| "loss": 3.2513, | |
| "mean_token_accuracy": 0.38574750125408175, | |
| "num_tokens": 418754.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.4458204334365323, | |
| "grad_norm": 2.7094547748565674, | |
| "learning_rate": 2.2671826625387e-05, | |
| "loss": 3.3007, | |
| "mean_token_accuracy": 0.38453402519226076, | |
| "num_tokens": 424232.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.476780185758514, | |
| "grad_norm": 2.697098970413208, | |
| "learning_rate": 2.2578947368421053e-05, | |
| "loss": 3.254, | |
| "mean_token_accuracy": 0.3893850326538086, | |
| "num_tokens": 429369.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.5077399380804954, | |
| "grad_norm": 3.2908523082733154, | |
| "learning_rate": 2.248606811145511e-05, | |
| "loss": 3.1944, | |
| "mean_token_accuracy": 0.39619002044200896, | |
| "num_tokens": 434497.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.538699690402477, | |
| "grad_norm": 3.068455696105957, | |
| "learning_rate": 2.2393188854489166e-05, | |
| "loss": 3.2034, | |
| "mean_token_accuracy": 0.3942800432443619, | |
| "num_tokens": 439892.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.569659442724458, | |
| "grad_norm": 2.7893826961517334, | |
| "learning_rate": 2.230030959752322e-05, | |
| "loss": 3.0723, | |
| "mean_token_accuracy": 0.39791189730167387, | |
| "num_tokens": 445189.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.6006191950464395, | |
| "grad_norm": 2.7569446563720703, | |
| "learning_rate": 2.2207430340557276e-05, | |
| "loss": 3.187, | |
| "mean_token_accuracy": 0.3884226083755493, | |
| "num_tokens": 450338.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 3.16340708732605, | |
| "learning_rate": 2.2114551083591333e-05, | |
| "loss": 3.1942, | |
| "mean_token_accuracy": 0.3921455442905426, | |
| "num_tokens": 455583.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.6625386996904026, | |
| "grad_norm": 2.549273729324341, | |
| "learning_rate": 2.202167182662539e-05, | |
| "loss": 3.278, | |
| "mean_token_accuracy": 0.3782683253288269, | |
| "num_tokens": 461089.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.693498452012384, | |
| "grad_norm": 3.216149091720581, | |
| "learning_rate": 2.1928792569659443e-05, | |
| "loss": 3.2023, | |
| "mean_token_accuracy": 0.3940991997718811, | |
| "num_tokens": 466239.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.7244582043343653, | |
| "grad_norm": 2.4680261611938477, | |
| "learning_rate": 2.18359133126935e-05, | |
| "loss": 3.2017, | |
| "mean_token_accuracy": 0.3878098428249359, | |
| "num_tokens": 471660.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.7554179566563466, | |
| "grad_norm": 3.6166999340057373, | |
| "learning_rate": 2.1743034055727556e-05, | |
| "loss": 3.2068, | |
| "mean_token_accuracy": 0.3789886265993118, | |
| "num_tokens": 476713.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.7863777089783284, | |
| "grad_norm": 2.0997393131256104, | |
| "learning_rate": 2.165015479876161e-05, | |
| "loss": 3.2216, | |
| "mean_token_accuracy": 0.3913588523864746, | |
| "num_tokens": 482569.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.8173374613003097, | |
| "grad_norm": 3.0189061164855957, | |
| "learning_rate": 2.1557275541795666e-05, | |
| "loss": 3.1346, | |
| "mean_token_accuracy": 0.3967192888259888, | |
| "num_tokens": 487629.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.848297213622291, | |
| "grad_norm": 2.8610568046569824, | |
| "learning_rate": 2.1464396284829722e-05, | |
| "loss": 3.2061, | |
| "mean_token_accuracy": 0.3921816825866699, | |
| "num_tokens": 493227.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.8792569659442724, | |
| "grad_norm": 3.023012638092041, | |
| "learning_rate": 2.1371517027863776e-05, | |
| "loss": 3.2208, | |
| "mean_token_accuracy": 0.3786366432905197, | |
| "num_tokens": 498394.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.9102167182662537, | |
| "grad_norm": 2.908886194229126, | |
| "learning_rate": 2.1278637770897832e-05, | |
| "loss": 3.1648, | |
| "mean_token_accuracy": 0.3914026439189911, | |
| "num_tokens": 503752.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.9411764705882355, | |
| "grad_norm": 3.078397750854492, | |
| "learning_rate": 2.118575851393189e-05, | |
| "loss": 3.1891, | |
| "mean_token_accuracy": 0.39542897045612335, | |
| "num_tokens": 509218.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.972136222910217, | |
| "grad_norm": 2.740389823913574, | |
| "learning_rate": 2.1092879256965946e-05, | |
| "loss": 3.1953, | |
| "mean_token_accuracy": 0.3934174537658691, | |
| "num_tokens": 514716.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.003095975232198, | |
| "grad_norm": 2.6426961421966553, | |
| "learning_rate": 2.1e-05, | |
| "loss": 3.1084, | |
| "mean_token_accuracy": 0.39845702350139617, | |
| "num_tokens": 520000.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.0340557275541795, | |
| "grad_norm": 2.8173840045928955, | |
| "learning_rate": 2.090712074303406e-05, | |
| "loss": 3.1574, | |
| "mean_token_accuracy": 0.39481441378593446, | |
| "num_tokens": 525561.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.065015479876161, | |
| "grad_norm": 3.29856014251709, | |
| "learning_rate": 2.0814241486068112e-05, | |
| "loss": 3.0613, | |
| "mean_token_accuracy": 0.4054213762283325, | |
| "num_tokens": 531021.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.0959752321981426, | |
| "grad_norm": 3.463890314102173, | |
| "learning_rate": 2.0721362229102165e-05, | |
| "loss": 3.0282, | |
| "mean_token_accuracy": 0.4025467813014984, | |
| "num_tokens": 536260.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.126934984520124, | |
| "grad_norm": 3.134387731552124, | |
| "learning_rate": 2.0628482972136225e-05, | |
| "loss": 3.1147, | |
| "mean_token_accuracy": 0.3955592781305313, | |
| "num_tokens": 541721.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.1578947368421053, | |
| "grad_norm": 3.237518072128296, | |
| "learning_rate": 2.053560371517028e-05, | |
| "loss": 3.0828, | |
| "mean_token_accuracy": 0.40579850077629087, | |
| "num_tokens": 547334.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.1888544891640866, | |
| "grad_norm": 3.2742724418640137, | |
| "learning_rate": 2.0442724458204335e-05, | |
| "loss": 3.0907, | |
| "mean_token_accuracy": 0.40091423988342284, | |
| "num_tokens": 552904.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.219814241486068, | |
| "grad_norm": 3.0646955966949463, | |
| "learning_rate": 2.0349845201238392e-05, | |
| "loss": 3.0758, | |
| "mean_token_accuracy": 0.40378672182559966, | |
| "num_tokens": 557743.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.2507739938080498, | |
| "grad_norm": 3.2615506649017334, | |
| "learning_rate": 2.0256965944272445e-05, | |
| "loss": 3.0844, | |
| "mean_token_accuracy": 0.41148235499858854, | |
| "num_tokens": 563604.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.281733746130031, | |
| "grad_norm": 3.001723527908325, | |
| "learning_rate": 2.0164086687306502e-05, | |
| "loss": 2.9993, | |
| "mean_token_accuracy": 0.41538413166999816, | |
| "num_tokens": 568980.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.3126934984520124, | |
| "grad_norm": 3.545367956161499, | |
| "learning_rate": 2.007120743034056e-05, | |
| "loss": 3.0765, | |
| "mean_token_accuracy": 0.40937572419643403, | |
| "num_tokens": 573896.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.343653250773994, | |
| "grad_norm": 3.4989709854125977, | |
| "learning_rate": 1.9978328173374615e-05, | |
| "loss": 3.1538, | |
| "mean_token_accuracy": 0.3883706986904144, | |
| "num_tokens": 579254.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.374613003095975, | |
| "grad_norm": 3.964334487915039, | |
| "learning_rate": 1.9885448916408668e-05, | |
| "loss": 3.1399, | |
| "mean_token_accuracy": 0.3968294531106949, | |
| "num_tokens": 584603.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.405572755417957, | |
| "grad_norm": 4.132855415344238, | |
| "learning_rate": 1.9792569659442725e-05, | |
| "loss": 3.1428, | |
| "mean_token_accuracy": 0.3817721128463745, | |
| "num_tokens": 590267.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.4365325077399382, | |
| "grad_norm": 3.718858242034912, | |
| "learning_rate": 1.969969040247678e-05, | |
| "loss": 3.1355, | |
| "mean_token_accuracy": 0.3892773300409317, | |
| "num_tokens": 595886.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.4674922600619196, | |
| "grad_norm": 3.084014892578125, | |
| "learning_rate": 1.9606811145510835e-05, | |
| "loss": 3.0849, | |
| "mean_token_accuracy": 0.40462585389614103, | |
| "num_tokens": 601344.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.498452012383901, | |
| "grad_norm": 3.183264970779419, | |
| "learning_rate": 1.9513931888544895e-05, | |
| "loss": 3.0683, | |
| "mean_token_accuracy": 0.4050568699836731, | |
| "num_tokens": 606963.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 3.5294117647058822, | |
| "grad_norm": 3.7558557987213135, | |
| "learning_rate": 1.9421052631578948e-05, | |
| "loss": 3.0557, | |
| "mean_token_accuracy": 0.40454983711242676, | |
| "num_tokens": 612142.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 3.560371517027864, | |
| "grad_norm": 3.376959800720215, | |
| "learning_rate": 1.9328173374613005e-05, | |
| "loss": 3.1175, | |
| "mean_token_accuracy": 0.4029729425907135, | |
| "num_tokens": 617092.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.5913312693498454, | |
| "grad_norm": 3.923366069793701, | |
| "learning_rate": 1.923529411764706e-05, | |
| "loss": 3.0476, | |
| "mean_token_accuracy": 0.41196897327899934, | |
| "num_tokens": 622341.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 3.6222910216718267, | |
| "grad_norm": 3.7128777503967285, | |
| "learning_rate": 1.9142414860681114e-05, | |
| "loss": 3.0241, | |
| "mean_token_accuracy": 0.4104953706264496, | |
| "num_tokens": 627607.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 3.653250773993808, | |
| "grad_norm": 3.040436029434204, | |
| "learning_rate": 1.904953560371517e-05, | |
| "loss": 3.0558, | |
| "mean_token_accuracy": 0.40349632799625396, | |
| "num_tokens": 633241.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 3.6842105263157894, | |
| "grad_norm": 3.3926219940185547, | |
| "learning_rate": 1.8956656346749224e-05, | |
| "loss": 3.0746, | |
| "mean_token_accuracy": 0.40370720326900483, | |
| "num_tokens": 637933.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 3.715170278637771, | |
| "grad_norm": 4.019270420074463, | |
| "learning_rate": 1.8863777089783284e-05, | |
| "loss": 3.0409, | |
| "mean_token_accuracy": 0.4087739437818527, | |
| "num_tokens": 643011.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.746130030959752, | |
| "grad_norm": 3.6900274753570557, | |
| "learning_rate": 1.8770897832817338e-05, | |
| "loss": 3.0748, | |
| "mean_token_accuracy": 0.4023460865020752, | |
| "num_tokens": 648247.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 3.777089783281734, | |
| "grad_norm": 3.2771356105804443, | |
| "learning_rate": 1.867801857585139e-05, | |
| "loss": 3.0661, | |
| "mean_token_accuracy": 0.39979439079761503, | |
| "num_tokens": 653450.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 3.808049535603715, | |
| "grad_norm": 3.597771406173706, | |
| "learning_rate": 1.858513931888545e-05, | |
| "loss": 3.0705, | |
| "mean_token_accuracy": 0.411418029665947, | |
| "num_tokens": 659055.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 3.8390092879256965, | |
| "grad_norm": 3.0550687313079834, | |
| "learning_rate": 1.8492260061919504e-05, | |
| "loss": 3.0555, | |
| "mean_token_accuracy": 0.4089734494686127, | |
| "num_tokens": 664626.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 3.8699690402476783, | |
| "grad_norm": 3.5179007053375244, | |
| "learning_rate": 1.839938080495356e-05, | |
| "loss": 3.0326, | |
| "mean_token_accuracy": 0.4063202112913132, | |
| "num_tokens": 669916.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.900928792569659, | |
| "grad_norm": 3.915254592895508, | |
| "learning_rate": 1.8306501547987617e-05, | |
| "loss": 3.1095, | |
| "mean_token_accuracy": 0.4005684494972229, | |
| "num_tokens": 675037.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 3.931888544891641, | |
| "grad_norm": 3.3169171810150146, | |
| "learning_rate": 1.821362229102167e-05, | |
| "loss": 3.0712, | |
| "mean_token_accuracy": 0.40806442499160767, | |
| "num_tokens": 680640.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 3.9628482972136223, | |
| "grad_norm": 3.805070400238037, | |
| "learning_rate": 1.8120743034055727e-05, | |
| "loss": 3.0823, | |
| "mean_token_accuracy": 0.3995911568403244, | |
| "num_tokens": 686241.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 3.9938080495356036, | |
| "grad_norm": 3.5613934993743896, | |
| "learning_rate": 1.8027863777089784e-05, | |
| "loss": 3.1261, | |
| "mean_token_accuracy": 0.4023303121328354, | |
| "num_tokens": 691665.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 4.024767801857585, | |
| "grad_norm": 3.410487174987793, | |
| "learning_rate": 1.793498452012384e-05, | |
| "loss": 3.0566, | |
| "mean_token_accuracy": 0.41004966497421264, | |
| "num_tokens": 697140.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.055727554179566, | |
| "grad_norm": 4.058082580566406, | |
| "learning_rate": 1.7842105263157894e-05, | |
| "loss": 2.9663, | |
| "mean_token_accuracy": 0.4240562438964844, | |
| "num_tokens": 702748.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 4.086687306501548, | |
| "grad_norm": 4.1768927574157715, | |
| "learning_rate": 1.7749226006191954e-05, | |
| "loss": 2.8659, | |
| "mean_token_accuracy": 0.43476164638996123, | |
| "num_tokens": 708103.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 4.117647058823529, | |
| "grad_norm": 5.143772602081299, | |
| "learning_rate": 1.7656346749226007e-05, | |
| "loss": 2.8599, | |
| "mean_token_accuracy": 0.43590405583381653, | |
| "num_tokens": 713343.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 4.148606811145511, | |
| "grad_norm": 4.065184593200684, | |
| "learning_rate": 1.756346749226006e-05, | |
| "loss": 2.9289, | |
| "mean_token_accuracy": 0.41564173996448517, | |
| "num_tokens": 719105.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 4.179566563467493, | |
| "grad_norm": 3.6309211254119873, | |
| "learning_rate": 1.747058823529412e-05, | |
| "loss": 2.9321, | |
| "mean_token_accuracy": 0.42523694932460787, | |
| "num_tokens": 724312.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 4.2105263157894735, | |
| "grad_norm": 3.6920065879821777, | |
| "learning_rate": 1.7377708978328173e-05, | |
| "loss": 2.9575, | |
| "mean_token_accuracy": 0.4250216782093048, | |
| "num_tokens": 729631.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 4.241486068111455, | |
| "grad_norm": 3.9649791717529297, | |
| "learning_rate": 1.728482972136223e-05, | |
| "loss": 2.9366, | |
| "mean_token_accuracy": 0.42727407813072205, | |
| "num_tokens": 734761.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 4.272445820433436, | |
| "grad_norm": 3.711487054824829, | |
| "learning_rate": 1.7191950464396287e-05, | |
| "loss": 3.0164, | |
| "mean_token_accuracy": 0.3995800256729126, | |
| "num_tokens": 740354.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 4.303405572755418, | |
| "grad_norm": 3.4379665851593018, | |
| "learning_rate": 1.709907120743034e-05, | |
| "loss": 2.9486, | |
| "mean_token_accuracy": 0.4161150187253952, | |
| "num_tokens": 745954.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 4.3343653250774, | |
| "grad_norm": 3.7292895317077637, | |
| "learning_rate": 1.7006191950464397e-05, | |
| "loss": 2.983, | |
| "mean_token_accuracy": 0.4198610633611679, | |
| "num_tokens": 751580.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.365325077399381, | |
| "grad_norm": 4.407292366027832, | |
| "learning_rate": 1.6913312693498453e-05, | |
| "loss": 2.9269, | |
| "mean_token_accuracy": 0.42490100860595703, | |
| "num_tokens": 756887.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 4.396284829721362, | |
| "grad_norm": 4.097774028778076, | |
| "learning_rate": 1.682043343653251e-05, | |
| "loss": 2.9669, | |
| "mean_token_accuracy": 0.41851912140846254, | |
| "num_tokens": 761780.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 4.427244582043343, | |
| "grad_norm": 3.7991371154785156, | |
| "learning_rate": 1.6727554179566563e-05, | |
| "loss": 3.0145, | |
| "mean_token_accuracy": 0.4155064642429352, | |
| "num_tokens": 766845.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 4.458204334365325, | |
| "grad_norm": 4.137938022613525, | |
| "learning_rate": 1.663467492260062e-05, | |
| "loss": 3.0174, | |
| "mean_token_accuracy": 0.40559997260570524, | |
| "num_tokens": 772275.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 4.489164086687307, | |
| "grad_norm": 3.8077549934387207, | |
| "learning_rate": 1.6541795665634676e-05, | |
| "loss": 3.0307, | |
| "mean_token_accuracy": 0.4062090069055557, | |
| "num_tokens": 777364.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.520123839009288, | |
| "grad_norm": 3.8626601696014404, | |
| "learning_rate": 1.644891640866873e-05, | |
| "loss": 2.9755, | |
| "mean_token_accuracy": 0.4198408961296082, | |
| "num_tokens": 782591.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 4.5510835913312695, | |
| "grad_norm": 3.966916084289551, | |
| "learning_rate": 1.6356037151702786e-05, | |
| "loss": 2.9512, | |
| "mean_token_accuracy": 0.418072497844696, | |
| "num_tokens": 788022.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 4.58204334365325, | |
| "grad_norm": 4.426165580749512, | |
| "learning_rate": 1.6263157894736843e-05, | |
| "loss": 2.8709, | |
| "mean_token_accuracy": 0.42306858897209165, | |
| "num_tokens": 793198.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 4.613003095975232, | |
| "grad_norm": 4.337894916534424, | |
| "learning_rate": 1.61702786377709e-05, | |
| "loss": 2.9103, | |
| "mean_token_accuracy": 0.41692661941051484, | |
| "num_tokens": 798582.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 4.643962848297214, | |
| "grad_norm": 3.9984989166259766, | |
| "learning_rate": 1.6077399380804953e-05, | |
| "loss": 2.947, | |
| "mean_token_accuracy": 0.41879336535930634, | |
| "num_tokens": 803953.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 4.674922600619195, | |
| "grad_norm": 3.8538806438446045, | |
| "learning_rate": 1.598452012383901e-05, | |
| "loss": 2.8998, | |
| "mean_token_accuracy": 0.4262635886669159, | |
| "num_tokens": 809538.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 4.705882352941177, | |
| "grad_norm": 4.303225517272949, | |
| "learning_rate": 1.5891640866873066e-05, | |
| "loss": 2.947, | |
| "mean_token_accuracy": 0.41001501083374026, | |
| "num_tokens": 814725.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 4.7368421052631575, | |
| "grad_norm": 4.340036392211914, | |
| "learning_rate": 1.579876160990712e-05, | |
| "loss": 2.949, | |
| "mean_token_accuracy": 0.4151626467704773, | |
| "num_tokens": 819971.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 4.767801857585139, | |
| "grad_norm": 4.898163318634033, | |
| "learning_rate": 1.570588235294118e-05, | |
| "loss": 2.9909, | |
| "mean_token_accuracy": 0.4096081703901291, | |
| "num_tokens": 825750.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 4.798761609907121, | |
| "grad_norm": 4.30304479598999, | |
| "learning_rate": 1.5613003095975232e-05, | |
| "loss": 3.041, | |
| "mean_token_accuracy": 0.4108206033706665, | |
| "num_tokens": 830852.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 4.829721362229102, | |
| "grad_norm": 3.9434711933135986, | |
| "learning_rate": 1.5520123839009286e-05, | |
| "loss": 2.974, | |
| "mean_token_accuracy": 0.41676563322544097, | |
| "num_tokens": 836433.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 4.860681114551084, | |
| "grad_norm": 4.404800891876221, | |
| "learning_rate": 1.5427244582043346e-05, | |
| "loss": 2.8701, | |
| "mean_token_accuracy": 0.4312462568283081, | |
| "num_tokens": 841676.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 4.891640866873065, | |
| "grad_norm": 4.066771984100342, | |
| "learning_rate": 1.53343653250774e-05, | |
| "loss": 2.9858, | |
| "mean_token_accuracy": 0.4170028865337372, | |
| "num_tokens": 847278.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 4.922600619195046, | |
| "grad_norm": 4.176848411560059, | |
| "learning_rate": 1.5241486068111454e-05, | |
| "loss": 2.9811, | |
| "mean_token_accuracy": 0.40572755932807925, | |
| "num_tokens": 852327.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 4.953560371517028, | |
| "grad_norm": 4.330312252044678, | |
| "learning_rate": 1.5148606811145512e-05, | |
| "loss": 2.9677, | |
| "mean_token_accuracy": 0.413565531373024, | |
| "num_tokens": 858017.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.984520123839009, | |
| "grad_norm": 4.786016464233398, | |
| "learning_rate": 1.5055727554179567e-05, | |
| "loss": 2.9261, | |
| "mean_token_accuracy": 0.41552698612213135, | |
| "num_tokens": 863144.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 5.015479876160991, | |
| "grad_norm": 3.516746759414673, | |
| "learning_rate": 1.4962848297213624e-05, | |
| "loss": 2.9576, | |
| "mean_token_accuracy": 0.41805381774902345, | |
| "num_tokens": 868715.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 5.046439628482972, | |
| "grad_norm": 4.921317100524902, | |
| "learning_rate": 1.4869969040247679e-05, | |
| "loss": 2.855, | |
| "mean_token_accuracy": 0.42026965618133544, | |
| "num_tokens": 874005.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 5.077399380804954, | |
| "grad_norm": 5.511002063751221, | |
| "learning_rate": 1.4777089783281734e-05, | |
| "loss": 2.8432, | |
| "mean_token_accuracy": 0.42276104390621183, | |
| "num_tokens": 879303.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 5.108359133126935, | |
| "grad_norm": 4.707708835601807, | |
| "learning_rate": 1.468421052631579e-05, | |
| "loss": 2.7773, | |
| "mean_token_accuracy": 0.4471729189157486, | |
| "num_tokens": 884423.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 5.139318885448916, | |
| "grad_norm": 5.216972827911377, | |
| "learning_rate": 1.4591331269349845e-05, | |
| "loss": 2.8921, | |
| "mean_token_accuracy": 0.4164726287126541, | |
| "num_tokens": 889957.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 5.170278637770898, | |
| "grad_norm": 4.838964939117432, | |
| "learning_rate": 1.4498452012383902e-05, | |
| "loss": 2.878, | |
| "mean_token_accuracy": 0.4192724585533142, | |
| "num_tokens": 895206.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 5.201238390092879, | |
| "grad_norm": 4.58376407623291, | |
| "learning_rate": 1.4405572755417958e-05, | |
| "loss": 2.785, | |
| "mean_token_accuracy": 0.43853980004787446, | |
| "num_tokens": 900401.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 5.232198142414861, | |
| "grad_norm": 4.764224052429199, | |
| "learning_rate": 1.4312693498452012e-05, | |
| "loss": 2.8059, | |
| "mean_token_accuracy": 0.4390226393938065, | |
| "num_tokens": 906021.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 5.2631578947368425, | |
| "grad_norm": 4.119351387023926, | |
| "learning_rate": 1.4219814241486068e-05, | |
| "loss": 2.8601, | |
| "mean_token_accuracy": 0.42875273823738097, | |
| "num_tokens": 911905.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.294117647058823, | |
| "grad_norm": 5.8420209884643555, | |
| "learning_rate": 1.4126934984520123e-05, | |
| "loss": 2.8181, | |
| "mean_token_accuracy": 0.4388769954442978, | |
| "num_tokens": 917004.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 5.325077399380805, | |
| "grad_norm": 5.457785129547119, | |
| "learning_rate": 1.403405572755418e-05, | |
| "loss": 2.8248, | |
| "mean_token_accuracy": 0.4400555491447449, | |
| "num_tokens": 922022.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 5.356037151702786, | |
| "grad_norm": 5.914788246154785, | |
| "learning_rate": 1.3941176470588236e-05, | |
| "loss": 2.8293, | |
| "mean_token_accuracy": 0.424467995762825, | |
| "num_tokens": 927509.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 5.386996904024768, | |
| "grad_norm": 4.7415971755981445, | |
| "learning_rate": 1.3848297213622291e-05, | |
| "loss": 2.8837, | |
| "mean_token_accuracy": 0.4276946932077408, | |
| "num_tokens": 933043.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 5.41795665634675, | |
| "grad_norm": 4.0114426612854, | |
| "learning_rate": 1.3755417956656346e-05, | |
| "loss": 2.8743, | |
| "mean_token_accuracy": 0.43589029014110564, | |
| "num_tokens": 939236.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 5.4489164086687305, | |
| "grad_norm": 4.747798919677734, | |
| "learning_rate": 1.3662538699690403e-05, | |
| "loss": 2.9124, | |
| "mean_token_accuracy": 0.42590193450450897, | |
| "num_tokens": 944423.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 5.479876160990712, | |
| "grad_norm": 4.681455135345459, | |
| "learning_rate": 1.3569659442724458e-05, | |
| "loss": 2.8632, | |
| "mean_token_accuracy": 0.42523313462734225, | |
| "num_tokens": 949950.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 5.510835913312693, | |
| "grad_norm": 5.279947280883789, | |
| "learning_rate": 1.3476780185758514e-05, | |
| "loss": 2.8384, | |
| "mean_token_accuracy": 0.436612194776535, | |
| "num_tokens": 955442.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 5.541795665634675, | |
| "grad_norm": 5.95959997177124, | |
| "learning_rate": 1.3383900928792571e-05, | |
| "loss": 2.7698, | |
| "mean_token_accuracy": 0.4417139500379562, | |
| "num_tokens": 960733.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 5.572755417956657, | |
| "grad_norm": 5.599917888641357, | |
| "learning_rate": 1.3291021671826626e-05, | |
| "loss": 2.7786, | |
| "mean_token_accuracy": 0.44270198941230776, | |
| "num_tokens": 965648.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 5.603715170278638, | |
| "grad_norm": 5.011136531829834, | |
| "learning_rate": 1.3198142414860681e-05, | |
| "loss": 2.8077, | |
| "mean_token_accuracy": 0.43424761593341826, | |
| "num_tokens": 971280.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 5.634674922600619, | |
| "grad_norm": 5.402850151062012, | |
| "learning_rate": 1.3105263157894738e-05, | |
| "loss": 2.8336, | |
| "mean_token_accuracy": 0.43530034720897676, | |
| "num_tokens": 976786.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 5.6656346749226, | |
| "grad_norm": 6.188055038452148, | |
| "learning_rate": 1.3012383900928793e-05, | |
| "loss": 2.8559, | |
| "mean_token_accuracy": 0.42733065485954286, | |
| "num_tokens": 982150.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 5.696594427244582, | |
| "grad_norm": 5.491219520568848, | |
| "learning_rate": 1.291950464396285e-05, | |
| "loss": 2.8993, | |
| "mean_token_accuracy": 0.4176638275384903, | |
| "num_tokens": 987190.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 5.727554179566564, | |
| "grad_norm": 5.429774761199951, | |
| "learning_rate": 1.2826625386996904e-05, | |
| "loss": 2.8738, | |
| "mean_token_accuracy": 0.42554824650287626, | |
| "num_tokens": 992301.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 5.758513931888545, | |
| "grad_norm": 4.794564247131348, | |
| "learning_rate": 1.2733746130030959e-05, | |
| "loss": 2.8354, | |
| "mean_token_accuracy": 0.4368333965539932, | |
| "num_tokens": 997463.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 5.7894736842105265, | |
| "grad_norm": 5.444267749786377, | |
| "learning_rate": 1.2640866873065016e-05, | |
| "loss": 2.8709, | |
| "mean_token_accuracy": 0.42507857382297515, | |
| "num_tokens": 1002982.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 5.820433436532507, | |
| "grad_norm": 4.823091506958008, | |
| "learning_rate": 1.254798761609907e-05, | |
| "loss": 2.8811, | |
| "mean_token_accuracy": 0.42272760570049284, | |
| "num_tokens": 1008274.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 5.851393188854489, | |
| "grad_norm": 4.862705707550049, | |
| "learning_rate": 1.2455108359133127e-05, | |
| "loss": 2.9577, | |
| "mean_token_accuracy": 0.4152037352323532, | |
| "num_tokens": 1013214.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 5.882352941176471, | |
| "grad_norm": 5.220913887023926, | |
| "learning_rate": 1.2362229102167184e-05, | |
| "loss": 2.9042, | |
| "mean_token_accuracy": 0.42974009811878205, | |
| "num_tokens": 1018645.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 5.913312693498452, | |
| "grad_norm": 5.20965576171875, | |
| "learning_rate": 1.2269349845201239e-05, | |
| "loss": 2.8643, | |
| "mean_token_accuracy": 0.4309177756309509, | |
| "num_tokens": 1024323.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 5.944272445820434, | |
| "grad_norm": 4.878232002258301, | |
| "learning_rate": 1.2176470588235294e-05, | |
| "loss": 2.8095, | |
| "mean_token_accuracy": 0.43144825398921965, | |
| "num_tokens": 1029506.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 5.975232198142415, | |
| "grad_norm": 5.641796112060547, | |
| "learning_rate": 1.208359133126935e-05, | |
| "loss": 2.8189, | |
| "mean_token_accuracy": 0.4325381100177765, | |
| "num_tokens": 1034689.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 6.006191950464396, | |
| "grad_norm": 5.058184623718262, | |
| "learning_rate": 1.1990712074303405e-05, | |
| "loss": 2.8242, | |
| "mean_token_accuracy": 0.43917117118835447, | |
| "num_tokens": 1039846.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 6.037151702786378, | |
| "grad_norm": 4.433888912200928, | |
| "learning_rate": 1.1897832817337462e-05, | |
| "loss": 2.73, | |
| "mean_token_accuracy": 0.4466862291097641, | |
| "num_tokens": 1045512.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 6.068111455108359, | |
| "grad_norm": 5.775953769683838, | |
| "learning_rate": 1.1804953560371519e-05, | |
| "loss": 2.7625, | |
| "mean_token_accuracy": 0.4397457420825958, | |
| "num_tokens": 1050876.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 6.099071207430341, | |
| "grad_norm": 5.579873561859131, | |
| "learning_rate": 1.1712074303405573e-05, | |
| "loss": 2.8456, | |
| "mean_token_accuracy": 0.4306044101715088, | |
| "num_tokens": 1056379.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 6.130030959752322, | |
| "grad_norm": 6.171997547149658, | |
| "learning_rate": 1.1619195046439628e-05, | |
| "loss": 2.7931, | |
| "mean_token_accuracy": 0.43514950573444366, | |
| "num_tokens": 1061908.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 6.1609907120743035, | |
| "grad_norm": 4.454577445983887, | |
| "learning_rate": 1.1526315789473683e-05, | |
| "loss": 2.6557, | |
| "mean_token_accuracy": 0.45830960273742677, | |
| "num_tokens": 1067516.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 6.191950464396285, | |
| "grad_norm": 6.2078328132629395, | |
| "learning_rate": 1.143343653250774e-05, | |
| "loss": 2.6819, | |
| "mean_token_accuracy": 0.45693929195404054, | |
| "num_tokens": 1072685.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 6.222910216718266, | |
| "grad_norm": 5.479898452758789, | |
| "learning_rate": 1.1340557275541797e-05, | |
| "loss": 2.789, | |
| "mean_token_accuracy": 0.4357097387313843, | |
| "num_tokens": 1078055.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 6.253869969040248, | |
| "grad_norm": 5.758952617645264, | |
| "learning_rate": 1.1247678018575852e-05, | |
| "loss": 2.7735, | |
| "mean_token_accuracy": 0.45056099593639376, | |
| "num_tokens": 1083290.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 6.284829721362229, | |
| "grad_norm": 6.096948623657227, | |
| "learning_rate": 1.1154798761609906e-05, | |
| "loss": 2.7711, | |
| "mean_token_accuracy": 0.43407263457775114, | |
| "num_tokens": 1088985.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 6.315789473684211, | |
| "grad_norm": 6.147327423095703, | |
| "learning_rate": 1.1061919504643963e-05, | |
| "loss": 2.7582, | |
| "mean_token_accuracy": 0.4345243155956268, | |
| "num_tokens": 1094264.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 6.346749226006192, | |
| "grad_norm": 6.030445575714111, | |
| "learning_rate": 1.0969040247678018e-05, | |
| "loss": 2.6719, | |
| "mean_token_accuracy": 0.450255024433136, | |
| "num_tokens": 1099356.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 6.377708978328173, | |
| "grad_norm": 5.9045257568359375, | |
| "learning_rate": 1.0876160990712075e-05, | |
| "loss": 2.6644, | |
| "mean_token_accuracy": 0.45699230432510374, | |
| "num_tokens": 1104688.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 6.408668730650155, | |
| "grad_norm": 6.710201263427734, | |
| "learning_rate": 1.0783281733746131e-05, | |
| "loss": 2.7785, | |
| "mean_token_accuracy": 0.4386188894510269, | |
| "num_tokens": 1109910.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 6.439628482972136, | |
| "grad_norm": 5.746718406677246, | |
| "learning_rate": 1.0690402476780186e-05, | |
| "loss": 2.7606, | |
| "mean_token_accuracy": 0.43888486325740816, | |
| "num_tokens": 1114991.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 6.470588235294118, | |
| "grad_norm": 6.118679523468018, | |
| "learning_rate": 1.0597523219814241e-05, | |
| "loss": 2.7081, | |
| "mean_token_accuracy": 0.44661730229854585, | |
| "num_tokens": 1120605.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 6.5015479876160995, | |
| "grad_norm": 6.148837566375732, | |
| "learning_rate": 1.0504643962848298e-05, | |
| "loss": 2.7338, | |
| "mean_token_accuracy": 0.44874733686447144, | |
| "num_tokens": 1125668.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 6.53250773993808, | |
| "grad_norm": 5.635681629180908, | |
| "learning_rate": 1.0411764705882353e-05, | |
| "loss": 2.7199, | |
| "mean_token_accuracy": 0.4529722988605499, | |
| "num_tokens": 1130957.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 6.563467492260062, | |
| "grad_norm": 6.4440789222717285, | |
| "learning_rate": 1.031888544891641e-05, | |
| "loss": 2.7533, | |
| "mean_token_accuracy": 0.43866740763187406, | |
| "num_tokens": 1136628.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 6.594427244582043, | |
| "grad_norm": 6.844357490539551, | |
| "learning_rate": 1.0226006191950464e-05, | |
| "loss": 2.8406, | |
| "mean_token_accuracy": 0.42767872512340543, | |
| "num_tokens": 1142363.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 6.625386996904025, | |
| "grad_norm": 5.9922990798950195, | |
| "learning_rate": 1.0133126934984521e-05, | |
| "loss": 2.7447, | |
| "mean_token_accuracy": 0.44888330399990084, | |
| "num_tokens": 1147428.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 6.656346749226007, | |
| "grad_norm": 6.7769880294799805, | |
| "learning_rate": 1.0040247678018576e-05, | |
| "loss": 2.7043, | |
| "mean_token_accuracy": 0.45271354615688325, | |
| "num_tokens": 1152552.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 6.687306501547988, | |
| "grad_norm": 5.876704216003418, | |
| "learning_rate": 9.94736842105263e-06, | |
| "loss": 2.7482, | |
| "mean_token_accuracy": 0.4465304523706436, | |
| "num_tokens": 1158112.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 6.718266253869969, | |
| "grad_norm": 5.388707637786865, | |
| "learning_rate": 9.854489164086687e-06, | |
| "loss": 2.8146, | |
| "mean_token_accuracy": 0.43294175863265993, | |
| "num_tokens": 1163681.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 6.74922600619195, | |
| "grad_norm": 5.411194801330566, | |
| "learning_rate": 9.761609907120744e-06, | |
| "loss": 2.686, | |
| "mean_token_accuracy": 0.45299853682518004, | |
| "num_tokens": 1169213.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 6.780185758513932, | |
| "grad_norm": 5.515957832336426, | |
| "learning_rate": 9.668730650154799e-06, | |
| "loss": 2.7516, | |
| "mean_token_accuracy": 0.45130361914634703, | |
| "num_tokens": 1174534.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 6.811145510835914, | |
| "grad_norm": 6.845043659210205, | |
| "learning_rate": 9.575851393188854e-06, | |
| "loss": 2.7274, | |
| "mean_token_accuracy": 0.4553706705570221, | |
| "num_tokens": 1179712.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 6.842105263157895, | |
| "grad_norm": 6.185451984405518, | |
| "learning_rate": 9.48297213622291e-06, | |
| "loss": 2.7128, | |
| "mean_token_accuracy": 0.44898030161857605, | |
| "num_tokens": 1185188.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 6.8730650154798765, | |
| "grad_norm": 7.061018943786621, | |
| "learning_rate": 9.390092879256965e-06, | |
| "loss": 2.8095, | |
| "mean_token_accuracy": 0.43425504565238954, | |
| "num_tokens": 1190570.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 6.904024767801857, | |
| "grad_norm": 5.647558212280273, | |
| "learning_rate": 9.297213622291022e-06, | |
| "loss": 2.6993, | |
| "mean_token_accuracy": 0.44335272908210754, | |
| "num_tokens": 1195894.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 6.934984520123839, | |
| "grad_norm": 6.966801166534424, | |
| "learning_rate": 9.204334365325079e-06, | |
| "loss": 2.8366, | |
| "mean_token_accuracy": 0.4338228702545166, | |
| "num_tokens": 1201236.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 6.965944272445821, | |
| "grad_norm": 5.80783224105835, | |
| "learning_rate": 9.111455108359134e-06, | |
| "loss": 2.7241, | |
| "mean_token_accuracy": 0.4401988387107849, | |
| "num_tokens": 1206531.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 6.996904024767802, | |
| "grad_norm": 6.14738655090332, | |
| "learning_rate": 9.018575851393189e-06, | |
| "loss": 2.7851, | |
| "mean_token_accuracy": 0.44606389105319977, | |
| "num_tokens": 1211608.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 7.027863777089784, | |
| "grad_norm": 6.024252414703369, | |
| "learning_rate": 8.925696594427245e-06, | |
| "loss": 2.6929, | |
| "mean_token_accuracy": 0.4425144582986832, | |
| "num_tokens": 1216967.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 7.0588235294117645, | |
| "grad_norm": 6.029590606689453, | |
| "learning_rate": 8.8328173374613e-06, | |
| "loss": 2.6463, | |
| "mean_token_accuracy": 0.4519709438085556, | |
| "num_tokens": 1222642.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 7.089783281733746, | |
| "grad_norm": 7.154483795166016, | |
| "learning_rate": 8.739938080495357e-06, | |
| "loss": 2.7, | |
| "mean_token_accuracy": 0.45297570526599884, | |
| "num_tokens": 1228182.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 7.120743034055727, | |
| "grad_norm": 7.579859256744385, | |
| "learning_rate": 8.647058823529412e-06, | |
| "loss": 2.7066, | |
| "mean_token_accuracy": 0.4409604281187057, | |
| "num_tokens": 1233221.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 7.151702786377709, | |
| "grad_norm": 7.6017537117004395, | |
| "learning_rate": 8.554179566563468e-06, | |
| "loss": 2.6415, | |
| "mean_token_accuracy": 0.4527024358510971, | |
| "num_tokens": 1238367.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 7.182662538699691, | |
| "grad_norm": 6.553985595703125, | |
| "learning_rate": 8.461300309597523e-06, | |
| "loss": 2.5998, | |
| "mean_token_accuracy": 0.4632930040359497, | |
| "num_tokens": 1243283.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 7.213622291021672, | |
| "grad_norm": 5.968512058258057, | |
| "learning_rate": 8.368421052631578e-06, | |
| "loss": 2.6925, | |
| "mean_token_accuracy": 0.45536734759807584, | |
| "num_tokens": 1249101.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 7.244582043343653, | |
| "grad_norm": 7.108147621154785, | |
| "learning_rate": 8.275541795665635e-06, | |
| "loss": 2.7118, | |
| "mean_token_accuracy": 0.4545805901288986, | |
| "num_tokens": 1253957.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 7.275541795665634, | |
| "grad_norm": 6.648678302764893, | |
| "learning_rate": 8.182662538699691e-06, | |
| "loss": 2.5597, | |
| "mean_token_accuracy": 0.4763183623552322, | |
| "num_tokens": 1259120.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 7.306501547987616, | |
| "grad_norm": 6.633580684661865, | |
| "learning_rate": 8.089783281733746e-06, | |
| "loss": 2.6495, | |
| "mean_token_accuracy": 0.44946308732032775, | |
| "num_tokens": 1264564.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 7.337461300309598, | |
| "grad_norm": 7.78735876083374, | |
| "learning_rate": 7.996904024767801e-06, | |
| "loss": 2.7111, | |
| "mean_token_accuracy": 0.449133089184761, | |
| "num_tokens": 1269865.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 7.368421052631579, | |
| "grad_norm": 6.6130852699279785, | |
| "learning_rate": 7.904024767801858e-06, | |
| "loss": 2.5428, | |
| "mean_token_accuracy": 0.46245581805706026, | |
| "num_tokens": 1275179.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 7.3993808049535605, | |
| "grad_norm": 5.41319465637207, | |
| "learning_rate": 7.811145510835913e-06, | |
| "loss": 2.6658, | |
| "mean_token_accuracy": 0.46077215373516084, | |
| "num_tokens": 1280771.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 7.430340557275541, | |
| "grad_norm": 6.882344722747803, | |
| "learning_rate": 7.71826625386997e-06, | |
| "loss": 2.6419, | |
| "mean_token_accuracy": 0.4477766364812851, | |
| "num_tokens": 1286080.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 7.461300309597523, | |
| "grad_norm": 7.123750686645508, | |
| "learning_rate": 7.625386996904026e-06, | |
| "loss": 2.6225, | |
| "mean_token_accuracy": 0.4646787643432617, | |
| "num_tokens": 1291519.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 7.492260061919505, | |
| "grad_norm": 7.669636249542236, | |
| "learning_rate": 7.53250773993808e-06, | |
| "loss": 2.6362, | |
| "mean_token_accuracy": 0.46397568881511686, | |
| "num_tokens": 1296632.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 7.523219814241486, | |
| "grad_norm": 7.0702948570251465, | |
| "learning_rate": 7.439628482972137e-06, | |
| "loss": 2.678, | |
| "mean_token_accuracy": 0.44617280960083006, | |
| "num_tokens": 1301718.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 7.554179566563468, | |
| "grad_norm": 5.621570110321045, | |
| "learning_rate": 7.346749226006193e-06, | |
| "loss": 2.6256, | |
| "mean_token_accuracy": 0.4561785846948624, | |
| "num_tokens": 1307465.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 7.585139318885449, | |
| "grad_norm": 6.752878189086914, | |
| "learning_rate": 7.2538699690402475e-06, | |
| "loss": 2.6154, | |
| "mean_token_accuracy": 0.4639400511980057, | |
| "num_tokens": 1312899.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 7.61609907120743, | |
| "grad_norm": 7.214348316192627, | |
| "learning_rate": 7.160990712074304e-06, | |
| "loss": 2.6367, | |
| "mean_token_accuracy": 0.45843425989151, | |
| "num_tokens": 1318089.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 7.647058823529412, | |
| "grad_norm": 6.726053714752197, | |
| "learning_rate": 7.068111455108359e-06, | |
| "loss": 2.6594, | |
| "mean_token_accuracy": 0.4545712649822235, | |
| "num_tokens": 1323755.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 7.678018575851393, | |
| "grad_norm": 5.724924087524414, | |
| "learning_rate": 6.975232198142415e-06, | |
| "loss": 2.6932, | |
| "mean_token_accuracy": 0.4435712039470673, | |
| "num_tokens": 1329223.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 7.708978328173375, | |
| "grad_norm": 6.38251256942749, | |
| "learning_rate": 6.882352941176471e-06, | |
| "loss": 2.6248, | |
| "mean_token_accuracy": 0.46376303732395174, | |
| "num_tokens": 1334687.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 7.739938080495356, | |
| "grad_norm": 8.527246475219727, | |
| "learning_rate": 6.7894736842105264e-06, | |
| "loss": 2.6804, | |
| "mean_token_accuracy": 0.4527428478002548, | |
| "num_tokens": 1340116.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 7.7708978328173375, | |
| "grad_norm": 7.136129856109619, | |
| "learning_rate": 6.696594427244582e-06, | |
| "loss": 2.6653, | |
| "mean_token_accuracy": 0.4520092993974686, | |
| "num_tokens": 1345766.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 7.801857585139319, | |
| "grad_norm": 6.609195232391357, | |
| "learning_rate": 6.603715170278638e-06, | |
| "loss": 2.6667, | |
| "mean_token_accuracy": 0.45012139081954955, | |
| "num_tokens": 1351196.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 7.8328173374613, | |
| "grad_norm": 7.436903953552246, | |
| "learning_rate": 6.510835913312694e-06, | |
| "loss": 2.701, | |
| "mean_token_accuracy": 0.44297235906124116, | |
| "num_tokens": 1356559.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 7.863777089783282, | |
| "grad_norm": 7.845468521118164, | |
| "learning_rate": 6.4179566563467496e-06, | |
| "loss": 2.7405, | |
| "mean_token_accuracy": 0.4466375082731247, | |
| "num_tokens": 1361826.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 7.894736842105263, | |
| "grad_norm": 7.6677985191345215, | |
| "learning_rate": 6.325077399380805e-06, | |
| "loss": 2.6876, | |
| "mean_token_accuracy": 0.4568956196308136, | |
| "num_tokens": 1367324.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 7.925696594427245, | |
| "grad_norm": 6.1485137939453125, | |
| "learning_rate": 6.23219814241486e-06, | |
| "loss": 2.6606, | |
| "mean_token_accuracy": 0.46587195694446565, | |
| "num_tokens": 1372463.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 7.956656346749226, | |
| "grad_norm": 6.914153575897217, | |
| "learning_rate": 6.139318885448917e-06, | |
| "loss": 2.6929, | |
| "mean_token_accuracy": 0.43928238451480867, | |
| "num_tokens": 1377954.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 7.987616099071207, | |
| "grad_norm": 7.685713291168213, | |
| "learning_rate": 6.046439628482973e-06, | |
| "loss": 2.6364, | |
| "mean_token_accuracy": 0.46373621821403505, | |
| "num_tokens": 1383140.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 8.018575851393189, | |
| "grad_norm": 8.195664405822754, | |
| "learning_rate": 5.953560371517028e-06, | |
| "loss": 2.679, | |
| "mean_token_accuracy": 0.4609240561723709, | |
| "num_tokens": 1388084.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 8.04953560371517, | |
| "grad_norm": 9.644103050231934, | |
| "learning_rate": 5.860681114551084e-06, | |
| "loss": 2.5422, | |
| "mean_token_accuracy": 0.47635687291622164, | |
| "num_tokens": 1393610.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 8.08049535603715, | |
| "grad_norm": 7.982243061065674, | |
| "learning_rate": 5.76780185758514e-06, | |
| "loss": 2.5414, | |
| "mean_token_accuracy": 0.47877854108810425, | |
| "num_tokens": 1398897.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 8.111455108359133, | |
| "grad_norm": 7.29356050491333, | |
| "learning_rate": 5.674922600619195e-06, | |
| "loss": 2.5538, | |
| "mean_token_accuracy": 0.470195335149765, | |
| "num_tokens": 1404287.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 8.142414860681114, | |
| "grad_norm": 6.84483003616333, | |
| "learning_rate": 5.582043343653251e-06, | |
| "loss": 2.6535, | |
| "mean_token_accuracy": 0.4449913322925568, | |
| "num_tokens": 1409539.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 8.173374613003096, | |
| "grad_norm": 7.278718948364258, | |
| "learning_rate": 5.4891640866873065e-06, | |
| "loss": 2.6108, | |
| "mean_token_accuracy": 0.4717632919549942, | |
| "num_tokens": 1415058.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 8.204334365325078, | |
| "grad_norm": 7.1849822998046875, | |
| "learning_rate": 5.396284829721362e-06, | |
| "loss": 2.6109, | |
| "mean_token_accuracy": 0.46283615231513975, | |
| "num_tokens": 1420643.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 8.235294117647058, | |
| "grad_norm": 7.145821571350098, | |
| "learning_rate": 5.303405572755418e-06, | |
| "loss": 2.5693, | |
| "mean_token_accuracy": 0.46199973821640017, | |
| "num_tokens": 1425739.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 8.26625386996904, | |
| "grad_norm": 7.301225662231445, | |
| "learning_rate": 5.210526315789474e-06, | |
| "loss": 2.6182, | |
| "mean_token_accuracy": 0.4647399663925171, | |
| "num_tokens": 1430994.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 8.297213622291022, | |
| "grad_norm": 7.631274223327637, | |
| "learning_rate": 5.11764705882353e-06, | |
| "loss": 2.6195, | |
| "mean_token_accuracy": 0.4686288446187973, | |
| "num_tokens": 1436680.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 8.328173374613003, | |
| "grad_norm": 7.482963562011719, | |
| "learning_rate": 5.024767801857585e-06, | |
| "loss": 2.5188, | |
| "mean_token_accuracy": 0.47202539145946504, | |
| "num_tokens": 1442223.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 8.359133126934985, | |
| "grad_norm": 6.863107204437256, | |
| "learning_rate": 4.93188854489164e-06, | |
| "loss": 2.6144, | |
| "mean_token_accuracy": 0.46447587609291074, | |
| "num_tokens": 1448092.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 8.390092879256965, | |
| "grad_norm": 7.401064395904541, | |
| "learning_rate": 4.839009287925697e-06, | |
| "loss": 2.5588, | |
| "mean_token_accuracy": 0.4691672682762146, | |
| "num_tokens": 1453415.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 8.421052631578947, | |
| "grad_norm": 7.8336005210876465, | |
| "learning_rate": 4.746130030959753e-06, | |
| "loss": 2.5518, | |
| "mean_token_accuracy": 0.4755162805318832, | |
| "num_tokens": 1459059.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 8.452012383900929, | |
| "grad_norm": 7.099536418914795, | |
| "learning_rate": 4.653250773993808e-06, | |
| "loss": 2.6405, | |
| "mean_token_accuracy": 0.4545671075582504, | |
| "num_tokens": 1464454.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 8.48297213622291, | |
| "grad_norm": 8.621232032775879, | |
| "learning_rate": 4.560371517027864e-06, | |
| "loss": 2.616, | |
| "mean_token_accuracy": 0.4564115792512894, | |
| "num_tokens": 1470015.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 8.513931888544892, | |
| "grad_norm": 8.490961074829102, | |
| "learning_rate": 4.46749226006192e-06, | |
| "loss": 2.6172, | |
| "mean_token_accuracy": 0.46763722896575927, | |
| "num_tokens": 1475233.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 8.544891640866872, | |
| "grad_norm": 6.8198652267456055, | |
| "learning_rate": 4.374613003095975e-06, | |
| "loss": 2.5141, | |
| "mean_token_accuracy": 0.4751830160617828, | |
| "num_tokens": 1480304.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 8.575851393188854, | |
| "grad_norm": 6.859917640686035, | |
| "learning_rate": 4.281733746130031e-06, | |
| "loss": 2.6015, | |
| "mean_token_accuracy": 0.45911831259727476, | |
| "num_tokens": 1485954.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 8.606811145510836, | |
| "grad_norm": 6.8318681716918945, | |
| "learning_rate": 4.1888544891640874e-06, | |
| "loss": 2.6111, | |
| "mean_token_accuracy": 0.46316723227500917, | |
| "num_tokens": 1491227.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 8.637770897832818, | |
| "grad_norm": 10.72941780090332, | |
| "learning_rate": 4.095975232198142e-06, | |
| "loss": 2.5735, | |
| "mean_token_accuracy": 0.4674788177013397, | |
| "num_tokens": 1496483.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 8.6687306501548, | |
| "grad_norm": 7.597776889801025, | |
| "learning_rate": 4.003095975232198e-06, | |
| "loss": 2.516, | |
| "mean_token_accuracy": 0.47651293873786926, | |
| "num_tokens": 1501699.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 8.69969040247678, | |
| "grad_norm": 7.134951591491699, | |
| "learning_rate": 3.910216718266254e-06, | |
| "loss": 2.607, | |
| "mean_token_accuracy": 0.4592158287763596, | |
| "num_tokens": 1507242.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 8.730650154798761, | |
| "grad_norm": 8.513171195983887, | |
| "learning_rate": 3.81733746130031e-06, | |
| "loss": 2.5366, | |
| "mean_token_accuracy": 0.4778905093669891, | |
| "num_tokens": 1512352.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 8.761609907120743, | |
| "grad_norm": 7.019115447998047, | |
| "learning_rate": 3.7244582043343655e-06, | |
| "loss": 2.7203, | |
| "mean_token_accuracy": 0.4413450926542282, | |
| "num_tokens": 1517522.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 8.792569659442725, | |
| "grad_norm": 7.2936248779296875, | |
| "learning_rate": 3.6315789473684213e-06, | |
| "loss": 2.5408, | |
| "mean_token_accuracy": 0.47601982653141023, | |
| "num_tokens": 1523192.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 8.823529411764707, | |
| "grad_norm": 7.474860191345215, | |
| "learning_rate": 3.5386996904024766e-06, | |
| "loss": 2.6206, | |
| "mean_token_accuracy": 0.4601287513971329, | |
| "num_tokens": 1528649.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 8.854489164086687, | |
| "grad_norm": 7.905037879943848, | |
| "learning_rate": 3.4458204334365324e-06, | |
| "loss": 2.6112, | |
| "mean_token_accuracy": 0.46144779920578005, | |
| "num_tokens": 1533699.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 8.885448916408668, | |
| "grad_norm": 9.305508613586426, | |
| "learning_rate": 3.3529411764705886e-06, | |
| "loss": 2.6039, | |
| "mean_token_accuracy": 0.45062055587768557, | |
| "num_tokens": 1539345.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 8.91640866873065, | |
| "grad_norm": 8.350320816040039, | |
| "learning_rate": 3.260061919504644e-06, | |
| "loss": 2.5891, | |
| "mean_token_accuracy": 0.46280418038368226, | |
| "num_tokens": 1544487.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 8.947368421052632, | |
| "grad_norm": 8.346325874328613, | |
| "learning_rate": 3.1671826625386998e-06, | |
| "loss": 2.5341, | |
| "mean_token_accuracy": 0.47052243947982786, | |
| "num_tokens": 1549692.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 8.978328173374614, | |
| "grad_norm": 8.169231414794922, | |
| "learning_rate": 3.0743034055727555e-06, | |
| "loss": 2.6247, | |
| "mean_token_accuracy": 0.46556260883808137, | |
| "num_tokens": 1554596.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 9.009287925696594, | |
| "grad_norm": 7.494675636291504, | |
| "learning_rate": 2.9814241486068113e-06, | |
| "loss": 2.5711, | |
| "mean_token_accuracy": 0.4640702366828918, | |
| "num_tokens": 1559968.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 9.040247678018575, | |
| "grad_norm": 7.734643936157227, | |
| "learning_rate": 2.8885448916408667e-06, | |
| "loss": 2.5401, | |
| "mean_token_accuracy": 0.4737162679433823, | |
| "num_tokens": 1565380.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 9.071207430340557, | |
| "grad_norm": 8.363790512084961, | |
| "learning_rate": 2.7956656346749225e-06, | |
| "loss": 2.3902, | |
| "mean_token_accuracy": 0.5005412518978118, | |
| "num_tokens": 1570701.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 9.102167182662539, | |
| "grad_norm": 8.183832168579102, | |
| "learning_rate": 2.7027863777089787e-06, | |
| "loss": 2.5151, | |
| "mean_token_accuracy": 0.47198016941547394, | |
| "num_tokens": 1575669.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 9.13312693498452, | |
| "grad_norm": 8.469263076782227, | |
| "learning_rate": 2.609907120743034e-06, | |
| "loss": 2.5915, | |
| "mean_token_accuracy": 0.4714679390192032, | |
| "num_tokens": 1580976.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 9.1640866873065, | |
| "grad_norm": 8.882843971252441, | |
| "learning_rate": 2.51702786377709e-06, | |
| "loss": 2.5683, | |
| "mean_token_accuracy": 0.470071816444397, | |
| "num_tokens": 1586021.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 9.195046439628483, | |
| "grad_norm": 7.277812957763672, | |
| "learning_rate": 2.4241486068111456e-06, | |
| "loss": 2.5238, | |
| "mean_token_accuracy": 0.47225017547607423, | |
| "num_tokens": 1591414.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 9.226006191950464, | |
| "grad_norm": 10.459854125976562, | |
| "learning_rate": 2.3312693498452014e-06, | |
| "loss": 2.5775, | |
| "mean_token_accuracy": 0.4601638674736023, | |
| "num_tokens": 1597450.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 9.256965944272446, | |
| "grad_norm": 9.789606094360352, | |
| "learning_rate": 2.238390092879257e-06, | |
| "loss": 2.5546, | |
| "mean_token_accuracy": 0.47142831385135653, | |
| "num_tokens": 1602666.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 9.287925696594428, | |
| "grad_norm": 7.218743801116943, | |
| "learning_rate": 2.145510835913313e-06, | |
| "loss": 2.5998, | |
| "mean_token_accuracy": 0.46776662170886996, | |
| "num_tokens": 1608362.0, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3230, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.550181169982669e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |