{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.405572755417957,
  "eval_steps": 500,
  "global_step": 1100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030959752321981424,
      "grad_norm": 9.773098945617676,
      "learning_rate": 2.991640866873065e-05,
      "loss": 9.245,
      "mean_token_accuracy": 0.2123243510723114,
      "num_tokens": 5327.0,
      "step": 10
    },
    {
      "epoch": 0.06191950464396285,
      "grad_norm": 3.128091335296631,
      "learning_rate": 2.9823529411764707e-05,
      "loss": 6.1602,
      "mean_token_accuracy": 0.25044268518686297,
      "num_tokens": 10795.0,
      "step": 20
    },
    {
      "epoch": 0.09287925696594428,
      "grad_norm": 4.203906536102295,
      "learning_rate": 2.973065015479876e-05,
      "loss": 5.6948,
      "mean_token_accuracy": 0.262071692943573,
      "num_tokens": 16240.0,
      "step": 30
    },
    {
      "epoch": 0.1238390092879257,
      "grad_norm": 4.233511924743652,
      "learning_rate": 2.9637770897832817e-05,
      "loss": 5.2733,
      "mean_token_accuracy": 0.27527774721384046,
      "num_tokens": 21582.0,
      "step": 40
    },
    {
      "epoch": 0.15479876160990713,
      "grad_norm": 7.080459117889404,
      "learning_rate": 2.9544891640866874e-05,
      "loss": 4.9304,
      "mean_token_accuracy": 0.2877007365226746,
      "num_tokens": 27142.0,
      "step": 50
    },
    {
      "epoch": 0.18575851393188855,
      "grad_norm": 7.273204326629639,
      "learning_rate": 2.945201238390093e-05,
      "loss": 4.689,
      "mean_token_accuracy": 0.28890604972839357,
      "num_tokens": 32801.0,
      "step": 60
    },
    {
      "epoch": 0.21671826625386997,
      "grad_norm": 2.2185206413269043,
      "learning_rate": 2.9359133126934984e-05,
      "loss": 4.3965,
      "mean_token_accuracy": 0.28117197155952456,
      "num_tokens": 38472.0,
      "step": 70
    },
    {
      "epoch": 0.2476780185758514,
      "grad_norm": 2.0464794635772705,
      "learning_rate": 2.926625386996904e-05,
      "loss": 4.062,
      "mean_token_accuracy": 0.299494668841362,
      "num_tokens": 43743.0,
      "step": 80
    },
    {
      "epoch": 0.2786377708978328,
      "grad_norm": 1.633155345916748,
      "learning_rate": 2.9173374613003097e-05,
      "loss": 4.0378,
      "mean_token_accuracy": 0.30819864571094513,
      "num_tokens": 49087.0,
      "step": 90
    },
    {
      "epoch": 0.30959752321981426,
      "grad_norm": 1.4594247341156006,
      "learning_rate": 2.908049535603715e-05,
      "loss": 3.8513,
      "mean_token_accuracy": 0.3258361428976059,
      "num_tokens": 54433.0,
      "step": 100
    },
    {
      "epoch": 0.34055727554179566,
      "grad_norm": 1.5312635898590088,
      "learning_rate": 2.898761609907121e-05,
      "loss": 3.9162,
      "mean_token_accuracy": 0.32140363454818727,
      "num_tokens": 59629.0,
      "step": 110
    },
    {
      "epoch": 0.3715170278637771,
      "grad_norm": 1.3190491199493408,
      "learning_rate": 2.8894736842105263e-05,
      "loss": 3.902,
      "mean_token_accuracy": 0.3103078156709671,
      "num_tokens": 65326.0,
      "step": 120
    },
    {
      "epoch": 0.4024767801857585,
      "grad_norm": 1.6095689535140991,
      "learning_rate": 2.880185758513932e-05,
      "loss": 3.7107,
      "mean_token_accuracy": 0.3353793561458588,
      "num_tokens": 70440.0,
      "step": 130
    },
    {
      "epoch": 0.43343653250773995,
      "grad_norm": 1.6634972095489502,
      "learning_rate": 2.8708978328173377e-05,
      "loss": 3.7747,
      "mean_token_accuracy": 0.3298566401004791,
      "num_tokens": 75712.0,
      "step": 140
    },
    {
      "epoch": 0.46439628482972134,
      "grad_norm": 1.3906605243682861,
      "learning_rate": 2.861609907120743e-05,
      "loss": 3.7344,
      "mean_token_accuracy": 0.34043932259082793,
      "num_tokens": 81272.0,
      "step": 150
    },
    {
      "epoch": 0.4953560371517028,
      "grad_norm": 1.6273926496505737,
      "learning_rate": 2.8523219814241487e-05,
      "loss": 3.6722,
      "mean_token_accuracy": 0.33802524507045745,
      "num_tokens": 86836.0,
      "step": 160
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 1.595566987991333,
      "learning_rate": 2.8430340557275543e-05,
      "loss": 3.5486,
      "mean_token_accuracy": 0.36929037272930143,
      "num_tokens": 91622.0,
      "step": 170
    },
    {
      "epoch": 0.5572755417956656,
      "grad_norm": 1.9571454524993896,
      "learning_rate": 2.83374613003096e-05,
      "loss": 3.6849,
      "mean_token_accuracy": 0.3387055486440659,
      "num_tokens": 97019.0,
      "step": 180
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 1.6203333139419556,
      "learning_rate": 2.8244582043343653e-05,
      "loss": 3.5592,
      "mean_token_accuracy": 0.36260710954666137,
      "num_tokens": 102273.0,
      "step": 190
    },
    {
      "epoch": 0.6191950464396285,
      "grad_norm": 1.8625439405441284,
      "learning_rate": 2.815170278637771e-05,
      "loss": 3.4542,
      "mean_token_accuracy": 0.3554231733083725,
      "num_tokens": 107847.0,
      "step": 200
    },
    {
      "epoch": 0.6501547987616099,
      "grad_norm": 1.5171610116958618,
      "learning_rate": 2.8058823529411766e-05,
      "loss": 3.6914,
      "mean_token_accuracy": 0.3506886214017868,
      "num_tokens": 113499.0,
      "step": 210
    },
    {
      "epoch": 0.6811145510835913,
      "grad_norm": 1.465408205986023,
      "learning_rate": 2.796594427244582e-05,
      "loss": 3.6008,
      "mean_token_accuracy": 0.3558589071035385,
      "num_tokens": 119014.0,
      "step": 220
    },
    {
      "epoch": 0.7120743034055728,
      "grad_norm": 1.5382874011993408,
      "learning_rate": 2.787306501547988e-05,
      "loss": 3.5375,
      "mean_token_accuracy": 0.3548148155212402,
      "num_tokens": 124170.0,
      "step": 230
    },
    {
      "epoch": 0.7430340557275542,
      "grad_norm": 1.773881196975708,
      "learning_rate": 2.7780185758513933e-05,
      "loss": 3.573,
      "mean_token_accuracy": 0.3465736091136932,
      "num_tokens": 129487.0,
      "step": 240
    },
    {
      "epoch": 0.7739938080495357,
      "grad_norm": 1.7652744054794312,
      "learning_rate": 2.7687306501547986e-05,
      "loss": 3.6811,
      "mean_token_accuracy": 0.33623204231262205,
      "num_tokens": 135007.0,
      "step": 250
    },
    {
      "epoch": 0.804953560371517,
      "grad_norm": 1.7662419080734253,
      "learning_rate": 2.7594427244582046e-05,
      "loss": 3.505,
      "mean_token_accuracy": 0.3567329585552216,
      "num_tokens": 140143.0,
      "step": 260
    },
    {
      "epoch": 0.8359133126934984,
      "grad_norm": 1.9441474676132202,
      "learning_rate": 2.75015479876161e-05,
      "loss": 3.4804,
      "mean_token_accuracy": 0.36218210160732267,
      "num_tokens": 145363.0,
      "step": 270
    },
    {
      "epoch": 0.8668730650154799,
      "grad_norm": 1.745896816253662,
      "learning_rate": 2.7408668730650156e-05,
      "loss": 3.6519,
      "mean_token_accuracy": 0.34941086173057556,
      "num_tokens": 150840.0,
      "step": 280
    },
    {
      "epoch": 0.8978328173374613,
      "grad_norm": 1.928284764289856,
      "learning_rate": 2.7315789473684213e-05,
      "loss": 3.6138,
      "mean_token_accuracy": 0.34826839864254,
      "num_tokens": 156077.0,
      "step": 290
    },
    {
      "epoch": 0.9287925696594427,
      "grad_norm": 2.177100896835327,
      "learning_rate": 2.722291021671827e-05,
      "loss": 3.4666,
      "mean_token_accuracy": 0.36537405848503113,
      "num_tokens": 160953.0,
      "step": 300
    },
    {
      "epoch": 0.9597523219814241,
      "grad_norm": 2.203282594680786,
      "learning_rate": 2.7130030959752322e-05,
      "loss": 3.5842,
      "mean_token_accuracy": 0.34776660799980164,
      "num_tokens": 166286.0,
      "step": 310
    },
    {
      "epoch": 0.9907120743034056,
      "grad_norm": 1.724373459815979,
      "learning_rate": 2.7037151702786376e-05,
      "loss": 3.4961,
      "mean_token_accuracy": 0.35577190220355986,
      "num_tokens": 171629.0,
      "step": 320
    },
    {
      "epoch": 1.021671826625387,
      "grad_norm": 1.7433867454528809,
      "learning_rate": 2.6944272445820436e-05,
      "loss": 3.4088,
      "mean_token_accuracy": 0.36975419521331787,
      "num_tokens": 176968.0,
      "step": 330
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 2.0577471256256104,
      "learning_rate": 2.685139318885449e-05,
      "loss": 3.5364,
      "mean_token_accuracy": 0.35527182221412656,
      "num_tokens": 182390.0,
      "step": 340
    },
    {
      "epoch": 1.08359133126935,
      "grad_norm": 1.7357635498046875,
      "learning_rate": 2.6758513931888546e-05,
      "loss": 3.4495,
      "mean_token_accuracy": 0.3565235286951065,
      "num_tokens": 188181.0,
      "step": 350
    },
    {
      "epoch": 1.1145510835913313,
      "grad_norm": 2.024507761001587,
      "learning_rate": 2.6665634674922602e-05,
      "loss": 3.3766,
      "mean_token_accuracy": 0.3696540713310242,
      "num_tokens": 193427.0,
      "step": 360
    },
    {
      "epoch": 1.1455108359133126,
      "grad_norm": 2.1170592308044434,
      "learning_rate": 2.6572755417956655e-05,
      "loss": 3.4759,
      "mean_token_accuracy": 0.35803447663784027,
      "num_tokens": 199046.0,
      "step": 370
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 2.118878126144409,
      "learning_rate": 2.6479876160990712e-05,
      "loss": 3.4685,
      "mean_token_accuracy": 0.3570233076810837,
      "num_tokens": 204089.0,
      "step": 380
    },
    {
      "epoch": 1.2074303405572755,
      "grad_norm": 2.2280914783477783,
      "learning_rate": 2.638699690402477e-05,
      "loss": 3.4822,
      "mean_token_accuracy": 0.36153341829776764,
      "num_tokens": 209656.0,
      "step": 390
    },
    {
      "epoch": 1.238390092879257,
      "grad_norm": 2.444979667663574,
      "learning_rate": 2.6294117647058825e-05,
      "loss": 3.3666,
      "mean_token_accuracy": 0.3748770415782928,
      "num_tokens": 214465.0,
      "step": 400
    },
    {
      "epoch": 1.2693498452012384,
      "grad_norm": 1.9609659910202026,
      "learning_rate": 2.620123839009288e-05,
      "loss": 3.4161,
      "mean_token_accuracy": 0.3594685852527618,
      "num_tokens": 219728.0,
      "step": 410
    },
    {
      "epoch": 1.3003095975232197,
      "grad_norm": 1.9759095907211304,
      "learning_rate": 2.6108359133126935e-05,
      "loss": 3.405,
      "mean_token_accuracy": 0.3668099522590637,
      "num_tokens": 224992.0,
      "step": 420
    },
    {
      "epoch": 1.3312693498452013,
      "grad_norm": 2.1737940311431885,
      "learning_rate": 2.6015479876160992e-05,
      "loss": 3.3809,
      "mean_token_accuracy": 0.37462269365787504,
      "num_tokens": 230431.0,
      "step": 430
    },
    {
      "epoch": 1.3622291021671826,
      "grad_norm": 2.475351333618164,
      "learning_rate": 2.5922600619195045e-05,
      "loss": 3.3454,
      "mean_token_accuracy": 0.36902076900005343,
      "num_tokens": 235868.0,
      "step": 440
    },
    {
      "epoch": 1.3931888544891642,
      "grad_norm": 2.1027772426605225,
      "learning_rate": 2.5829721362229105e-05,
      "loss": 3.3967,
      "mean_token_accuracy": 0.3779816538095474,
      "num_tokens": 241149.0,
      "step": 450
    },
    {
      "epoch": 1.4241486068111455,
      "grad_norm": 2.613186836242676,
      "learning_rate": 2.5736842105263158e-05,
      "loss": 3.3383,
      "mean_token_accuracy": 0.37455591559410095,
      "num_tokens": 246200.0,
      "step": 460
    },
    {
      "epoch": 1.4551083591331269,
      "grad_norm": 2.1689629554748535,
      "learning_rate": 2.5643962848297215e-05,
      "loss": 3.4927,
      "mean_token_accuracy": 0.3641968876123428,
      "num_tokens": 251354.0,
      "step": 470
    },
    {
      "epoch": 1.4860681114551084,
      "grad_norm": 1.9075849056243896,
      "learning_rate": 2.555108359133127e-05,
      "loss": 3.4211,
      "mean_token_accuracy": 0.3670921057462692,
      "num_tokens": 257201.0,
      "step": 480
    },
    {
      "epoch": 1.5170278637770898,
      "grad_norm": 2.128737211227417,
      "learning_rate": 2.5458204334365325e-05,
      "loss": 3.3306,
      "mean_token_accuracy": 0.3747966349124908,
      "num_tokens": 262640.0,
      "step": 490
    },
    {
      "epoch": 1.5479876160990713,
      "grad_norm": 1.9061874151229858,
      "learning_rate": 2.536532507739938e-05,
      "loss": 3.3874,
      "mean_token_accuracy": 0.3706284284591675,
      "num_tokens": 268129.0,
      "step": 500
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 1.8868329524993896,
      "learning_rate": 2.5272445820433438e-05,
      "loss": 3.3185,
      "mean_token_accuracy": 0.37699449956417086,
      "num_tokens": 273489.0,
      "step": 510
    },
    {
      "epoch": 1.609907120743034,
      "grad_norm": 1.8507658243179321,
      "learning_rate": 2.5179566563467495e-05,
      "loss": 3.2901,
      "mean_token_accuracy": 0.3797824054956436,
      "num_tokens": 279142.0,
      "step": 520
    },
    {
      "epoch": 1.6408668730650153,
      "grad_norm": 2.396951198577881,
      "learning_rate": 2.5086687306501548e-05,
      "loss": 3.3503,
      "mean_token_accuracy": 0.37934728860855105,
      "num_tokens": 284049.0,
      "step": 530
    },
    {
      "epoch": 1.671826625386997,
      "grad_norm": 2.256753921508789,
      "learning_rate": 2.4993808049535605e-05,
      "loss": 3.3706,
      "mean_token_accuracy": 0.38219387233257296,
      "num_tokens": 289514.0,
      "step": 540
    },
    {
      "epoch": 1.7027863777089784,
      "grad_norm": 1.9369879961013794,
      "learning_rate": 2.490092879256966e-05,
      "loss": 3.3615,
      "mean_token_accuracy": 0.37476457953453063,
      "num_tokens": 295091.0,
      "step": 550
    },
    {
      "epoch": 1.7337461300309598,
      "grad_norm": 2.6628825664520264,
      "learning_rate": 2.4808049535603714e-05,
      "loss": 3.3121,
      "mean_token_accuracy": 0.3830788493156433,
      "num_tokens": 300102.0,
      "step": 560
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 2.6188621520996094,
      "learning_rate": 2.4715170278637774e-05,
      "loss": 3.4005,
      "mean_token_accuracy": 0.3631168991327286,
      "num_tokens": 305652.0,
      "step": 570
    },
    {
      "epoch": 1.7956656346749225,
      "grad_norm": 2.093585968017578,
      "learning_rate": 2.4622291021671828e-05,
      "loss": 3.4483,
      "mean_token_accuracy": 0.3639900177717209,
      "num_tokens": 311333.0,
      "step": 580
    },
    {
      "epoch": 1.826625386996904,
      "grad_norm": 2.355714797973633,
      "learning_rate": 2.452941176470588e-05,
      "loss": 3.2946,
      "mean_token_accuracy": 0.3784641414880753,
      "num_tokens": 316699.0,
      "step": 590
    },
    {
      "epoch": 1.8575851393188856,
      "grad_norm": 2.5051403045654297,
      "learning_rate": 2.4436532507739938e-05,
      "loss": 3.2612,
      "mean_token_accuracy": 0.3874821364879608,
      "num_tokens": 321718.0,
      "step": 600
    },
    {
      "epoch": 1.888544891640867,
      "grad_norm": 2.4884731769561768,
      "learning_rate": 2.4343653250773994e-05,
      "loss": 3.3375,
      "mean_token_accuracy": 0.37331779301166534,
      "num_tokens": 327347.0,
      "step": 610
    },
    {
      "epoch": 1.9195046439628483,
      "grad_norm": 2.7246131896972656,
      "learning_rate": 2.425077399380805e-05,
      "loss": 3.3235,
      "mean_token_accuracy": 0.38473727405071256,
      "num_tokens": 332563.0,
      "step": 620
    },
    {
      "epoch": 1.9504643962848296,
      "grad_norm": 2.420604705810547,
      "learning_rate": 2.4157894736842104e-05,
      "loss": 3.4027,
      "mean_token_accuracy": 0.36371313631534574,
      "num_tokens": 337880.0,
      "step": 630
    },
    {
      "epoch": 1.9814241486068112,
      "grad_norm": 2.1465680599212646,
      "learning_rate": 2.4065015479876164e-05,
      "loss": 3.3508,
      "mean_token_accuracy": 0.3747645616531372,
      "num_tokens": 343193.0,
      "step": 640
    },
    {
      "epoch": 2.0123839009287927,
      "grad_norm": 2.0795834064483643,
      "learning_rate": 2.3972136222910217e-05,
      "loss": 3.3529,
      "mean_token_accuracy": 0.3744541972875595,
      "num_tokens": 348460.0,
      "step": 650
    },
    {
      "epoch": 2.043343653250774,
      "grad_norm": 2.4051778316497803,
      "learning_rate": 2.387925696594427e-05,
      "loss": 3.2404,
      "mean_token_accuracy": 0.38319246768951415,
      "num_tokens": 353701.0,
      "step": 660
    },
    {
      "epoch": 2.0743034055727554,
      "grad_norm": 2.401045322418213,
      "learning_rate": 2.378637770897833e-05,
      "loss": 3.2801,
      "mean_token_accuracy": 0.3773155301809311,
      "num_tokens": 359147.0,
      "step": 670
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 2.554138422012329,
      "learning_rate": 2.3693498452012384e-05,
      "loss": 3.2227,
      "mean_token_accuracy": 0.37754152715206146,
      "num_tokens": 364696.0,
      "step": 680
    },
    {
      "epoch": 2.136222910216718,
      "grad_norm": 2.4874625205993652,
      "learning_rate": 2.360061919504644e-05,
      "loss": 3.2534,
      "mean_token_accuracy": 0.3920708328485489,
      "num_tokens": 370207.0,
      "step": 690
    },
    {
      "epoch": 2.1671826625387,
      "grad_norm": 2.638068675994873,
      "learning_rate": 2.3507739938080497e-05,
      "loss": 3.2635,
      "mean_token_accuracy": 0.3835586577653885,
      "num_tokens": 375977.0,
      "step": 700
    },
    {
      "epoch": 2.198142414860681,
      "grad_norm": 2.5179686546325684,
      "learning_rate": 2.341486068111455e-05,
      "loss": 3.1599,
      "mean_token_accuracy": 0.39867666363716125,
      "num_tokens": 381398.0,
      "step": 710
    },
    {
      "epoch": 2.2291021671826625,
      "grad_norm": 2.9102959632873535,
      "learning_rate": 2.3321981424148607e-05,
      "loss": 3.2686,
      "mean_token_accuracy": 0.3878729552030563,
      "num_tokens": 386460.0,
      "step": 720
    },
    {
      "epoch": 2.260061919504644,
      "grad_norm": 2.641160726547241,
      "learning_rate": 2.3229102167182663e-05,
      "loss": 3.2218,
      "mean_token_accuracy": 0.38672482669353486,
      "num_tokens": 391538.0,
      "step": 730
    },
    {
      "epoch": 2.291021671826625,
      "grad_norm": 2.947000026702881,
      "learning_rate": 2.313622291021672e-05,
      "loss": 3.2215,
      "mean_token_accuracy": 0.38650966584682467,
      "num_tokens": 396881.0,
      "step": 740
    },
    {
      "epoch": 2.321981424148607,
      "grad_norm": 2.3614673614501953,
      "learning_rate": 2.3043343653250773e-05,
      "loss": 3.2233,
      "mean_token_accuracy": 0.388895845413208,
      "num_tokens": 402418.0,
      "step": 750
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 2.492814302444458,
      "learning_rate": 2.295046439628483e-05,
      "loss": 3.2005,
      "mean_token_accuracy": 0.382270821928978,
      "num_tokens": 407733.0,
      "step": 760
    },
    {
      "epoch": 2.3839009287925697,
      "grad_norm": 2.646655321121216,
      "learning_rate": 2.2857585139318887e-05,
      "loss": 3.1642,
      "mean_token_accuracy": 0.388735693693161,
      "num_tokens": 413101.0,
      "step": 770
    },
    {
      "epoch": 2.414860681114551,
      "grad_norm": 2.782440662384033,
      "learning_rate": 2.276470588235294e-05,
      "loss": 3.2513,
      "mean_token_accuracy": 0.38574750125408175,
      "num_tokens": 418754.0,
      "step": 780
    },
    {
      "epoch": 2.4458204334365323,
      "grad_norm": 2.7094547748565674,
      "learning_rate": 2.2671826625387e-05,
      "loss": 3.3007,
      "mean_token_accuracy": 0.38453402519226076,
      "num_tokens": 424232.0,
      "step": 790
    },
    {
      "epoch": 2.476780185758514,
      "grad_norm": 2.697098970413208,
      "learning_rate": 2.2578947368421053e-05,
      "loss": 3.254,
      "mean_token_accuracy": 0.3893850326538086,
      "num_tokens": 429369.0,
      "step": 800
    },
    {
      "epoch": 2.5077399380804954,
      "grad_norm": 3.2908523082733154,
      "learning_rate": 2.248606811145511e-05,
      "loss": 3.1944,
      "mean_token_accuracy": 0.39619002044200896,
      "num_tokens": 434497.0,
      "step": 810
    },
    {
      "epoch": 2.538699690402477,
      "grad_norm": 3.068455696105957,
      "learning_rate": 2.2393188854489166e-05,
      "loss": 3.2034,
      "mean_token_accuracy": 0.3942800432443619,
      "num_tokens": 439892.0,
      "step": 820
    },
    {
      "epoch": 2.569659442724458,
      "grad_norm": 2.7893826961517334,
      "learning_rate": 2.230030959752322e-05,
      "loss": 3.0723,
      "mean_token_accuracy": 0.39791189730167387,
      "num_tokens": 445189.0,
      "step": 830
    },
    {
      "epoch": 2.6006191950464395,
      "grad_norm": 2.7569446563720703,
      "learning_rate": 2.2207430340557276e-05,
      "loss": 3.187,
      "mean_token_accuracy": 0.3884226083755493,
      "num_tokens": 450338.0,
      "step": 840
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 3.16340708732605,
      "learning_rate": 2.2114551083591333e-05,
      "loss": 3.1942,
      "mean_token_accuracy": 0.3921455442905426,
      "num_tokens": 455583.0,
      "step": 850
    },
    {
      "epoch": 2.6625386996904026,
      "grad_norm": 2.549273729324341,
      "learning_rate": 2.202167182662539e-05,
      "loss": 3.278,
      "mean_token_accuracy": 0.3782683253288269,
      "num_tokens": 461089.0,
      "step": 860
    },
    {
      "epoch": 2.693498452012384,
      "grad_norm": 3.216149091720581,
      "learning_rate": 2.1928792569659443e-05,
      "loss": 3.2023,
      "mean_token_accuracy": 0.3940991997718811,
      "num_tokens": 466239.0,
      "step": 870
    },
    {
      "epoch": 2.7244582043343653,
      "grad_norm": 2.4680261611938477,
      "learning_rate": 2.18359133126935e-05,
      "loss": 3.2017,
      "mean_token_accuracy": 0.3878098428249359,
      "num_tokens": 471660.0,
      "step": 880
    },
    {
      "epoch": 2.7554179566563466,
      "grad_norm": 3.6166999340057373,
      "learning_rate": 2.1743034055727556e-05,
      "loss": 3.2068,
      "mean_token_accuracy": 0.3789886265993118,
      "num_tokens": 476713.0,
      "step": 890
    },
    {
      "epoch": 2.7863777089783284,
      "grad_norm": 2.0997393131256104,
      "learning_rate": 2.165015479876161e-05,
      "loss": 3.2216,
      "mean_token_accuracy": 0.3913588523864746,
      "num_tokens": 482569.0,
      "step": 900
    },
    {
      "epoch": 2.8173374613003097,
      "grad_norm": 3.0189061164855957,
      "learning_rate": 2.1557275541795666e-05,
      "loss": 3.1346,
      "mean_token_accuracy": 0.3967192888259888,
      "num_tokens": 487629.0,
      "step": 910
    },
    {
      "epoch": 2.848297213622291,
      "grad_norm": 2.8610568046569824,
      "learning_rate": 2.1464396284829722e-05,
      "loss": 3.2061,
      "mean_token_accuracy": 0.3921816825866699,
      "num_tokens": 493227.0,
      "step": 920
    },
    {
      "epoch": 2.8792569659442724,
      "grad_norm": 3.023012638092041,
      "learning_rate": 2.1371517027863776e-05,
      "loss": 3.2208,
      "mean_token_accuracy": 0.3786366432905197,
      "num_tokens": 498394.0,
      "step": 930
    },
    {
      "epoch": 2.9102167182662537,
      "grad_norm": 2.908886194229126,
      "learning_rate": 2.1278637770897832e-05,
      "loss": 3.1648,
      "mean_token_accuracy": 0.3914026439189911,
      "num_tokens": 503752.0,
      "step": 940
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 3.078397750854492,
      "learning_rate": 2.118575851393189e-05,
      "loss": 3.1891,
      "mean_token_accuracy": 0.39542897045612335,
      "num_tokens": 509218.0,
      "step": 950
    },
    {
      "epoch": 2.972136222910217,
      "grad_norm": 2.740389823913574,
      "learning_rate": 2.1092879256965946e-05,
      "loss": 3.1953,
      "mean_token_accuracy": 0.3934174537658691,
      "num_tokens": 514716.0,
      "step": 960
    },
    {
      "epoch": 3.003095975232198,
      "grad_norm": 2.6426961421966553,
      "learning_rate": 2.1e-05,
      "loss": 3.1084,
      "mean_token_accuracy": 0.39845702350139617,
      "num_tokens": 520000.0,
      "step": 970
    },
    {
      "epoch": 3.0340557275541795,
      "grad_norm": 2.8173840045928955,
      "learning_rate": 2.090712074303406e-05,
      "loss": 3.1574,
      "mean_token_accuracy": 0.39481441378593446,
      "num_tokens": 525561.0,
      "step": 980
    },
    {
      "epoch": 3.065015479876161,
      "grad_norm": 3.29856014251709,
      "learning_rate": 2.0814241486068112e-05,
      "loss": 3.0613,
      "mean_token_accuracy": 0.4054213762283325,
      "num_tokens": 531021.0,
      "step": 990
    },
    {
      "epoch": 3.0959752321981426,
      "grad_norm": 3.463890314102173,
      "learning_rate": 2.0721362229102165e-05,
      "loss": 3.0282,
      "mean_token_accuracy": 0.4025467813014984,
      "num_tokens": 536260.0,
      "step": 1000
    },
    {
      "epoch": 3.126934984520124,
      "grad_norm": 3.134387731552124,
      "learning_rate": 2.0628482972136225e-05,
      "loss": 3.1147,
      "mean_token_accuracy": 0.3955592781305313,
      "num_tokens": 541721.0,
      "step": 1010
    },
    {
      "epoch": 3.1578947368421053,
      "grad_norm": 3.237518072128296,
      "learning_rate": 2.053560371517028e-05,
      "loss": 3.0828,
      "mean_token_accuracy": 0.40579850077629087,
      "num_tokens": 547334.0,
      "step": 1020
    },
    {
      "epoch": 3.1888544891640866,
      "grad_norm": 3.2742724418640137,
      "learning_rate": 2.0442724458204335e-05,
      "loss": 3.0907,
      "mean_token_accuracy": 0.40091423988342284,
      "num_tokens": 552904.0,
      "step": 1030
    },
    {
      "epoch": 3.219814241486068,
      "grad_norm": 3.0646955966949463,
      "learning_rate": 2.0349845201238392e-05,
      "loss": 3.0758,
      "mean_token_accuracy": 0.40378672182559966,
      "num_tokens": 557743.0,
      "step": 1040
    },
    {
      "epoch": 3.2507739938080498,
      "grad_norm": 3.2615506649017334,
      "learning_rate": 2.0256965944272445e-05,
      "loss": 3.0844,
      "mean_token_accuracy": 0.41148235499858854,
      "num_tokens": 563604.0,
      "step": 1050
    },
    {
      "epoch": 3.281733746130031,
      "grad_norm": 3.001723527908325,
      "learning_rate": 2.0164086687306502e-05,
      "loss": 2.9993,
      "mean_token_accuracy": 0.41538413166999816,
      "num_tokens": 568980.0,
      "step": 1060
    },
    {
      "epoch": 3.3126934984520124,
      "grad_norm": 3.545367956161499,
      "learning_rate": 2.007120743034056e-05,
      "loss": 3.0765,
      "mean_token_accuracy": 0.40937572419643403,
      "num_tokens": 573896.0,
      "step": 1070
    },
    {
      "epoch": 3.343653250773994,
      "grad_norm": 3.4989709854125977,
      "learning_rate": 1.9978328173374615e-05,
      "loss": 3.1538,
      "mean_token_accuracy": 0.3883706986904144,
      "num_tokens": 579254.0,
      "step": 1080
    },
    {
      "epoch": 3.374613003095975,
      "grad_norm": 3.964334487915039,
      "learning_rate": 1.9885448916408668e-05,
      "loss": 3.1399,
      "mean_token_accuracy": 0.3968294531106949,
      "num_tokens": 584603.0,
      "step": 1090
    },
    {
      "epoch": 3.405572755417957,
      "grad_norm": 4.132855415344238,
      "learning_rate": 1.9792569659442725e-05,
      "loss": 3.1428,
      "mean_token_accuracy": 0.3817721128463745,
      "num_tokens": 590267.0,
      "step": 1100
    }
  ],
  "logging_steps": 10,
  "max_steps": 3230,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.778658848011059e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}