{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9999270764967549,
  "eval_steps": 500,
  "global_step": 1714,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002916940129803836,
      "grad_norm": 9.421533600382833,
      "learning_rate": 1.1627906976744187e-07,
      "loss": 1.4153,
      "num_tokens": 671048.0,
      "step": 5
    },
    {
      "epoch": 0.005833880259607672,
      "grad_norm": 9.417087087487797,
      "learning_rate": 2.616279069767442e-07,
      "loss": 1.4115,
      "num_tokens": 1340514.0,
      "step": 10
    },
    {
      "epoch": 0.008750820389411508,
      "grad_norm": 8.64652450056725,
      "learning_rate": 4.0697674418604655e-07,
      "loss": 1.3864,
      "num_tokens": 1975142.0,
      "step": 15
    },
    {
      "epoch": 0.011667760519215344,
      "grad_norm": 6.364802582777174,
      "learning_rate": 5.523255813953489e-07,
      "loss": 1.2845,
      "num_tokens": 2629706.0,
      "step": 20
    },
    {
      "epoch": 0.01458470064901918,
      "grad_norm": 4.036625471123866,
      "learning_rate": 6.976744186046513e-07,
      "loss": 1.1622,
      "num_tokens": 3282682.0,
      "step": 25
    },
    {
      "epoch": 0.017501640778823015,
      "grad_norm": 4.1984081053855205,
      "learning_rate": 8.430232558139535e-07,
      "loss": 0.9647,
      "num_tokens": 3956656.0,
      "step": 30
    },
    {
      "epoch": 0.02041858090862685,
      "grad_norm": 2.9803387136629893,
      "learning_rate": 9.88372093023256e-07,
      "loss": 0.7913,
      "num_tokens": 4609315.0,
      "step": 35
    },
    {
      "epoch": 0.023335521038430687,
      "grad_norm": 1.5758517615202334,
      "learning_rate": 1.1337209302325581e-06,
      "loss": 0.6332,
      "num_tokens": 5258689.0,
      "step": 40
    },
    {
      "epoch": 0.02625246116823452,
      "grad_norm": 1.5452778456677885,
      "learning_rate": 1.2790697674418605e-06,
      "loss": 0.5782,
      "num_tokens": 5918842.0,
      "step": 45
    },
    {
      "epoch": 0.02916940129803836,
      "grad_norm": 1.3636179516378841,
      "learning_rate": 1.424418604651163e-06,
      "loss": 0.5198,
      "num_tokens": 6581062.0,
      "step": 50
    },
    {
      "epoch": 0.0320863414278422,
      "grad_norm": 1.3117664668985787,
      "learning_rate": 1.5697674418604653e-06,
      "loss": 0.4967,
      "num_tokens": 7229502.0,
      "step": 55
    },
    {
      "epoch": 0.03500328155764603,
      "grad_norm": 1.120351932885373,
      "learning_rate": 1.7151162790697675e-06,
      "loss": 0.4632,
      "num_tokens": 7873010.0,
      "step": 60
    },
    {
      "epoch": 0.037920221687449865,
      "grad_norm": 1.1652086123427123,
      "learning_rate": 1.86046511627907e-06,
      "loss": 0.4468,
      "num_tokens": 8521924.0,
      "step": 65
    },
    {
      "epoch": 0.0408371618172537,
      "grad_norm": 1.1356964733122763,
      "learning_rate": 2.005813953488372e-06,
      "loss": 0.4216,
      "num_tokens": 9195311.0,
      "step": 70
    },
    {
      "epoch": 0.04375410194705753,
      "grad_norm": 1.125126715027727,
      "learning_rate": 2.1511627906976745e-06,
      "loss": 0.4149,
      "num_tokens": 9857902.0,
      "step": 75
    },
    {
      "epoch": 0.046671042076861374,
      "grad_norm": 1.3774618307546735,
      "learning_rate": 2.296511627906977e-06,
      "loss": 0.4073,
      "num_tokens": 10508649.0,
      "step": 80
    },
    {
      "epoch": 0.04958798220666521,
      "grad_norm": 0.9977532994086189,
      "learning_rate": 2.4418604651162793e-06,
      "loss": 0.4007,
      "num_tokens": 11165778.0,
      "step": 85
    },
    {
      "epoch": 0.05250492233646904,
      "grad_norm": 1.1279002386313963,
      "learning_rate": 2.5872093023255817e-06,
      "loss": 0.3834,
      "num_tokens": 11832663.0,
      "step": 90
    },
    {
      "epoch": 0.05542186246627288,
      "grad_norm": 1.1949547587228786,
      "learning_rate": 2.7325581395348837e-06,
      "loss": 0.378,
      "num_tokens": 12491366.0,
      "step": 95
    },
    {
      "epoch": 0.05833880259607672,
      "grad_norm": 1.2272744151775523,
      "learning_rate": 2.8779069767441865e-06,
      "loss": 0.3711,
      "num_tokens": 13158513.0,
      "step": 100
    },
    {
      "epoch": 0.06125574272588055,
      "grad_norm": 1.1016643876078596,
      "learning_rate": 3.0232558139534885e-06,
      "loss": 0.3673,
      "num_tokens": 13815015.0,
      "step": 105
    },
    {
      "epoch": 0.0641726828556844,
      "grad_norm": 1.2643000713720414,
      "learning_rate": 3.168604651162791e-06,
      "loss": 0.3597,
      "num_tokens": 14469541.0,
      "step": 110
    },
    {
      "epoch": 0.06708962298548822,
      "grad_norm": 1.201838639094829,
      "learning_rate": 3.313953488372093e-06,
      "loss": 0.3555,
      "num_tokens": 15118625.0,
      "step": 115
    },
    {
      "epoch": 0.07000656311529206,
      "grad_norm": 1.109067555947549,
      "learning_rate": 3.4593023255813957e-06,
      "loss": 0.3533,
      "num_tokens": 15773909.0,
      "step": 120
    },
    {
      "epoch": 0.07292350324509589,
      "grad_norm": 1.1048072692029691,
      "learning_rate": 3.6046511627906977e-06,
      "loss": 0.347,
      "num_tokens": 16431558.0,
      "step": 125
    },
    {
      "epoch": 0.07584044337489973,
      "grad_norm": 1.1222392707741446,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.3561,
      "num_tokens": 17071991.0,
      "step": 130
    },
    {
      "epoch": 0.07875738350470357,
      "grad_norm": 1.268152454159826,
      "learning_rate": 3.8953488372093025e-06,
      "loss": 0.3534,
      "num_tokens": 17712413.0,
      "step": 135
    },
    {
      "epoch": 0.0816743236345074,
      "grad_norm": 1.2649313269721758,
      "learning_rate": 4.040697674418605e-06,
      "loss": 0.3299,
      "num_tokens": 18365792.0,
      "step": 140
    },
    {
      "epoch": 0.08459126376431124,
      "grad_norm": 1.1434149458336165,
      "learning_rate": 4.186046511627907e-06,
      "loss": 0.3306,
      "num_tokens": 19044956.0,
      "step": 145
    },
    {
      "epoch": 0.08750820389411507,
      "grad_norm": 1.2086728419334098,
      "learning_rate": 4.331395348837209e-06,
      "loss": 0.3374,
      "num_tokens": 19695160.0,
      "step": 150
    },
    {
      "epoch": 0.09042514402391891,
      "grad_norm": 1.1141028558068582,
      "learning_rate": 4.476744186046512e-06,
      "loss": 0.3329,
      "num_tokens": 20354718.0,
      "step": 155
    },
    {
      "epoch": 0.09334208415372275,
      "grad_norm": 1.1566468839887507,
      "learning_rate": 4.622093023255814e-06,
      "loss": 0.3341,
      "num_tokens": 21006198.0,
      "step": 160
    },
    {
      "epoch": 0.09625902428352658,
      "grad_norm": 1.1614415172217762,
      "learning_rate": 4.767441860465117e-06,
      "loss": 0.3443,
      "num_tokens": 21634895.0,
      "step": 165
    },
    {
      "epoch": 0.09917596441333042,
      "grad_norm": 1.2888710984530731,
      "learning_rate": 4.912790697674419e-06,
      "loss": 0.3332,
      "num_tokens": 22279296.0,
      "step": 170
    },
    {
      "epoch": 0.10209290454313426,
      "grad_norm": 1.066687159230309,
      "learning_rate": 4.999981321460686e-06,
      "loss": 0.33,
      "num_tokens": 22932800.0,
      "step": 175
    },
    {
      "epoch": 0.10500984467293809,
      "grad_norm": 1.0587240072052146,
      "learning_rate": 4.999771191454943e-06,
      "loss": 0.3227,
      "num_tokens": 23596098.0,
      "step": 180
    },
    {
      "epoch": 0.10792678480274193,
      "grad_norm": 1.0753866637705207,
      "learning_rate": 4.999327605146962e-06,
      "loss": 0.3265,
      "num_tokens": 24240753.0,
      "step": 185
    },
    {
      "epoch": 0.11084372493254575,
      "grad_norm": 1.1696062309729527,
      "learning_rate": 4.998650608567207e-06,
      "loss": 0.3209,
      "num_tokens": 24903847.0,
      "step": 190
    },
    {
      "epoch": 0.1137606650623496,
      "grad_norm": 1.0355502586963417,
      "learning_rate": 4.997740271966877e-06,
      "loss": 0.3146,
      "num_tokens": 25564496.0,
      "step": 195
    },
    {
      "epoch": 0.11667760519215344,
      "grad_norm": 1.119036751899036,
      "learning_rate": 4.996596689810608e-06,
      "loss": 0.315,
      "num_tokens": 26231532.0,
      "step": 200
    },
    {
      "epoch": 0.11959454532195726,
      "grad_norm": 1.088954101903663,
      "learning_rate": 4.995219980766672e-06,
      "loss": 0.3178,
      "num_tokens": 26876933.0,
      "step": 205
    },
    {
      "epoch": 0.1225114854517611,
      "grad_norm": 1.1468096627392024,
      "learning_rate": 4.993610287694663e-06,
      "loss": 0.3251,
      "num_tokens": 27510135.0,
      "step": 210
    },
    {
      "epoch": 0.12542842558156495,
      "grad_norm": 0.9548342307118838,
      "learning_rate": 4.991767777630679e-06,
      "loss": 0.3145,
      "num_tokens": 28154194.0,
      "step": 215
    },
    {
      "epoch": 0.1283453657113688,
      "grad_norm": 1.0591304201257135,
      "learning_rate": 4.989692641769976e-06,
      "loss": 0.3229,
      "num_tokens": 28801919.0,
      "step": 220
    },
    {
      "epoch": 0.1312623058411726,
      "grad_norm": 1.054145800912703,
      "learning_rate": 4.987385095447141e-06,
      "loss": 0.3208,
      "num_tokens": 29456315.0,
      "step": 225
    },
    {
      "epoch": 0.13417924597097644,
      "grad_norm": 1.1458373353014295,
      "learning_rate": 4.9848453781137355e-06,
      "loss": 0.3136,
      "num_tokens": 30111377.0,
      "step": 230
    },
    {
      "epoch": 0.13709618610078028,
      "grad_norm": 1.1484854864144058,
      "learning_rate": 4.982073753313459e-06,
      "loss": 0.3106,
      "num_tokens": 30769448.0,
      "step": 235
    },
    {
      "epoch": 0.14001312623058412,
      "grad_norm": 1.1136794674885304,
      "learning_rate": 4.979070508654793e-06,
      "loss": 0.3123,
      "num_tokens": 31423870.0,
      "step": 240
    },
    {
      "epoch": 0.14293006636038796,
      "grad_norm": 1.1270064595880545,
      "learning_rate": 4.975835955781159e-06,
      "loss": 0.3093,
      "num_tokens": 32085017.0,
      "step": 245
    },
    {
      "epoch": 0.14584700649019178,
      "grad_norm": 0.9197262026545928,
      "learning_rate": 4.97237043033858e-06,
      "loss": 0.3099,
      "num_tokens": 32748599.0,
      "step": 250
    },
    {
      "epoch": 0.14876394661999562,
      "grad_norm": 1.0031484672449607,
      "learning_rate": 4.968674291940851e-06,
      "loss": 0.3113,
      "num_tokens": 33396816.0,
      "step": 255
    },
    {
      "epoch": 0.15168088674979946,
      "grad_norm": 1.0106860095524073,
      "learning_rate": 4.964747924132222e-06,
      "loss": 0.3019,
      "num_tokens": 34048082.0,
      "step": 260
    },
    {
      "epoch": 0.1545978268796033,
      "grad_norm": 1.1390370333863529,
      "learning_rate": 4.9605917343475955e-06,
      "loss": 0.3094,
      "num_tokens": 34695245.0,
      "step": 265
    },
    {
      "epoch": 0.15751476700940714,
      "grad_norm": 0.9304690639529852,
      "learning_rate": 4.9562061538702525e-06,
      "loss": 0.3022,
      "num_tokens": 35357239.0,
      "step": 270
    },
    {
      "epoch": 0.16043170713921096,
      "grad_norm": 1.0043265999798545,
      "learning_rate": 4.9515916377870946e-06,
      "loss": 0.3089,
      "num_tokens": 36009775.0,
      "step": 275
    },
    {
      "epoch": 0.1633486472690148,
      "grad_norm": 1.2003647631325731,
      "learning_rate": 4.946748664941422e-06,
      "loss": 0.3011,
      "num_tokens": 36668698.0,
      "step": 280
    },
    {
      "epoch": 0.16626558739881864,
      "grad_norm": 0.8918137550325422,
      "learning_rate": 4.941677737883243e-06,
      "loss": 0.2965,
      "num_tokens": 37330940.0,
      "step": 285
    },
    {
      "epoch": 0.16918252752862248,
      "grad_norm": 0.9369985322231161,
      "learning_rate": 4.936379382817128e-06,
      "loss": 0.3018,
      "num_tokens": 37988190.0,
      "step": 290
    },
    {
      "epoch": 0.17209946765842632,
      "grad_norm": 0.9779525599227024,
      "learning_rate": 4.930854149547601e-06,
      "loss": 0.2918,
      "num_tokens": 38661283.0,
      "step": 295
    },
    {
      "epoch": 0.17501640778823013,
      "grad_norm": 1.1129159421490522,
      "learning_rate": 4.925102611422091e-06,
      "loss": 0.2927,
      "num_tokens": 39320436.0,
      "step": 300
    },
    {
      "epoch": 0.17793334791803397,
      "grad_norm": 0.9718156616589078,
      "learning_rate": 4.919125365271435e-06,
      "loss": 0.2965,
      "num_tokens": 39974872.0,
      "step": 305
    },
    {
      "epoch": 0.18085028804783781,
      "grad_norm": 0.997867663517212,
      "learning_rate": 4.912923031347944e-06,
      "loss": 0.2887,
      "num_tokens": 40624237.0,
      "step": 310
    },
    {
      "epoch": 0.18376722817764166,
      "grad_norm": 1.0604840903516273,
      "learning_rate": 4.9064962532610465e-06,
      "loss": 0.3093,
      "num_tokens": 41272880.0,
      "step": 315
    },
    {
      "epoch": 0.1866841683074455,
      "grad_norm": 0.9752847517901945,
      "learning_rate": 4.899845697910489e-06,
      "loss": 0.3023,
      "num_tokens": 41916518.0,
      "step": 320
    },
    {
      "epoch": 0.18960110843724934,
      "grad_norm": 0.9361317315869321,
      "learning_rate": 4.89297205541715e-06,
      "loss": 0.2906,
      "num_tokens": 42570198.0,
      "step": 325
    },
    {
      "epoch": 0.19251804856705315,
      "grad_norm": 0.924312875998321,
      "learning_rate": 4.885876039051408e-06,
      "loss": 0.2973,
      "num_tokens": 43235563.0,
      "step": 330
    },
    {
      "epoch": 0.195434988696857,
      "grad_norm": 0.9664895892779287,
      "learning_rate": 4.878558385159143e-06,
      "loss": 0.2932,
      "num_tokens": 43881944.0,
      "step": 335
    },
    {
      "epoch": 0.19835192882666083,
      "grad_norm": 1.1025545029030783,
      "learning_rate": 4.871019853085316e-06,
      "loss": 0.2969,
      "num_tokens": 44536573.0,
      "step": 340
    },
    {
      "epoch": 0.20126886895646467,
      "grad_norm": 1.1546365826334446,
      "learning_rate": 4.863261225095175e-06,
      "loss": 0.2975,
      "num_tokens": 45185298.0,
      "step": 345
    },
    {
      "epoch": 0.20418580908626852,
      "grad_norm": 0.8539653068120385,
      "learning_rate": 4.8552833062930845e-06,
      "loss": 0.2941,
      "num_tokens": 45833516.0,
      "step": 350
    },
    {
      "epoch": 0.20710274921607233,
      "grad_norm": 0.8887354370276694,
      "learning_rate": 4.847086924538975e-06,
      "loss": 0.2911,
      "num_tokens": 46492785.0,
      "step": 355
    },
    {
      "epoch": 0.21001968934587617,
      "grad_norm": 0.8965576316891322,
      "learning_rate": 4.838672930362438e-06,
      "loss": 0.2845,
      "num_tokens": 47147972.0,
      "step": 360
    },
    {
      "epoch": 0.21293662947568,
      "grad_norm": 1.0547471171000238,
      "learning_rate": 4.830042196874472e-06,
      "loss": 0.2889,
      "num_tokens": 47792460.0,
      "step": 365
    },
    {
      "epoch": 0.21585356960548385,
      "grad_norm": 0.9029069684538267,
      "learning_rate": 4.821195619676876e-06,
      "loss": 0.2935,
      "num_tokens": 48449608.0,
      "step": 370
    },
    {
      "epoch": 0.2187705097352877,
      "grad_norm": 1.1056029287041187,
      "learning_rate": 4.812134116769312e-06,
      "loss": 0.2927,
      "num_tokens": 49105080.0,
      "step": 375
    },
    {
      "epoch": 0.2216874498650915,
      "grad_norm": 0.8797309000795296,
      "learning_rate": 4.802858628454056e-06,
      "loss": 0.2895,
      "num_tokens": 49754281.0,
      "step": 380
    },
    {
      "epoch": 0.22460438999489535,
      "grad_norm": 1.044569804031736,
      "learning_rate": 4.793370117238408e-06,
      "loss": 0.2897,
      "num_tokens": 50392756.0,
      "step": 385
    },
    {
      "epoch": 0.2275213301246992,
      "grad_norm": 1.1130572813337458,
      "learning_rate": 4.783669567734826e-06,
      "loss": 0.2899,
      "num_tokens": 51048274.0,
      "step": 390
    },
    {
      "epoch": 0.23043827025450303,
      "grad_norm": 0.960716559832758,
      "learning_rate": 4.773757986558753e-06,
      "loss": 0.2917,
      "num_tokens": 51699260.0,
      "step": 395
    },
    {
      "epoch": 0.23335521038430687,
      "grad_norm": 0.9245960376672167,
      "learning_rate": 4.763636402224151e-06,
      "loss": 0.2897,
      "num_tokens": 52348343.0,
      "step": 400
    },
    {
      "epoch": 0.23627215051411068,
      "grad_norm": 0.8890810056058448,
      "learning_rate": 4.753305865036787e-06,
      "loss": 0.2885,
      "num_tokens": 53001056.0,
      "step": 405
    },
    {
      "epoch": 0.23918909064391453,
      "grad_norm": 0.8579871280345769,
      "learning_rate": 4.742767446985234e-06,
      "loss": 0.2861,
      "num_tokens": 53662504.0,
      "step": 410
    },
    {
      "epoch": 0.24210603077371837,
      "grad_norm": 0.8821250216443968,
      "learning_rate": 4.732022241629637e-06,
      "loss": 0.287,
      "num_tokens": 54318994.0,
      "step": 415
    },
    {
      "epoch": 0.2450229709035222,
      "grad_norm": 0.7974431662516955,
      "learning_rate": 4.721071363988235e-06,
      "loss": 0.2905,
      "num_tokens": 54969623.0,
      "step": 420
    },
    {
      "epoch": 0.24793991103332605,
      "grad_norm": 0.8353737566224922,
      "learning_rate": 4.709915950421653e-06,
      "loss": 0.2878,
      "num_tokens": 55620185.0,
      "step": 425
    },
    {
      "epoch": 0.2508568511631299,
      "grad_norm": 0.9111882828366351,
      "learning_rate": 4.698557158514988e-06,
      "loss": 0.2828,
      "num_tokens": 56274073.0,
      "step": 430
    },
    {
      "epoch": 0.2537737912929337,
      "grad_norm": 0.919475515341444,
      "learning_rate": 4.686996166957683e-06,
      "loss": 0.2822,
      "num_tokens": 56944526.0,
      "step": 435
    },
    {
      "epoch": 0.2566907314227376,
      "grad_norm": 0.8931932491434317,
      "learning_rate": 4.675234175421222e-06,
      "loss": 0.2892,
      "num_tokens": 57609076.0,
      "step": 440
    },
    {
      "epoch": 0.2596076715525414,
      "grad_norm": 0.9216610473053193,
      "learning_rate": 4.663272404434635e-06,
      "loss": 0.2774,
      "num_tokens": 58264579.0,
      "step": 445
    },
    {
      "epoch": 0.2625246116823452,
      "grad_norm": 0.9202742139945425,
      "learning_rate": 4.6511120952578484e-06,
      "loss": 0.2863,
      "num_tokens": 58902959.0,
      "step": 450
    },
    {
      "epoch": 0.26544155181214907,
      "grad_norm": 0.8943558537794027,
      "learning_rate": 4.638754509752878e-06,
      "loss": 0.2796,
      "num_tokens": 59564439.0,
      "step": 455
    },
    {
      "epoch": 0.2683584919419529,
      "grad_norm": 0.863810937001429,
      "learning_rate": 4.6262009302528915e-06,
      "loss": 0.2843,
      "num_tokens": 60214896.0,
      "step": 460
    },
    {
      "epoch": 0.27127543207175675,
      "grad_norm": 0.9983134689445968,
      "learning_rate": 4.61345265942914e-06,
      "loss": 0.2866,
      "num_tokens": 60876642.0,
      "step": 465
    },
    {
      "epoch": 0.27419237220156056,
      "grad_norm": 0.8983030440670102,
      "learning_rate": 4.600511020155778e-06,
      "loss": 0.2848,
      "num_tokens": 61518584.0,
      "step": 470
    },
    {
      "epoch": 0.2771093123313644,
      "grad_norm": 0.8031669478146738,
      "learning_rate": 4.587377355372596e-06,
      "loss": 0.2757,
      "num_tokens": 62183326.0,
      "step": 475
    },
    {
      "epoch": 0.28002625246116825,
      "grad_norm": 0.9265028610272964,
      "learning_rate": 4.57405302794566e-06,
      "loss": 0.2841,
      "num_tokens": 62825255.0,
      "step": 480
    },
    {
      "epoch": 0.28294319259097206,
      "grad_norm": 0.7881945775359938,
      "learning_rate": 4.5605394205258915e-06,
      "loss": 0.2784,
      "num_tokens": 63488470.0,
      "step": 485
    },
    {
      "epoch": 0.28586013272077593,
      "grad_norm": 0.890455779305603,
      "learning_rate": 4.546837935405591e-06,
      "loss": 0.2825,
      "num_tokens": 64144973.0,
      "step": 490
    },
    {
      "epoch": 0.28877707285057974,
      "grad_norm": 0.8245358042376737,
      "learning_rate": 4.532949994372922e-06,
      "loss": 0.2735,
      "num_tokens": 64800071.0,
      "step": 495
    },
    {
      "epoch": 0.29169401298038355,
      "grad_norm": 0.8022563146785544,
      "learning_rate": 4.518877038564374e-06,
      "loss": 0.2805,
      "step": 500
    },
    {
      "epoch": 0.29169401298038355,
      "eval_loss": 0.2839949429035187,
      "eval_num_tokens": 65444398.0,
      "eval_runtime": 611.5469,
      "eval_samples_per_second": 9.442,
      "eval_steps_per_second": 1.181,
      "step": 500
    },
    {
      "epoch": 0.2946109531101874,
      "grad_norm": 0.8282011678135937,
      "learning_rate": 4.50462052831522e-06,
      "loss": 0.2742,
      "num_tokens": 66102293.0,
      "step": 505
    },
    {
      "epoch": 0.29752789323999124,
      "grad_norm": 0.8459868862384502,
      "learning_rate": 4.4901819430079766e-06,
      "loss": 0.2797,
      "num_tokens": 66750025.0,
      "step": 510
    },
    {
      "epoch": 0.3004448333697951,
      "grad_norm": 1.0044989964568452,
      "learning_rate": 4.4755627809188885e-06,
      "loss": 0.2808,
      "num_tokens": 67398138.0,
      "step": 515
    },
    {
      "epoch": 0.3033617734995989,
      "grad_norm": 0.8645633533810008,
      "learning_rate": 4.460764559062458e-06,
      "loss": 0.2743,
      "num_tokens": 68064298.0,
      "step": 520
    },
    {
      "epoch": 0.30627871362940273,
      "grad_norm": 0.7902367580597776,
      "learning_rate": 4.445788813034024e-06,
      "loss": 0.2745,
      "num_tokens": 68730271.0,
      "step": 525
    },
    {
      "epoch": 0.3091956537592066,
      "grad_norm": 0.8074282979202131,
      "learning_rate": 4.430637096850415e-06,
      "loss": 0.278,
      "num_tokens": 69373992.0,
      "step": 530
    },
    {
      "epoch": 0.3121125938890104,
      "grad_norm": 0.7997689062881244,
      "learning_rate": 4.4153109827886894e-06,
      "loss": 0.2773,
      "num_tokens": 70013275.0,
      "step": 535
    },
    {
      "epoch": 0.3150295340188143,
      "grad_norm": 0.7763977877420887,
      "learning_rate": 4.399812061222983e-06,
      "loss": 0.2792,
      "num_tokens": 70667022.0,
      "step": 540
    },
    {
      "epoch": 0.3179464741486181,
      "grad_norm": 0.8027330754403003,
      "learning_rate": 4.384141940459482e-06,
      "loss": 0.2692,
      "num_tokens": 71329905.0,
      "step": 545
    },
    {
      "epoch": 0.3208634142784219,
      "grad_norm": 0.842566387021002,
      "learning_rate": 4.36830224656952e-06,
      "loss": 0.2777,
      "num_tokens": 71981698.0,
      "step": 550
    },
    {
      "epoch": 0.3237803544082258,
      "grad_norm": 0.760795844133808,
      "learning_rate": 4.352294623220853e-06,
      "loss": 0.2719,
      "num_tokens": 72641200.0,
      "step": 555
    },
    {
      "epoch": 0.3266972945380296,
      "grad_norm": 0.767412158526903,
      "learning_rate": 4.336120731507095e-06,
      "loss": 0.2786,
      "num_tokens": 73289056.0,
      "step": 560
    },
    {
      "epoch": 0.32961423466783346,
      "grad_norm": 0.839424142267615,
      "learning_rate": 4.319782249775343e-06,
      "loss": 0.2774,
      "num_tokens": 73940084.0,
      "step": 565
    },
    {
      "epoch": 0.3325311747976373,
      "grad_norm": 0.7953209168681115,
      "learning_rate": 4.303280873452022e-06,
      "loss": 0.2761,
      "num_tokens": 74602418.0,
      "step": 570
    },
    {
      "epoch": 0.3354481149274411,
      "grad_norm": 0.9703712347237416,
      "learning_rate": 4.286618314866953e-06,
      "loss": 0.2739,
      "num_tokens": 75260268.0,
      "step": 575
    },
    {
      "epoch": 0.33836505505724496,
      "grad_norm": 0.785005018552605,
      "learning_rate": 4.269796303075664e-06,
      "loss": 0.2719,
      "num_tokens": 75911746.0,
      "step": 580
    },
    {
      "epoch": 0.34128199518704877,
      "grad_norm": 0.9895796846796822,
      "learning_rate": 4.252816583679963e-06,
      "loss": 0.2703,
      "num_tokens": 76570208.0,
      "step": 585
    },
    {
      "epoch": 0.34419893531685264,
      "grad_norm": 0.8038675047644025,
      "learning_rate": 4.235680918646814e-06,
      "loss": 0.2733,
      "num_tokens": 77231968.0,
      "step": 590
    },
    {
      "epoch": 0.34711587544665645,
      "grad_norm": 0.8232561538467604,
      "learning_rate": 4.21839108612548e-06,
      "loss": 0.2753,
      "num_tokens": 77888752.0,
      "step": 595
    },
    {
      "epoch": 0.35003281557646027,
      "grad_norm": 0.8114352965857334,
      "learning_rate": 4.200948880263021e-06,
      "loss": 0.2764,
      "num_tokens": 78549259.0,
      "step": 600
    },
    {
      "epoch": 0.35294975570626413,
      "grad_norm": 0.918026001844272,
      "learning_rate": 4.1833561110181114e-06,
      "loss": 0.2696,
      "num_tokens": 79207039.0,
      "step": 605
    },
    {
      "epoch": 0.35586669583606795,
      "grad_norm": 0.8799204301828087,
      "learning_rate": 4.165614603973225e-06,
      "loss": 0.2663,
      "num_tokens": 79861860.0,
      "step": 610
    },
    {
      "epoch": 0.3587836359658718,
      "grad_norm": 0.87043880731276,
      "learning_rate": 4.147726200145192e-06,
      "loss": 0.2755,
      "num_tokens": 80518900.0,
      "step": 615
    },
    {
      "epoch": 0.36170057609567563,
      "grad_norm": 0.9211963447162147,
      "learning_rate": 4.1296927557941625e-06,
      "loss": 0.2767,
      "num_tokens": 81161582.0,
      "step": 620
    },
    {
      "epoch": 0.3646175162254795,
      "grad_norm": 0.8299533305119133,
      "learning_rate": 4.111516142230986e-06,
      "loss": 0.2696,
      "num_tokens": 81813290.0,
      "step": 625
    },
    {
      "epoch": 0.3675344563552833,
      "grad_norm": 0.8309388002459559,
      "learning_rate": 4.093198245623022e-06,
      "loss": 0.2718,
      "num_tokens": 82481134.0,
      "step": 630
    },
    {
      "epoch": 0.3704513964850871,
      "grad_norm": 0.7974940185655467,
      "learning_rate": 4.074740966798417e-06,
      "loss": 0.275,
      "num_tokens": 83121589.0,
      "step": 635
    },
    {
      "epoch": 0.373368336614891,
      "grad_norm": 0.7877568112781695,
      "learning_rate": 4.056146221048861e-06,
      "loss": 0.2689,
      "num_tokens": 83783858.0,
      "step": 640
    },
    {
      "epoch": 0.3762852767446948,
      "grad_norm": 0.7457416133818777,
      "learning_rate": 4.037415937930834e-06,
      "loss": 0.2698,
      "num_tokens": 84439289.0,
      "step": 645
    },
    {
      "epoch": 0.3792022168744987,
      "grad_norm": 0.804387030534503,
      "learning_rate": 4.018552061065385e-06,
      "loss": 0.2656,
      "num_tokens": 85096010.0,
      "step": 650
    },
    {
      "epoch": 0.3821191570043025,
      "grad_norm": 0.7866414763779639,
      "learning_rate": 3.999556547936433e-06,
      "loss": 0.2679,
      "num_tokens": 85763093.0,
      "step": 655
    },
    {
      "epoch": 0.3850360971341063,
      "grad_norm": 0.7975246358334895,
      "learning_rate": 3.980431369687657e-06,
      "loss": 0.2685,
      "num_tokens": 86414815.0,
      "step": 660
    },
    {
      "epoch": 0.38795303726391017,
      "grad_norm": 0.7541579622412656,
      "learning_rate": 3.961178510917938e-06,
      "loss": 0.2664,
      "num_tokens": 87072103.0,
      "step": 665
    },
    {
      "epoch": 0.390869977393714,
      "grad_norm": 0.8462615544123323,
      "learning_rate": 3.941799969475426e-06,
      "loss": 0.2697,
      "num_tokens": 87724238.0,
      "step": 670
    },
    {
      "epoch": 0.39378691752351785,
      "grad_norm": 0.719877168553545,
      "learning_rate": 3.922297756250231e-06,
      "loss": 0.2682,
      "num_tokens": 88385611.0,
      "step": 675
    },
    {
      "epoch": 0.39670385765332167,
      "grad_norm": 0.8061212707580667,
      "learning_rate": 3.902673894965739e-06,
      "loss": 0.2652,
      "num_tokens": 89047105.0,
      "step": 680
    },
    {
      "epoch": 0.3996207977831255,
      "grad_norm": 0.8326055553096022,
      "learning_rate": 3.88293042196863e-06,
      "loss": 0.2651,
      "num_tokens": 89703947.0,
      "step": 685
    },
    {
      "epoch": 0.40253773791292935,
      "grad_norm": 0.8409291362121083,
      "learning_rate": 3.863069386017559e-06,
      "loss": 0.2676,
      "num_tokens": 90375478.0,
      "step": 690
    },
    {
      "epoch": 0.40545467804273316,
      "grad_norm": 1.1749044956975412,
      "learning_rate": 3.8430928480705595e-06,
      "loss": 0.268,
      "num_tokens": 91041509.0,
      "step": 695
    },
    {
      "epoch": 0.40837161817253703,
      "grad_norm": 0.9615687348415685,
      "learning_rate": 3.823002881071182e-06,
      "loss": 0.2713,
      "num_tokens": 91687478.0,
      "step": 700
    },
    {
      "epoch": 0.41128855830234085,
      "grad_norm": 0.8598987130093434,
      "learning_rate": 3.802801569733385e-06,
      "loss": 0.2633,
      "num_tokens": 92350865.0,
      "step": 705
    },
    {
      "epoch": 0.41420549843214466,
      "grad_norm": 0.9022414410706127,
      "learning_rate": 3.7824910103252094e-06,
      "loss": 0.2705,
      "num_tokens": 93001864.0,
      "step": 710
    },
    {
      "epoch": 0.4171224385619485,
      "grad_norm": 0.8563602736920568,
      "learning_rate": 3.7620733104512457e-06,
      "loss": 0.2686,
      "num_tokens": 93646547.0,
      "step": 715
    },
    {
      "epoch": 0.42003937869175234,
      "grad_norm": 0.8403741919990089,
      "learning_rate": 3.741550588833938e-06,
      "loss": 0.261,
      "num_tokens": 94309272.0,
      "step": 720
    },
    {
      "epoch": 0.4229563188215562,
      "grad_norm": 0.8382205734712479,
      "learning_rate": 3.7209249750937194e-06,
      "loss": 0.2583,
      "num_tokens": 94974415.0,
      "step": 725
    },
    {
      "epoch": 0.42587325895136,
      "grad_norm": 0.7670934985699519,
      "learning_rate": 3.700198609528027e-06,
      "loss": 0.2621,
      "num_tokens": 95637473.0,
      "step": 730
    },
    {
      "epoch": 0.42879019908116384,
      "grad_norm": 0.889958478839124,
      "learning_rate": 3.679373642889205e-06,
      "loss": 0.263,
      "num_tokens": 96285419.0,
      "step": 735
    },
    {
      "epoch": 0.4317071392109677,
      "grad_norm": 0.8154007725747381,
      "learning_rate": 3.6584522361613227e-06,
      "loss": 0.2657,
      "num_tokens": 96926138.0,
      "step": 740
    },
    {
      "epoch": 0.4346240793407715,
      "grad_norm": 0.8307773360486326,
      "learning_rate": 3.6374365603359347e-06,
      "loss": 0.2662,
      "num_tokens": 97588450.0,
      "step": 745
    },
    {
      "epoch": 0.4375410194705754,
      "grad_norm": 0.8078529106621964,
      "learning_rate": 3.616328796186794e-06,
      "loss": 0.2681,
      "num_tokens": 98242023.0,
      "step": 750
    },
    {
      "epoch": 0.4404579596003792,
      "grad_norm": 0.9983558574344977,
      "learning_rate": 3.5951311340435597e-06,
      "loss": 0.2637,
      "num_tokens": 98902913.0,
      "step": 755
    },
    {
      "epoch": 0.443374899730183,
      "grad_norm": 0.9165975707045902,
      "learning_rate": 3.573845773564506e-06,
      "loss": 0.2727,
      "num_tokens": 99533885.0,
      "step": 760
    },
    {
      "epoch": 0.4462918398599869,
      "grad_norm": 0.7822942999479296,
      "learning_rate": 3.5524749235082728e-06,
      "loss": 0.2712,
      "num_tokens": 100188467.0,
      "step": 765
    },
    {
      "epoch": 0.4492087799897907,
      "grad_norm": 0.7478178653016506,
      "learning_rate": 3.5310208015046547e-06,
      "loss": 0.2661,
      "num_tokens": 100842626.0,
      "step": 770
    },
    {
      "epoch": 0.45212572011959457,
      "grad_norm": 0.8348739730561777,
      "learning_rate": 3.5094856338244882e-06,
      "loss": 0.2605,
      "num_tokens": 101509817.0,
      "step": 775
    },
    {
      "epoch": 0.4550426602493984,
      "grad_norm": 0.8650672549166954,
      "learning_rate": 3.4878716551486296e-06,
      "loss": 0.2626,
      "num_tokens": 102161268.0,
      "step": 780
    },
    {
      "epoch": 0.4579596003792022,
      "grad_norm": 0.8437538579535597,
      "learning_rate": 3.466181108336068e-06,
      "loss": 0.2631,
      "num_tokens": 102815935.0,
      "step": 785
    },
    {
      "epoch": 0.46087654050900606,
      "grad_norm": 0.8055131749035074,
      "learning_rate": 3.444416244191184e-06,
      "loss": 0.2548,
      "num_tokens": 103480501.0,
      "step": 790
    },
    {
      "epoch": 0.4637934806388099,
      "grad_norm": 0.7184736639862396,
      "learning_rate": 3.422579321230185e-06,
      "loss": 0.2615,
      "num_tokens": 104127953.0,
      "step": 795
    },
    {
      "epoch": 0.46671042076861374,
      "grad_norm": 0.7769160728536546,
      "learning_rate": 3.400672605446746e-06,
      "loss": 0.2649,
      "num_tokens": 104784029.0,
      "step": 800
    },
    {
      "epoch": 0.46962736089841756,
      "grad_norm": 0.7956791364511919,
      "learning_rate": 3.378698370076865e-06,
      "loss": 0.2585,
      "num_tokens": 105448623.0,
      "step": 805
    },
    {
      "epoch": 0.47254430102822137,
      "grad_norm": 0.7971292848423556,
      "learning_rate": 3.356658895362974e-06,
      "loss": 0.258,
      "num_tokens": 106102919.0,
      "step": 810
    },
    {
      "epoch": 0.47546124115802524,
      "grad_norm": 0.8024065230344523,
      "learning_rate": 3.334556468317322e-06,
      "loss": 0.2639,
      "num_tokens": 106766059.0,
      "step": 815
    },
    {
      "epoch": 0.47837818128782905,
      "grad_norm": 0.8627952778096136,
      "learning_rate": 3.3123933824846517e-06,
      "loss": 0.2637,
      "num_tokens": 107426299.0,
      "step": 820
    },
    {
      "epoch": 0.4812951214176329,
      "grad_norm": 0.8393622591727319,
      "learning_rate": 3.2901719377042003e-06,
      "loss": 0.2629,
      "num_tokens": 108065870.0,
      "step": 825
    },
    {
      "epoch": 0.48421206154743673,
      "grad_norm": 0.7495915565490366,
      "learning_rate": 3.2678944398710535e-06,
      "loss": 0.2624,
      "num_tokens": 108729197.0,
      "step": 830
    },
    {
      "epoch": 0.48712900167724055,
      "grad_norm": 0.7951127772717166,
      "learning_rate": 3.2455632006968564e-06,
      "loss": 0.2644,
      "num_tokens": 109377885.0,
      "step": 835
    },
    {
      "epoch": 0.4900459418070444,
      "grad_norm": 0.8730539703385664,
      "learning_rate": 3.223180537469938e-06,
      "loss": 0.2631,
      "num_tokens": 110046067.0,
      "step": 840
    },
    {
      "epoch": 0.49296288193684823,
      "grad_norm": 0.8086162516026355,
      "learning_rate": 3.200748772814844e-06,
      "loss": 0.2529,
      "num_tokens": 110715109.0,
      "step": 845
    },
    {
      "epoch": 0.4958798220666521,
      "grad_norm": 0.8485834526237742,
      "learning_rate": 3.1782702344513215e-06,
      "loss": 0.2607,
      "num_tokens": 111368579.0,
      "step": 850
    },
    {
      "epoch": 0.4987967621964559,
      "grad_norm": 0.79225012559078,
      "learning_rate": 3.1557472549527747e-06,
      "loss": 0.2618,
      "num_tokens": 112019671.0,
      "step": 855
    },
    {
      "epoch": 0.5017137023262598,
      "grad_norm": 0.8425810939639716,
      "learning_rate": 3.133182171504214e-06,
      "loss": 0.2627,
      "num_tokens": 112674101.0,
      "step": 860
    },
    {
      "epoch": 0.5046306424560636,
      "grad_norm": 0.744964617174656,
      "learning_rate": 3.110577325659734e-06,
      "loss": 0.2583,
      "num_tokens": 113327455.0,
      "step": 865
    },
    {
      "epoch": 0.5075475825858674,
      "grad_norm": 0.7709087869015558,
      "learning_rate": 3.0879350630995284e-06,
      "loss": 0.2631,
      "num_tokens": 113978391.0,
      "step": 870
    },
    {
      "epoch": 0.5104645227156712,
      "grad_norm": 0.8044798860684248,
      "learning_rate": 3.0652577333864812e-06,
      "loss": 0.2569,
      "num_tokens": 114620393.0,
      "step": 875
    },
    {
      "epoch": 0.5133814628454751,
      "grad_norm": 0.7842031411597096,
      "learning_rate": 3.0425476897223584e-06,
      "loss": 0.2611,
      "num_tokens": 115280804.0,
      "step": 880
    },
    {
      "epoch": 0.516298402975279,
      "grad_norm": 0.8035683788722667,
      "learning_rate": 3.019807288703615e-06,
      "loss": 0.2553,
      "num_tokens": 115935583.0,
      "step": 885
    },
    {
      "epoch": 0.5192153431050828,
      "grad_norm": 0.7544056798614338,
      "learning_rate": 2.99703889007686e-06,
      "loss": 0.2563,
      "num_tokens": 116605162.0,
      "step": 890
    },
    {
      "epoch": 0.5221322832348866,
      "grad_norm": 0.7470139344901265,
      "learning_rate": 2.9742448564939785e-06,
      "loss": 0.2566,
      "num_tokens": 117256010.0,
      "step": 895
    },
    {
      "epoch": 0.5250492233646904,
      "grad_norm": 0.7773494324524246,
      "learning_rate": 2.95142755326697e-06,
      "loss": 0.2557,
      "num_tokens": 117920457.0,
      "step": 900
    },
    {
      "epoch": 0.5279661634944943,
      "grad_norm": 0.908172722597612,
      "learning_rate": 2.9285893481224976e-06,
      "loss": 0.2548,
      "num_tokens": 118587118.0,
      "step": 905
    },
    {
      "epoch": 0.5308831036242981,
      "grad_norm": 0.7475823569135418,
      "learning_rate": 2.9057326109561955e-06,
      "loss": 0.2613,
      "num_tokens": 119238394.0,
      "step": 910
    },
    {
      "epoch": 0.533800043754102,
      "grad_norm": 0.7910527226931955,
      "learning_rate": 2.8828597135867446e-06,
      "loss": 0.2568,
      "num_tokens": 119902185.0,
      "step": 915
    },
    {
      "epoch": 0.5367169838839058,
      "grad_norm": 0.7067850895490586,
      "learning_rate": 2.859973029509753e-06,
      "loss": 0.2514,
      "num_tokens": 120578709.0,
      "step": 920
    },
    {
      "epoch": 0.5396339240137096,
      "grad_norm": 0.7318968008202511,
      "learning_rate": 2.83707493365146e-06,
      "loss": 0.2564,
      "num_tokens": 121237827.0,
      "step": 925
    },
    {
      "epoch": 0.5425508641435135,
      "grad_norm": 0.7705210064312026,
      "learning_rate": 2.8141678021222933e-06,
      "loss": 0.2525,
      "num_tokens": 121898853.0,
      "step": 930
    },
    {
      "epoch": 0.5454678042733173,
      "grad_norm": 0.8157907159758865,
      "learning_rate": 2.791254011970301e-06,
      "loss": 0.2558,
      "num_tokens": 122562375.0,
      "step": 935
    },
    {
      "epoch": 0.5483847444031211,
      "grad_norm": 0.8681587026037888,
      "learning_rate": 2.7683359409344905e-06,
      "loss": 0.2602,
      "num_tokens": 123211114.0,
      "step": 940
    },
    {
      "epoch": 0.5513016845329249,
      "grad_norm": 0.7556332005656767,
      "learning_rate": 2.745415967198093e-06,
      "loss": 0.254,
      "num_tokens": 123865098.0,
      "step": 945
    },
    {
      "epoch": 0.5542186246627288,
      "grad_norm": 0.7946261128648097,
      "learning_rate": 2.722496469141779e-06,
      "loss": 0.2586,
      "num_tokens": 124518258.0,
      "step": 950
    },
    {
      "epoch": 0.5571355647925327,
      "grad_norm": 0.7105424221615,
      "learning_rate": 2.699579825096857e-06,
      "loss": 0.2558,
      "num_tokens": 125177330.0,
      "step": 955
    },
    {
      "epoch": 0.5600525049223365,
      "grad_norm": 0.8047130073772089,
      "learning_rate": 2.6766684130984818e-06,
      "loss": 0.2518,
      "num_tokens": 125844180.0,
      "step": 960
    },
    {
      "epoch": 0.5629694450521403,
      "grad_norm": 0.9021937489211278,
      "learning_rate": 2.653764610638881e-06,
      "loss": 0.2622,
      "num_tokens": 126493064.0,
      "step": 965
    },
    {
      "epoch": 0.5658863851819441,
      "grad_norm": 0.8477417229161418,
      "learning_rate": 2.6308707944206487e-06,
      "loss": 0.256,
      "num_tokens": 127140885.0,
      "step": 970
    },
    {
      "epoch": 0.5688033253117479,
      "grad_norm": 0.707517618098058,
      "learning_rate": 2.607989340110121e-06,
      "loss": 0.247,
      "num_tokens": 127799249.0,
      "step": 975
    },
    {
      "epoch": 0.5717202654415519,
      "grad_norm": 0.7587943404137327,
      "learning_rate": 2.5851226220908504e-06,
      "loss": 0.2572,
      "num_tokens": 128456895.0,
      "step": 980
    },
    {
      "epoch": 0.5746372055713557,
      "grad_norm": 0.7584809790016415,
      "learning_rate": 2.562273013217218e-06,
      "loss": 0.2497,
      "num_tokens": 129122383.0,
      "step": 985
    },
    {
      "epoch": 0.5775541457011595,
      "grad_norm": 0.8214288920378848,
      "learning_rate": 2.539442884568211e-06,
      "loss": 0.2541,
      "num_tokens": 129779846.0,
      "step": 990
    },
    {
      "epoch": 0.5804710858309633,
      "grad_norm": 0.7290259201155611,
      "learning_rate": 2.5166346052013734e-06,
      "loss": 0.257,
      "num_tokens": 130425019.0,
      "step": 995
    },
    {
      "epoch": 0.5833880259607671,
      "grad_norm": 0.8673297271017076,
      "learning_rate": 2.4938505419069737e-06,
      "loss": 0.2524,
      "step": 1000
    },
    {
      "epoch": 0.5833880259607671,
      "eval_loss": 0.26100972294807434,
      "eval_num_tokens": 131094159.0,
      "eval_runtime": 607.9741,
      "eval_samples_per_second": 9.497,
      "eval_steps_per_second": 1.188,
      "step": 1000
    },
    {
      "epoch": 0.586304966090571,
      "grad_norm": 0.9024948847675129,
      "learning_rate": 2.4710930589624043e-06,
      "loss": 0.2543,
      "num_tokens": 131753695.0,
      "step": 1005
    },
    {
      "epoch": 0.5892219062203748,
      "grad_norm": 0.8681506257906868,
      "learning_rate": 2.4483645178868436e-06,
      "loss": 0.2554,
      "num_tokens": 132413825.0,
      "step": 1010
    },
    {
      "epoch": 0.5921388463501787,
      "grad_norm": 0.7962179648149258,
      "learning_rate": 2.425667277196205e-06,
      "loss": 0.2548,
      "num_tokens": 133068552.0,
      "step": 1015
    },
    {
      "epoch": 0.5950557864799825,
      "grad_norm": 0.804654580141575,
      "learning_rate": 2.4030036921583934e-06,
      "loss": 0.2564,
      "num_tokens": 133717600.0,
      "step": 1020
    },
    {
      "epoch": 0.5979727266097863,
      "grad_norm": 0.7346044185339496,
      "learning_rate": 2.380376114548905e-06,
      "loss": 0.2555,
      "num_tokens": 134372291.0,
      "step": 1025
    },
    {
      "epoch": 0.6008896667395902,
      "grad_norm": 0.8118188644094798,
      "learning_rate": 2.3577868924067838e-06,
      "loss": 0.2577,
      "num_tokens": 135017558.0,
      "step": 1030
    },
    {
      "epoch": 0.603806606869394,
      "grad_norm": 0.7221021652672162,
      "learning_rate": 2.3352383697909685e-06,
      "loss": 0.256,
      "num_tokens": 135666125.0,
      "step": 1035
    },
    {
      "epoch": 0.6067235469991978,
      "grad_norm": 0.7352033663094344,
      "learning_rate": 2.312732886537052e-06,
      "loss": 0.2575,
      "num_tokens": 136315008.0,
      "step": 1040
    },
    {
      "epoch": 0.6096404871290017,
      "grad_norm": 0.8607128532349083,
      "learning_rate": 2.29027277801448e-06,
      "loss": 0.2595,
      "num_tokens": 136959663.0,
      "step": 1045
    },
    {
      "epoch": 0.6125574272588055,
      "grad_norm": 0.7969656435537161,
      "learning_rate": 2.267860374884213e-06,
      "loss": 0.2541,
      "num_tokens": 137610280.0,
      "step": 1050
    },
    {
      "epoch": 0.6154743673886094,
      "grad_norm": 0.7672936399726409,
      "learning_rate": 2.245498002856874e-06,
      "loss": 0.2615,
      "num_tokens": 138253534.0,
      "step": 1055
    },
    {
      "epoch": 0.6183913075184132,
      "grad_norm": 0.8116002548995157,
      "learning_rate": 2.2231879824514114e-06,
      "loss": 0.2514,
      "num_tokens": 138918946.0,
      "step": 1060
    },
    {
      "epoch": 0.621308247648217,
      "grad_norm": 0.8343062523034186,
      "learning_rate": 2.2009326287543046e-06,
      "loss": 0.2549,
      "num_tokens": 139575919.0,
      "step": 1065
    },
    {
      "epoch": 0.6242251877780208,
      "grad_norm": 0.7030831591540176,
      "learning_rate": 2.1787342511793303e-06,
      "loss": 0.2543,
      "num_tokens": 140246394.0,
      "step": 1070
    },
    {
      "epoch": 0.6271421279078246,
      "grad_norm": 0.7676168932953417,
      "learning_rate": 2.156595153227911e-06,
      "loss": 0.2522,
      "num_tokens": 140908500.0,
      "step": 1075
    },
    {
      "epoch": 0.6300590680376286,
      "grad_norm": 0.7565551405042247,
      "learning_rate": 2.13451763225009e-06,
      "loss": 0.2538,
      "num_tokens": 141560588.0,
      "step": 1080
    },
    {
      "epoch": 0.6329760081674324,
      "grad_norm": 0.7523126517976036,
      "learning_rate": 2.1125039792061346e-06,
      "loss": 0.2507,
      "num_tokens": 142230913.0,
      "step": 1085
    },
    {
      "epoch": 0.6358929482972362,
      "grad_norm": 0.7547351807576873,
      "learning_rate": 2.0905564784288064e-06,
      "loss": 0.252,
      "num_tokens": 142880248.0,
      "step": 1090
    },
    {
      "epoch": 0.63880988842704,
      "grad_norm": 0.763981058542318,
      "learning_rate": 2.0686774073863183e-06,
      "loss": 0.2508,
      "num_tokens": 143538441.0,
      "step": 1095
    },
    {
      "epoch": 0.6417268285568438,
      "grad_norm": 0.7513777028630977,
      "learning_rate": 2.0468690364460032e-06,
      "loss": 0.2499,
      "num_tokens": 144195649.0,
      "step": 1100
    },
    {
      "epoch": 0.6446437686866477,
      "grad_norm": 0.6978388439562043,
      "learning_rate": 2.0251336286387246e-06,
      "loss": 0.2539,
      "num_tokens": 144840414.0,
      "step": 1105
    },
    {
      "epoch": 0.6475607088164516,
      "grad_norm": 0.7122399335583397,
      "learning_rate": 2.003473439424037e-06,
      "loss": 0.2516,
      "num_tokens": 145497670.0,
      "step": 1110
    },
    {
      "epoch": 0.6504776489462554,
      "grad_norm": 0.7662075027673452,
      "learning_rate": 1.9818907164561474e-06,
      "loss": 0.2508,
      "num_tokens": 146167309.0,
      "step": 1115
    },
    {
      "epoch": 0.6533945890760592,
      "grad_norm": 0.7079059380454418,
      "learning_rate": 1.960387699350673e-06,
      "loss": 0.2471,
      "num_tokens": 146836924.0,
      "step": 1120
    },
    {
      "epoch": 0.656311529205863,
      "grad_norm": 0.6995228988412967,
      "learning_rate": 1.9389666194522416e-06,
      "loss": 0.2477,
      "num_tokens": 147496049.0,
      "step": 1125
    },
    {
      "epoch": 0.6592284693356669,
      "grad_norm": 0.7018077763682937,
      "learning_rate": 1.9176296996029455e-06,
      "loss": 0.2505,
      "num_tokens": 148149073.0,
      "step": 1130
    },
    {
      "epoch": 0.6621454094654707,
      "grad_norm": 0.6955137369821246,
      "learning_rate": 1.8963791539116794e-06,
      "loss": 0.2589,
      "num_tokens": 148811679.0,
      "step": 1135
    },
    {
      "epoch": 0.6650623495952745,
      "grad_norm": 0.7175039810377445,
      "learning_rate": 1.8752171875243897e-06,
      "loss": 0.254,
      "num_tokens": 149453909.0,
      "step": 1140
    },
    {
      "epoch": 0.6679792897250784,
      "grad_norm": 0.7540266711347994,
      "learning_rate": 1.8541459963952401e-06,
      "loss": 0.2534,
      "num_tokens": 150113931.0,
      "step": 1145
    },
    {
      "epoch": 0.6708962298548822,
      "grad_norm": 0.6958604937899281,
      "learning_rate": 1.8331677670587489e-06,
      "loss": 0.2472,
      "num_tokens": 150775568.0,
      "step": 1150
    },
    {
      "epoch": 0.6738131699846861,
      "grad_norm": 0.7533171707259878,
      "learning_rate": 1.812284676402889e-06,
      "loss": 0.2532,
      "num_tokens": 151432991.0,
      "step": 1155
    },
    {
      "epoch": 0.6767301101144899,
      "grad_norm": 0.7169328151307629,
      "learning_rate": 1.7914988914431958e-06,
      "loss": 0.2515,
      "num_tokens": 152071671.0,
      "step": 1160
    },
    {
      "epoch": 0.6796470502442937,
      "grad_norm": 0.6814651782340072,
      "learning_rate": 1.7708125690978973e-06,
      "loss": 0.2577,
      "num_tokens": 152727004.0,
      "step": 1165
    },
    {
      "epoch": 0.6825639903740975,
      "grad_norm": 0.7553012186400901,
      "learning_rate": 1.7502278559641e-06,
      "loss": 0.246,
      "num_tokens": 153403038.0,
      "step": 1170
    },
    {
      "epoch": 0.6854809305039014,
      "grad_norm": 0.7916967627609816,
      "learning_rate": 1.7297468880950275e-06,
      "loss": 0.2456,
      "num_tokens": 154050570.0,
      "step": 1175
    },
    {
      "epoch": 0.6883978706337053,
      "grad_norm": 0.7570768314074023,
      "learning_rate": 1.7093717907783725e-06,
      "loss": 0.248,
      "num_tokens": 154707732.0,
      "step": 1180
    },
    {
      "epoch": 0.6913148107635091,
      "grad_norm": 0.7403254580619876,
      "learning_rate": 1.6891046783157577e-06,
      "loss": 0.2506,
      "num_tokens": 155357884.0,
      "step": 1185
    },
    {
      "epoch": 0.6942317508933129,
      "grad_norm": 0.7862768452334575,
      "learning_rate": 1.668947653803332e-06,
      "loss": 0.2478,
      "num_tokens": 156012053.0,
      "step": 1190
    },
    {
      "epoch": 0.6971486910231167,
      "grad_norm": 0.6944065718383872,
      "learning_rate": 1.6489028089135412e-06,
      "loss": 0.2403,
      "num_tokens": 156675319.0,
      "step": 1195
    },
    {
      "epoch": 0.7000656311529205,
      "grad_norm": 0.689978749467197,
      "learning_rate": 1.6289722236780708e-06,
      "loss": 0.2443,
      "num_tokens": 157341110.0,
      "step": 1200
    },
    {
      "epoch": 0.7029825712827245,
      "grad_norm": 0.7401973229886903,
      "learning_rate": 1.6091579662720085e-06,
      "loss": 0.2468,
      "num_tokens": 157999131.0,
      "step": 1205
    },
    {
      "epoch": 0.7058995114125283,
      "grad_norm": 0.7320601106731225,
      "learning_rate": 1.5894620927992305e-06,
      "loss": 0.2523,
      "num_tokens": 158639604.0,
      "step": 1210
    },
    {
      "epoch": 0.7088164515423321,
      "grad_norm": 0.7143585333776589,
      "learning_rate": 1.5698866470790408e-06,
      "loss": 0.2511,
      "num_tokens": 159287175.0,
      "step": 1215
    },
    {
      "epoch": 0.7117333916721359,
      "grad_norm": 0.805040758465756,
      "learning_rate": 1.5504336604340859e-06,
      "loss": 0.2527,
      "num_tokens": 159928090.0,
      "step": 1220
    },
    {
      "epoch": 0.7146503318019398,
      "grad_norm": 0.7525019096597073,
      "learning_rate": 1.5311051514795689e-06,
      "loss": 0.2524,
      "num_tokens": 160587939.0,
      "step": 1225
    },
    {
      "epoch": 0.7175672719317436,
      "grad_norm": 0.7709357498772824,
      "learning_rate": 1.5119031259137786e-06,
      "loss": 0.2495,
      "num_tokens": 161239965.0,
      "step": 1230
    },
    {
      "epoch": 0.7204842120615474,
      "grad_norm": 0.6930775959107991,
      "learning_rate": 1.4928295763099595e-06,
      "loss": 0.2471,
      "num_tokens": 161899763.0,
      "step": 1235
    },
    {
      "epoch": 0.7234011521913513,
      "grad_norm": 0.7146451766584444,
      "learning_rate": 1.4738864819095478e-06,
      "loss": 0.2468,
      "num_tokens": 162563978.0,
      "step": 1240
    },
    {
      "epoch": 0.7263180923211551,
      "grad_norm": 0.7233964208181292,
      "learning_rate": 1.4550758084167823e-06,
      "loss": 0.2468,
      "num_tokens": 163215392.0,
      "step": 1245
    },
    {
      "epoch": 0.729235032450959,
      "grad_norm": 0.7535387485017295,
      "learning_rate": 1.4363995077947318e-06,
      "loss": 0.2478,
      "num_tokens": 163876128.0,
      "step": 1250
    },
    {
      "epoch": 0.7321519725807628,
      "grad_norm": 0.710154633510974,
      "learning_rate": 1.417859518062738e-06,
      "loss": 0.2471,
      "num_tokens": 164536666.0,
      "step": 1255
    },
    {
      "epoch": 0.7350689127105666,
      "grad_norm": 0.7793325296651725,
      "learning_rate": 1.3994577630953085e-06,
      "loss": 0.2465,
      "num_tokens": 165186675.0,
      "step": 1260
    },
    {
      "epoch": 0.7379858528403704,
      "grad_norm": 0.7204453905441239,
      "learning_rate": 1.3811961524224838e-06,
      "loss": 0.2453,
      "num_tokens": 165828836.0,
      "step": 1265
    },
    {
      "epoch": 0.7409027929701743,
      "grad_norm": 0.6875863992968856,
      "learning_rate": 1.3630765810316799e-06,
      "loss": 0.2492,
      "num_tokens": 166480185.0,
      "step": 1270
    },
    {
      "epoch": 0.7438197330999782,
      "grad_norm": 0.7399219862362597,
      "learning_rate": 1.3451009291710542e-06,
      "loss": 0.2509,
      "num_tokens": 167127201.0,
      "step": 1275
    },
    {
      "epoch": 0.746736673229782,
      "grad_norm": 0.7264134555452614,
      "learning_rate": 1.3272710621543892e-06,
      "loss": 0.2488,
      "num_tokens": 167776261.0,
      "step": 1280
    },
    {
      "epoch": 0.7496536133595858,
      "grad_norm": 0.7732152888510079,
      "learning_rate": 1.309588830167536e-06,
      "loss": 0.2449,
      "num_tokens": 168426328.0,
      "step": 1285
    },
    {
      "epoch": 0.7525705534893896,
      "grad_norm": 0.7073729535659216,
      "learning_rate": 1.2920560680764165e-06,
      "loss": 0.2421,
      "num_tokens": 169092019.0,
      "step": 1290
    },
    {
      "epoch": 0.7554874936191934,
      "grad_norm": 0.7331560167444491,
      "learning_rate": 1.2746745952366275e-06,
      "loss": 0.2487,
      "num_tokens": 169741583.0,
      "step": 1295
    },
    {
      "epoch": 0.7584044337489974,
      "grad_norm": 0.6884922406400141,
      "learning_rate": 1.2574462153046441e-06,
      "loss": 0.2412,
      "num_tokens": 170389299.0,
      "step": 1300
    },
    {
      "epoch": 0.7613213738788012,
      "grad_norm": 0.781793314708951,
      "learning_rate": 1.2403727160506559e-06,
      "loss": 0.2444,
      "num_tokens": 171042211.0,
      "step": 1305
    },
    {
      "epoch": 0.764238314008605,
      "grad_norm": 0.7250494897320526,
      "learning_rate": 1.223455869173056e-06,
      "loss": 0.2438,
      "num_tokens": 171701780.0,
      "step": 1310
    },
    {
      "epoch": 0.7671552541384088,
      "grad_norm": 0.8740645184327206,
      "learning_rate": 1.2066974301145894e-06,
      "loss": 0.2456,
      "num_tokens": 172376337.0,
      "step": 1315
    },
    {
      "epoch": 0.7700721942682126,
      "grad_norm": 0.7443315304727108,
      "learning_rate": 1.1900991378801964e-06,
      "loss": 0.2439,
      "num_tokens": 173041371.0,
      "step": 1320
    },
    {
      "epoch": 0.7729891343980165,
      "grad_norm": 0.7116423082671238,
      "learning_rate": 1.1736627148565535e-06,
      "loss": 0.2531,
      "num_tokens": 173682308.0,
      "step": 1325
    },
    {
      "epoch": 0.7759060745278203,
      "grad_norm": 0.7077335089669503,
      "learning_rate": 1.157389866633348e-06,
      "loss": 0.246,
      "num_tokens": 174330336.0,
      "step": 1330
    },
    {
      "epoch": 0.7788230146576242,
      "grad_norm": 0.7308022672326984,
      "learning_rate": 1.1412822818262878e-06,
      "loss": 0.2524,
      "num_tokens": 174969619.0,
      "step": 1335
    },
    {
      "epoch": 0.781739954787428,
      "grad_norm": 0.6995063282185964,
      "learning_rate": 1.125341631901876e-06,
      "loss": 0.2406,
      "num_tokens": 175634291.0,
      "step": 1340
    },
    {
      "epoch": 0.7846568949172318,
      "grad_norm": 0.7514971805090548,
      "learning_rate": 1.1095695710039664e-06,
      "loss": 0.2403,
      "num_tokens": 176302019.0,
      "step": 1345
    },
    {
      "epoch": 0.7875738350470357,
      "grad_norm": 0.6772018040657682,
      "learning_rate": 1.0939677357821108e-06,
      "loss": 0.2445,
      "num_tokens": 176976275.0,
      "step": 1350
    },
    {
      "epoch": 0.7904907751768395,
      "grad_norm": 0.6817675548735839,
      "learning_rate": 1.0785377452217311e-06,
      "loss": 0.2503,
      "num_tokens": 177628328.0,
      "step": 1355
    },
    {
      "epoch": 0.7934077153066433,
      "grad_norm": 0.7346247121033264,
      "learning_rate": 1.0632812004761151e-06,
      "loss": 0.243,
      "num_tokens": 178288873.0,
      "step": 1360
    },
    {
      "epoch": 0.7963246554364471,
      "grad_norm": 0.7311842645346187,
      "learning_rate": 1.0481996847002676e-06,
      "loss": 0.2504,
      "num_tokens": 178942090.0,
      "step": 1365
    },
    {
      "epoch": 0.799241595566251,
      "grad_norm": 0.731394943481863,
      "learning_rate": 1.0332947628866273e-06,
      "loss": 0.2428,
      "num_tokens": 179600176.0,
      "step": 1370
    },
    {
      "epoch": 0.8021585356960549,
      "grad_norm": 0.7100948172511871,
      "learning_rate": 1.0185679817026715e-06,
      "loss": 0.2511,
      "num_tokens": 180244674.0,
      "step": 1375
    },
    {
      "epoch": 0.8050754758258587,
      "grad_norm": 0.708242217717899,
      "learning_rate": 1.0040208693304183e-06,
      "loss": 0.2456,
      "num_tokens": 180901761.0,
      "step": 1380
    },
    {
      "epoch": 0.8079924159556625,
      "grad_norm": 0.6803617534010951,
      "learning_rate": 9.89654935307848e-07,
      "loss": 0.2393,
      "num_tokens": 181564639.0,
      "step": 1385
    },
    {
      "epoch": 0.8109093560854663,
      "grad_norm": 0.6958034065313294,
      "learning_rate": 9.754716703722635e-07,
      "loss": 0.2347,
      "num_tokens": 182224730.0,
      "step": 1390
    },
    {
      "epoch": 0.8138262962152701,
      "grad_norm": 0.7165262095609373,
      "learning_rate": 9.614725463055931e-07,
      "loss": 0.2461,
      "num_tokens": 182883552.0,
      "step": 1395
    },
    {
      "epoch": 0.8167432363450741,
      "grad_norm": 0.7138480327038658,
      "learning_rate": 9.476590157816701e-07,
      "loss": 0.2411,
      "num_tokens": 183537453.0,
      "step": 1400
    },
    {
      "epoch": 0.8196601764748779,
      "grad_norm": 0.7133912213063753,
      "learning_rate": 9.340325122154878e-07,
      "loss": 0.2462,
      "num_tokens": 184199270.0,
      "step": 1405
    },
    {
      "epoch": 0.8225771166046817,
      "grad_norm": 0.6947806259213369,
      "learning_rate": 9.205944496144556e-07,
      "loss": 0.2486,
      "num_tokens": 184853282.0,
      "step": 1410
    },
    {
      "epoch": 0.8254940567344855,
      "grad_norm": 0.7272223644551763,
      "learning_rate": 9.073462224316707e-07,
      "loss": 0.2447,
      "num_tokens": 185514281.0,
      "step": 1415
    },
    {
      "epoch": 0.8284109968642893,
      "grad_norm": 0.7072312165887826,
      "learning_rate": 8.942892054212143e-07,
      "loss": 0.2397,
      "num_tokens": 186169820.0,
      "step": 1420
    },
    {
| "epoch": 0.8313279369940932, | |
| "grad_norm": 0.6903355667912614, | |
| "learning_rate": 8.814247534954983e-07, | |
| "loss": 0.2497, | |
| "num_tokens": 186817556.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.834244877123897, | |
| "grad_norm": 0.706857521915122, | |
| "learning_rate": 8.687542015846639e-07, | |
| "loss": 0.2438, | |
| "num_tokens": 187469631.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.8371618172537009, | |
| "grad_norm": 0.6988018386149026, | |
| "learning_rate": 8.562788644980624e-07, | |
| "loss": 0.2455, | |
| "num_tokens": 188113842.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.8400787573835047, | |
| "grad_norm": 0.673516286108323, | |
| "learning_rate": 8.440000367878115e-07, | |
| "loss": 0.2396, | |
| "num_tokens": 188782137.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8429956975133085, | |
| "grad_norm": 0.699489145828766, | |
| "learning_rate": 8.319189926144688e-07, | |
| "loss": 0.2511, | |
| "num_tokens": 189419614.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.8459126376431124, | |
| "grad_norm": 0.7084426008074721, | |
| "learning_rate": 8.200369856148089e-07, | |
| "loss": 0.2457, | |
| "num_tokens": 190076342.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8488295777729162, | |
| "grad_norm": 0.6739486565962028, | |
| "learning_rate": 8.083552487717358e-07, | |
| "loss": 0.2457, | |
| "num_tokens": 190726569.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.85174651790272, | |
| "grad_norm": 0.7095407182813062, | |
| "learning_rate": 7.968749942863385e-07, | |
| "loss": 0.2435, | |
| "num_tokens": 191375804.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8546634580325239, | |
| "grad_norm": 0.6710001579802304, | |
| "learning_rate": 7.855974134520999e-07, | |
| "loss": 0.2405, | |
| "num_tokens": 192039149.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.8575803981623277, | |
| "grad_norm": 0.7079688435936735, | |
| "learning_rate": 7.745236765312819e-07, | |
| "loss": 0.2411, | |
| "num_tokens": 192695832.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8604973382921316, | |
| "grad_norm": 0.6854485439890876, | |
| "learning_rate": 7.636549326334825e-07, | |
| "loss": 0.2417, | |
| "num_tokens": 193367048.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.8634142784219354, | |
| "grad_norm": 0.7184112786493275, | |
| "learning_rate": 7.529923095963999e-07, | |
| "loss": 0.2441, | |
| "num_tokens": 194017542.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8663312185517392, | |
| "grad_norm": 0.6651542917903988, | |
| "learning_rate": 7.425369138687957e-07, | |
| "loss": 0.2422, | |
| "num_tokens": 194667338.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.869248158681543, | |
| "grad_norm": 0.6648797188770686, | |
| "learning_rate": 7.322898303956773e-07, | |
| "loss": 0.2416, | |
| "num_tokens": 195324822.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8721650988113469, | |
| "grad_norm": 0.6743904697311516, | |
| "learning_rate": 7.222521225057187e-07, | |
| "loss": 0.2481, | |
| "num_tokens": 195972844.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.8750820389411508, | |
| "grad_norm": 0.6862148679988234, | |
| "learning_rate": 7.124248318009164e-07, | |
| "loss": 0.2426, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8750820389411508, | |
| "eval_loss": 0.2491583675146103, | |
| "eval_num_tokens": 196624476.0, | |
| "eval_runtime": 607.7377, | |
| "eval_samples_per_second": 9.501, | |
| "eval_steps_per_second": 1.188, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8779989790709546, | |
| "grad_norm": 0.7150785623554777, | |
| "learning_rate": 7.028089780485081e-07, | |
| "loss": 0.2437, | |
| "num_tokens": 197287303.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.8809159192007584, | |
| "grad_norm": 0.6922051228380665, | |
| "learning_rate": 6.934055590751461e-07, | |
| "loss": 0.237, | |
| "num_tokens": 197950146.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8838328593305622, | |
| "grad_norm": 0.6814419896511972, | |
| "learning_rate": 6.842155506633598e-07, | |
| "loss": 0.2429, | |
| "num_tokens": 198606592.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.886749799460366, | |
| "grad_norm": 0.7528574216832195, | |
| "learning_rate": 6.752399064502959e-07, | |
| "loss": 0.2374, | |
| "num_tokens": 199260466.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.88966673959017, | |
| "grad_norm": 0.743850816504643, | |
| "learning_rate": 6.664795578287632e-07, | |
| "loss": 0.2498, | |
| "num_tokens": 199915169.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.8925836797199738, | |
| "grad_norm": 0.6659899742478199, | |
| "learning_rate": 6.579354138505817e-07, | |
| "loss": 0.239, | |
| "num_tokens": 200577384.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8955006198497776, | |
| "grad_norm": 0.6989537096567529, | |
| "learning_rate": 6.496083611322503e-07, | |
| "loss": 0.2437, | |
| "num_tokens": 201229904.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.8984175599795814, | |
| "grad_norm": 0.6870906534872678, | |
| "learning_rate": 6.414992637629462e-07, | |
| "loss": 0.2459, | |
| "num_tokens": 201874398.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.9013345001093852, | |
| "grad_norm": 0.6851022314706521, | |
| "learning_rate": 6.336089632148566e-07, | |
| "loss": 0.2437, | |
| "num_tokens": 202514986.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.9042514402391891, | |
| "grad_norm": 0.708606924412846, | |
| "learning_rate": 6.259382782558623e-07, | |
| "loss": 0.2513, | |
| "num_tokens": 203157709.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9071683803689929, | |
| "grad_norm": 0.6901795784562913, | |
| "learning_rate": 6.184880048645731e-07, | |
| "loss": 0.241, | |
| "num_tokens": 203820585.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.9100853204987968, | |
| "grad_norm": 0.6854819640738248, | |
| "learning_rate": 6.112589161477317e-07, | |
| "loss": 0.2474, | |
| "num_tokens": 204463922.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.9130022606286006, | |
| "grad_norm": 0.6490034167792925, | |
| "learning_rate": 6.042517622599872e-07, | |
| "loss": 0.2423, | |
| "num_tokens": 205116527.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.9159192007584044, | |
| "grad_norm": 0.6700066915913209, | |
| "learning_rate": 5.97467270326055e-07, | |
| "loss": 0.2399, | |
| "num_tokens": 205783661.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.9188361408882083, | |
| "grad_norm": 0.6666592595870292, | |
| "learning_rate": 5.909061443652619e-07, | |
| "loss": 0.2483, | |
| "num_tokens": 206431518.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.9217530810180121, | |
| "grad_norm": 0.6880885792235153, | |
| "learning_rate": 5.845690652184906e-07, | |
| "loss": 0.244, | |
| "num_tokens": 207082078.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.9246700211478159, | |
| "grad_norm": 0.7218901751907087, | |
| "learning_rate": 5.784566904775314e-07, | |
| "loss": 0.2439, | |
| "num_tokens": 207733633.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.9275869612776197, | |
| "grad_norm": 0.7224619275428422, | |
| "learning_rate": 5.725696544168431e-07, | |
| "loss": 0.2442, | |
| "num_tokens": 208384194.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.9305039014074236, | |
| "grad_norm": 0.685956055593419, | |
| "learning_rate": 5.669085679277353e-07, | |
| "loss": 0.2467, | |
| "num_tokens": 209017882.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.9334208415372275, | |
| "grad_norm": 0.6595594654962236, | |
| "learning_rate": 5.614740184549774e-07, | |
| "loss": 0.2385, | |
| "num_tokens": 209671100.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9363377816670313, | |
| "grad_norm": 0.6635370096109607, | |
| "learning_rate": 5.562665699358395e-07, | |
| "loss": 0.2403, | |
| "num_tokens": 210332481.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.9392547217968351, | |
| "grad_norm": 0.6792107629780629, | |
| "learning_rate": 5.512867627415738e-07, | |
| "loss": 0.2485, | |
| "num_tokens": 210989185.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.9421716619266389, | |
| "grad_norm": 0.688506342901487, | |
| "learning_rate": 5.465351136213403e-07, | |
| "loss": 0.2404, | |
| "num_tokens": 211654349.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.9450886020564427, | |
| "grad_norm": 0.6957995520415324, | |
| "learning_rate": 5.420121156485843e-07, | |
| "loss": 0.2449, | |
| "num_tokens": 212306065.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.9480055421862467, | |
| "grad_norm": 0.6874524399723582, | |
| "learning_rate": 5.377182381698713e-07, | |
| "loss": 0.2461, | |
| "num_tokens": 212961559.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.9509224823160505, | |
| "grad_norm": 0.6531866036674674, | |
| "learning_rate": 5.336539267561834e-07, | |
| "loss": 0.2419, | |
| "num_tokens": 213623892.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.9538394224458543, | |
| "grad_norm": 0.6861036738287705, | |
| "learning_rate": 5.298196031566817e-07, | |
| "loss": 0.2397, | |
| "num_tokens": 214276109.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.9567563625756581, | |
| "grad_norm": 0.7385399272118951, | |
| "learning_rate": 5.262156652549434e-07, | |
| "loss": 0.2404, | |
| "num_tokens": 214935598.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9596733027054619, | |
| "grad_norm": 0.7641216689081703, | |
| "learning_rate": 5.228424870276732e-07, | |
| "loss": 0.2413, | |
| "num_tokens": 215592154.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.9625902428352658, | |
| "grad_norm": 0.6626697793424543, | |
| "learning_rate": 5.197004185058957e-07, | |
| "loss": 0.2402, | |
| "num_tokens": 216248152.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9655071829650697, | |
| "grad_norm": 0.6668250234146728, | |
| "learning_rate": 5.167897857386338e-07, | |
| "loss": 0.24, | |
| "num_tokens": 216908673.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.9684241230948735, | |
| "grad_norm": 0.6830282199194727, | |
| "learning_rate": 5.141108907590743e-07, | |
| "loss": 0.2447, | |
| "num_tokens": 217569951.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9713410632246773, | |
| "grad_norm": 0.694769552967056, | |
| "learning_rate": 5.116640115532271e-07, | |
| "loss": 0.2463, | |
| "num_tokens": 218214661.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.9742580033544811, | |
| "grad_norm": 0.6576753862947748, | |
| "learning_rate": 5.09449402031078e-07, | |
| "loss": 0.2377, | |
| "num_tokens": 218875984.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.977174943484285, | |
| "grad_norm": 0.6885196158109818, | |
| "learning_rate": 5.074672920002409e-07, | |
| "loss": 0.2388, | |
| "num_tokens": 219529057.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.9800918836140888, | |
| "grad_norm": 0.6862150588167344, | |
| "learning_rate": 5.057178871421117e-07, | |
| "loss": 0.238, | |
| "num_tokens": 220197787.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9830088237438926, | |
| "grad_norm": 0.6778601221453343, | |
| "learning_rate": 5.04201368990524e-07, | |
| "loss": 0.2513, | |
| "num_tokens": 220841743.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.9859257638736965, | |
| "grad_norm": 0.6899658979865586, | |
| "learning_rate": 5.029178949129118e-07, | |
| "loss": 0.243, | |
| "num_tokens": 221473805.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9888427040035004, | |
| "grad_norm": 0.6528281983164207, | |
| "learning_rate": 5.018675980939805e-07, | |
| "loss": 0.2444, | |
| "num_tokens": 222120807.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.9917596441333042, | |
| "grad_norm": 0.6674621015647066, | |
| "learning_rate": 5.010505875218846e-07, | |
| "loss": 0.2386, | |
| "num_tokens": 222794425.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.994676584263108, | |
| "grad_norm": 0.7086141702806269, | |
| "learning_rate": 5.004669479769203e-07, | |
| "loss": 0.2405, | |
| "num_tokens": 223443814.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9975935243929118, | |
| "grad_norm": 0.6795651043519514, | |
| "learning_rate": 5.001167400227263e-07, | |
| "loss": 0.2383, | |
| "num_tokens": 224098786.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9999270764967549, | |
| "num_tokens": 224623551.0, | |
| "step": 1714, | |
| "total_flos": 9.914064352850215e+18, | |
| "train_loss": 0.2932530035076787, | |
| "train_runtime": 32864.6589, | |
| "train_samples_per_second": 3.338, | |
| "train_steps_per_second": 0.052 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1714, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.914064352850215e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
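
The state above is the standard `trainer_state.json` that `transformers.Trainer` writes next to its checkpoints. As a minimal sketch of how such a file can be consumed (not part of the original artifact): the snippet below assumes the JSON has been saved locally as `trainer_state.json` with the table wrapping stripped; the filename is an assumption, and only the Python standard library is used.

```python
import json

# Minimal sketch: summarize a Hugging Face trainer_state.json.
# Assumes the JSON above was saved as "trainer_state.json" (assumed name).
with open("trainer_state.json") as f:
    state = json.load(f)

# Train entries carry "loss", eval entries carry "eval_loss", and the
# final summary entry carries "train_loss"; filter on the key present.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged every {state['logging_steps']} steps; max_steps = {state['max_steps']}")
print(f"train loss: {train_logs[0]['loss']:.4f} (step {train_logs[0]['step']}) "
      f"-> {train_logs[-1]['loss']:.4f} (step {train_logs[-1]['step']})")
for e in eval_logs:
    print(f"eval_loss {e['eval_loss']:.4f} at step {e['step']}")

final = state["log_history"][-1]
print(f"final train_loss {final['train_loss']:.4f} over {final['train_runtime']:.0f}s")
```

With matplotlib available, plotting `[e["step"] for e in train_logs]` against `[e["loss"] for e in train_logs]` gives the usual loss curve.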