{
  "best_global_step": 24000,
  "best_metric": 0.6922064423561096,
  "best_model_checkpoint": "/content/drive/MyDrive/fyp-2025/ModelFinetuniningData/Phi-4-mini-instruct-customerservice/outputs/checkpoint-24000",
  "epoch": 3.0,
  "eval_steps": 2000,
  "global_step": 24063,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006233636703652911,
      "grad_norm": 2.4495298862457275,
      "learning_rate": 8.139534883720931e-07,
      "loss": 2.7651,
      "step": 50
    },
    {
      "epoch": 0.012467273407305822,
      "grad_norm": 2.381847858428955,
      "learning_rate": 1.6445182724252492e-06,
      "loss": 2.7284,
      "step": 100
    },
    {
      "epoch": 0.018700910110958733,
      "grad_norm": 1.6577181816101074,
      "learning_rate": 2.4750830564784057e-06,
      "loss": 2.5988,
      "step": 150
    },
    {
      "epoch": 0.024934546814611644,
      "grad_norm": 0.9445393085479736,
      "learning_rate": 3.305647840531562e-06,
      "loss": 2.3047,
      "step": 200
    },
    {
      "epoch": 0.031168183518264555,
      "grad_norm": 1.2448478937149048,
      "learning_rate": 4.136212624584718e-06,
      "loss": 1.9769,
      "step": 250
    },
    {
      "epoch": 0.037401820221917466,
      "grad_norm": 0.8048786520957947,
      "learning_rate": 4.966777408637874e-06,
      "loss": 1.4865,
      "step": 300
    },
    {
      "epoch": 0.04363545692557038,
      "grad_norm": 0.47680938243865967,
      "learning_rate": 5.79734219269103e-06,
      "loss": 1.1596,
      "step": 350
    },
    {
      "epoch": 0.04986909362922329,
      "grad_norm": 0.4646517336368561,
      "learning_rate": 6.627906976744186e-06,
      "loss": 1.0886,
      "step": 400
    },
    {
      "epoch": 0.0561027303328762,
      "grad_norm": 0.49803978204727173,
      "learning_rate": 7.4584717607973425e-06,
      "loss": 1.0394,
      "step": 450
    },
    {
      "epoch": 0.06233636703652911,
      "grad_norm": 0.5668339133262634,
      "learning_rate": 8.2890365448505e-06,
      "loss": 1.0005,
      "step": 500
    },
    {
      "epoch": 0.06857000374018202,
      "grad_norm": 0.6751675605773926,
      "learning_rate": 9.119601328903655e-06,
      "loss": 0.9783,
      "step": 550
    },
    {
      "epoch": 0.07480364044383493,
      "grad_norm": 0.7680630087852478,
      "learning_rate": 9.950166112956811e-06,
      "loss": 0.95,
      "step": 600
    },
    {
      "epoch": 0.08103727714748785,
      "grad_norm": 1.0588089227676392,
      "learning_rate": 1.0780730897009968e-05,
      "loss": 0.9327,
      "step": 650
    },
    {
      "epoch": 0.08727091385114076,
      "grad_norm": 0.7925204634666443,
      "learning_rate": 1.1611295681063124e-05,
      "loss": 0.9109,
      "step": 700
    },
    {
      "epoch": 0.09350455055479366,
      "grad_norm": 0.8681260943412781,
      "learning_rate": 1.244186046511628e-05,
      "loss": 0.8868,
      "step": 750
    },
    {
      "epoch": 0.09973818725844658,
      "grad_norm": 0.7388427257537842,
      "learning_rate": 1.3272425249169436e-05,
      "loss": 0.8935,
      "step": 800
    },
    {
      "epoch": 0.10597182396209949,
      "grad_norm": 0.7777144908905029,
      "learning_rate": 1.4102990033222592e-05,
      "loss": 0.8895,
      "step": 850
    },
    {
      "epoch": 0.1122054606657524,
      "grad_norm": 0.8474845290184021,
      "learning_rate": 1.4933554817275748e-05,
      "loss": 0.8827,
      "step": 900
    },
    {
      "epoch": 0.1184390973694053,
      "grad_norm": 0.8560120463371277,
      "learning_rate": 1.5764119601328905e-05,
      "loss": 0.8757,
      "step": 950
    },
    {
      "epoch": 0.12467273407305822,
      "grad_norm": 0.841167151927948,
      "learning_rate": 1.659468438538206e-05,
      "loss": 0.8693,
      "step": 1000
    },
    {
      "epoch": 0.13090637077671113,
      "grad_norm": 0.8493492007255554,
      "learning_rate": 1.7425249169435217e-05,
      "loss": 0.8653,
      "step": 1050
    },
    {
      "epoch": 0.13714000748036403,
      "grad_norm": 0.8703436255455017,
      "learning_rate": 1.825581395348837e-05,
      "loss": 0.8591,
      "step": 1100
    },
    {
      "epoch": 0.14337364418401696,
      "grad_norm": 0.7577048540115356,
      "learning_rate": 1.908637873754153e-05,
      "loss": 0.8606,
      "step": 1150
    },
    {
      "epoch": 0.14960728088766986,
      "grad_norm": 1.394689679145813,
      "learning_rate": 1.9916943521594686e-05,
      "loss": 0.8399,
      "step": 1200
    },
    {
      "epoch": 0.15584091759132276,
      "grad_norm": 0.8342117667198181,
      "learning_rate": 1.999980875991073e-05,
      "loss": 0.8434,
      "step": 1250
    },
    {
      "epoch": 0.1620745542949757,
      "grad_norm": 0.932096540927887,
      "learning_rate": 1.9999147692449697e-05,
      "loss": 0.8406,
      "step": 1300
    },
    {
      "epoch": 0.1683081909986286,
      "grad_norm": 0.7788100242614746,
      "learning_rate": 1.999801446783615e-05,
      "loss": 0.8308,
      "step": 1350
    },
    {
      "epoch": 0.17454182770228152,
      "grad_norm": 0.7356207370758057,
      "learning_rate": 1.9996409139580664e-05,
      "loss": 0.8286,
      "step": 1400
    },
    {
      "epoch": 0.18077546440593442,
      "grad_norm": 0.792756199836731,
      "learning_rate": 1.9994331783486415e-05,
      "loss": 0.8301,
      "step": 1450
    },
    {
      "epoch": 0.18700910110958732,
      "grad_norm": 0.7116053700447083,
      "learning_rate": 1.9991782497645624e-05,
      "loss": 0.8224,
      "step": 1500
    },
    {
      "epoch": 0.19324273781324025,
      "grad_norm": 0.7787157297134399,
      "learning_rate": 1.99887614024349e-05,
      "loss": 0.8251,
      "step": 1550
    },
    {
      "epoch": 0.19947637451689315,
      "grad_norm": 0.7587001919746399,
      "learning_rate": 1.9985268640509576e-05,
      "loss": 0.8223,
      "step": 1600
    },
    {
      "epoch": 0.20571001122054608,
      "grad_norm": 0.7318805456161499,
      "learning_rate": 1.998130437679696e-05,
      "loss": 0.8257,
      "step": 1650
    },
    {
      "epoch": 0.21194364792419898,
      "grad_norm": 0.7161705493927002,
      "learning_rate": 1.997686879848855e-05,
      "loss": 0.8171,
      "step": 1700
    },
    {
      "epoch": 0.21817728462785188,
      "grad_norm": 0.7708820700645447,
      "learning_rate": 1.997196211503121e-05,
      "loss": 0.8126,
      "step": 1750
    },
    {
      "epoch": 0.2244109213315048,
      "grad_norm": 0.7371873259544373,
      "learning_rate": 1.9966584558117242e-05,
      "loss": 0.8078,
      "step": 1800
    },
    {
      "epoch": 0.2306445580351577,
      "grad_norm": 0.8285321593284607,
      "learning_rate": 1.9960736381673492e-05,
      "loss": 0.8075,
      "step": 1850
    },
    {
      "epoch": 0.2368781947388106,
      "grad_norm": 0.7384074926376343,
      "learning_rate": 1.9954417861849332e-05,
      "loss": 0.8041,
      "step": 1900
    },
    {
      "epoch": 0.24311183144246354,
      "grad_norm": 0.8148590922355652,
      "learning_rate": 1.994762929700362e-05,
      "loss": 0.8003,
      "step": 1950
    },
    {
      "epoch": 0.24934546814611644,
      "grad_norm": 0.8277537226676941,
      "learning_rate": 1.9940371007690626e-05,
      "loss": 0.7973,
      "step": 2000
    },
    {
      "epoch": 0.24934546814611644,
      "eval_loss": 0.8006435632705688,
      "eval_runtime": 396.9923,
      "eval_samples_per_second": 46.18,
      "eval_steps_per_second": 5.773,
      "step": 2000
    },
    {
      "epoch": 0.25557910484976937,
      "grad_norm": 0.8305752277374268,
      "learning_rate": 1.9932643336644877e-05,
      "loss": 0.8023,
      "step": 2050
    },
    {
      "epoch": 0.26181274155342227,
      "grad_norm": 0.8062120676040649,
      "learning_rate": 1.9924446648764995e-05,
      "loss": 0.8022,
      "step": 2100
    },
    {
      "epoch": 0.26804637825707517,
      "grad_norm": 0.843477725982666,
      "learning_rate": 1.991578133109645e-05,
      "loss": 0.7972,
      "step": 2150
    },
    {
      "epoch": 0.27428001496072807,
      "grad_norm": 0.7614973187446594,
      "learning_rate": 1.990664779281328e-05,
      "loss": 0.7871,
      "step": 2200
    },
    {
      "epoch": 0.280513651664381,
      "grad_norm": 0.7798663973808289,
      "learning_rate": 1.9897046465198794e-05,
      "loss": 0.7927,
      "step": 2250
    },
    {
      "epoch": 0.2867472883680339,
      "grad_norm": 0.7543362379074097,
      "learning_rate": 1.9886977801625176e-05,
      "loss": 0.7975,
      "step": 2300
    },
    {
      "epoch": 0.2929809250716868,
      "grad_norm": 0.7982914447784424,
      "learning_rate": 1.987644227753211e-05,
      "loss": 0.7966,
      "step": 2350
    },
    {
      "epoch": 0.2992145617753397,
      "grad_norm": 0.8895176649093628,
      "learning_rate": 1.98654403904043e-05,
      "loss": 0.7799,
      "step": 2400
    },
    {
      "epoch": 0.3054481984789926,
      "grad_norm": 0.7891516089439392,
      "learning_rate": 1.9853972659747986e-05,
      "loss": 0.7875,
      "step": 2450
    },
    {
      "epoch": 0.31168183518264553,
      "grad_norm": 0.8148348331451416,
      "learning_rate": 1.9842039627066433e-05,
      "loss": 0.7976,
      "step": 2500
    },
    {
      "epoch": 0.3179154718862985,
      "grad_norm": 0.7989701628684998,
      "learning_rate": 1.982964185583434e-05,
      "loss": 0.7862,
      "step": 2550
    },
    {
      "epoch": 0.3241491085899514,
      "grad_norm": 0.8005407452583313,
      "learning_rate": 1.9816779931471238e-05,
      "loss": 0.794,
      "step": 2600
    },
    {
      "epoch": 0.3303827452936043,
      "grad_norm": 0.8170273303985596,
      "learning_rate": 1.980345446131385e-05,
      "loss": 0.7794,
      "step": 2650
    },
    {
      "epoch": 0.3366163819972572,
      "grad_norm": 0.7872186303138733,
      "learning_rate": 1.9789666074587405e-05,
      "loss": 0.7798,
      "step": 2700
    },
    {
      "epoch": 0.3428500187009101,
      "grad_norm": 0.8118924498558044,
      "learning_rate": 1.9775415422375942e-05,
      "loss": 0.7803,
      "step": 2750
    },
    {
      "epoch": 0.34908365540456304,
      "grad_norm": 0.839482843875885,
      "learning_rate": 1.9760703177591547e-05,
      "loss": 0.7783,
      "step": 2800
    },
    {
      "epoch": 0.35531729210821594,
      "grad_norm": 0.8034165501594543,
      "learning_rate": 1.9745530034942594e-05,
      "loss": 0.7926,
      "step": 2850
    },
    {
      "epoch": 0.36155092881186884,
      "grad_norm": 0.8720650672912598,
      "learning_rate": 1.9729896710900927e-05,
      "loss": 0.7713,
      "step": 2900
    },
    {
      "epoch": 0.36778456551552174,
      "grad_norm": 0.883476972579956,
      "learning_rate": 1.9713803943668045e-05,
      "loss": 0.7735,
      "step": 2950
    },
    {
      "epoch": 0.37401820221917464,
      "grad_norm": 0.816597580909729,
      "learning_rate": 1.9697252493140228e-05,
      "loss": 0.7713,
      "step": 3000
    },
    {
      "epoch": 0.3802518389228276,
      "grad_norm": 0.7947691679000854,
      "learning_rate": 1.9680243140872664e-05,
      "loss": 0.7897,
      "step": 3050
    },
    {
      "epoch": 0.3864854756264805,
      "grad_norm": 0.8053900599479675,
      "learning_rate": 1.966277669004254e-05,
      "loss": 0.773,
      "step": 3100
    },
    {
      "epoch": 0.3927191123301334,
      "grad_norm": 0.8357210159301758,
      "learning_rate": 1.9644853965411125e-05,
      "loss": 0.772,
      "step": 3150
    },
    {
      "epoch": 0.3989527490337863,
      "grad_norm": 0.915678083896637,
      "learning_rate": 1.96264758132848e-05,
      "loss": 0.7762,
      "step": 3200
    },
    {
      "epoch": 0.4051863857374392,
      "grad_norm": 0.8132200837135315,
      "learning_rate": 1.9607643101475146e-05,
      "loss": 0.77,
      "step": 3250
    },
    {
      "epoch": 0.41142002244109216,
      "grad_norm": 0.8125095963478088,
      "learning_rate": 1.95883567192579e-05,
      "loss": 0.7764,
      "step": 3300
    },
    {
      "epoch": 0.41765365914474506,
      "grad_norm": 0.8730440139770508,
      "learning_rate": 1.9568617577331014e-05,
      "loss": 0.7703,
      "step": 3350
    },
    {
      "epoch": 0.42388729584839796,
      "grad_norm": 0.9059854745864868,
      "learning_rate": 1.954842660777164e-05,
      "loss": 0.7541,
      "step": 3400
    },
    {
      "epoch": 0.43012093255205086,
      "grad_norm": 0.8346767425537109,
      "learning_rate": 1.9527784763992106e-05,
      "loss": 0.7634,
      "step": 3450
    },
    {
      "epoch": 0.43635456925570376,
      "grad_norm": 0.7939431071281433,
      "learning_rate": 1.9506693020694904e-05,
      "loss": 0.766,
      "step": 3500
    },
    {
      "epoch": 0.44258820595935666,
      "grad_norm": 0.8269067406654358,
      "learning_rate": 1.948515237382666e-05,
      "loss": 0.7564,
      "step": 3550
    },
    {
      "epoch": 0.4488218426630096,
      "grad_norm": 0.8967489004135132,
      "learning_rate": 1.9463163840531125e-05,
      "loss": 0.7665,
      "step": 3600
    },
    {
      "epoch": 0.4550554793666625,
      "grad_norm": 0.7770310640335083,
      "learning_rate": 1.9440728459101112e-05,
      "loss": 0.772,
      "step": 3650
    },
    {
      "epoch": 0.4612891160703154,
      "grad_norm": 0.8270549774169922,
      "learning_rate": 1.9417847288929495e-05,
      "loss": 0.7578,
      "step": 3700
    },
    {
      "epoch": 0.4675227527739683,
      "grad_norm": 0.8432906270027161,
      "learning_rate": 1.9394521410459182e-05,
      "loss": 0.7653,
      "step": 3750
    },
    {
      "epoch": 0.4737563894776212,
      "grad_norm": 0.8015474081039429,
      "learning_rate": 1.9370751925132082e-05,
      "loss": 0.7575,
      "step": 3800
    },
    {
      "epoch": 0.4799900261812742,
      "grad_norm": 0.8320118188858032,
      "learning_rate": 1.9346539955337113e-05,
      "loss": 0.7486,
      "step": 3850
    },
    {
      "epoch": 0.4862236628849271,
      "grad_norm": 0.833288848400116,
      "learning_rate": 1.9321886644357178e-05,
      "loss": 0.7626,
      "step": 3900
    },
    {
      "epoch": 0.49245729958858,
      "grad_norm": 0.8111228942871094,
      "learning_rate": 1.9296793156315216e-05,
      "loss": 0.7503,
      "step": 3950
    },
    {
      "epoch": 0.4986909362922329,
      "grad_norm": 0.7788956165313721,
      "learning_rate": 1.9271260676119205e-05,
      "loss": 0.7635,
      "step": 4000
    },
    {
      "epoch": 0.4986909362922329,
      "eval_loss": 0.7571428418159485,
      "eval_runtime": 397.2108,
      "eval_samples_per_second": 46.154,
      "eval_steps_per_second": 5.77,
      "step": 4000
    },
    {
      "epoch": 0.5049245729958858,
      "grad_norm": 0.8604903817176819,
      "learning_rate": 1.924529040940621e-05,
      "loss": 0.7644,
      "step": 4050
    },
    {
      "epoch": 0.5111582096995387,
      "grad_norm": 0.7932497262954712,
      "learning_rate": 1.9218883582485476e-05,
      "loss": 0.757,
      "step": 4100
    },
    {
      "epoch": 0.5173918464031916,
      "grad_norm": 0.8160136342048645,
      "learning_rate": 1.9192041442280494e-05,
      "loss": 0.7458,
      "step": 4150
    },
    {
      "epoch": 0.5236254831068445,
      "grad_norm": 0.908441960811615,
      "learning_rate": 1.916476525627014e-05,
      "loss": 0.7501,
      "step": 4200
    },
    {
      "epoch": 0.5298591198104975,
      "grad_norm": 0.8205119967460632,
      "learning_rate": 1.9137056312428827e-05,
      "loss": 0.7514,
      "step": 4250
    },
    {
      "epoch": 0.5360927565141503,
      "grad_norm": 0.783224880695343,
      "learning_rate": 1.910891591916567e-05,
      "loss": 0.7525,
      "step": 4300
    },
    {
      "epoch": 0.5423263932178033,
      "grad_norm": 0.8491820096969604,
      "learning_rate": 1.908034540526272e-05,
      "loss": 0.7668,
      "step": 4350
    },
    {
      "epoch": 0.5485600299214561,
      "grad_norm": 0.9290434122085571,
      "learning_rate": 1.9051346119812208e-05,
      "loss": 0.7516,
      "step": 4400
    },
    {
      "epoch": 0.5547936666251091,
      "grad_norm": 0.8297129273414612,
      "learning_rate": 1.902191943215285e-05,
      "loss": 0.7552,
      "step": 4450
    },
    {
      "epoch": 0.561027303328762,
      "grad_norm": 0.8847944140434265,
      "learning_rate": 1.8992066731805175e-05,
      "loss": 0.7534,
      "step": 4500
    },
    {
      "epoch": 0.5672609400324149,
      "grad_norm": 0.8535125255584717,
      "learning_rate": 1.8961789428405933e-05,
      "loss": 0.7483,
      "step": 4550
    },
    {
      "epoch": 0.5734945767360679,
      "grad_norm": 0.7982850074768066,
      "learning_rate": 1.8931088951641512e-05,
      "loss": 0.754,
      "step": 4600
    },
    {
      "epoch": 0.5797282134397207,
      "grad_norm": 0.9072125554084778,
      "learning_rate": 1.8899966751180435e-05,
      "loss": 0.746,
      "step": 4650
    },
    {
      "epoch": 0.5859618501433737,
      "grad_norm": 0.800804853439331,
      "learning_rate": 1.8868424296604913e-05,
      "loss": 0.7504,
      "step": 4700
    },
    {
      "epoch": 0.5921954868470266,
      "grad_norm": 0.8139356374740601,
      "learning_rate": 1.8836463077341447e-05,
      "loss": 0.7424,
      "step": 4750
    },
    {
      "epoch": 0.5984291235506795,
      "grad_norm": 0.8343134522438049,
      "learning_rate": 1.880408460259049e-05,
      "loss": 0.7503,
      "step": 4800
    },
    {
      "epoch": 0.6046627602543324,
      "grad_norm": 0.7936016321182251,
      "learning_rate": 1.8771290401255194e-05,
      "loss": 0.7433,
      "step": 4850
    },
    {
      "epoch": 0.6108963969579853,
      "grad_norm": 0.8802788257598877,
      "learning_rate": 1.873808202186922e-05,
      "loss": 0.7428,
      "step": 4900
    },
    {
      "epoch": 0.6171300336616382,
      "grad_norm": 0.8412097096443176,
      "learning_rate": 1.87044610325236e-05,
      "loss": 0.746,
      "step": 4950
    },
    {
      "epoch": 0.6233636703652911,
      "grad_norm": 0.804280698299408,
      "learning_rate": 1.8670429020792703e-05,
      "loss": 0.7425,
      "step": 5000
    },
    {
      "epoch": 0.629597307068944,
      "grad_norm": 0.8929165601730347,
      "learning_rate": 1.8635987593659274e-05,
      "loss": 0.7412,
      "step": 5050
    },
    {
      "epoch": 0.635830943772597,
      "grad_norm": 0.8767927289009094,
      "learning_rate": 1.860113837743853e-05,
      "loss": 0.7536,
      "step": 5100
    },
    {
      "epoch": 0.6420645804762498,
      "grad_norm": 0.8169072270393372,
      "learning_rate": 1.8565883017701404e-05,
      "loss": 0.7461,
      "step": 5150
    },
    {
      "epoch": 0.6482982171799028,
      "grad_norm": 0.8935323357582092,
      "learning_rate": 1.8530223179196807e-05,
      "loss": 0.7413,
      "step": 5200
    },
    {
      "epoch": 0.6545318538835556,
      "grad_norm": 0.8844409584999084,
      "learning_rate": 1.8494160545773036e-05,
      "loss": 0.7425,
      "step": 5250
    },
    {
      "epoch": 0.6607654905872086,
      "grad_norm": 0.853883683681488,
      "learning_rate": 1.8457696820298253e-05,
      "loss": 0.7368,
      "step": 5300
    },
    {
      "epoch": 0.6669991272908615,
      "grad_norm": 0.7682216167449951,
      "learning_rate": 1.84208337245801e-05,
      "loss": 0.7473,
      "step": 5350
    },
    {
      "epoch": 0.6732327639945144,
      "grad_norm": 0.8282256722450256,
      "learning_rate": 1.8383572999284353e-05,
      "loss": 0.7492,
      "step": 5400
    },
    {
      "epoch": 0.6794664006981673,
      "grad_norm": 0.9756806492805481,
      "learning_rate": 1.8345916403852777e-05,
      "loss": 0.7426,
      "step": 5450
    },
    {
      "epoch": 0.6857000374018202,
      "grad_norm": 0.9142268896102905,
      "learning_rate": 1.8307865716420005e-05,
      "loss": 0.7405,
      "step": 5500
    },
    {
      "epoch": 0.6919336741054731,
      "grad_norm": 0.8344477415084839,
      "learning_rate": 1.8269422733729597e-05,
      "loss": 0.742,
      "step": 5550
    },
    {
      "epoch": 0.6981673108091261,
      "grad_norm": 0.8575064539909363,
      "learning_rate": 1.8230589271049196e-05,
      "loss": 0.7422,
      "step": 5600
    },
    {
      "epoch": 0.7044009475127789,
      "grad_norm": 0.8407536149024963,
      "learning_rate": 1.819136716208481e-05,
      "loss": 0.7391,
      "step": 5650
    },
    {
      "epoch": 0.7106345842164319,
      "grad_norm": 0.8509117364883423,
      "learning_rate": 1.815175825889421e-05,
      "loss": 0.7427,
      "step": 5700
    },
    {
      "epoch": 0.7168682209200847,
      "grad_norm": 0.8610983490943909,
      "learning_rate": 1.811176443179951e-05,
      "loss": 0.7285,
      "step": 5750
    },
    {
      "epoch": 0.7231018576237377,
      "grad_norm": 0.8700429201126099,
      "learning_rate": 1.807138756929881e-05,
      "loss": 0.7417,
      "step": 5800
    },
    {
      "epoch": 0.7293354943273906,
      "grad_norm": 0.8384543061256409,
      "learning_rate": 1.8030629577977064e-05,
      "loss": 0.7429,
      "step": 5850
    },
    {
      "epoch": 0.7355691310310435,
      "grad_norm": 0.8495470285415649,
      "learning_rate": 1.798949238241601e-05,
      "loss": 0.7362,
      "step": 5900
    },
    {
      "epoch": 0.7418027677346964,
      "grad_norm": 0.7969231605529785,
      "learning_rate": 1.7947977925103315e-05,
      "loss": 0.7426,
      "step": 5950
    },
    {
      "epoch": 0.7480364044383493,
      "grad_norm": 0.871778666973114,
      "learning_rate": 1.7906088166340864e-05,
      "loss": 0.7395,
      "step": 6000
    },
    {
      "epoch": 0.7480364044383493,
      "eval_loss": 0.734497606754303,
      "eval_runtime": 395.2496,
      "eval_samples_per_second": 46.383,
      "eval_steps_per_second": 5.799,
      "step": 6000
    },
    {
      "epoch": 0.7542700411420022,
      "grad_norm": 0.8313363194465637,
      "learning_rate": 1.786382508415216e-05,
      "loss": 0.734,
      "step": 6050
    },
    {
      "epoch": 0.7605036778456552,
      "grad_norm": 0.8330720663070679,
      "learning_rate": 1.7821190674188953e-05,
      "loss": 0.7373,
      "step": 6100
    },
    {
      "epoch": 0.766737314549308,
      "grad_norm": 0.8525868058204651,
      "learning_rate": 1.7778186949636983e-05,
      "loss": 0.7336,
      "step": 6150
    },
    {
      "epoch": 0.772970951252961,
      "grad_norm": 0.8216151595115662,
      "learning_rate": 1.7734815941120933e-05,
      "loss": 0.7377,
      "step": 6200
    },
    {
      "epoch": 0.7792045879566138,
      "grad_norm": 0.807019829750061,
      "learning_rate": 1.769107969660855e-05,
      "loss": 0.7395,
      "step": 6250
    },
    {
      "epoch": 0.7854382246602668,
      "grad_norm": 0.8149738907814026,
      "learning_rate": 1.7646980281313917e-05,
      "loss": 0.7312,
      "step": 6300
    },
    {
      "epoch": 0.7916718613639198,
      "grad_norm": 0.8304749131202698,
      "learning_rate": 1.760251977759995e-05,
      "loss": 0.724,
      "step": 6350
    },
    {
      "epoch": 0.7979054980675726,
      "grad_norm": 0.8745994567871094,
      "learning_rate": 1.7557700284880063e-05,
      "loss": 0.7377,
      "step": 6400
    },
    {
      "epoch": 0.8041391347712256,
      "grad_norm": 0.8300268054008484,
      "learning_rate": 1.751252391951905e-05,
      "loss": 0.7238,
      "step": 6450
    },
    {
      "epoch": 0.8103727714748784,
      "grad_norm": 0.8081981539726257,
      "learning_rate": 1.7466992814733123e-05,
      "loss": 0.7279,
      "step": 6500
    },
    {
      "epoch": 0.8166064081785314,
      "grad_norm": 0.8357431888580322,
      "learning_rate": 1.7421109120489206e-05,
      "loss": 0.7349,
      "step": 6550
    },
    {
      "epoch": 0.8228400448821843,
      "grad_norm": 0.8953850865364075,
      "learning_rate": 1.7374875003403402e-05,
      "loss": 0.7297,
      "step": 6600
    },
    {
      "epoch": 0.8290736815858372,
      "grad_norm": 0.8897569179534912,
      "learning_rate": 1.7328292646638694e-05,
      "loss": 0.7289,
      "step": 6650
    },
    {
      "epoch": 0.8353073182894901,
      "grad_norm": 0.8997496962547302,
      "learning_rate": 1.7281364249801846e-05,
      "loss": 0.7244,
      "step": 6700
    },
    {
      "epoch": 0.841540954993143,
      "grad_norm": 0.8270450830459595,
      "learning_rate": 1.723409202883955e-05,
      "loss": 0.7323,
      "step": 6750
    },
    {
      "epoch": 0.8477745916967959,
      "grad_norm": 0.8650174736976624,
      "learning_rate": 1.7186478215933776e-05,
      "loss": 0.7379,
      "step": 6800
    },
    {
      "epoch": 0.8540082284004489,
      "grad_norm": 0.9005187153816223,
      "learning_rate": 1.713852505939639e-05,
      "loss": 0.725,
      "step": 6850
    },
    {
      "epoch": 0.8602418651041017,
      "grad_norm": 0.8741387724876404,
      "learning_rate": 1.7090234823562956e-05,
      "loss": 0.7321,
      "step": 6900
    },
    {
      "epoch": 0.8664755018077547,
      "grad_norm": 0.8768566250801086,
      "learning_rate": 1.7041609788685853e-05,
      "loss": 0.7229,
      "step": 6950
    },
    {
      "epoch": 0.8727091385114075,
      "grad_norm": 0.8667632937431335,
      "learning_rate": 1.699265225082658e-05,
      "loss": 0.7321,
      "step": 7000
    },
    {
      "epoch": 0.8789427752150605,
      "grad_norm": 0.8344389796257019,
      "learning_rate": 1.694336452174733e-05,
      "loss": 0.7316,
      "step": 7050
    },
    {
      "epoch": 0.8851764119187133,
      "grad_norm": 0.9333544969558716,
      "learning_rate": 1.689374892880185e-05,
      "loss": 0.722,
      "step": 7100
    },
    {
      "epoch": 0.8914100486223663,
      "grad_norm": 0.8140573501586914,
      "learning_rate": 1.684380781482553e-05,
      "loss": 0.7212,
      "step": 7150
    },
    {
      "epoch": 0.8976436853260192,
      "grad_norm": 0.8397175073623657,
      "learning_rate": 1.679354353802478e-05,
      "loss": 0.7307,
      "step": 7200
    },
    {
      "epoch": 0.9038773220296721,
      "grad_norm": 0.8897206783294678,
      "learning_rate": 1.674295847186567e-05,
      "loss": 0.7298,
      "step": 7250
    },
    {
      "epoch": 0.910110958733325,
      "grad_norm": 0.8106831312179565,
      "learning_rate": 1.6692055004961867e-05,
      "loss": 0.7254,
      "step": 7300
    },
    {
      "epoch": 0.9163445954369779,
      "grad_norm": 0.8223755359649658,
      "learning_rate": 1.664083554096183e-05,
      "loss": 0.7263,
      "step": 7350
    },
    {
      "epoch": 0.9225782321406308,
      "grad_norm": 0.8110599517822266,
      "learning_rate": 1.6589302498435324e-05,
      "loss": 0.7185,
      "step": 7400
    },
    {
      "epoch": 0.9288118688442838,
      "grad_norm": 0.8102028965950012,
      "learning_rate": 1.6537458310759215e-05,
      "loss": 0.7269,
      "step": 7450
    },
    {
      "epoch": 0.9350455055479366,
      "grad_norm": 0.8312651515007019,
      "learning_rate": 1.648530542600255e-05,
      "loss": 0.7169,
      "step": 7500
    },
    {
      "epoch": 0.9412791422515896,
      "grad_norm": 0.8525806665420532,
      "learning_rate": 1.6432846306810982e-05,
      "loss": 0.7146,
      "step": 7550
    },
    {
      "epoch": 0.9475127789552424,
      "grad_norm": 0.8789056539535522,
      "learning_rate": 1.6380083430290467e-05,
      "loss": 0.7248,
      "step": 7600
    },
    {
      "epoch": 0.9537464156588954,
      "grad_norm": 0.9384961128234863,
      "learning_rate": 1.632701928789031e-05,
      "loss": 0.7229,
      "step": 7650
    },
    {
      "epoch": 0.9599800523625484,
      "grad_norm": 0.8627296090126038,
      "learning_rate": 1.627365638528551e-05,
      "loss": 0.7263,
      "step": 7700
    },
    {
      "epoch": 0.9662136890662012,
      "grad_norm": 0.8898370265960693,
      "learning_rate": 1.621999724225844e-05,
      "loss": 0.72,
      "step": 7750
    },
    {
      "epoch": 0.9724473257698542,
      "grad_norm": 0.8563375473022461,
      "learning_rate": 1.6166044392579877e-05,
      "loss": 0.7221,
      "step": 7800
    },
    {
      "epoch": 0.978680962473507,
      "grad_norm": 0.8451232314109802,
      "learning_rate": 1.6111800383889345e-05,
      "loss": 0.7197,
      "step": 7850
    },
    {
      "epoch": 0.98491459917716,
      "grad_norm": 0.8202937245368958,
      "learning_rate": 1.605726777757482e-05,
      "loss": 0.7255,
      "step": 7900
    },
    {
      "epoch": 0.9911482358808129,
      "grad_norm": 0.9220672845840454,
      "learning_rate": 1.6002449148651784e-05,
      "loss": 0.7317,
      "step": 7950
    },
    {
      "epoch": 0.9973818725844658,
      "grad_norm": 0.8331303000450134,
      "learning_rate": 1.5947347085641632e-05,
      "loss": 0.718,
      "step": 8000
    },
    {
      "epoch": 0.9973818725844658,
      "eval_loss": 0.7209351062774658,
      "eval_runtime": 394.7424,
      "eval_samples_per_second": 46.443,
      "eval_steps_per_second": 5.806,
      "step": 8000
    },
    {
      "epoch": 1.0036155092881187,
      "grad_norm": 0.7849367260932922,
      "learning_rate": 1.5891964190449447e-05,
      "loss": 0.7015,
      "step": 8050
    },
    {
      "epoch": 1.0098491459917716,
      "grad_norm": 0.8923312425613403,
      "learning_rate": 1.583630307824113e-05,
      "loss": 0.7171,
      "step": 8100
    },
    {
      "epoch": 1.0160827826954244,
      "grad_norm": 0.8501293063163757,
      "learning_rate": 1.5780366377319913e-05,
      "loss": 0.7235,
      "step": 8150
    },
    {
      "epoch": 1.0223164193990775,
      "grad_norm": 0.8093539476394653,
      "learning_rate": 1.572415672900226e-05,
      "loss": 0.714,
      "step": 8200
    },
    {
      "epoch": 1.0285500561027303,
      "grad_norm": 0.8588930368423462,
      "learning_rate": 1.5667676787493148e-05,
      "loss": 0.7228,
      "step": 8250
    },
    {
      "epoch": 1.0347836928063832,
      "grad_norm": 0.8440228700637817,
      "learning_rate": 1.5610929219760715e-05,
      "loss": 0.7166,
      "step": 8300
    },
    {
      "epoch": 1.0410173295100362,
      "grad_norm": 0.8874866366386414,
      "learning_rate": 1.5553916705410347e-05,
      "loss": 0.7164,
      "step": 8350
    },
    {
      "epoch": 1.047250966213689,
      "grad_norm": 0.8335061073303223,
      "learning_rate": 1.5496641936558135e-05,
      "loss": 0.7169,
      "step": 8400
    },
    {
      "epoch": 1.053484602917342,
      "grad_norm": 0.8623288869857788,
      "learning_rate": 1.543910761770377e-05,
      "loss": 0.7209,
      "step": 8450
    },
    {
      "epoch": 1.059718239620995,
      "grad_norm": 0.8319075107574463,
      "learning_rate": 1.5381316465602808e-05,
      "loss": 0.7205,
      "step": 8500
    },
    {
      "epoch": 1.0659518763246478,
      "grad_norm": 0.9222123622894287,
      "learning_rate": 1.532327120913843e-05,
      "loss": 0.7165,
      "step": 8550
    },
    {
      "epoch": 1.0721855130283007,
      "grad_norm": 0.8959569931030273,
      "learning_rate": 1.526497458919253e-05,
      "loss": 0.7034,
      "step": 8600
    },
    {
      "epoch": 1.0784191497319535,
      "grad_norm": 0.85212641954422,
      "learning_rate": 1.5206429358516341e-05,
      "loss": 0.7155,
      "step": 8650
    },
    {
      "epoch": 1.0846527864356066,
      "grad_norm": 0.8561433553695679,
      "learning_rate": 1.5147638281600423e-05,
      "loss": 0.7093,
      "step": 8700
    },
    {
      "epoch": 1.0908864231392594,
      "grad_norm": 0.8480707406997681,
      "learning_rate": 1.5088604134544135e-05,
      "loss": 0.7193,
      "step": 8750
    },
    {
      "epoch": 1.0971200598429123,
      "grad_norm": 0.8811634182929993,
      "learning_rate": 1.502932970492454e-05,
      "loss": 0.7265,
      "step": 8800
    },
    {
      "epoch": 1.1033536965465653,
      "grad_norm": 0.9020422697067261,
      "learning_rate": 1.4969817791664779e-05,
      "loss": 0.7013,
      "step": 8850
    },
    {
      "epoch": 1.1095873332502182,
      "grad_norm": 0.8984312415122986,
      "learning_rate": 1.4910071204901916e-05,
      "loss": 0.709,
      "step": 8900
    },
    {
      "epoch": 1.115820969953871,
      "grad_norm": 0.8867316842079163,
      "learning_rate": 1.4850092765854233e-05,
      "loss": 0.7116,
      "step": 8950
    },
    {
      "epoch": 1.122054606657524,
      "grad_norm": 0.8604278564453125,
      "learning_rate": 1.4789885306688019e-05,
      "loss": 0.7133,
      "step": 9000
    },
    {
      "epoch": 1.128288243361177,
      "grad_norm": 0.9262681603431702,
      "learning_rate": 1.4729451670383829e-05,
      "loss": 0.7137,
      "step": 9050
    },
    {
      "epoch": 1.1345218800648298,
      "grad_norm": 0.8793482184410095,
      "learning_rate": 1.4668794710602248e-05,
      "loss": 0.728,
      "step": 9100
    },
    {
      "epoch": 1.1407555167684826,
      "grad_norm": 0.8378130197525024,
      "learning_rate": 1.4607917291549131e-05,
      "loss": 0.7121,
      "step": 9150
    },
    {
      "epoch": 1.1469891534721357,
      "grad_norm": 0.8587197065353394,
      "learning_rate": 1.4546822287840372e-05,
      "loss": 0.7124,
      "step": 9200
    },
    {
      "epoch": 1.1532227901757885,
      "grad_norm": 0.8652411699295044,
      "learning_rate": 1.4485512584366146e-05,
      "loss": 0.7167,
      "step": 9250
    },
    {
      "epoch": 1.1594564268794414,
      "grad_norm": 0.8245055675506592,
      "learning_rate": 1.4423991076154704e-05,
      "loss": 0.6939,
      "step": 9300
    },
    {
      "epoch": 1.1656900635830945,
      "grad_norm": 0.8209188580513,
      "learning_rate": 1.436226066823566e-05,
      "loss": 0.7169,
      "step": 9350
    },
    {
      "epoch": 1.1719237002867473,
      "grad_norm": 0.8945075273513794,
      "learning_rate": 1.4300324275502806e-05,
      "loss": 0.7083,
      "step": 9400
    },
    {
      "epoch": 1.1781573369904002,
      "grad_norm": 0.912131667137146,
      "learning_rate": 1.4238184822576499e-05,
      "loss": 0.7082,
      "step": 9450
    },
    {
      "epoch": 1.184390973694053,
      "grad_norm": 0.8798801898956299,
      "learning_rate": 1.4175845243665536e-05,
      "loss": 0.7218,
      "step": 9500
    },
    {
      "epoch": 1.190624610397706,
      "grad_norm": 0.8814321160316467,
      "learning_rate": 1.4113308482428617e-05,
      "loss": 0.716,
      "step": 9550
    },
    {
      "epoch": 1.196858247101359,
      "grad_norm": 0.8622399568557739,
      "learning_rate": 1.4050577491835338e-05,
      "loss": 0.7161,
      "step": 9600
    },
    {
      "epoch": 1.2030918838050118,
      "grad_norm": 0.9244632124900818,
      "learning_rate": 1.3987655234026752e-05,
      "loss": 0.7104,
      "step": 9650
    },
    {
      "epoch": 1.2093255205086648,
      "grad_norm": 0.8619811534881592,
      "learning_rate": 1.39245446801755e-05,
      "loss": 0.7155,
      "step": 9700
    },
    {
      "epoch": 1.2155591572123177,
      "grad_norm": 0.8646683692932129,
      "learning_rate": 1.3861248810345516e-05,
      "loss": 0.717,
      "step": 9750
    },
    {
      "epoch": 1.2217927939159705,
      "grad_norm": 0.8994303941726685,
      "learning_rate": 1.3797770613351307e-05,
      "loss": 0.711,
      "step": 9800
    },
    {
      "epoch": 1.2280264306196236,
      "grad_norm": 0.9144839644432068,
      "learning_rate": 1.373411308661682e-05,
      "loss": 0.7146,
      "step": 9850
    },
    {
      "epoch": 1.2342600673232764,
      "grad_norm": 1.0024070739746094,
      "learning_rate": 1.3670279236033902e-05,
      "loss": 0.7162,
      "step": 9900
    },
    {
      "epoch": 1.2404937040269293,
      "grad_norm": 0.9051625728607178,
      "learning_rate": 1.3606272075820374e-05,
      "loss": 0.6966,
      "step": 9950
    },
    {
      "epoch": 1.2467273407305823,
      "grad_norm": 0.8814643621444702,
      "learning_rate": 1.3542094628377686e-05,
      "loss": 0.7101,
      "step": 10000
    },
    {
      "epoch": 1.2467273407305823,
      "eval_loss": 0.7119160890579224,
      "eval_runtime": 394.8999,
      "eval_samples_per_second": 46.424,
      "eval_steps_per_second": 5.804,
      "step": 10000
    },
    {
      "epoch": 1.2529609774342352,
      "grad_norm": 0.9712830781936646,
      "learning_rate": 1.3477749924148206e-05,
      "loss": 0.7203,
      "step": 10050
    },
    {
      "epoch": 1.259194614137888,
      "grad_norm": 0.9015008807182312,
      "learning_rate": 1.3413241001472132e-05,
      "loss": 0.7063,
      "step": 10100
    },
    {
      "epoch": 1.2654282508415409,
      "grad_norm": 0.9200523495674133,
      "learning_rate": 1.334857090644401e-05,
      "loss": 0.7117,
      "step": 10150
    },
    {
      "epoch": 1.271661887545194,
      "grad_norm": 0.8595969676971436,
      "learning_rate": 1.3283742692768892e-05,
      "loss": 0.7184,
      "step": 10200
    },
    {
      "epoch": 1.2778955242488468,
      "grad_norm": 0.8903511762619019,
      "learning_rate": 1.3218759421618172e-05,
      "loss": 0.7185,
      "step": 10250
    },
    {
      "epoch": 1.2841291609524996,
      "grad_norm": 0.8739769458770752,
      "learning_rate": 1.3153624161485001e-05,
      "loss": 0.7083,
      "step": 10300
    },
    {
      "epoch": 1.2903627976561527,
      "grad_norm": 0.914838969707489,
      "learning_rate": 1.308833998803942e-05,
      "loss": 0.7096,
      "step": 10350
    },
    {
      "epoch": 1.2965964343598055,
      "grad_norm": 0.8579884767532349,
      "learning_rate": 1.302290998398311e-05,
      "loss": 0.721,
      "step": 10400
    },
    {
      "epoch": 1.3028300710634584,
      "grad_norm": 0.9888195991516113,
      "learning_rate": 1.295733723890384e-05,
      "loss": 0.7067,
      "step": 10450
    },
    {
      "epoch": 1.3090637077671112,
      "grad_norm": 0.8959117531776428,
      "learning_rate": 1.2891624849129572e-05,
      "loss": 0.7031,
      "step": 10500
    },
    {
      "epoch": 1.3152973444707643,
      "grad_norm": 0.8969167470932007,
      "learning_rate": 1.2825775917582257e-05,
      "loss": 0.7032,
      "step": 10550
    },
    {
      "epoch": 1.3215309811744171,
      "grad_norm": 0.9104548692703247,
      "learning_rate": 1.2759793553631307e-05,
      "loss": 0.6994,
      "step": 10600
    },
    {
      "epoch": 1.32776461787807,
      "grad_norm": 0.9115015864372253,
      "learning_rate": 1.2693680872946787e-05,
      "loss": 0.7137,
      "step": 10650
    },
    {
      "epoch": 1.333998254581723,
      "grad_norm": 0.9241281747817993,
      "learning_rate": 1.2627440997352269e-05,
      "loss": 0.7084,
      "step": 10700
    },
    {
      "epoch": 1.340231891285376,
      "grad_norm": 0.8836124539375305,
      "learning_rate": 1.256107705467745e-05,
      "loss": 0.7128,
      "step": 10750
    },
    {
      "epoch": 1.3464655279890287,
      "grad_norm": 0.8973872661590576,
      "learning_rate": 1.2494592178610438e-05,
      "loss": 0.7073,
      "step": 10800
    },
    {
      "epoch": 1.3526991646926816,
      "grad_norm": 0.8749911189079285,
      "learning_rate": 1.2427989508549781e-05,
      "loss": 0.7138,
      "step": 10850
    },
    {
      "epoch": 1.3589328013963347,
      "grad_norm": 0.9338003993034363,
      "learning_rate": 1.236127218945623e-05,
      "loss": 0.704,
      "step": 10900
    },
    {
      "epoch": 1.3651664380999875,
      "grad_norm": 0.9447057247161865,
      "learning_rate": 1.2294443371704237e-05,
      "loss": 0.7155,
      "step": 10950
    },
    {
      "epoch": 1.3714000748036406,
      "grad_norm": 0.89363694190979,
      "learning_rate": 1.2227506210933187e-05,
      "loss": 0.7052,
      "step": 11000
    },
    {
      "epoch": 1.3776337115072934,
      "grad_norm": 0.9201143383979797,
      "learning_rate": 1.2160463867898398e-05,
      "loss": 0.7096,
      "step": 11050
    },
    {
      "epoch": 1.3838673482109463,
      "grad_norm": 0.887363076210022,
      "learning_rate": 1.2093319508321863e-05,
      "loss": 0.7081,
      "step": 11100
    },
    {
      "epoch": 1.390100984914599,
      "grad_norm": 0.9289067983627319,
      "learning_rate": 1.2026076302742778e-05,
      "loss": 0.7067,
      "step": 11150
    },
    {
      "epoch": 1.3963346216182522,
      "grad_norm": 0.8112872242927551,
      "learning_rate": 1.1958737426367806e-05,
      "loss": 0.7058,
      "step": 11200
    },
    {
      "epoch": 1.402568258321905,
      "grad_norm": 0.8492117524147034,
      "learning_rate": 1.1891306058921178e-05,
      "loss": 0.6959,
      "step": 11250
    },
    {
      "epoch": 1.4088018950255579,
      "grad_norm": 0.950752854347229,
      "learning_rate": 1.1823785384494515e-05,
      "loss": 0.7002,
      "step": 11300
    },
    {
      "epoch": 1.415035531729211,
      "grad_norm": 0.8966166973114014,
      "learning_rate": 1.1756178591396499e-05,
      "loss": 0.7114,
      "step": 11350
    },
    {
      "epoch": 1.4212691684328638,
      "grad_norm": 0.919644832611084,
      "learning_rate": 1.168848887200231e-05,
      "loss": 0.7084,
      "step": 11400
    },
    {
      "epoch": 1.4275028051365166,
      "grad_norm": 0.8702328205108643,
      "learning_rate": 1.162071942260289e-05,
      "loss": 0.7086,
      "step": 11450
    },
    {
      "epoch": 1.4337364418401695,
      "grad_norm": 1.0018388032913208,
      "learning_rate": 1.1552873443254002e-05,
      "loss": 0.7102,
      "step": 11500
    },
    {
      "epoch": 1.4399700785438225,
      "grad_norm": 0.9201192855834961,
      "learning_rate": 1.1484954137625141e-05,
      "loss": 0.7114,
      "step": 11550
    },
    {
      "epoch": 1.4462037152474754,
      "grad_norm": 0.8748006820678711,
      "learning_rate": 1.1416964712848248e-05,
      "loss": 0.7084,
      "step": 11600
    },
    {
      "epoch": 1.4524373519511282,
      "grad_norm": 0.9001298546791077,
      "learning_rate": 1.1348908379366275e-05,
      "loss": 0.6944,
      "step": 11650
    },
    {
      "epoch": 1.4586709886547813,
      "grad_norm": 0.8929358124732971,
      "learning_rate": 1.1280788350781583e-05,
      "loss": 0.7018,
      "step": 11700
    },
    {
      "epoch": 1.4649046253584341,
      "grad_norm": 0.9111042022705078,
      "learning_rate": 1.121260784370419e-05,
      "loss": 0.6976,
      "step": 11750
    },
    {
      "epoch": 1.471138262062087,
      "grad_norm": 0.8904392719268799,
      "learning_rate": 1.1144370077599908e-05,
      "loss": 0.7025,
      "step": 11800
    },
    {
      "epoch": 1.4773718987657398,
      "grad_norm": 0.9039836525917053,
      "learning_rate": 1.10760782746383e-05,
      "loss": 0.7026,
      "step": 11850
    },
    {
      "epoch": 1.4836055354693929,
      "grad_norm": 0.9207535982131958,
      "learning_rate": 1.1007735659540531e-05,
      "loss": 0.6987,
      "step": 11900
    },
    {
      "epoch": 1.4898391721730457,
      "grad_norm": 0.9142275452613831,
      "learning_rate": 1.0939345459427106e-05,
      "loss": 0.7043,
      "step": 11950
    },
    {
      "epoch": 1.4960728088766988,
      "grad_norm": 0.9600117802619934,
      "learning_rate": 1.0870910903665479e-05,
      "loss": 0.7101,
      "step": 12000
    },
    {
      "epoch": 1.4960728088766988,
      "eval_loss": 0.7051036953926086,
      "eval_runtime": 397.8248,
      "eval_samples_per_second": 46.083,
      "eval_steps_per_second": 5.761,
      "step": 12000
    },
    {
      "epoch": 1.5023064455803516,
      "grad_norm": 0.9714391827583313,
      "learning_rate": 1.080243522371757e-05,
      "loss": 0.7009,
      "step": 12050
    },
    {
      "epoch": 1.5085400822840045,
      "grad_norm": 0.8584455847740173,
      "learning_rate": 1.0733921652987172e-05,
      "loss": 0.7068,
      "step": 12100
    },
    {
      "epoch": 1.5147737189876573,
      "grad_norm": 0.9826545715332031,
      "learning_rate": 1.0665373426667264e-05,
      "loss": 0.7053,
      "step": 12150
    },
    {
      "epoch": 1.5210073556913102,
      "grad_norm": 0.9026105999946594,
      "learning_rate": 1.0596793781587264e-05,
      "loss": 0.6967,
      "step": 12200
    },
    {
      "epoch": 1.5272409923949632,
      "grad_norm": 0.9080072045326233,
      "learning_rate": 1.0528185956060173e-05,
      "loss": 0.702,
      "step": 12250
    },
    {
      "epoch": 1.533474629098616,
      "grad_norm": 0.9512772560119629,
      "learning_rate": 1.045955318972965e-05,
      "loss": 0.7095,
      "step": 12300
    },
    {
      "epoch": 1.5397082658022692,
      "grad_norm": 0.8855695128440857,
      "learning_rate": 1.0390898723417073e-05,
      "loss": 0.7066,
      "step": 12350
    },
    {
      "epoch": 1.545941902505922,
      "grad_norm": 0.888293445110321,
      "learning_rate": 1.0322225798968482e-05,
      "loss": 0.7025,
      "step": 12400
    },
    {
      "epoch": 1.5521755392095749,
      "grad_norm": 0.8677939772605896,
      "learning_rate": 1.0253537659101495e-05,
      "loss": 0.6948,
      "step": 12450
    },
    {
      "epoch": 1.5584091759132277,
      "grad_norm": 0.9333109259605408,
      "learning_rate": 1.0184837547252213e-05,
      "loss": 0.695,
      "step": 12500
    },
    {
      "epoch": 1.5646428126168805,
      "grad_norm": 0.9408410787582397,
      "learning_rate": 1.0116128707422039e-05,
      "loss": 0.6931,
      "step": 12550
    },
    {
      "epoch": 1.5708764493205336,
      "grad_norm": 0.8665565252304077,
      "learning_rate": 1.0047414384024513e-05,
      "loss": 0.6983,
      "step": 12600
    },
    {
      "epoch": 1.5771100860241867,
      "grad_norm": 0.9101490378379822,
      "learning_rate": 9.978697821732109e-06,
      "loss": 0.7017,
      "step": 12650
    },
    {
      "epoch": 1.5833437227278395,
      "grad_norm": 0.8597551584243774,
      "learning_rate": 9.90998226532302e-06,
      "loss": 0.7046,
      "step": 12700
    },
    {
      "epoch": 1.5895773594314924,
      "grad_norm": 0.9050746560096741,
      "learning_rate": 9.841270959527945e-06,
      "loss": 0.6966,
      "step": 12750
    },
    {
      "epoch": 1.5958109961351452,
      "grad_norm": 0.8811514973640442,
      "learning_rate": 9.772567148876859e-06,
      "loss": 0.6958,
      "step": 12800
    },
    {
      "epoch": 1.602044632838798,
      "grad_norm": 0.9430891275405884,
      "learning_rate": 9.703874077545837e-06,
      "loss": 0.7075,
      "step": 12850
    },
    {
      "epoch": 1.608278269542451,
      "grad_norm": 0.8874529600143433,
      "learning_rate": 9.635194989203822e-06,
      "loss": 0.6954,
      "step": 12900
    },
    {
      "epoch": 1.614511906246104,
      "grad_norm": 0.901620626449585,
      "learning_rate": 9.566533126859509e-06,
      "loss": 0.7007,
      "step": 12950
    },
    {
      "epoch": 1.620745542949757,
      "grad_norm": 0.9101523756980896,
      "learning_rate": 9.497891732708168e-06,
      "loss": 0.7034,
      "step": 13000
    },
    {
      "epoch": 1.6269791796534099,
      "grad_norm": 0.9502910375595093,
      "learning_rate": 9.429274047978574e-06,
      "loss": 0.7114,
      "step": 13050
    },
    {
      "epoch": 1.6332128163570627,
      "grad_norm": 0.9115251898765564,
      "learning_rate": 9.360683312779942e-06,
      "loss": 0.7084,
      "step": 13100
    },
    {
      "epoch": 1.6394464530607156,
      "grad_norm": 0.8814319968223572,
      "learning_rate": 9.29212276594895e-06,
      "loss": 0.7053,
      "step": 13150
    },
    {
      "epoch": 1.6456800897643684,
      "grad_norm": 1.040366768836975,
      "learning_rate": 9.223595644896773e-06,
      "loss": 0.7053,
      "step": 13200
    },
    {
      "epoch": 1.6519137264680215,
      "grad_norm": 0.9586474895477295,
      "learning_rate": 9.15510518545625e-06,
      "loss": 0.6978,
      "step": 13250
    },
    {
      "epoch": 1.6581473631716743,
      "grad_norm": 1.001570701599121,
      "learning_rate": 9.086654621729046e-06,
      "loss": 0.7041,
      "step": 13300
    },
    {
      "epoch": 1.6643809998753274,
      "grad_norm": 0.9462688565254211,
      "learning_rate": 9.018247185932973e-06,
      "loss": 0.7031,
      "step": 13350
    },
    {
      "epoch": 1.6706146365789802,
      "grad_norm": 0.9172865152359009,
      "learning_rate": 8.949886108249358e-06,
      "loss": 0.6965,
      "step": 13400
    },
    {
      "epoch": 1.676848273282633,
      "grad_norm": 0.9603027105331421,
      "learning_rate": 8.881574616670493e-06,
      "loss": 0.7033,
      "step": 13450
    },
    {
      "epoch": 1.683081909986286,
      "grad_norm": 0.9165986180305481,
      "learning_rate": 8.813315936847247e-06,
      "loss": 0.6993,
      "step": 13500
    },
    {
      "epoch": 1.6893155466899388,
      "grad_norm": 0.9557304978370667,
      "learning_rate": 8.745113291936718e-06,
      "loss": 0.6962,
      "step": 13550
    },
    {
      "epoch": 1.6955491833935918,
      "grad_norm": 1.037959337234497,
      "learning_rate": 8.676969902450054e-06,
      "loss": 0.7046,
      "step": 13600
    },
    {
      "epoch": 1.7017828200972447,
      "grad_norm": 0.8420128226280212,
      "learning_rate": 8.608888986100374e-06,
      "loss": 0.6966,
      "step": 13650
    },
    {
      "epoch": 1.7080164568008978,
      "grad_norm": 0.8743023872375488,
      "learning_rate": 8.540873757650845e-06,
      "loss": 0.7024,
      "step": 13700
    },
    {
      "epoch": 1.7142500935045506,
      "grad_norm": 0.9249597191810608,
      "learning_rate": 8.472927428762845e-06,
      "loss": 0.7025,
      "step": 13750
    },
    {
      "epoch": 1.7204837302082034,
      "grad_norm": 0.8922662734985352,
      "learning_rate": 8.405053207844358e-06,
      "loss": 0.7033,
      "step": 13800
    },
    {
      "epoch": 1.7267173669118563,
      "grad_norm": 0.8870722055435181,
      "learning_rate": 8.337254299898432e-06,
      "loss": 0.7061,
      "step": 13850
    },
    {
      "epoch": 1.7329510036155091,
      "grad_norm": 1.065221905708313,
      "learning_rate": 8.269533906371862e-06,
      "loss": 0.7101,
      "step": 13900
    },
    {
      "epoch": 1.7391846403191622,
      "grad_norm": 0.9949721693992615,
      "learning_rate": 8.201895225004004e-06,
      "loss": 0.6916,
      "step": 13950
    },
    {
      "epoch": 1.7454182770228153,
      "grad_norm": 0.9127896428108215,
      "learning_rate": 8.134341449675802e-06,
      "loss": 0.6904,
      "step": 14000
    },
    {
      "epoch": 1.7454182770228153,
      "eval_loss": 0.7001627087593079,
      "eval_runtime": 393.8486,
      "eval_samples_per_second": 46.548,
      "eval_steps_per_second": 5.819,
      "step": 14000
    },
    {
      "epoch": 1.7516519137264681,
      "grad_norm": 0.9275141358375549,
      "learning_rate": 8.066875770258952e-06,
      "loss": 0.6886,
      "step": 14050
    },
    {
      "epoch": 1.757885550430121,
      "grad_norm": 0.9855388402938843,
      "learning_rate": 7.99950137246528e-06,
      "loss": 0.6997,
      "step": 14100
    },
    {
      "epoch": 1.7641191871337738,
      "grad_norm": 0.9497159719467163,
      "learning_rate": 7.932221437696324e-06,
      "loss": 0.7009,
      "step": 14150
    },
    {
      "epoch": 1.7703528238374266,
      "grad_norm": 0.9329772591590881,
      "learning_rate": 7.865039142893091e-06,
      "loss": 0.6912,
      "step": 14200
    },
    {
      "epoch": 1.7765864605410797,
      "grad_norm": 0.9342162609100342,
      "learning_rate": 7.797957660386072e-06,
      "loss": 0.6932,
      "step": 14250
    },
    {
      "epoch": 1.7828200972447326,
      "grad_norm": 0.9604317545890808,
      "learning_rate": 7.730980157745414e-06,
      "loss": 0.6971,
      "step": 14300
    },
    {
      "epoch": 1.7890537339483856,
      "grad_norm": 0.9466226696968079,
      "learning_rate": 7.664109797631365e-06,
      "loss": 0.703,
      "step": 14350
    },
    {
      "epoch": 1.7952873706520385,
      "grad_norm": 0.9420017600059509,
      "learning_rate": 7.5973497376449304e-06,
      "loss": 0.6991,
      "step": 14400
    },
    {
      "epoch": 1.8015210073556913,
      "grad_norm": 0.9643932580947876,
      "learning_rate": 7.530703130178781e-06,
      "loss": 0.6909,
      "step": 14450
    },
    {
      "epoch": 1.8077546440593442,
      "grad_norm": 0.9386921525001526,
      "learning_rate": 7.46417312226837e-06,
      "loss": 0.7007,
      "step": 14500
    },
    {
      "epoch": 1.813988280762997,
      "grad_norm": 0.9069890975952148,
      "learning_rate": 7.397762855443374e-06,
      "loss": 0.6918,
      "step": 14550
    },
    {
      "epoch": 1.82022191746665,
      "grad_norm": 0.9002771973609924,
      "learning_rate": 7.331475465579303e-06,
      "loss": 0.6975,
      "step": 14600
    },
    {
      "epoch": 1.826455554170303,
      "grad_norm": 0.9573726058006287,
      "learning_rate": 7.265314082749471e-06,
      "loss": 0.6967,
      "step": 14650
    },
    {
      "epoch": 1.832689190873956,
      "grad_norm": 0.9355108141899109,
      "learning_rate": 7.199281831077148e-06,
      "loss": 0.6997,
      "step": 14700
    },
    {
      "epoch": 1.8389228275776088,
      "grad_norm": 0.9691817164421082,
      "learning_rate": 7.133381828588088e-06,
      "loss": 0.7015,
      "step": 14750
    },
    {
      "epoch": 1.8451564642812617,
      "grad_norm": 1.0340629816055298,
      "learning_rate": 7.0676171870632646e-06,
      "loss": 0.6968,
      "step": 14800
    },
    {
      "epoch": 1.8513901009849145,
      "grad_norm": 0.9037183523178101,
      "learning_rate": 7.001991011891936e-06,
      "loss": 0.6931,
      "step": 14850
    },
    {
      "epoch": 1.8576237376885674,
      "grad_norm": 0.9368975162506104,
      "learning_rate": 6.93650640192502e-06,
      "loss": 0.6992,
      "step": 14900
    },
    {
      "epoch": 1.8638573743922204,
      "grad_norm": 0.8793601393699646,
      "learning_rate": 6.871166449328759e-06,
      "loss": 0.6995,
      "step": 14950
    },
    {
      "epoch": 1.8700910110958733,
      "grad_norm": 0.9233574867248535,
      "learning_rate": 6.8059742394387215e-06,
      "loss": 0.6849,
      "step": 15000
    },
    {
      "epoch": 1.8763246477995263,
      "grad_norm": 0.9490443468093872,
      "learning_rate": 6.74093285061409e-06,
      "loss": 0.6946,
      "step": 15050
    },
    {
      "epoch": 1.8825582845031792,
      "grad_norm": 0.8939170837402344,
      "learning_rate": 6.67604535409233e-06,
      "loss": 0.6978,
      "step": 15100
    },
    {
      "epoch": 1.888791921206832,
      "grad_norm": 0.9986585974693298,
      "learning_rate": 6.611314813844139e-06,
      "loss": 0.6988,
      "step": 15150
    },
    {
      "epoch": 1.8950255579104849,
      "grad_norm": 0.9630236625671387,
      "learning_rate": 6.54674428642879e-06,
      "loss": 0.6948,
      "step": 15200
    },
    {
      "epoch": 1.9012591946141377,
      "grad_norm": 1.1094101667404175,
      "learning_rate": 6.482336820849784e-06,
      "loss": 0.7034,
      "step": 15250
    },
    {
      "epoch": 1.9074928313177908,
      "grad_norm": 0.960928738117218,
      "learning_rate": 6.418095458410894e-06,
      "loss": 0.7087,
      "step": 15300
    },
    {
      "epoch": 1.9137264680214439,
      "grad_norm": 0.9537627696990967,
      "learning_rate": 6.3540232325725325e-06,
      "loss": 0.7051,
      "step": 15350
    },
    {
      "epoch": 1.9199601047250967,
      "grad_norm": 0.9109727740287781,
      "learning_rate": 6.2901231688085416e-06,
      "loss": 0.6952,
      "step": 15400
    },
    {
      "epoch": 1.9261937414287496,
      "grad_norm": 0.9576653838157654,
      "learning_rate": 6.226398284463306e-06,
      "loss": 0.7021,
      "step": 15450
    },
    {
      "epoch": 1.9324273781324024,
      "grad_norm": 0.8972005248069763,
      "learning_rate": 6.16285158860928e-06,
      "loss": 0.7002,
      "step": 15500
    },
    {
      "epoch": 1.9386610148360552,
      "grad_norm": 0.9278882741928101,
      "learning_rate": 6.099486081904914e-06,
      "loss": 0.701,
      "step": 15550
    },
    {
      "epoch": 1.9448946515397083,
      "grad_norm": 0.9672142863273621,
      "learning_rate": 6.036304756452942e-06,
      "loss": 0.6908,
      "step": 15600
    },
    {
      "epoch": 1.9511282882433612,
      "grad_norm": 0.9314201474189758,
      "learning_rate": 5.973310595659123e-06,
      "loss": 0.6922,
      "step": 15650
    },
    {
      "epoch": 1.9573619249470142,
      "grad_norm": 0.934550404548645,
      "learning_rate": 5.91050657409133e-06,
      "loss": 0.6913,
      "step": 15700
    },
    {
      "epoch": 1.963595561650667,
      "grad_norm": 0.9368258714675903,
      "learning_rate": 5.847895657339131e-06,
      "loss": 0.6983,
      "step": 15750
    },
    {
      "epoch": 1.96982919835432,
      "grad_norm": 0.9582532048225403,
      "learning_rate": 5.785480801873717e-06,
      "loss": 0.6882,
      "step": 15800
    },
    {
      "epoch": 1.9760628350579728,
      "grad_norm": 0.9585422277450562,
      "learning_rate": 5.723264954908329e-06,
      "loss": 0.7017,
      "step": 15850
    },
    {
      "epoch": 1.9822964717616256,
      "grad_norm": 0.9137251973152161,
      "learning_rate": 5.661251054259072e-06,
      "loss": 0.6932,
      "step": 15900
    },
    {
      "epoch": 1.9885301084652787,
      "grad_norm": 0.8928583860397339,
      "learning_rate": 5.5994420282062e-06,
      "loss": 0.6965,
      "step": 15950
    },
    {
      "epoch": 1.9947637451689315,
      "grad_norm": 0.948628306388855,
      "learning_rate": 5.537840795355844e-06,
      "loss": 0.6999,
      "step": 16000
    },
    {
      "epoch": 1.9947637451689315,
      "eval_loss": 0.6964989304542542,
      "eval_runtime": 393.8067,
      "eval_samples_per_second": 46.553,
      "eval_steps_per_second": 5.82,
      "step": 16000
    },
    {
      "epoch": 2.0009973818725846,
      "grad_norm": 0.9089522361755371,
      "learning_rate": 5.47645026450219e-06,
      "loss": 0.6856,
      "step": 16050
    },
    {
      "epoch": 2.0072310185762374,
      "grad_norm": 0.9454661011695862,
      "learning_rate": 5.4152733344901344e-06,
      "loss": 0.6916,
      "step": 16100
    },
    {
      "epoch": 2.0134646552798903,
      "grad_norm": 0.9901193976402283,
      "learning_rate": 5.354312894078395e-06,
      "loss": 0.6882,
      "step": 16150
    },
    {
      "epoch": 2.019698291983543,
      "grad_norm": 0.9769952893257141,
      "learning_rate": 5.293571821803107e-06,
      "loss": 0.6912,
      "step": 16200
    },
    {
      "epoch": 2.025931928687196,
      "grad_norm": 0.9182629585266113,
      "learning_rate": 5.2330529858419e-06,
      "loss": 0.6813,
      "step": 16250
    },
    {
      "epoch": 2.032165565390849,
      "grad_norm": 0.9791790843009949,
      "learning_rate": 5.172759243878465e-06,
      "loss": 0.6999,
      "step": 16300
    },
    {
      "epoch": 2.038399202094502,
| "grad_norm": 0.9678331613540649, | |
| "learning_rate": 5.112693442967606e-06, | |
| "loss": 0.6953, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.044632838798155, | |
| "grad_norm": 0.9587630033493042, | |
| "learning_rate": 5.052858419400823e-06, | |
| "loss": 0.6871, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.050866475501808, | |
| "grad_norm": 0.9735134840011597, | |
| "learning_rate": 4.993256998572349e-06, | |
| "loss": 0.6953, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.0571001122054606, | |
| "grad_norm": 0.9480214715003967, | |
| "learning_rate": 4.933891994845781e-06, | |
| "loss": 0.6857, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.0633337489091135, | |
| "grad_norm": 0.9086151719093323, | |
| "learning_rate": 4.874766211421137e-06, | |
| "loss": 0.6932, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 2.0695673856127663, | |
| "grad_norm": 1.0070750713348389, | |
| "learning_rate": 4.815882440202541e-06, | |
| "loss": 0.6818, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.0758010223164196, | |
| "grad_norm": 1.0480530261993408, | |
| "learning_rate": 4.757243461666341e-06, | |
| "loss": 0.6939, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 2.0820346590200725, | |
| "grad_norm": 1.0010734796524048, | |
| "learning_rate": 4.698852044729848e-06, | |
| "loss": 0.6951, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.0882682957237253, | |
| "grad_norm": 0.9465258121490479, | |
| "learning_rate": 4.640710946620579e-06, | |
| "loss": 0.6864, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.094501932427378, | |
| "grad_norm": 1.030072569847107, | |
| "learning_rate": 4.58282291274606e-06, | |
| "loss": 0.6913, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.100735569131031, | |
| "grad_norm": 0.9375346302986145, | |
| "learning_rate": 4.525190676564189e-06, | |
| "loss": 0.6995, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 2.106969205834684, | |
| "grad_norm": 1.0840826034545898, | |
| "learning_rate": 4.467816959454166e-06, | |
| "loss": 0.6887, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.1132028425383367, | |
| "grad_norm": 0.9471514821052551, | |
| "learning_rate": 4.4107044705879835e-06, | |
| "loss": 0.6955, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 2.11943647924199, | |
| "grad_norm": 1.0126205682754517, | |
| "learning_rate": 4.353855906802508e-06, | |
| "loss": 0.6953, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.125670115945643, | |
| "grad_norm": 0.9667809009552002, | |
| "learning_rate": 4.297273952472128e-06, | |
| "loss": 0.6897, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 2.1319037526492957, | |
| "grad_norm": 1.2645550966262817, | |
| "learning_rate": 4.24096127938201e-06, | |
| "loss": 0.6981, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.1381373893529485, | |
| "grad_norm": 0.9564869403839111, | |
| "learning_rate": 4.184920546601927e-06, | |
| "loss": 0.6996, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 2.1443710260566013, | |
| "grad_norm": 0.9815326929092407, | |
| "learning_rate": 4.129154400360691e-06, | |
| "loss": 0.6933, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.150604662760254, | |
| "grad_norm": 1.01906156539917, | |
| "learning_rate": 4.073665473921232e-06, | |
| "loss": 0.6792, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 2.156838299463907, | |
| "grad_norm": 0.974911630153656, | |
| "learning_rate": 4.018456387456207e-06, | |
| "loss": 0.6821, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.1630719361675603, | |
| "grad_norm": 0.922038733959198, | |
| "learning_rate": 3.963529747924326e-06, | |
| "loss": 0.688, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 2.169305572871213, | |
| "grad_norm": 0.9135300517082214, | |
| "learning_rate": 3.90888814894721e-06, | |
| "loss": 0.6781, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.175539209574866, | |
| "grad_norm": 0.9557603597640991, | |
| "learning_rate": 3.854534170686943e-06, | |
| "loss": 0.6901, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 2.181772846278519, | |
| "grad_norm": 0.9457572102546692, | |
| "learning_rate": 3.8004703797242514e-06, | |
| "loss": 0.6944, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.1880064829821717, | |
| "grad_norm": 0.9799226522445679, | |
| "learning_rate": 3.746699328937261e-06, | |
| "loss": 0.6907, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 2.1942401196858246, | |
| "grad_norm": 0.9512478113174438, | |
| "learning_rate": 3.693223557381016e-06, | |
| "loss": 0.6865, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.200473756389478, | |
| "grad_norm": 0.9199214577674866, | |
| "learning_rate": 3.6400455901675248e-06, | |
| "loss": 0.6938, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 2.2067073930931307, | |
| "grad_norm": 0.9223390221595764, | |
| "learning_rate": 3.5871679383465687e-06, | |
| "loss": 0.6858, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.2129410297967835, | |
| "grad_norm": 0.9911532998085022, | |
| "learning_rate": 3.534593098787107e-06, | |
| "loss": 0.6903, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 2.2191746665004364, | |
| "grad_norm": 0.9283427000045776, | |
| "learning_rate": 3.4823235540593857e-06, | |
| "loss": 0.6956, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.2254083032040892, | |
| "grad_norm": 0.955929160118103, | |
| "learning_rate": 3.4303617723177085e-06, | |
| "loss": 0.7071, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 2.231641939907742, | |
| "grad_norm": 0.9738143682479858, | |
| "learning_rate": 3.3787102071838907e-06, | |
| "loss": 0.6915, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.237875576611395, | |
| "grad_norm": 0.9453316330909729, | |
| "learning_rate": 3.3273712976313966e-06, | |
| "loss": 0.6939, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 2.244109213315048, | |
| "grad_norm": 1.0022181272506714, | |
| "learning_rate": 3.2763474678701847e-06, | |
| "loss": 0.6966, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.244109213315048, | |
| "eval_loss": 0.6944875121116638, | |
| "eval_runtime": 394.539, | |
| "eval_samples_per_second": 46.467, | |
| "eval_steps_per_second": 5.809, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.250342850018701, | |
| "grad_norm": 0.9608253240585327, | |
| "learning_rate": 3.2256411272322097e-06, | |
| "loss": 0.6883, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 2.256576486722354, | |
| "grad_norm": 0.9454545378684998, | |
| "learning_rate": 3.175254670057698e-06, | |
| "loss": 0.6957, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.2628101234260067, | |
| "grad_norm": 0.9946788549423218, | |
| "learning_rate": 3.125190475582034e-06, | |
| "loss": 0.6908, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 2.2690437601296596, | |
| "grad_norm": 0.9576852321624756, | |
| "learning_rate": 3.0754509078234663e-06, | |
| "loss": 0.6916, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.2752773968333124, | |
| "grad_norm": 1.0309839248657227, | |
| "learning_rate": 3.0260383154714425e-06, | |
| "loss": 0.6897, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 2.2815110335369653, | |
| "grad_norm": 1.0026777982711792, | |
| "learning_rate": 2.9769550317757078e-06, | |
| "loss": 0.6949, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.2877446702406186, | |
| "grad_norm": 0.9826598763465881, | |
| "learning_rate": 2.9282033744361613e-06, | |
| "loss": 0.6907, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 2.2939783069442714, | |
| "grad_norm": 0.9755291938781738, | |
| "learning_rate": 2.8797856454933694e-06, | |
| "loss": 0.6918, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.3002119436479243, | |
| "grad_norm": 1.0012787580490112, | |
| "learning_rate": 2.831704131219899e-06, | |
| "loss": 0.689, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 2.306445580351577, | |
| "grad_norm": 0.9813947677612305, | |
| "learning_rate": 2.7839611020123447e-06, | |
| "loss": 0.6928, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.31267921705523, | |
| "grad_norm": 0.9425486922264099, | |
| "learning_rate": 2.7365588122841227e-06, | |
| "loss": 0.687, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.318912853758883, | |
| "grad_norm": 0.9425567388534546, | |
| "learning_rate": 2.689499500359022e-06, | |
| "loss": 0.6898, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.3251464904625356, | |
| "grad_norm": 0.9537495374679565, | |
| "learning_rate": 2.6427853883655085e-06, | |
| "loss": 0.6867, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.331380127166189, | |
| "grad_norm": 1.11262047290802, | |
| "learning_rate": 2.5964186821317963e-06, | |
| "loss": 0.6947, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.3376137638698418, | |
| "grad_norm": 0.933858335018158, | |
| "learning_rate": 2.550401571081692e-06, | |
| "loss": 0.6874, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.3438474005734946, | |
| "grad_norm": 0.9650511741638184, | |
| "learning_rate": 2.5047362281312004e-06, | |
| "loss": 0.6865, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.3500810372771475, | |
| "grad_norm": 1.0251282453536987, | |
| "learning_rate": 2.459424809585943e-06, | |
| "loss": 0.6957, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.3563146739808003, | |
| "grad_norm": 0.985219419002533, | |
| "learning_rate": 2.41446945503931e-06, | |
| "loss": 0.6807, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.362548310684453, | |
| "grad_norm": 1.0091772079467773, | |
| "learning_rate": 2.3698722872714486e-06, | |
| "loss": 0.7004, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.368781947388106, | |
| "grad_norm": 0.9813216924667358, | |
| "learning_rate": 2.3256354121490197e-06, | |
| "loss": 0.693, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.3750155840917593, | |
| "grad_norm": 0.9824263453483582, | |
| "learning_rate": 2.2817609185257493e-06, | |
| "loss": 0.6828, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.381249220795412, | |
| "grad_norm": 0.9482113718986511, | |
| "learning_rate": 2.2382508781438217e-06, | |
| "loss": 0.6913, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.387482857499065, | |
| "grad_norm": 0.9164847135543823, | |
| "learning_rate": 2.195107345536013e-06, | |
| "loss": 0.6915, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.393716494202718, | |
| "grad_norm": 1.002191424369812, | |
| "learning_rate": 2.152332357928719e-06, | |
| "loss": 0.677, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.3999501309063707, | |
| "grad_norm": 0.9387649297714233, | |
| "learning_rate": 2.109927935145718e-06, | |
| "loss": 0.6894, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.4061837676100235, | |
| "grad_norm": 1.0270154476165771, | |
| "learning_rate": 2.0678960795128234e-06, | |
| "loss": 0.6835, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.4124174043136764, | |
| "grad_norm": 0.9585641622543335, | |
| "learning_rate": 2.026238775763322e-06, | |
| "loss": 0.6912, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.4186510410173296, | |
| "grad_norm": 0.9344882369041443, | |
| "learning_rate": 1.9849579909442595e-06, | |
| "loss": 0.6923, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.4248846777209825, | |
| "grad_norm": 1.0244457721710205, | |
| "learning_rate": 1.944055674323554e-06, | |
| "loss": 0.6894, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.4311183144246353, | |
| "grad_norm": 0.9319279789924622, | |
| "learning_rate": 1.9035337572979561e-06, | |
| "loss": 0.6931, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.437351951128288, | |
| "grad_norm": 0.9782323837280273, | |
| "learning_rate": 1.8633941533018428e-06, | |
| "loss": 0.6933, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.443585587831941, | |
| "grad_norm": 1.0308918952941895, | |
| "learning_rate": 1.8236387577168735e-06, | |
| "loss": 0.6869, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.4498192245355943, | |
| "grad_norm": 1.0138496160507202, | |
| "learning_rate": 1.784269447782484e-06, | |
| "loss": 0.6906, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.456052861239247, | |
| "grad_norm": 1.0050208568572998, | |
| "learning_rate": 1.7452880825072448e-06, | |
| "loss": 0.678, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.4622864979429, | |
| "grad_norm": 0.9685599207878113, | |
| "learning_rate": 1.7066965025810844e-06, | |
| "loss": 0.6904, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.468520134646553, | |
| "grad_norm": 1.0548591613769531, | |
| "learning_rate": 1.668496530288366e-06, | |
| "loss": 0.6891, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.4747537713502057, | |
| "grad_norm": 1.0550183057785034, | |
| "learning_rate": 1.6306899694218436e-06, | |
| "loss": 0.6877, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.4809874080538585, | |
| "grad_norm": 0.9863895773887634, | |
| "learning_rate": 1.5932786051974792e-06, | |
| "loss": 0.696, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.4872210447575114, | |
| "grad_norm": 0.9628305435180664, | |
| "learning_rate": 1.556264204170167e-06, | |
| "loss": 0.6889, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.4934546814611647, | |
| "grad_norm": 0.9757224917411804, | |
| "learning_rate": 1.519648514150286e-06, | |
| "loss": 0.6916, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.4934546814611647, | |
| "eval_loss": 0.6928977966308594, | |
| "eval_runtime": 394.4481, | |
| "eval_samples_per_second": 46.478, | |
| "eval_steps_per_second": 5.811, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.4996883181648175, | |
| "grad_norm": 1.0164076089859009, | |
| "learning_rate": 1.4834332641211956e-06, | |
| "loss": 0.6911, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.5059219548684704, | |
| "grad_norm": 1.003832459449768, | |
| "learning_rate": 1.4476201641575793e-06, | |
| "loss": 0.6913, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.512155591572123, | |
| "grad_norm": 1.008109450340271, | |
| "learning_rate": 1.4122109053446997e-06, | |
| "loss": 0.7011, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.518389228275776, | |
| "grad_norm": 1.0127811431884766, | |
| "learning_rate": 1.3772071596985448e-06, | |
| "loss": 0.6874, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.524622864979429, | |
| "grad_norm": 1.0523449182510376, | |
| "learning_rate": 1.3426105800868782e-06, | |
| "loss": 0.687, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.5308565016830817, | |
| "grad_norm": 1.013615608215332, | |
| "learning_rate": 1.3084228001511867e-06, | |
| "loss": 0.6893, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.537090138386735, | |
| "grad_norm": 0.9353114366531372, | |
| "learning_rate": 1.2746454342295456e-06, | |
| "loss": 0.6872, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.543323775090388, | |
| "grad_norm": 0.9867616295814514, | |
| "learning_rate": 1.2412800772803846e-06, | |
| "loss": 0.6916, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.5495574117940407, | |
| "grad_norm": 0.9996484518051147, | |
| "learning_rate": 1.208328304807178e-06, | |
| "loss": 0.6834, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.5557910484976936, | |
| "grad_norm": 0.9408335089683533, | |
| "learning_rate": 1.1757916727840502e-06, | |
| "loss": 0.6927, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.5620246852013464, | |
| "grad_norm": 1.0010491609573364, | |
| "learning_rate": 1.1436717175822976e-06, | |
| "loss": 0.6904, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.5682583219049993, | |
| "grad_norm": 0.9572605490684509, | |
| "learning_rate": 1.1119699558978525e-06, | |
| "loss": 0.6841, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.574491958608652, | |
| "grad_norm": 0.9748229384422302, | |
| "learning_rate": 1.0806878846796454e-06, | |
| "loss": 0.6944, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.5807255953123054, | |
| "grad_norm": 1.03251314163208, | |
| "learning_rate": 1.0498269810589501e-06, | |
| "loss": 0.6909, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.5869592320159582, | |
| "grad_norm": 0.9722402691841125, | |
| "learning_rate": 1.019388702279599e-06, | |
| "loss": 0.6983, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.593192868719611, | |
| "grad_norm": 1.0411121845245361, | |
| "learning_rate": 9.89374485629202e-07, | |
| "loss": 0.682, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.599426505423264, | |
| "grad_norm": 0.9544338583946228, | |
| "learning_rate": 9.59785748371257e-07, | |
| "loss": 0.6927, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.6056601421269168, | |
| "grad_norm": 1.1393173933029175, | |
| "learning_rate": 9.306238876782381e-07, | |
| "loss": 0.693, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.6118937788305696, | |
| "grad_norm": 0.9999702572822571, | |
| "learning_rate": 9.018902805656249e-07, | |
| "loss": 0.686, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.6181274155342225, | |
| "grad_norm": 0.9707582592964172, | |
| "learning_rate": 8.735862838268638e-07, | |
| "loss": 0.6826, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.6243610522378757, | |
| "grad_norm": 0.9685394167900085, | |
| "learning_rate": 8.457132339693231e-07, | |
| "loss": 0.6792, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.6305946889415286, | |
| "grad_norm": 1.0175048112869263, | |
| "learning_rate": 8.182724471511605e-07, | |
| "loss": 0.6892, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.6368283256451814, | |
| "grad_norm": 0.9666849374771118, | |
| "learning_rate": 7.912652191191905e-07, | |
| "loss": 0.69, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 2.6430619623488343, | |
| "grad_norm": 1.0169652700424194, | |
| "learning_rate": 7.64692825147696e-07, | |
| "loss": 0.6939, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.649295599052487, | |
| "grad_norm": 0.9823275804519653, | |
| "learning_rate": 7.385565199782063e-07, | |
| "loss": 0.691, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.65552923575614, | |
| "grad_norm": 0.9402974843978882, | |
| "learning_rate": 7.128575377602509e-07, | |
| "loss": 0.692, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.661762872459793, | |
| "grad_norm": 0.9024824500083923, | |
| "learning_rate": 6.87597091993083e-07, | |
| "loss": 0.6944, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 2.667996509163446, | |
| "grad_norm": 1.0077190399169922, | |
| "learning_rate": 6.627763754683824e-07, | |
| "loss": 0.6922, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.674230145867099, | |
| "grad_norm": 0.9743265509605408, | |
| "learning_rate": 6.383965602139253e-07, | |
| "loss": 0.6905, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 2.680463782570752, | |
| "grad_norm": 1.0018881559371948, | |
| "learning_rate": 6.144587974382399e-07, | |
| "loss": 0.6829, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.6866974192744046, | |
| "grad_norm": 1.0576649904251099, | |
| "learning_rate": 5.909642174762642e-07, | |
| "loss": 0.6837, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 2.6929310559780575, | |
| "grad_norm": 0.9916029572486877, | |
| "learning_rate": 5.679139297359448e-07, | |
| "loss": 0.6934, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.6991646926817108, | |
| "grad_norm": 0.9222472906112671, | |
| "learning_rate": 5.453090226458758e-07, | |
| "loss": 0.6888, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 2.705398329385363, | |
| "grad_norm": 0.9654924273490906, | |
| "learning_rate": 5.231505636038881e-07, | |
| "loss": 0.6922, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.7116319660890165, | |
| "grad_norm": 0.979354739189148, | |
| "learning_rate": 5.014395989266496e-07, | |
| "loss": 0.6841, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.7178656027926693, | |
| "grad_norm": 0.9810166954994202, | |
| "learning_rate": 4.801771538002687e-07, | |
| "loss": 0.6901, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.724099239496322, | |
| "grad_norm": 0.9526228308677673, | |
| "learning_rate": 4.59364232231867e-07, | |
| "loss": 0.6879, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 2.730332876199975, | |
| "grad_norm": 0.9293854236602783, | |
| "learning_rate": 4.3900181700219035e-07, | |
| "loss": 0.6955, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.736566512903628, | |
| "grad_norm": 1.0160183906555176, | |
| "learning_rate": 4.190908696191853e-07, | |
| "loss": 0.6903, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 2.742800149607281, | |
| "grad_norm": 0.9928803443908691, | |
| "learning_rate": 3.9963233027260794e-07, | |
| "loss": 0.685, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.742800149607281, | |
| "eval_loss": 0.6923442482948303, | |
| "eval_runtime": 393.8932, | |
| "eval_samples_per_second": 46.543, | |
| "eval_steps_per_second": 5.819, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.7490337863109335, | |
| "grad_norm": 0.9902337789535522, | |
| "learning_rate": 3.806271177896248e-07, | |
| "loss": 0.6869, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 2.755267423014587, | |
| "grad_norm": 0.9907515645027161, | |
| "learning_rate": 3.6207612959142213e-07, | |
| "loss": 0.6857, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.7615010597182397, | |
| "grad_norm": 0.9236803650856018, | |
| "learning_rate": 3.4398024165083864e-07, | |
| "loss": 0.6941, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 2.7677346964218925, | |
| "grad_norm": 1.0519189834594727, | |
| "learning_rate": 3.2634030845099417e-07, | |
| "loss": 0.6903, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.7739683331255454, | |
| "grad_norm": 0.9530760049819946, | |
| "learning_rate": 3.0915716294494193e-07, | |
| "loss": 0.688, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.780201969829198, | |
| "grad_norm": 0.9760779142379761, | |
| "learning_rate": 2.9243161651634654e-07, | |
| "loss": 0.6914, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.7864356065328515, | |
| "grad_norm": 0.9709676504135132, | |
| "learning_rate": 2.7616445894115607e-07, | |
| "loss": 0.6841, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 2.7926692432365043, | |
| "grad_norm": 0.9802337288856506, | |
| "learning_rate": 2.6035645835032044e-07, | |
| "loss": 0.6881, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.798902879940157, | |
| "grad_norm": 1.0037245750427246, | |
| "learning_rate": 2.4500836119351503e-07, | |
| "loss": 0.6893, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 2.80513651664381, | |
| "grad_norm": 0.9736759066581726, | |
| "learning_rate": 2.301208922038911e-07, | |
| "loss": 0.6837, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.811370153347463, | |
| "grad_norm": 0.9228886961936951, | |
| "learning_rate": 2.1569475436386546e-07, | |
| "loss": 0.6866, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 2.8176037900511157, | |
| "grad_norm": 1.0323145389556885, | |
| "learning_rate": 2.0173062887190898e-07, | |
| "loss": 0.6736, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.8238374267547686, | |
| "grad_norm": 0.9843491911888123, | |
| "learning_rate": 1.8822917511039818e-07, | |
| "loss": 0.6971, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 2.830071063458422, | |
| "grad_norm": 1.0021668672561646, | |
| "learning_rate": 1.7519103061446552e-07, | |
| "loss": 0.7013, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.8363047001620747, | |
| "grad_norm": 1.0033491849899292, | |
| "learning_rate": 1.626168110419013e-07, | |
| "loss": 0.6782, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 2.8425383368657275, | |
| "grad_norm": 0.9935190081596375, | |
| "learning_rate": 1.505071101440836e-07, | |
| "loss": 0.6879, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.8487719735693804, | |
| "grad_norm": 0.9331642389297485, | |
| "learning_rate": 1.388624997379373e-07, | |
| "loss": 0.6923, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 2.8550056102730332, | |
| "grad_norm": 0.9350413084030151, | |
| "learning_rate": 1.2768352967893582e-07, | |
| "loss": 0.6877, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.861239246976686, | |
| "grad_norm": 0.9508509635925293, | |
| "learning_rate": 1.169707278351373e-07, | |
| "loss": 0.694, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 2.867472883680339, | |
| "grad_norm": 0.9146909117698669, | |
| "learning_rate": 1.0672460006225682e-07, | |
| "loss": 0.6923, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.873706520383992, | |
| "grad_norm": 0.94114089012146, | |
| "learning_rate": 9.694563017978331e-08, | |
| "loss": 0.6944, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 2.879940157087645, | |
| "grad_norm": 1.0591011047363281, | |
| "learning_rate": 8.763427994813112e-08, | |
| "loss": 0.6823, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.886173793791298, | |
| "grad_norm": 0.9940326809883118, | |
| "learning_rate": 7.879098904683303e-08, | |
| "loss": 0.6937, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 2.8924074304949507, | |
| "grad_norm": 1.0066951513290405, | |
| "learning_rate": 7.041617505378573e-08, | |
| "loss": 0.689, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.8986410671986036, | |
| "grad_norm": 0.8989609479904175, | |
| "learning_rate": 6.251023342552787e-08, | |
| "loss": 0.6895, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 2.9048747039022564, | |
| "grad_norm": 1.001419186592102, | |
| "learning_rate": 5.5073537478566034e-08, | |
| "loss": 0.6941, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.9111083406059093, | |
| "grad_norm": 0.9182389378547668, | |
| "learning_rate": 4.810643837174667e-08, | |
| "loss": 0.6846, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 2.9173419773095626, | |
| "grad_norm": 1.01217520236969, | |
| "learning_rate": 4.160926508967822e-08, | |
| "loss": 0.6912, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.9235756140132154, | |
| "grad_norm": 0.9560514688491821, | |
| "learning_rate": 3.558232442719245e-08, | |
| "loss": 0.6948, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 2.9298092507168683, | |
| "grad_norm": 1.0075358152389526, | |
| "learning_rate": 3.002590097485936e-08, | |
| "loss": 0.6838, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.936042887420521, | |
| "grad_norm": 0.9497339129447937, | |
| "learning_rate": 2.4940257105547928e-08, | |
| "loss": 0.684, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 2.942276524124174, | |
| "grad_norm": 0.9690067172050476, | |
| "learning_rate": 2.0325632962039376e-08, | |
| "loss": 0.6891, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.948510160827827, | |
| "grad_norm": 0.9984382390975952, | |
| "learning_rate": 1.6182246445685114e-08, | |
| "loss": 0.6911, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 2.9547437975314796, | |
| "grad_norm": 0.9853787422180176, | |
| "learning_rate": 1.2510293206118296e-08, | |
| "loss": 0.6932, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.960977434235133, | |
| "grad_norm": 1.0235117673873901, | |
| "learning_rate": 9.309946632015676e-09, | |
| "loss": 0.6855, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 2.9672110709387858, | |
| "grad_norm": 0.9801839590072632, | |
| "learning_rate": 6.581357842909697e-09, | |
| "loss": 0.6941, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.9734447076424386, | |
| "grad_norm": 0.9491228461265564, | |
| "learning_rate": 4.324655682051982e-09, | |
| "loss": 0.6884, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 2.9796783443460915, | |
| "grad_norm": 0.9716805815696716, | |
| "learning_rate": 2.5399467103337518e-09, | |
| "loss": 0.6855, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.9859119810497443, | |
| "grad_norm": 0.9866331815719604, | |
| "learning_rate": 1.2273152012465262e-09, | |
| "loss": 0.6952, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 2.9921456177533976, | |
| "grad_norm": 0.9594016075134277, | |
| "learning_rate": 3.8682313690974194e-10, | |
| "loss": 0.694, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.9921456177533976, | |
| "eval_loss": 0.6922064423561096, | |
| "eval_runtime": 394.8751, | |
| "eval_samples_per_second": 46.427, | |
| "eval_steps_per_second": 5.804, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.99837925445705, | |
| "grad_norm": 1.0189405679702759, | |
| "learning_rate": 1.8510205138655424e-11, | |
| "loss": 0.6834, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 24063, | |
| "total_flos": 2.54494160244104e+18, | |
| "train_loss": 0.7434851736349244, | |
| "train_runtime": 33718.164, | |
| "train_samples_per_second": 11.418, | |
| "train_steps_per_second": 0.714 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 24063, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 2, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.54494160244104e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
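
A minimal sketch of how a state file with this layout can be consumed downstream, assuming only what is visible above: a `log_history` list that mixes per-step training logs (every `logging_steps=50` steps, carrying `"loss"`) with periodic evaluation records (carrying `"eval_loss"`, every 2,000 steps here). The file path is illustrative, not taken from the log.

```python
# Minimal sketch (assumed layout: the trainer_state.json shown above).
# The file path is illustrative only.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two record types: training logs (carrying "loss")
# and evaluation logs (carrying "eval_loss"); keep only the latter.
evals = [rec for rec in state["log_history"] if "eval_loss" in rec]

for rec in evals:
    print(f'step {rec["step"]:>6}  eval_loss {rec["eval_loss"]:.4f}')

# Lowest eval loss identifies the best checkpoint recorded in the log.
best = min(evals, key=lambda rec: rec["eval_loss"])
print(f'best checkpoint by eval_loss: step {best["step"]}')
```

Run against this file, the loop prints one line per evaluation and the final line should report step 24000, consistent with the steadily decreasing eval losses logged above and with the early-stopping bookkeeping (patience counter at 0 when training stopped).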