{ "best_global_step": 3900, "best_metric": 0.6421848255039291, "best_model_checkpoint": "./experiments/qwen3-0.6b-router-lr1e-5-ep2-batch20-20250917-11:04/checkpoint-3900", "epoch": 1.597908192911098, "eval_steps": 100, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002905287623474724, "grad_norm": 422.0, "learning_rate": 0.0, "loss": 4.1917, "step": 1 }, { "epoch": 0.002905287623474724, "grad_norm": 260.0, "learning_rate": 9.000000000000001e-07, "loss": 3.1155, "step": 10 }, { "epoch": 0.005810575246949448, "grad_norm": 362.0, "learning_rate": 1.9000000000000002e-06, "loss": 3.6823, "step": 20 }, { "epoch": 0.008715862870424172, "grad_norm": 362.0, "learning_rate": 2.9e-06, "loss": 3.5894, "step": 30 }, { "epoch": 0.011621150493898896, "grad_norm": 286.0, "learning_rate": 3.900000000000001e-06, "loss": 2.7935, "step": 40 }, { "epoch": 0.01452643811737362, "grad_norm": 108.5, "learning_rate": 4.9000000000000005e-06, "loss": 2.1466, "step": 50 }, { "epoch": 0.017431725740848343, "grad_norm": 171.0, "learning_rate": 5.9e-06, "loss": 2.174, "step": 60 }, { "epoch": 0.02033701336432307, "grad_norm": 182.0, "learning_rate": 6.9e-06, "loss": 1.9269, "step": 70 }, { "epoch": 0.023242300987797792, "grad_norm": 152.0, "learning_rate": 7.9e-06, "loss": 2.0153, "step": 80 }, { "epoch": 0.026147588611272515, "grad_norm": 124.5, "learning_rate": 8.900000000000001e-06, "loss": 1.7756, "step": 90 }, { "epoch": 0.02905287623474724, "grad_norm": 220.0, "learning_rate": 9.9e-06, "loss": 1.7598, "step": 100 }, { "epoch": 0.02905287623474724, "eval_accuracy": 0.5466593042517945, "eval_f1": 0.5420321197439284, "eval_loss": 0.8443426489830017, "eval_precision": 0.5556366418233831, "eval_recall": 0.5466593042517945, "eval_runtime": 17.2355, "eval_samples_per_second": 105.074, "eval_steps_per_second": 10.56, "step": 100 }, { "epoch": 0.031958163858221963, "grad_norm": 180.0, "learning_rate": 9.986733490566039e-06, "loss": 1.6999, "step": 110 }, { "epoch": 0.034863451481696686, "grad_norm": 98.0, "learning_rate": 9.971992924528303e-06, "loss": 1.6424, "step": 120 }, { "epoch": 0.03776873910517141, "grad_norm": 92.5, "learning_rate": 9.957252358490566e-06, "loss": 1.4874, "step": 130 }, { "epoch": 0.04067402672864614, "grad_norm": 99.5, "learning_rate": 9.942511792452831e-06, "loss": 1.668, "step": 140 }, { "epoch": 0.04357931435212086, "grad_norm": 89.5, "learning_rate": 9.927771226415096e-06, "loss": 1.4386, "step": 150 }, { "epoch": 0.046484601975595584, "grad_norm": 78.5, "learning_rate": 9.91303066037736e-06, "loss": 1.3427, "step": 160 }, { "epoch": 0.04938988959907031, "grad_norm": 77.5, "learning_rate": 9.898290094339623e-06, "loss": 1.7596, "step": 170 }, { "epoch": 0.05229517722254503, "grad_norm": 136.0, "learning_rate": 9.883549528301888e-06, "loss": 1.639, "step": 180 }, { "epoch": 0.05520046484601976, "grad_norm": 78.5, "learning_rate": 9.868808962264151e-06, "loss": 1.6126, "step": 190 }, { "epoch": 0.05810575246949448, "grad_norm": 104.5, "learning_rate": 9.854068396226416e-06, "loss": 1.4535, "step": 200 }, { "epoch": 0.05810575246949448, "eval_accuracy": 0.5737161789066814, "eval_f1": 0.5535126986816034, "eval_loss": 0.7697113752365112, "eval_precision": 0.5771129416178113, "eval_recall": 0.5737161789066814, "eval_runtime": 23.2381, "eval_samples_per_second": 77.932, "eval_steps_per_second": 7.832, "step": 200 }, { "epoch": 0.061011040092969204, "grad_norm": 67.0, "learning_rate": 9.83932783018868e-06, "loss": 1.4115, "step": 210 }, { "epoch": 0.06391632771644393, "grad_norm": 103.5, "learning_rate": 9.824587264150945e-06, "loss": 1.5474, "step": 220 }, { "epoch": 0.06682161533991865, "grad_norm": 70.5, "learning_rate": 9.809846698113208e-06, "loss": 1.3732, "step": 230 }, { "epoch": 0.06972690296339337, "grad_norm": 79.5, "learning_rate": 9.795106132075473e-06, "loss": 1.5937, "step": 240 }, { "epoch": 0.0726321905868681, "grad_norm": 160.0, "learning_rate": 9.780365566037736e-06, "loss": 1.5243, "step": 250 }, { "epoch": 0.07553747821034282, "grad_norm": 86.5, "learning_rate": 9.765625e-06, "loss": 1.5532, "step": 260 }, { "epoch": 0.07844276583381755, "grad_norm": 87.0, "learning_rate": 9.750884433962265e-06, "loss": 1.4715, "step": 270 }, { "epoch": 0.08134805345729228, "grad_norm": 72.0, "learning_rate": 9.73614386792453e-06, "loss": 1.5738, "step": 280 }, { "epoch": 0.084253341080767, "grad_norm": 60.0, "learning_rate": 9.721403301886794e-06, "loss": 1.4415, "step": 290 }, { "epoch": 0.08715862870424172, "grad_norm": 79.5, "learning_rate": 9.706662735849057e-06, "loss": 1.4364, "step": 300 }, { "epoch": 0.08715862870424172, "eval_accuracy": 0.5808945334069575, "eval_f1": 0.579545757594244, "eval_loss": 0.7065893411636353, "eval_precision": 0.5868008140090218, "eval_recall": 0.5808945334069575, "eval_runtime": 25.9239, "eval_samples_per_second": 69.858, "eval_steps_per_second": 7.021, "step": 300 }, { "epoch": 0.09006391632771645, "grad_norm": 50.0, "learning_rate": 9.691922169811322e-06, "loss": 1.323, "step": 310 }, { "epoch": 0.09296920395119117, "grad_norm": 77.0, "learning_rate": 9.677181603773585e-06, "loss": 1.4155, "step": 320 }, { "epoch": 0.09587449157466589, "grad_norm": 64.5, "learning_rate": 9.66244103773585e-06, "loss": 1.4485, "step": 330 }, { "epoch": 0.09877977919814061, "grad_norm": 115.0, "learning_rate": 9.647700471698114e-06, "loss": 1.5113, "step": 340 }, { "epoch": 0.10168506682161534, "grad_norm": 94.0, "learning_rate": 9.632959905660379e-06, "loss": 1.4558, "step": 350 }, { "epoch": 0.10459035444509006, "grad_norm": 74.5, "learning_rate": 9.618219339622642e-06, "loss": 1.385, "step": 360 }, { "epoch": 0.10749564206856478, "grad_norm": 58.0, "learning_rate": 9.603478773584906e-06, "loss": 1.4507, "step": 370 }, { "epoch": 0.11040092969203952, "grad_norm": 53.75, "learning_rate": 9.588738207547171e-06, "loss": 1.4854, "step": 380 }, { "epoch": 0.11330621731551424, "grad_norm": 67.5, "learning_rate": 9.573997641509436e-06, "loss": 1.4493, "step": 390 }, { "epoch": 0.11621150493898896, "grad_norm": 82.5, "learning_rate": 9.559257075471699e-06, "loss": 1.332, "step": 400 }, { "epoch": 0.11621150493898896, "eval_accuracy": 0.6046383213694092, "eval_f1": 0.5982589248765446, "eval_loss": 0.6897996068000793, "eval_precision": 0.6052451933014279, "eval_recall": 0.6046383213694092, "eval_runtime": 21.176, "eval_samples_per_second": 85.522, "eval_steps_per_second": 8.595, "step": 400 }, { "epoch": 0.11911679256246369, "grad_norm": 56.5, "learning_rate": 9.544516509433963e-06, "loss": 1.3926, "step": 410 }, { "epoch": 0.12202208018593841, "grad_norm": 95.5, "learning_rate": 9.529775943396226e-06, "loss": 1.4383, "step": 420 }, { "epoch": 0.12492736780941313, "grad_norm": 105.0, "learning_rate": 9.515035377358491e-06, "loss": 1.4494, "step": 430 }, { "epoch": 0.12783265543288785, "grad_norm": 41.25, "learning_rate": 9.500294811320756e-06, "loss": 1.3664, "step": 440 }, { "epoch": 0.13073794305636258, "grad_norm": 89.0, "learning_rate": 9.48555424528302e-06, "loss": 1.34, "step": 450 }, { "epoch": 0.1336432306798373, "grad_norm": 50.0, "learning_rate": 9.470813679245285e-06, "loss": 1.354, "step": 460 }, { "epoch": 0.13654851830331202, "grad_norm": 54.25, "learning_rate": 9.456073113207548e-06, "loss": 1.3451, "step": 470 }, { "epoch": 0.13945380592678674, "grad_norm": 51.75, "learning_rate": 9.441332547169812e-06, "loss": 1.4787, "step": 480 }, { "epoch": 0.14235909355026147, "grad_norm": 43.0, "learning_rate": 9.426591981132075e-06, "loss": 1.3808, "step": 490 }, { "epoch": 0.1452643811737362, "grad_norm": 62.5, "learning_rate": 9.41185141509434e-06, "loss": 1.4256, "step": 500 }, { "epoch": 0.1452643811737362, "eval_accuracy": 0.6112644947542794, "eval_f1": 0.6088118132920133, "eval_loss": 0.6750182509422302, "eval_precision": 0.6105845735531097, "eval_recall": 0.6112644947542794, "eval_runtime": 25.2545, "eval_samples_per_second": 71.71, "eval_steps_per_second": 7.207, "step": 500 }, { "epoch": 0.1481696687972109, "grad_norm": 66.0, "learning_rate": 9.397110849056605e-06, "loss": 1.2969, "step": 510 }, { "epoch": 0.15107495642068564, "grad_norm": 118.5, "learning_rate": 9.38237028301887e-06, "loss": 1.3989, "step": 520 }, { "epoch": 0.15398024404416036, "grad_norm": 49.25, "learning_rate": 9.367629716981132e-06, "loss": 1.4072, "step": 530 }, { "epoch": 0.1568855316676351, "grad_norm": 50.5, "learning_rate": 9.352889150943397e-06, "loss": 1.3402, "step": 540 }, { "epoch": 0.15979081929110983, "grad_norm": 55.5, "learning_rate": 9.33814858490566e-06, "loss": 1.4138, "step": 550 }, { "epoch": 0.16269610691458455, "grad_norm": 46.25, "learning_rate": 9.323408018867925e-06, "loss": 1.3791, "step": 560 }, { "epoch": 0.16560139453805928, "grad_norm": 126.5, "learning_rate": 9.30866745283019e-06, "loss": 1.4886, "step": 570 }, { "epoch": 0.168506682161534, "grad_norm": 35.0, "learning_rate": 9.293926886792454e-06, "loss": 1.3154, "step": 580 }, { "epoch": 0.17141196978500872, "grad_norm": 87.5, "learning_rate": 9.279186320754717e-06, "loss": 1.3715, "step": 590 }, { "epoch": 0.17431725740848344, "grad_norm": 97.0, "learning_rate": 9.264445754716982e-06, "loss": 1.4323, "step": 600 }, { "epoch": 0.17431725740848344, "eval_accuracy": 0.6007730535615682, "eval_f1": 0.6010027767957538, "eval_loss": 0.6762142777442932, "eval_precision": 0.6018480565642976, "eval_recall": 0.6007730535615682, "eval_runtime": 19.088, "eval_samples_per_second": 94.876, "eval_steps_per_second": 9.535, "step": 600 }, { "epoch": 0.17722254503195817, "grad_norm": 48.25, "learning_rate": 9.249705188679246e-06, "loss": 1.2719, "step": 610 }, { "epoch": 0.1801278326554329, "grad_norm": 125.0, "learning_rate": 9.234964622641511e-06, "loss": 1.3752, "step": 620 }, { "epoch": 0.1830331202789076, "grad_norm": 83.5, "learning_rate": 9.220224056603776e-06, "loss": 1.3115, "step": 630 }, { "epoch": 0.18593840790238234, "grad_norm": 42.25, "learning_rate": 9.205483490566038e-06, "loss": 1.4751, "step": 640 }, { "epoch": 0.18884369552585706, "grad_norm": 116.0, "learning_rate": 9.190742924528303e-06, "loss": 1.4054, "step": 650 }, { "epoch": 0.19174898314933178, "grad_norm": 59.75, "learning_rate": 9.176002358490566e-06, "loss": 1.2389, "step": 660 }, { "epoch": 0.1946542707728065, "grad_norm": 77.5, "learning_rate": 9.16126179245283e-06, "loss": 1.4555, "step": 670 }, { "epoch": 0.19755955839628123, "grad_norm": 28.625, "learning_rate": 9.146521226415095e-06, "loss": 1.2733, "step": 680 }, { "epoch": 0.20046484601975595, "grad_norm": 74.0, "learning_rate": 9.13178066037736e-06, "loss": 1.4384, "step": 690 }, { "epoch": 0.20337013364323067, "grad_norm": 58.75, "learning_rate": 9.117040094339623e-06, "loss": 1.3355, "step": 700 }, { "epoch": 0.20337013364323067, "eval_accuracy": 0.5969077857537273, "eval_f1": 0.5967480017714881, "eval_loss": 0.6716373562812805, "eval_precision": 0.6002412035566317, "eval_recall": 0.5969077857537273, "eval_runtime": 24.8517, "eval_samples_per_second": 72.872, "eval_steps_per_second": 7.323, "step": 700 }, { "epoch": 0.2062754212667054, "grad_norm": 53.5, "learning_rate": 9.102299528301888e-06, "loss": 1.371, "step": 710 }, { "epoch": 0.20918070889018012, "grad_norm": 71.5, "learning_rate": 9.08755896226415e-06, "loss": 1.3885, "step": 720 }, { "epoch": 0.21208599651365484, "grad_norm": 57.75, "learning_rate": 9.072818396226415e-06, "loss": 1.286, "step": 730 }, { "epoch": 0.21499128413712956, "grad_norm": 89.5, "learning_rate": 9.05807783018868e-06, "loss": 1.3166, "step": 740 }, { "epoch": 0.2178965717606043, "grad_norm": 49.0, "learning_rate": 9.043337264150945e-06, "loss": 1.446, "step": 750 }, { "epoch": 0.22080185938407904, "grad_norm": 48.25, "learning_rate": 9.028596698113208e-06, "loss": 1.3413, "step": 760 }, { "epoch": 0.22370714700755376, "grad_norm": 70.5, "learning_rate": 9.013856132075472e-06, "loss": 1.4005, "step": 770 }, { "epoch": 0.22661243463102848, "grad_norm": 85.0, "learning_rate": 8.999115566037737e-06, "loss": 1.3292, "step": 780 }, { "epoch": 0.2295177222545032, "grad_norm": 44.25, "learning_rate": 8.984375000000002e-06, "loss": 1.36, "step": 790 }, { "epoch": 0.23242300987797793, "grad_norm": 49.5, "learning_rate": 8.969634433962266e-06, "loss": 1.3268, "step": 800 }, { "epoch": 0.23242300987797793, "eval_accuracy": 0.609055770292656, "eval_f1": 0.6092814247668411, "eval_loss": 0.6692959666252136, "eval_precision": 0.610225869236403, "eval_recall": 0.609055770292656, "eval_runtime": 24.1313, "eval_samples_per_second": 75.048, "eval_steps_per_second": 7.542, "step": 800 }, { "epoch": 0.23532829750145265, "grad_norm": 63.75, "learning_rate": 8.95489386792453e-06, "loss": 1.3547, "step": 810 }, { "epoch": 0.23823358512492737, "grad_norm": 76.5, "learning_rate": 8.940153301886794e-06, "loss": 1.3306, "step": 820 }, { "epoch": 0.2411388727484021, "grad_norm": 105.5, "learning_rate": 8.925412735849057e-06, "loss": 1.3161, "step": 830 }, { "epoch": 0.24404416037187682, "grad_norm": 104.0, "learning_rate": 8.910672169811321e-06, "loss": 1.3919, "step": 840 }, { "epoch": 0.24694944799535154, "grad_norm": 94.0, "learning_rate": 8.895931603773586e-06, "loss": 1.3846, "step": 850 }, { "epoch": 0.24985473561882626, "grad_norm": 62.75, "learning_rate": 8.88119103773585e-06, "loss": 1.3407, "step": 860 }, { "epoch": 0.252760023242301, "grad_norm": 73.5, "learning_rate": 8.866450471698114e-06, "loss": 1.2485, "step": 870 }, { "epoch": 0.2556653108657757, "grad_norm": 66.5, "learning_rate": 8.851709905660378e-06, "loss": 1.4223, "step": 880 }, { "epoch": 0.25857059848925046, "grad_norm": 44.0, "learning_rate": 8.836969339622641e-06, "loss": 1.3466, "step": 890 }, { "epoch": 0.26147588611272515, "grad_norm": 37.0, "learning_rate": 8.822228773584906e-06, "loss": 1.3865, "step": 900 }, { "epoch": 0.26147588611272515, "eval_accuracy": 0.6107123136388736, "eval_f1": 0.6104421517436964, "eval_loss": 0.6648093461990356, "eval_precision": 0.6103362392904956, "eval_recall": 0.6107123136388736, "eval_runtime": 26.8405, "eval_samples_per_second": 67.473, "eval_steps_per_second": 6.781, "step": 900 }, { "epoch": 0.2643811737361999, "grad_norm": 54.75, "learning_rate": 8.80748820754717e-06, "loss": 1.4233, "step": 910 }, { "epoch": 0.2672864613596746, "grad_norm": 63.75, "learning_rate": 8.792747641509435e-06, "loss": 1.3615, "step": 920 }, { "epoch": 0.27019174898314935, "grad_norm": 55.5, "learning_rate": 8.778007075471698e-06, "loss": 1.3773, "step": 930 }, { "epoch": 0.27309703660662404, "grad_norm": 49.25, "learning_rate": 8.763266509433963e-06, "loss": 1.4202, "step": 940 }, { "epoch": 0.2760023242300988, "grad_norm": 59.25, "learning_rate": 8.748525943396226e-06, "loss": 1.2954, "step": 950 }, { "epoch": 0.2789076118535735, "grad_norm": 43.0, "learning_rate": 8.73378537735849e-06, "loss": 1.3296, "step": 960 }, { "epoch": 0.28181289947704824, "grad_norm": 77.5, "learning_rate": 8.719044811320755e-06, "loss": 1.364, "step": 970 }, { "epoch": 0.28471818710052293, "grad_norm": 88.5, "learning_rate": 8.70430424528302e-06, "loss": 1.4751, "step": 980 }, { "epoch": 0.2876234747239977, "grad_norm": 46.0, "learning_rate": 8.689563679245284e-06, "loss": 1.471, "step": 990 }, { "epoch": 0.2905287623474724, "grad_norm": 44.75, "learning_rate": 8.674823113207547e-06, "loss": 1.3453, "step": 1000 }, { "epoch": 0.2905287623474724, "eval_accuracy": 0.6062948647156268, "eval_f1": 0.5996489127755472, "eval_loss": 0.6624994874000549, "eval_precision": 0.6071509218682795, "eval_recall": 0.6062948647156268, "eval_runtime": 27.0761, "eval_samples_per_second": 66.885, "eval_steps_per_second": 6.722, "step": 1000 }, { "epoch": 0.29343404997094713, "grad_norm": 52.25, "learning_rate": 8.660082547169812e-06, "loss": 1.3341, "step": 1010 }, { "epoch": 0.2963393375944218, "grad_norm": 34.25, "learning_rate": 8.645341981132077e-06, "loss": 1.341, "step": 1020 }, { "epoch": 0.2992446252178966, "grad_norm": 50.75, "learning_rate": 8.630601415094341e-06, "loss": 1.3216, "step": 1030 }, { "epoch": 0.30214991284137127, "grad_norm": 35.75, "learning_rate": 8.615860849056604e-06, "loss": 1.3186, "step": 1040 }, { "epoch": 0.305055200464846, "grad_norm": 30.0, "learning_rate": 8.601120283018869e-06, "loss": 1.3138, "step": 1050 }, { "epoch": 0.3079604880883207, "grad_norm": 44.5, "learning_rate": 8.586379716981132e-06, "loss": 1.2962, "step": 1060 }, { "epoch": 0.31086577571179547, "grad_norm": 46.5, "learning_rate": 8.571639150943397e-06, "loss": 1.3494, "step": 1070 }, { "epoch": 0.3137710633352702, "grad_norm": 64.5, "learning_rate": 8.556898584905661e-06, "loss": 1.2946, "step": 1080 }, { "epoch": 0.3166763509587449, "grad_norm": 87.0, "learning_rate": 8.542158018867926e-06, "loss": 1.2792, "step": 1090 }, { "epoch": 0.31958163858221966, "grad_norm": 37.75, "learning_rate": 8.527417452830189e-06, "loss": 1.3412, "step": 1100 }, { "epoch": 0.31958163858221966, "eval_accuracy": 0.6250690226394258, "eval_f1": 0.6234416564785212, "eval_loss": 0.6603183150291443, "eval_precision": 0.6244569183256099, "eval_recall": 0.6250690226394258, "eval_runtime": 23.7723, "eval_samples_per_second": 76.181, "eval_steps_per_second": 7.656, "step": 1100 }, { "epoch": 0.32248692620569436, "grad_norm": 48.75, "learning_rate": 8.512676886792454e-06, "loss": 1.3886, "step": 1110 }, { "epoch": 0.3253922138291691, "grad_norm": 46.25, "learning_rate": 8.497936320754717e-06, "loss": 1.4322, "step": 1120 }, { "epoch": 0.3282975014526438, "grad_norm": 58.75, "learning_rate": 8.483195754716981e-06, "loss": 1.3484, "step": 1130 }, { "epoch": 0.33120278907611855, "grad_norm": 42.0, "learning_rate": 8.468455188679246e-06, "loss": 1.364, "step": 1140 }, { "epoch": 0.33410807669959325, "grad_norm": 58.75, "learning_rate": 8.45371462264151e-06, "loss": 1.4051, "step": 1150 }, { "epoch": 0.337013364323068, "grad_norm": 45.0, "learning_rate": 8.438974056603775e-06, "loss": 1.3447, "step": 1160 }, { "epoch": 0.3399186519465427, "grad_norm": 45.5, "learning_rate": 8.424233490566038e-06, "loss": 1.3961, "step": 1170 }, { "epoch": 0.34282393957001744, "grad_norm": 68.5, "learning_rate": 8.409492924528303e-06, "loss": 1.3798, "step": 1180 }, { "epoch": 0.34572922719349214, "grad_norm": 55.25, "learning_rate": 8.394752358490566e-06, "loss": 1.3101, "step": 1190 }, { "epoch": 0.3486345148169669, "grad_norm": 37.75, "learning_rate": 8.380011792452832e-06, "loss": 1.3918, "step": 1200 }, { "epoch": 0.3486345148169669, "eval_accuracy": 0.6024295969077857, "eval_f1": 0.6026431424760783, "eval_loss": 0.6649277210235596, "eval_precision": 0.6032326068057581, "eval_recall": 0.6024295969077857, "eval_runtime": 17.6379, "eval_samples_per_second": 102.677, "eval_steps_per_second": 10.319, "step": 1200 }, { "epoch": 0.3515398024404416, "grad_norm": 30.625, "learning_rate": 8.365271226415095e-06, "loss": 1.3555, "step": 1210 }, { "epoch": 0.35444509006391633, "grad_norm": 33.75, "learning_rate": 8.35053066037736e-06, "loss": 1.2323, "step": 1220 }, { "epoch": 0.35735037768739103, "grad_norm": 30.625, "learning_rate": 8.335790094339623e-06, "loss": 1.3533, "step": 1230 }, { "epoch": 0.3602556653108658, "grad_norm": 101.0, "learning_rate": 8.321049528301887e-06, "loss": 1.3743, "step": 1240 }, { "epoch": 0.3631609529343405, "grad_norm": 38.5, "learning_rate": 8.306308962264152e-06, "loss": 1.3463, "step": 1250 }, { "epoch": 0.3660662405578152, "grad_norm": 48.0, "learning_rate": 8.291568396226417e-06, "loss": 1.2404, "step": 1260 }, { "epoch": 0.3689715281812899, "grad_norm": 101.0, "learning_rate": 8.27682783018868e-06, "loss": 1.3486, "step": 1270 }, { "epoch": 0.37187681580476467, "grad_norm": 30.875, "learning_rate": 8.262087264150944e-06, "loss": 1.3636, "step": 1280 }, { "epoch": 0.3747821034282394, "grad_norm": 34.25, "learning_rate": 8.247346698113207e-06, "loss": 1.2899, "step": 1290 }, { "epoch": 0.3776873910517141, "grad_norm": 60.5, "learning_rate": 8.232606132075472e-06, "loss": 1.2714, "step": 1300 }, { "epoch": 0.3776873910517141, "eval_accuracy": 0.6272777471010491, "eval_f1": 0.6221601134611754, "eval_loss": 0.6585363745689392, "eval_precision": 0.6285042745519395, "eval_recall": 0.6272777471010491, "eval_runtime": 26.4941, "eval_samples_per_second": 68.355, "eval_steps_per_second": 6.869, "step": 1300 }, { "epoch": 0.38059267867518887, "grad_norm": 41.5, "learning_rate": 8.217865566037737e-06, "loss": 1.3589, "step": 1310 }, { "epoch": 0.38349796629866356, "grad_norm": 65.5, "learning_rate": 8.203125000000001e-06, "loss": 1.3503, "step": 1320 }, { "epoch": 0.3864032539221383, "grad_norm": 76.5, "learning_rate": 8.188384433962266e-06, "loss": 1.2889, "step": 1330 }, { "epoch": 0.389308541545613, "grad_norm": 83.0, "learning_rate": 8.173643867924529e-06, "loss": 1.4435, "step": 1340 }, { "epoch": 0.39221382916908776, "grad_norm": 46.5, "learning_rate": 8.158903301886793e-06, "loss": 1.3266, "step": 1350 }, { "epoch": 0.39511911679256245, "grad_norm": 50.75, "learning_rate": 8.144162735849056e-06, "loss": 1.2778, "step": 1360 }, { "epoch": 0.3980244044160372, "grad_norm": 59.0, "learning_rate": 8.129422169811321e-06, "loss": 1.3124, "step": 1370 }, { "epoch": 0.4009296920395119, "grad_norm": 63.5, "learning_rate": 8.114681603773586e-06, "loss": 1.2415, "step": 1380 }, { "epoch": 0.40383497966298665, "grad_norm": 67.5, "learning_rate": 8.09994103773585e-06, "loss": 1.3139, "step": 1390 }, { "epoch": 0.40674026728646134, "grad_norm": 45.0, "learning_rate": 8.085200471698113e-06, "loss": 1.2793, "step": 1400 }, { "epoch": 0.40674026728646134, "eval_accuracy": 0.6123688569850911, "eval_f1": 0.6125918781610761, "eval_loss": 0.6618815660476685, "eval_precision": 0.6136048525390123, "eval_recall": 0.6123688569850911, "eval_runtime": 25.4254, "eval_samples_per_second": 71.228, "eval_steps_per_second": 7.158, "step": 1400 }, { "epoch": 0.4096455549099361, "grad_norm": 135.0, "learning_rate": 8.070459905660378e-06, "loss": 1.4438, "step": 1410 }, { "epoch": 0.4125508425334108, "grad_norm": 55.75, "learning_rate": 8.055719339622643e-06, "loss": 1.3361, "step": 1420 }, { "epoch": 0.41545613015688554, "grad_norm": 97.0, "learning_rate": 8.040978773584907e-06, "loss": 1.2857, "step": 1430 }, { "epoch": 0.41836141778036023, "grad_norm": 97.0, "learning_rate": 8.02623820754717e-06, "loss": 1.2567, "step": 1440 }, { "epoch": 0.421266705403835, "grad_norm": 41.0, "learning_rate": 8.011497641509435e-06, "loss": 1.3682, "step": 1450 }, { "epoch": 0.4241719930273097, "grad_norm": 74.5, "learning_rate": 7.996757075471698e-06, "loss": 1.2763, "step": 1460 }, { "epoch": 0.42707728065078443, "grad_norm": 37.0, "learning_rate": 7.982016509433963e-06, "loss": 1.3787, "step": 1470 }, { "epoch": 0.4299825682742591, "grad_norm": 32.25, "learning_rate": 7.967275943396227e-06, "loss": 1.2549, "step": 1480 }, { "epoch": 0.4328878558977339, "grad_norm": 84.5, "learning_rate": 7.952535377358492e-06, "loss": 1.3206, "step": 1490 }, { "epoch": 0.4357931435212086, "grad_norm": 69.0, "learning_rate": 7.937794811320757e-06, "loss": 1.41, "step": 1500 }, { "epoch": 0.4357931435212086, "eval_accuracy": 0.627829928216455, "eval_f1": 0.6269606515260162, "eval_loss": 0.652948796749115, "eval_precision": 0.6272111466552719, "eval_recall": 0.627829928216455, "eval_runtime": 21.9245, "eval_samples_per_second": 82.602, "eval_steps_per_second": 8.301, "step": 1500 }, { "epoch": 0.4386984311446833, "grad_norm": 38.5, "learning_rate": 7.92305424528302e-06, "loss": 1.372, "step": 1510 }, { "epoch": 0.44160371876815807, "grad_norm": 64.5, "learning_rate": 7.908313679245284e-06, "loss": 1.3912, "step": 1520 }, { "epoch": 0.44450900639163277, "grad_norm": 62.5, "learning_rate": 7.893573113207547e-06, "loss": 1.3164, "step": 1530 }, { "epoch": 0.4474142940151075, "grad_norm": 40.75, "learning_rate": 7.878832547169812e-06, "loss": 1.3526, "step": 1540 }, { "epoch": 0.4503195816385822, "grad_norm": 36.25, "learning_rate": 7.864091981132076e-06, "loss": 1.3417, "step": 1550 }, { "epoch": 0.45322486926205696, "grad_norm": 38.5, "learning_rate": 7.849351415094341e-06, "loss": 1.4355, "step": 1560 }, { "epoch": 0.45613015688553166, "grad_norm": 44.75, "learning_rate": 7.834610849056604e-06, "loss": 1.3182, "step": 1570 }, { "epoch": 0.4590354445090064, "grad_norm": 32.5, "learning_rate": 7.819870283018869e-06, "loss": 1.2961, "step": 1580 }, { "epoch": 0.4619407321324811, "grad_norm": 45.75, "learning_rate": 7.805129716981132e-06, "loss": 1.2738, "step": 1590 }, { "epoch": 0.46484601975595585, "grad_norm": 38.5, "learning_rate": 7.790389150943396e-06, "loss": 1.3503, "step": 1600 }, { "epoch": 0.46484601975595585, "eval_accuracy": 0.6223081170623964, "eval_f1": 0.6190936146728951, "eval_loss": 0.6521285772323608, "eval_precision": 0.6221827660719746, "eval_recall": 0.6223081170623964, "eval_runtime": 25.4159, "eval_samples_per_second": 71.255, "eval_steps_per_second": 7.161, "step": 1600 }, { "epoch": 0.46775130737943055, "grad_norm": 58.0, "learning_rate": 7.775648584905661e-06, "loss": 1.2569, "step": 1610 }, { "epoch": 0.4706565950029053, "grad_norm": 87.5, "learning_rate": 7.760908018867926e-06, "loss": 1.3135, "step": 1620 }, { "epoch": 0.47356188262638, "grad_norm": 98.0, "learning_rate": 7.746167452830189e-06, "loss": 1.3008, "step": 1630 }, { "epoch": 0.47646717024985474, "grad_norm": 31.25, "learning_rate": 7.731426886792453e-06, "loss": 1.2975, "step": 1640 }, { "epoch": 0.47937245787332944, "grad_norm": 103.5, "learning_rate": 7.716686320754718e-06, "loss": 1.2854, "step": 1650 }, { "epoch": 0.4822777454968042, "grad_norm": 41.25, "learning_rate": 7.701945754716983e-06, "loss": 1.2613, "step": 1660 }, { "epoch": 0.4851830331202789, "grad_norm": 47.25, "learning_rate": 7.687205188679246e-06, "loss": 1.3469, "step": 1670 }, { "epoch": 0.48808832074375363, "grad_norm": 36.5, "learning_rate": 7.67246462264151e-06, "loss": 1.3195, "step": 1680 }, { "epoch": 0.49099360836722833, "grad_norm": 77.5, "learning_rate": 7.657724056603775e-06, "loss": 1.3431, "step": 1690 }, { "epoch": 0.4938988959907031, "grad_norm": 38.25, "learning_rate": 7.642983490566038e-06, "loss": 1.3423, "step": 1700 }, { "epoch": 0.4938988959907031, "eval_accuracy": 0.6300386526780785, "eval_f1": 0.6163155948087964, "eval_loss": 0.656129777431488, "eval_precision": 0.6398336533169895, "eval_recall": 0.6300386526780785, "eval_runtime": 17.7356, "eval_samples_per_second": 102.111, "eval_steps_per_second": 10.262, "step": 1700 }, { "epoch": 0.49680418361417783, "grad_norm": 63.0, "learning_rate": 7.628242924528303e-06, "loss": 1.2512, "step": 1710 }, { "epoch": 0.4997094712376525, "grad_norm": 26.875, "learning_rate": 7.613502358490566e-06, "loss": 1.2956, "step": 1720 }, { "epoch": 0.5026147588611273, "grad_norm": 52.25, "learning_rate": 7.598761792452831e-06, "loss": 1.4073, "step": 1730 }, { "epoch": 0.505520046484602, "grad_norm": 84.0, "learning_rate": 7.584021226415095e-06, "loss": 1.2805, "step": 1740 }, { "epoch": 0.5084253341080767, "grad_norm": 53.75, "learning_rate": 7.569280660377359e-06, "loss": 1.3359, "step": 1750 }, { "epoch": 0.5113306217315514, "grad_norm": 27.125, "learning_rate": 7.554540094339623e-06, "loss": 1.2798, "step": 1760 }, { "epoch": 0.5142359093550262, "grad_norm": 55.5, "learning_rate": 7.539799528301888e-06, "loss": 1.3784, "step": 1770 }, { "epoch": 0.5171411969785009, "grad_norm": 72.5, "learning_rate": 7.525058962264152e-06, "loss": 1.3408, "step": 1780 }, { "epoch": 0.5200464846019756, "grad_norm": 50.5, "learning_rate": 7.510318396226416e-06, "loss": 1.3178, "step": 1790 }, { "epoch": 0.5229517722254503, "grad_norm": 49.5, "learning_rate": 7.495577830188679e-06, "loss": 1.3394, "step": 1800 }, { "epoch": 0.5229517722254503, "eval_accuracy": 0.6178906681391496, "eval_f1": 0.6181047441367624, "eval_loss": 0.6562695503234863, "eval_precision": 0.61880686931104, "eval_recall": 0.6178906681391496, "eval_runtime": 23.6625, "eval_samples_per_second": 76.535, "eval_steps_per_second": 7.691, "step": 1800 }, { "epoch": 0.5258570598489251, "grad_norm": 34.75, "learning_rate": 7.480837264150944e-06, "loss": 1.2929, "step": 1810 }, { "epoch": 0.5287623474723998, "grad_norm": 45.5, "learning_rate": 7.466096698113208e-06, "loss": 1.3049, "step": 1820 }, { "epoch": 0.5316676350958744, "grad_norm": 48.75, "learning_rate": 7.451356132075472e-06, "loss": 1.2783, "step": 1830 }, { "epoch": 0.5345729227193492, "grad_norm": 83.0, "learning_rate": 7.436615566037736e-06, "loss": 1.2851, "step": 1840 }, { "epoch": 0.537478210342824, "grad_norm": 83.5, "learning_rate": 7.421875000000001e-06, "loss": 1.3286, "step": 1850 }, { "epoch": 0.5403834979662987, "grad_norm": 40.25, "learning_rate": 7.4071344339622655e-06, "loss": 1.3461, "step": 1860 }, { "epoch": 0.5432887855897733, "grad_norm": 38.75, "learning_rate": 7.3923938679245284e-06, "loss": 1.2203, "step": 1870 }, { "epoch": 0.5461940732132481, "grad_norm": 46.75, "learning_rate": 7.377653301886793e-06, "loss": 1.2875, "step": 1880 }, { "epoch": 0.5490993608367228, "grad_norm": 28.125, "learning_rate": 7.362912735849057e-06, "loss": 1.3201, "step": 1890 }, { "epoch": 0.5520046484601976, "grad_norm": 61.0, "learning_rate": 7.3481721698113216e-06, "loss": 1.2116, "step": 1900 }, { "epoch": 0.5520046484601976, "eval_accuracy": 0.6217559359469906, "eval_f1": 0.6041253940543542, "eval_loss": 0.6534772515296936, "eval_precision": 0.6342473819530806, "eval_recall": 0.6217559359469906, "eval_runtime": 22.658, "eval_samples_per_second": 79.928, "eval_steps_per_second": 8.032, "step": 1900 }, { "epoch": 0.5549099360836722, "grad_norm": 83.0, "learning_rate": 7.333431603773585e-06, "loss": 1.2667, "step": 1910 }, { "epoch": 0.557815223707147, "grad_norm": 42.5, "learning_rate": 7.31869103773585e-06, "loss": 1.277, "step": 1920 }, { "epoch": 0.5607205113306217, "grad_norm": 53.0, "learning_rate": 7.303950471698114e-06, "loss": 1.3001, "step": 1930 }, { "epoch": 0.5636257989540965, "grad_norm": 44.75, "learning_rate": 7.2892099056603785e-06, "loss": 1.2845, "step": 1940 }, { "epoch": 0.5665310865775712, "grad_norm": 47.0, "learning_rate": 7.2744693396226415e-06, "loss": 1.35, "step": 1950 }, { "epoch": 0.5694363742010459, "grad_norm": 58.75, "learning_rate": 7.259728773584906e-06, "loss": 1.3458, "step": 1960 }, { "epoch": 0.5723416618245206, "grad_norm": 42.0, "learning_rate": 7.24498820754717e-06, "loss": 1.3163, "step": 1970 }, { "epoch": 0.5752469494479954, "grad_norm": 81.0, "learning_rate": 7.230247641509435e-06, "loss": 1.3481, "step": 1980 }, { "epoch": 0.5781522370714701, "grad_norm": 57.75, "learning_rate": 7.215507075471698e-06, "loss": 1.1711, "step": 1990 }, { "epoch": 0.5810575246949448, "grad_norm": 33.5, "learning_rate": 7.200766509433963e-06, "loss": 1.3834, "step": 2000 }, { "epoch": 0.5810575246949448, "eval_accuracy": 0.6355604638321369, "eval_f1": 0.6342784909550675, "eval_loss": 0.650558352470398, "eval_precision": 0.6350165949733959, "eval_recall": 0.6355604638321369, "eval_runtime": 21.9148, "eval_samples_per_second": 82.638, "eval_steps_per_second": 8.305, "step": 2000 }, { "epoch": 0.5839628123184195, "grad_norm": 46.0, "learning_rate": 7.186025943396227e-06, "loss": 1.3424, "step": 2010 }, { "epoch": 0.5868680999418943, "grad_norm": 30.0, "learning_rate": 7.1712853773584915e-06, "loss": 1.2667, "step": 2020 }, { "epoch": 0.589773387565369, "grad_norm": 35.0, "learning_rate": 7.156544811320756e-06, "loss": 1.2963, "step": 2030 }, { "epoch": 0.5926786751888437, "grad_norm": 26.25, "learning_rate": 7.141804245283019e-06, "loss": 1.2806, "step": 2040 }, { "epoch": 0.5955839628123184, "grad_norm": 35.5, "learning_rate": 7.127063679245284e-06, "loss": 1.2645, "step": 2050 }, { "epoch": 0.5984892504357932, "grad_norm": 54.75, "learning_rate": 7.112323113207548e-06, "loss": 1.3138, "step": 2060 }, { "epoch": 0.6013945380592679, "grad_norm": 38.0, "learning_rate": 7.097582547169812e-06, "loss": 1.2769, "step": 2070 }, { "epoch": 0.6042998256827425, "grad_norm": 50.75, "learning_rate": 7.082841981132076e-06, "loss": 1.3197, "step": 2080 }, { "epoch": 0.6072051133062173, "grad_norm": 52.5, "learning_rate": 7.068101415094341e-06, "loss": 1.363, "step": 2090 }, { "epoch": 0.610110400929692, "grad_norm": 29.5, "learning_rate": 7.0533608490566045e-06, "loss": 1.2125, "step": 2100 }, { "epoch": 0.610110400929692, "eval_accuracy": 0.6300386526780785, "eval_f1": 0.621290753411316, "eval_loss": 0.6497138738632202, "eval_precision": 0.6345540000240397, "eval_recall": 0.6300386526780785, "eval_runtime": 27.7006, "eval_samples_per_second": 65.378, "eval_steps_per_second": 6.57, "step": 2100 }, { "epoch": 0.6130156885531668, "grad_norm": 56.25, "learning_rate": 7.038620283018869e-06, "loss": 1.3201, "step": 2110 }, { "epoch": 0.6159209761766414, "grad_norm": 57.0, "learning_rate": 7.023879716981132e-06, "loss": 1.444, "step": 2120 }, { "epoch": 0.6188262638001162, "grad_norm": 52.5, "learning_rate": 7.009139150943397e-06, "loss": 1.3887, "step": 2130 }, { "epoch": 0.6217315514235909, "grad_norm": 51.25, "learning_rate": 6.994398584905661e-06, "loss": 1.3022, "step": 2140 }, { "epoch": 0.6246368390470657, "grad_norm": 25.25, "learning_rate": 6.979658018867925e-06, "loss": 1.3132, "step": 2150 }, { "epoch": 0.6275421266705404, "grad_norm": 45.25, "learning_rate": 6.964917452830189e-06, "loss": 1.2461, "step": 2160 }, { "epoch": 0.6304474142940151, "grad_norm": 27.125, "learning_rate": 6.950176886792454e-06, "loss": 1.2606, "step": 2170 }, { "epoch": 0.6333527019174898, "grad_norm": 52.0, "learning_rate": 6.935436320754717e-06, "loss": 1.2901, "step": 2180 }, { "epoch": 0.6362579895409646, "grad_norm": 32.75, "learning_rate": 6.920695754716981e-06, "loss": 1.3803, "step": 2190 }, { "epoch": 0.6391632771644393, "grad_norm": 34.25, "learning_rate": 6.905955188679245e-06, "loss": 1.2805, "step": 2200 }, { "epoch": 0.6391632771644393, "eval_accuracy": 0.6311430149088901, "eval_f1": 0.6298455029666443, "eval_loss": 0.648868203163147, "eval_precision": 0.630553601839031, "eval_recall": 0.6311430149088901, "eval_runtime": 23.3959, "eval_samples_per_second": 77.407, "eval_steps_per_second": 7.779, "step": 2200 }, { "epoch": 0.642068564787914, "grad_norm": 45.25, "learning_rate": 6.89121462264151e-06, "loss": 1.3036, "step": 2210 }, { "epoch": 0.6449738524113887, "grad_norm": 54.0, "learning_rate": 6.8764740566037745e-06, "loss": 1.1919, "step": 2220 }, { "epoch": 0.6478791400348635, "grad_norm": 37.25, "learning_rate": 6.861733490566038e-06, "loss": 1.4039, "step": 2230 }, { "epoch": 0.6507844276583382, "grad_norm": 71.0, "learning_rate": 6.846992924528303e-06, "loss": 1.3153, "step": 2240 }, { "epoch": 0.6536897152818129, "grad_norm": 85.0, "learning_rate": 6.832252358490567e-06, "loss": 1.3351, "step": 2250 }, { "epoch": 0.6565950029052876, "grad_norm": 54.0, "learning_rate": 6.817511792452831e-06, "loss": 1.2602, "step": 2260 }, { "epoch": 0.6595002905287624, "grad_norm": 101.5, "learning_rate": 6.802771226415094e-06, "loss": 1.2256, "step": 2270 }, { "epoch": 0.6624055781522371, "grad_norm": 80.5, "learning_rate": 6.788030660377359e-06, "loss": 1.3267, "step": 2280 }, { "epoch": 0.6653108657757117, "grad_norm": 48.25, "learning_rate": 6.773290094339623e-06, "loss": 1.356, "step": 2290 }, { "epoch": 0.6682161533991865, "grad_norm": 33.25, "learning_rate": 6.7585495283018875e-06, "loss": 1.3723, "step": 2300 }, { "epoch": 0.6682161533991865, "eval_accuracy": 0.6322473771397018, "eval_f1": 0.6307364826527131, "eval_loss": 0.646483302116394, "eval_precision": 0.6317087816843026, "eval_recall": 0.6322473771397018, "eval_runtime": 17.5393, "eval_samples_per_second": 103.254, "eval_steps_per_second": 10.377, "step": 2300 }, { "epoch": 0.6711214410226612, "grad_norm": 100.0, "learning_rate": 6.743808962264151e-06, "loss": 1.3795, "step": 2310 }, { "epoch": 0.674026728646136, "grad_norm": 32.0, "learning_rate": 6.729068396226416e-06, "loss": 1.3164, "step": 2320 }, { "epoch": 0.6769320162696106, "grad_norm": 57.25, "learning_rate": 6.71432783018868e-06, "loss": 1.3095, "step": 2330 }, { "epoch": 0.6798373038930854, "grad_norm": 43.5, "learning_rate": 6.699587264150944e-06, "loss": 1.3305, "step": 2340 }, { "epoch": 0.6827425915165601, "grad_norm": 59.5, "learning_rate": 6.684846698113207e-06, "loss": 1.3211, "step": 2350 }, { "epoch": 0.6856478791400349, "grad_norm": 38.75, "learning_rate": 6.670106132075472e-06, "loss": 1.2753, "step": 2360 }, { "epoch": 0.6885531667635096, "grad_norm": 102.5, "learning_rate": 6.655365566037736e-06, "loss": 1.2837, "step": 2370 }, { "epoch": 0.6914584543869843, "grad_norm": 34.25, "learning_rate": 6.6406250000000005e-06, "loss": 1.2707, "step": 2380 }, { "epoch": 0.694363742010459, "grad_norm": 59.75, "learning_rate": 6.625884433962265e-06, "loss": 1.2989, "step": 2390 }, { "epoch": 0.6972690296339338, "grad_norm": 58.0, "learning_rate": 6.611143867924529e-06, "loss": 1.3785, "step": 2400 }, { "epoch": 0.6972690296339338, "eval_accuracy": 0.635008282716731, "eval_f1": 0.633801429811518, "eval_loss": 0.6450901627540588, "eval_precision": 0.6344474468599942, "eval_recall": 0.635008282716731, "eval_runtime": 27.9994, "eval_samples_per_second": 64.68, "eval_steps_per_second": 6.5, "step": 2400 }, { "epoch": 0.7001743172574085, "grad_norm": 62.25, "learning_rate": 6.596403301886794e-06, "loss": 1.3779, "step": 2410 }, { "epoch": 0.7030796048808832, "grad_norm": 35.25, "learning_rate": 6.581662735849057e-06, "loss": 1.3529, "step": 2420 }, { "epoch": 0.7059848925043579, "grad_norm": 34.25, "learning_rate": 6.566922169811322e-06, "loss": 1.2754, "step": 2430 }, { "epoch": 0.7088901801278327, "grad_norm": 52.0, "learning_rate": 6.552181603773585e-06, "loss": 1.3055, "step": 2440 }, { "epoch": 0.7117954677513074, "grad_norm": 43.0, "learning_rate": 6.53744103773585e-06, "loss": 1.2766, "step": 2450 }, { "epoch": 0.7147007553747821, "grad_norm": 28.75, "learning_rate": 6.5227004716981135e-06, "loss": 1.293, "step": 2460 }, { "epoch": 0.7176060429982568, "grad_norm": 33.25, "learning_rate": 6.507959905660378e-06, "loss": 1.324, "step": 2470 }, { "epoch": 0.7205113306217316, "grad_norm": 26.5, "learning_rate": 6.493219339622642e-06, "loss": 1.2177, "step": 2480 }, { "epoch": 0.7234166182452063, "grad_norm": 65.5, "learning_rate": 6.478478773584907e-06, "loss": 1.3648, "step": 2490 }, { "epoch": 0.726321905868681, "grad_norm": 58.0, "learning_rate": 6.4637382075471696e-06, "loss": 1.2685, "step": 2500 }, { "epoch": 0.726321905868681, "eval_accuracy": 0.6068470458310325, "eval_f1": 0.6061765727272004, "eval_loss": 0.6521552801132202, "eval_precision": 0.6118061263264858, "eval_recall": 0.6068470458310325, "eval_runtime": 23.9512, "eval_samples_per_second": 75.612, "eval_steps_per_second": 7.599, "step": 2500 }, { "epoch": 0.7292271934921557, "grad_norm": 73.5, "learning_rate": 6.448997641509434e-06, "loss": 1.2741, "step": 2510 }, { "epoch": 0.7321324811156305, "grad_norm": 25.125, "learning_rate": 6.434257075471698e-06, "loss": 1.3256, "step": 2520 }, { "epoch": 0.7350377687391052, "grad_norm": 48.5, "learning_rate": 6.419516509433963e-06, "loss": 1.3056, "step": 2530 }, { "epoch": 0.7379430563625798, "grad_norm": 41.0, "learning_rate": 6.4047759433962265e-06, "loss": 1.2686, "step": 2540 }, { "epoch": 0.7408483439860546, "grad_norm": 47.0, "learning_rate": 6.390035377358491e-06, "loss": 1.3235, "step": 2550 }, { "epoch": 0.7437536316095293, "grad_norm": 62.25, "learning_rate": 6.375294811320756e-06, "loss": 1.3393, "step": 2560 }, { "epoch": 0.7466589192330041, "grad_norm": 42.75, "learning_rate": 6.36055424528302e-06, "loss": 1.2951, "step": 2570 }, { "epoch": 0.7495642068564788, "grad_norm": 59.5, "learning_rate": 6.345813679245284e-06, "loss": 1.3177, "step": 2580 }, { "epoch": 0.7524694944799535, "grad_norm": 56.25, "learning_rate": 6.331073113207547e-06, "loss": 1.1857, "step": 2590 }, { "epoch": 0.7553747821034282, "grad_norm": 83.5, "learning_rate": 6.316332547169812e-06, "loss": 1.3168, "step": 2600 }, { "epoch": 0.7553747821034282, "eval_accuracy": 0.6267255659856433, "eval_f1": 0.6263898768518164, "eval_loss": 0.6455510854721069, "eval_precision": 0.626312724736393, "eval_recall": 0.6267255659856433, "eval_runtime": 22.9922, "eval_samples_per_second": 78.766, "eval_steps_per_second": 7.916, "step": 2600 }, { "epoch": 0.758280069726903, "grad_norm": 65.5, "learning_rate": 6.301591981132076e-06, "loss": 1.2172, "step": 2610 }, { "epoch": 0.7611853573503777, "grad_norm": 44.5, "learning_rate": 6.28685141509434e-06, "loss": 1.3155, "step": 2620 }, { "epoch": 0.7640906449738524, "grad_norm": 36.0, "learning_rate": 6.272110849056604e-06, "loss": 1.3665, "step": 2630 }, { "epoch": 0.7669959325973271, "grad_norm": 44.75, "learning_rate": 6.257370283018869e-06, "loss": 1.2973, "step": 2640 }, { "epoch": 0.7699012202208019, "grad_norm": 63.25, "learning_rate": 6.242629716981133e-06, "loss": 1.2979, "step": 2650 }, { "epoch": 0.7728065078442766, "grad_norm": 33.5, "learning_rate": 6.227889150943397e-06, "loss": 1.3267, "step": 2660 }, { "epoch": 0.7757117954677513, "grad_norm": 46.5, "learning_rate": 6.21314858490566e-06, "loss": 1.3511, "step": 2670 }, { "epoch": 0.778617083091226, "grad_norm": 39.0, "learning_rate": 6.198408018867925e-06, "loss": 1.3943, "step": 2680 }, { "epoch": 0.7815223707147008, "grad_norm": 34.5, "learning_rate": 6.183667452830189e-06, "loss": 1.2841, "step": 2690 }, { "epoch": 0.7844276583381755, "grad_norm": 41.5, "learning_rate": 6.168926886792453e-06, "loss": 1.3644, "step": 2700 }, { "epoch": 0.7844276583381755, "eval_accuracy": 0.6344561016013253, "eval_f1": 0.6324778464405518, "eval_loss": 0.643452525138855, "eval_precision": 0.6340971076510771, "eval_recall": 0.6344561016013253, "eval_runtime": 26.9644, "eval_samples_per_second": 67.163, "eval_steps_per_second": 6.75, "step": 2700 }, { "epoch": 0.7873329459616502, "grad_norm": 64.5, "learning_rate": 6.154186320754717e-06, "loss": 1.3015, "step": 2710 }, { "epoch": 0.7902382335851249, "grad_norm": 93.0, "learning_rate": 6.139445754716982e-06, "loss": 1.3085, "step": 2720 }, { "epoch": 0.7931435212085997, "grad_norm": 34.5, "learning_rate": 6.124705188679246e-06, "loss": 1.2709, "step": 2730 }, { "epoch": 0.7960488088320744, "grad_norm": 36.5, "learning_rate": 6.10996462264151e-06, "loss": 1.2619, "step": 2740 }, { "epoch": 0.798954096455549, "grad_norm": 58.75, "learning_rate": 6.095224056603775e-06, "loss": 1.2363, "step": 2750 }, { "epoch": 0.8018593840790238, "grad_norm": 54.75, "learning_rate": 6.080483490566038e-06, "loss": 1.3241, "step": 2760 }, { "epoch": 0.8047646717024985, "grad_norm": 62.0, "learning_rate": 6.0657429245283026e-06, "loss": 1.2894, "step": 2770 }, { "epoch": 0.8076699593259733, "grad_norm": 38.0, "learning_rate": 6.051002358490566e-06, "loss": 1.2915, "step": 2780 }, { "epoch": 0.810575246949448, "grad_norm": 29.25, "learning_rate": 6.036261792452831e-06, "loss": 1.3298, "step": 2790 }, { "epoch": 0.8134805345729227, "grad_norm": 44.25, "learning_rate": 6.021521226415095e-06, "loss": 1.2547, "step": 2800 }, { "epoch": 0.8134805345729227, "eval_accuracy": 0.6289342904472667, "eval_f1": 0.6291307580361674, "eval_loss": 0.6478989124298096, "eval_precision": 0.6305301619963926, "eval_recall": 0.6289342904472667, "eval_runtime": 19.0947, "eval_samples_per_second": 94.843, "eval_steps_per_second": 9.531, "step": 2800 }, { "epoch": 0.8163858221963974, "grad_norm": 64.5, "learning_rate": 6.0067806603773595e-06, "loss": 1.2978, "step": 2810 }, { "epoch": 0.8192911098198722, "grad_norm": 36.5, "learning_rate": 5.9920400943396225e-06, "loss": 1.3289, "step": 2820 }, { "epoch": 0.8221963974433469, "grad_norm": 78.0, "learning_rate": 5.977299528301887e-06, "loss": 1.2113, "step": 2830 }, { "epoch": 0.8251016850668216, "grad_norm": 48.0, "learning_rate": 5.962558962264151e-06, "loss": 1.3046, "step": 2840 }, { "epoch": 0.8280069726902963, "grad_norm": 46.25, "learning_rate": 5.9478183962264156e-06, "loss": 1.2813, "step": 2850 }, { "epoch": 0.8309122603137711, "grad_norm": 61.5, "learning_rate": 5.933077830188679e-06, "loss": 1.3428, "step": 2860 }, { "epoch": 0.8338175479372458, "grad_norm": 25.375, "learning_rate": 5.918337264150944e-06, "loss": 1.2289, "step": 2870 }, { "epoch": 0.8367228355607205, "grad_norm": 62.75, "learning_rate": 5.903596698113208e-06, "loss": 1.3327, "step": 2880 }, { "epoch": 0.8396281231841952, "grad_norm": 69.0, "learning_rate": 5.8888561320754725e-06, "loss": 1.3704, "step": 2890 }, { "epoch": 0.84253341080767, "grad_norm": 48.5, "learning_rate": 5.8741155660377355e-06, "loss": 1.2572, "step": 2900 }, { "epoch": 0.84253341080767, "eval_accuracy": 0.6311430149088901, "eval_f1": 0.6297924211046073, "eval_loss": 0.6452035307884216, "eval_precision": 0.6305609562115095, "eval_recall": 0.6311430149088901, "eval_runtime": 22.0587, "eval_samples_per_second": 82.099, "eval_steps_per_second": 8.251, "step": 2900 }, { "epoch": 0.8454386984311447, "grad_norm": 56.75, "learning_rate": 5.859375e-06, "loss": 1.324, "step": 2910 }, { "epoch": 0.8483439860546194, "grad_norm": 37.0, "learning_rate": 5.844634433962265e-06, "loss": 1.3031, "step": 2920 }, { "epoch": 0.8512492736780941, "grad_norm": 87.0, "learning_rate": 5.829893867924529e-06, "loss": 1.4083, "step": 2930 }, { "epoch": 0.8541545613015689, "grad_norm": 35.25, "learning_rate": 5.815153301886793e-06, "loss": 1.3622, "step": 2940 }, { "epoch": 0.8570598489250436, "grad_norm": 31.375, "learning_rate": 5.800412735849057e-06, "loss": 1.3026, "step": 2950 }, { "epoch": 0.8599651365485182, "grad_norm": 37.25, "learning_rate": 5.785672169811322e-06, "loss": 1.2841, "step": 2960 }, { "epoch": 0.862870424171993, "grad_norm": 41.5, "learning_rate": 5.7709316037735855e-06, "loss": 1.2886, "step": 2970 }, { "epoch": 0.8657757117954678, "grad_norm": 33.5, "learning_rate": 5.75619103773585e-06, "loss": 1.2448, "step": 2980 }, { "epoch": 0.8686809994189425, "grad_norm": 38.0, "learning_rate": 5.741450471698113e-06, "loss": 1.3723, "step": 2990 }, { "epoch": 0.8715862870424173, "grad_norm": 43.5, "learning_rate": 5.726709905660378e-06, "loss": 1.3656, "step": 3000 }, { "epoch": 0.8715862870424173, "eval_accuracy": 0.6200993926007731, "eval_f1": 0.6196755505318609, "eval_loss": 0.6481147408485413, "eval_precision": 0.624547308512698, "eval_recall": 0.6200993926007731, "eval_runtime": 22.5073, "eval_samples_per_second": 80.463, "eval_steps_per_second": 8.086, "step": 3000 }, { "epoch": 0.8744915746658919, "grad_norm": 64.0, "learning_rate": 5.711969339622642e-06, "loss": 1.2059, "step": 3010 }, { "epoch": 0.8773968622893666, "grad_norm": 37.5, "learning_rate": 5.697228773584906e-06, "loss": 1.4072, "step": 3020 }, { "epoch": 0.8803021499128414, "grad_norm": 54.75, "learning_rate": 5.68248820754717e-06, "loss": 1.3369, "step": 3030 }, { "epoch": 0.8832074375363161, "grad_norm": 59.25, "learning_rate": 5.667747641509435e-06, "loss": 1.3416, "step": 3040 }, { "epoch": 0.8861127251597908, "grad_norm": 38.5, "learning_rate": 5.6530070754716985e-06, "loss": 1.3036, "step": 3050 }, { "epoch": 0.8890180127832655, "grad_norm": 28.875, "learning_rate": 5.638266509433963e-06, "loss": 1.3368, "step": 3060 }, { "epoch": 0.8919233004067403, "grad_norm": 38.0, "learning_rate": 5.623525943396226e-06, "loss": 1.2894, "step": 3070 }, { "epoch": 0.894828588030215, "grad_norm": 35.75, "learning_rate": 5.608785377358491e-06, "loss": 1.3777, "step": 3080 }, { "epoch": 0.8977338756536897, "grad_norm": 31.25, "learning_rate": 5.5940448113207555e-06, "loss": 1.2222, "step": 3090 }, { "epoch": 0.9006391632771644, "grad_norm": 48.5, "learning_rate": 5.579304245283019e-06, "loss": 1.3222, "step": 3100 }, { "epoch": 0.9006391632771644, "eval_accuracy": 0.6327995582551077, "eval_f1": 0.6309066331655309, "eval_loss": 0.6468316316604614, "eval_precision": 0.6323790829666001, "eval_recall": 0.6327995582551077, "eval_runtime": 22.1011, "eval_samples_per_second": 81.942, "eval_steps_per_second": 8.235, "step": 3100 }, { "epoch": 0.9035444509006392, "grad_norm": 95.0, "learning_rate": 5.564563679245284e-06, "loss": 1.468, "step": 3110 }, { "epoch": 0.9064497385241139, "grad_norm": 65.0, "learning_rate": 5.549823113207548e-06, "loss": 1.3194, "step": 3120 }, { "epoch": 0.9093550261475886, "grad_norm": 46.0, "learning_rate": 5.535082547169812e-06, "loss": 1.2986, "step": 3130 }, { "epoch": 0.9122603137710633, "grad_norm": 57.25, "learning_rate": 5.520341981132075e-06, "loss": 1.2991, "step": 3140 }, { "epoch": 0.9151656013945381, "grad_norm": 45.0, "learning_rate": 5.505601415094341e-06, "loss": 1.2372, "step": 3150 }, { "epoch": 0.9180708890180128, "grad_norm": 47.25, "learning_rate": 5.490860849056604e-06, "loss": 1.3458, "step": 3160 }, { "epoch": 0.9209761766414875, "grad_norm": 49.0, "learning_rate": 5.4761202830188685e-06, "loss": 1.3117, "step": 3170 }, { "epoch": 0.9238814642649622, "grad_norm": 48.5, "learning_rate": 5.461379716981132e-06, "loss": 1.2655, "step": 3180 }, { "epoch": 0.926786751888437, "grad_norm": 33.75, "learning_rate": 5.446639150943397e-06, "loss": 1.2547, "step": 3190 }, { "epoch": 0.9296920395119117, "grad_norm": 54.75, "learning_rate": 5.431898584905661e-06, "loss": 1.2231, "step": 3200 }, { "epoch": 0.9296920395119117, "eval_accuracy": 0.6366648260629486, "eval_f1": 0.6312223269046636, "eval_loss": 0.6476677060127258, "eval_precision": 0.6387207626644001, "eval_recall": 0.6366648260629486, "eval_runtime": 22.0118, "eval_samples_per_second": 82.274, "eval_steps_per_second": 8.268, "step": 3200 }, { "epoch": 0.9325973271353865, "grad_norm": 43.5, "learning_rate": 5.417158018867925e-06, "loss": 1.2574, "step": 3210 }, { "epoch": 0.9355026147588611, "grad_norm": 37.0, "learning_rate": 5.402417452830188e-06, "loss": 1.2772, "step": 3220 }, { "epoch": 0.9384079023823358, "grad_norm": 38.75, "learning_rate": 5.387676886792453e-06, "loss": 1.2504, "step": 3230 }, { "epoch": 0.9413131900058106, "grad_norm": 66.0, "learning_rate": 5.372936320754717e-06, "loss": 1.3723, "step": 3240 }, { "epoch": 0.9442184776292853, "grad_norm": 110.0, "learning_rate": 5.3581957547169815e-06, "loss": 1.3505, "step": 3250 }, { "epoch": 0.94712376525276, "grad_norm": 42.0, "learning_rate": 5.343455188679245e-06, "loss": 1.2563, "step": 3260 }, { "epoch": 0.9500290528762347, "grad_norm": 50.0, "learning_rate": 5.32871462264151e-06, "loss": 1.2599, "step": 3270 }, { "epoch": 0.9529343404997095, "grad_norm": 46.5, "learning_rate": 5.313974056603775e-06, "loss": 1.4014, "step": 3280 }, { "epoch": 0.9558396281231842, "grad_norm": 49.5, "learning_rate": 5.299233490566038e-06, "loss": 1.2596, "step": 3290 }, { "epoch": 0.9587449157466589, "grad_norm": 46.75, "learning_rate": 5.284492924528303e-06, "loss": 1.3213, "step": 3300 }, { "epoch": 0.9587449157466589, "eval_accuracy": 0.6327995582551077, "eval_f1": 0.6278569180798685, "eval_loss": 0.6428092122077942, "eval_precision": 0.6342108267486769, "eval_recall": 0.6327995582551077, "eval_runtime": 21.9512, "eval_samples_per_second": 82.501, "eval_steps_per_second": 8.291, "step": 3300 }, { "epoch": 0.9616502033701336, "grad_norm": 69.0, "learning_rate": 5.269752358490566e-06, "loss": 1.2727, "step": 3310 }, { "epoch": 0.9645554909936084, "grad_norm": 62.75, "learning_rate": 5.255011792452831e-06, "loss": 1.2832, "step": 3320 }, { "epoch": 0.9674607786170831, "grad_norm": 45.25, "learning_rate": 5.2402712264150945e-06, "loss": 1.3935, "step": 3330 }, { "epoch": 0.9703660662405578, "grad_norm": 75.5, "learning_rate": 5.225530660377359e-06, "loss": 1.3141, "step": 3340 }, { "epoch": 0.9732713538640325, "grad_norm": 48.25, "learning_rate": 5.210790094339623e-06, "loss": 1.3165, "step": 3350 }, { "epoch": 0.9761766414875073, "grad_norm": 40.25, "learning_rate": 5.196049528301888e-06, "loss": 1.2481, "step": 3360 }, { "epoch": 0.979081929110982, "grad_norm": 99.5, "learning_rate": 5.181308962264151e-06, "loss": 1.3187, "step": 3370 }, { "epoch": 0.9819872167344567, "grad_norm": 27.125, "learning_rate": 5.166568396226416e-06, "loss": 1.293, "step": 3380 }, { "epoch": 0.9848925043579314, "grad_norm": 36.0, "learning_rate": 5.151827830188679e-06, "loss": 1.3806, "step": 3390 }, { "epoch": 0.9877977919814062, "grad_norm": 66.5, "learning_rate": 5.137087264150944e-06, "loss": 1.3186, "step": 3400 }, { "epoch": 0.9877977919814062, "eval_accuracy": 0.6294864715626726, "eval_f1": 0.6285987096192257, "eval_loss": 0.6412285566329956, "eval_precision": 0.6288758322854454, "eval_recall": 0.6294864715626726, "eval_runtime": 16.5438, "eval_samples_per_second": 109.467, "eval_steps_per_second": 11.001, "step": 3400 }, { "epoch": 0.9907030796048809, "grad_norm": 35.75, "learning_rate": 5.1223466981132075e-06, "loss": 1.2888, "step": 3410 }, { "epoch": 0.9936083672283557, "grad_norm": 29.0, "learning_rate": 5.107606132075472e-06, "loss": 1.2755, "step": 3420 }, { "epoch": 0.9965136548518303, "grad_norm": 39.0, "learning_rate": 5.092865566037736e-06, "loss": 1.187, "step": 3430 }, { "epoch": 0.999418942475305, "grad_norm": 69.5, "learning_rate": 5.078125000000001e-06, "loss": 1.2494, "step": 3440 }, { "epoch": 1.0023242300987798, "grad_norm": 37.25, "learning_rate": 5.063384433962265e-06, "loss": 1.3013, "step": 3450 }, { "epoch": 1.0052295177222546, "grad_norm": 38.0, "learning_rate": 5.048643867924528e-06, "loss": 1.2494, "step": 3460 }, { "epoch": 1.0081348053457293, "grad_norm": 94.5, "learning_rate": 5.033903301886794e-06, "loss": 1.2126, "step": 3470 }, { "epoch": 1.011040092969204, "grad_norm": 42.5, "learning_rate": 5.019162735849057e-06, "loss": 1.2814, "step": 3480 }, { "epoch": 1.0139453805926786, "grad_norm": 31.0, "learning_rate": 5.004422169811321e-06, "loss": 1.1684, "step": 3490 }, { "epoch": 1.0168506682161533, "grad_norm": 27.625, "learning_rate": 4.989681603773585e-06, "loss": 1.2539, "step": 3500 }, { "epoch": 1.0168506682161533, "eval_accuracy": 0.6256212037548315, "eval_f1": 0.6249995872703885, "eval_loss": 0.6409505605697632, "eval_precision": 0.6250521871698147, "eval_recall": 0.6256212037548315, "eval_runtime": 22.8239, "eval_samples_per_second": 79.347, "eval_steps_per_second": 7.974, "step": 3500 }, { "epoch": 1.019755955839628, "grad_norm": 48.5, "learning_rate": 4.974941037735849e-06, "loss": 1.3855, "step": 3510 }, { "epoch": 1.0226612434631028, "grad_norm": 63.5, "learning_rate": 4.960200471698114e-06, "loss": 1.2272, "step": 3520 }, { "epoch": 1.0255665310865776, "grad_norm": 67.0, "learning_rate": 4.945459905660378e-06, "loss": 1.2468, "step": 3530 }, { "epoch": 1.0284718187100523, "grad_norm": 48.5, "learning_rate": 4.930719339622642e-06, "loss": 1.2765, "step": 3540 }, { "epoch": 1.031377106333527, "grad_norm": 34.0, "learning_rate": 4.915978773584906e-06, "loss": 1.2537, "step": 3550 }, { "epoch": 1.0342823939570018, "grad_norm": 41.0, "learning_rate": 4.9012382075471706e-06, "loss": 1.2763, "step": 3560 }, { "epoch": 1.0371876815804764, "grad_norm": 47.25, "learning_rate": 4.886497641509434e-06, "loss": 1.211, "step": 3570 }, { "epoch": 1.0400929692039511, "grad_norm": 69.5, "learning_rate": 4.871757075471699e-06, "loss": 1.253, "step": 3580 }, { "epoch": 1.0429982568274259, "grad_norm": 51.5, "learning_rate": 4.857016509433963e-06, "loss": 1.198, "step": 3590 }, { "epoch": 1.0459035444509006, "grad_norm": 57.5, "learning_rate": 4.842275943396227e-06, "loss": 1.281, "step": 3600 }, { "epoch": 1.0459035444509006, "eval_accuracy": 0.631695196024296, "eval_f1": 0.6319043862620346, "eval_loss": 0.6441466212272644, "eval_precision": 0.6330383376944125, "eval_recall": 0.631695196024296, "eval_runtime": 22.3581, "eval_samples_per_second": 81.0, "eval_steps_per_second": 8.14, "step": 3600 }, { "epoch": 1.0488088320743754, "grad_norm": 41.5, "learning_rate": 4.827535377358491e-06, "loss": 1.238, "step": 3610 }, { "epoch": 1.0517141196978501, "grad_norm": 73.5, "learning_rate": 4.812794811320755e-06, "loss": 1.3438, "step": 3620 }, { "epoch": 1.0546194073213249, "grad_norm": 46.0, "learning_rate": 4.798054245283019e-06, "loss": 1.152, "step": 3630 }, { "epoch": 1.0575246949447996, "grad_norm": 42.75, "learning_rate": 4.7833136792452836e-06, "loss": 1.233, "step": 3640 }, { "epoch": 1.0604299825682744, "grad_norm": 39.25, "learning_rate": 4.768573113207547e-06, "loss": 1.258, "step": 3650 }, { "epoch": 1.063335270191749, "grad_norm": 82.5, "learning_rate": 4.753832547169811e-06, "loss": 1.2224, "step": 3660 }, { "epoch": 1.0662405578152236, "grad_norm": 32.25, "learning_rate": 4.739091981132076e-06, "loss": 1.2568, "step": 3670 }, { "epoch": 1.0691458454386984, "grad_norm": 69.5, "learning_rate": 4.72435141509434e-06, "loss": 1.2814, "step": 3680 }, { "epoch": 1.0720511330621731, "grad_norm": 44.25, "learning_rate": 4.709610849056604e-06, "loss": 1.348, "step": 3690 }, { "epoch": 1.074956420685648, "grad_norm": 40.0, "learning_rate": 4.694870283018868e-06, "loss": 1.1893, "step": 3700 }, { "epoch": 1.074956420685648, "eval_accuracy": 0.6355604638321369, "eval_f1": 0.6355406367530252, "eval_loss": 0.6429353952407837, "eval_precision": 0.6355223595138046, "eval_recall": 0.6355604638321369, "eval_runtime": 22.5961, "eval_samples_per_second": 80.147, "eval_steps_per_second": 8.054, "step": 3700 }, { "epoch": 1.0778617083091226, "grad_norm": 44.5, "learning_rate": 4.680129716981133e-06, "loss": 1.2232, "step": 3710 }, { "epoch": 1.0807669959325974, "grad_norm": 40.75, "learning_rate": 4.6653891509433966e-06, "loss": 1.1672, "step": 3720 }, { "epoch": 1.0836722835560721, "grad_norm": 40.5, "learning_rate": 4.650648584905661e-06, "loss": 1.2764, "step": 3730 }, { "epoch": 1.0865775711795467, "grad_norm": 36.0, "learning_rate": 4.635908018867925e-06, "loss": 1.2449, "step": 3740 }, { "epoch": 1.0894828588030214, "grad_norm": 69.0, "learning_rate": 4.621167452830189e-06, "loss": 1.2499, "step": 3750 }, { "epoch": 1.0923881464264962, "grad_norm": 39.5, "learning_rate": 4.6064268867924535e-06, "loss": 1.1284, "step": 3760 }, { "epoch": 1.095293434049971, "grad_norm": 29.0, "learning_rate": 4.591686320754717e-06, "loss": 1.1788, "step": 3770 }, { "epoch": 1.0981987216734457, "grad_norm": 28.25, "learning_rate": 4.576945754716982e-06, "loss": 1.2809, "step": 3780 }, { "epoch": 1.1011040092969204, "grad_norm": 39.75, "learning_rate": 4.562205188679246e-06, "loss": 1.2236, "step": 3790 }, { "epoch": 1.1040092969203952, "grad_norm": 55.75, "learning_rate": 4.54746462264151e-06, "loss": 1.2188, "step": 3800 }, { "epoch": 1.1040092969203952, "eval_accuracy": 0.6245168415240199, "eval_f1": 0.6229741864922596, "eval_loss": 0.6412906646728516, "eval_precision": 0.6238833289814343, "eval_recall": 0.6245168415240199, "eval_runtime": 21.6276, "eval_samples_per_second": 83.736, "eval_steps_per_second": 8.415, "step": 3800 }, { "epoch": 1.10691458454387, "grad_norm": 69.0, "learning_rate": 4.532724056603774e-06, "loss": 1.2715, "step": 3810 }, { "epoch": 1.1098198721673445, "grad_norm": 42.25, "learning_rate": 4.517983490566038e-06, "loss": 1.2385, "step": 3820 }, { "epoch": 1.1127251597908192, "grad_norm": 38.25, "learning_rate": 4.503242924528302e-06, "loss": 1.3215, "step": 3830 }, { "epoch": 1.115630447414294, "grad_norm": 51.25, "learning_rate": 4.4885023584905665e-06, "loss": 1.2579, "step": 3840 }, { "epoch": 1.1185357350377687, "grad_norm": 25.375, "learning_rate": 4.47376179245283e-06, "loss": 1.2701, "step": 3850 }, { "epoch": 1.1214410226612435, "grad_norm": 37.5, "learning_rate": 4.459021226415094e-06, "loss": 1.2051, "step": 3860 }, { "epoch": 1.1243463102847182, "grad_norm": 46.25, "learning_rate": 4.444280660377359e-06, "loss": 1.2782, "step": 3870 }, { "epoch": 1.127251597908193, "grad_norm": 48.0, "learning_rate": 4.429540094339623e-06, "loss": 1.2045, "step": 3880 }, { "epoch": 1.1301568855316677, "grad_norm": 58.0, "learning_rate": 4.414799528301887e-06, "loss": 1.2268, "step": 3890 }, { "epoch": 1.1330621731551425, "grad_norm": 33.25, "learning_rate": 4.400058962264152e-06, "loss": 1.2652, "step": 3900 }, { "epoch": 1.1330621731551425, "eval_accuracy": 0.6432909994478189, "eval_f1": 0.6421848255039291, "eval_loss": 0.6396148800849915, "eval_precision": 0.6427959924584501, "eval_recall": 0.6432909994478189, "eval_runtime": 17.5222, "eval_samples_per_second": 103.355, "eval_steps_per_second": 10.387, "step": 3900 }, { "epoch": 1.135967460778617, "grad_norm": 37.5, "learning_rate": 4.385318396226416e-06, "loss": 1.2721, "step": 3910 }, { "epoch": 1.1388727484020917, "grad_norm": 50.5, "learning_rate": 4.3705778301886795e-06, "loss": 1.1884, "step": 3920 }, { "epoch": 1.1417780360255665, "grad_norm": 97.5, "learning_rate": 4.355837264150944e-06, "loss": 1.197, "step": 3930 }, { "epoch": 1.1446833236490412, "grad_norm": 58.0, "learning_rate": 4.341096698113208e-06, "loss": 1.2079, "step": 3940 }, { "epoch": 1.147588611272516, "grad_norm": 43.75, "learning_rate": 4.326356132075472e-06, "loss": 1.2688, "step": 3950 }, { "epoch": 1.1504938988959907, "grad_norm": 57.5, "learning_rate": 4.3116155660377365e-06, "loss": 1.2103, "step": 3960 }, { "epoch": 1.1533991865194655, "grad_norm": 76.0, "learning_rate": 4.296875e-06, "loss": 1.2674, "step": 3970 }, { "epoch": 1.1563044741429402, "grad_norm": 90.0, "learning_rate": 4.282134433962264e-06, "loss": 1.2474, "step": 3980 }, { "epoch": 1.159209761766415, "grad_norm": 50.75, "learning_rate": 4.267393867924529e-06, "loss": 1.2753, "step": 3990 }, { "epoch": 1.1621150493898895, "grad_norm": 37.0, "learning_rate": 4.2526533018867925e-06, "loss": 1.2627, "step": 4000 }, { "epoch": 1.1621150493898895, "eval_accuracy": 0.631695196024296, "eval_f1": 0.63172355603383, "eval_loss": 0.640481173992157, "eval_precision": 0.6317553813298517, "eval_recall": 0.631695196024296, "eval_runtime": 21.5816, "eval_samples_per_second": 83.914, "eval_steps_per_second": 8.433, "step": 4000 }, { "epoch": 1.1650203370133643, "grad_norm": 51.0, "learning_rate": 4.237912735849057e-06, "loss": 1.3051, "step": 4010 }, { "epoch": 1.167925624636839, "grad_norm": 39.5, "learning_rate": 4.223172169811321e-06, "loss": 1.2549, "step": 4020 }, { "epoch": 1.1708309122603138, "grad_norm": 40.5, "learning_rate": 4.208431603773585e-06, "loss": 1.1884, "step": 4030 }, { "epoch": 1.1737361998837885, "grad_norm": 25.125, "learning_rate": 4.1936910377358495e-06, "loss": 1.1919, "step": 4040 }, { "epoch": 1.1766414875072633, "grad_norm": 65.5, "learning_rate": 4.178950471698113e-06, "loss": 1.2309, "step": 4050 }, { "epoch": 1.179546775130738, "grad_norm": 34.0, "learning_rate": 4.164209905660378e-06, "loss": 1.1595, "step": 4060 }, { "epoch": 1.1824520627542126, "grad_norm": 60.0, "learning_rate": 4.149469339622642e-06, "loss": 1.229, "step": 4070 }, { "epoch": 1.1853573503776873, "grad_norm": 35.75, "learning_rate": 4.134728773584906e-06, "loss": 1.237, "step": 4080 }, { "epoch": 1.188262638001162, "grad_norm": 43.0, "learning_rate": 4.11998820754717e-06, "loss": 1.2608, "step": 4090 }, { "epoch": 1.1911679256246368, "grad_norm": 95.0, "learning_rate": 4.105247641509435e-06, "loss": 1.2385, "step": 4100 }, { "epoch": 1.1911679256246368, "eval_accuracy": 0.6344561016013253, "eval_f1": 0.6330641107526459, "eval_loss": 0.6400484442710876, "eval_precision": 0.6339200503566697, "eval_recall": 0.6344561016013253, "eval_runtime": 21.1516, "eval_samples_per_second": 85.62, "eval_steps_per_second": 8.605, "step": 4100 }, { "epoch": 1.1940732132481116, "grad_norm": 60.0, "learning_rate": 4.090507075471699e-06, "loss": 1.2141, "step": 4110 }, { "epoch": 1.1969785008715863, "grad_norm": 73.0, "learning_rate": 4.0757665094339625e-06, "loss": 1.2815, "step": 4120 }, { "epoch": 1.199883788495061, "grad_norm": 31.25, "learning_rate": 4.061025943396227e-06, "loss": 1.2256, "step": 4130 }, { "epoch": 1.2027890761185358, "grad_norm": 63.0, "learning_rate": 4.046285377358491e-06, "loss": 1.3107, "step": 4140 }, { "epoch": 1.2056943637420106, "grad_norm": 40.0, "learning_rate": 4.031544811320755e-06, "loss": 1.24, "step": 4150 }, { "epoch": 1.208599651365485, "grad_norm": 57.5, "learning_rate": 4.016804245283019e-06, "loss": 1.2925, "step": 4160 }, { "epoch": 1.2115049389889598, "grad_norm": 32.0, "learning_rate": 4.002063679245283e-06, "loss": 1.2477, "step": 4170 }, { "epoch": 1.2144102266124346, "grad_norm": 35.75, "learning_rate": 3.987323113207547e-06, "loss": 1.2127, "step": 4180 }, { "epoch": 1.2173155142359093, "grad_norm": 46.25, "learning_rate": 3.972582547169812e-06, "loss": 1.2459, "step": 4190 }, { "epoch": 1.220220801859384, "grad_norm": 62.5, "learning_rate": 3.9578419811320755e-06, "loss": 1.2525, "step": 4200 }, { "epoch": 1.220220801859384, "eval_accuracy": 0.6366648260629486, "eval_f1": 0.6323882931612419, "eval_loss": 0.6401476263999939, "eval_precision": 0.6377675688323987, "eval_recall": 0.6366648260629486, "eval_runtime": 21.7359, "eval_samples_per_second": 83.318, "eval_steps_per_second": 8.373, "step": 4200 }, { "epoch": 1.2231260894828588, "grad_norm": 32.5, "learning_rate": 3.94310141509434e-06, "loss": 1.2414, "step": 4210 }, { "epoch": 1.2260313771063336, "grad_norm": 44.0, "learning_rate": 3.928360849056604e-06, "loss": 1.2214, "step": 4220 }, { "epoch": 1.2289366647298083, "grad_norm": 47.25, "learning_rate": 3.913620283018868e-06, "loss": 1.2093, "step": 4230 }, { "epoch": 1.231841952353283, "grad_norm": 28.375, "learning_rate": 3.898879716981132e-06, "loss": 1.1936, "step": 4240 }, { "epoch": 1.2347472399767576, "grad_norm": 51.25, "learning_rate": 3.884139150943397e-06, "loss": 1.2721, "step": 4250 }, { "epoch": 1.2376525276002324, "grad_norm": 33.75, "learning_rate": 3.869398584905661e-06, "loss": 1.2279, "step": 4260 }, { "epoch": 1.2405578152237071, "grad_norm": 43.75, "learning_rate": 3.854658018867925e-06, "loss": 1.2586, "step": 4270 }, { "epoch": 1.2434631028471819, "grad_norm": 51.25, "learning_rate": 3.839917452830189e-06, "loss": 1.1615, "step": 4280 }, { "epoch": 1.2463683904706566, "grad_norm": 69.0, "learning_rate": 3.825176886792453e-06, "loss": 1.2211, "step": 4290 }, { "epoch": 1.2492736780941314, "grad_norm": 56.75, "learning_rate": 3.8104363207547174e-06, "loss": 1.259, "step": 4300 }, { "epoch": 1.2492736780941314, "eval_accuracy": 0.6294864715626726, "eval_f1": 0.6256227167383146, "eval_loss": 0.6409916877746582, "eval_precision": 0.6299778534682698, "eval_recall": 0.6294864715626726, "eval_runtime": 22.2688, "eval_samples_per_second": 81.325, "eval_steps_per_second": 8.173, "step": 4300 }, { "epoch": 1.2521789657176061, "grad_norm": 41.25, "learning_rate": 3.7956957547169816e-06, "loss": 1.1971, "step": 4310 }, { "epoch": 1.2550842533410806, "grad_norm": 70.5, "learning_rate": 3.7809551886792454e-06, "loss": 1.1621, "step": 4320 }, { "epoch": 1.2579895409645556, "grad_norm": 31.0, "learning_rate": 3.7662146226415097e-06, "loss": 1.2817, "step": 4330 }, { "epoch": 1.2608948285880301, "grad_norm": 45.5, "learning_rate": 3.751474056603774e-06, "loss": 1.2521, "step": 4340 }, { "epoch": 1.263800116211505, "grad_norm": 47.75, "learning_rate": 3.736733490566038e-06, "loss": 1.2734, "step": 4350 }, { "epoch": 1.2667054038349796, "grad_norm": 48.25, "learning_rate": 3.721992924528302e-06, "loss": 1.2468, "step": 4360 }, { "epoch": 1.2696106914584544, "grad_norm": 33.25, "learning_rate": 3.707252358490566e-06, "loss": 1.1994, "step": 4370 }, { "epoch": 1.2725159790819291, "grad_norm": 45.25, "learning_rate": 3.6925117924528304e-06, "loss": 1.1964, "step": 4380 }, { "epoch": 1.275421266705404, "grad_norm": 93.0, "learning_rate": 3.6777712264150946e-06, "loss": 1.204, "step": 4390 }, { "epoch": 1.2783265543288787, "grad_norm": 35.5, "learning_rate": 3.6630306603773584e-06, "loss": 1.1888, "step": 4400 }, { "epoch": 1.2783265543288787, "eval_accuracy": 0.6327995582551077, "eval_f1": 0.6273511093523488, "eval_loss": 0.6400877833366394, "eval_precision": 0.6346113195792686, "eval_recall": 0.6327995582551077, "eval_runtime": 22.3949, "eval_samples_per_second": 80.867, "eval_steps_per_second": 8.127, "step": 4400 }, { "epoch": 1.2812318419523532, "grad_norm": 53.25, "learning_rate": 3.6482900943396227e-06, "loss": 1.2274, "step": 4410 }, { "epoch": 1.284137129575828, "grad_norm": 107.5, "learning_rate": 3.6335495283018873e-06, "loss": 1.2562, "step": 4420 }, { "epoch": 1.2870424171993027, "grad_norm": 40.75, "learning_rate": 3.6188089622641515e-06, "loss": 1.227, "step": 4430 }, { "epoch": 1.2899477048227774, "grad_norm": 60.25, "learning_rate": 3.6040683962264158e-06, "loss": 1.3077, "step": 4440 }, { "epoch": 1.2928529924462522, "grad_norm": 50.0, "learning_rate": 3.5893278301886796e-06, "loss": 1.2209, "step": 4450 }, { "epoch": 1.295758280069727, "grad_norm": 42.25, "learning_rate": 3.574587264150944e-06, "loss": 1.2701, "step": 4460 }, { "epoch": 1.2986635676932017, "grad_norm": 63.75, "learning_rate": 3.559846698113208e-06, "loss": 1.2894, "step": 4470 }, { "epoch": 1.3015688553166764, "grad_norm": 49.75, "learning_rate": 3.545106132075472e-06, "loss": 1.2274, "step": 4480 }, { "epoch": 1.3044741429401512, "grad_norm": 69.0, "learning_rate": 3.530365566037736e-06, "loss": 1.2479, "step": 4490 }, { "epoch": 1.3073794305636257, "grad_norm": 45.75, "learning_rate": 3.5156250000000003e-06, "loss": 1.2385, "step": 4500 }, { "epoch": 1.3073794305636257, "eval_accuracy": 0.6410822749861955, "eval_f1": 0.6406300387406191, "eval_loss": 0.6401379108428955, "eval_precision": 0.6406304536792782, "eval_recall": 0.6410822749861955, "eval_runtime": 17.5048, "eval_samples_per_second": 103.458, "eval_steps_per_second": 10.397, "step": 4500 }, { "epoch": 1.3102847181871005, "grad_norm": 51.25, "learning_rate": 3.5008844339622646e-06, "loss": 1.2097, "step": 4510 }, { "epoch": 1.3131900058105752, "grad_norm": 45.0, "learning_rate": 3.4861438679245284e-06, "loss": 1.2962, "step": 4520 }, { "epoch": 1.31609529343405, "grad_norm": 33.25, "learning_rate": 3.4714033018867926e-06, "loss": 1.2895, "step": 4530 }, { "epoch": 1.3190005810575247, "grad_norm": 41.0, "learning_rate": 3.456662735849057e-06, "loss": 1.2459, "step": 4540 }, { "epoch": 1.3219058686809995, "grad_norm": 38.5, "learning_rate": 3.441922169811321e-06, "loss": 1.2178, "step": 4550 }, { "epoch": 1.3248111563044742, "grad_norm": 39.0, "learning_rate": 3.427181603773585e-06, "loss": 1.1922, "step": 4560 }, { "epoch": 1.3277164439279487, "grad_norm": 63.25, "learning_rate": 3.412441037735849e-06, "loss": 1.2432, "step": 4570 }, { "epoch": 1.3306217315514237, "grad_norm": 57.5, "learning_rate": 3.3977004716981133e-06, "loss": 1.2522, "step": 4580 }, { "epoch": 1.3335270191748982, "grad_norm": 45.0, "learning_rate": 3.382959905660378e-06, "loss": 1.2405, "step": 4590 }, { "epoch": 1.336432306798373, "grad_norm": 49.75, "learning_rate": 3.3682193396226422e-06, "loss": 1.3357, "step": 4600 }, { "epoch": 1.336432306798373, "eval_accuracy": 0.6361126449475428, "eval_f1": 0.6358309002139024, "eval_loss": 0.6416363716125488, "eval_precision": 0.6357545053467629, "eval_recall": 0.6361126449475428, "eval_runtime": 21.4148, "eval_samples_per_second": 84.568, "eval_steps_per_second": 8.499, "step": 4600 }, { "epoch": 1.3393375944218477, "grad_norm": 36.75, "learning_rate": 3.353478773584906e-06, "loss": 1.1695, "step": 4610 }, { "epoch": 1.3422428820453225, "grad_norm": 44.0, "learning_rate": 3.3387382075471703e-06, "loss": 1.2259, "step": 4620 }, { "epoch": 1.3451481696687972, "grad_norm": 33.0, "learning_rate": 3.3239976415094345e-06, "loss": 1.2222, "step": 4630 }, { "epoch": 1.348053457292272, "grad_norm": 53.75, "learning_rate": 3.3092570754716983e-06, "loss": 1.1198, "step": 4640 }, { "epoch": 1.3509587449157467, "grad_norm": 39.25, "learning_rate": 3.2945165094339625e-06, "loss": 1.2235, "step": 4650 }, { "epoch": 1.3538640325392213, "grad_norm": 72.5, "learning_rate": 3.2797759433962268e-06, "loss": 1.2233, "step": 4660 }, { "epoch": 1.356769320162696, "grad_norm": 38.25, "learning_rate": 3.265035377358491e-06, "loss": 1.2763, "step": 4670 }, { "epoch": 1.3596746077861708, "grad_norm": 44.75, "learning_rate": 3.250294811320755e-06, "loss": 1.288, "step": 4680 }, { "epoch": 1.3625798954096455, "grad_norm": 41.0, "learning_rate": 3.235554245283019e-06, "loss": 1.2626, "step": 4690 }, { "epoch": 1.3654851830331203, "grad_norm": 80.0, "learning_rate": 3.2208136792452833e-06, "loss": 1.2926, "step": 4700 }, { "epoch": 1.3654851830331203, "eval_accuracy": 0.6372170071783545, "eval_f1": 0.6362579450911782, "eval_loss": 0.6420822739601135, "eval_precision": 0.6366568185144541, "eval_recall": 0.6372170071783545, "eval_runtime": 21.7934, "eval_samples_per_second": 83.099, "eval_steps_per_second": 8.351, "step": 4700 }, { "epoch": 1.368390470656595, "grad_norm": 43.25, "learning_rate": 3.2060731132075475e-06, "loss": 1.246, "step": 4710 }, { "epoch": 1.3712957582800698, "grad_norm": 39.5, "learning_rate": 3.1913325471698113e-06, "loss": 1.2012, "step": 4720 }, { "epoch": 1.3742010459035445, "grad_norm": 30.375, "learning_rate": 3.1765919811320755e-06, "loss": 1.2389, "step": 4730 }, { "epoch": 1.3771063335270193, "grad_norm": 42.75, "learning_rate": 3.1618514150943398e-06, "loss": 1.2816, "step": 4740 }, { "epoch": 1.3800116211504938, "grad_norm": 53.75, "learning_rate": 3.1471108490566036e-06, "loss": 1.2284, "step": 4750 }, { "epoch": 1.3829169087739686, "grad_norm": 60.0, "learning_rate": 3.132370283018868e-06, "loss": 1.2905, "step": 4760 }, { "epoch": 1.3858221963974433, "grad_norm": 58.75, "learning_rate": 3.1176297169811325e-06, "loss": 1.3362, "step": 4770 }, { "epoch": 1.388727484020918, "grad_norm": 73.5, "learning_rate": 3.1028891509433967e-06, "loss": 1.216, "step": 4780 }, { "epoch": 1.3916327716443928, "grad_norm": 34.0, "learning_rate": 3.088148584905661e-06, "loss": 1.1791, "step": 4790 }, { "epoch": 1.3945380592678676, "grad_norm": 47.5, "learning_rate": 3.0734080188679248e-06, "loss": 1.1769, "step": 4800 }, { "epoch": 1.3945380592678676, "eval_accuracy": 0.6294864715626726, "eval_f1": 0.6242980933823017, "eval_loss": 0.6427155137062073, "eval_precision": 0.6308927063460188, "eval_recall": 0.6294864715626726, "eval_runtime": 21.4015, "eval_samples_per_second": 84.62, "eval_steps_per_second": 8.504, "step": 4800 }, { "epoch": 1.3974433468913423, "grad_norm": 50.5, "learning_rate": 3.058667452830189e-06, "loss": 1.2116, "step": 4810 }, { "epoch": 1.4003486345148168, "grad_norm": 96.5, "learning_rate": 3.0439268867924532e-06, "loss": 1.266, "step": 4820 }, { "epoch": 1.4032539221382918, "grad_norm": 38.5, "learning_rate": 3.0291863207547174e-06, "loss": 1.2151, "step": 4830 }, { "epoch": 1.4061592097617663, "grad_norm": 41.0, "learning_rate": 3.0144457547169813e-06, "loss": 1.1757, "step": 4840 }, { "epoch": 1.409064497385241, "grad_norm": 79.0, "learning_rate": 2.9997051886792455e-06, "loss": 1.2929, "step": 4850 }, { "epoch": 1.4119697850087158, "grad_norm": 42.5, "learning_rate": 2.9849646226415097e-06, "loss": 1.1804, "step": 4860 }, { "epoch": 1.4148750726321906, "grad_norm": 39.75, "learning_rate": 2.970224056603774e-06, "loss": 1.2443, "step": 4870 }, { "epoch": 1.4177803602556653, "grad_norm": 45.25, "learning_rate": 2.9554834905660378e-06, "loss": 1.2595, "step": 4880 }, { "epoch": 1.42068564787914, "grad_norm": 35.0, "learning_rate": 2.940742924528302e-06, "loss": 1.2393, "step": 4890 }, { "epoch": 1.4235909355026148, "grad_norm": 50.25, "learning_rate": 2.9260023584905662e-06, "loss": 1.226, "step": 4900 }, { "epoch": 1.4235909355026148, "eval_accuracy": 0.6355604638321369, "eval_f1": 0.6343806261841474, "eval_loss": 0.6406518220901489, "eval_precision": 0.6350017249086662, "eval_recall": 0.6355604638321369, "eval_runtime": 20.0566, "eval_samples_per_second": 90.295, "eval_steps_per_second": 9.074, "step": 4900 }, { "epoch": 1.4264962231260894, "grad_norm": 61.75, "learning_rate": 2.9112617924528305e-06, "loss": 1.2209, "step": 4910 }, { "epoch": 1.4294015107495643, "grad_norm": 65.0, "learning_rate": 2.8965212264150943e-06, "loss": 1.2144, "step": 4920 }, { "epoch": 1.4323067983730389, "grad_norm": 175.0, "learning_rate": 2.8817806603773585e-06, "loss": 1.3651, "step": 4930 }, { "epoch": 1.4352120859965136, "grad_norm": 43.5, "learning_rate": 2.8670400943396227e-06, "loss": 1.2507, "step": 4940 }, { "epoch": 1.4381173736199884, "grad_norm": 48.75, "learning_rate": 2.8522995283018874e-06, "loss": 1.232, "step": 4950 }, { "epoch": 1.4410226612434631, "grad_norm": 86.0, "learning_rate": 2.837558962264151e-06, "loss": 1.2307, "step": 4960 }, { "epoch": 1.4439279488669379, "grad_norm": 64.5, "learning_rate": 2.8228183962264154e-06, "loss": 1.1689, "step": 4970 }, { "epoch": 1.4468332364904126, "grad_norm": 42.75, "learning_rate": 2.8080778301886797e-06, "loss": 1.3197, "step": 4980 }, { "epoch": 1.4497385241138874, "grad_norm": 30.5, "learning_rate": 2.793337264150944e-06, "loss": 1.2356, "step": 4990 }, { "epoch": 1.452643811737362, "grad_norm": 57.25, "learning_rate": 2.7785966981132077e-06, "loss": 1.185, "step": 5000 }, { "epoch": 1.452643811737362, "eval_accuracy": 0.6355604638321369, "eval_f1": 0.6347092433340812, "eval_loss": 0.6409675478935242, "eval_precision": 0.634991904463308, "eval_recall": 0.6355604638321369, "eval_runtime": 21.5778, "eval_samples_per_second": 83.929, "eval_steps_per_second": 8.435, "step": 5000 }, { "epoch": 1.4555490993608367, "grad_norm": 32.75, "learning_rate": 2.763856132075472e-06, "loss": 1.2552, "step": 5010 }, { "epoch": 1.4584543869843114, "grad_norm": 44.5, "learning_rate": 2.749115566037736e-06, "loss": 1.1698, "step": 5020 }, { "epoch": 1.4613596746077862, "grad_norm": 47.5, "learning_rate": 2.7343750000000004e-06, "loss": 1.2106, "step": 5030 }, { "epoch": 1.464264962231261, "grad_norm": 47.75, "learning_rate": 2.719634433962264e-06, "loss": 1.2006, "step": 5040 }, { "epoch": 1.4671702498547357, "grad_norm": 52.25, "learning_rate": 2.7048938679245284e-06, "loss": 1.2336, "step": 5050 }, { "epoch": 1.4700755374782104, "grad_norm": 74.5, "learning_rate": 2.6901533018867927e-06, "loss": 1.2123, "step": 5060 }, { "epoch": 1.472980825101685, "grad_norm": 35.0, "learning_rate": 2.675412735849057e-06, "loss": 1.289, "step": 5070 }, { "epoch": 1.47588611272516, "grad_norm": 73.0, "learning_rate": 2.6606721698113207e-06, "loss": 1.2429, "step": 5080 }, { "epoch": 1.4787914003486344, "grad_norm": 53.0, "learning_rate": 2.645931603773585e-06, "loss": 1.2213, "step": 5090 }, { "epoch": 1.4816966879721092, "grad_norm": 59.0, "learning_rate": 2.631191037735849e-06, "loss": 1.2239, "step": 5100 }, { "epoch": 1.4816966879721092, "eval_accuracy": 0.6339039204859194, "eval_f1": 0.6286765449836439, "eval_loss": 0.643553614616394, "eval_precision": 0.6356054506308846, "eval_recall": 0.6339039204859194, "eval_runtime": 16.6265, "eval_samples_per_second": 108.923, "eval_steps_per_second": 10.946, "step": 5100 }, { "epoch": 1.484601975595584, "grad_norm": 41.75, "learning_rate": 2.616450471698113e-06, "loss": 1.3158, "step": 5110 }, { "epoch": 1.4875072632190587, "grad_norm": 45.5, "learning_rate": 2.6017099056603776e-06, "loss": 1.2863, "step": 5120 }, { "epoch": 1.4904125508425334, "grad_norm": 35.5, "learning_rate": 2.586969339622642e-06, "loss": 1.2767, "step": 5130 }, { "epoch": 1.4933178384660082, "grad_norm": 55.0, "learning_rate": 2.572228773584906e-06, "loss": 1.2172, "step": 5140 }, { "epoch": 1.496223126089483, "grad_norm": 43.0, "learning_rate": 2.5574882075471703e-06, "loss": 1.2822, "step": 5150 }, { "epoch": 1.4991284137129575, "grad_norm": 37.25, "learning_rate": 2.542747641509434e-06, "loss": 1.2316, "step": 5160 }, { "epoch": 1.5020337013364324, "grad_norm": 71.0, "learning_rate": 2.5280070754716984e-06, "loss": 1.207, "step": 5170 }, { "epoch": 1.504938988959907, "grad_norm": 57.5, "learning_rate": 2.5132665094339626e-06, "loss": 1.2247, "step": 5180 }, { "epoch": 1.5078442765833817, "grad_norm": 42.75, "learning_rate": 2.498525943396227e-06, "loss": 1.2729, "step": 5190 }, { "epoch": 1.5107495642068565, "grad_norm": 75.0, "learning_rate": 2.4837853773584906e-06, "loss": 1.1602, "step": 5200 }, { "epoch": 1.5107495642068565, "eval_accuracy": 0.635008282716731, "eval_f1": 0.6342624572051594, "eval_loss": 0.6412646770477295, "eval_precision": 0.634448953777689, "eval_recall": 0.635008282716731, "eval_runtime": 22.4012, "eval_samples_per_second": 80.844, "eval_steps_per_second": 8.125, "step": 5200 }, { "epoch": 1.5136548518303312, "grad_norm": 24.875, "learning_rate": 2.469044811320755e-06, "loss": 1.2224, "step": 5210 }, { "epoch": 1.516560139453806, "grad_norm": 46.25, "learning_rate": 2.454304245283019e-06, "loss": 1.2277, "step": 5220 }, { "epoch": 1.5194654270772805, "grad_norm": 49.25, "learning_rate": 2.4395636792452833e-06, "loss": 1.3351, "step": 5230 }, { "epoch": 1.5223707147007555, "grad_norm": 48.25, "learning_rate": 2.424823113207547e-06, "loss": 1.2759, "step": 5240 }, { "epoch": 1.52527600232423, "grad_norm": 43.75, "learning_rate": 2.4100825471698114e-06, "loss": 1.205, "step": 5250 }, { "epoch": 1.528181289947705, "grad_norm": 42.5, "learning_rate": 2.3953419811320756e-06, "loss": 1.2956, "step": 5260 }, { "epoch": 1.5310865775711795, "grad_norm": 48.25, "learning_rate": 2.38060141509434e-06, "loss": 1.2831, "step": 5270 }, { "epoch": 1.5339918651946542, "grad_norm": 73.5, "learning_rate": 2.365860849056604e-06, "loss": 1.1228, "step": 5280 }, { "epoch": 1.536897152818129, "grad_norm": 88.0, "learning_rate": 2.3511202830188683e-06, "loss": 1.2236, "step": 5290 }, { "epoch": 1.5398024404416037, "grad_norm": 46.25, "learning_rate": 2.336379716981132e-06, "loss": 1.2955, "step": 5300 }, { "epoch": 1.5398024404416037, "eval_accuracy": 0.6339039204859194, "eval_f1": 0.6328417856318803, "eval_loss": 0.6400341391563416, "eval_precision": 0.6333204074415757, "eval_recall": 0.6339039204859194, "eval_runtime": 22.3787, "eval_samples_per_second": 80.925, "eval_steps_per_second": 8.133, "step": 5300 }, { "epoch": 1.5427077280650785, "grad_norm": 83.0, "learning_rate": 2.3216391509433964e-06, "loss": 1.2296, "step": 5310 }, { "epoch": 1.545613015688553, "grad_norm": 39.75, "learning_rate": 2.3068985849056606e-06, "loss": 1.2541, "step": 5320 }, { "epoch": 1.548518303312028, "grad_norm": 49.75, "learning_rate": 2.2921580188679244e-06, "loss": 1.2598, "step": 5330 }, { "epoch": 1.5514235909355025, "grad_norm": 55.5, "learning_rate": 2.277417452830189e-06, "loss": 1.1931, "step": 5340 }, { "epoch": 1.5543288785589775, "grad_norm": 49.0, "learning_rate": 2.2626768867924533e-06, "loss": 1.2074, "step": 5350 }, { "epoch": 1.557234166182452, "grad_norm": 55.75, "learning_rate": 2.247936320754717e-06, "loss": 1.271, "step": 5360 }, { "epoch": 1.5601394538059268, "grad_norm": 56.25, "learning_rate": 2.2331957547169813e-06, "loss": 1.3754, "step": 5370 }, { "epoch": 1.5630447414294015, "grad_norm": 61.25, "learning_rate": 2.2184551886792456e-06, "loss": 1.3038, "step": 5380 }, { "epoch": 1.5659500290528763, "grad_norm": 85.0, "learning_rate": 2.20371462264151e-06, "loss": 1.2226, "step": 5390 }, { "epoch": 1.568855316676351, "grad_norm": 51.25, "learning_rate": 2.1889740566037736e-06, "loss": 1.1983, "step": 5400 }, { "epoch": 1.568855316676351, "eval_accuracy": 0.6361126449475428, "eval_f1": 0.6339171740406839, "eval_loss": 0.6405659317970276, "eval_precision": 0.635879225143538, "eval_recall": 0.6361126449475428, "eval_runtime": 23.9319, "eval_samples_per_second": 75.673, "eval_steps_per_second": 7.605, "step": 5400 }, { "epoch": 1.5717606042998256, "grad_norm": 53.0, "learning_rate": 2.174233490566038e-06, "loss": 1.2049, "step": 5410 }, { "epoch": 1.5746658919233005, "grad_norm": 42.5, "learning_rate": 2.159492924528302e-06, "loss": 1.1668, "step": 5420 }, { "epoch": 1.577571179546775, "grad_norm": 81.0, "learning_rate": 2.1447523584905663e-06, "loss": 1.2764, "step": 5430 }, { "epoch": 1.5804764671702498, "grad_norm": 34.25, "learning_rate": 2.1300117924528305e-06, "loss": 1.2099, "step": 5440 }, { "epoch": 1.5833817547937246, "grad_norm": 32.0, "learning_rate": 2.1152712264150948e-06, "loss": 1.2476, "step": 5450 }, { "epoch": 1.5862870424171993, "grad_norm": 46.5, "learning_rate": 2.1005306603773586e-06, "loss": 1.2006, "step": 5460 }, { "epoch": 1.589192330040674, "grad_norm": 56.75, "learning_rate": 2.085790094339623e-06, "loss": 1.2122, "step": 5470 }, { "epoch": 1.5920976176641486, "grad_norm": 40.5, "learning_rate": 2.071049528301887e-06, "loss": 1.2338, "step": 5480 }, { "epoch": 1.5950029052876236, "grad_norm": 74.5, "learning_rate": 2.0563089622641513e-06, "loss": 1.2353, "step": 5490 }, { "epoch": 1.597908192911098, "grad_norm": 35.75, "learning_rate": 2.041568396226415e-06, "loss": 1.1418, "step": 5500 }, { "epoch": 1.597908192911098, "eval_accuracy": 0.639425731639978, "eval_f1": 0.637184135032376, "eval_loss": 0.6405666470527649, "eval_precision": 0.639288173918868, "eval_recall": 0.639425731639978, "eval_runtime": 21.7694, "eval_samples_per_second": 83.19, "eval_steps_per_second": 8.36, "step": 5500 } ], "logging_steps": 10, "max_steps": 6884, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9676824584855552e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }