{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9979661016949153,
  "eval_steps": 500,
  "global_step": 276,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003615819209039548,
      "grad_norm": 14.63072256054265,
      "learning_rate": 0.0,
      "loss": 1.4293,
      "step": 1
    },
    {
      "epoch": 0.007231638418079096,
      "grad_norm": 13.703421994615557,
      "learning_rate": 1.7857142857142858e-07,
      "loss": 1.4193,
      "step": 2
    },
    {
      "epoch": 0.010847457627118645,
      "grad_norm": 13.716616968172858,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 1.3691,
      "step": 3
    },
    {
      "epoch": 0.014463276836158192,
      "grad_norm": 13.81492601698403,
      "learning_rate": 5.357142857142857e-07,
      "loss": 1.4337,
      "step": 4
    },
    {
      "epoch": 0.01807909604519774,
      "grad_norm": 13.013807610008277,
      "learning_rate": 7.142857142857143e-07,
      "loss": 1.335,
      "step": 5
    },
    {
      "epoch": 0.02169491525423729,
      "grad_norm": 13.316675508543259,
      "learning_rate": 8.928571428571429e-07,
      "loss": 1.4104,
      "step": 6
    },
    {
      "epoch": 0.025310734463276835,
      "grad_norm": 12.54293399835329,
      "learning_rate": 1.0714285714285714e-06,
      "loss": 1.3364,
      "step": 7
    },
    {
      "epoch": 0.028926553672316384,
      "grad_norm": 11.706383970770567,
      "learning_rate": 1.25e-06,
      "loss": 1.3289,
      "step": 8
    },
    {
      "epoch": 0.03254237288135593,
      "grad_norm": 9.135561727877839,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 1.2395,
      "step": 9
    },
    {
      "epoch": 0.03615819209039548,
      "grad_norm": 8.24371141549115,
      "learning_rate": 1.6071428571428574e-06,
      "loss": 1.2597,
      "step": 10
    },
    {
      "epoch": 0.03977401129943503,
      "grad_norm": 5.641164800490142,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 1.1323,
      "step": 11
    },
    {
      "epoch": 0.04338983050847458,
      "grad_norm": 5.154352679322036,
      "learning_rate": 1.9642857142857144e-06,
      "loss": 1.1555,
      "step": 12
    },
    {
      "epoch": 0.04700564971751412,
      "grad_norm": 4.902918571287102,
      "learning_rate": 2.1428571428571427e-06,
      "loss": 1.1208,
      "step": 13
    },
    {
      "epoch": 0.05062146892655367,
      "grad_norm": 4.651473817511105,
      "learning_rate": 2.321428571428572e-06,
      "loss": 1.1211,
      "step": 14
    },
    {
      "epoch": 0.05423728813559322,
      "grad_norm": 7.855915558603105,
      "learning_rate": 2.5e-06,
      "loss": 1.0921,
      "step": 15
    },
    {
      "epoch": 0.05785310734463277,
      "grad_norm": 9.766906290977035,
      "learning_rate": 2.6785714285714285e-06,
      "loss": 1.0519,
      "step": 16
    },
    {
      "epoch": 0.061468926553672316,
      "grad_norm": 7.745681824751476,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 1.0559,
      "step": 17
    },
    {
      "epoch": 0.06508474576271187,
      "grad_norm": 4.81594013217655,
      "learning_rate": 3.0357142857142856e-06,
      "loss": 1.0629,
      "step": 18
    },
    {
      "epoch": 0.06870056497175141,
      "grad_norm": 4.091417378072179,
      "learning_rate": 3.2142857142857147e-06,
      "loss": 1.0244,
      "step": 19
    },
    {
      "epoch": 0.07231638418079096,
      "grad_norm": 2.8101858455670237,
      "learning_rate": 3.3928571428571435e-06,
      "loss": 0.9697,
      "step": 20
    },
    {
      "epoch": 0.0759322033898305,
      "grad_norm": 3.180728091735543,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 0.9601,
      "step": 21
    },
    {
      "epoch": 0.07954802259887006,
      "grad_norm": 3.6493251911677658,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.9409,
      "step": 22
    },
    {
      "epoch": 0.0831638418079096,
      "grad_norm": 3.415956781592975,
      "learning_rate": 3.928571428571429e-06,
      "loss": 0.9486,
      "step": 23
    },
    {
      "epoch": 0.08677966101694916,
      "grad_norm": 3.0472462438826,
      "learning_rate": 4.107142857142857e-06,
      "loss": 0.8874,
      "step": 24
    },
    {
      "epoch": 0.0903954802259887,
      "grad_norm": 2.264144754905512,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 0.8994,
      "step": 25
    },
    {
      "epoch": 0.09401129943502824,
      "grad_norm": 2.191754581281806,
      "learning_rate": 4.464285714285715e-06,
      "loss": 0.8975,
      "step": 26
    },
    {
      "epoch": 0.0976271186440678,
      "grad_norm": 2.3668184420746385,
      "learning_rate": 4.642857142857144e-06,
      "loss": 0.8581,
      "step": 27
    },
    {
      "epoch": 0.10124293785310734,
      "grad_norm": 2.7515269578252193,
      "learning_rate": 4.821428571428572e-06,
      "loss": 0.8423,
      "step": 28
    },
    {
      "epoch": 0.1048587570621469,
      "grad_norm": 2.3735525312849837,
      "learning_rate": 5e-06,
      "loss": 0.8205,
      "step": 29
    },
    {
      "epoch": 0.10847457627118644,
      "grad_norm": 2.032163014842967,
      "learning_rate": 4.999799414013322e-06,
      "loss": 0.8754,
      "step": 30
    },
    {
      "epoch": 0.112090395480226,
      "grad_norm": 2.0590504400576997,
      "learning_rate": 4.999197688241076e-06,
      "loss": 0.8267,
      "step": 31
    },
    {
      "epoch": 0.11570621468926554,
      "grad_norm": 2.092420106610846,
      "learning_rate": 4.998194919241471e-06,
      "loss": 0.8312,
      "step": 32
    },
    {
      "epoch": 0.11932203389830509,
      "grad_norm": 1.9381311525751297,
      "learning_rate": 4.996791267927632e-06,
      "loss": 0.7878,
      "step": 33
    },
    {
      "epoch": 0.12293785310734463,
      "grad_norm": 1.7374506458276127,
      "learning_rate": 4.994986959541788e-06,
      "loss": 0.7816,
      "step": 34
    },
    {
      "epoch": 0.12655367231638417,
      "grad_norm": 2.0807346358096166,
      "learning_rate": 4.9927822836191185e-06,
      "loss": 0.7818,
      "step": 35
    },
    {
      "epoch": 0.13016949152542373,
      "grad_norm": 1.73060241181151,
      "learning_rate": 4.990177593941303e-06,
      "loss": 0.7674,
      "step": 36
    },
    {
      "epoch": 0.1337853107344633,
      "grad_norm": 1.7060389302256593,
      "learning_rate": 4.987173308479738e-06,
      "loss": 0.7652,
      "step": 37
    },
    {
      "epoch": 0.13740112994350281,
      "grad_norm": 1.6025020216781503,
      "learning_rate": 4.9837699093284765e-06,
      "loss": 0.7457,
      "step": 38
    },
    {
      "epoch": 0.14101694915254237,
      "grad_norm": 1.7291972266390814,
      "learning_rate": 4.9799679426268575e-06,
      "loss": 0.7943,
      "step": 39
    },
    {
      "epoch": 0.14463276836158193,
      "grad_norm": 1.6572877077102874,
      "learning_rate": 4.975768018471877e-06,
      "loss": 0.7815,
      "step": 40
    },
    {
      "epoch": 0.14824858757062148,
      "grad_norm": 1.5680833894820443,
      "learning_rate": 4.971170810820279e-06,
      "loss": 0.7557,
      "step": 41
    },
    {
      "epoch": 0.151864406779661,
      "grad_norm": 1.4822606907798626,
      "learning_rate": 4.966177057380409e-06,
      "loss": 0.7561,
      "step": 42
    },
    {
      "epoch": 0.15548022598870057,
      "grad_norm": 1.6178903613745546,
      "learning_rate": 4.960787559493836e-06,
      "loss": 0.7474,
      "step": 43
    },
    {
      "epoch": 0.15909604519774012,
      "grad_norm": 1.5096967013865987,
      "learning_rate": 4.955003182006761e-06,
      "loss": 0.716,
      "step": 44
    },
    {
      "epoch": 0.16271186440677965,
      "grad_norm": 1.4671027619246142,
      "learning_rate": 4.948824853131237e-06,
      "loss": 0.7353,
      "step": 45
    },
    {
      "epoch": 0.1663276836158192,
      "grad_norm": 1.4778933913112275,
      "learning_rate": 4.942253564296217e-06,
      "loss": 0.7358,
      "step": 46
    },
    {
      "epoch": 0.16994350282485876,
      "grad_norm": 1.4672254672280391,
      "learning_rate": 4.935290369988468e-06,
      "loss": 0.7419,
      "step": 47
    },
    {
      "epoch": 0.17355932203389832,
      "grad_norm": 1.4415543659264245,
      "learning_rate": 4.927936387583348e-06,
      "loss": 0.748,
      "step": 48
    },
    {
      "epoch": 0.17717514124293784,
      "grad_norm": 1.3940953807201077,
      "learning_rate": 4.920192797165511e-06,
      "loss": 0.7571,
      "step": 49
    },
    {
      "epoch": 0.1807909604519774,
      "grad_norm": 1.4519222988783331,
      "learning_rate": 4.912060841339536e-06,
      "loss": 0.7127,
      "step": 50
    },
    {
      "epoch": 0.18440677966101696,
      "grad_norm": 1.4865547512367276,
      "learning_rate": 4.9035418250305314e-06,
      "loss": 0.7272,
      "step": 51
    },
    {
      "epoch": 0.18802259887005648,
      "grad_norm": 1.5008090646455723,
      "learning_rate": 4.894637115274728e-06,
      "loss": 0.7258,
      "step": 52
    },
    {
      "epoch": 0.19163841807909604,
      "grad_norm": 1.7037112279423843,
      "learning_rate": 4.8853481410001225e-06,
      "loss": 0.7316,
      "step": 53
    },
    {
      "epoch": 0.1952542372881356,
      "grad_norm": 1.440963552161302,
      "learning_rate": 4.875676392797169e-06,
      "loss": 0.7551,
      "step": 54
    },
    {
      "epoch": 0.19887005649717515,
      "grad_norm": 1.440064113753515,
      "learning_rate": 4.865623422679593e-06,
      "loss": 0.7446,
      "step": 55
    },
    {
      "epoch": 0.20248587570621468,
      "grad_norm": 1.6503815942075044,
      "learning_rate": 4.855190843835338e-06,
      "loss": 0.6955,
      "step": 56
    },
    {
      "epoch": 0.20610169491525424,
      "grad_norm": 1.3995241998208656,
      "learning_rate": 4.844380330367701e-06,
      "loss": 0.7214,
      "step": 57
    },
    {
      "epoch": 0.2097175141242938,
      "grad_norm": 1.461439825228087,
      "learning_rate": 4.833193617026692e-06,
      "loss": 0.7386,
      "step": 58
    },
    {
      "epoch": 0.21333333333333335,
      "grad_norm": 1.5840510779057535,
      "learning_rate": 4.821632498930656e-06,
      "loss": 0.7156,
      "step": 59
    },
    {
      "epoch": 0.21694915254237288,
      "grad_norm": 1.5720479925852917,
      "learning_rate": 4.809698831278217e-06,
      "loss": 0.6961,
      "step": 60
    },
    {
      "epoch": 0.22056497175141243,
      "grad_norm": 1.6408543211624975,
      "learning_rate": 4.797394529050577e-06,
      "loss": 0.7223,
      "step": 61
    },
    {
      "epoch": 0.224180790960452,
      "grad_norm": 1.4636770967664614,
      "learning_rate": 4.784721566704217e-06,
      "loss": 0.7157,
      "step": 62
    },
    {
      "epoch": 0.22779661016949151,
      "grad_norm": 1.5059556334177817,
      "learning_rate": 4.771681977854062e-06,
      "loss": 0.7091,
      "step": 63
    },
    {
      "epoch": 0.23141242937853107,
      "grad_norm": 1.3659765286811016,
      "learning_rate": 4.75827785494715e-06,
      "loss": 0.7012,
      "step": 64
    },
    {
      "epoch": 0.23502824858757063,
      "grad_norm": 1.418860606361076,
      "learning_rate": 4.744511348926855e-06,
      "loss": 0.7124,
      "step": 65
    },
    {
      "epoch": 0.23864406779661018,
      "grad_norm": 1.4260454819088881,
      "learning_rate": 4.730384668887731e-06,
      "loss": 0.7215,
      "step": 66
    },
    {
      "epoch": 0.2422598870056497,
      "grad_norm": 1.5391993842680236,
      "learning_rate": 4.715900081721021e-06,
      "loss": 0.6946,
      "step": 67
    },
    {
      "epoch": 0.24587570621468927,
      "grad_norm": 1.552630241762705,
      "learning_rate": 4.7010599117508936e-06,
      "loss": 0.7109,
      "step": 68
    },
    {
      "epoch": 0.24949152542372882,
      "grad_norm": 1.5270739656846004,
      "learning_rate": 4.685866540361456e-06,
      "loss": 0.7092,
      "step": 69
    },
    {
      "epoch": 0.25310734463276835,
      "grad_norm": 1.5114873681601728,
      "learning_rate": 4.670322405614621e-06,
      "loss": 0.7055,
      "step": 70
    },
    {
      "epoch": 0.25672316384180793,
      "grad_norm": 1.5299853438436042,
      "learning_rate": 4.654430001858874e-06,
      "loss": 0.6878,
      "step": 71
    },
    {
      "epoch": 0.26033898305084746,
      "grad_norm": 1.445451157672883,
      "learning_rate": 4.638191879329005e-06,
      "loss": 0.7222,
      "step": 72
    },
    {
      "epoch": 0.263954802259887,
      "grad_norm": 1.4221665529233392,
      "learning_rate": 4.621610643736878e-06,
      "loss": 0.7237,
      "step": 73
    },
    {
      "epoch": 0.2675706214689266,
      "grad_norm": 1.469857395185097,
      "learning_rate": 4.6046889558532925e-06,
      "loss": 0.6966,
      "step": 74
    },
    {
      "epoch": 0.2711864406779661,
      "grad_norm": 1.5144096217089977,
      "learning_rate": 4.587429531081019e-06,
      "loss": 0.7018,
      "step": 75
    },
    {
      "epoch": 0.27480225988700563,
      "grad_norm": 1.3933861614255332,
      "learning_rate": 4.569835139019054e-06,
      "loss": 0.6548,
      "step": 76
    },
    {
      "epoch": 0.2784180790960452,
      "grad_norm": 1.7309884671892724,
      "learning_rate": 4.551908603018191e-06,
      "loss": 0.6976,
      "step": 77
    },
    {
      "epoch": 0.28203389830508474,
      "grad_norm": 1.5328106326816477,
      "learning_rate": 4.53365279972796e-06,
      "loss": 0.693,
      "step": 78
    },
    {
      "epoch": 0.28564971751412427,
      "grad_norm": 1.5098349501089556,
      "learning_rate": 4.515070658635013e-06,
      "loss": 0.697,
      "step": 79
    },
    {
      "epoch": 0.28926553672316385,
      "grad_norm": 1.7412535656694954,
      "learning_rate": 4.4961651615930344e-06,
      "loss": 0.7115,
      "step": 80
    },
    {
      "epoch": 0.2928813559322034,
      "grad_norm": 1.5076941472793814,
      "learning_rate": 4.476939342344246e-06,
      "loss": 0.7163,
      "step": 81
    },
    {
      "epoch": 0.29649717514124296,
      "grad_norm": 1.4730053806555845,
      "learning_rate": 4.457396286032589e-06,
      "loss": 0.6886,
      "step": 82
    },
    {
      "epoch": 0.3001129943502825,
      "grad_norm": 1.4855157413144469,
      "learning_rate": 4.437539128708647e-06,
      "loss": 0.7339,
      "step": 83
    },
    {
      "epoch": 0.303728813559322,
      "grad_norm": 1.5405150766486688,
      "learning_rate": 4.417371056826417e-06,
      "loss": 0.7149,
      "step": 84
    },
    {
      "epoch": 0.3073446327683616,
      "grad_norm": 1.5508769466195937,
      "learning_rate": 4.396895306731978e-06,
      "loss": 0.6822,
      "step": 85
    },
    {
      "epoch": 0.31096045197740113,
      "grad_norm": 1.4588960420622106,
      "learning_rate": 4.376115164144157e-06,
      "loss": 0.6836,
      "step": 86
    },
    {
      "epoch": 0.31457627118644066,
      "grad_norm": 1.6627413334009173,
      "learning_rate": 4.355033963627277e-06,
      "loss": 0.7131,
      "step": 87
    },
    {
      "epoch": 0.31819209039548024,
      "grad_norm": 1.5553517341256553,
      "learning_rate": 4.333655088056065e-06,
      "loss": 0.6854,
      "step": 88
    },
    {
      "epoch": 0.32180790960451977,
      "grad_norm": 1.6684230900983013,
      "learning_rate": 4.3119819680728e-06,
      "loss": 0.7094,
      "step": 89
    },
    {
      "epoch": 0.3254237288135593,
      "grad_norm": 1.3726285594917855,
      "learning_rate": 4.290018081536807e-06,
      "loss": 0.6872,
      "step": 90
    },
    {
      "epoch": 0.3290395480225989,
      "grad_norm": 1.380724539985608,
      "learning_rate": 4.267766952966369e-06,
      "loss": 0.7139,
      "step": 91
    },
    {
      "epoch": 0.3326553672316384,
      "grad_norm": 1.5348446771711333,
      "learning_rate": 4.245232152973148e-06,
      "loss": 0.6778,
      "step": 92
    },
    {
      "epoch": 0.336271186440678,
      "grad_norm": 1.5229418737475282,
      "learning_rate": 4.222417297689217e-06,
      "loss": 0.6689,
      "step": 93
    },
    {
      "epoch": 0.3398870056497175,
      "grad_norm": 1.5070456041355724,
      "learning_rate": 4.199326048186783e-06,
      "loss": 0.6894,
      "step": 94
    },
    {
      "epoch": 0.34350282485875705,
      "grad_norm": 1.3616666744687627,
      "learning_rate": 4.175962109890697e-06,
      "loss": 0.6554,
      "step": 95
    },
    {
      "epoch": 0.34711864406779663,
      "grad_norm": 1.5391506984706091,
      "learning_rate": 4.152329231983852e-06,
      "loss": 0.7023,
      "step": 96
    },
    {
      "epoch": 0.35073446327683616,
      "grad_norm": 1.4241158917993244,
      "learning_rate": 4.128431206805556e-06,
      "loss": 0.7107,
      "step": 97
    },
    {
      "epoch": 0.3543502824858757,
      "grad_norm": 1.4631603034598988,
      "learning_rate": 4.104271869242975e-06,
      "loss": 0.6894,
      "step": 98
    },
    {
      "epoch": 0.3579661016949153,
      "grad_norm": 1.4094826388778878,
      "learning_rate": 4.07985509611576e-06,
      "loss": 0.677,
      "step": 99
    },
    {
      "epoch": 0.3615819209039548,
      "grad_norm": 1.4740189681153695,
      "learning_rate": 4.0551848055539345e-06,
      "loss": 0.6699,
      "step": 100
    },
    {
      "epoch": 0.36519774011299433,
      "grad_norm": 1.4525217649150708,
      "learning_rate": 4.030264956369158e-06,
      "loss": 0.7368,
      "step": 101
    },
    {
      "epoch": 0.3688135593220339,
      "grad_norm": 1.451460518174865,
      "learning_rate": 4.005099547419458e-06,
      "loss": 0.7034,
      "step": 102
    },
    {
      "epoch": 0.37242937853107344,
      "grad_norm": 1.5038951264951017,
      "learning_rate": 3.979692616967543e-06,
      "loss": 0.6837,
      "step": 103
    },
    {
      "epoch": 0.37604519774011297,
      "grad_norm": 1.4157220265249935,
      "learning_rate": 3.9540482420327845e-06,
      "loss": 0.6875,
      "step": 104
    },
    {
      "epoch": 0.37966101694915255,
      "grad_norm": 1.4737148597287204,
      "learning_rate": 3.9281705377369814e-06,
      "loss": 0.6901,
      "step": 105
    },
    {
      "epoch": 0.3832768361581921,
      "grad_norm": 1.4677149428122727,
      "learning_rate": 3.902063656644012e-06,
      "loss": 0.6792,
      "step": 106
    },
    {
      "epoch": 0.38689265536723166,
      "grad_norm": 1.3434999781262245,
      "learning_rate": 3.875731788093478e-06,
      "loss": 0.6876,
      "step": 107
    },
    {
      "epoch": 0.3905084745762712,
      "grad_norm": 1.4987089091978376,
      "learning_rate": 3.84917915752845e-06,
      "loss": 0.6761,
      "step": 108
    },
    {
      "epoch": 0.3941242937853107,
      "grad_norm": 1.508794866255814,
      "learning_rate": 3.8224100258174066e-06,
      "loss": 0.6838,
      "step": 109
    },
    {
      "epoch": 0.3977401129943503,
      "grad_norm": 1.4688146101109116,
      "learning_rate": 3.795428688570505e-06,
      "loss": 0.684,
      "step": 110
    },
    {
      "epoch": 0.40135593220338983,
      "grad_norm": 1.4606514616212791,
      "learning_rate": 3.7682394754502687e-06,
      "loss": 0.6824,
      "step": 111
    },
    {
      "epoch": 0.40497175141242936,
      "grad_norm": 1.4096709625800348,
      "learning_rate": 3.7408467494768104e-06,
      "loss": 0.6969,
      "step": 112
    },
    {
      "epoch": 0.40858757062146894,
      "grad_norm": 1.418524365345472,
      "learning_rate": 3.7132549063277033e-06,
      "loss": 0.7097,
      "step": 113
    },
    {
      "epoch": 0.41220338983050847,
      "grad_norm": 1.5410281672091906,
      "learning_rate": 3.685468373632613e-06,
      "loss": 0.6746,
      "step": 114
    },
    {
      "epoch": 0.415819209039548,
      "grad_norm": 1.3641313015018903,
      "learning_rate": 3.657491610262802e-06,
      "loss": 0.6448,
      "step": 115
    },
    {
      "epoch": 0.4194350282485876,
      "grad_norm": 1.3041765820835833,
      "learning_rate": 3.6293291056156178e-06,
      "loss": 0.6819,
      "step": 116
    },
    {
      "epoch": 0.4230508474576271,
      "grad_norm": 1.506905844856063,
      "learning_rate": 3.600985378894086e-06,
      "loss": 0.6876,
      "step": 117
    },
    {
      "epoch": 0.4266666666666667,
      "grad_norm": 1.376689012221553,
      "learning_rate": 3.572464978381719e-06,
      "loss": 0.684,
      "step": 118
    },
    {
      "epoch": 0.4302824858757062,
      "grad_norm": 1.340240336011346,
      "learning_rate": 3.5437724807126583e-06,
      "loss": 0.6505,
      "step": 119
    },
    {
      "epoch": 0.43389830508474575,
      "grad_norm": 1.325670474835959,
      "learning_rate": 3.514912490137268e-06,
      "loss": 0.6357,
      "step": 120
    },
    {
      "epoch": 0.43751412429378533,
      "grad_norm": 1.2967281789427187,
      "learning_rate": 3.4858896377832966e-06,
      "loss": 0.6716,
      "step": 121
    },
    {
      "epoch": 0.44112994350282486,
      "grad_norm": 1.4289334076188702,
      "learning_rate": 3.4567085809127247e-06,
      "loss": 0.6749,
      "step": 122
    },
    {
      "epoch": 0.4447457627118644,
      "grad_norm": 1.4006278853440177,
      "learning_rate": 3.42737400217442e-06,
      "loss": 0.6675,
      "step": 123
    },
    {
      "epoch": 0.448361581920904,
      "grad_norm": 1.4903303746050145,
      "learning_rate": 3.397890608852718e-06,
      "loss": 0.6795,
      "step": 124
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 1.5237327914091412,
      "learning_rate": 3.3682631321120507e-06,
      "loss": 0.6834,
      "step": 125
    },
    {
      "epoch": 0.45559322033898303,
      "grad_norm": 1.375637272974929,
      "learning_rate": 3.3384963262377434e-06,
      "loss": 0.6546,
      "step": 126
    },
    {
      "epoch": 0.4592090395480226,
      "grad_norm": 1.4010303116849099,
      "learning_rate": 3.3085949678730953e-06,
      "loss": 0.6687,
      "step": 127
    },
    {
      "epoch": 0.46282485875706214,
      "grad_norm": 1.3723912301345371,
      "learning_rate": 3.278563855252885e-06,
      "loss": 0.6927,
      "step": 128
    },
    {
      "epoch": 0.46644067796610167,
      "grad_norm": 1.4580226139492987,
      "learning_rate": 3.248407807433396e-06,
      "loss": 0.6843,
      "step": 129
    },
    {
      "epoch": 0.47005649717514125,
      "grad_norm": 1.5657029406507326,
      "learning_rate": 3.2181316635191125e-06,
      "loss": 0.6639,
      "step": 130
    },
    {
      "epoch": 0.4736723163841808,
      "grad_norm": 1.5943610613148829,
      "learning_rate": 3.1877402818861954e-06,
      "loss": 0.6655,
      "step": 131
    },
    {
      "epoch": 0.47728813559322036,
      "grad_norm": 1.398551732301804,
      "learning_rate": 3.157238539402862e-06,
      "loss": 0.6648,
      "step": 132
    },
    {
      "epoch": 0.4809039548022599,
      "grad_norm": 1.3527863119261647,
      "learning_rate": 3.1266313306468018e-06,
      "loss": 0.6793,
      "step": 133
    },
    {
      "epoch": 0.4845197740112994,
      "grad_norm": 1.4133737666494006,
      "learning_rate": 3.095923567119748e-06,
      "loss": 0.6808,
      "step": 134
    },
    {
      "epoch": 0.488135593220339,
      "grad_norm": 1.3489274410441074,
      "learning_rate": 3.0651201764593375e-06,
      "loss": 0.669,
      "step": 135
    },
    {
      "epoch": 0.49175141242937853,
      "grad_norm": 1.4710077216567483,
      "learning_rate": 3.034226101648377e-06,
      "loss": 0.6685,
      "step": 136
    },
    {
      "epoch": 0.49536723163841806,
      "grad_norm": 1.4143201023235143,
      "learning_rate": 3.0032463002216504e-06,
      "loss": 0.6803,
      "step": 137
    },
    {
      "epoch": 0.49898305084745764,
      "grad_norm": 1.350434140774409,
      "learning_rate": 2.972185743470386e-06,
      "loss": 0.6293,
      "step": 138
    },
    {
      "epoch": 0.5025988700564972,
      "grad_norm": 1.4061918089975518,
      "learning_rate": 2.941049415644522e-06,
      "loss": 0.6981,
      "step": 139
    },
    {
      "epoch": 0.5062146892655367,
      "grad_norm": 1.4466820101061297,
      "learning_rate": 2.909842313152888e-06,
      "loss": 0.6738,
      "step": 140
    },
    {
      "epoch": 0.5098305084745762,
      "grad_norm": 1.5124850873525673,
      "learning_rate": 2.878569443761442e-06,
      "loss": 0.7131,
      "step": 141
    },
    {
      "epoch": 0.5134463276836159,
      "grad_norm": 1.4743009883750753,
      "learning_rate": 2.847235825789673e-06,
      "loss": 0.7016,
      "step": 142
    },
    {
      "epoch": 0.5170621468926554,
      "grad_norm": 1.3586041525935764,
      "learning_rate": 2.8158464873053236e-06,
      "loss": 0.6724,
      "step": 143
    },
    {
      "epoch": 0.5206779661016949,
      "grad_norm": 1.4996529158631906,
      "learning_rate": 2.784406465317538e-06,
      "loss": 0.6662,
      "step": 144
    },
    {
      "epoch": 0.5242937853107345,
      "grad_norm": 1.4671966292049852,
      "learning_rate": 2.752920804968581e-06,
      "loss": 0.6631,
      "step": 145
    },
    {
      "epoch": 0.527909604519774,
      "grad_norm": 1.444812582502839,
      "learning_rate": 2.7213945587242507e-06,
      "loss": 0.6513,
      "step": 146
    },
    {
      "epoch": 0.5315254237288135,
      "grad_norm": 1.3053540444757412,
      "learning_rate": 2.689832785563116e-06,
      "loss": 0.6555,
      "step": 147
    },
    {
      "epoch": 0.5351412429378531,
      "grad_norm": 1.314006962084444,
      "learning_rate": 2.658240550164704e-06,
      "loss": 0.6661,
      "step": 148
    },
    {
      "epoch": 0.5387570621468927,
      "grad_norm": 1.4304673510029906,
      "learning_rate": 2.626622922096782e-06,
      "loss": 0.6621,
      "step": 149
    },
    {
      "epoch": 0.5423728813559322,
      "grad_norm": 1.4876534124839516,
      "learning_rate": 2.5949849750018486e-06,
      "loss": 0.6758,
      "step": 150
    },
    {
      "epoch": 0.5459887005649717,
      "grad_norm": 1.3200607589115334,
      "learning_rate": 2.56333178578297e-06,
      "loss": 0.6559,
      "step": 151
    },
    {
      "epoch": 0.5496045197740113,
      "grad_norm": 1.3240990359086642,
      "learning_rate": 2.5316684337891005e-06,
      "loss": 0.6232,
      "step": 152
    },
    {
      "epoch": 0.5532203389830509,
      "grad_norm": 1.3124913285203368,
      "learning_rate": 2.5e-06,
      "loss": 0.6373,
      "step": 153
    },
    {
      "epoch": 0.5568361581920904,
      "grad_norm": 1.3981996964149215,
      "learning_rate": 2.4683315662109003e-06,
      "loss": 0.6779,
      "step": 154
    },
    {
      "epoch": 0.56045197740113,
      "grad_norm": 1.3816310911971024,
      "learning_rate": 2.436668214217031e-06,
      "loss": 0.654,
      "step": 155
    },
    {
      "epoch": 0.5640677966101695,
      "grad_norm": 1.263049809743652,
      "learning_rate": 2.4050150249981522e-06,
      "loss": 0.6625,
      "step": 156
    },
    {
      "epoch": 0.567683615819209,
      "grad_norm": 1.3247706606665524,
      "learning_rate": 2.3733770779032185e-06,
      "loss": 0.6862,
      "step": 157
    },
    {
      "epoch": 0.5712994350282485,
      "grad_norm": 1.3384592393063528,
      "learning_rate": 2.341759449835297e-06,
      "loss": 0.669,
      "step": 158
    },
    {
      "epoch": 0.5749152542372882,
      "grad_norm": 1.3084651079974374,
      "learning_rate": 2.310167214436885e-06,
      "loss": 0.6389,
      "step": 159
    },
    {
      "epoch": 0.5785310734463277,
      "grad_norm": 1.3374680724124108,
      "learning_rate": 2.27860544127575e-06,
      "loss": 0.6472,
      "step": 160
    },
    {
      "epoch": 0.5821468926553672,
      "grad_norm": 1.3122640789722633,
      "learning_rate": 2.24707919503142e-06,
      "loss": 0.6579,
      "step": 161
    },
    {
      "epoch": 0.5857627118644068,
      "grad_norm": 1.4412496554625216,
      "learning_rate": 2.2155935346824634e-06,
      "loss": 0.6481,
      "step": 162
    },
    {
      "epoch": 0.5893785310734463,
      "grad_norm": 1.3863338838498946,
      "learning_rate": 2.1841535126946777e-06,
      "loss": 0.6535,
      "step": 163
    },
    {
      "epoch": 0.5929943502824859,
      "grad_norm": 1.345502076046215,
      "learning_rate": 2.1527641742103282e-06,
      "loss": 0.6707,
      "step": 164
    },
    {
      "epoch": 0.5966101694915255,
      "grad_norm": 1.3847319032734033,
      "learning_rate": 2.1214305562385592e-06,
      "loss": 0.6663,
      "step": 165
    },
    {
      "epoch": 0.600225988700565,
      "grad_norm": 1.430665603476504,
      "learning_rate": 2.0901576868471125e-06,
      "loss": 0.6747,
      "step": 166
    },
    {
      "epoch": 0.6038418079096045,
      "grad_norm": 1.353261054674096,
      "learning_rate": 2.05895058435548e-06,
      "loss": 0.6512,
      "step": 167
    },
    {
      "epoch": 0.607457627118644,
      "grad_norm": 1.2859249411363938,
      "learning_rate": 2.0278142565296153e-06,
      "loss": 0.6324,
      "step": 168
    },
    {
      "epoch": 0.6110734463276836,
      "grad_norm": 1.4013926433738113,
      "learning_rate": 1.9967536997783495e-06,
      "loss": 0.6679,
      "step": 169
    },
    {
      "epoch": 0.6146892655367232,
      "grad_norm": 1.3862043238014647,
      "learning_rate": 1.9657738983516227e-06,
      "loss": 0.6729,
      "step": 170
    },
    {
      "epoch": 0.6183050847457627,
      "grad_norm": 1.3586507239524463,
      "learning_rate": 1.934879823540663e-06,
      "loss": 0.6493,
      "step": 171
    },
    {
      "epoch": 0.6219209039548023,
      "grad_norm": 1.3297630157555045,
      "learning_rate": 1.9040764328802523e-06,
      "loss": 0.6398,
      "step": 172
    },
    {
      "epoch": 0.6255367231638418,
      "grad_norm": 1.3817085459115725,
      "learning_rate": 1.8733686693531986e-06,
      "loss": 0.6582,
      "step": 173
    },
    {
      "epoch": 0.6291525423728813,
      "grad_norm": 1.3202548993972594,
      "learning_rate": 1.842761460597138e-06,
      "loss": 0.6532,
      "step": 174
    },
    {
      "epoch": 0.632768361581921,
      "grad_norm": 1.3288961390749972,
      "learning_rate": 1.812259718113805e-06,
      "loss": 0.6603,
      "step": 175
    },
    {
      "epoch": 0.6363841807909605,
      "grad_norm": 1.329943461477084,
      "learning_rate": 1.7818683364808883e-06,
      "loss": 0.658,
      "step": 176
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.3692273444745175,
      "learning_rate": 1.7515921925666053e-06,
      "loss": 0.6317,
      "step": 177
    },
    {
      "epoch": 0.6436158192090395,
      "grad_norm": 1.421706203526152,
      "learning_rate": 1.7214361447471156e-06,
      "loss": 0.677,
      "step": 178
    },
    {
      "epoch": 0.6472316384180791,
      "grad_norm": 1.3083895534561967,
      "learning_rate": 1.6914050321269049e-06,
      "loss": 0.6736,
      "step": 179
    },
    {
      "epoch": 0.6508474576271186,
      "grad_norm": 1.3408157323699283,
      "learning_rate": 1.6615036737622574e-06,
      "loss": 0.6802,
      "step": 180
    },
    {
      "epoch": 0.6544632768361582,
      "grad_norm": 1.2866436449644132,
      "learning_rate": 1.6317368678879497e-06,
      "loss": 0.646,
      "step": 181
    },
    {
      "epoch": 0.6580790960451978,
      "grad_norm": 1.4469309073814418,
      "learning_rate": 1.6021093911472825e-06,
      "loss": 0.6502,
      "step": 182
    },
    {
      "epoch": 0.6616949152542373,
      "grad_norm": 1.3285438490415578,
      "learning_rate": 1.572625997825581e-06,
      "loss": 0.6392,
      "step": 183
    },
    {
      "epoch": 0.6653107344632768,
      "grad_norm": 1.3767595914556963,
      "learning_rate": 1.5432914190872757e-06,
      "loss": 0.6478,
      "step": 184
    },
    {
      "epoch": 0.6689265536723163,
      "grad_norm": 1.3913230479527472,
      "learning_rate": 1.5141103622167042e-06,
      "loss": 0.6624,
      "step": 185
    },
    {
      "epoch": 0.672542372881356,
      "grad_norm": 1.3717018869154762,
      "learning_rate": 1.4850875098627326e-06,
      "loss": 0.6519,
      "step": 186
    },
    {
      "epoch": 0.6761581920903955,
      "grad_norm": 1.3411020703523342,
      "learning_rate": 1.456227519287343e-06,
      "loss": 0.6382,
      "step": 187
    },
    {
      "epoch": 0.679774011299435,
      "grad_norm": 1.238291654237968,
      "learning_rate": 1.4275350216182824e-06,
      "loss": 0.6391,
      "step": 188
    },
    {
      "epoch": 0.6833898305084746,
      "grad_norm": 1.374517850534095,
      "learning_rate": 1.3990146211059141e-06,
      "loss": 0.6456,
      "step": 189
    },
    {
      "epoch": 0.6870056497175141,
      "grad_norm": 1.306148052181935,
      "learning_rate": 1.3706708943843822e-06,
      "loss": 0.6441,
      "step": 190
    },
    {
      "epoch": 0.6906214689265536,
      "grad_norm": 1.3876372946236282,
      "learning_rate": 1.3425083897371983e-06,
      "loss": 0.6603,
      "step": 191
    },
    {
      "epoch": 0.6942372881355933,
      "grad_norm": 1.3904204488306329,
      "learning_rate": 1.3145316263673874e-06,
      "loss": 0.6721,
      "step": 192
    },
    {
      "epoch": 0.6978531073446328,
      "grad_norm": 1.510808405530817,
      "learning_rate": 1.286745093672298e-06,
      "loss": 0.649,
      "step": 193
    },
    {
      "epoch": 0.7014689265536723,
      "grad_norm": 1.4312145193748698,
      "learning_rate": 1.2591532505231906e-06,
      "loss": 0.6573,
      "step": 194
    },
    {
      "epoch": 0.7050847457627119,
      "grad_norm": 1.4024105046257231,
      "learning_rate": 1.2317605245497324e-06,
      "loss": 0.6727,
      "step": 195
    },
    {
      "epoch": 0.7087005649717514,
      "grad_norm": 1.3505688502581619,
      "learning_rate": 1.204571311429496e-06,
      "loss": 0.6131,
      "step": 196
    },
    {
      "epoch": 0.7123163841807909,
      "grad_norm": 1.319472329102849,
      "learning_rate": 1.1775899741825947e-06,
      "loss": 0.6434,
      "step": 197
    },
    {
      "epoch": 0.7159322033898305,
      "grad_norm": 1.5152192403656248,
      "learning_rate": 1.1508208424715511e-06,
      "loss": 0.656,
      "step": 198
    },
    {
      "epoch": 0.7195480225988701,
      "grad_norm": 1.6136474853206006,
      "learning_rate": 1.1242682119065217e-06,
      "loss": 0.6613,
      "step": 199
    },
    {
      "epoch": 0.7231638418079096,
      "grad_norm": 1.312425015581362,
      "learning_rate": 1.0979363433559892e-06,
      "loss": 0.6577,
      "step": 200
    },
    {
      "epoch": 0.7267796610169491,
      "grad_norm": 1.3953598075891687,
      "learning_rate": 1.0718294622630188e-06,
      "loss": 0.6905,
      "step": 201
    },
    {
      "epoch": 0.7303954802259887,
      "grad_norm": 1.372903007290825,
      "learning_rate": 1.045951757967215e-06,
      "loss": 0.6448,
      "step": 202
    },
    {
      "epoch": 0.7340112994350283,
      "grad_norm": 1.4652703276389691,
      "learning_rate": 1.0203073830324566e-06,
      "loss": 0.6395,
      "step": 203
    },
    {
      "epoch": 0.7376271186440678,
      "grad_norm": 1.366422271732463,
      "learning_rate": 9.949004525805423e-07,
      "loss": 0.6148,
      "step": 204
    },
    {
      "epoch": 0.7412429378531074,
      "grad_norm": 1.2886663538886012,
      "learning_rate": 9.697350436308428e-07,
      "loss": 0.6322,
      "step": 205
    },
    {
      "epoch": 0.7448587570621469,
      "grad_norm": 1.3574509449302405,
      "learning_rate": 9.448151944460657e-07,
      "loss": 0.6835,
      "step": 206
    },
    {
      "epoch": 0.7484745762711864,
      "grad_norm": 1.393654049362733,
      "learning_rate": 9.201449038842403e-07,
      "loss": 0.6713,
      "step": 207
    },
    {
      "epoch": 0.7520903954802259,
      "grad_norm": 1.3962653237548226,
      "learning_rate": 8.957281307570254e-07,
      "loss": 0.6349,
      "step": 208
    },
    {
      "epoch": 0.7557062146892656,
      "grad_norm": 1.2716349996486196,
      "learning_rate": 8.71568793194445e-07,
      "loss": 0.6395,
      "step": 209
    },
    {
      "epoch": 0.7593220338983051,
      "grad_norm": 1.3461355380403557,
      "learning_rate": 8.476707680161486e-07,
      "loss": 0.6566,
      "step": 210
    },
    {
      "epoch": 0.7629378531073446,
      "grad_norm": 1.3005123962225364,
      "learning_rate": 8.240378901093035e-07,
      "loss": 0.6498,
      "step": 211
    },
    {
      "epoch": 0.7665536723163842,
      "grad_norm": 1.2709444859864987,
      "learning_rate": 8.006739518132179e-07,
      "loss": 0.6702,
      "step": 212
    },
    {
      "epoch": 0.7701694915254237,
      "grad_norm": 1.3327493213768535,
      "learning_rate": 7.775827023107835e-07,
      "loss": 0.6403,
      "step": 213
    },
    {
      "epoch": 0.7737853107344633,
      "grad_norm": 1.4047092975707265,
      "learning_rate": 7.547678470268526e-07,
      "loss": 0.6492,
      "step": 214
    },
    {
      "epoch": 0.7774011299435029,
      "grad_norm": 1.383631976222337,
      "learning_rate": 7.322330470336314e-07,
      "loss": 0.6289,
      "step": 215
    },
    {
      "epoch": 0.7810169491525424,
      "grad_norm": 1.2567546465681303,
      "learning_rate": 7.099819184631929e-07,
      "loss": 0.6393,
      "step": 216
    },
    {
      "epoch": 0.7846327683615819,
      "grad_norm": 1.2354895441269147,
      "learning_rate": 6.880180319272006e-07,
      "loss": 0.6429,
      "step": 217
    },
    {
      "epoch": 0.7882485875706214,
      "grad_norm": 1.3478465932161185,
      "learning_rate": 6.663449119439358e-07,
      "loss": 0.6652,
      "step": 218
    },
    {
      "epoch": 0.791864406779661,
      "grad_norm": 1.3094969549541247,
      "learning_rate": 6.449660363727236e-07,
      "loss": 0.6424,
      "step": 219
    },
    {
      "epoch": 0.7954802259887006,
      "grad_norm": 1.385053278123551,
      "learning_rate": 6.238848358558439e-07,
      "loss": 0.6409,
      "step": 220
    },
    {
      "epoch": 0.7990960451977401,
      "grad_norm": 1.3349840769877994,
      "learning_rate": 6.031046932680229e-07,
      "loss": 0.6815,
      "step": 221
    },
    {
      "epoch": 0.8027118644067797,
      "grad_norm": 1.4435164246937722,
      "learning_rate": 5.826289431735832e-07,
      "loss": 0.6489,
      "step": 222
    },
    {
      "epoch": 0.8063276836158192,
      "grad_norm": 1.3356172924249725,
      "learning_rate": 5.624608712913531e-07,
      "loss": 0.6298,
      "step": 223
    },
    {
      "epoch": 0.8099435028248587,
      "grad_norm": 1.2977048398488058,
      "learning_rate": 5.426037139674117e-07,
      "loss": 0.6509,
      "step": 224
    },
    {
      "epoch": 0.8135593220338984,
      "grad_norm": 1.3774919529523362,
      "learning_rate": 5.23060657655754e-07,
      "loss": 0.6573,
      "step": 225
    },
    {
      "epoch": 0.8171751412429379,
      "grad_norm": 1.3410009004205188,
      "learning_rate": 5.038348384069663e-07,
      "loss": 0.633,
      "step": 226
    },
    {
      "epoch": 0.8207909604519774,
      "grad_norm": 1.2928727452886148,
      "learning_rate": 4.84929341364988e-07,
      "loss": 0.6754,
      "step": 227
    },
    {
      "epoch": 0.8244067796610169,
      "grad_norm": 1.358332904803305,
      "learning_rate": 4.6634720027204093e-07,
      "loss": 0.6614,
      "step": 228
    },
    {
      "epoch": 0.8280225988700565,
      "grad_norm": 1.4907573288328078,
      "learning_rate": 4.480913969818099e-07,
      "loss": 0.6281,
      "step": 229
    },
    {
      "epoch": 0.831638418079096,
      "grad_norm": 1.3392158203262607,
      "learning_rate": 4.3016486098094667e-07,
      "loss": 0.6161,
      "step": 230
    },
    {
      "epoch": 0.8352542372881356,
      "grad_norm": 1.2444012140753318,
      "learning_rate": 4.125704689189819e-07,
      "loss": 0.6247,
      "step": 231
    },
    {
      "epoch": 0.8388700564971752,
      "grad_norm": 1.4784447467052926,
      "learning_rate": 3.953110441467073e-07,
      "loss": 0.6586,
      "step": 232
    },
    {
      "epoch": 0.8424858757062147,
      "grad_norm": 1.366997421968513,
      "learning_rate": 3.7838935626312246e-07,
      "loss": 0.6463,
      "step": 233
    },
    {
      "epoch": 0.8461016949152542,
      "grad_norm": 1.4541952821449111,
      "learning_rate": 3.6180812067099477e-07,
      "loss": 0.6726,
      "step": 234
    },
    {
      "epoch": 0.8497175141242937,
      "grad_norm": 1.3494835181921292,
      "learning_rate": 3.455699981411259e-07,
      "loss": 0.6378,
      "step": 235
    },
    {
      "epoch": 0.8533333333333334,
      "grad_norm": 1.3091535029497192,
      "learning_rate": 3.296775943853789e-07,
      "loss": 0.6381,
      "step": 236
    },
    {
      "epoch": 0.8569491525423729,
      "grad_norm": 1.292004503291086,
      "learning_rate": 3.141334596385448e-07,
      "loss": 0.6361,
      "step": 237
    },
    {
      "epoch": 0.8605649717514124,
      "grad_norm": 1.3072359435785652,
      "learning_rate": 2.9894008824910726e-07,
      "loss": 0.6311,
      "step": 238
    },
    {
      "epoch": 0.864180790960452,
      "grad_norm": 1.3418443898914727,
      "learning_rate": 2.840999182789797e-07,
      "loss": 0.6584,
      "step": 239
    },
    {
      "epoch": 0.8677966101694915,
      "grad_norm": 1.3790407833555425,
      "learning_rate": 2.696153311122704e-07,
      "loss": 0.6275,
      "step": 240
    },
    {
      "epoch": 0.871412429378531,
      "grad_norm": 1.3668643411134043,
      "learning_rate": 2.5548865107314606e-07,
      "loss": 0.6574,
      "step": 241
    },
    {
      "epoch": 0.8750282485875707,
      "grad_norm": 1.267748779732102,
      "learning_rate": 2.4172214505285006e-07,
      "loss": 0.6394,
      "step": 242
    },
    {
      "epoch": 0.8786440677966102,
      "grad_norm": 1.2912726516960031,
      "learning_rate": 2.2831802214593774e-07,
      "loss": 0.6352,
      "step": 243
    },
    {
      "epoch": 0.8822598870056497,
      "grad_norm": 1.3408535402209096,
      "learning_rate": 2.1527843329578328e-07,
      "loss": 0.6332,
      "step": 244
    },
    {
      "epoch": 0.8858757062146893,
      "grad_norm": 1.369964120309017,
      "learning_rate": 2.026054709494235e-07,
      "loss": 0.6488,
      "step": 245
    },
    {
      "epoch": 0.8894915254237288,
      "grad_norm": 1.2985394908560597,
      "learning_rate": 1.9030116872178317e-07,
      "loss": 0.6268,
      "step": 246
    },
    {
      "epoch": 0.8931073446327683,
      "grad_norm": 1.2695253299944342,
      "learning_rate": 1.7836750106934475e-07,
      "loss": 0.6098,
      "step": 247
    },
    {
      "epoch": 0.896723163841808,
      "grad_norm": 1.2614680998276686,
      "learning_rate": 1.6680638297330854e-07,
      "loss": 0.6328,
      "step": 248
    },
    {
      "epoch": 0.9003389830508475,
      "grad_norm": 1.3022416023187458,
      "learning_rate": 1.5561966963229925e-07,
      "loss": 0.6353,
      "step": 249
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 1.3192179782937912,
      "learning_rate": 1.448091561646628e-07,
      "loss": 0.6212,
      "step": 250
    },
    {
      "epoch": 0.9075706214689265,
      "grad_norm": 1.391821357807215,
      "learning_rate": 1.3437657732040783e-07,
      "loss": 0.6581,
      "step": 251
    },
    {
      "epoch": 0.9111864406779661,
      "grad_norm": 1.348097897319632,
      "learning_rate": 1.243236072028317e-07,
      "loss": 0.6483,
      "step": 252
    },
    {
      "epoch": 0.9148022598870057,
      "grad_norm": 1.3327973766461398,
      "learning_rate": 1.1465185899987797e-07,
      "loss": 0.6688,
      "step": 253
    },
    {
      "epoch": 0.9184180790960452,
      "grad_norm": 1.3050936769777537,
      "learning_rate": 1.0536288472527162e-07,
      "loss": 0.6702,
      "step": 254
    },
    {
      "epoch": 0.9220338983050848,
      "grad_norm": 1.3816378424721178,
      "learning_rate": 9.645817496946902e-08,
      "loss": 0.663,
      "step": 255
    },
    {
      "epoch": 0.9256497175141243,
      "grad_norm": 1.3585454766173655,
      "learning_rate": 8.79391586604636e-08,
      "loss": 0.64,
      "step": 256
    },
    {
      "epoch": 0.9292655367231638,
      "grad_norm": 1.3292531553316056,
      "learning_rate": 7.980720283448957e-08,
      "loss": 0.6335,
      "step": 257
    },
    {
      "epoch": 0.9328813559322033,
      "grad_norm": 1.2599441985317357,
      "learning_rate": 7.206361241665266e-08,
      "loss": 0.6403,
      "step": 258
    },
    {
      "epoch": 0.936497175141243,
      "grad_norm": 1.3958906990596642,
      "learning_rate": 6.470963001153268e-08,
      "loss": 0.6383,
      "step": 259
    },
    {
      "epoch": 0.9401129943502825,
      "grad_norm": 1.295975911169058,
      "learning_rate": 5.774643570378296e-08,
      "loss": 0.6485,
      "step": 260
    },
    {
      "epoch": 0.943728813559322,
      "grad_norm": 1.247115424184442,
      "learning_rate": 5.117514686876379e-08,
      "loss": 0.6479,
      "step": 261
    },
    {
      "epoch": 0.9473446327683616,
      "grad_norm": 1.2578135030925344,
      "learning_rate": 4.4996817993239464e-08,
      "loss": 0.6363,
      "step": 262
    },
    {
      "epoch": 0.9509604519774011,
      "grad_norm": 1.2567862444318008,
      "learning_rate": 3.9212440506164465e-08,
      "loss": 0.6417,
      "step": 263
    },
    {
      "epoch": 0.9545762711864407,
      "grad_norm": 1.2972287074987505,
      "learning_rate": 3.382294261959157e-08,
      "loss": 0.676,
      "step": 264
    },
    {
      "epoch": 0.9581920903954803,
      "grad_norm": 1.3028064182497767,
      "learning_rate": 2.8829189179721552e-08,
      "loss": 0.6502,
      "step": 265
    },
    {
      "epoch": 0.9618079096045198,
      "grad_norm": 1.3401154575841365,
      "learning_rate": 2.423198152812306e-08,
      "loss": 0.6271,
      "step": 266
    },
    {
      "epoch": 0.9654237288135593,
      "grad_norm": 1.332097876741386,
      "learning_rate": 2.0032057373142453e-08,
      "loss": 0.6312,
      "step": 267
    },
    {
      "epoch": 0.9690395480225988,
      "grad_norm": 1.2752118722459802,
      "learning_rate": 1.6230090671524312e-08,
      "loss": 0.6313,
      "step": 268
    },
    {
      "epoch": 0.9726553672316384,
      "grad_norm": 1.315897104629199,
      "learning_rate": 1.2826691520262114e-08,
      "loss": 0.6328,
      "step": 269
    },
    {
      "epoch": 0.976271186440678,
      "grad_norm": 1.341142823473401,
      "learning_rate": 9.822406058697665e-09,
      "loss": 0.6247,
      "step": 270
    },
    {
      "epoch": 0.9798870056497175,
      "grad_norm": 1.3751891651375663,
      "learning_rate": 7.217716380881479e-09,
      "loss": 0.6185,
      "step": 271
    },
    {
      "epoch": 0.9835028248587571,
      "grad_norm": 1.2993917620171413,
      "learning_rate": 5.0130404582127144e-09,
      "loss": 0.6308,
      "step": 272
    },
    {
      "epoch": 0.9871186440677966,
      "grad_norm": 1.306024529686539,
      "learning_rate": 3.208732072368104e-09,
      "loss": 0.6383,
      "step": 273
    },
    {
      "epoch": 0.9907344632768361,
      "grad_norm": 1.3213672912672132,
      "learning_rate": 1.8050807585293095e-09,
      "loss": 0.6559,
      "step": 274
    },
    {
      "epoch": 0.9943502824858758,
      "grad_norm": 1.3174616601564904,
      "learning_rate": 8.023117589237017e-10,
      "loss": 0.6701,
      "step": 275
    },
    {
      "epoch": 0.9979661016949153,
      "grad_norm": 1.3624163578590245,
      "learning_rate": 2.0058598667854755e-10,
      "loss": 0.6445,
      "step": 276
    },
    {
      "epoch": 0.9979661016949153,
      "step": 276,
      "total_flos": 100810460790784.0,
      "train_loss": 0.7222011056931122,
      "train_runtime": 4200.972,
      "train_samples_per_second": 8.426,
      "train_steps_per_second": 0.066
    }
  ],
  "logging_steps": 1,
  "max_steps": 276,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 100810460790784.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}