{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001,
      "grad_norm": 1.7063226699829102,
      "learning_rate": 4.995e-05,
      "loss": 9.6305,
      "step": 10
    },
    {
      "epoch": 0.002,
      "grad_norm": 1.467505693435669,
      "learning_rate": 4.99e-05,
      "loss": 8.8474,
      "step": 20
    },
    {
      "epoch": 0.003,
      "grad_norm": 1.3338744640350342,
      "learning_rate": 4.9850000000000006e-05,
      "loss": 8.4272,
      "step": 30
    },
    {
      "epoch": 0.004,
      "grad_norm": 1.194218635559082,
      "learning_rate": 4.9800000000000004e-05,
      "loss": 7.9969,
      "step": 40
    },
    {
      "epoch": 0.005,
      "grad_norm": 0.9542586207389832,
      "learning_rate": 4.975e-05,
      "loss": 7.8018,
      "step": 50
    },
    {
      "epoch": 0.006,
      "grad_norm": 0.8312947154045105,
      "learning_rate": 4.97e-05,
      "loss": 7.5303,
      "step": 60
    },
    {
      "epoch": 0.007,
      "grad_norm": 0.6978892683982849,
      "learning_rate": 4.965e-05,
      "loss": 7.3733,
      "step": 70
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.6895764470100403,
      "learning_rate": 4.96e-05,
      "loss": 7.2434,
      "step": 80
    },
    {
      "epoch": 0.009,
      "grad_norm": 0.5555976033210754,
      "learning_rate": 4.9550000000000005e-05,
      "loss": 7.0877,
      "step": 90
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.836391806602478,
      "learning_rate": 4.9500000000000004e-05,
      "loss": 7.0338,
      "step": 100
    },
    {
      "epoch": 0.011,
      "grad_norm": 0.782464861869812,
      "learning_rate": 4.945e-05,
      "loss": 6.878,
      "step": 110
    },
    {
      "epoch": 0.012,
      "grad_norm": 1.3705933094024658,
      "learning_rate": 4.94e-05,
      "loss": 6.5874,
      "step": 120
    },
    {
      "epoch": 0.013,
      "grad_norm": 0.7560775876045227,
      "learning_rate": 4.935e-05,
      "loss": 6.4978,
      "step": 130
    },
    {
      "epoch": 0.014,
      "grad_norm": 1.3238508701324463,
      "learning_rate": 4.93e-05,
      "loss": 6.3998,
      "step": 140
    },
    {
      "epoch": 0.015,
      "grad_norm": 0.7834548950195312,
      "learning_rate": 4.9250000000000004e-05,
      "loss": 6.2838,
      "step": 150
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.762347400188446,
      "learning_rate": 4.92e-05,
      "loss": 6.0387,
      "step": 160
    },
    {
      "epoch": 0.017,
      "grad_norm": 0.7799501419067383,
      "learning_rate": 4.915e-05,
      "loss": 6.0241,
      "step": 170
    },
    {
      "epoch": 0.018,
      "grad_norm": 0.7948866486549377,
      "learning_rate": 4.91e-05,
      "loss": 5.8776,
      "step": 180
    },
    {
      "epoch": 0.019,
      "grad_norm": 0.9890483021736145,
      "learning_rate": 4.905e-05,
      "loss": 5.747,
      "step": 190
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9131263494491577,
      "learning_rate": 4.9e-05,
      "loss": 5.644,
      "step": 200
    },
    {
      "epoch": 0.021,
      "grad_norm": 1.7073436975479126,
      "learning_rate": 4.8950000000000004e-05,
      "loss": 5.778,
      "step": 210
    },
    {
      "epoch": 0.022,
      "grad_norm": 0.8059922456741333,
      "learning_rate": 4.89e-05,
      "loss": 5.4755,
      "step": 220
    },
    {
      "epoch": 0.023,
      "grad_norm": 1.2500686645507812,
      "learning_rate": 4.885e-05,
      "loss": 5.3769,
      "step": 230
    },
    {
      "epoch": 0.024,
      "grad_norm": 1.3848680257797241,
      "learning_rate": 4.88e-05,
      "loss": 5.2105,
      "step": 240
    },
    {
      "epoch": 0.025,
      "grad_norm": 1.2381746768951416,
      "learning_rate": 4.875e-05,
      "loss": 5.1444,
      "step": 250
    },
    {
      "epoch": 0.026,
      "grad_norm": 2.7005224227905273,
      "learning_rate": 4.87e-05,
      "loss": 5.1608,
      "step": 260
    },
    {
      "epoch": 0.027,
      "grad_norm": 1.1472671031951904,
      "learning_rate": 4.8650000000000003e-05,
      "loss": 4.9456,
      "step": 270
    },
    {
      "epoch": 0.028,
      "grad_norm": 1.9849270582199097,
      "learning_rate": 4.86e-05,
      "loss": 4.8466,
      "step": 280
    },
    {
      "epoch": 0.029,
      "grad_norm": 1.857001781463623,
      "learning_rate": 4.855e-05,
      "loss": 4.7323,
      "step": 290
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.6731220483779907,
      "learning_rate": 4.85e-05,
      "loss": 4.5786,
      "step": 300
    },
    {
      "epoch": 0.031,
      "grad_norm": 1.7968906164169312,
      "learning_rate": 4.845e-05,
      "loss": 4.4588,
      "step": 310
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.7908226251602173,
      "learning_rate": 4.8400000000000004e-05,
      "loss": 4.3645,
      "step": 320
    },
    {
      "epoch": 0.033,
      "grad_norm": 2.538881540298462,
      "learning_rate": 4.835e-05,
      "loss": 4.1489,
      "step": 330
    },
    {
      "epoch": 0.034,
      "grad_norm": 2.306257486343384,
      "learning_rate": 4.83e-05,
      "loss": 3.9798,
      "step": 340
    },
    {
      "epoch": 0.035,
      "grad_norm": 2.1730940341949463,
      "learning_rate": 4.825e-05,
      "loss": 4.0231,
      "step": 350
    },
    {
      "epoch": 0.036,
      "grad_norm": 2.4211463928222656,
      "learning_rate": 4.82e-05,
      "loss": 3.8495,
      "step": 360
    },
    {
      "epoch": 0.037,
      "grad_norm": 2.3698794841766357,
      "learning_rate": 4.815e-05,
      "loss": 3.6977,
      "step": 370
    },
    {
      "epoch": 0.038,
      "grad_norm": 2.147799491882324,
      "learning_rate": 4.8100000000000004e-05,
      "loss": 3.8008,
      "step": 380
    },
    {
      "epoch": 0.039,
      "grad_norm": 2.3577606678009033,
      "learning_rate": 4.805e-05,
      "loss": 3.6983,
      "step": 390
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.065912961959839,
      "learning_rate": 4.8e-05,
      "loss": 3.5738,
      "step": 400
    },
    {
      "epoch": 0.041,
      "grad_norm": 2.930288314819336,
      "learning_rate": 4.795e-05,
      "loss": 3.5117,
      "step": 410
    },
    {
      "epoch": 0.042,
      "grad_norm": 2.3703155517578125,
      "learning_rate": 4.79e-05,
      "loss": 3.2483,
      "step": 420
    },
    {
      "epoch": 0.043,
      "grad_norm": 2.6050736904144287,
      "learning_rate": 4.785e-05,
      "loss": 3.2342,
      "step": 430
    },
    {
      "epoch": 0.044,
      "grad_norm": 2.0790674686431885,
      "learning_rate": 4.78e-05,
      "loss": 3.1452,
      "step": 440
    },
    {
      "epoch": 0.045,
      "grad_norm": 2.2497427463531494,
      "learning_rate": 4.775e-05,
      "loss": 3.0316,
      "step": 450
    },
    {
      "epoch": 0.046,
      "grad_norm": 2.507902145385742,
      "learning_rate": 4.77e-05,
      "loss": 2.8938,
      "step": 460
    },
    {
      "epoch": 0.047,
      "grad_norm": 2.517744541168213,
      "learning_rate": 4.765e-05,
      "loss": 2.8137,
      "step": 470
    },
    {
      "epoch": 0.048,
      "grad_norm": 3.9981460571289062,
      "learning_rate": 4.76e-05,
      "loss": 2.9864,
      "step": 480
    },
    {
      "epoch": 0.049,
      "grad_norm": 2.265026569366455,
      "learning_rate": 4.755e-05,
      "loss": 2.7839,
      "step": 490
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.257293701171875,
      "learning_rate": 4.75e-05,
      "loss": 2.6834,
      "step": 500
    },
    {
      "epoch": 0.051,
      "grad_norm": 2.6932270526885986,
      "learning_rate": 4.745e-05,
      "loss": 2.5755,
      "step": 510
    },
    {
      "epoch": 0.052,
      "grad_norm": 1.7177081108093262,
      "learning_rate": 4.74e-05,
      "loss": 2.425,
      "step": 520
    },
    {
      "epoch": 0.053,
      "grad_norm": 2.2452073097229004,
      "learning_rate": 4.735e-05,
      "loss": 2.5261,
      "step": 530
    },
    {
      "epoch": 0.054,
      "grad_norm": 2.2109947204589844,
      "learning_rate": 4.73e-05,
      "loss": 2.3825,
      "step": 540
    },
    {
      "epoch": 0.055,
      "grad_norm": 2.574531078338623,
      "learning_rate": 4.7249999999999997e-05,
      "loss": 2.3087,
      "step": 550
    },
    {
      "epoch": 0.056,
      "grad_norm": 2.3631017208099365,
      "learning_rate": 4.72e-05,
      "loss": 2.3099,
      "step": 560
    },
    {
      "epoch": 0.057,
      "grad_norm": 2.3809709548950195,
      "learning_rate": 4.715e-05,
      "loss": 2.3001,
      "step": 570
    },
    {
      "epoch": 0.058,
      "grad_norm": 2.0683534145355225,
      "learning_rate": 4.71e-05,
      "loss": 2.0813,
      "step": 580
    },
    {
      "epoch": 0.059,
      "grad_norm": 2.5471837520599365,
      "learning_rate": 4.705e-05,
      "loss": 2.0378,
      "step": 590
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.585564374923706,
      "learning_rate": 4.7e-05,
      "loss": 2.2062,
      "step": 600
    },
    {
      "epoch": 0.061,
      "grad_norm": 2.062100648880005,
      "learning_rate": 4.695e-05,
      "loss": 1.9914,
      "step": 610
    },
    {
      "epoch": 0.062,
      "grad_norm": 2.1019210815429688,
      "learning_rate": 4.69e-05,
      "loss": 1.9635,
      "step": 620
    },
    {
      "epoch": 0.063,
      "grad_norm": 2.630436658859253,
      "learning_rate": 4.685000000000001e-05,
      "loss": 1.9123,
      "step": 630
    },
    {
      "epoch": 0.064,
      "grad_norm": 2.1028494834899902,
      "learning_rate": 4.6800000000000006e-05,
      "loss": 1.7583,
      "step": 640
    },
    {
      "epoch": 0.065,
      "grad_norm": 2.392193078994751,
      "learning_rate": 4.6750000000000005e-05,
      "loss": 1.7532,
      "step": 650
    },
    {
      "epoch": 0.066,
      "grad_norm": 2.004413366317749,
      "learning_rate": 4.6700000000000003e-05,
      "loss": 1.6978,
      "step": 660
    },
    {
      "epoch": 0.067,
      "grad_norm": 2.210513114929199,
      "learning_rate": 4.665e-05,
      "loss": 1.6311,
      "step": 670
    },
    {
      "epoch": 0.068,
      "grad_norm": 1.8464936017990112,
      "learning_rate": 4.660000000000001e-05,
      "loss": 1.5507,
      "step": 680
    },
    {
      "epoch": 0.069,
      "grad_norm": 2.0246541500091553,
      "learning_rate": 4.655000000000001e-05,
      "loss": 1.5637,
      "step": 690
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.199751138687134,
      "learning_rate": 4.6500000000000005e-05,
      "loss": 1.5603,
      "step": 700
    },
    {
      "epoch": 0.071,
      "grad_norm": 2.2002196311950684,
      "learning_rate": 4.6450000000000004e-05,
      "loss": 1.4558,
      "step": 710
    },
    {
      "epoch": 0.072,
      "grad_norm": 1.7826759815216064,
      "learning_rate": 4.64e-05,
      "loss": 1.4309,
      "step": 720
    },
    {
      "epoch": 0.073,
      "grad_norm": 1.760297417640686,
      "learning_rate": 4.635e-05,
      "loss": 1.3531,
      "step": 730
    },
    {
      "epoch": 0.074,
      "grad_norm": 2.0505475997924805,
      "learning_rate": 4.630000000000001e-05,
      "loss": 1.3641,
      "step": 740
    },
    {
      "epoch": 0.075,
      "grad_norm": 2.1375396251678467,
      "learning_rate": 4.6250000000000006e-05,
      "loss": 1.3259,
      "step": 750
    },
    {
      "epoch": 0.076,
      "grad_norm": 1.8252328634262085,
      "learning_rate": 4.6200000000000005e-05,
      "loss": 1.2026,
      "step": 760
    },
    {
      "epoch": 0.077,
      "grad_norm": 1.8945906162261963,
      "learning_rate": 4.6150000000000004e-05,
      "loss": 1.2878,
      "step": 770
    },
    {
      "epoch": 0.078,
      "grad_norm": 1.7990881204605103,
      "learning_rate": 4.61e-05,
      "loss": 1.1853,
      "step": 780
    },
    {
      "epoch": 0.079,
      "grad_norm": 1.4897470474243164,
      "learning_rate": 4.605e-05,
      "loss": 1.1279,
      "step": 790
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.2804617881774902,
      "learning_rate": 4.600000000000001e-05,
      "loss": 1.0804,
      "step": 800
    },
    {
      "epoch": 0.081,
      "grad_norm": 1.4800664186477661,
      "learning_rate": 4.5950000000000006e-05,
      "loss": 1.0361,
      "step": 810
    },
    {
      "epoch": 0.082,
      "grad_norm": 1.3526049852371216,
      "learning_rate": 4.5900000000000004e-05,
      "loss": 1.0585,
      "step": 820
    },
    {
      "epoch": 0.083,
      "grad_norm": 1.534173607826233,
      "learning_rate": 4.585e-05,
      "loss": 1.0206,
      "step": 830
    },
    {
      "epoch": 0.084,
      "grad_norm": 1.4844435453414917,
      "learning_rate": 4.58e-05,
      "loss": 0.9758,
      "step": 840
    },
    {
      "epoch": 0.085,
      "grad_norm": 1.533679485321045,
      "learning_rate": 4.575e-05,
      "loss": 0.9168,
      "step": 850
    },
    {
      "epoch": 0.086,
      "grad_norm": 1.456162691116333,
      "learning_rate": 4.5700000000000006e-05,
      "loss": 0.8913,
      "step": 860
    },
    {
      "epoch": 0.087,
      "grad_norm": 1.7335631847381592,
      "learning_rate": 4.5650000000000005e-05,
      "loss": 0.9154,
      "step": 870
    },
    {
      "epoch": 0.088,
      "grad_norm": 1.3331761360168457,
      "learning_rate": 4.5600000000000004e-05,
      "loss": 0.8483,
      "step": 880
    },
    {
      "epoch": 0.089,
      "grad_norm": 1.6703053712844849,
      "learning_rate": 4.555e-05,
      "loss": 0.8116,
      "step": 890
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.275975227355957,
      "learning_rate": 4.55e-05,
      "loss": 0.7869,
      "step": 900
    },
    {
      "epoch": 0.091,
      "grad_norm": 1.3800309896469116,
      "learning_rate": 4.545000000000001e-05,
      "loss": 0.7637,
      "step": 910
    },
    {
      "epoch": 0.092,
      "grad_norm": 1.9472386837005615,
      "learning_rate": 4.5400000000000006e-05,
      "loss": 0.7212,
      "step": 920
    },
    {
      "epoch": 0.093,
      "grad_norm": 1.3451333045959473,
      "learning_rate": 4.5350000000000005e-05,
      "loss": 0.6829,
      "step": 930
    },
    {
      "epoch": 0.094,
      "grad_norm": 1.5209784507751465,
      "learning_rate": 4.53e-05,
      "loss": 0.729,
      "step": 940
    },
    {
      "epoch": 0.095,
      "grad_norm": 1.3944469690322876,
      "learning_rate": 4.525e-05,
      "loss": 0.6732,
      "step": 950
    },
    {
      "epoch": 0.096,
      "grad_norm": 1.2177132368087769,
      "learning_rate": 4.52e-05,
      "loss": 0.6188,
      "step": 960
    },
    {
      "epoch": 0.097,
      "grad_norm": 1.5988528728485107,
      "learning_rate": 4.5150000000000006e-05,
      "loss": 0.6622,
      "step": 970
    },
    {
      "epoch": 0.098,
      "grad_norm": 1.3636531829833984,
      "learning_rate": 4.5100000000000005e-05,
      "loss": 0.5792,
      "step": 980
    },
    {
      "epoch": 0.099,
      "grad_norm": 1.377453088760376,
      "learning_rate": 4.5050000000000004e-05,
      "loss": 0.6062,
      "step": 990
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.295713186264038,
      "learning_rate": 4.5e-05,
      "loss": 0.5709,
      "step": 1000
    },
    {
      "epoch": 0.101,
      "grad_norm": 1.35196852684021,
      "learning_rate": 4.495e-05,
      "loss": 0.5521,
      "step": 1010
    },
    {
      "epoch": 0.102,
      "grad_norm": 1.0617187023162842,
      "learning_rate": 4.49e-05,
      "loss": 0.5147,
      "step": 1020
    },
    {
      "epoch": 0.103,
      "grad_norm": 1.3035167455673218,
      "learning_rate": 4.4850000000000006e-05,
      "loss": 0.5081,
      "step": 1030
    },
    {
      "epoch": 0.104,
      "grad_norm": 1.2835568189620972,
      "learning_rate": 4.4800000000000005e-05,
      "loss": 0.5,
      "step": 1040
    },
    {
      "epoch": 0.105,
      "grad_norm": 1.0403038263320923,
      "learning_rate": 4.4750000000000004e-05,
      "loss": 0.4825,
      "step": 1050
    },
    {
      "epoch": 0.106,
      "grad_norm": 0.9538235068321228,
      "learning_rate": 4.47e-05,
      "loss": 0.4316,
      "step": 1060
    },
    {
      "epoch": 0.107,
      "grad_norm": 1.4246289730072021,
      "learning_rate": 4.465e-05,
      "loss": 0.4304,
      "step": 1070
    },
    {
      "epoch": 0.108,
      "grad_norm": 1.1217833757400513,
      "learning_rate": 4.46e-05,
      "loss": 0.4397,
      "step": 1080
    },
    {
      "epoch": 0.109,
      "grad_norm": 1.0411335229873657,
      "learning_rate": 4.4550000000000005e-05,
      "loss": 0.4057,
      "step": 1090
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8498069643974304,
      "learning_rate": 4.4500000000000004e-05,
      "loss": 0.3933,
      "step": 1100
    },
    {
      "epoch": 0.111,
      "grad_norm": 1.1270406246185303,
      "learning_rate": 4.445e-05,
      "loss": 0.366,
      "step": 1110
    },
    {
      "epoch": 0.112,
      "grad_norm": 1.189041256904602,
      "learning_rate": 4.44e-05,
      "loss": 0.3407,
      "step": 1120
    },
    {
      "epoch": 0.113,
      "grad_norm": 0.9837467670440674,
      "learning_rate": 4.435e-05,
      "loss": 0.3511,
      "step": 1130
    },
    {
      "epoch": 0.114,
      "grad_norm": 1.0432955026626587,
      "learning_rate": 4.43e-05,
      "loss": 0.3381,
      "step": 1140
    },
    {
      "epoch": 0.115,
      "grad_norm": 0.9529951810836792,
      "learning_rate": 4.4250000000000005e-05,
      "loss": 0.3189,
      "step": 1150
    },
    {
      "epoch": 0.116,
      "grad_norm": 1.008836030960083,
      "learning_rate": 4.4200000000000004e-05,
      "loss": 0.3077,
      "step": 1160
    },
    {
      "epoch": 0.117,
      "grad_norm": 1.0005086660385132,
      "learning_rate": 4.415e-05,
      "loss": 0.3001,
      "step": 1170
    },
    {
      "epoch": 0.118,
      "grad_norm": 1.1065175533294678,
      "learning_rate": 4.41e-05,
      "loss": 0.28,
      "step": 1180
    },
    {
      "epoch": 0.119,
      "grad_norm": 0.6701949834823608,
      "learning_rate": 4.405e-05,
      "loss": 0.2692,
      "step": 1190
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7154658436775208,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 0.2663,
      "step": 1200
    },
    {
      "epoch": 0.121,
      "grad_norm": 0.6997113823890686,
      "learning_rate": 4.3950000000000004e-05,
      "loss": 0.2595,
      "step": 1210
    },
    {
      "epoch": 0.122,
      "grad_norm": 0.9047608971595764,
      "learning_rate": 4.39e-05,
      "loss": 0.2558,
      "step": 1220
    },
    {
      "epoch": 0.123,
      "grad_norm": 0.8508415222167969,
      "learning_rate": 4.385e-05,
      "loss": 0.2459,
      "step": 1230
    },
    {
      "epoch": 0.124,
      "grad_norm": 0.6505220532417297,
      "learning_rate": 4.38e-05,
      "loss": 0.2236,
      "step": 1240
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.5360460877418518,
      "learning_rate": 4.375e-05,
      "loss": 0.2189,
      "step": 1250
    },
    {
      "epoch": 0.126,
      "grad_norm": 0.560817539691925,
      "learning_rate": 4.3700000000000005e-05,
      "loss": 0.2166,
      "step": 1260
    },
    {
      "epoch": 0.127,
      "grad_norm": 0.7089666128158569,
      "learning_rate": 4.3650000000000004e-05,
      "loss": 0.2026,
      "step": 1270
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.5265817046165466,
      "learning_rate": 4.36e-05,
      "loss": 0.197,
      "step": 1280
    },
    {
      "epoch": 0.129,
      "grad_norm": 0.6629377007484436,
      "learning_rate": 4.355e-05,
      "loss": 0.1934,
      "step": 1290
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.0730735063552856,
      "learning_rate": 4.35e-05,
      "loss": 0.1807,
      "step": 1300
    },
    {
      "epoch": 0.131,
      "grad_norm": 0.6990699172019958,
      "learning_rate": 4.345e-05,
      "loss": 0.1845,
      "step": 1310
    },
    {
      "epoch": 0.132,
      "grad_norm": 0.5047340393066406,
      "learning_rate": 4.3400000000000005e-05,
      "loss": 0.1725,
      "step": 1320
    },
    {
      "epoch": 0.133,
      "grad_norm": 0.6830994486808777,
      "learning_rate": 4.335e-05,
      "loss": 0.1687,
      "step": 1330
    },
    {
      "epoch": 0.134,
      "grad_norm": 0.5861710906028748,
      "learning_rate": 4.33e-05,
      "loss": 0.1671,
      "step": 1340
    },
    {
      "epoch": 0.135,
      "grad_norm": 0.43594300746917725,
      "learning_rate": 4.325e-05,
      "loss": 0.1467,
      "step": 1350
    },
    {
      "epoch": 0.136,
      "grad_norm": 0.44587692618370056,
      "learning_rate": 4.32e-05,
      "loss": 0.1509,
      "step": 1360
    },
    {
      "epoch": 0.137,
      "grad_norm": 0.5523977875709534,
      "learning_rate": 4.315e-05,
      "loss": 0.1434,
      "step": 1370
    },
    {
      "epoch": 0.138,
      "grad_norm": 0.6139170527458191,
      "learning_rate": 4.3100000000000004e-05,
      "loss": 0.1433,
      "step": 1380
    },
    {
      "epoch": 0.139,
      "grad_norm": 0.6169497966766357,
      "learning_rate": 4.305e-05,
      "loss": 0.1365,
      "step": 1390
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.49120134115219116,
      "learning_rate": 4.3e-05,
      "loss": 0.1287,
      "step": 1400
    },
    {
      "epoch": 0.141,
      "grad_norm": 0.451753169298172,
      "learning_rate": 4.295e-05,
      "loss": 0.1142,
      "step": 1410
    },
    {
      "epoch": 0.142,
      "grad_norm": 0.5429627895355225,
      "learning_rate": 4.29e-05,
      "loss": 0.134,
      "step": 1420
    },
    {
      "epoch": 0.143,
      "grad_norm": 0.7613041400909424,
      "learning_rate": 4.285e-05,
      "loss": 0.1391,
      "step": 1430
    },
    {
      "epoch": 0.144,
      "grad_norm": 0.4953358471393585,
      "learning_rate": 4.2800000000000004e-05,
      "loss": 0.1197,
      "step": 1440
    },
    {
      "epoch": 0.145,
      "grad_norm": 0.3657626509666443,
      "learning_rate": 4.275e-05,
      "loss": 0.1071,
      "step": 1450
    },
    {
      "epoch": 0.146,
      "grad_norm": 0.44240206480026245,
      "learning_rate": 4.27e-05,
      "loss": 0.1111,
      "step": 1460
    },
    {
      "epoch": 0.147,
      "grad_norm": 0.5007165670394897,
      "learning_rate": 4.265e-05,
      "loss": 0.1056,
      "step": 1470
    },
    {
      "epoch": 0.148,
      "grad_norm": 0.4580256938934326,
      "learning_rate": 4.26e-05,
      "loss": 0.1049,
      "step": 1480
    },
    {
      "epoch": 0.149,
      "grad_norm": 0.4970822036266327,
      "learning_rate": 4.2550000000000004e-05,
      "loss": 0.1032,
      "step": 1490
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.4138182997703552,
      "learning_rate": 4.25e-05,
      "loss": 0.0961,
      "step": 1500
    },
    {
      "epoch": 0.151,
      "grad_norm": 0.4013712406158447,
      "learning_rate": 4.245e-05,
      "loss": 0.0949,
      "step": 1510
    },
    {
      "epoch": 0.152,
      "grad_norm": 0.3868940770626068,
      "learning_rate": 4.24e-05,
      "loss": 0.0837,
      "step": 1520
    },
    {
      "epoch": 0.153,
      "grad_norm": 0.3113015294075012,
      "learning_rate": 4.235e-05,
      "loss": 0.0909,
      "step": 1530
    },
    {
      "epoch": 0.154,
      "grad_norm": 0.3569623529911041,
      "learning_rate": 4.23e-05,
      "loss": 0.0908,
      "step": 1540
    },
    {
      "epoch": 0.155,
      "grad_norm": 0.3841746151447296,
      "learning_rate": 4.2250000000000004e-05,
      "loss": 0.0806,
      "step": 1550
    },
    {
      "epoch": 0.156,
      "grad_norm": 0.6565550565719604,
      "learning_rate": 4.22e-05,
      "loss": 0.075,
      "step": 1560
    },
    {
      "epoch": 0.157,
      "grad_norm": 0.4816874563694,
      "learning_rate": 4.215e-05,
      "loss": 0.0858,
      "step": 1570
    },
    {
      "epoch": 0.158,
      "grad_norm": 0.30408933758735657,
      "learning_rate": 4.21e-05,
      "loss": 0.0704,
      "step": 1580
    },
    {
      "epoch": 0.159,
      "grad_norm": 0.43388792872428894,
      "learning_rate": 4.205e-05,
      "loss": 0.0671,
      "step": 1590
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.33304253220558167,
      "learning_rate": 4.2e-05,
      "loss": 0.07,
      "step": 1600
    },
    {
      "epoch": 0.161,
      "grad_norm": 0.4260387420654297,
      "learning_rate": 4.195e-05,
      "loss": 0.0691,
      "step": 1610
    },
    {
      "epoch": 0.162,
      "grad_norm": 0.37930798530578613,
      "learning_rate": 4.19e-05,
      "loss": 0.0715,
      "step": 1620
    },
    {
      "epoch": 0.163,
      "grad_norm": 0.3198983669281006,
      "learning_rate": 4.185e-05,
      "loss": 0.0651,
      "step": 1630
    },
    {
      "epoch": 0.164,
      "grad_norm": 0.3510359823703766,
      "learning_rate": 4.18e-05,
      "loss": 0.058,
      "step": 1640
    },
    {
      "epoch": 0.165,
      "grad_norm": 0.41047966480255127,
      "learning_rate": 4.175e-05,
      "loss": 0.065,
      "step": 1650
    },
    {
      "epoch": 0.166,
      "grad_norm": 0.3054174482822418,
      "learning_rate": 4.17e-05,
      "loss": 0.0564,
      "step": 1660
    },
    {
      "epoch": 0.167,
      "grad_norm": 0.29319772124290466,
      "learning_rate": 4.165e-05,
      "loss": 0.0599,
      "step": 1670
    },
    {
      "epoch": 0.168,
      "grad_norm": 0.257354736328125,
      "learning_rate": 4.16e-05,
      "loss": 0.0536,
      "step": 1680
    },
    {
      "epoch": 0.169,
      "grad_norm": 0.25215694308280945,
      "learning_rate": 4.155e-05,
      "loss": 0.0587,
      "step": 1690
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.4573931097984314,
      "learning_rate": 4.15e-05,
      "loss": 0.0524,
      "step": 1700
    },
    {
      "epoch": 0.171,
      "grad_norm": 0.3514876663684845,
      "learning_rate": 4.145e-05,
      "loss": 0.0551,
      "step": 1710
    },
    {
      "epoch": 0.172,
      "grad_norm": 0.3239930272102356,
      "learning_rate": 4.14e-05,
      "loss": 0.0499,
      "step": 1720
    },
    {
      "epoch": 0.173,
      "grad_norm": 0.20213039219379425,
      "learning_rate": 4.135e-05,
      "loss": 0.0521,
      "step": 1730
    },
    {
      "epoch": 0.174,
      "grad_norm": 0.21831783652305603,
      "learning_rate": 4.13e-05,
      "loss": 0.0469,
      "step": 1740
    },
    {
      "epoch": 0.175,
      "grad_norm": 0.2585163712501526,
      "learning_rate": 4.125e-05,
      "loss": 0.0469,
      "step": 1750
    },
    {
      "epoch": 0.176,
      "grad_norm": 0.21717113256454468,
      "learning_rate": 4.12e-05,
      "loss": 0.0455,
      "step": 1760
    },
    {
      "epoch": 0.177,
      "grad_norm": 0.27248838543891907,
      "learning_rate": 4.115e-05,
      "loss": 0.046,
      "step": 1770
    },
    {
      "epoch": 0.178,
      "grad_norm": 0.2503461241722107,
      "learning_rate": 4.11e-05,
      "loss": 0.0447,
      "step": 1780
    },
    {
      "epoch": 0.179,
      "grad_norm": 0.27404382824897766,
      "learning_rate": 4.105e-05,
      "loss": 0.0437,
      "step": 1790
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.23549066483974457,
      "learning_rate": 4.1e-05,
      "loss": 0.0423,
      "step": 1800
    },
    {
      "epoch": 0.181,
      "grad_norm": 0.19369937479496002,
      "learning_rate": 4.095e-05,
      "loss": 0.0408,
      "step": 1810
    },
    {
      "epoch": 0.182,
      "grad_norm": 0.20560242235660553,
      "learning_rate": 4.09e-05,
      "loss": 0.0379,
      "step": 1820
    },
    {
      "epoch": 0.183,
      "grad_norm": 0.34989863634109497,
      "learning_rate": 4.085e-05,
      "loss": 0.0364,
      "step": 1830
    },
    {
      "epoch": 0.184,
      "grad_norm": 0.2310326248407364,
      "learning_rate": 4.08e-05,
      "loss": 0.0385,
      "step": 1840
    },
    {
      "epoch": 0.185,
      "grad_norm": 0.21055462956428528,
      "learning_rate": 4.075e-05,
      "loss": 0.0351,
      "step": 1850
    },
    {
      "epoch": 0.186,
      "grad_norm": 0.3251895308494568,
      "learning_rate": 4.07e-05,
      "loss": 0.0381,
      "step": 1860
    },
    {
      "epoch": 0.187,
      "grad_norm": 0.2887445390224457,
      "learning_rate": 4.065e-05,
      "loss": 0.0341,
      "step": 1870
    },
    {
      "epoch": 0.188,
      "grad_norm": 0.15948843955993652,
      "learning_rate": 4.0600000000000004e-05,
      "loss": 0.0313,
      "step": 1880
    },
    {
      "epoch": 0.189,
      "grad_norm": 0.2413359135389328,
      "learning_rate": 4.055e-05,
      "loss": 0.0338,
      "step": 1890
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.2132706493139267,
      "learning_rate": 4.05e-05,
      "loss": 0.0339,
      "step": 1900
    },
    {
      "epoch": 0.191,
      "grad_norm": 0.17968431115150452,
      "learning_rate": 4.045000000000001e-05,
      "loss": 0.0317,
      "step": 1910
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.15828929841518402,
      "learning_rate": 4.0400000000000006e-05,
      "loss": 0.0302,
      "step": 1920
    },
    {
      "epoch": 0.193,
      "grad_norm": 0.18106874823570251,
      "learning_rate": 4.0350000000000005e-05,
      "loss": 0.0331,
      "step": 1930
    },
    {
      "epoch": 0.194,
      "grad_norm": 0.34827324748039246,
      "learning_rate": 4.0300000000000004e-05,
      "loss": 0.032,
      "step": 1940
    },
    {
      "epoch": 0.195,
      "grad_norm": 0.21621111035346985,
      "learning_rate": 4.025e-05,
      "loss": 0.0317,
      "step": 1950
    },
    {
      "epoch": 0.196,
      "grad_norm": 0.2159423679113388,
      "learning_rate": 4.02e-05,
      "loss": 0.0296,
      "step": 1960
    },
    {
      "epoch": 0.197,
      "grad_norm": 0.17750391364097595,
      "learning_rate": 4.015000000000001e-05,
      "loss": 0.0297,
      "step": 1970
    },
    {
      "epoch": 0.198,
      "grad_norm": 0.13952311873435974,
      "learning_rate": 4.0100000000000006e-05,
      "loss": 0.0279,
      "step": 1980
    },
    {
      "epoch": 0.199,
      "grad_norm": 0.19622887670993805,
      "learning_rate": 4.0050000000000004e-05,
      "loss": 0.0278,
      "step": 1990
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.14959514141082764,
      "learning_rate": 4e-05,
      "loss": 0.0251,
      "step": 2000
    },
    {
      "epoch": 0.201,
      "grad_norm": 0.17456738650798798,
      "learning_rate": 3.995e-05,
      "loss": 0.0315,
      "step": 2010
    },
    {
      "epoch": 0.202,
      "grad_norm": 0.15893588960170746,
      "learning_rate": 3.99e-05,
      "loss": 0.0243,
      "step": 2020
    },
    {
      "epoch": 0.203,
      "grad_norm": 0.14638105034828186,
      "learning_rate": 3.9850000000000006e-05,
      "loss": 0.0247,
      "step": 2030
    },
    {
      "epoch": 0.204,
      "grad_norm": 0.1714017242193222,
      "learning_rate": 3.9800000000000005e-05,
      "loss": 0.0252,
      "step": 2040
    },
    {
      "epoch": 0.205,
      "grad_norm": 0.18679572641849518,
      "learning_rate": 3.9750000000000004e-05,
      "loss": 0.0234,
      "step": 2050
    },
    {
      "epoch": 0.206,
      "grad_norm": 0.10623681545257568,
      "learning_rate": 3.97e-05,
      "loss": 0.0256,
      "step": 2060
    },
    {
      "epoch": 0.207,
      "grad_norm": 0.18566076457500458,
      "learning_rate": 3.965e-05,
      "loss": 0.0238,
      "step": 2070
    },
    {
      "epoch": 0.208,
      "grad_norm": 0.12487553805112839,
      "learning_rate": 3.960000000000001e-05,
      "loss": 0.0226,
      "step": 2080
    },
    {
      "epoch": 0.209,
      "grad_norm": 0.13191473484039307,
      "learning_rate": 3.9550000000000006e-05,
      "loss": 0.0232,
      "step": 2090
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.225613072514534,
      "learning_rate": 3.9500000000000005e-05,
      "loss": 0.0226,
      "step": 2100
    },
    {
      "epoch": 0.211,
      "grad_norm": 0.10896781831979752,
      "learning_rate": 3.9450000000000003e-05,
      "loss": 0.0206,
      "step": 2110
    },
    {
      "epoch": 0.212,
      "grad_norm": 0.16153796017169952,
      "learning_rate": 3.94e-05,
      "loss": 0.0195,
      "step": 2120
    },
    {
      "epoch": 0.213,
      "grad_norm": 0.19171251356601715,
      "learning_rate": 3.935e-05,
      "loss": 0.0203,
      "step": 2130
    },
    {
      "epoch": 0.214,
      "grad_norm": 0.13199982047080994,
      "learning_rate": 3.9300000000000007e-05,
      "loss": 0.0194,
      "step": 2140
    },
    {
      "epoch": 0.215,
      "grad_norm": 0.12839478254318237,
      "learning_rate": 3.9250000000000005e-05,
      "loss": 0.0227,
      "step": 2150
    },
    {
      "epoch": 0.216,
      "grad_norm": 0.10787441581487656,
      "learning_rate": 3.9200000000000004e-05,
      "loss": 0.0195,
      "step": 2160
    },
    {
      "epoch": 0.217,
      "grad_norm": 0.1551046371459961,
      "learning_rate": 3.915e-05,
      "loss": 0.019,
      "step": 2170
    },
    {
      "epoch": 0.218,
      "grad_norm": 0.18844197690486908,
      "learning_rate": 3.91e-05,
      "loss": 0.02,
      "step": 2180
    },
    {
      "epoch": 0.219,
      "grad_norm": 0.21247665584087372,
      "learning_rate": 3.905e-05,
      "loss": 0.0206,
      "step": 2190
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.11881183087825775,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.0176,
      "step": 2200
    },
    {
      "epoch": 0.221,
      "grad_norm": 0.16291823983192444,
      "learning_rate": 3.8950000000000005e-05,
      "loss": 0.0178,
      "step": 2210
    },
    {
      "epoch": 0.222,
      "grad_norm": 0.14063787460327148,
      "learning_rate": 3.8900000000000004e-05,
      "loss": 0.0177,
      "step": 2220
    },
    {
      "epoch": 0.223,
      "grad_norm": 0.15583930909633636,
      "learning_rate": 3.885e-05,
      "loss": 0.0185,
      "step": 2230
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.16128882765769958,
      "learning_rate": 3.88e-05,
      "loss": 0.0168,
      "step": 2240
    },
    {
      "epoch": 0.225,
      "grad_norm": 0.1588473916053772,
      "learning_rate": 3.875e-05,
      "loss": 0.0167,
      "step": 2250
    },
    {
      "epoch": 0.226,
      "grad_norm": 0.10487533360719681,
      "learning_rate": 3.8700000000000006e-05,
      "loss": 0.0154,
      "step": 2260
    },
    {
      "epoch": 0.227,
      "grad_norm": 0.2638506591320038,
      "learning_rate": 3.8650000000000004e-05,
      "loss": 0.0179,
      "step": 2270
    },
    {
      "epoch": 0.228,
      "grad_norm": 0.12504911422729492,
      "learning_rate": 3.86e-05,
      "loss": 0.016,
      "step": 2280
    },
    {
      "epoch": 0.229,
      "grad_norm": 0.11655262857675552,
      "learning_rate": 3.855e-05,
      "loss": 0.0164,
      "step": 2290
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.10052930563688278,
      "learning_rate": 3.85e-05,
      "loss": 0.0143,
      "step": 2300
    },
    {
      "epoch": 0.231,
      "grad_norm": 0.07682032138109207,
      "learning_rate": 3.845e-05,
      "loss": 0.0165,
      "step": 2310
    },
    {
      "epoch": 0.232,
      "grad_norm": 0.12146533280611038,
      "learning_rate": 3.8400000000000005e-05,
      "loss": 0.0147,
      "step": 2320
    },
    {
      "epoch": 0.233,
      "grad_norm": 0.16349685192108154,
      "learning_rate": 3.8350000000000004e-05,
      "loss": 0.0166,
      "step": 2330
    },
    {
      "epoch": 0.234,
      "grad_norm": 0.10822432488203049,
      "learning_rate": 3.83e-05,
      "loss": 0.0146,
      "step": 2340
    },
    {
      "epoch": 0.235,
      "grad_norm": 0.0805143415927887,
      "learning_rate": 3.825e-05,
      "loss": 0.0131,
      "step": 2350
    },
    {
      "epoch": 0.236,
      "grad_norm": 0.08285068720579147,
      "learning_rate": 3.82e-05,
      "loss": 0.0134,
      "step": 2360
    },
    {
      "epoch": 0.237,
      "grad_norm": 0.0886882022023201,
      "learning_rate": 3.8150000000000006e-05,
      "loss": 0.0133,
      "step": 2370
    },
    {
      "epoch": 0.238,
      "grad_norm": 0.08793161064386368,
      "learning_rate": 3.8100000000000005e-05,
      "loss": 0.0142,
      "step": 2380
    },
    {
      "epoch": 0.239,
      "grad_norm": 0.08325997740030289,
      "learning_rate": 3.805e-05,
      "loss": 0.012,
      "step": 2390
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.10990972816944122,
      "learning_rate": 3.8e-05,
      "loss": 0.0134,
      "step": 2400
    },
    {
      "epoch": 0.241,
      "grad_norm": 0.06695697456598282,
      "learning_rate": 3.795e-05,
      "loss": 0.0119,
      "step": 2410
    },
    {
      "epoch": 0.242,
      "grad_norm": 0.08304648846387863,
      "learning_rate": 3.79e-05,
      "loss": 0.0128,
      "step": 2420
    },
    {
      "epoch": 0.243,
      "grad_norm": 0.09513472020626068,
      "learning_rate": 3.7850000000000005e-05,
      "loss": 0.0146,
      "step": 2430
    },
    {
      "epoch": 0.244,
      "grad_norm": 0.07892587035894394,
      "learning_rate": 3.7800000000000004e-05,
      "loss": 0.0116,
      "step": 2440
    },
    {
      "epoch": 0.245,
      "grad_norm": 0.12630197405815125,
      "learning_rate": 3.775e-05,
      "loss": 0.0132,
      "step": 2450
    },
    {
      "epoch": 0.246,
      "grad_norm": 0.08250122517347336,
      "learning_rate": 3.77e-05,
      "loss": 0.013,
      "step": 2460
    },
    {
      "epoch": 0.247,
      "grad_norm": 0.09903154522180557,
      "learning_rate": 3.765e-05,
      "loss": 0.0117,
      "step": 2470
    },
    {
      "epoch": 0.248,
      "grad_norm": 0.09059987217187881,
      "learning_rate": 3.76e-05,
      "loss": 0.0111,
      "step": 2480
    },
    {
      "epoch": 0.249,
      "grad_norm": 0.09777077287435532,
      "learning_rate": 3.7550000000000005e-05,
      "loss": 0.0142,
      "step": 2490
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.1801980435848236,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.0121,
      "step": 2500
    },
    {
      "epoch": 0.251,
      "grad_norm": 0.08936703950166702,
      "learning_rate": 3.745e-05,
      "loss": 0.0112,
      "step": 2510
    },
    {
      "epoch": 0.252,
      "grad_norm": 0.09601296484470367,
      "learning_rate": 3.74e-05,
      "loss": 0.0116,
      "step": 2520
    },
    {
      "epoch": 0.253,
      "grad_norm": 0.08924739062786102,
      "learning_rate": 3.735e-05,
      "loss": 0.0119,
      "step": 2530
    },
    {
      "epoch": 0.254,
      "grad_norm": 0.07558383047580719,
      "learning_rate": 3.73e-05,
      "loss": 0.0108,
      "step": 2540
    },
    {
      "epoch": 0.255,
      "grad_norm": 0.05701779946684837,
      "learning_rate": 3.7250000000000004e-05,
      "loss": 0.011,
      "step": 2550
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.0955379456281662,
      "learning_rate": 3.72e-05,
      "loss": 0.0112,
      "step": 2560
    },
    {
      "epoch": 0.257,
      "grad_norm": 0.07837249338626862,
      "learning_rate": 3.715e-05,
      "loss": 0.0111,
      "step": 2570
    },
    {
      "epoch": 0.258,
      "grad_norm": 0.09438953548669815,
      "learning_rate": 3.71e-05,
      "loss": 0.0121,
      "step": 2580
    },
    {
      "epoch": 0.259,
      "grad_norm": 0.08802532404661179,
      "learning_rate": 3.705e-05,
      "loss": 0.0098,
      "step": 2590
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.0785641148686409,
      "learning_rate": 3.7e-05,
      "loss": 0.0106,
      "step": 2600
    },
    {
      "epoch": 0.261,
      "grad_norm": 0.10036404430866241,
      "learning_rate": 3.6950000000000004e-05,
      "loss": 0.0107,
      "step": 2610
    },
    {
      "epoch": 0.262,
      "grad_norm": 0.0663432776927948,
      "learning_rate": 3.69e-05,
      "loss": 0.0098,
      "step": 2620
    },
    {
      "epoch": 0.263,
      "grad_norm": 0.06886564195156097,
      "learning_rate": 3.685e-05,
      "loss": 0.0094,
      "step": 2630
    },
    {
      "epoch": 0.264,
      "grad_norm": 0.06641737371683121,
      "learning_rate": 3.68e-05,
      "loss": 0.0112,
      "step": 2640
    },
    {
      "epoch": 0.265,
      "grad_norm": 0.1470363438129425,
      "learning_rate": 3.675e-05,
      "loss": 0.0118,
      "step": 2650
    },
    {
      "epoch": 0.266,
      "grad_norm": 0.08694775402545929,
      "learning_rate": 3.6700000000000004e-05,
      "loss": 0.0105,
      "step": 2660
    },
    {
      "epoch": 0.267,
      "grad_norm": 0.08168693631887436,
      "learning_rate": 3.665e-05,
      "loss": 0.0101,
      "step": 2670
    },
    {
      "epoch": 0.268,
      "grad_norm": 0.06114206463098526,
      "learning_rate": 3.66e-05,
      "loss": 0.0097,
      "step": 2680
    },
    {
      "epoch": 0.269,
      "grad_norm": 0.09011895209550858,
      "learning_rate": 3.655e-05,
      "loss": 0.0101,
      "step": 2690
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.06499819457530975,
      "learning_rate": 3.65e-05,
      "loss": 0.0089,
      "step": 2700
    },
    {
      "epoch": 0.271,
      "grad_norm": 0.08157055824995041,
      "learning_rate": 3.645e-05,
      "loss": 0.0099,
      "step": 2710
    },
    {
      "epoch": 0.272,
      "grad_norm": 0.06255003809928894,
      "learning_rate": 3.6400000000000004e-05,
      "loss": 0.0091,
      "step": 2720
    },
    {
      "epoch": 0.273,
      "grad_norm": 0.13641099631786346,
      "learning_rate": 3.635e-05,
      "loss": 0.01,
      "step": 2730
    },
    {
      "epoch": 0.274,
      "grad_norm": 0.06449054926633835,
      "learning_rate": 3.63e-05,
      "loss": 0.0087,
      "step": 2740
    },
    {
      "epoch": 0.275,
      "grad_norm": 0.09242594987154007,
      "learning_rate": 3.625e-05,
      "loss": 0.0084,
      "step": 2750
    },
    {
      "epoch": 0.276,
      "grad_norm": 0.14216932654380798,
      "learning_rate": 3.62e-05,
      "loss": 0.01,
      "step": 2760
    },
    {
      "epoch": 0.277,
      "grad_norm": 0.11992328613996506,
      "learning_rate": 3.615e-05,
      "loss": 0.0107,
      "step": 2770
    },
    {
      "epoch": 0.278,
      "grad_norm": 0.10537979751825333,
      "learning_rate": 3.61e-05,
      "loss": 0.0101,
      "step": 2780
    },
    {
      "epoch": 0.279,
      "grad_norm": 0.06420467048883438,
      "learning_rate": 3.605e-05,
      "loss": 0.0088,
      "step": 2790
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.10813489556312561,
      "learning_rate": 3.6e-05,
      "loss": 0.0093,
      "step": 2800
    },
    {
      "epoch": 0.281,
      "grad_norm": 0.05735234543681145,
      "learning_rate": 3.595e-05,
      "loss": 0.0087,
      "step": 2810
    },
    {
      "epoch": 0.282,
      "grad_norm": 0.05712522938847542,
      "learning_rate": 3.59e-05,
      "loss": 0.0082,
      "step": 2820
    },
    {
      "epoch": 0.283,
      "grad_norm": 0.07710873335599899,
      "learning_rate": 3.585e-05,
      "loss": 0.0088,
      "step": 2830
    },
    {
      "epoch": 0.284,
      "grad_norm": 0.11007268726825714,
      "learning_rate": 3.58e-05,
      "loss": 0.0075,
      "step": 2840
    },
    {
      "epoch": 0.285,
      "grad_norm": 0.07825978100299835,
      "learning_rate": 3.575e-05,
      "loss": 0.0089,
      "step": 2850
    },
    {
      "epoch": 0.286,
      "grad_norm": 0.06950812041759491,
      "learning_rate": 3.57e-05,
      "loss": 0.0077,
      "step": 2860
    },
    {
      "epoch": 0.287,
      "grad_norm": 0.052572544664144516,
      "learning_rate": 3.565e-05,
      "loss": 0.0076,
      "step": 2870
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.06588669121265411,
      "learning_rate": 3.56e-05,
      "loss": 0.0073,
      "step": 2880
    },
    {
      "epoch": 0.289,
      "grad_norm": 0.052969031035900116,
      "learning_rate": 3.555e-05,
      "loss": 0.0066,
      "step": 2890
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.04204658418893814,
      "learning_rate": 3.55e-05,
      "loss": 0.0064,
      "step": 2900
    },
    {
      "epoch": 0.291,
      "grad_norm": 0.04765693470835686,
      "learning_rate": 3.545e-05,
      "loss": 0.0067,
      "step": 2910
    },
    {
      "epoch": 0.292,
      "grad_norm": 0.06796044856309891,
      "learning_rate": 3.54e-05,
      "loss": 0.0069,
      "step": 2920
    },
    {
      "epoch": 0.293,
      "grad_norm": 0.12173280119895935,
      "learning_rate": 3.535e-05,
      "loss": 0.0126,
      "step": 2930
    },
    {
      "epoch": 0.294,
      "grad_norm": 0.09393921494483948,
      "learning_rate": 3.53e-05,
      "loss": 0.0096,
      "step": 2940
    },
    {
      "epoch": 0.295,
      "grad_norm": 0.08246493339538574,
      "learning_rate": 3.525e-05,
      "loss": 0.0084,
      "step": 2950
    },
    {
      "epoch": 0.296,
      "grad_norm": 0.04482726752758026,
      "learning_rate": 3.52e-05,
      "loss": 0.0073,
      "step": 2960
    },
    {
      "epoch": 0.297,
      "grad_norm": 0.1147686317563057,
      "learning_rate": 3.515e-05,
      "loss": 0.0094,
      "step": 2970
    },
    {
      "epoch": 0.298,
      "grad_norm": 0.09143181890249252,
      "learning_rate": 3.51e-05,
      "loss": 0.0075,
      "step": 2980
    },
    {
      "epoch": 0.299,
      "grad_norm": 0.05911434814333916,
      "learning_rate": 3.505e-05,
      "loss": 0.0075,
      "step": 2990
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.04372965916991234,
      "learning_rate": 3.5e-05,
      "loss": 0.0072,
      "step": 3000
    },
    {
      "epoch": 0.301,
      "grad_norm": 0.05518479272723198,
      "learning_rate": 3.495e-05,
      "loss": 0.0068,
      "step": 3010
    },
    {
      "epoch": 0.302,
      "grad_norm": 0.04555105045437813,
      "learning_rate": 3.49e-05,
      "loss": 0.0064,
      "step": 3020
    },
    {
      "epoch": 0.303,
      "grad_norm": 0.03831150382757187,
      "learning_rate": 3.485e-05,
      "loss": 0.007,
      "step": 3030
    },
    {
      "epoch": 0.304,
      "grad_norm": 0.04596749320626259,
      "learning_rate": 3.48e-05,
      "loss": 0.0065,
      "step": 3040
    },
    {
      "epoch": 0.305,
      "grad_norm": 0.07694078236818314,
      "learning_rate": 3.475e-05,
      "loss": 0.0059,
      "step": 3050
    },
    {
      "epoch": 0.306,
      "grad_norm": 0.12307348102331161,
      "learning_rate": 3.4699999999999996e-05,
      "loss": 0.0099,
      "step": 3060
    },
    {
      "epoch": 0.307,
      "grad_norm": 0.059611763805150986,
      "learning_rate": 3.465e-05,
      "loss": 0.0067,
      "step": 3070
    },
    {
      "epoch": 0.308,
      "grad_norm": 0.07357806712388992,
      "learning_rate": 3.46e-05,
      "loss": 0.007,
      "step": 3080
    },
    {
      "epoch": 0.309,
      "grad_norm": 0.060446444898843765,
      "learning_rate": 3.455e-05,
      "loss": 0.0063,
      "step": 3090
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.05178246274590492,
      "learning_rate": 3.45e-05,
      "loss": 0.0064,
      "step": 3100
    },
    {
      "epoch": 0.311,
      "grad_norm": 0.4560135006904602,
      "learning_rate": 3.445e-05,
      "loss": 0.0076,
      "step": 3110
    },
    {
      "epoch": 0.312,
      "grad_norm": 0.10910088568925858,
      "learning_rate": 3.4399999999999996e-05,
      "loss": 0.0082,
      "step": 3120
    },
    {
      "epoch": 0.313,
      "grad_norm": 0.05087321624159813,
      "learning_rate": 3.435e-05,
      "loss": 0.0061,
      "step": 3130
    },
    {
      "epoch": 0.314,
      "grad_norm": 0.055152568966150284,
      "learning_rate": 3.430000000000001e-05,
      "loss": 0.0065,
      "step": 3140
    },
    {
      "epoch": 0.315,
      "grad_norm": 0.48375555872917175,
      "learning_rate": 3.4250000000000006e-05,
      "loss": 0.0081,
      "step": 3150
    },
    {
      "epoch": 0.316,
      "grad_norm": 0.12223263829946518,
      "learning_rate": 3.4200000000000005e-05,
      "loss": 0.0098,
      "step": 3160
    },
    {
      "epoch": 0.317,
      "grad_norm": 0.15452681481838226,
      "learning_rate": 3.415e-05,
      "loss": 0.0087,
      "step": 3170
    },
    {
      "epoch": 0.318,
      "grad_norm": 0.06133843585848808,
      "learning_rate": 3.41e-05,
      "loss": 0.0087,
      "step": 3180
    },
    {
      "epoch": 0.319,
      "grad_norm": 0.04037950187921524,
      "learning_rate": 3.405e-05,
      "loss": 0.0062,
      "step": 3190
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.05707933381199837,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.0061,
      "step": 3200
    },
    {
      "epoch": 0.321,
      "grad_norm": 0.04124099016189575,
      "learning_rate": 3.3950000000000005e-05,
      "loss": 0.0058,
      "step": 3210
    },
    {
      "epoch": 0.322,
      "grad_norm": 0.12988638877868652,
      "learning_rate": 3.3900000000000004e-05,
      "loss": 0.0062,
      "step": 3220
    },
    {
      "epoch": 0.323,
      "grad_norm": 0.04961306229233742,
      "learning_rate": 3.385e-05,
      "loss": 0.0057,
      "step": 3230
    },
    {
      "epoch": 0.324,
      "grad_norm": 0.05354069173336029,
      "learning_rate": 3.38e-05,
      "loss": 0.0062,
      "step": 3240
    },
    {
      "epoch": 0.325,
      "grad_norm": 0.04944461211562157,
      "learning_rate": 3.375000000000001e-05,
      "loss": 0.0061,
      "step": 3250
    },
    {
      "epoch": 0.326,
      "grad_norm": 0.03429180383682251,
      "learning_rate": 3.3700000000000006e-05,
      "loss": 0.0055,
      "step": 3260
    },
    {
      "epoch": 0.327,
      "grad_norm": 0.05271946266293526,
      "learning_rate": 3.3650000000000005e-05,
      "loss": 0.0054,
      "step": 3270
    },
    {
      "epoch": 0.328,
      "grad_norm": 0.03602539002895355,
      "learning_rate": 3.3600000000000004e-05,
      "loss": 0.0049,
      "step": 3280
    },
    {
      "epoch": 0.329,
      "grad_norm": 0.03325178474187851,
      "learning_rate": 3.355e-05,
      "loss": 0.0052,
      "step": 3290
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.03728079795837402,
      "learning_rate": 3.35e-05,
      "loss": 0.0057,
      "step": 3300
    },
    {
      "epoch": 0.331,
      "grad_norm": 0.053768668323755264,
      "learning_rate": 3.345000000000001e-05,
      "loss": 0.0055,
      "step": 3310
    },
    {
      "epoch": 0.332,
      "grad_norm": 0.054501548409461975,
      "learning_rate": 3.3400000000000005e-05,
      "loss": 0.0053,
      "step": 3320
    },
    {
      "epoch": 0.333,
      "grad_norm": 0.05519956722855568,
      "learning_rate": 3.3350000000000004e-05,
      "loss": 0.0053,
      "step": 3330
    },
    {
      "epoch": 0.334,
      "grad_norm": 0.05373954027891159,
      "learning_rate": 3.33e-05,
      "loss": 0.006,
      "step": 3340
    },
    {
      "epoch": 0.335,
      "grad_norm": 0.04272560030221939,
      "learning_rate": 3.325e-05,
      "loss": 0.0057,
      "step": 3350
    },
    {
      "epoch": 0.336,
      "grad_norm": 0.047061894088983536,
      "learning_rate": 3.32e-05,
      "loss": 0.0048,
      "step": 3360
    },
    {
      "epoch": 0.337,
      "grad_norm": 0.032794494181871414,
      "learning_rate": 3.3150000000000006e-05,
      "loss": 0.0046,
      "step": 3370
    },
    {
      "epoch": 0.338,
      "grad_norm": 0.027148200199007988,
      "learning_rate": 3.3100000000000005e-05,
      "loss": 0.0046,
      "step": 3380
    },
    {
      "epoch": 0.339,
      "grad_norm": 0.035516317933797836,
      "learning_rate": 3.3050000000000004e-05,
      "loss": 0.0048,
      "step": 3390
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.046294230967760086,
      "learning_rate": 3.3e-05,
      "loss": 0.0063,
      "step": 3400
    },
    {
      "epoch": 0.341,
      "grad_norm": 0.07840899378061295,
      "learning_rate": 3.295e-05,
      "loss": 0.0047,
      "step": 3410
    },
    {
      "epoch": 0.342,
      "grad_norm": 0.04392802715301514,
      "learning_rate": 3.29e-05,
      "loss": 0.0048,
      "step": 3420
    },
    {
      "epoch": 0.343,
      "grad_norm": 0.04237942770123482,
      "learning_rate": 3.2850000000000006e-05,
      "loss": 0.0047,
      "step": 3430
    },
    {
      "epoch": 0.344,
      "grad_norm": 0.03379204496741295,
      "learning_rate": 3.2800000000000004e-05,
      "loss": 0.005,
      "step": 3440
    },
    {
      "epoch": 0.345,
      "grad_norm": 0.3130718469619751,
      "learning_rate": 3.275e-05,
      "loss": 0.0108,
      "step": 3450
    },
    {
      "epoch": 0.346,
      "grad_norm": 0.13712112605571747,
      "learning_rate": 3.27e-05,
      "loss": 0.0095,
      "step": 3460
    },
    {
      "epoch": 0.347,
      "grad_norm": 0.12321494519710541,
      "learning_rate": 3.265e-05,
      "loss": 0.0075,
      "step": 3470
    },
    {
      "epoch": 0.348,
      "grad_norm": 0.06602940708398819,
      "learning_rate": 3.26e-05,
      "loss": 0.0062,
      "step": 3480
    },
    {
      "epoch": 0.349,
      "grad_norm": 0.08250287175178528,
      "learning_rate": 3.2550000000000005e-05,
      "loss": 0.0053,
      "step": 3490
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.04638442397117615,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.0052,
      "step": 3500
    },
    {
      "epoch": 0.351,
      "grad_norm": 0.043373119086027145,
      "learning_rate": 3.245e-05,
      "loss": 0.0053,
      "step": 3510
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.04895636439323425,
      "learning_rate": 3.24e-05,
      "loss": 0.0043,
      "step": 3520
    },
    {
      "epoch": 0.353,
      "grad_norm": 0.04256746545433998,
      "learning_rate": 3.235e-05,
      "loss": 0.0044,
      "step": 3530
    },
    {
      "epoch": 0.354,
      "grad_norm": 0.02967280149459839,
      "learning_rate": 3.2300000000000006e-05,
      "loss": 0.0046,
      "step": 3540
    },
    {
      "epoch": 0.355,
      "grad_norm": 0.02590947411954403,
      "learning_rate": 3.2250000000000005e-05,
      "loss": 0.0044,
      "step": 3550
    },
    {
      "epoch": 0.356,
      "grad_norm": 0.026240160688757896,
      "learning_rate": 3.2200000000000003e-05,
      "loss": 0.0041,
      "step": 3560
    },
    {
      "epoch": 0.357,
      "grad_norm": 0.048163361847400665,
      "learning_rate": 3.215e-05,
      "loss": 0.0048,
      "step": 3570
    },
    {
      "epoch": 0.358,
      "grad_norm": 0.04310280829668045,
      "learning_rate": 3.21e-05,
      "loss": 0.0044,
      "step": 3580
    },
    {
      "epoch": 0.359,
      "grad_norm": 0.027477843686938286,
      "learning_rate": 3.205e-05,
      "loss": 0.004,
      "step": 3590
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.02773194946348667,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 0.0046,
      "step": 3600
    },
    {
      "epoch": 0.361,
      "grad_norm": 0.027110638096928596,
      "learning_rate": 3.1950000000000004e-05,
      "loss": 0.004,
      "step": 3610
    },
    {
      "epoch": 0.362,
      "grad_norm": 0.04346521571278572,
      "learning_rate": 3.19e-05,
      "loss": 0.0039,
      "step": 3620
    },
    {
      "epoch": 0.363,
      "grad_norm": 0.024588119238615036,
      "learning_rate": 3.185e-05,
      "loss": 0.0041,
      "step": 3630
    },
    {
      "epoch": 0.364,
      "grad_norm": 0.03631160408258438,
      "learning_rate": 3.18e-05,
      "loss": 0.0039,
      "step": 3640
    },
    {
      "epoch": 0.365,
      "grad_norm": 0.028497323393821716,
      "learning_rate": 3.175e-05,
      "loss": 0.0045,
      "step": 3650
    },
    {
      "epoch": 0.366,
      "grad_norm": 0.06324070692062378,
      "learning_rate": 3.1700000000000005e-05,
      "loss": 0.004,
      "step": 3660
    },
    {
      "epoch": 0.367,
      "grad_norm": 0.22121182084083557,
      "learning_rate": 3.1650000000000004e-05,
      "loss": 0.0078,
      "step": 3670
    },
    {
      "epoch": 0.368,
      "grad_norm": 0.130402073264122,
      "learning_rate": 3.16e-05,
      "loss": 0.0068,
      "step": 3680
    },
    {
      "epoch": 0.369,
      "grad_norm": 0.11193361133337021,
      "learning_rate": 3.155e-05,
      "loss": 0.0052,
      "step": 3690
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.043261051177978516,
      "learning_rate": 3.15e-05,
      "loss": 0.0046,
      "step": 3700
    },
    {
      "epoch": 0.371,
      "grad_norm": 0.05132100731134415,
      "learning_rate": 3.145e-05,
      "loss": 0.0041,
      "step": 3710
    },
    {
      "epoch": 0.372,
      "grad_norm": 0.03498228266835213,
      "learning_rate": 3.1400000000000004e-05,
      "loss": 0.0041,
      "step": 3720
    },
    {
      "epoch": 0.373,
      "grad_norm": 0.03594733029603958,
      "learning_rate": 3.135e-05,
      "loss": 0.0037,
      "step": 3730
    },
    {
      "epoch": 0.374,
      "grad_norm": 0.05542735382914543,
      "learning_rate": 3.13e-05,
      "loss": 0.0042,
      "step": 3740
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.03302931785583496,
      "learning_rate": 3.125e-05,
      "loss": 0.0048,
      "step": 3750
    },
    {
      "epoch": 0.376,
      "grad_norm": 0.028392167761921883,
      "learning_rate": 3.12e-05,
      "loss": 0.0042,
      "step": 3760
    },
    {
      "epoch": 0.377,
      "grad_norm": 0.05274713411927223,
      "learning_rate": 3.115e-05,
      "loss": 0.0045,
      "step": 3770
    },
    {
      "epoch": 0.378,
      "grad_norm": 0.024890929460525513,
      "learning_rate": 3.1100000000000004e-05,
      "loss": 0.0038,
      "step": 3780
    },
    {
      "epoch": 0.379,
      "grad_norm": 0.02797631174325943,
      "learning_rate": 3.105e-05,
      "loss": 0.0036,
      "step": 3790
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.033390454947948456,
      "learning_rate": 3.1e-05,
      "loss": 0.0039,
      "step": 3800
    },
    {
      "epoch": 0.381,
      "grad_norm": 0.024741416797041893,
      "learning_rate": 3.095e-05,
      "loss": 0.0042,
      "step": 3810
    },
    {
      "epoch": 0.382,
      "grad_norm": 0.05398337543010712,
      "learning_rate": 3.09e-05,
      "loss": 0.0049,
      "step": 3820
    },
    {
      "epoch": 0.383,
      "grad_norm": 0.027936646714806557,
      "learning_rate": 3.0850000000000004e-05,
      "loss": 0.0034,
      "step": 3830
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.02413495071232319,
      "learning_rate": 3.08e-05,
      "loss": 0.0036,
      "step": 3840
    },
    {
      "epoch": 0.385,
      "grad_norm": 0.037689995020627975,
| "learning_rate": 3.075e-05, | |
| "loss": 0.0033, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.386, | |
| "grad_norm": 0.028174949809908867, | |
| "learning_rate": 3.07e-05, | |
| "loss": 0.0036, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.387, | |
| "grad_norm": 0.064354807138443, | |
| "learning_rate": 3.065e-05, | |
| "loss": 0.0037, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 0.028341595083475113, | |
| "learning_rate": 3.06e-05, | |
| "loss": 0.0034, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.389, | |
| "grad_norm": 0.06142325699329376, | |
| "learning_rate": 3.0550000000000004e-05, | |
| "loss": 0.004, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.03553822636604309, | |
| "learning_rate": 3.05e-05, | |
| "loss": 0.004, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.391, | |
| "grad_norm": 0.025645367801189423, | |
| "learning_rate": 3.045e-05, | |
| "loss": 0.0032, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.053947921842336655, | |
| "learning_rate": 3.04e-05, | |
| "loss": 0.0042, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.393, | |
| "grad_norm": 0.040126167237758636, | |
| "learning_rate": 3.035e-05, | |
| "loss": 0.0038, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.394, | |
| "grad_norm": 0.02956206165254116, | |
| "learning_rate": 3.03e-05, | |
| "loss": 0.0035, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 0.11952024698257446, | |
| "learning_rate": 3.025e-05, | |
| "loss": 0.0057, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 0.04155363142490387, | |
| "learning_rate": 3.02e-05, | |
| "loss": 0.0034, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.397, | |
| "grad_norm": 0.03884551301598549, | |
| "learning_rate": 3.015e-05, | |
| "loss": 0.0037, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.398, | |
| "grad_norm": 0.033869728446006775, | |
| "learning_rate": 3.01e-05, | |
| "loss": 0.0033, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.399, | |
| "grad_norm": 0.027508044615387917, | |
| "learning_rate": 3.0050000000000002e-05, | |
| "loss": 0.0038, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.019838711246848106, | |
| "learning_rate": 3e-05, | |
| "loss": 0.0034, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.401, | |
| "grad_norm": 0.042124535888433456, | |
| "learning_rate": 2.995e-05, | |
| "loss": 0.003, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.402, | |
| "grad_norm": 0.03583139553666115, | |
| "learning_rate": 2.9900000000000002e-05, | |
| "loss": 0.0029, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.403, | |
| "grad_norm": 0.024187223985791206, | |
| "learning_rate": 2.985e-05, | |
| "loss": 0.0031, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 0.02509123831987381, | |
| "learning_rate": 2.98e-05, | |
| "loss": 0.003, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 0.015798581764101982, | |
| "learning_rate": 2.975e-05, | |
| "loss": 0.003, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.406, | |
| "grad_norm": 0.01964486949145794, | |
| "learning_rate": 2.97e-05, | |
| "loss": 0.0032, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.407, | |
| "grad_norm": 0.025820232927799225, | |
| "learning_rate": 2.965e-05, | |
| "loss": 0.0032, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.03453589975833893, | |
| "learning_rate": 2.96e-05, | |
| "loss": 0.003, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.409, | |
| "grad_norm": 0.022311529144644737, | |
| "learning_rate": 2.955e-05, | |
| "loss": 0.0025, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.02296466939151287, | |
| "learning_rate": 2.95e-05, | |
| "loss": 0.003, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.411, | |
| "grad_norm": 0.022816313430666924, | |
| "learning_rate": 2.945e-05, | |
| "loss": 0.0032, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 0.021030904725193977, | |
| "learning_rate": 2.94e-05, | |
| "loss": 0.003, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.413, | |
| "grad_norm": 0.02336346171796322, | |
| "learning_rate": 2.935e-05, | |
| "loss": 0.0028, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.414, | |
| "grad_norm": 0.019582638517022133, | |
| "learning_rate": 2.93e-05, | |
| "loss": 0.0027, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 0.031429585069417953, | |
| "learning_rate": 2.925e-05, | |
| "loss": 0.0027, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.027804825454950333, | |
| "learning_rate": 2.9199999999999998e-05, | |
| "loss": 0.0027, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.417, | |
| "grad_norm": 0.022006656974554062, | |
| "learning_rate": 2.915e-05, | |
| "loss": 0.003, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.418, | |
| "grad_norm": 0.052478939294815063, | |
| "learning_rate": 2.91e-05, | |
| "loss": 0.0044, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.419, | |
| "grad_norm": 0.03854925185441971, | |
| "learning_rate": 2.9049999999999998e-05, | |
| "loss": 0.0036, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.02749469131231308, | |
| "learning_rate": 2.9e-05, | |
| "loss": 0.0033, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.421, | |
| "grad_norm": 0.01697971485555172, | |
| "learning_rate": 2.895e-05, | |
| "loss": 0.0032, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.422, | |
| "grad_norm": 0.05183997377753258, | |
| "learning_rate": 2.8899999999999998e-05, | |
| "loss": 0.0029, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.423, | |
| "grad_norm": 0.030102815479040146, | |
| "learning_rate": 2.885e-05, | |
| "loss": 0.0028, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.027241216972470284, | |
| "learning_rate": 2.88e-05, | |
| "loss": 0.0033, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.01855759136378765, | |
| "learning_rate": 2.8749999999999997e-05, | |
| "loss": 0.0024, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.426, | |
| "grad_norm": 0.019300928339362144, | |
| "learning_rate": 2.87e-05, | |
| "loss": 0.0024, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.427, | |
| "grad_norm": 0.01639522798359394, | |
| "learning_rate": 2.865e-05, | |
| "loss": 0.0026, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 0.027084793895483017, | |
| "learning_rate": 2.86e-05, | |
| "loss": 0.0027, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.429, | |
| "grad_norm": 0.021206015720963478, | |
| "learning_rate": 2.855e-05, | |
| "loss": 0.0025, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.0655827671289444, | |
| "learning_rate": 2.8499999999999998e-05, | |
| "loss": 0.0027, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.431, | |
| "grad_norm": 0.03779730945825577, | |
| "learning_rate": 2.845e-05, | |
| "loss": 0.0024, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.045267749577760696, | |
| "learning_rate": 2.84e-05, | |
| "loss": 0.0031, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.433, | |
| "grad_norm": 0.017473919317126274, | |
| "learning_rate": 2.8349999999999998e-05, | |
| "loss": 0.0027, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.434, | |
| "grad_norm": 0.019513197243213654, | |
| "learning_rate": 2.83e-05, | |
| "loss": 0.0024, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 0.01616765186190605, | |
| "learning_rate": 2.825e-05, | |
| "loss": 0.0025, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 0.02270474284887314, | |
| "learning_rate": 2.8199999999999998e-05, | |
| "loss": 0.0024, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.437, | |
| "grad_norm": 0.02363002672791481, | |
| "learning_rate": 2.815e-05, | |
| "loss": 0.0025, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.438, | |
| "grad_norm": 0.023898936808109283, | |
| "learning_rate": 2.8100000000000005e-05, | |
| "loss": 0.0023, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.439, | |
| "grad_norm": 0.01270793005824089, | |
| "learning_rate": 2.8050000000000004e-05, | |
| "loss": 0.0022, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.03917006403207779, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 0.0029, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.441, | |
| "grad_norm": 0.028284449130296707, | |
| "learning_rate": 2.7950000000000005e-05, | |
| "loss": 0.0023, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.442, | |
| "grad_norm": 0.017493903636932373, | |
| "learning_rate": 2.7900000000000004e-05, | |
| "loss": 0.0022, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.443, | |
| "grad_norm": 0.03160572797060013, | |
| "learning_rate": 2.7850000000000003e-05, | |
| "loss": 0.0025, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 0.022926049306988716, | |
| "learning_rate": 2.7800000000000005e-05, | |
| "loss": 0.0024, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 0.032902974635362625, | |
| "learning_rate": 2.7750000000000004e-05, | |
| "loss": 0.0025, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.446, | |
| "grad_norm": 0.017781972885131836, | |
| "learning_rate": 2.7700000000000002e-05, | |
| "loss": 0.0022, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.447, | |
| "grad_norm": 0.1209416389465332, | |
| "learning_rate": 2.7650000000000005e-05, | |
| "loss": 0.0028, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.03747338801622391, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 0.0024, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.449, | |
| "grad_norm": 0.04210735112428665, | |
| "learning_rate": 2.7550000000000002e-05, | |
| "loss": 0.0024, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.02306324429810047, | |
| "learning_rate": 2.7500000000000004e-05, | |
| "loss": 0.0022, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.451, | |
| "grad_norm": 0.027622856199741364, | |
| "learning_rate": 2.7450000000000003e-05, | |
| "loss": 0.0026, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 0.014202162623405457, | |
| "learning_rate": 2.7400000000000002e-05, | |
| "loss": 0.0022, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.453, | |
| "grad_norm": 0.051466915756464005, | |
| "learning_rate": 2.7350000000000004e-05, | |
| "loss": 0.0031, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.454, | |
| "grad_norm": 0.052768610417842865, | |
| "learning_rate": 2.7300000000000003e-05, | |
| "loss": 0.0028, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 0.02291076071560383, | |
| "learning_rate": 2.725e-05, | |
| "loss": 0.0023, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.027942989021539688, | |
| "learning_rate": 2.7200000000000004e-05, | |
| "loss": 0.0023, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.457, | |
| "grad_norm": 0.01529670413583517, | |
| "learning_rate": 2.7150000000000003e-05, | |
| "loss": 0.0024, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.458, | |
| "grad_norm": 0.02945224940776825, | |
| "learning_rate": 2.7100000000000005e-05, | |
| "loss": 0.0024, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.459, | |
| "grad_norm": 0.027197351679205894, | |
| "learning_rate": 2.7050000000000004e-05, | |
| "loss": 0.0024, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.022022951394319534, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 0.0023, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.461, | |
| "grad_norm": 0.019739823415875435, | |
| "learning_rate": 2.6950000000000005e-05, | |
| "loss": 0.0022, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.462, | |
| "grad_norm": 0.06794995814561844, | |
| "learning_rate": 2.6900000000000003e-05, | |
| "loss": 0.0027, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.463, | |
| "grad_norm": 0.049228962510824203, | |
| "learning_rate": 2.6850000000000002e-05, | |
| "loss": 0.0026, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.0241558700799942, | |
| "learning_rate": 2.6800000000000004e-05, | |
| "loss": 0.0021, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 0.024576248601078987, | |
| "learning_rate": 2.6750000000000003e-05, | |
| "loss": 0.0022, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.466, | |
| "grad_norm": 0.030337205156683922, | |
| "learning_rate": 2.6700000000000002e-05, | |
| "loss": 0.0024, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.467, | |
| "grad_norm": 0.015081087127327919, | |
| "learning_rate": 2.6650000000000004e-05, | |
| "loss": 0.002, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 0.026368912309408188, | |
| "learning_rate": 2.6600000000000003e-05, | |
| "loss": 0.0022, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.469, | |
| "grad_norm": 0.018447600305080414, | |
| "learning_rate": 2.655e-05, | |
| "loss": 0.0022, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.018314722925424576, | |
| "learning_rate": 2.6500000000000004e-05, | |
| "loss": 0.0019, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.471, | |
| "grad_norm": 0.02361704409122467, | |
| "learning_rate": 2.6450000000000003e-05, | |
| "loss": 0.0023, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.02032247930765152, | |
| "learning_rate": 2.64e-05, | |
| "loss": 0.002, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.473, | |
| "grad_norm": 0.017889728769659996, | |
| "learning_rate": 2.6350000000000004e-05, | |
| "loss": 0.0019, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.474, | |
| "grad_norm": 0.01962173730134964, | |
| "learning_rate": 2.6300000000000002e-05, | |
| "loss": 0.0019, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 0.015778113156557083, | |
| "learning_rate": 2.625e-05, | |
| "loss": 0.0021, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 0.01894952729344368, | |
| "learning_rate": 2.6200000000000003e-05, | |
| "loss": 0.0023, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.477, | |
| "grad_norm": 0.0239462498575449, | |
| "learning_rate": 2.6150000000000002e-05, | |
| "loss": 0.002, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.478, | |
| "grad_norm": 0.025406278669834137, | |
| "learning_rate": 2.61e-05, | |
| "loss": 0.0021, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.479, | |
| "grad_norm": 0.01813661865890026, | |
| "learning_rate": 2.6050000000000003e-05, | |
| "loss": 0.0023, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.01979188807308674, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.0019, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.481, | |
| "grad_norm": 0.02219184674322605, | |
| "learning_rate": 2.595e-05, | |
| "loss": 0.002, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.482, | |
| "grad_norm": 0.012867514044046402, | |
| "learning_rate": 2.5900000000000003e-05, | |
| "loss": 0.0017, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.483, | |
| "grad_norm": 0.014178570359945297, | |
| "learning_rate": 2.585e-05, | |
| "loss": 0.0023, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 0.013582895509898663, | |
| "learning_rate": 2.58e-05, | |
| "loss": 0.002, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 0.02718137763440609, | |
| "learning_rate": 2.5750000000000002e-05, | |
| "loss": 0.0019, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.486, | |
| "grad_norm": 0.016559738665819168, | |
| "learning_rate": 2.57e-05, | |
| "loss": 0.0019, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.487, | |
| "grad_norm": 0.01447515282779932, | |
| "learning_rate": 2.5650000000000003e-05, | |
| "loss": 0.0017, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.010251802392303944, | |
| "learning_rate": 2.5600000000000002e-05, | |
| "loss": 0.0017, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.489, | |
| "grad_norm": 0.021138856187462807, | |
| "learning_rate": 2.555e-05, | |
| "loss": 0.0023, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.026664163917303085, | |
| "learning_rate": 2.5500000000000003e-05, | |
| "loss": 0.0021, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.491, | |
| "grad_norm": 0.012794408947229385, | |
| "learning_rate": 2.5450000000000002e-05, | |
| "loss": 0.0017, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 0.013725240714848042, | |
| "learning_rate": 2.54e-05, | |
| "loss": 0.0018, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.493, | |
| "grad_norm": 0.01432815007865429, | |
| "learning_rate": 2.5350000000000003e-05, | |
| "loss": 0.0017, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.494, | |
| "grad_norm": 0.014761424623429775, | |
| "learning_rate": 2.5300000000000002e-05, | |
| "loss": 0.0019, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 0.06742983311414719, | |
| "learning_rate": 2.525e-05, | |
| "loss": 0.0025, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.024261673912405968, | |
| "learning_rate": 2.5200000000000003e-05, | |
| "loss": 0.0021, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.497, | |
| "grad_norm": 0.02116272784769535, | |
| "learning_rate": 2.515e-05, | |
| "loss": 0.002, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.498, | |
| "grad_norm": 0.014996570535004139, | |
| "learning_rate": 2.51e-05, | |
| "loss": 0.0017, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.499, | |
| "grad_norm": 0.014970551244914532, | |
| "learning_rate": 2.5050000000000002e-05, | |
| "loss": 0.002, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.01756688393652439, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.002, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.501, | |
| "grad_norm": 0.012683290056884289, | |
| "learning_rate": 2.495e-05, | |
| "loss": 0.0016, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.502, | |
| "grad_norm": 0.011495651677250862, | |
| "learning_rate": 2.4900000000000002e-05, | |
| "loss": 0.0016, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.503, | |
| "grad_norm": 0.014306634664535522, | |
| "learning_rate": 2.485e-05, | |
| "loss": 0.0018, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 0.02241896465420723, | |
| "learning_rate": 2.48e-05, | |
| "loss": 0.0021, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.505, | |
| "grad_norm": 0.017740361392498016, | |
| "learning_rate": 2.4750000000000002e-05, | |
| "loss": 0.0016, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.506, | |
| "grad_norm": 0.013199679553508759, | |
| "learning_rate": 2.47e-05, | |
| "loss": 0.0015, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.507, | |
| "grad_norm": 0.057298243045806885, | |
| "learning_rate": 2.465e-05, | |
| "loss": 0.0019, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": 0.03238265961408615, | |
| "learning_rate": 2.46e-05, | |
| "loss": 0.0026, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.509, | |
| "grad_norm": 0.04820936918258667, | |
| "learning_rate": 2.455e-05, | |
| "loss": 0.0027, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.022526515647768974, | |
| "learning_rate": 2.45e-05, | |
| "loss": 0.0018, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.511, | |
| "grad_norm": 0.1899888962507248, | |
| "learning_rate": 2.445e-05, | |
| "loss": 0.0026, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.05366889387369156, | |
| "learning_rate": 2.44e-05, | |
| "loss": 0.003, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.513, | |
| "grad_norm": 0.028939131647348404, | |
| "learning_rate": 2.435e-05, | |
| "loss": 0.0021, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.514, | |
| "grad_norm": 0.023352844640612602, | |
| "learning_rate": 2.43e-05, | |
| "loss": 0.0019, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.515, | |
| "grad_norm": 0.015283104963600636, | |
| "learning_rate": 2.425e-05, | |
| "loss": 0.0017, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": 0.0149134686216712, | |
| "learning_rate": 2.4200000000000002e-05, | |
| "loss": 0.0016, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.517, | |
| "grad_norm": 0.01739874854683876, | |
| "learning_rate": 2.415e-05, | |
| "loss": 0.0021, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.518, | |
| "grad_norm": 0.012562318705022335, | |
| "learning_rate": 2.41e-05, | |
| "loss": 0.0016, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.519, | |
| "grad_norm": 0.01181173324584961, | |
| "learning_rate": 2.4050000000000002e-05, | |
| "loss": 0.0017, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.0216183140873909, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.0017, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.521, | |
| "grad_norm": 0.014552557840943336, | |
| "learning_rate": 2.395e-05, | |
| "loss": 0.0017, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.522, | |
| "grad_norm": 0.013402258977293968, | |
| "learning_rate": 2.39e-05, | |
| "loss": 0.0015, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.523, | |
| "grad_norm": 0.017692307010293007, | |
| "learning_rate": 2.385e-05, | |
| "loss": 0.0017, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": 0.007425515912473202, | |
| "learning_rate": 2.38e-05, | |
| "loss": 0.0015, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.525, | |
| "grad_norm": 0.010397032834589481, | |
| "learning_rate": 2.375e-05, | |
| "loss": 0.0014, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.526, | |
| "grad_norm": 0.013170558027923107, | |
| "learning_rate": 2.37e-05, | |
| "loss": 0.0017, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.527, | |
| "grad_norm": 0.47324055433273315, | |
| "learning_rate": 2.365e-05, | |
| "loss": 0.0037, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.06395496428012848, | |
| "learning_rate": 2.36e-05, | |
| "loss": 0.003, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.529, | |
| "grad_norm": 0.032293129712343216, | |
| "learning_rate": 2.355e-05, | |
| "loss": 0.0022, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.021514760330319405, | |
| "learning_rate": 2.35e-05, | |
| "loss": 0.002, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.531, | |
| "grad_norm": 0.016594447195529938, | |
| "learning_rate": 2.345e-05, | |
| "loss": 0.002, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": 0.020661164075136185, | |
| "learning_rate": 2.3400000000000003e-05, | |
| "loss": 0.0018, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.533, | |
| "grad_norm": 0.01472094189375639, | |
| "learning_rate": 2.3350000000000002e-05, | |
| "loss": 0.0022, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.534, | |
| "grad_norm": 0.014501375146210194, | |
| "learning_rate": 2.3300000000000004e-05, | |
| "loss": 0.0017, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.535, | |
| "grad_norm": 0.01241264771670103, | |
| "learning_rate": 2.3250000000000003e-05, | |
| "loss": 0.0015, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.015589526854455471, | |
| "learning_rate": 2.32e-05, | |
| "loss": 0.0018, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.537, | |
| "grad_norm": 0.013468182645738125, | |
| "learning_rate": 2.3150000000000004e-05, | |
| "loss": 0.0018, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.538, | |
| "grad_norm": 0.015258733183145523, | |
| "learning_rate": 2.3100000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.539, | |
| "grad_norm": 0.010932616889476776, | |
| "learning_rate": 2.305e-05, | |
| "loss": 0.0014, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.0102313794195652, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 0.0014, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.541, | |
| "grad_norm": 0.00674120569601655, | |
| "learning_rate": 2.2950000000000002e-05, | |
| "loss": 0.0014, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.542, | |
| "grad_norm": 0.015179513022303581, | |
| "learning_rate": 2.29e-05, | |
| "loss": 0.0014, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.543, | |
| "grad_norm": 0.03448422998189926, | |
| "learning_rate": 2.2850000000000003e-05, | |
| "loss": 0.0019, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.028603358194231987, | |
| "learning_rate": 2.2800000000000002e-05, | |
| "loss": 0.0019, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.545, | |
| "grad_norm": 0.014372209087014198, | |
| "learning_rate": 2.275e-05, | |
| "loss": 0.0016, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.546, | |
| "grad_norm": 0.031532082706689835, | |
| "learning_rate": 2.2700000000000003e-05, | |
| "loss": 0.0017, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.547, | |
| "grad_norm": 0.018091056495904922, | |
| "learning_rate": 2.265e-05, | |
| "loss": 0.0016, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": 0.014843069948256016, | |
| "learning_rate": 2.26e-05, | |
| "loss": 0.0015, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.549, | |
| "grad_norm": 0.011632148176431656, | |
| "learning_rate": 2.2550000000000003e-05, | |
| "loss": 0.0014, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.009511668235063553, | |
| "learning_rate": 2.25e-05, | |
| "loss": 0.0014, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.551, | |
| "grad_norm": 0.007981637492775917, | |
| "learning_rate": 2.245e-05, | |
| "loss": 0.0014, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.021288806572556496, | |
| "learning_rate": 2.2400000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.553, | |
| "grad_norm": 0.01468642894178629, | |
| "learning_rate": 2.235e-05, | |
| "loss": 0.0018, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.554, | |
| "grad_norm": 0.011532713659107685, | |
| "learning_rate": 2.23e-05, | |
| "loss": 0.0012, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.555, | |
| "grad_norm": 0.00889046210795641, | |
| "learning_rate": 2.2250000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": 0.01401284895837307, | |
| "learning_rate": 2.22e-05, | |
| "loss": 0.0014, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.557, | |
| "grad_norm": 0.012369327247142792, | |
| "learning_rate": 2.215e-05, | |
| "loss": 0.0015, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.558, | |
| "grad_norm": 0.015258446335792542, | |
| "learning_rate": 2.2100000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.559, | |
| "grad_norm": 0.009015046060085297, | |
| "learning_rate": 2.205e-05, | |
| "loss": 0.0012, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.011163819581270218, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.0012, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.561, | |
| "grad_norm": 0.016389524564146996, | |
| "learning_rate": 2.195e-05, | |
| "loss": 0.0016, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.562, | |
| "grad_norm": 0.01325678639113903, | |
| "learning_rate": 2.19e-05, | |
| "loss": 0.0013, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.563, | |
| "grad_norm": 0.017966121435165405, | |
| "learning_rate": 2.1850000000000003e-05, | |
| "loss": 0.0014, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.564, | |
| "grad_norm": 0.012039076536893845, | |
| "learning_rate": 2.18e-05, | |
| "loss": 0.0013, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.565, | |
| "grad_norm": 0.006665175314992666, | |
| "learning_rate": 2.175e-05, | |
| "loss": 0.0012, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.566, | |
| "grad_norm": 0.0105441864579916, | |
| "learning_rate": 2.1700000000000002e-05, | |
| "loss": 0.0014, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.567, | |
| "grad_norm": 0.007554101757705212, | |
| "learning_rate": 2.165e-05, | |
| "loss": 0.0011, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 0.009823901578783989, | |
| "learning_rate": 2.16e-05, | |
| "loss": 0.0013, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.569, | |
| "grad_norm": 0.01720455475151539, | |
| "learning_rate": 2.1550000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.01107338909059763, | |
| "learning_rate": 2.15e-05, | |
| "loss": 0.0012, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.571, | |
| "grad_norm": 0.01756761223077774, | |
| "learning_rate": 2.145e-05, | |
| "loss": 0.0014, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.572, | |
| "grad_norm": 0.022118983790278435, | |
| "learning_rate": 2.1400000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.573, | |
| "grad_norm": 0.01616830937564373, | |
| "learning_rate": 2.135e-05, | |
| "loss": 0.0014, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.574, | |
| "grad_norm": 0.020481310784816742, | |
| "learning_rate": 2.13e-05, | |
| "loss": 0.0023, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.575, | |
| "grad_norm": 0.018176857382059097, | |
| "learning_rate": 2.125e-05, | |
| "loss": 0.0015, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.011317101307213306, | |
| "learning_rate": 2.12e-05, | |
| "loss": 0.0012, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.577, | |
| "grad_norm": 0.028791502118110657, | |
| "learning_rate": 2.115e-05, | |
| "loss": 0.0014, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.578, | |
| "grad_norm": 0.013037024065852165, | |
| "learning_rate": 2.11e-05, | |
| "loss": 0.0013, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.579, | |
| "grad_norm": 0.021426070481538773, | |
| "learning_rate": 2.105e-05, | |
| "loss": 0.0015, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.012033521197736263, | |
| "learning_rate": 2.1e-05, | |
| "loss": 0.0011, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.581, | |
| "grad_norm": 0.014337443746626377, | |
| "learning_rate": 2.095e-05, | |
| "loss": 0.0012, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.582, | |
| "grad_norm": 0.008603113703429699, | |
| "learning_rate": 2.09e-05, | |
| "loss": 0.0011, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.583, | |
| "grad_norm": 0.025418557226657867, | |
| "learning_rate": 2.085e-05, | |
| "loss": 0.0014, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 0.008621426299214363, | |
| "learning_rate": 2.08e-05, | |
| "loss": 0.0011, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.585, | |
| "grad_norm": 0.009969389997422695, | |
| "learning_rate": 2.075e-05, | |
| "loss": 0.0015, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.586, | |
| "grad_norm": 0.00997992418706417, | |
| "learning_rate": 2.07e-05, | |
| "loss": 0.0011, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.587, | |
| "grad_norm": 0.019949181005358696, | |
| "learning_rate": 2.065e-05, | |
| "loss": 0.001, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.588, | |
| "grad_norm": 0.009619793854653835, | |
| "learning_rate": 2.06e-05, | |
| "loss": 0.0011, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.589, | |
| "grad_norm": 0.007747489493340254, | |
| "learning_rate": 2.055e-05, | |
| "loss": 0.0012, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.01052554789930582, | |
| "learning_rate": 2.05e-05, | |
| "loss": 0.0014, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.591, | |
| "grad_norm": 0.014904200099408627, | |
| "learning_rate": 2.045e-05, | |
| "loss": 0.0012, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.00679561635479331, | |
| "learning_rate": 2.04e-05, | |
| "loss": 0.0011, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.593, | |
| "grad_norm": 0.006072670221328735, | |
| "learning_rate": 2.035e-05, | |
| "loss": 0.0011, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.594, | |
| "grad_norm": 0.014733157120645046, | |
| "learning_rate": 2.0300000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.595, | |
| "grad_norm": 0.015511419624090195, | |
| "learning_rate": 2.025e-05, | |
| "loss": 0.0016, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.596, | |
| "grad_norm": 0.010620438493788242, | |
| "learning_rate": 2.0200000000000003e-05, | |
| "loss": 0.0012, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.597, | |
| "grad_norm": 0.0075794099830091, | |
| "learning_rate": 2.0150000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.598, | |
| "grad_norm": 0.007882976904511452, | |
| "learning_rate": 2.01e-05, | |
| "loss": 0.0011, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.599, | |
| "grad_norm": 0.011548763141036034, | |
| "learning_rate": 2.0050000000000003e-05, | |
| "loss": 0.0013, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.0084703853353858, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0011, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.601, | |
| "grad_norm": 0.007603704463690519, | |
| "learning_rate": 1.995e-05, | |
| "loss": 0.001, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.602, | |
| "grad_norm": 0.008562711998820305, | |
| "learning_rate": 1.9900000000000003e-05, | |
| "loss": 0.0012, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.603, | |
| "grad_norm": 0.007590813562273979, | |
| "learning_rate": 1.985e-05, | |
| "loss": 0.001, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.604, | |
| "grad_norm": 0.020342741161584854, | |
| "learning_rate": 1.9800000000000004e-05, | |
| "loss": 0.0017, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.605, | |
| "grad_norm": 0.16912633180618286, | |
| "learning_rate": 1.9750000000000002e-05, | |
| "loss": 0.0089, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.606, | |
| "grad_norm": 0.08793429285287857, | |
| "learning_rate": 1.97e-05, | |
| "loss": 0.0027, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.607, | |
| "grad_norm": 0.05196760594844818, | |
| "learning_rate": 1.9650000000000003e-05, | |
| "loss": 0.0022, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.02118327096104622, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 0.0021, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.609, | |
| "grad_norm": 0.013289586640894413, | |
| "learning_rate": 1.955e-05, | |
| "loss": 0.0013, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.012911707162857056, | |
| "learning_rate": 1.9500000000000003e-05, | |
| "loss": 0.0013, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.611, | |
| "grad_norm": 0.018663186579942703, | |
| "learning_rate": 1.9450000000000002e-05, | |
| "loss": 0.0012, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.612, | |
| "grad_norm": 0.010551884770393372, | |
| "learning_rate": 1.94e-05, | |
| "loss": 0.0012, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.613, | |
| "grad_norm": 0.015853077173233032, | |
| "learning_rate": 1.9350000000000003e-05, | |
| "loss": 0.0013, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.614, | |
| "grad_norm": 0.020374910905957222, | |
| "learning_rate": 1.93e-05, | |
| "loss": 0.001, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.615, | |
| "grad_norm": 0.015159848146140575, | |
| "learning_rate": 1.925e-05, | |
| "loss": 0.0013, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 0.007991676218807697, | |
| "learning_rate": 1.9200000000000003e-05, | |
| "loss": 0.0013, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.617, | |
| "grad_norm": 0.007849587127566338, | |
| "learning_rate": 1.915e-05, | |
| "loss": 0.0011, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.618, | |
| "grad_norm": 0.022048622369766235, | |
| "learning_rate": 1.91e-05, | |
| "loss": 0.001, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.619, | |
| "grad_norm": 0.021215343847870827, | |
| "learning_rate": 1.9050000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.012288344092667103, | |
| "learning_rate": 1.9e-05, | |
| "loss": 0.0012, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.621, | |
| "grad_norm": 0.020313331857323647, | |
| "learning_rate": 1.895e-05, | |
| "loss": 0.0011, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.622, | |
| "grad_norm": 0.008762447163462639, | |
| "learning_rate": 1.8900000000000002e-05, | |
| "loss": 0.001, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.623, | |
| "grad_norm": 0.0247616209089756, | |
| "learning_rate": 1.885e-05, | |
| "loss": 0.0011, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.09021363407373428, | |
| "learning_rate": 1.88e-05, | |
| "loss": 0.0016, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.625, | |
| "grad_norm": 0.017945896834135056, | |
| "learning_rate": 1.8750000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.626, | |
| "grad_norm": 0.011303462088108063, | |
| "learning_rate": 1.87e-05, | |
| "loss": 0.0011, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.627, | |
| "grad_norm": 0.008381664752960205, | |
| "learning_rate": 1.865e-05, | |
| "loss": 0.0011, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.628, | |
| "grad_norm": 0.011003987863659859, | |
| "learning_rate": 1.86e-05, | |
| "loss": 0.0012, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.629, | |
| "grad_norm": 0.015965888276696205, | |
| "learning_rate": 1.855e-05, | |
| "loss": 0.001, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.006507181562483311, | |
| "learning_rate": 1.85e-05, | |
| "loss": 0.0009, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.631, | |
| "grad_norm": 0.015577591024339199, | |
| "learning_rate": 1.845e-05, | |
| "loss": 0.001, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 0.006741558667272329, | |
| "learning_rate": 1.84e-05, | |
| "loss": 0.0011, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.633, | |
| "grad_norm": 0.016030525788664818, | |
| "learning_rate": 1.8350000000000002e-05, | |
| "loss": 0.001, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.634, | |
| "grad_norm": 0.010763168334960938, | |
| "learning_rate": 1.83e-05, | |
| "loss": 0.0011, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.635, | |
| "grad_norm": 0.017273874953389168, | |
| "learning_rate": 1.825e-05, | |
| "loss": 0.001, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.636, | |
| "grad_norm": 0.010964670218527317, | |
| "learning_rate": 1.8200000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.637, | |
| "grad_norm": 0.00803497713059187, | |
| "learning_rate": 1.815e-05, | |
| "loss": 0.0009, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.638, | |
| "grad_norm": 0.007479315157979727, | |
| "learning_rate": 1.81e-05, | |
| "loss": 0.0014, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.639, | |
| "grad_norm": 0.010598058812320232, | |
| "learning_rate": 1.805e-05, | |
| "loss": 0.001, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.009770036675035954, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.0009, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.641, | |
| "grad_norm": 0.011602561920881271, | |
| "learning_rate": 1.795e-05, | |
| "loss": 0.0008, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.642, | |
| "grad_norm": 0.0076597342267632484, | |
| "learning_rate": 1.79e-05, | |
| "loss": 0.0008, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.643, | |
| "grad_norm": 0.012248953804373741, | |
| "learning_rate": 1.785e-05, | |
| "loss": 0.0008, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.644, | |
| "grad_norm": 0.005626557394862175, | |
| "learning_rate": 1.78e-05, | |
| "loss": 0.0008, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.645, | |
| "grad_norm": 0.005482000298798084, | |
| "learning_rate": 1.775e-05, | |
| "loss": 0.0008, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.646, | |
| "grad_norm": 0.007456011138856411, | |
| "learning_rate": 1.77e-05, | |
| "loss": 0.0008, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.647, | |
| "grad_norm": 0.008909308351576328, | |
| "learning_rate": 1.765e-05, | |
| "loss": 0.0008, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 0.011135280132293701, | |
| "learning_rate": 1.76e-05, | |
| "loss": 0.0009, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.649, | |
| "grad_norm": 0.01595783233642578, | |
| "learning_rate": 1.755e-05, | |
| "loss": 0.001, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.013902807608246803, | |
| "learning_rate": 1.75e-05, | |
| "loss": 0.0011, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.651, | |
| "grad_norm": 0.010244622826576233, | |
| "learning_rate": 1.745e-05, | |
| "loss": 0.0009, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.652, | |
| "grad_norm": 0.007476091384887695, | |
| "learning_rate": 1.74e-05, | |
| "loss": 0.0009, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.653, | |
| "grad_norm": 0.013044660910964012, | |
| "learning_rate": 1.7349999999999998e-05, | |
| "loss": 0.0009, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.654, | |
| "grad_norm": 0.004804369527846575, | |
| "learning_rate": 1.73e-05, | |
| "loss": 0.0009, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.655, | |
| "grad_norm": 0.006042002234607935, | |
| "learning_rate": 1.725e-05, | |
| "loss": 0.0008, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.010785943828523159, | |
| "learning_rate": 1.7199999999999998e-05, | |
| "loss": 0.0009, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.657, | |
| "grad_norm": 0.011350172571837902, | |
| "learning_rate": 1.7150000000000004e-05, | |
| "loss": 0.0008, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.658, | |
| "grad_norm": 0.007638021372258663, | |
| "learning_rate": 1.7100000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.659, | |
| "grad_norm": 0.005735939834266901, | |
| "learning_rate": 1.705e-05, | |
| "loss": 0.0009, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.02717960625886917, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 0.0011, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.661, | |
| "grad_norm": 0.006012643221765757, | |
| "learning_rate": 1.6950000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.662, | |
| "grad_norm": 0.00599683728069067, | |
| "learning_rate": 1.69e-05, | |
| "loss": 0.0008, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.663, | |
| "grad_norm": 0.026952974498271942, | |
| "learning_rate": 1.6850000000000003e-05, | |
| "loss": 0.0008, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 0.008171536959707737, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.665, | |
| "grad_norm": 0.007446442265063524, | |
| "learning_rate": 1.675e-05, | |
| "loss": 0.0009, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.666, | |
| "grad_norm": 0.006456063129007816, | |
| "learning_rate": 1.6700000000000003e-05, | |
| "loss": 0.0008, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.667, | |
| "grad_norm": 0.008162173442542553, | |
| "learning_rate": 1.665e-05, | |
| "loss": 0.0007, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 0.668, | |
| "grad_norm": 0.004432919900864363, | |
| "learning_rate": 1.66e-05, | |
| "loss": 0.0008, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.669, | |
| "grad_norm": 0.007158307824283838, | |
| "learning_rate": 1.6550000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.003983801696449518, | |
| "learning_rate": 1.65e-05, | |
| "loss": 0.0007, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.671, | |
| "grad_norm": 0.005170087795704603, | |
| "learning_rate": 1.645e-05, | |
| "loss": 0.0008, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.004729804117232561, | |
| "learning_rate": 1.6400000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.673, | |
| "grad_norm": 0.010037174448370934, | |
| "learning_rate": 1.635e-05, | |
| "loss": 0.001, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 0.674, | |
| "grad_norm": 0.050949569791555405, | |
| "learning_rate": 1.63e-05, | |
| "loss": 0.0023, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 0.675, | |
| "grad_norm": 0.0323474146425724, | |
| "learning_rate": 1.6250000000000002e-05, | |
| "loss": 0.0017, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.676, | |
| "grad_norm": 0.027231359854340553, | |
| "learning_rate": 1.62e-05, | |
| "loss": 0.0021, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 0.677, | |
| "grad_norm": 0.01555855292826891, | |
| "learning_rate": 1.6150000000000003e-05, | |
| "loss": 0.0013, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 0.678, | |
| "grad_norm": 0.01804298162460327, | |
| "learning_rate": 1.6100000000000002e-05, | |
| "loss": 0.0011, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 0.679, | |
| "grad_norm": 0.011248771101236343, | |
| "learning_rate": 1.605e-05, | |
| "loss": 0.0011, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.007389044389128685, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.0009, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.681, | |
| "grad_norm": 0.014606145210564137, | |
| "learning_rate": 1.595e-05, | |
| "loss": 0.0012, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 0.682, | |
| "grad_norm": 0.012476052157580853, | |
| "learning_rate": 1.59e-05, | |
| "loss": 0.0009, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 0.683, | |
| "grad_norm": 0.009272475726902485, | |
| "learning_rate": 1.5850000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 0.684, | |
| "grad_norm": 0.011705187149345875, | |
| "learning_rate": 1.58e-05, | |
| "loss": 0.0009, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 0.685, | |
| "grad_norm": 0.01874556578695774, | |
| "learning_rate": 1.575e-05, | |
| "loss": 0.0011, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.686, | |
| "grad_norm": 0.01463324110955, | |
| "learning_rate": 1.5700000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 0.687, | |
| "grad_norm": 0.012001392431557178, | |
| "learning_rate": 1.565e-05, | |
| "loss": 0.001, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.009366356767714024, | |
| "learning_rate": 1.56e-05, | |
| "loss": 0.0008, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 0.689, | |
| "grad_norm": 0.010064000263810158, | |
| "learning_rate": 1.5550000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.016703909263014793, | |
| "learning_rate": 1.55e-05, | |
| "loss": 0.0009, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.691, | |
| "grad_norm": 0.0146669652312994, | |
| "learning_rate": 1.545e-05, | |
| "loss": 0.001, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 0.692, | |
| "grad_norm": 0.006643705535680056, | |
| "learning_rate": 1.54e-05, | |
| "loss": 0.0009, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 0.693, | |
| "grad_norm": 0.011501871049404144, | |
| "learning_rate": 1.535e-05, | |
| "loss": 0.0008, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 0.694, | |
| "grad_norm": 0.008170065470039845, | |
| "learning_rate": 1.53e-05, | |
| "loss": 0.0008, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 0.695, | |
| "grad_norm": 0.00737554719671607, | |
| "learning_rate": 1.525e-05, | |
| "loss": 0.0007, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 0.006846282631158829, | |
| "learning_rate": 1.52e-05, | |
| "loss": 0.0009, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 0.697, | |
| "grad_norm": 0.007784941233694553, | |
| "learning_rate": 1.515e-05, | |
| "loss": 0.0008, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 0.698, | |
| "grad_norm": 0.009864069521427155, | |
| "learning_rate": 1.51e-05, | |
| "loss": 0.0008, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 0.699, | |
| "grad_norm": 0.007372863125056028, | |
| "learning_rate": 1.505e-05, | |
| "loss": 0.0009, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.006507135462015867, | |
| "learning_rate": 1.5e-05, | |
| "loss": 0.0008, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.701, | |
| "grad_norm": 0.03093353845179081, | |
| "learning_rate": 1.4950000000000001e-05, | |
| "loss": 0.0014, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 0.702, | |
| "grad_norm": 0.01417300570756197, | |
| "learning_rate": 1.49e-05, | |
| "loss": 0.001, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 0.703, | |
| "grad_norm": 0.010836401022970676, | |
| "learning_rate": 1.485e-05, | |
| "loss": 0.0012, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.01000068336725235, | |
| "learning_rate": 1.48e-05, | |
| "loss": 0.001, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.705, | |
| "grad_norm": 0.008654952049255371, | |
| "learning_rate": 1.475e-05, | |
| "loss": 0.0009, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.706, | |
| "grad_norm": 0.010761331766843796, | |
| "learning_rate": 1.47e-05, | |
| "loss": 0.001, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 0.707, | |
| "grad_norm": 0.006188638508319855, | |
| "learning_rate": 1.465e-05, | |
| "loss": 0.0008, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": 0.007858789525926113, | |
| "learning_rate": 1.4599999999999999e-05, | |
| "loss": 0.0008, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 0.709, | |
| "grad_norm": 0.02773350477218628, | |
| "learning_rate": 1.455e-05, | |
| "loss": 0.0014, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.012381108477711678, | |
| "learning_rate": 1.45e-05, | |
| "loss": 0.0009, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.711, | |
| "grad_norm": 0.009256324730813503, | |
| "learning_rate": 1.4449999999999999e-05, | |
| "loss": 0.0008, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.007005748804658651, | |
| "learning_rate": 1.44e-05, | |
| "loss": 0.0009, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 0.713, | |
| "grad_norm": 0.0055755749344825745, | |
| "learning_rate": 1.435e-05, | |
| "loss": 0.0007, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 0.714, | |
| "grad_norm": 0.003967254888266325, | |
| "learning_rate": 1.43e-05, | |
| "loss": 0.0008, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 0.715, | |
| "grad_norm": 0.0079165268689394, | |
| "learning_rate": 1.4249999999999999e-05, | |
| "loss": 0.0011, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": 0.004682580940425396, | |
| "learning_rate": 1.42e-05, | |
| "loss": 0.0007, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 0.717, | |
| "grad_norm": 0.008578700013458729, | |
| "learning_rate": 1.415e-05, | |
| "loss": 0.0011, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 0.718, | |
| "grad_norm": 0.006943961605429649, | |
| "learning_rate": 1.4099999999999999e-05, | |
| "loss": 0.0009, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 0.719, | |
| "grad_norm": 0.0072656250558793545, | |
| "learning_rate": 1.4050000000000003e-05, | |
| "loss": 0.0007, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.005639955401420593, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 0.0007, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.721, | |
| "grad_norm": 0.005733838304877281, | |
| "learning_rate": 1.3950000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 0.722, | |
| "grad_norm": 0.02654002234339714, | |
| "learning_rate": 1.3900000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 0.723, | |
| "grad_norm": 0.007308628410100937, | |
| "learning_rate": 1.3850000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": 0.006939894054085016, | |
| "learning_rate": 1.3800000000000002e-05, | |
| "loss": 0.0007, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 0.725, | |
| "grad_norm": 0.03964811936020851, | |
| "learning_rate": 1.3750000000000002e-05, | |
| "loss": 0.0013, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.726, | |
| "grad_norm": 0.014138396829366684, | |
| "learning_rate": 1.3700000000000001e-05, | |
| "loss": 0.001, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.727, | |
| "grad_norm": 0.008445181883871555, | |
| "learning_rate": 1.3650000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 0.01134855579584837, | |
| "learning_rate": 1.3600000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.729, | |
| "grad_norm": 0.010982022620737553, | |
| "learning_rate": 1.3550000000000002e-05, | |
| "loss": 0.0015, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.011698734015226364, | |
| "learning_rate": 1.3500000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.731, | |
| "grad_norm": 0.006420729216188192, | |
| "learning_rate": 1.3450000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": 0.006088167428970337, | |
| "learning_rate": 1.3400000000000002e-05, | |
| "loss": 0.0008, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.733, | |
| "grad_norm": 0.0071141645312309265, | |
| "learning_rate": 1.3350000000000001e-05, | |
| "loss": 0.0012, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 0.734, | |
| "grad_norm": 0.004975921008735895, | |
| "learning_rate": 1.3300000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.735, | |
| "grad_norm": 0.004499469883739948, | |
| "learning_rate": 1.3250000000000002e-05, | |
| "loss": 0.0007, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.009738982655107975, | |
| "learning_rate": 1.32e-05, | |
| "loss": 0.001, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.737, | |
| "grad_norm": 0.006863337475806475, | |
| "learning_rate": 1.3150000000000001e-05, | |
| "loss": 0.001, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 0.738, | |
| "grad_norm": 0.008216536603868008, | |
| "learning_rate": 1.3100000000000002e-05, | |
| "loss": 0.0007, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.739, | |
| "grad_norm": 0.006803369149565697, | |
| "learning_rate": 1.305e-05, | |
| "loss": 0.0008, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.00551017839461565, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.741, | |
| "grad_norm": 0.009463651105761528, | |
| "learning_rate": 1.2950000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 0.742, | |
| "grad_norm": 0.01233983039855957, | |
| "learning_rate": 1.29e-05, | |
| "loss": 0.0019, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.743, | |
| "grad_norm": 0.008470877073705196, | |
| "learning_rate": 1.285e-05, | |
| "loss": 0.0009, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 0.007592742796987295, | |
| "learning_rate": 1.2800000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.745, | |
| "grad_norm": 0.03596987947821617, | |
| "learning_rate": 1.2750000000000002e-05, | |
| "loss": 0.001, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.746, | |
| "grad_norm": 0.005849502049386501, | |
| "learning_rate": 1.27e-05, | |
| "loss": 0.0008, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.747, | |
| "grad_norm": 0.009035659022629261, | |
| "learning_rate": 1.2650000000000001e-05, | |
| "loss": 0.0007, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": 0.010397679172456264, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 0.0014, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.749, | |
| "grad_norm": 0.014514378271996975, | |
| "learning_rate": 1.255e-05, | |
| "loss": 0.0008, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.004837281536310911, | |
| "learning_rate": 1.25e-05, | |
| "loss": 0.0006, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.751, | |
| "grad_norm": 0.007720770314335823, | |
| "learning_rate": 1.2450000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.012046804651618004, | |
| "learning_rate": 1.24e-05, | |
| "loss": 0.0011, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.753, | |
| "grad_norm": 0.01343387458473444, | |
| "learning_rate": 1.235e-05, | |
| "loss": 0.0007, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 0.754, | |
| "grad_norm": 0.00810600072145462, | |
| "learning_rate": 1.23e-05, | |
| "loss": 0.0007, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.755, | |
| "grad_norm": 0.00925883837044239, | |
| "learning_rate": 1.225e-05, | |
| "loss": 0.0006, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": 0.01927885413169861, | |
| "learning_rate": 1.22e-05, | |
| "loss": 0.0014, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.757, | |
| "grad_norm": 0.010129665955901146, | |
| "learning_rate": 1.215e-05, | |
| "loss": 0.0006, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 0.758, | |
| "grad_norm": 0.007863885723054409, | |
| "learning_rate": 1.2100000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.759, | |
| "grad_norm": 0.005500464700162411, | |
| "learning_rate": 1.205e-05, | |
| "loss": 0.0007, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.0040563903748989105, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.0006, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.761, | |
| "grad_norm": 0.006361998151987791, | |
| "learning_rate": 1.195e-05, | |
| "loss": 0.0007, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 0.762, | |
| "grad_norm": 0.0136310625821352, | |
| "learning_rate": 1.19e-05, | |
| "loss": 0.0008, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.763, | |
| "grad_norm": 0.005384715739637613, | |
| "learning_rate": 1.185e-05, | |
| "loss": 0.0007, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": 0.014707676135003567, | |
| "learning_rate": 1.18e-05, | |
| "loss": 0.0007, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.765, | |
| "grad_norm": 0.008092684671282768, | |
| "learning_rate": 1.175e-05, | |
| "loss": 0.0006, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.766, | |
| "grad_norm": 0.007185132242739201, | |
| "learning_rate": 1.1700000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.767, | |
| "grad_norm": 0.005672789178788662, | |
| "learning_rate": 1.1650000000000002e-05, | |
| "loss": 0.0006, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.05434956029057503, | |
| "learning_rate": 1.16e-05, | |
| "loss": 0.001, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 0.769, | |
| "grad_norm": 0.00933472067117691, | |
| "learning_rate": 1.1550000000000001e-05, | |
| "loss": 0.0007, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.008684621192514896, | |
| "learning_rate": 1.1500000000000002e-05, | |
| "loss": 0.0006, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.771, | |
| "grad_norm": 0.03054739721119404, | |
| "learning_rate": 1.145e-05, | |
| "loss": 0.0006, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": 0.005998207256197929, | |
| "learning_rate": 1.1400000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 0.773, | |
| "grad_norm": 0.006153833121061325, | |
| "learning_rate": 1.1350000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 0.774, | |
| "grad_norm": 0.007491481024771929, | |
| "learning_rate": 1.13e-05, | |
| "loss": 0.0007, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 0.775, | |
| "grad_norm": 0.01078925933688879, | |
| "learning_rate": 1.125e-05, | |
| "loss": 0.0006, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 0.005885554943233728, | |
| "learning_rate": 1.1200000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 0.777, | |
| "grad_norm": 0.005423078313469887, | |
| "learning_rate": 1.115e-05, | |
| "loss": 0.0007, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 0.778, | |
| "grad_norm": 0.008044522255659103, | |
| "learning_rate": 1.11e-05, | |
| "loss": 0.0006, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 0.779, | |
| "grad_norm": 0.00733207818120718, | |
| "learning_rate": 1.1050000000000001e-05, | |
| "loss": 0.0007, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.0066906120628118515, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 0.0009, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.781, | |
| "grad_norm": 0.004443836398422718, | |
| "learning_rate": 1.095e-05, | |
| "loss": 0.0006, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 0.782, | |
| "grad_norm": 0.0058379145339131355, | |
| "learning_rate": 1.09e-05, | |
| "loss": 0.0007, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 0.783, | |
| "grad_norm": 0.006808693055063486, | |
| "learning_rate": 1.0850000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.008773542940616608, | |
| "learning_rate": 1.08e-05, | |
| "loss": 0.0006, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 0.785, | |
| "grad_norm": 0.006700740661472082, | |
| "learning_rate": 1.075e-05, | |
| "loss": 0.0006, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.786, | |
| "grad_norm": 0.00906393863260746, | |
| "learning_rate": 1.0700000000000001e-05, | |
| "loss": 0.0006, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 0.787, | |
| "grad_norm": 0.0030822190456092358, | |
| "learning_rate": 1.065e-05, | |
| "loss": 0.0005, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": 0.0029632148798555136, | |
| "learning_rate": 1.06e-05, | |
| "loss": 0.0005, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 0.789, | |
| "grad_norm": 0.004798842128366232, | |
| "learning_rate": 1.055e-05, | |
| "loss": 0.0006, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.007376812864094973, | |
| "learning_rate": 1.05e-05, | |
| "loss": 0.0005, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.791, | |
| "grad_norm": 0.009337624534964561, | |
| "learning_rate": 1.045e-05, | |
| "loss": 0.0009, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 0.012847904115915298, | |
| "learning_rate": 1.04e-05, | |
| "loss": 0.0008, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 0.793, | |
| "grad_norm": 0.005587203428149223, | |
| "learning_rate": 1.035e-05, | |
| "loss": 0.0006, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 0.794, | |
| "grad_norm": 0.008464600890874863, | |
| "learning_rate": 1.03e-05, | |
| "loss": 0.0006, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 0.795, | |
| "grad_norm": 0.2516852617263794, | |
| "learning_rate": 1.025e-05, | |
| "loss": 0.002, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": 0.04664693772792816, | |
| "learning_rate": 1.02e-05, | |
| "loss": 0.002, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 0.797, | |
| "grad_norm": 0.02456306852400303, | |
| "learning_rate": 1.0150000000000001e-05, | |
| "loss": 0.0013, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 0.798, | |
| "grad_norm": 0.011320951394736767, | |
| "learning_rate": 1.0100000000000002e-05, | |
| "loss": 0.0009, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 0.799, | |
| "grad_norm": 0.01860683411359787, | |
| "learning_rate": 1.005e-05, | |
| "loss": 0.0012, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.03227970749139786, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.801, | |
| "grad_norm": 0.015873363241553307, | |
| "learning_rate": 9.950000000000001e-06, | |
| "loss": 0.0008, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 0.802, | |
| "grad_norm": 0.005454899277538061, | |
| "learning_rate": 9.900000000000002e-06, | |
| "loss": 0.0008, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 0.803, | |
| "grad_norm": 0.007948348298668861, | |
| "learning_rate": 9.85e-06, | |
| "loss": 0.0007, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": 0.013328757137060165, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 0.805, | |
| "grad_norm": 0.01018743496388197, | |
| "learning_rate": 9.750000000000002e-06, | |
| "loss": 0.0012, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.806, | |
| "grad_norm": 0.009421809576451778, | |
| "learning_rate": 9.7e-06, | |
| "loss": 0.0008, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 0.807, | |
| "grad_norm": 0.005202045664191246, | |
| "learning_rate": 9.65e-06, | |
| "loss": 0.0007, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 0.012956002727150917, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 0.0007, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 0.809, | |
| "grad_norm": 0.006403383333235979, | |
| "learning_rate": 9.55e-06, | |
| "loss": 0.0007, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.027560915797948837, | |
| "learning_rate": 9.5e-06, | |
| "loss": 0.0008, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.811, | |
| "grad_norm": 0.005196988116949797, | |
| "learning_rate": 9.450000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": 0.009510821662843227, | |
| "learning_rate": 9.4e-06, | |
| "loss": 0.0006, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 0.813, | |
| "grad_norm": 0.006430651992559433, | |
| "learning_rate": 9.35e-06, | |
| "loss": 0.0006, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 0.814, | |
| "grad_norm": 0.019426727667450905, | |
| "learning_rate": 9.3e-06, | |
| "loss": 0.0009, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 0.815, | |
| "grad_norm": 0.011564865708351135, | |
| "learning_rate": 9.25e-06, | |
| "loss": 0.0006, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.009036659263074398, | |
| "learning_rate": 9.2e-06, | |
| "loss": 0.0008, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 0.817, | |
| "grad_norm": 0.006685588974505663, | |
| "learning_rate": 9.15e-06, | |
| "loss": 0.0007, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 0.818, | |
| "grad_norm": 0.005980687215924263, | |
| "learning_rate": 9.100000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 0.819, | |
| "grad_norm": 0.0029402158688753843, | |
| "learning_rate": 9.05e-06, | |
| "loss": 0.0005, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.0034720194526016712, | |
| "learning_rate": 9e-06, | |
| "loss": 0.0006, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.821, | |
| "grad_norm": 0.008967465721070766, | |
| "learning_rate": 8.95e-06, | |
| "loss": 0.0009, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 0.822, | |
| "grad_norm": 0.007418784312903881, | |
| "learning_rate": 8.9e-06, | |
| "loss": 0.0007, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 0.823, | |
| "grad_norm": 0.0077253603376448154, | |
| "learning_rate": 8.85e-06, | |
| "loss": 0.0006, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 0.011202674359083176, | |
| "learning_rate": 8.8e-06, | |
| "loss": 0.0013, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 0.825, | |
| "grad_norm": 0.022354573011398315, | |
| "learning_rate": 8.75e-06, | |
| "loss": 0.0008, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.826, | |
| "grad_norm": 0.01750505343079567, | |
| "learning_rate": 8.7e-06, | |
| "loss": 0.0013, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 0.827, | |
| "grad_norm": 0.01153852604329586, | |
| "learning_rate": 8.65e-06, | |
| "loss": 0.0009, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": 0.008752427063882351, | |
| "learning_rate": 8.599999999999999e-06, | |
| "loss": 0.0006, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 0.829, | |
| "grad_norm": 0.007307702675461769, | |
| "learning_rate": 8.550000000000001e-06, | |
| "loss": 0.0007, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.0077101094648242, | |
| "learning_rate": 8.500000000000002e-06, | |
| "loss": 0.0006, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.831, | |
| "grad_norm": 0.006358897779136896, | |
| "learning_rate": 8.45e-06, | |
| "loss": 0.0005, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.003663134528324008, | |
| "learning_rate": 8.400000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 0.833, | |
| "grad_norm": 0.005117372144013643, | |
| "learning_rate": 8.350000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 0.834, | |
| "grad_norm": 0.004245636984705925, | |
| "learning_rate": 8.3e-06, | |
| "loss": 0.0005, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 0.835, | |
| "grad_norm": 0.005357146263122559, | |
| "learning_rate": 8.25e-06, | |
| "loss": 0.0006, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": 0.01055213250219822, | |
| "learning_rate": 8.200000000000001e-06, | |
| "loss": 0.0008, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 0.837, | |
| "grad_norm": 0.01871907152235508, | |
| "learning_rate": 8.15e-06, | |
| "loss": 0.0007, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 0.838, | |
| "grad_norm": 0.013110162690281868, | |
| "learning_rate": 8.1e-06, | |
| "loss": 0.0005, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 0.839, | |
| "grad_norm": 0.005271353758871555, | |
| "learning_rate": 8.050000000000001e-06, | |
| "loss": 0.0007, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.004324494861066341, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.841, | |
| "grad_norm": 0.0031851409003138542, | |
| "learning_rate": 7.95e-06, | |
| "loss": 0.0006, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 0.842, | |
| "grad_norm": 0.009736557491123676, | |
| "learning_rate": 7.9e-06, | |
| "loss": 0.0006, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 0.843, | |
| "grad_norm": 0.005168536212295294, | |
| "learning_rate": 7.850000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": 0.002579685300588608, | |
| "learning_rate": 7.8e-06, | |
| "loss": 0.0005, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 0.845, | |
| "grad_norm": 0.008710252121090889, | |
| "learning_rate": 7.75e-06, | |
| "loss": 0.0005, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.846, | |
| "grad_norm": 0.004952189512550831, | |
| "learning_rate": 7.7e-06, | |
| "loss": 0.0008, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 0.847, | |
| "grad_norm": 0.003375423140823841, | |
| "learning_rate": 7.65e-06, | |
| "loss": 0.0005, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.13184253871440887, | |
| "learning_rate": 7.6e-06, | |
| "loss": 0.0012, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 0.849, | |
| "grad_norm": 0.017549166455864906, | |
| "learning_rate": 7.55e-06, | |
| "loss": 0.0007, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.00852286908775568, | |
| "learning_rate": 7.5e-06, | |
| "loss": 0.0006, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.851, | |
| "grad_norm": 0.005547389388084412, | |
| "learning_rate": 7.45e-06, | |
| "loss": 0.0005, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": 0.0061622606590390205, | |
| "learning_rate": 7.4e-06, | |
| "loss": 0.0005, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 0.853, | |
| "grad_norm": 0.005182339809834957, | |
| "learning_rate": 7.35e-06, | |
| "loss": 0.0008, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 0.854, | |
| "grad_norm": 0.005366960074752569, | |
| "learning_rate": 7.2999999999999996e-06, | |
| "loss": 0.0006, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 0.855, | |
| "grad_norm": 0.005542315077036619, | |
| "learning_rate": 7.25e-06, | |
| "loss": 0.0006, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 0.003940809518098831, | |
| "learning_rate": 7.2e-06, | |
| "loss": 0.0005, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 0.857, | |
| "grad_norm": 0.003730529686436057, | |
| "learning_rate": 7.15e-06, | |
| "loss": 0.0006, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 0.858, | |
| "grad_norm": 0.0033961348235607147, | |
| "learning_rate": 7.1e-06, | |
| "loss": 0.0005, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 0.859, | |
| "grad_norm": 0.004546662792563438, | |
| "learning_rate": 7.049999999999999e-06, | |
| "loss": 0.0006, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.009168008342385292, | |
| "learning_rate": 7.000000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.861, | |
| "grad_norm": 0.008373426273465157, | |
| "learning_rate": 6.950000000000001e-06, | |
| "loss": 0.0008, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 0.862, | |
| "grad_norm": 0.004947313107550144, | |
| "learning_rate": 6.900000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 0.863, | |
| "grad_norm": 0.015127859078347683, | |
| "learning_rate": 6.8500000000000005e-06, | |
| "loss": 0.0006, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.0056435600854456425, | |
| "learning_rate": 6.800000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 0.865, | |
| "grad_norm": 0.004109732341021299, | |
| "learning_rate": 6.750000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.866, | |
| "grad_norm": 0.006170314736664295, | |
| "learning_rate": 6.700000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 0.867, | |
| "grad_norm": 0.002802550094202161, | |
| "learning_rate": 6.650000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": 0.0029788350220769644, | |
| "learning_rate": 6.6e-06, | |
| "loss": 0.0004, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 0.869, | |
| "grad_norm": 0.013022363185882568, | |
| "learning_rate": 6.550000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.0036853367928415537, | |
| "learning_rate": 6.5000000000000004e-06, | |
| "loss": 0.0006, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.871, | |
| "grad_norm": 0.002578242914751172, | |
| "learning_rate": 6.45e-06, | |
| "loss": 0.0005, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 0.0036895396187901497, | |
| "learning_rate": 6.4000000000000006e-06, | |
| "loss": 0.0005, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 0.873, | |
| "grad_norm": 0.006020987406373024, | |
| "learning_rate": 6.35e-06, | |
| "loss": 0.0005, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 0.874, | |
| "grad_norm": 0.006671608425676823, | |
| "learning_rate": 6.300000000000001e-06, | |
| "loss": 0.0006, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 0.875, | |
| "grad_norm": 0.0038102639373391867, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.0006, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": 0.006786294747143984, | |
| "learning_rate": 6.2e-06, | |
| "loss": 0.0004, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 0.877, | |
| "grad_norm": 0.00381205091252923, | |
| "learning_rate": 6.15e-06, | |
| "loss": 0.0004, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 0.878, | |
| "grad_norm": 0.007368630729615688, | |
| "learning_rate": 6.1e-06, | |
| "loss": 0.0005, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 0.879, | |
| "grad_norm": 0.0035172586794942617, | |
| "learning_rate": 6.0500000000000005e-06, | |
| "loss": 0.0006, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.005555720068514347, | |
| "learning_rate": 6e-06, | |
| "loss": 0.0007, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.881, | |
| "grad_norm": 0.0076825893484056, | |
| "learning_rate": 5.95e-06, | |
| "loss": 0.0005, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 0.882, | |
| "grad_norm": 0.0055446140468120575, | |
| "learning_rate": 5.9e-06, | |
| "loss": 0.0005, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 0.883, | |
| "grad_norm": 0.002265618182718754, | |
| "learning_rate": 5.850000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": 0.003428585361689329, | |
| "learning_rate": 5.8e-06, | |
| "loss": 0.0004, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 0.885, | |
| "grad_norm": 0.0044764927588403225, | |
| "learning_rate": 5.750000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.886, | |
| "grad_norm": 0.003201392712071538, | |
| "learning_rate": 5.7000000000000005e-06, | |
| "loss": 0.0005, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 0.887, | |
| "grad_norm": 0.0029762780759483576, | |
| "learning_rate": 5.65e-06, | |
| "loss": 0.0006, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 0.07450267672538757, | |
| "learning_rate": 5.600000000000001e-06, | |
| "loss": 0.0009, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 0.889, | |
| "grad_norm": 0.006392148323357105, | |
| "learning_rate": 5.55e-06, | |
| "loss": 0.0006, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.0038995451759546995, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.891, | |
| "grad_norm": 0.0028438065201044083, | |
| "learning_rate": 5.45e-06, | |
| "loss": 0.0004, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": 0.003168331226333976, | |
| "learning_rate": 5.4e-06, | |
| "loss": 0.0004, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 0.893, | |
| "grad_norm": 0.0026163198053836823, | |
| "learning_rate": 5.3500000000000004e-06, | |
| "loss": 0.0004, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 0.894, | |
| "grad_norm": 0.0029086521826684475, | |
| "learning_rate": 5.3e-06, | |
| "loss": 0.0005, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 0.895, | |
| "grad_norm": 0.011433840729296207, | |
| "learning_rate": 5.25e-06, | |
| "loss": 0.0007, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.01782575435936451, | |
| "learning_rate": 5.2e-06, | |
| "loss": 0.0011, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 0.897, | |
| "grad_norm": 0.00613692682236433, | |
| "learning_rate": 5.15e-06, | |
| "loss": 0.0004, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 0.898, | |
| "grad_norm": 0.02408697083592415, | |
| "learning_rate": 5.1e-06, | |
| "loss": 0.0007, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 0.899, | |
| "grad_norm": 0.004028539173305035, | |
| "learning_rate": 5.050000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.0032080088276416063, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0005, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.901, | |
| "grad_norm": 0.0035681568551808596, | |
| "learning_rate": 4.950000000000001e-06, | |
| "loss": 0.0004, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 0.902, | |
| "grad_norm": 0.007591512985527515, | |
| "learning_rate": 4.9000000000000005e-06, | |
| "loss": 0.0005, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 0.903, | |
| "grad_norm": 0.004855870269238949, | |
| "learning_rate": 4.85e-06, | |
| "loss": 0.0004, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 0.004854188766330481, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 0.0004, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 0.905, | |
| "grad_norm": 0.004117886070162058, | |
| "learning_rate": 4.75e-06, | |
| "loss": 0.0005, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.906, | |
| "grad_norm": 0.0045243133790791035, | |
| "learning_rate": 4.7e-06, | |
| "loss": 0.0005, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 0.907, | |
| "grad_norm": 0.001863984507508576, | |
| "learning_rate": 4.65e-06, | |
| "loss": 0.0004, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": 0.002472365740686655, | |
| "learning_rate": 4.6e-06, | |
| "loss": 0.0005, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 0.909, | |
| "grad_norm": 0.0020466954447329044, | |
| "learning_rate": 4.5500000000000005e-06, | |
| "loss": 0.0004, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.004180034622550011, | |
| "learning_rate": 4.5e-06, | |
| "loss": 0.0004, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.911, | |
| "grad_norm": 0.00341266137547791, | |
| "learning_rate": 4.45e-06, | |
| "loss": 0.0006, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.006567875389009714, | |
| "learning_rate": 4.4e-06, | |
| "loss": 0.0004, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 0.913, | |
| "grad_norm": 0.003975498490035534, | |
| "learning_rate": 4.35e-06, | |
| "loss": 0.0006, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 0.914, | |
| "grad_norm": 0.003391894046217203, | |
| "learning_rate": 4.2999999999999995e-06, | |
| "loss": 0.0006, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 0.915, | |
| "grad_norm": 0.005821021273732185, | |
| "learning_rate": 4.250000000000001e-06, | |
| "loss": 0.0004, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": 0.0022448371164500713, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "loss": 0.0004, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 0.917, | |
| "grad_norm": 0.003718709573149681, | |
| "learning_rate": 4.15e-06, | |
| "loss": 0.0004, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 0.918, | |
| "grad_norm": 0.008243223652243614, | |
| "learning_rate": 4.1000000000000006e-06, | |
| "loss": 0.0007, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 0.919, | |
| "grad_norm": 0.010773789137601852, | |
| "learning_rate": 4.05e-06, | |
| "loss": 0.0007, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.006589268799871206, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.921, | |
| "grad_norm": 0.0026856744661927223, | |
| "learning_rate": 3.95e-06, | |
| "loss": 0.0004, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 0.922, | |
| "grad_norm": 0.012134186923503876, | |
| "learning_rate": 3.9e-06, | |
| "loss": 0.0005, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 0.923, | |
| "grad_norm": 0.004260225687175989, | |
| "learning_rate": 3.85e-06, | |
| "loss": 0.0005, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": 0.0023803950753062963, | |
| "learning_rate": 3.8e-06, | |
| "loss": 0.0004, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 0.925, | |
| "grad_norm": 0.0037502460181713104, | |
| "learning_rate": 3.75e-06, | |
| "loss": 0.0005, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.926, | |
| "grad_norm": 0.0017525887815281749, | |
| "learning_rate": 3.7e-06, | |
| "loss": 0.0003, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 0.927, | |
| "grad_norm": 0.003996537532657385, | |
| "learning_rate": 3.6499999999999998e-06, | |
| "loss": 0.0005, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.009158821776509285, | |
| "learning_rate": 3.6e-06, | |
| "loss": 0.0007, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 0.929, | |
| "grad_norm": 0.003372638253495097, | |
| "learning_rate": 3.55e-06, | |
| "loss": 0.0004, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.0026602360885590315, | |
| "learning_rate": 3.5000000000000004e-06, | |
| "loss": 0.0004, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.931, | |
| "grad_norm": 0.014532738365232944, | |
| "learning_rate": 3.4500000000000004e-06, | |
| "loss": 0.0007, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": 0.002912462456151843, | |
| "learning_rate": 3.4000000000000005e-06, | |
| "loss": 0.0004, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 0.933, | |
| "grad_norm": 0.0052029709331691265, | |
| "learning_rate": 3.3500000000000005e-06, | |
| "loss": 0.0006, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 0.934, | |
| "grad_norm": 0.016220854595303535, | |
| "learning_rate": 3.3e-06, | |
| "loss": 0.0004, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 0.935, | |
| "grad_norm": 0.0030162036418914795, | |
| "learning_rate": 3.2500000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 0.002491691382601857, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "loss": 0.0004, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 0.937, | |
| "grad_norm": 0.022630969062447548, | |
| "learning_rate": 3.1500000000000003e-06, | |
| "loss": 0.0005, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 0.938, | |
| "grad_norm": 0.005951160565018654, | |
| "learning_rate": 3.1e-06, | |
| "loss": 0.0005, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 0.939, | |
| "grad_norm": 0.0024763622786849737, | |
| "learning_rate": 3.05e-06, | |
| "loss": 0.0005, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.0049979290924966335, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0005, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.941, | |
| "grad_norm": 0.0025999436620622873, | |
| "learning_rate": 2.95e-06, | |
| "loss": 0.0004, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 0.942, | |
| "grad_norm": 0.004584169946610928, | |
| "learning_rate": 2.9e-06, | |
| "loss": 0.0006, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 0.943, | |
| "grad_norm": 0.005211680196225643, | |
| "learning_rate": 2.8500000000000002e-06, | |
| "loss": 0.0005, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.0022507943212985992, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 0.0004, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 0.945, | |
| "grad_norm": 0.0029024691320955753, | |
| "learning_rate": 2.7500000000000004e-06, | |
| "loss": 0.0004, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.946, | |
| "grad_norm": 0.003968573175370693, | |
| "learning_rate": 2.7e-06, | |
| "loss": 0.0004, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 0.947, | |
| "grad_norm": 0.003264777595177293, | |
| "learning_rate": 2.65e-06, | |
| "loss": 0.0005, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": 0.0048127188347280025, | |
| "learning_rate": 2.6e-06, | |
| "loss": 0.0004, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 0.949, | |
| "grad_norm": 0.004405410494655371, | |
| "learning_rate": 2.55e-06, | |
| "loss": 0.0006, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.00462340796366334, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.0004, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.951, | |
| "grad_norm": 0.0021721452940255404, | |
| "learning_rate": 2.4500000000000003e-06, | |
| "loss": 0.0004, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 0.002355078933760524, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 0.0006, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 0.953, | |
| "grad_norm": 0.0022414589766412973, | |
| "learning_rate": 2.35e-06, | |
| "loss": 0.0004, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 0.954, | |
| "grad_norm": 0.012005253694951534, | |
| "learning_rate": 2.3e-06, | |
| "loss": 0.0006, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 0.955, | |
| "grad_norm": 0.00513832364231348, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.0005, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": 0.0027625642251223326, | |
| "learning_rate": 2.2e-06, | |
| "loss": 0.0005, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 0.957, | |
| "grad_norm": 0.008645957335829735, | |
| "learning_rate": 2.1499999999999997e-06, | |
| "loss": 0.0005, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 0.958, | |
| "grad_norm": 0.00188863230869174, | |
| "learning_rate": 2.1000000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 0.959, | |
| "grad_norm": 0.0025561931543052197, | |
| "learning_rate": 2.0500000000000003e-06, | |
| "loss": 0.0004, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.0033618698362261057, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0004, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.961, | |
| "grad_norm": 0.0018735548947006464, | |
| "learning_rate": 1.95e-06, | |
| "loss": 0.0004, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 0.962, | |
| "grad_norm": 0.0019359014695510268, | |
| "learning_rate": 1.9e-06, | |
| "loss": 0.0005, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 0.963, | |
| "grad_norm": 0.005369434133172035, | |
| "learning_rate": 1.85e-06, | |
| "loss": 0.0004, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 0.964, | |
| "grad_norm": 0.0017576682148501277, | |
| "learning_rate": 1.8e-06, | |
| "loss": 0.0004, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 0.965, | |
| "grad_norm": 0.002633103635162115, | |
| "learning_rate": 1.7500000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.966, | |
| "grad_norm": 0.007023205049335957, | |
| "learning_rate": 1.7000000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 0.967, | |
| "grad_norm": 0.0026062438264489174, | |
| "learning_rate": 1.65e-06, | |
| "loss": 0.0005, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 0.0025111304130405188, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 0.969, | |
| "grad_norm": 0.0028218806255608797, | |
| "learning_rate": 1.55e-06, | |
| "loss": 0.0004, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.0024802633561193943, | |
| "learning_rate": 1.5e-06, | |
| "loss": 0.0005, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.971, | |
| "grad_norm": 0.002883223118260503, | |
| "learning_rate": 1.45e-06, | |
| "loss": 0.0005, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 0.972, | |
| "grad_norm": 0.002503247233107686, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "loss": 0.0005, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 0.973, | |
| "grad_norm": 0.002495008986443281, | |
| "learning_rate": 1.35e-06, | |
| "loss": 0.0006, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 0.974, | |
| "grad_norm": 0.03429775312542915, | |
| "learning_rate": 1.3e-06, | |
| "loss": 0.0006, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 0.975, | |
| "grad_norm": 0.003482217201963067, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.0004, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 0.001837963704019785, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 0.0004, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 0.977, | |
| "grad_norm": 0.0020507893059402704, | |
| "learning_rate": 1.15e-06, | |
| "loss": 0.0004, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 0.978, | |
| "grad_norm": 0.0022647210862487555, | |
| "learning_rate": 1.1e-06, | |
| "loss": 0.0005, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 0.979, | |
| "grad_norm": 0.0017425378318876028, | |
| "learning_rate": 1.0500000000000001e-06, | |
| "loss": 0.0004, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.2319187968969345, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.0021, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.981, | |
| "grad_norm": 0.01799739897251129, | |
| "learning_rate": 9.5e-07, | |
| "loss": 0.0006, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 0.982, | |
| "grad_norm": 0.007147952448576689, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0005, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 0.983, | |
| "grad_norm": 0.004181794356554747, | |
| "learning_rate": 8.500000000000001e-07, | |
| "loss": 0.0005, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 0.984, | |
| "grad_norm": 0.00277232495136559, | |
| "learning_rate": 8.000000000000001e-07, | |
| "loss": 0.0004, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 0.985, | |
| "grad_norm": 0.0024797001387923956, | |
| "learning_rate": 7.5e-07, | |
| "loss": 0.0006, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.986, | |
| "grad_norm": 0.002748242113739252, | |
| "learning_rate": 7.000000000000001e-07, | |
| "loss": 0.0005, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 0.987, | |
| "grad_norm": 0.002988820429891348, | |
| "learning_rate": 6.5e-07, | |
| "loss": 0.0004, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 0.988, | |
| "grad_norm": 0.002272873418405652, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 0.0006, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 0.989, | |
| "grad_norm": 0.0028824047185480595, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0005, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.013895529322326183, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.0005, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.991, | |
| "grad_norm": 0.004210934974253178, | |
| "learning_rate": 4.5e-07, | |
| "loss": 0.0004, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 0.0017349456902593374, | |
| "learning_rate": 4.0000000000000003e-07, | |
| "loss": 0.0005, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 0.993, | |
| "grad_norm": 0.0036622195038944483, | |
| "learning_rate": 3.5000000000000004e-07, | |
| "loss": 0.0003, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 0.994, | |
| "grad_norm": 0.02928483486175537, | |
| "learning_rate": 3.0000000000000004e-07, | |
| "loss": 0.0006, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 0.995, | |
| "grad_norm": 0.004271595273166895, | |
| "learning_rate": 2.5000000000000004e-07, | |
| "loss": 0.0004, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.996, | |
| "grad_norm": 0.004935207776725292, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "loss": 0.0004, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 0.997, | |
| "grad_norm": 0.005258087068796158, | |
| "learning_rate": 1.5000000000000002e-07, | |
| "loss": 0.0004, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 0.998, | |
| "grad_norm": 0.0014150363858789206, | |
| "learning_rate": 1.0000000000000001e-07, | |
| "loss": 0.0004, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 0.999, | |
| "grad_norm": 0.003183445893228054, | |
| "learning_rate": 5.0000000000000004e-08, | |
| "loss": 0.0004, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.0063432566821575165, | |
| "learning_rate": 0.0, | |
| "loss": 0.0004, | |
| "step": 10000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.6962203336704e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
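
The block above is the complete `trainer_state.json` written by a Hugging Face `transformers` Trainer run: 10,000 total steps, a record appended to `log_history` every 10 steps, and a checkpoint saved every 1,000 steps. Below is a minimal sketch of how such a file can be consumed; the filename `trainer_state.json` and the particular summary printed are illustrative assumptions, while the `log_history` schema (`epoch`, `grad_norm`, `learning_rate`, `loss`, `step`) is taken from the file itself.

```python
# Minimal sketch: parse a Trainer state file and summarize its log_history.
# Assumption: the JSON above is saved locally as "trainer_state.json";
# the filename and the chosen summary lines are illustrative, not part
# of the original log.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]          # one record per logging interval
steps = [h["step"] for h in history]
losses = [h["loss"] for h in history]

print(f"logged records : {len(history)}")
print(f"final loss     : {losses[-1]:.4f} at step {steps[-1]}")
print(f"peak grad_norm : {max(h['grad_norm'] for h in history):.4f}")
print(f"final lr       : {history[-1]['learning_rate']:.2e}")
```

On this log the sketch would report a final loss of 0.0004 at step 10000 and a final learning rate of 0.00e+00, consistent with the learning rate in the records above decaying linearly (5e-8 per 10 steps) to zero at `max_steps`.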