| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.922065846752039, | |
| "eval_steps": 400, | |
| "global_step": 11600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01690545623600017, | |
| "grad_norm": 9.244617462158203, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 5.2046, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03381091247200034, | |
| "grad_norm": 11.779854774475098, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 4.1731, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05071636870800051, | |
| "grad_norm": 3.761927604675293, | |
| "learning_rate": 5e-05, | |
| "loss": 4.0566, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.06762182494400068, | |
| "grad_norm": 9.56804084777832, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 4.0416, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.08452728118000084, | |
| "grad_norm": 4.056901454925537, | |
| "learning_rate": 8.333333333333334e-05, | |
| "loss": 3.9922, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.10143273741600102, | |
| "grad_norm": 8.173892974853516, | |
| "learning_rate": 0.0001, | |
| "loss": 4.056, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11833819365200118, | |
| "grad_norm": 7.099091053009033, | |
| "learning_rate": 9.999735405375364e-05, | |
| "loss": 3.9618, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.13524364988800136, | |
| "grad_norm": 3.7080025672912598, | |
| "learning_rate": 9.998941652617147e-05, | |
| "loss": 3.9753, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.13524364988800136, | |
| "eval_loss": 3.9605562686920166, | |
| "eval_runtime": 11.6413, | |
| "eval_samples_per_second": 85.901, | |
| "eval_steps_per_second": 2.749, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1521491061240015, | |
| "grad_norm": 3.6200530529022217, | |
| "learning_rate": 9.997618835068782e-05, | |
| "loss": 3.956, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.1690545623600017, | |
| "grad_norm": 3.6575722694396973, | |
| "learning_rate": 9.995767108290448e-05, | |
| "loss": 3.9065, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.18596001859600186, | |
| "grad_norm": 7.8048248291015625, | |
| "learning_rate": 9.993386690040792e-05, | |
| "loss": 3.88, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.20286547483200204, | |
| "grad_norm": 2.769477605819702, | |
| "learning_rate": 9.990477860251314e-05, | |
| "loss": 3.8931, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.2197709310680022, | |
| "grad_norm": 3.249483823776245, | |
| "learning_rate": 9.987040960993446e-05, | |
| "loss": 3.8389, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.23667638730400237, | |
| "grad_norm": 1.6407500505447388, | |
| "learning_rate": 9.983076396438333e-05, | |
| "loss": 3.855, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.2535818435400025, | |
| "grad_norm": 2.995973825454712, | |
| "learning_rate": 9.978584632809293e-05, | |
| "loss": 3.8473, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.2704872997760027, | |
| "grad_norm": 2.316067934036255, | |
| "learning_rate": 9.973566198326996e-05, | |
| "loss": 3.8353, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.2704872997760027, | |
| "eval_loss": 3.8578951358795166, | |
| "eval_runtime": 11.681, | |
| "eval_samples_per_second": 85.609, | |
| "eval_steps_per_second": 2.739, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.28739275601200287, | |
| "grad_norm": 2.2685530185699463, | |
| "learning_rate": 9.968021683147353e-05, | |
| "loss": 3.8267, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.304298212248003, | |
| "grad_norm": 2.8378384113311768, | |
| "learning_rate": 9.961951739292097e-05, | |
| "loss": 3.8046, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.3212036684840032, | |
| "grad_norm": 3.528430938720703, | |
| "learning_rate": 9.955357080572128e-05, | |
| "loss": 3.8239, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.3381091247200034, | |
| "grad_norm": 5.5983099937438965, | |
| "learning_rate": 9.948238482503555e-05, | |
| "loss": 3.7842, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.3550145809560035, | |
| "grad_norm": 2.4193906784057617, | |
| "learning_rate": 9.940596782216504e-05, | |
| "loss": 3.7816, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3719200371920037, | |
| "grad_norm": 1.893676996231079, | |
| "learning_rate": 9.932432878356672e-05, | |
| "loss": 3.7895, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.3888254934280039, | |
| "grad_norm": 2.029632806777954, | |
| "learning_rate": 9.92374773097965e-05, | |
| "loss": 3.7668, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.4057309496640041, | |
| "grad_norm": 2.21596622467041, | |
| "learning_rate": 9.91454236143802e-05, | |
| "loss": 3.7979, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.4057309496640041, | |
| "eval_loss": 3.8054637908935547, | |
| "eval_runtime": 11.6706, | |
| "eval_samples_per_second": 85.685, | |
| "eval_steps_per_second": 2.742, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.42263640590000423, | |
| "grad_norm": 3.131930351257324, | |
| "learning_rate": 9.90481785226125e-05, | |
| "loss": 3.7824, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.4395418621360044, | |
| "grad_norm": 1.5176011323928833, | |
| "learning_rate": 9.894575347028381e-05, | |
| "loss": 3.7666, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.4564473183720046, | |
| "grad_norm": 2.2135422229766846, | |
| "learning_rate": 9.883816050233566e-05, | |
| "loss": 3.7684, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.47335277460800473, | |
| "grad_norm": 1.565341830253601, | |
| "learning_rate": 9.872541227144397e-05, | |
| "loss": 3.7421, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.4902582308440049, | |
| "grad_norm": 2.0272250175476074, | |
| "learning_rate": 9.860752203653138e-05, | |
| "loss": 3.7524, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.507163687080005, | |
| "grad_norm": 2.3772072792053223, | |
| "learning_rate": 9.848450366120785e-05, | |
| "loss": 3.7489, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5240691433160053, | |
| "grad_norm": 2.349799633026123, | |
| "learning_rate": 9.835637161214042e-05, | |
| "loss": 3.7421, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.5409745995520054, | |
| "grad_norm": 2.6349542140960693, | |
| "learning_rate": 9.822314095735195e-05, | |
| "loss": 3.7416, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5409745995520054, | |
| "eval_loss": 3.775789499282837, | |
| "eval_runtime": 11.6523, | |
| "eval_samples_per_second": 85.82, | |
| "eval_steps_per_second": 2.746, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5578800557880056, | |
| "grad_norm": 2.5276811122894287, | |
| "learning_rate": 9.808482736444913e-05, | |
| "loss": 3.7517, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5747855120240057, | |
| "grad_norm": 2.6835217475891113, | |
| "learning_rate": 9.794144709878008e-05, | |
| "loss": 3.7325, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5916909682600059, | |
| "grad_norm": 2.924553632736206, | |
| "learning_rate": 9.779301702152147e-05, | |
| "loss": 3.7142, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.608596424496006, | |
| "grad_norm": 2.129288673400879, | |
| "learning_rate": 9.763955458769581e-05, | |
| "loss": 3.7347, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6255018807320063, | |
| "grad_norm": 1.441651463508606, | |
| "learning_rate": 9.748107784411867e-05, | |
| "loss": 3.72, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.6424073369680064, | |
| "grad_norm": 2.1518378257751465, | |
| "learning_rate": 9.731760542727647e-05, | |
| "loss": 3.7042, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6593127932040066, | |
| "grad_norm": 4.049715518951416, | |
| "learning_rate": 9.714915656113491e-05, | |
| "loss": 3.7284, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6762182494400067, | |
| "grad_norm": 1.596699595451355, | |
| "learning_rate": 9.697575105487821e-05, | |
| "loss": 3.7116, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6762182494400067, | |
| "eval_loss": 3.7348101139068604, | |
| "eval_runtime": 11.6849, | |
| "eval_samples_per_second": 85.58, | |
| "eval_steps_per_second": 2.739, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6931237056760069, | |
| "grad_norm": 2.405496835708618, | |
| "learning_rate": 9.679740930057965e-05, | |
| "loss": 3.6867, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.710029161912007, | |
| "grad_norm": 2.290134906768799, | |
| "learning_rate": 9.661415227080342e-05, | |
| "loss": 3.6967, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7269346181480073, | |
| "grad_norm": 1.6495215892791748, | |
| "learning_rate": 9.642600151613847e-05, | |
| "loss": 3.6822, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.7438400743840075, | |
| "grad_norm": 1.6393038034439087, | |
| "learning_rate": 9.623297916266399e-05, | |
| "loss": 3.6747, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7607455306200076, | |
| "grad_norm": 1.7952766418457031, | |
| "learning_rate": 9.603510790934765e-05, | |
| "loss": 3.6778, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.7776509868560078, | |
| "grad_norm": 2.7831900119781494, | |
| "learning_rate": 9.583241102537609e-05, | |
| "loss": 3.6891, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.7945564430920079, | |
| "grad_norm": 1.602946400642395, | |
| "learning_rate": 9.562491234741863e-05, | |
| "loss": 3.6745, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8114618993280082, | |
| "grad_norm": 2.163341522216797, | |
| "learning_rate": 9.541263627682408e-05, | |
| "loss": 3.6622, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8114618993280082, | |
| "eval_loss": 3.7114827632904053, | |
| "eval_runtime": 11.6568, | |
| "eval_samples_per_second": 85.787, | |
| "eval_steps_per_second": 2.745, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8283673555640083, | |
| "grad_norm": 3.2292706966400146, | |
| "learning_rate": 9.519560777675117e-05, | |
| "loss": 3.6907, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8452728118000085, | |
| "grad_norm": 2.3297388553619385, | |
| "learning_rate": 9.497385236923304e-05, | |
| "loss": 3.659, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8621782680360086, | |
| "grad_norm": 2.1775872707366943, | |
| "learning_rate": 9.474739613217583e-05, | |
| "loss": 3.6543, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.8790837242720088, | |
| "grad_norm": 2.1940650939941406, | |
| "learning_rate": 9.4516265696292e-05, | |
| "loss": 3.6631, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8959891805080089, | |
| "grad_norm": 2.0075910091400146, | |
| "learning_rate": 9.428048824196861e-05, | |
| "loss": 3.6699, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9128946367440092, | |
| "grad_norm": 1.463968276977539, | |
| "learning_rate": 9.404009149607104e-05, | |
| "loss": 3.6458, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9298000929800093, | |
| "grad_norm": 1.776117205619812, | |
| "learning_rate": 9.379510372868227e-05, | |
| "loss": 3.6442, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9467055492160095, | |
| "grad_norm": 1.4943894147872925, | |
| "learning_rate": 9.354555374977845e-05, | |
| "loss": 3.6717, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9467055492160095, | |
| "eval_loss": 3.6869394779205322, | |
| "eval_runtime": 11.6677, | |
| "eval_samples_per_second": 85.706, | |
| "eval_steps_per_second": 2.743, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9636110054520096, | |
| "grad_norm": 2.1420116424560547, | |
| "learning_rate": 9.32914709058409e-05, | |
| "loss": 3.6602, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.9805164616880098, | |
| "grad_norm": 1.4225534200668335, | |
| "learning_rate": 9.303288507640508e-05, | |
| "loss": 3.6312, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.99742191792401, | |
| "grad_norm": 1.5110421180725098, | |
| "learning_rate": 9.276982667054676e-05, | |
| "loss": 3.6309, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.01432737416001, | |
| "grad_norm": 1.4234793186187744, | |
| "learning_rate": 9.250232662330597e-05, | |
| "loss": 3.5446, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0312328303960103, | |
| "grad_norm": 1.4176740646362305, | |
| "learning_rate": 9.223041639204923e-05, | |
| "loss": 3.5084, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.0481382866320106, | |
| "grad_norm": 1.5388909578323364, | |
| "learning_rate": 9.195412795277012e-05, | |
| "loss": 3.4993, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0650437428680106, | |
| "grad_norm": 1.7309865951538086, | |
| "learning_rate": 9.167349379632901e-05, | |
| "loss": 3.5396, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.0819491991040109, | |
| "grad_norm": 1.6854147911071777, | |
| "learning_rate": 9.138854692463229e-05, | |
| "loss": 3.5129, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.0819491991040109, | |
| "eval_loss": 3.657722234725952, | |
| "eval_runtime": 11.6989, | |
| "eval_samples_per_second": 85.478, | |
| "eval_steps_per_second": 2.735, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.098854655340011, | |
| "grad_norm": 2.1751339435577393, | |
| "learning_rate": 9.10993208467513e-05, | |
| "loss": 3.523, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.1157601115760112, | |
| "grad_norm": 1.1804617643356323, | |
| "learning_rate": 9.08058495749818e-05, | |
| "loss": 3.5124, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1326655678120114, | |
| "grad_norm": 2.0826492309570312, | |
| "learning_rate": 9.050816762084426e-05, | |
| "loss": 3.5145, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.1495710240480115, | |
| "grad_norm": 1.2046183347702026, | |
| "learning_rate": 9.020630999102529e-05, | |
| "loss": 3.4959, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1664764802840117, | |
| "grad_norm": 1.7077451944351196, | |
| "learning_rate": 8.990031218326104e-05, | |
| "loss": 3.5147, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.1833819365200118, | |
| "grad_norm": 1.5411063432693481, | |
| "learning_rate": 8.959021018216262e-05, | |
| "loss": 3.5041, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.200287392756012, | |
| "grad_norm": 1.4028867483139038, | |
| "learning_rate": 8.927604045498453e-05, | |
| "loss": 3.5017, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.217192848992012, | |
| "grad_norm": 1.7714195251464844, | |
| "learning_rate": 8.895783994733611e-05, | |
| "loss": 3.5056, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.217192848992012, | |
| "eval_loss": 3.643071174621582, | |
| "eval_runtime": 11.6982, | |
| "eval_samples_per_second": 85.484, | |
| "eval_steps_per_second": 2.735, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2340983052280123, | |
| "grad_norm": 1.4252102375030518, | |
| "learning_rate": 8.863564607883687e-05, | |
| "loss": 3.4944, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2510037614640126, | |
| "grad_norm": 1.6317962408065796, | |
| "learning_rate": 8.8309496738716e-05, | |
| "loss": 3.4899, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2679092177000126, | |
| "grad_norm": 1.9923129081726074, | |
| "learning_rate": 8.79794302813567e-05, | |
| "loss": 3.5166, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.284814673936013, | |
| "grad_norm": 1.0414555072784424, | |
| "learning_rate": 8.764548552178584e-05, | |
| "loss": 3.4998, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.301720130172013, | |
| "grad_norm": 1.9689300060272217, | |
| "learning_rate": 8.730770173110932e-05, | |
| "loss": 3.5138, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3186255864080132, | |
| "grad_norm": 1.6223361492156982, | |
| "learning_rate": 8.696611863189395e-05, | |
| "loss": 3.49, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3355310426440132, | |
| "grad_norm": 1.6299093961715698, | |
| "learning_rate": 8.662077639349612e-05, | |
| "loss": 3.4629, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.3524364988800135, | |
| "grad_norm": 1.510591983795166, | |
| "learning_rate": 8.627171562733803e-05, | |
| "loss": 3.4711, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3524364988800135, | |
| "eval_loss": 3.620584726333618, | |
| "eval_runtime": 11.664, | |
| "eval_samples_per_second": 85.734, | |
| "eval_steps_per_second": 2.743, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3693419551160138, | |
| "grad_norm": 1.280197262763977, | |
| "learning_rate": 8.591897738213187e-05, | |
| "loss": 3.4808, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.3862474113520138, | |
| "grad_norm": 1.5453211069107056, | |
| "learning_rate": 8.556260313905257e-05, | |
| "loss": 3.4747, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.403152867588014, | |
| "grad_norm": 1.439795970916748, | |
| "learning_rate": 8.520263480685968e-05, | |
| "loss": 3.4847, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.4200583238240143, | |
| "grad_norm": 1.2832996845245361, | |
| "learning_rate": 8.483911471696912e-05, | |
| "loss": 3.482, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4369637800600144, | |
| "grad_norm": 1.684513807296753, | |
| "learning_rate": 8.447208561847501e-05, | |
| "loss": 3.4514, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.4538692362960146, | |
| "grad_norm": 0.9559006094932556, | |
| "learning_rate": 8.410159067312243e-05, | |
| "loss": 3.4771, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4707746925320146, | |
| "grad_norm": 1.0770719051361084, | |
| "learning_rate": 8.372767345023185e-05, | |
| "loss": 3.483, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.487680148768015, | |
| "grad_norm": 1.1667702198028564, | |
| "learning_rate": 8.33503779215754e-05, | |
| "loss": 3.4781, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.487680148768015, | |
| "eval_loss": 3.585639715194702, | |
| "eval_runtime": 11.6769, | |
| "eval_samples_per_second": 85.639, | |
| "eval_steps_per_second": 2.74, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.504585605004015, | |
| "grad_norm": 1.183647632598877, | |
| "learning_rate": 8.296974845620584e-05, | |
| "loss": 3.4836, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.5214910612400152, | |
| "grad_norm": 1.308076024055481, | |
| "learning_rate": 8.258582981523895e-05, | |
| "loss": 3.4571, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5383965174760155, | |
| "grad_norm": 1.5776398181915283, | |
| "learning_rate": 8.219866714658971e-05, | |
| "loss": 3.444, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.5553019737120155, | |
| "grad_norm": 1.1617664098739624, | |
| "learning_rate": 8.180830597966303e-05, | |
| "loss": 3.4465, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.5722074299480158, | |
| "grad_norm": 0.9418047070503235, | |
| "learning_rate": 8.141479221999953e-05, | |
| "loss": 3.4568, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.589112886184016, | |
| "grad_norm": 1.3985543251037598, | |
| "learning_rate": 8.101817214387723e-05, | |
| "loss": 3.4502, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.606018342420016, | |
| "grad_norm": 1.0654075145721436, | |
| "learning_rate": 8.06184923928695e-05, | |
| "loss": 3.4424, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.622923798656016, | |
| "grad_norm": 1.5798131227493286, | |
| "learning_rate": 8.021579996836025e-05, | |
| "loss": 3.4336, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.622923798656016, | |
| "eval_loss": 3.5558018684387207, | |
| "eval_runtime": 11.6573, | |
| "eval_samples_per_second": 85.783, | |
| "eval_steps_per_second": 2.745, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6398292548920164, | |
| "grad_norm": 1.036365270614624, | |
| "learning_rate": 7.981014222601651e-05, | |
| "loss": 3.4397, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.6567347111280166, | |
| "grad_norm": 1.2249760627746582, | |
| "learning_rate": 7.940156687021969e-05, | |
| "loss": 3.4315, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.6736401673640167, | |
| "grad_norm": 1.3212870359420776, | |
| "learning_rate": 7.899012194845549e-05, | |
| "loss": 3.4342, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.690545623600017, | |
| "grad_norm": 1.130004644393921, | |
| "learning_rate": 7.857585584566375e-05, | |
| "loss": 3.428, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7074510798360172, | |
| "grad_norm": 1.0531315803527832, | |
| "learning_rate": 7.815881727854847e-05, | |
| "loss": 3.4208, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.7243565360720172, | |
| "grad_norm": 1.3188402652740479, | |
| "learning_rate": 7.77390552898488e-05, | |
| "loss": 3.4227, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7412619923080173, | |
| "grad_norm": 1.0934581756591797, | |
| "learning_rate": 7.73166192425718e-05, | |
| "loss": 3.4245, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.7581674485440177, | |
| "grad_norm": 1.7236692905426025, | |
| "learning_rate": 7.68915588141874e-05, | |
| "loss": 3.4167, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7581674485440177, | |
| "eval_loss": 3.5321877002716064, | |
| "eval_runtime": 11.6863, | |
| "eval_samples_per_second": 85.57, | |
| "eval_steps_per_second": 2.738, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.7750729047800178, | |
| "grad_norm": 0.9837960600852966, | |
| "learning_rate": 7.646392399078647e-05, | |
| "loss": 3.4196, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.7919783610160178, | |
| "grad_norm": 1.0993341207504272, | |
| "learning_rate": 7.60337650612026e-05, | |
| "loss": 3.4116, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.808883817252018, | |
| "grad_norm": 1.3233968019485474, | |
| "learning_rate": 7.560113261109827e-05, | |
| "loss": 3.4118, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.8257892734880183, | |
| "grad_norm": 1.250022530555725, | |
| "learning_rate": 7.516607751701602e-05, | |
| "loss": 3.3959, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.8426947297240184, | |
| "grad_norm": 1.4518115520477295, | |
| "learning_rate": 7.472865094039555e-05, | |
| "loss": 3.4155, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.8596001859600186, | |
| "grad_norm": 1.307420015335083, | |
| "learning_rate": 7.428890432155719e-05, | |
| "loss": 3.3955, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.876505642196019, | |
| "grad_norm": 1.255487322807312, | |
| "learning_rate": 7.384688937365279e-05, | |
| "loss": 3.3877, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.893411098432019, | |
| "grad_norm": 0.999951183795929, | |
| "learning_rate": 7.340265807658422e-05, | |
| "loss": 3.3942, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.893411098432019, | |
| "eval_loss": 3.4983253479003906, | |
| "eval_runtime": 11.6754, | |
| "eval_samples_per_second": 85.65, | |
| "eval_steps_per_second": 2.741, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.910316554668019, | |
| "grad_norm": 1.4537323713302612, | |
| "learning_rate": 7.29562626708907e-05, | |
| "loss": 3.4025, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.9272220109040192, | |
| "grad_norm": 1.2412198781967163, | |
| "learning_rate": 7.25077556516055e-05, | |
| "loss": 3.3924, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9441274671400195, | |
| "grad_norm": 1.592606544494629, | |
| "learning_rate": 7.205718976208258e-05, | |
| "loss": 3.3732, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.9610329233760195, | |
| "grad_norm": 0.9837318658828735, | |
| "learning_rate": 7.160461798779413e-05, | |
| "loss": 3.3909, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.9779383796120198, | |
| "grad_norm": 1.1004836559295654, | |
| "learning_rate": 7.115009355009959e-05, | |
| "loss": 3.3837, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.99484383584802, | |
| "grad_norm": 1.5246398448944092, | |
| "learning_rate": 7.069366989998692e-05, | |
| "loss": 3.366, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.01174929208402, | |
| "grad_norm": 1.1871501207351685, | |
| "learning_rate": 7.023540071178697e-05, | |
| "loss": 3.2107, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 2.02865474832002, | |
| "grad_norm": 1.3174642324447632, | |
| "learning_rate": 6.977533987686147e-05, | |
| "loss": 3.1636, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.02865474832002, | |
| "eval_loss": 3.476820230484009, | |
| "eval_runtime": 11.7209, | |
| "eval_samples_per_second": 85.318, | |
| "eval_steps_per_second": 2.73, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0455602045560206, | |
| "grad_norm": 1.3271148204803467, | |
| "learning_rate": 6.931354149726548e-05, | |
| "loss": 3.145, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 2.0624656607920206, | |
| "grad_norm": 1.0174113512039185, | |
| "learning_rate": 6.885005987938516e-05, | |
| "loss": 3.1572, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.0793711170280207, | |
| "grad_norm": 1.086552619934082, | |
| "learning_rate": 6.838494952755154e-05, | |
| "loss": 3.1499, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 2.096276573264021, | |
| "grad_norm": 0.9986650943756104, | |
| "learning_rate": 6.791826513763076e-05, | |
| "loss": 3.1508, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.113182029500021, | |
| "grad_norm": 1.0550023317337036, | |
| "learning_rate": 6.745006159059222e-05, | |
| "loss": 3.1484, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 2.1300874857360212, | |
| "grad_norm": 0.9948070049285889, | |
| "learning_rate": 6.69803939460544e-05, | |
| "loss": 3.1394, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.1469929419720213, | |
| "grad_norm": 0.9610317945480347, | |
| "learning_rate": 6.650931743581033e-05, | |
| "loss": 3.1438, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 2.1638983982080218, | |
| "grad_norm": 1.1480181217193604, | |
| "learning_rate": 6.603688745733211e-05, | |
| "loss": 3.1411, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.1638983982080218, | |
| "eval_loss": 3.4495885372161865, | |
| "eval_runtime": 11.6737, | |
| "eval_samples_per_second": 85.663, | |
| "eval_steps_per_second": 2.741, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.180803854444022, | |
| "grad_norm": 0.8075036406517029, | |
| "learning_rate": 6.556315956725662e-05, | |
| "loss": 3.1346, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 2.197709310680022, | |
| "grad_norm": 1.198843240737915, | |
| "learning_rate": 6.50881894748519e-05, | |
| "loss": 3.1739, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2146147669160223, | |
| "grad_norm": 1.3039313554763794, | |
| "learning_rate": 6.461203303546615e-05, | |
| "loss": 3.1518, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 2.2315202231520224, | |
| "grad_norm": 1.0603476762771606, | |
| "learning_rate": 6.413474624395905e-05, | |
| "loss": 3.1561, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.2484256793880224, | |
| "grad_norm": 1.0246437788009644, | |
| "learning_rate": 6.365638522811704e-05, | |
| "loss": 3.1361, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 2.265331135624023, | |
| "grad_norm": 1.0935068130493164, | |
| "learning_rate": 6.317700624205273e-05, | |
| "loss": 3.1174, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.282236591860023, | |
| "grad_norm": 0.9507274031639099, | |
| "learning_rate": 6.269666565958963e-05, | |
| "loss": 3.1381, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 2.299142048096023, | |
| "grad_norm": 0.9847383499145508, | |
| "learning_rate": 6.221541996763269e-05, | |
| "loss": 3.1154, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.299142048096023, | |
| "eval_loss": 3.407775640487671, | |
| "eval_runtime": 11.6847, | |
| "eval_samples_per_second": 85.582, | |
| "eval_steps_per_second": 2.739, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.316047504332023, | |
| "grad_norm": 0.9965755343437195, | |
| "learning_rate": 6.173332575952557e-05, | |
| "loss": 3.1175, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 2.3329529605680235, | |
| "grad_norm": 0.9966267943382263, | |
| "learning_rate": 6.125043972839536e-05, | |
| "loss": 3.1507, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.3498584168040235, | |
| "grad_norm": 1.1471587419509888, | |
| "learning_rate": 6.0766818660485716e-05, | |
| "loss": 3.1287, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 2.3667638730400236, | |
| "grad_norm": 1.2493683099746704, | |
| "learning_rate": 6.028251942847882e-05, | |
| "loss": 3.1202, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.3836693292760236, | |
| "grad_norm": 1.0860092639923096, | |
| "learning_rate": 5.9797598984807335e-05, | |
| "loss": 3.121, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 2.400574785512024, | |
| "grad_norm": 1.1602693796157837, | |
| "learning_rate": 5.931211435495694e-05, | |
| "loss": 3.1282, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.417480241748024, | |
| "grad_norm": 0.9266437292098999, | |
| "learning_rate": 5.882612263076026e-05, | |
| "loss": 3.1033, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 2.434385697984024, | |
| "grad_norm": 1.0014028549194336, | |
| "learning_rate": 5.833968096368301e-05, | |
| "loss": 3.1007, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.434385697984024, | |
| "eval_loss": 3.378451108932495, | |
| "eval_runtime": 11.6786, | |
| "eval_samples_per_second": 85.627, | |
| "eval_steps_per_second": 2.74, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4512911542200246, | |
| "grad_norm": 1.0491281747817993, | |
| "learning_rate": 5.785284655810308e-05, | |
| "loss": 3.1117, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 2.4681966104560247, | |
| "grad_norm": 0.8672958612442017, | |
| "learning_rate": 5.7365676664583514e-05, | |
| "loss": 3.1037, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.4851020666920247, | |
| "grad_norm": 1.0733287334442139, | |
| "learning_rate": 5.687822857313993e-05, | |
| "loss": 3.1175, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 2.502007522928025, | |
| "grad_norm": 0.8078711032867432, | |
| "learning_rate": 5.63905596065033e-05, | |
| "loss": 3.1099, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.5189129791640252, | |
| "grad_norm": 1.0341311693191528, | |
| "learning_rate": 5.590272711337908e-05, | |
| "loss": 3.1167, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 2.5358184354000253, | |
| "grad_norm": 1.1616711616516113, | |
| "learning_rate": 5.541478846170298e-05, | |
| "loss": 3.0931, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.5527238916360258, | |
| "grad_norm": 0.8573184609413147, | |
| "learning_rate": 5.4926801031894734e-05, | |
| "loss": 3.1002, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 2.569629347872026, | |
| "grad_norm": 0.9128609895706177, | |
| "learning_rate": 5.4438822210110275e-05, | |
| "loss": 3.1117, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.569629347872026, | |
| "eval_loss": 3.3413400650024414, | |
| "eval_runtime": 11.683, | |
| "eval_samples_per_second": 85.595, | |
| "eval_steps_per_second": 2.739, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.586534804108026, | |
| "grad_norm": 0.9771824479103088, | |
| "learning_rate": 5.395090938149321e-05, | |
| "loss": 3.0962, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 2.603440260344026, | |
| "grad_norm": 1.0292117595672607, | |
| "learning_rate": 5.346311992342656e-05, | |
| "loss": 3.0829, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.6203457165800264, | |
| "grad_norm": 0.8531580567359924, | |
| "learning_rate": 5.297551119878522e-05, | |
| "loss": 3.0778, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 2.6372511728160264, | |
| "grad_norm": 0.9693793058395386, | |
| "learning_rate": 5.248814054919031e-05, | |
| "loss": 3.0678, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.6541566290520264, | |
| "grad_norm": 0.999812662601471, | |
| "learning_rate": 5.200106528826586e-05, | |
| "loss": 3.0851, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 2.6710620852880265, | |
| "grad_norm": 0.8840042352676392, | |
| "learning_rate": 5.151434269489889e-05, | |
| "loss": 3.0775, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.687967541524027, | |
| "grad_norm": 0.8592630624771118, | |
| "learning_rate": 5.102803000650359e-05, | |
| "loss": 3.0553, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 2.704872997760027, | |
| "grad_norm": 0.8932810425758362, | |
| "learning_rate": 5.054218441229031e-05, | |
| "loss": 3.0653, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.704872997760027, | |
| "eval_loss": 3.309781551361084, | |
| "eval_runtime": 11.6906, | |
| "eval_samples_per_second": 85.538, | |
| "eval_steps_per_second": 2.737, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.721778453996027, | |
| "grad_norm": 0.9920628070831299, | |
| "learning_rate": 5.005686304654018e-05, | |
| "loss": 3.0565, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 2.7386839102320275, | |
| "grad_norm": 0.8969925045967102, | |
| "learning_rate": 4.957212298188638e-05, | |
| "loss": 3.0684, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.7555893664680275, | |
| "grad_norm": 0.9055591821670532, | |
| "learning_rate": 4.908802122260243e-05, | |
| "loss": 3.0721, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 2.7724948227040276, | |
| "grad_norm": 1.047692060470581, | |
| "learning_rate": 4.8604614697898706e-05, | |
| "loss": 3.0404, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.789400278940028, | |
| "grad_norm": 0.9087944626808167, | |
| "learning_rate": 4.8121960255227603e-05, | |
| "loss": 3.0607, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 2.806305735176028, | |
| "grad_norm": 0.845425546169281, | |
| "learning_rate": 4.764011465359851e-05, | |
| "loss": 3.0396, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.823211191412028, | |
| "grad_norm": 0.9913256764411926, | |
| "learning_rate": 4.715913455690301e-05, | |
| "loss": 3.0444, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 2.8401166476480286, | |
| "grad_norm": 1.0773906707763672, | |
| "learning_rate": 4.66790765272514e-05, | |
| "loss": 3.0378, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.8401166476480286, | |
| "eval_loss": 3.284963846206665, | |
| "eval_runtime": 11.6698, | |
| "eval_samples_per_second": 85.691, | |
| "eval_steps_per_second": 2.742, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.8570221038840287, | |
| "grad_norm": 0.9364065527915955, | |
| "learning_rate": 4.619999701832108e-05, | |
| "loss": 3.0088, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 2.8739275601200287, | |
| "grad_norm": 0.9384069442749023, | |
| "learning_rate": 4.572195236871777e-05, | |
| "loss": 3.0226, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.890833016356029, | |
| "grad_norm": 0.8556334972381592, | |
| "learning_rate": 4.524499879535016e-05, | |
| "loss": 3.0257, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 2.907738472592029, | |
| "grad_norm": 0.9114282131195068, | |
| "learning_rate": 4.476919238681904e-05, | |
| "loss": 3.0302, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.9246439288280293, | |
| "grad_norm": 0.832721471786499, | |
| "learning_rate": 4.4294589096821325e-05, | |
| "loss": 3.0438, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 2.9415493850640293, | |
| "grad_norm": 0.8480991125106812, | |
| "learning_rate": 4.3821244737570046e-05, | |
| "loss": 3.0276, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.9584548413000293, | |
| "grad_norm": 1.021043300628662, | |
| "learning_rate": 4.3349214973231024e-05, | |
| "loss": 3.0216, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 2.97536029753603, | |
| "grad_norm": 0.7957491874694824, | |
| "learning_rate": 4.287855531337683e-05, | |
| "loss": 3.0173, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.97536029753603, | |
| "eval_loss": 3.2582342624664307, | |
| "eval_runtime": 11.6907, | |
| "eval_samples_per_second": 85.538, | |
| "eval_steps_per_second": 2.737, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 2.99226575377203, | |
| "grad_norm": 1.0264720916748047, | |
| "learning_rate": 4.2409321106459077e-05, | |
| "loss": 3.0152, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 3.00917121000803, | |
| "grad_norm": 0.8381086587905884, | |
| "learning_rate": 4.194156753329942e-05, | |
| "loss": 2.886, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.0260766662440304, | |
| "grad_norm": 0.9872603416442871, | |
| "learning_rate": 4.147534960060059e-05, | |
| "loss": 2.7538, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 3.0429821224800304, | |
| "grad_norm": 0.8750160336494446, | |
| "learning_rate": 4.1010722134477665e-05, | |
| "loss": 2.791, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.0598875787160305, | |
| "grad_norm": 0.7593681216239929, | |
| "learning_rate": 4.054773977401066e-05, | |
| "loss": 2.7791, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 3.076793034952031, | |
| "grad_norm": 0.9249379634857178, | |
| "learning_rate": 4.008645696481903e-05, | |
| "loss": 2.7693, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.093698491188031, | |
| "grad_norm": 1.0219660997390747, | |
| "learning_rate": 3.962692795265914e-05, | |
| "loss": 2.7869, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 3.110603947424031, | |
| "grad_norm": 0.8459084630012512, | |
| "learning_rate": 3.916920677704499e-05, | |
| "loss": 2.778, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.110603947424031, | |
| "eval_loss": 3.247343063354492, | |
| "eval_runtime": 11.6589, | |
| "eval_samples_per_second": 85.771, | |
| "eval_steps_per_second": 2.745, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.1275094036600315, | |
| "grad_norm": 0.7969251871109009, | |
| "learning_rate": 3.8713347264893294e-05, | |
| "loss": 2.7645, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 3.1444148598960315, | |
| "grad_norm": 0.8492719531059265, | |
| "learning_rate": 3.8259403024193616e-05, | |
| "loss": 2.7729, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.1613203161320316, | |
| "grad_norm": 0.862267255783081, | |
| "learning_rate": 3.780742743770417e-05, | |
| "loss": 2.7825, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 3.1782257723680316, | |
| "grad_norm": 0.9612255692481995, | |
| "learning_rate": 3.7357473656674126e-05, | |
| "loss": 2.7848, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.195131228604032, | |
| "grad_norm": 0.8228344321250916, | |
| "learning_rate": 3.6909594594593175e-05, | |
| "loss": 2.7684, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 3.212036684840032, | |
| "grad_norm": 0.9417145252227783, | |
| "learning_rate": 3.6463842920969026e-05, | |
| "loss": 2.7771, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.228942141076032, | |
| "grad_norm": 0.7457485795021057, | |
| "learning_rate": 3.602027105513355e-05, | |
| "loss": 2.8103, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 3.2458475973120327, | |
| "grad_norm": 0.8420482277870178, | |
| "learning_rate": 3.557893116007848e-05, | |
| "loss": 2.7591, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.2458475973120327, | |
| "eval_loss": 3.20989990234375, | |
| "eval_runtime": 11.7, | |
| "eval_samples_per_second": 85.47, | |
| "eval_steps_per_second": 2.735, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 3.2627530535480327, | |
| "grad_norm": 0.8763940930366516, | |
| "learning_rate": 3.5139875136321066e-05, | |
| "loss": 2.7569, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 3.2796585097840327, | |
| "grad_norm": 0.8527898192405701, | |
| "learning_rate": 3.470315461580079e-05, | |
| "loss": 2.7533, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 3.2965639660200328, | |
| "grad_norm": 0.8603160381317139, | |
| "learning_rate": 3.426882095580751e-05, | |
| "loss": 2.7438, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 3.3134694222560332, | |
| "grad_norm": 0.8680624961853027, | |
| "learning_rate": 3.3836925232942005e-05, | |
| "loss": 2.7353, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 3.3303748784920333, | |
| "grad_norm": 0.9345048666000366, | |
| "learning_rate": 3.3407518237109456e-05, | |
| "loss": 2.7667, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 3.3472803347280333, | |
| "grad_norm": 0.838909387588501, | |
| "learning_rate": 3.29806504655467e-05, | |
| "loss": 2.7438, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 3.364185790964034, | |
| "grad_norm": 0.8523705005645752, | |
| "learning_rate": 3.2556372116883874e-05, | |
| "loss": 2.771, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 3.381091247200034, | |
| "grad_norm": 0.8146962523460388, | |
| "learning_rate": 3.213473308524115e-05, | |
| "loss": 2.7634, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.381091247200034, | |
| "eval_loss": 3.1767163276672363, | |
| "eval_runtime": 11.6994, | |
| "eval_samples_per_second": 85.474, | |
| "eval_steps_per_second": 2.735, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 3.397996703436034, | |
| "grad_norm": 0.789837121963501, | |
| "learning_rate": 3.171578295436133e-05, | |
| "loss": 2.7489, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 3.4149021596720344, | |
| "grad_norm": 0.8918471932411194, | |
| "learning_rate": 3.129957099177892e-05, | |
| "loss": 2.7424, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 3.4318076159080344, | |
| "grad_norm": 0.859986424446106, | |
| "learning_rate": 3.0886146143026346e-05, | |
| "loss": 2.7504, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 3.4487130721440344, | |
| "grad_norm": 0.8879761695861816, | |
| "learning_rate": 3.047555702587816e-05, | |
| "loss": 2.7572, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 3.4656185283800345, | |
| "grad_norm": 0.8334829211235046, | |
| "learning_rate": 3.0067851924633606e-05, | |
| "loss": 2.7627, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 3.482523984616035, | |
| "grad_norm": 0.8631997108459473, | |
| "learning_rate": 2.9663078784438558e-05, | |
| "loss": 2.7526, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 3.499429440852035, | |
| "grad_norm": 0.8065224885940552, | |
| "learning_rate": 2.9261285205647283e-05, | |
| "loss": 2.7353, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 3.516334897088035, | |
| "grad_norm": 0.8405721783638, | |
| "learning_rate": 2.886251843822475e-05, | |
| "loss": 2.7255, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.516334897088035, | |
| "eval_loss": 3.1594960689544678, | |
| "eval_runtime": 11.722, | |
| "eval_samples_per_second": 85.31, | |
| "eval_steps_per_second": 2.73, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 3.533240353324035, | |
| "grad_norm": 0.8360841274261475, | |
| "learning_rate": 2.8466825376190122e-05, | |
| "loss": 2.723, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 3.5501458095600356, | |
| "grad_norm": 0.8372629880905151, | |
| "learning_rate": 2.8074252552102176e-05, | |
| "loss": 2.7196, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 3.5670512657960356, | |
| "grad_norm": 0.7810975313186646, | |
| "learning_rate": 2.768484613158714e-05, | |
| "loss": 2.7162, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 3.5839567220320356, | |
| "grad_norm": 0.8663210272789001, | |
| "learning_rate": 2.729865190790975e-05, | |
| "loss": 2.736, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 3.600862178268036, | |
| "grad_norm": 0.8004422187805176, | |
| "learning_rate": 2.6915715296588083e-05, | |
| "loss": 2.7291, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 3.617767634504036, | |
| "grad_norm": 0.7938219308853149, | |
| "learning_rate": 2.653608133005278e-05, | |
| "loss": 2.6953, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 3.634673090740036, | |
| "grad_norm": 0.8083840608596802, | |
| "learning_rate": 2.6159794652351332e-05, | |
| "loss": 2.7371, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 3.6515785469760367, | |
| "grad_norm": 0.8592216968536377, | |
| "learning_rate": 2.5786899513898066e-05, | |
| "loss": 2.7152, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.6515785469760367, | |
| "eval_loss": 3.1375598907470703, | |
| "eval_runtime": 11.6562, | |
| "eval_samples_per_second": 85.791, | |
| "eval_steps_per_second": 2.745, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 3.6684840032120367, | |
| "grad_norm": 0.7407676577568054, | |
| "learning_rate": 2.54174397662704e-05, | |
| "loss": 2.713, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 3.6853894594480368, | |
| "grad_norm": 0.8212365508079529, | |
| "learning_rate": 2.5051458857052006e-05, | |
| "loss": 2.7203, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 3.7022949156840372, | |
| "grad_norm": 0.8753028512001038, | |
| "learning_rate": 2.468899982472346e-05, | |
| "loss": 2.7398, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 3.7192003719200373, | |
| "grad_norm": 0.8082839846611023, | |
| "learning_rate": 2.4330105293601023e-05, | |
| "loss": 2.7097, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 3.7361058281560373, | |
| "grad_norm": 0.8813438415527344, | |
| "learning_rate": 2.397481746882414e-05, | |
| "loss": 2.7213, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 3.753011284392038, | |
| "grad_norm": 0.8139234781265259, | |
| "learning_rate": 2.36231781313922e-05, | |
| "loss": 2.7002, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 3.769916740628038, | |
| "grad_norm": 0.800145149230957, | |
| "learning_rate": 2.3275228633251227e-05, | |
| "loss": 2.7182, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 3.786822196864038, | |
| "grad_norm": 0.7504892945289612, | |
| "learning_rate": 2.29310098924309e-05, | |
| "loss": 2.6992, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.786822196864038, | |
| "eval_loss": 3.1104345321655273, | |
| "eval_runtime": 11.6893, | |
| "eval_samples_per_second": 85.548, | |
| "eval_steps_per_second": 2.738, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 3.803727653100038, | |
| "grad_norm": 0.8169353604316711, | |
| "learning_rate": 2.2590562388232804e-05, | |
| "loss": 2.7137, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 3.820633109336038, | |
| "grad_norm": 0.8978986144065857, | |
| "learning_rate": 2.225392615647006e-05, | |
| "loss": 2.7369, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 3.8375385655720384, | |
| "grad_norm": 0.8523368239402771, | |
| "learning_rate": 2.1921140784759338e-05, | |
| "loss": 2.7309, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 3.8544440218080385, | |
| "grad_norm": 0.890410840511322, | |
| "learning_rate": 2.1592245407865252e-05, | |
| "loss": 2.6908, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 3.8713494780440385, | |
| "grad_norm": 0.8792175650596619, | |
| "learning_rate": 2.126727870309841e-05, | |
| "loss": 2.698, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 3.888254934280039, | |
| "grad_norm": 0.7908258438110352, | |
| "learning_rate": 2.09462788857669e-05, | |
| "loss": 2.6956, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 3.905160390516039, | |
| "grad_norm": 0.7934091687202454, | |
| "learning_rate": 2.0629283704682392e-05, | |
| "loss": 2.7036, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 3.922065846752039, | |
| "grad_norm": 0.8208107948303223, | |
| "learning_rate": 2.031633043772086e-05, | |
| "loss": 2.7007, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 3.922065846752039, | |
| "eval_loss": 3.0899033546447754, | |
| "eval_runtime": 11.6628, | |
| "eval_samples_per_second": 85.743, | |
| "eval_steps_per_second": 2.744, | |
| "step": 11600 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 14785, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 400, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.925050875573101e+20, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |