{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.922065846752039, "eval_steps": 400, "global_step": 11600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01690545623600017, "grad_norm": 9.244617462158203, "learning_rate": 1.6666666666666667e-05, "loss": 5.2046, "step": 50 }, { "epoch": 0.03381091247200034, "grad_norm": 11.779854774475098, "learning_rate": 3.3333333333333335e-05, "loss": 4.1731, "step": 100 }, { "epoch": 0.05071636870800051, "grad_norm": 3.761927604675293, "learning_rate": 5e-05, "loss": 4.0566, "step": 150 }, { "epoch": 0.06762182494400068, "grad_norm": 9.56804084777832, "learning_rate": 6.666666666666667e-05, "loss": 4.0416, "step": 200 }, { "epoch": 0.08452728118000084, "grad_norm": 4.056901454925537, "learning_rate": 8.333333333333334e-05, "loss": 3.9922, "step": 250 }, { "epoch": 0.10143273741600102, "grad_norm": 8.173892974853516, "learning_rate": 0.0001, "loss": 4.056, "step": 300 }, { "epoch": 0.11833819365200118, "grad_norm": 7.099091053009033, "learning_rate": 9.999735405375364e-05, "loss": 3.9618, "step": 350 }, { "epoch": 0.13524364988800136, "grad_norm": 3.7080025672912598, "learning_rate": 9.998941652617147e-05, "loss": 3.9753, "step": 400 }, { "epoch": 0.13524364988800136, "eval_loss": 3.9605562686920166, "eval_runtime": 11.6413, "eval_samples_per_second": 85.901, "eval_steps_per_second": 2.749, "step": 400 }, { "epoch": 0.1521491061240015, "grad_norm": 3.6200530529022217, "learning_rate": 9.997618835068782e-05, "loss": 3.956, "step": 450 }, { "epoch": 0.1690545623600017, "grad_norm": 3.6575722694396973, "learning_rate": 9.995767108290448e-05, "loss": 3.9065, "step": 500 }, { "epoch": 0.18596001859600186, "grad_norm": 7.8048248291015625, "learning_rate": 9.993386690040792e-05, "loss": 3.88, "step": 550 }, { "epoch": 0.20286547483200204, "grad_norm": 2.769477605819702, "learning_rate": 9.990477860251314e-05, "loss": 3.8931, "step": 600 }, { "epoch": 0.2197709310680022, "grad_norm": 3.249483823776245, "learning_rate": 9.987040960993446e-05, "loss": 3.8389, "step": 650 }, { "epoch": 0.23667638730400237, "grad_norm": 1.6407500505447388, "learning_rate": 9.983076396438333e-05, "loss": 3.855, "step": 700 }, { "epoch": 0.2535818435400025, "grad_norm": 2.995973825454712, "learning_rate": 9.978584632809293e-05, "loss": 3.8473, "step": 750 }, { "epoch": 0.2704872997760027, "grad_norm": 2.316067934036255, "learning_rate": 9.973566198326996e-05, "loss": 3.8353, "step": 800 }, { "epoch": 0.2704872997760027, "eval_loss": 3.8578951358795166, "eval_runtime": 11.681, "eval_samples_per_second": 85.609, "eval_steps_per_second": 2.739, "step": 800 }, { "epoch": 0.28739275601200287, "grad_norm": 2.2685530185699463, "learning_rate": 9.968021683147353e-05, "loss": 3.8267, "step": 850 }, { "epoch": 0.304298212248003, "grad_norm": 2.8378384113311768, "learning_rate": 9.961951739292097e-05, "loss": 3.8046, "step": 900 }, { "epoch": 0.3212036684840032, "grad_norm": 3.528430938720703, "learning_rate": 9.955357080572128e-05, "loss": 3.8239, "step": 950 }, { "epoch": 0.3381091247200034, "grad_norm": 5.5983099937438965, "learning_rate": 9.948238482503555e-05, "loss": 3.7842, "step": 1000 }, { "epoch": 0.3550145809560035, "grad_norm": 2.4193906784057617, "learning_rate": 9.940596782216504e-05, "loss": 3.7816, "step": 1050 }, { "epoch": 0.3719200371920037, "grad_norm": 1.893676996231079, "learning_rate": 9.932432878356672e-05, "loss": 3.7895, "step": 1100 }, { "epoch": 0.3888254934280039, "grad_norm": 2.029632806777954, "learning_rate": 9.92374773097965e-05, "loss": 3.7668, "step": 1150 }, { "epoch": 0.4057309496640041, "grad_norm": 2.21596622467041, "learning_rate": 9.91454236143802e-05, "loss": 3.7979, "step": 1200 }, { "epoch": 0.4057309496640041, "eval_loss": 3.8054637908935547, "eval_runtime": 11.6706, "eval_samples_per_second": 85.685, "eval_steps_per_second": 2.742, "step": 1200 }, { "epoch": 0.42263640590000423, "grad_norm": 3.131930351257324, "learning_rate": 9.90481785226125e-05, "loss": 3.7824, "step": 1250 }, { "epoch": 0.4395418621360044, "grad_norm": 1.5176011323928833, "learning_rate": 9.894575347028381e-05, "loss": 3.7666, "step": 1300 }, { "epoch": 0.4564473183720046, "grad_norm": 2.2135422229766846, "learning_rate": 9.883816050233566e-05, "loss": 3.7684, "step": 1350 }, { "epoch": 0.47335277460800473, "grad_norm": 1.565341830253601, "learning_rate": 9.872541227144397e-05, "loss": 3.7421, "step": 1400 }, { "epoch": 0.4902582308440049, "grad_norm": 2.0272250175476074, "learning_rate": 9.860752203653138e-05, "loss": 3.7524, "step": 1450 }, { "epoch": 0.507163687080005, "grad_norm": 2.3772072792053223, "learning_rate": 9.848450366120785e-05, "loss": 3.7489, "step": 1500 }, { "epoch": 0.5240691433160053, "grad_norm": 2.349799633026123, "learning_rate": 9.835637161214042e-05, "loss": 3.7421, "step": 1550 }, { "epoch": 0.5409745995520054, "grad_norm": 2.6349542140960693, "learning_rate": 9.822314095735195e-05, "loss": 3.7416, "step": 1600 }, { "epoch": 0.5409745995520054, "eval_loss": 3.775789499282837, "eval_runtime": 11.6523, "eval_samples_per_second": 85.82, "eval_steps_per_second": 2.746, "step": 1600 }, { "epoch": 0.5578800557880056, "grad_norm": 2.5276811122894287, "learning_rate": 9.808482736444913e-05, "loss": 3.7517, "step": 1650 }, { "epoch": 0.5747855120240057, "grad_norm": 2.6835217475891113, "learning_rate": 9.794144709878008e-05, "loss": 3.7325, "step": 1700 }, { "epoch": 0.5916909682600059, "grad_norm": 2.924553632736206, "learning_rate": 9.779301702152147e-05, "loss": 3.7142, "step": 1750 }, { "epoch": 0.608596424496006, "grad_norm": 2.129288673400879, "learning_rate": 9.763955458769581e-05, "loss": 3.7347, "step": 1800 }, { "epoch": 0.6255018807320063, "grad_norm": 1.441651463508606, "learning_rate": 9.748107784411867e-05, "loss": 3.72, "step": 1850 }, { "epoch": 0.6424073369680064, "grad_norm": 2.1518378257751465, "learning_rate": 9.731760542727647e-05, "loss": 3.7042, "step": 1900 }, { "epoch": 0.6593127932040066, "grad_norm": 4.049715518951416, "learning_rate": 9.714915656113491e-05, "loss": 3.7284, "step": 1950 }, { "epoch": 0.6762182494400067, "grad_norm": 1.596699595451355, "learning_rate": 9.697575105487821e-05, "loss": 3.7116, "step": 2000 }, { "epoch": 0.6762182494400067, "eval_loss": 3.7348101139068604, "eval_runtime": 11.6849, "eval_samples_per_second": 85.58, "eval_steps_per_second": 2.739, "step": 2000 }, { "epoch": 0.6931237056760069, "grad_norm": 2.405496835708618, "learning_rate": 9.679740930057965e-05, "loss": 3.6867, "step": 2050 }, { "epoch": 0.710029161912007, "grad_norm": 2.290134906768799, "learning_rate": 9.661415227080342e-05, "loss": 3.6967, "step": 2100 }, { "epoch": 0.7269346181480073, "grad_norm": 1.6495215892791748, "learning_rate": 9.642600151613847e-05, "loss": 3.6822, "step": 2150 }, { "epoch": 0.7438400743840075, "grad_norm": 1.6393038034439087, "learning_rate": 9.623297916266399e-05, "loss": 3.6747, "step": 2200 }, { "epoch": 0.7607455306200076, "grad_norm": 1.7952766418457031, "learning_rate": 9.603510790934765e-05, "loss": 3.6778, "step": 2250 }, { "epoch": 0.7776509868560078, "grad_norm": 2.7831900119781494, "learning_rate": 9.583241102537609e-05, "loss": 3.6891, "step": 2300 }, { "epoch": 0.7945564430920079, "grad_norm": 1.602946400642395, "learning_rate": 9.562491234741863e-05, "loss": 3.6745, "step": 2350 }, { "epoch": 0.8114618993280082, "grad_norm": 2.163341522216797, "learning_rate": 9.541263627682408e-05, "loss": 3.6622, "step": 2400 }, { "epoch": 0.8114618993280082, "eval_loss": 3.7114827632904053, "eval_runtime": 11.6568, "eval_samples_per_second": 85.787, "eval_steps_per_second": 2.745, "step": 2400 }, { "epoch": 0.8283673555640083, "grad_norm": 3.2292706966400146, "learning_rate": 9.519560777675117e-05, "loss": 3.6907, "step": 2450 }, { "epoch": 0.8452728118000085, "grad_norm": 2.3297388553619385, "learning_rate": 9.497385236923304e-05, "loss": 3.659, "step": 2500 }, { "epoch": 0.8621782680360086, "grad_norm": 2.1775872707366943, "learning_rate": 9.474739613217583e-05, "loss": 3.6543, "step": 2550 }, { "epoch": 0.8790837242720088, "grad_norm": 2.1940650939941406, "learning_rate": 9.4516265696292e-05, "loss": 3.6631, "step": 2600 }, { "epoch": 0.8959891805080089, "grad_norm": 2.0075910091400146, "learning_rate": 9.428048824196861e-05, "loss": 3.6699, "step": 2650 }, { "epoch": 0.9128946367440092, "grad_norm": 1.463968276977539, "learning_rate": 9.404009149607104e-05, "loss": 3.6458, "step": 2700 }, { "epoch": 0.9298000929800093, "grad_norm": 1.776117205619812, "learning_rate": 9.379510372868227e-05, "loss": 3.6442, "step": 2750 }, { "epoch": 0.9467055492160095, "grad_norm": 1.4943894147872925, "learning_rate": 9.354555374977845e-05, "loss": 3.6717, "step": 2800 }, { "epoch": 0.9467055492160095, "eval_loss": 3.6869394779205322, "eval_runtime": 11.6677, "eval_samples_per_second": 85.706, "eval_steps_per_second": 2.743, "step": 2800 }, { "epoch": 0.9636110054520096, "grad_norm": 2.1420116424560547, "learning_rate": 9.32914709058409e-05, "loss": 3.6602, "step": 2850 }, { "epoch": 0.9805164616880098, "grad_norm": 1.4225534200668335, "learning_rate": 9.303288507640508e-05, "loss": 3.6312, "step": 2900 }, { "epoch": 0.99742191792401, "grad_norm": 1.5110421180725098, "learning_rate": 9.276982667054676e-05, "loss": 3.6309, "step": 2950 }, { "epoch": 1.01432737416001, "grad_norm": 1.4234793186187744, "learning_rate": 9.250232662330597e-05, "loss": 3.5446, "step": 3000 }, { "epoch": 1.0312328303960103, "grad_norm": 1.4176740646362305, "learning_rate": 9.223041639204923e-05, "loss": 3.5084, "step": 3050 }, { "epoch": 1.0481382866320106, "grad_norm": 1.5388909578323364, "learning_rate": 9.195412795277012e-05, "loss": 3.4993, "step": 3100 }, { "epoch": 1.0650437428680106, "grad_norm": 1.7309865951538086, "learning_rate": 9.167349379632901e-05, "loss": 3.5396, "step": 3150 }, { "epoch": 1.0819491991040109, "grad_norm": 1.6854147911071777, "learning_rate": 9.138854692463229e-05, "loss": 3.5129, "step": 3200 }, { "epoch": 1.0819491991040109, "eval_loss": 3.657722234725952, "eval_runtime": 11.6989, "eval_samples_per_second": 85.478, "eval_steps_per_second": 2.735, "step": 3200 }, { "epoch": 1.098854655340011, "grad_norm": 2.1751339435577393, "learning_rate": 9.10993208467513e-05, "loss": 3.523, "step": 3250 }, { "epoch": 1.1157601115760112, "grad_norm": 1.1804617643356323, "learning_rate": 9.08058495749818e-05, "loss": 3.5124, "step": 3300 }, { "epoch": 1.1326655678120114, "grad_norm": 2.0826492309570312, "learning_rate": 9.050816762084426e-05, "loss": 3.5145, "step": 3350 }, { "epoch": 1.1495710240480115, "grad_norm": 1.2046183347702026, "learning_rate": 9.020630999102529e-05, "loss": 3.4959, "step": 3400 }, { "epoch": 1.1664764802840117, "grad_norm": 1.7077451944351196, "learning_rate": 8.990031218326104e-05, "loss": 3.5147, "step": 3450 }, { "epoch": 1.1833819365200118, "grad_norm": 1.5411063432693481, "learning_rate": 8.959021018216262e-05, "loss": 3.5041, "step": 3500 }, { "epoch": 1.200287392756012, "grad_norm": 1.4028867483139038, "learning_rate": 8.927604045498453e-05, "loss": 3.5017, "step": 3550 }, { "epoch": 1.217192848992012, "grad_norm": 1.7714195251464844, "learning_rate": 8.895783994733611e-05, "loss": 3.5056, "step": 3600 }, { "epoch": 1.217192848992012, "eval_loss": 3.643071174621582, "eval_runtime": 11.6982, "eval_samples_per_second": 85.484, "eval_steps_per_second": 2.735, "step": 3600 }, { "epoch": 1.2340983052280123, "grad_norm": 1.4252102375030518, "learning_rate": 8.863564607883687e-05, "loss": 3.4944, "step": 3650 }, { "epoch": 1.2510037614640126, "grad_norm": 1.6317962408065796, "learning_rate": 8.8309496738716e-05, "loss": 3.4899, "step": 3700 }, { "epoch": 1.2679092177000126, "grad_norm": 1.9923129081726074, "learning_rate": 8.79794302813567e-05, "loss": 3.5166, "step": 3750 }, { "epoch": 1.284814673936013, "grad_norm": 1.0414555072784424, "learning_rate": 8.764548552178584e-05, "loss": 3.4998, "step": 3800 }, { "epoch": 1.301720130172013, "grad_norm": 1.9689300060272217, "learning_rate": 8.730770173110932e-05, "loss": 3.5138, "step": 3850 }, { "epoch": 1.3186255864080132, "grad_norm": 1.6223361492156982, "learning_rate": 8.696611863189395e-05, "loss": 3.49, "step": 3900 }, { "epoch": 1.3355310426440132, "grad_norm": 1.6299093961715698, "learning_rate": 8.662077639349612e-05, "loss": 3.4629, "step": 3950 }, { "epoch": 1.3524364988800135, "grad_norm": 1.510591983795166, "learning_rate": 8.627171562733803e-05, "loss": 3.4711, "step": 4000 }, { "epoch": 1.3524364988800135, "eval_loss": 3.620584726333618, "eval_runtime": 11.664, "eval_samples_per_second": 85.734, "eval_steps_per_second": 2.743, "step": 4000 }, { "epoch": 1.3693419551160138, "grad_norm": 1.280197262763977, "learning_rate": 8.591897738213187e-05, "loss": 3.4808, "step": 4050 }, { "epoch": 1.3862474113520138, "grad_norm": 1.5453211069107056, "learning_rate": 8.556260313905257e-05, "loss": 3.4747, "step": 4100 }, { "epoch": 1.403152867588014, "grad_norm": 1.439795970916748, "learning_rate": 8.520263480685968e-05, "loss": 3.4847, "step": 4150 }, { "epoch": 1.4200583238240143, "grad_norm": 1.2832996845245361, "learning_rate": 8.483911471696912e-05, "loss": 3.482, "step": 4200 }, { "epoch": 1.4369637800600144, "grad_norm": 1.684513807296753, "learning_rate": 8.447208561847501e-05, "loss": 3.4514, "step": 4250 }, { "epoch": 1.4538692362960146, "grad_norm": 0.9559006094932556, "learning_rate": 8.410159067312243e-05, "loss": 3.4771, "step": 4300 }, { "epoch": 1.4707746925320146, "grad_norm": 1.0770719051361084, "learning_rate": 8.372767345023185e-05, "loss": 3.483, "step": 4350 }, { "epoch": 1.487680148768015, "grad_norm": 1.1667702198028564, "learning_rate": 8.33503779215754e-05, "loss": 3.4781, "step": 4400 }, { "epoch": 1.487680148768015, "eval_loss": 3.585639715194702, "eval_runtime": 11.6769, "eval_samples_per_second": 85.639, "eval_steps_per_second": 2.74, "step": 4400 }, { "epoch": 1.504585605004015, "grad_norm": 1.183647632598877, "learning_rate": 8.296974845620584e-05, "loss": 3.4836, "step": 4450 }, { "epoch": 1.5214910612400152, "grad_norm": 1.308076024055481, "learning_rate": 8.258582981523895e-05, "loss": 3.4571, "step": 4500 }, { "epoch": 1.5383965174760155, "grad_norm": 1.5776398181915283, "learning_rate": 8.219866714658971e-05, "loss": 3.444, "step": 4550 }, { "epoch": 1.5553019737120155, "grad_norm": 1.1617664098739624, "learning_rate": 8.180830597966303e-05, "loss": 3.4465, "step": 4600 }, { "epoch": 1.5722074299480158, "grad_norm": 0.9418047070503235, "learning_rate": 8.141479221999953e-05, "loss": 3.4568, "step": 4650 }, { "epoch": 1.589112886184016, "grad_norm": 1.3985543251037598, "learning_rate": 8.101817214387723e-05, "loss": 3.4502, "step": 4700 }, { "epoch": 1.606018342420016, "grad_norm": 1.0654075145721436, "learning_rate": 8.06184923928695e-05, "loss": 3.4424, "step": 4750 }, { "epoch": 1.622923798656016, "grad_norm": 1.5798131227493286, "learning_rate": 8.021579996836025e-05, "loss": 3.4336, "step": 4800 }, { "epoch": 1.622923798656016, "eval_loss": 3.5558018684387207, "eval_runtime": 11.6573, "eval_samples_per_second": 85.783, "eval_steps_per_second": 2.745, "step": 4800 }, { "epoch": 1.6398292548920164, "grad_norm": 1.036365270614624, "learning_rate": 7.981014222601651e-05, "loss": 3.4397, "step": 4850 }, { "epoch": 1.6567347111280166, "grad_norm": 1.2249760627746582, "learning_rate": 7.940156687021969e-05, "loss": 3.4315, "step": 4900 }, { "epoch": 1.6736401673640167, "grad_norm": 1.3212870359420776, "learning_rate": 7.899012194845549e-05, "loss": 3.4342, "step": 4950 }, { "epoch": 1.690545623600017, "grad_norm": 1.130004644393921, "learning_rate": 7.857585584566375e-05, "loss": 3.428, "step": 5000 }, { "epoch": 1.7074510798360172, "grad_norm": 1.0531315803527832, "learning_rate": 7.815881727854847e-05, "loss": 3.4208, "step": 5050 }, { "epoch": 1.7243565360720172, "grad_norm": 1.3188402652740479, "learning_rate": 7.77390552898488e-05, "loss": 3.4227, "step": 5100 }, { "epoch": 1.7412619923080173, "grad_norm": 1.0934581756591797, "learning_rate": 7.73166192425718e-05, "loss": 3.4245, "step": 5150 }, { "epoch": 1.7581674485440177, "grad_norm": 1.7236692905426025, "learning_rate": 7.68915588141874e-05, "loss": 3.4167, "step": 5200 }, { "epoch": 1.7581674485440177, "eval_loss": 3.5321877002716064, "eval_runtime": 11.6863, "eval_samples_per_second": 85.57, "eval_steps_per_second": 2.738, "step": 5200 }, { "epoch": 1.7750729047800178, "grad_norm": 0.9837960600852966, "learning_rate": 7.646392399078647e-05, "loss": 3.4196, "step": 5250 }, { "epoch": 1.7919783610160178, "grad_norm": 1.0993341207504272, "learning_rate": 7.60337650612026e-05, "loss": 3.4116, "step": 5300 }, { "epoch": 1.808883817252018, "grad_norm": 1.3233968019485474, "learning_rate": 7.560113261109827e-05, "loss": 3.4118, "step": 5350 }, { "epoch": 1.8257892734880183, "grad_norm": 1.250022530555725, "learning_rate": 7.516607751701602e-05, "loss": 3.3959, "step": 5400 }, { "epoch": 1.8426947297240184, "grad_norm": 1.4518115520477295, "learning_rate": 7.472865094039555e-05, "loss": 3.4155, "step": 5450 }, { "epoch": 1.8596001859600186, "grad_norm": 1.307420015335083, "learning_rate": 7.428890432155719e-05, "loss": 3.3955, "step": 5500 }, { "epoch": 1.876505642196019, "grad_norm": 1.255487322807312, "learning_rate": 7.384688937365279e-05, "loss": 3.3877, "step": 5550 }, { "epoch": 1.893411098432019, "grad_norm": 0.999951183795929, "learning_rate": 7.340265807658422e-05, "loss": 3.3942, "step": 5600 }, { "epoch": 1.893411098432019, "eval_loss": 3.4983253479003906, "eval_runtime": 11.6754, "eval_samples_per_second": 85.65, "eval_steps_per_second": 2.741, "step": 5600 }, { "epoch": 1.910316554668019, "grad_norm": 1.4537323713302612, "learning_rate": 7.29562626708907e-05, "loss": 3.4025, "step": 5650 }, { "epoch": 1.9272220109040192, "grad_norm": 1.2412198781967163, "learning_rate": 7.25077556516055e-05, "loss": 3.3924, "step": 5700 }, { "epoch": 1.9441274671400195, "grad_norm": 1.592606544494629, "learning_rate": 7.205718976208258e-05, "loss": 3.3732, "step": 5750 }, { "epoch": 1.9610329233760195, "grad_norm": 0.9837318658828735, "learning_rate": 7.160461798779413e-05, "loss": 3.3909, "step": 5800 }, { "epoch": 1.9779383796120198, "grad_norm": 1.1004836559295654, "learning_rate": 7.115009355009959e-05, "loss": 3.3837, "step": 5850 }, { "epoch": 1.99484383584802, "grad_norm": 1.5246398448944092, "learning_rate": 7.069366989998692e-05, "loss": 3.366, "step": 5900 }, { "epoch": 2.01174929208402, "grad_norm": 1.1871501207351685, "learning_rate": 7.023540071178697e-05, "loss": 3.2107, "step": 5950 }, { "epoch": 2.02865474832002, "grad_norm": 1.3174642324447632, "learning_rate": 6.977533987686147e-05, "loss": 3.1636, "step": 6000 }, { "epoch": 2.02865474832002, "eval_loss": 3.476820230484009, "eval_runtime": 11.7209, "eval_samples_per_second": 85.318, "eval_steps_per_second": 2.73, "step": 6000 }, { "epoch": 2.0455602045560206, "grad_norm": 1.3271148204803467, "learning_rate": 6.931354149726548e-05, "loss": 3.145, "step": 6050 }, { "epoch": 2.0624656607920206, "grad_norm": 1.0174113512039185, "learning_rate": 6.885005987938516e-05, "loss": 3.1572, "step": 6100 }, { "epoch": 2.0793711170280207, "grad_norm": 1.086552619934082, "learning_rate": 6.838494952755154e-05, "loss": 3.1499, "step": 6150 }, { "epoch": 2.096276573264021, "grad_norm": 0.9986650943756104, "learning_rate": 6.791826513763076e-05, "loss": 3.1508, "step": 6200 }, { "epoch": 2.113182029500021, "grad_norm": 1.0550023317337036, "learning_rate": 6.745006159059222e-05, "loss": 3.1484, "step": 6250 }, { "epoch": 2.1300874857360212, "grad_norm": 0.9948070049285889, "learning_rate": 6.69803939460544e-05, "loss": 3.1394, "step": 6300 }, { "epoch": 2.1469929419720213, "grad_norm": 0.9610317945480347, "learning_rate": 6.650931743581033e-05, "loss": 3.1438, "step": 6350 }, { "epoch": 2.1638983982080218, "grad_norm": 1.1480181217193604, "learning_rate": 6.603688745733211e-05, "loss": 3.1411, "step": 6400 }, { "epoch": 2.1638983982080218, "eval_loss": 3.4495885372161865, "eval_runtime": 11.6737, "eval_samples_per_second": 85.663, "eval_steps_per_second": 2.741, "step": 6400 }, { "epoch": 2.180803854444022, "grad_norm": 0.8075036406517029, "learning_rate": 6.556315956725662e-05, "loss": 3.1346, "step": 6450 }, { "epoch": 2.197709310680022, "grad_norm": 1.198843240737915, "learning_rate": 6.50881894748519e-05, "loss": 3.1739, "step": 6500 }, { "epoch": 2.2146147669160223, "grad_norm": 1.3039313554763794, "learning_rate": 6.461203303546615e-05, "loss": 3.1518, "step": 6550 }, { "epoch": 2.2315202231520224, "grad_norm": 1.0603476762771606, "learning_rate": 6.413474624395905e-05, "loss": 3.1561, "step": 6600 }, { "epoch": 2.2484256793880224, "grad_norm": 1.0246437788009644, "learning_rate": 6.365638522811704e-05, "loss": 3.1361, "step": 6650 }, { "epoch": 2.265331135624023, "grad_norm": 1.0935068130493164, "learning_rate": 6.317700624205273e-05, "loss": 3.1174, "step": 6700 }, { "epoch": 2.282236591860023, "grad_norm": 0.9507274031639099, "learning_rate": 6.269666565958963e-05, "loss": 3.1381, "step": 6750 }, { "epoch": 2.299142048096023, "grad_norm": 0.9847383499145508, "learning_rate": 6.221541996763269e-05, "loss": 3.1154, "step": 6800 }, { "epoch": 2.299142048096023, "eval_loss": 3.407775640487671, "eval_runtime": 11.6847, "eval_samples_per_second": 85.582, "eval_steps_per_second": 2.739, "step": 6800 }, { "epoch": 2.316047504332023, "grad_norm": 0.9965755343437195, "learning_rate": 6.173332575952557e-05, "loss": 3.1175, "step": 6850 }, { "epoch": 2.3329529605680235, "grad_norm": 0.9966267943382263, "learning_rate": 6.125043972839536e-05, "loss": 3.1507, "step": 6900 }, { "epoch": 2.3498584168040235, "grad_norm": 1.1471587419509888, "learning_rate": 6.0766818660485716e-05, "loss": 3.1287, "step": 6950 }, { "epoch": 2.3667638730400236, "grad_norm": 1.2493683099746704, "learning_rate": 6.028251942847882e-05, "loss": 3.1202, "step": 7000 }, { "epoch": 2.3836693292760236, "grad_norm": 1.0860092639923096, "learning_rate": 5.9797598984807335e-05, "loss": 3.121, "step": 7050 }, { "epoch": 2.400574785512024, "grad_norm": 1.1602693796157837, "learning_rate": 5.931211435495694e-05, "loss": 3.1282, "step": 7100 }, { "epoch": 2.417480241748024, "grad_norm": 0.9266437292098999, "learning_rate": 5.882612263076026e-05, "loss": 3.1033, "step": 7150 }, { "epoch": 2.434385697984024, "grad_norm": 1.0014028549194336, "learning_rate": 5.833968096368301e-05, "loss": 3.1007, "step": 7200 }, { "epoch": 2.434385697984024, "eval_loss": 3.378451108932495, "eval_runtime": 11.6786, "eval_samples_per_second": 85.627, "eval_steps_per_second": 2.74, "step": 7200 }, { "epoch": 2.4512911542200246, "grad_norm": 1.0491281747817993, "learning_rate": 5.785284655810308e-05, "loss": 3.1117, "step": 7250 }, { "epoch": 2.4681966104560247, "grad_norm": 0.8672958612442017, "learning_rate": 5.7365676664583514e-05, "loss": 3.1037, "step": 7300 }, { "epoch": 2.4851020666920247, "grad_norm": 1.0733287334442139, "learning_rate": 5.687822857313993e-05, "loss": 3.1175, "step": 7350 }, { "epoch": 2.502007522928025, "grad_norm": 0.8078711032867432, "learning_rate": 5.63905596065033e-05, "loss": 3.1099, "step": 7400 }, { "epoch": 2.5189129791640252, "grad_norm": 1.0341311693191528, "learning_rate": 5.590272711337908e-05, "loss": 3.1167, "step": 7450 }, { "epoch": 2.5358184354000253, "grad_norm": 1.1616711616516113, "learning_rate": 5.541478846170298e-05, "loss": 3.0931, "step": 7500 }, { "epoch": 2.5527238916360258, "grad_norm": 0.8573184609413147, "learning_rate": 5.4926801031894734e-05, "loss": 3.1002, "step": 7550 }, { "epoch": 2.569629347872026, "grad_norm": 0.9128609895706177, "learning_rate": 5.4438822210110275e-05, "loss": 3.1117, "step": 7600 }, { "epoch": 2.569629347872026, "eval_loss": 3.3413400650024414, "eval_runtime": 11.683, "eval_samples_per_second": 85.595, "eval_steps_per_second": 2.739, "step": 7600 }, { "epoch": 2.586534804108026, "grad_norm": 0.9771824479103088, "learning_rate": 5.395090938149321e-05, "loss": 3.0962, "step": 7650 }, { "epoch": 2.603440260344026, "grad_norm": 1.0292117595672607, "learning_rate": 5.346311992342656e-05, "loss": 3.0829, "step": 7700 }, { "epoch": 2.6203457165800264, "grad_norm": 0.8531580567359924, "learning_rate": 5.297551119878522e-05, "loss": 3.0778, "step": 7750 }, { "epoch": 2.6372511728160264, "grad_norm": 0.9693793058395386, "learning_rate": 5.248814054919031e-05, "loss": 3.0678, "step": 7800 }, { "epoch": 2.6541566290520264, "grad_norm": 0.999812662601471, "learning_rate": 5.200106528826586e-05, "loss": 3.0851, "step": 7850 }, { "epoch": 2.6710620852880265, "grad_norm": 0.8840042352676392, "learning_rate": 5.151434269489889e-05, "loss": 3.0775, "step": 7900 }, { "epoch": 2.687967541524027, "grad_norm": 0.8592630624771118, "learning_rate": 5.102803000650359e-05, "loss": 3.0553, "step": 7950 }, { "epoch": 2.704872997760027, "grad_norm": 0.8932810425758362, "learning_rate": 5.054218441229031e-05, "loss": 3.0653, "step": 8000 }, { "epoch": 2.704872997760027, "eval_loss": 3.309781551361084, "eval_runtime": 11.6906, "eval_samples_per_second": 85.538, "eval_steps_per_second": 2.737, "step": 8000 }, { "epoch": 2.721778453996027, "grad_norm": 0.9920628070831299, "learning_rate": 5.005686304654018e-05, "loss": 3.0565, "step": 8050 }, { "epoch": 2.7386839102320275, "grad_norm": 0.8969925045967102, "learning_rate": 4.957212298188638e-05, "loss": 3.0684, "step": 8100 }, { "epoch": 2.7555893664680275, "grad_norm": 0.9055591821670532, "learning_rate": 4.908802122260243e-05, "loss": 3.0721, "step": 8150 }, { "epoch": 2.7724948227040276, "grad_norm": 1.047692060470581, "learning_rate": 4.8604614697898706e-05, "loss": 3.0404, "step": 8200 }, { "epoch": 2.789400278940028, "grad_norm": 0.9087944626808167, "learning_rate": 4.8121960255227603e-05, "loss": 3.0607, "step": 8250 }, { "epoch": 2.806305735176028, "grad_norm": 0.845425546169281, "learning_rate": 4.764011465359851e-05, "loss": 3.0396, "step": 8300 }, { "epoch": 2.823211191412028, "grad_norm": 0.9913256764411926, "learning_rate": 4.715913455690301e-05, "loss": 3.0444, "step": 8350 }, { "epoch": 2.8401166476480286, "grad_norm": 1.0773906707763672, "learning_rate": 4.66790765272514e-05, "loss": 3.0378, "step": 8400 }, { "epoch": 2.8401166476480286, "eval_loss": 3.284963846206665, "eval_runtime": 11.6698, "eval_samples_per_second": 85.691, "eval_steps_per_second": 2.742, "step": 8400 }, { "epoch": 2.8570221038840287, "grad_norm": 0.9364065527915955, "learning_rate": 4.619999701832108e-05, "loss": 3.0088, "step": 8450 }, { "epoch": 2.8739275601200287, "grad_norm": 0.9384069442749023, "learning_rate": 4.572195236871777e-05, "loss": 3.0226, "step": 8500 }, { "epoch": 2.890833016356029, "grad_norm": 0.8556334972381592, "learning_rate": 4.524499879535016e-05, "loss": 3.0257, "step": 8550 }, { "epoch": 2.907738472592029, "grad_norm": 0.9114282131195068, "learning_rate": 4.476919238681904e-05, "loss": 3.0302, "step": 8600 }, { "epoch": 2.9246439288280293, "grad_norm": 0.832721471786499, "learning_rate": 4.4294589096821325e-05, "loss": 3.0438, "step": 8650 }, { "epoch": 2.9415493850640293, "grad_norm": 0.8480991125106812, "learning_rate": 4.3821244737570046e-05, "loss": 3.0276, "step": 8700 }, { "epoch": 2.9584548413000293, "grad_norm": 1.021043300628662, "learning_rate": 4.3349214973231024e-05, "loss": 3.0216, "step": 8750 }, { "epoch": 2.97536029753603, "grad_norm": 0.7957491874694824, "learning_rate": 4.287855531337683e-05, "loss": 3.0173, "step": 8800 }, { "epoch": 2.97536029753603, "eval_loss": 3.2582342624664307, "eval_runtime": 11.6907, "eval_samples_per_second": 85.538, "eval_steps_per_second": 2.737, "step": 8800 }, { "epoch": 2.99226575377203, "grad_norm": 1.0264720916748047, "learning_rate": 4.2409321106459077e-05, "loss": 3.0152, "step": 8850 }, { "epoch": 3.00917121000803, "grad_norm": 0.8381086587905884, "learning_rate": 4.194156753329942e-05, "loss": 2.886, "step": 8900 }, { "epoch": 3.0260766662440304, "grad_norm": 0.9872603416442871, "learning_rate": 4.147534960060059e-05, "loss": 2.7538, "step": 8950 }, { "epoch": 3.0429821224800304, "grad_norm": 0.8750160336494446, "learning_rate": 4.1010722134477665e-05, "loss": 2.791, "step": 9000 }, { "epoch": 3.0598875787160305, "grad_norm": 0.7593681216239929, "learning_rate": 4.054773977401066e-05, "loss": 2.7791, "step": 9050 }, { "epoch": 3.076793034952031, "grad_norm": 0.9249379634857178, "learning_rate": 4.008645696481903e-05, "loss": 2.7693, "step": 9100 }, { "epoch": 3.093698491188031, "grad_norm": 1.0219660997390747, "learning_rate": 3.962692795265914e-05, "loss": 2.7869, "step": 9150 }, { "epoch": 3.110603947424031, "grad_norm": 0.8459084630012512, "learning_rate": 3.916920677704499e-05, "loss": 2.778, "step": 9200 }, { "epoch": 3.110603947424031, "eval_loss": 3.247343063354492, "eval_runtime": 11.6589, "eval_samples_per_second": 85.771, "eval_steps_per_second": 2.745, "step": 9200 }, { "epoch": 3.1275094036600315, "grad_norm": 0.7969251871109009, "learning_rate": 3.8713347264893294e-05, "loss": 2.7645, "step": 9250 }, { "epoch": 3.1444148598960315, "grad_norm": 0.8492719531059265, "learning_rate": 3.8259403024193616e-05, "loss": 2.7729, "step": 9300 }, { "epoch": 3.1613203161320316, "grad_norm": 0.862267255783081, "learning_rate": 3.780742743770417e-05, "loss": 2.7825, "step": 9350 }, { "epoch": 3.1782257723680316, "grad_norm": 0.9612255692481995, "learning_rate": 3.7357473656674126e-05, "loss": 2.7848, "step": 9400 }, { "epoch": 3.195131228604032, "grad_norm": 0.8228344321250916, "learning_rate": 3.6909594594593175e-05, "loss": 2.7684, "step": 9450 }, { "epoch": 3.212036684840032, "grad_norm": 0.9417145252227783, "learning_rate": 3.6463842920969026e-05, "loss": 2.7771, "step": 9500 }, { "epoch": 3.228942141076032, "grad_norm": 0.7457485795021057, "learning_rate": 3.602027105513355e-05, "loss": 2.8103, "step": 9550 }, { "epoch": 3.2458475973120327, "grad_norm": 0.8420482277870178, "learning_rate": 3.557893116007848e-05, "loss": 2.7591, "step": 9600 }, { "epoch": 3.2458475973120327, "eval_loss": 3.20989990234375, "eval_runtime": 11.7, "eval_samples_per_second": 85.47, "eval_steps_per_second": 2.735, "step": 9600 }, { "epoch": 3.2627530535480327, "grad_norm": 0.8763940930366516, "learning_rate": 3.5139875136321066e-05, "loss": 2.7569, "step": 9650 }, { "epoch": 3.2796585097840327, "grad_norm": 0.8527898192405701, "learning_rate": 3.470315461580079e-05, "loss": 2.7533, "step": 9700 }, { "epoch": 3.2965639660200328, "grad_norm": 0.8603160381317139, "learning_rate": 3.426882095580751e-05, "loss": 2.7438, "step": 9750 }, { "epoch": 3.3134694222560332, "grad_norm": 0.8680624961853027, "learning_rate": 3.3836925232942005e-05, "loss": 2.7353, "step": 9800 }, { "epoch": 3.3303748784920333, "grad_norm": 0.9345048666000366, "learning_rate": 3.3407518237109456e-05, "loss": 2.7667, "step": 9850 }, { "epoch": 3.3472803347280333, "grad_norm": 0.838909387588501, "learning_rate": 3.29806504655467e-05, "loss": 2.7438, "step": 9900 }, { "epoch": 3.364185790964034, "grad_norm": 0.8523705005645752, "learning_rate": 3.2556372116883874e-05, "loss": 2.771, "step": 9950 }, { "epoch": 3.381091247200034, "grad_norm": 0.8146962523460388, "learning_rate": 3.213473308524115e-05, "loss": 2.7634, "step": 10000 }, { "epoch": 3.381091247200034, "eval_loss": 3.1767163276672363, "eval_runtime": 11.6994, "eval_samples_per_second": 85.474, "eval_steps_per_second": 2.735, "step": 10000 }, { "epoch": 3.397996703436034, "grad_norm": 0.789837121963501, "learning_rate": 3.171578295436133e-05, "loss": 2.7489, "step": 10050 }, { "epoch": 3.4149021596720344, "grad_norm": 0.8918471932411194, "learning_rate": 3.129957099177892e-05, "loss": 2.7424, "step": 10100 }, { "epoch": 3.4318076159080344, "grad_norm": 0.859986424446106, "learning_rate": 3.0886146143026346e-05, "loss": 2.7504, "step": 10150 }, { "epoch": 3.4487130721440344, "grad_norm": 0.8879761695861816, "learning_rate": 3.047555702587816e-05, "loss": 2.7572, "step": 10200 }, { "epoch": 3.4656185283800345, "grad_norm": 0.8334829211235046, "learning_rate": 3.0067851924633606e-05, "loss": 2.7627, "step": 10250 }, { "epoch": 3.482523984616035, "grad_norm": 0.8631997108459473, "learning_rate": 2.9663078784438558e-05, "loss": 2.7526, "step": 10300 }, { "epoch": 3.499429440852035, "grad_norm": 0.8065224885940552, "learning_rate": 2.9261285205647283e-05, "loss": 2.7353, "step": 10350 }, { "epoch": 3.516334897088035, "grad_norm": 0.8405721783638, "learning_rate": 2.886251843822475e-05, "loss": 2.7255, "step": 10400 }, { "epoch": 3.516334897088035, "eval_loss": 3.1594960689544678, "eval_runtime": 11.722, "eval_samples_per_second": 85.31, "eval_steps_per_second": 2.73, "step": 10400 }, { "epoch": 3.533240353324035, "grad_norm": 0.8360841274261475, "learning_rate": 2.8466825376190122e-05, "loss": 2.723, "step": 10450 }, { "epoch": 3.5501458095600356, "grad_norm": 0.8372629880905151, "learning_rate": 2.8074252552102176e-05, "loss": 2.7196, "step": 10500 }, { "epoch": 3.5670512657960356, "grad_norm": 0.7810975313186646, "learning_rate": 2.768484613158714e-05, "loss": 2.7162, "step": 10550 }, { "epoch": 3.5839567220320356, "grad_norm": 0.8663210272789001, "learning_rate": 2.729865190790975e-05, "loss": 2.736, "step": 10600 }, { "epoch": 3.600862178268036, "grad_norm": 0.8004422187805176, "learning_rate": 2.6915715296588083e-05, "loss": 2.7291, "step": 10650 }, { "epoch": 3.617767634504036, "grad_norm": 0.7938219308853149, "learning_rate": 2.653608133005278e-05, "loss": 2.6953, "step": 10700 }, { "epoch": 3.634673090740036, "grad_norm": 0.8083840608596802, "learning_rate": 2.6159794652351332e-05, "loss": 2.7371, "step": 10750 }, { "epoch": 3.6515785469760367, "grad_norm": 0.8592216968536377, "learning_rate": 2.5786899513898066e-05, "loss": 2.7152, "step": 10800 }, { "epoch": 3.6515785469760367, "eval_loss": 3.1375598907470703, "eval_runtime": 11.6562, "eval_samples_per_second": 85.791, "eval_steps_per_second": 2.745, "step": 10800 }, { "epoch": 3.6684840032120367, "grad_norm": 0.7407676577568054, "learning_rate": 2.54174397662704e-05, "loss": 2.713, "step": 10850 }, { "epoch": 3.6853894594480368, "grad_norm": 0.8212365508079529, "learning_rate": 2.5051458857052006e-05, "loss": 2.7203, "step": 10900 }, { "epoch": 3.7022949156840372, "grad_norm": 0.8753028512001038, "learning_rate": 2.468899982472346e-05, "loss": 2.7398, "step": 10950 }, { "epoch": 3.7192003719200373, "grad_norm": 0.8082839846611023, "learning_rate": 2.4330105293601023e-05, "loss": 2.7097, "step": 11000 }, { "epoch": 3.7361058281560373, "grad_norm": 0.8813438415527344, "learning_rate": 2.397481746882414e-05, "loss": 2.7213, "step": 11050 }, { "epoch": 3.753011284392038, "grad_norm": 0.8139234781265259, "learning_rate": 2.36231781313922e-05, "loss": 2.7002, "step": 11100 }, { "epoch": 3.769916740628038, "grad_norm": 0.800145149230957, "learning_rate": 2.3275228633251227e-05, "loss": 2.7182, "step": 11150 }, { "epoch": 3.786822196864038, "grad_norm": 0.7504892945289612, "learning_rate": 2.29310098924309e-05, "loss": 2.6992, "step": 11200 }, { "epoch": 3.786822196864038, "eval_loss": 3.1104345321655273, "eval_runtime": 11.6893, "eval_samples_per_second": 85.548, "eval_steps_per_second": 2.738, "step": 11200 }, { "epoch": 3.803727653100038, "grad_norm": 0.8169353604316711, "learning_rate": 2.2590562388232804e-05, "loss": 2.7137, "step": 11250 }, { "epoch": 3.820633109336038, "grad_norm": 0.8978986144065857, "learning_rate": 2.225392615647006e-05, "loss": 2.7369, "step": 11300 }, { "epoch": 3.8375385655720384, "grad_norm": 0.8523368239402771, "learning_rate": 2.1921140784759338e-05, "loss": 2.7309, "step": 11350 }, { "epoch": 3.8544440218080385, "grad_norm": 0.890410840511322, "learning_rate": 2.1592245407865252e-05, "loss": 2.6908, "step": 11400 }, { "epoch": 3.8713494780440385, "grad_norm": 0.8792175650596619, "learning_rate": 2.126727870309841e-05, "loss": 2.698, "step": 11450 }, { "epoch": 3.888254934280039, "grad_norm": 0.7908258438110352, "learning_rate": 2.09462788857669e-05, "loss": 2.6956, "step": 11500 }, { "epoch": 3.905160390516039, "grad_norm": 0.7934091687202454, "learning_rate": 2.0629283704682392e-05, "loss": 2.7036, "step": 11550 }, { "epoch": 3.922065846752039, "grad_norm": 0.8208107948303223, "learning_rate": 2.031633043772086e-05, "loss": 2.7007, "step": 11600 }, { "epoch": 3.922065846752039, "eval_loss": 3.0899033546447754, "eval_runtime": 11.6628, "eval_samples_per_second": 85.743, "eval_steps_per_second": 2.744, "step": 11600 } ], "logging_steps": 50, "max_steps": 14785, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.925050875573101e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }