rabiulawal's picture
Add files using upload-large-folder tool
85205ef verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.095586711789488,
"eval_steps": 100,
"global_step": 5600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017404925593943087,
"grad_norm": 1.748035737475059,
"learning_rate": 0.0001,
"loss": 3.3816,
"step": 50
},
{
"epoch": 0.034809851187886175,
"grad_norm": 2.305170683663697,
"learning_rate": 9.999925705188519e-05,
"loss": 2.0964,
"step": 100
},
{
"epoch": 0.034809851187886175,
"eval_loss": 1.7720723152160645,
"eval_runtime": 14.1317,
"eval_samples_per_second": 70.763,
"eval_steps_per_second": 2.264,
"step": 100
},
{
"epoch": 0.05221477678182926,
"grad_norm": 2.132850344968578,
"learning_rate": 9.999702822984264e-05,
"loss": 1.7562,
"step": 150
},
{
"epoch": 0.06961970237577235,
"grad_norm": 2.0007634960538283,
"learning_rate": 9.999331360077739e-05,
"loss": 1.6268,
"step": 200
},
{
"epoch": 0.06961970237577235,
"eval_loss": 1.5610754489898682,
"eval_runtime": 14.0282,
"eval_samples_per_second": 71.285,
"eval_steps_per_second": 2.281,
"step": 200
},
{
"epoch": 0.08702462796971543,
"grad_norm": 1.4046084740159703,
"learning_rate": 9.998811327619556e-05,
"loss": 1.5797,
"step": 250
},
{
"epoch": 0.10442955356365852,
"grad_norm": 1.788935175624683,
"learning_rate": 9.998142741220103e-05,
"loss": 1.5612,
"step": 300
},
{
"epoch": 0.10442955356365852,
"eval_loss": 1.5176177024841309,
"eval_runtime": 14.1024,
"eval_samples_per_second": 70.91,
"eval_steps_per_second": 2.269,
"step": 300
},
{
"epoch": 0.12183447915760161,
"grad_norm": 2.3569679355459012,
"learning_rate": 9.997325620949076e-05,
"loss": 1.5506,
"step": 350
},
{
"epoch": 0.1392394047515447,
"grad_norm": 1.0673623983257676,
"learning_rate": 9.996359991334882e-05,
"loss": 1.5488,
"step": 400
},
{
"epoch": 0.1392394047515447,
"eval_loss": 1.509135365486145,
"eval_runtime": 13.9837,
"eval_samples_per_second": 71.512,
"eval_steps_per_second": 2.288,
"step": 400
},
{
"epoch": 0.15664433034548778,
"grad_norm": 1.733996756941176,
"learning_rate": 9.995245881363888e-05,
"loss": 1.5374,
"step": 450
},
{
"epoch": 0.17404925593943085,
"grad_norm": 0.9264926041034174,
"learning_rate": 9.993983324479569e-05,
"loss": 1.5209,
"step": 500
},
{
"epoch": 0.17404925593943085,
"eval_loss": 1.491538643836975,
"eval_runtime": 14.0359,
"eval_samples_per_second": 71.246,
"eval_steps_per_second": 2.28,
"step": 500
},
{
"epoch": 0.19145418153337396,
"grad_norm": 1.1488376675463685,
"learning_rate": 9.992572358581487e-05,
"loss": 1.521,
"step": 550
},
{
"epoch": 0.20885910712731703,
"grad_norm": 1.669516601726485,
"learning_rate": 9.991013026024168e-05,
"loss": 1.5024,
"step": 600
},
{
"epoch": 0.20885910712731703,
"eval_loss": 1.5023186206817627,
"eval_runtime": 14.028,
"eval_samples_per_second": 71.286,
"eval_steps_per_second": 2.281,
"step": 600
},
{
"epoch": 0.2262640327212601,
"grad_norm": 0.9295243577760427,
"learning_rate": 9.989305373615821e-05,
"loss": 1.5152,
"step": 650
},
{
"epoch": 0.24366895831520322,
"grad_norm": 1.2361910561400884,
"learning_rate": 9.987449452616938e-05,
"loss": 1.5164,
"step": 700
},
{
"epoch": 0.24366895831520322,
"eval_loss": 1.5014784336090088,
"eval_runtime": 13.8436,
"eval_samples_per_second": 72.236,
"eval_steps_per_second": 2.312,
"step": 700
},
{
"epoch": 0.26107388390914626,
"grad_norm": 0.7804262457778667,
"learning_rate": 9.985445318738746e-05,
"loss": 1.5142,
"step": 750
},
{
"epoch": 0.2784788095030894,
"grad_norm": 0.6710521243228105,
"learning_rate": 9.983293032141556e-05,
"loss": 1.5136,
"step": 800
},
{
"epoch": 0.2784788095030894,
"eval_loss": 1.4961464405059814,
"eval_runtime": 13.9776,
"eval_samples_per_second": 71.543,
"eval_steps_per_second": 2.289,
"step": 800
},
{
"epoch": 0.2958837350970325,
"grad_norm": 1.094671287498252,
"learning_rate": 9.980992657432926e-05,
"loss": 1.5062,
"step": 850
},
{
"epoch": 0.31328866069097555,
"grad_norm": 0.8930703012061629,
"learning_rate": 9.978544263665752e-05,
"loss": 1.5085,
"step": 900
},
{
"epoch": 0.31328866069097555,
"eval_loss": 1.4857720136642456,
"eval_runtime": 13.9639,
"eval_samples_per_second": 71.613,
"eval_steps_per_second": 2.292,
"step": 900
},
{
"epoch": 0.33069358628491863,
"grad_norm": 1.0794347625850207,
"learning_rate": 9.975947924336177e-05,
"loss": 1.4997,
"step": 950
},
{
"epoch": 0.3480985118788617,
"grad_norm": 0.7170458063432213,
"learning_rate": 9.973203717381386e-05,
"loss": 1.4966,
"step": 1000
},
{
"epoch": 0.3480985118788617,
"eval_loss": 1.4856551885604858,
"eval_runtime": 13.952,
"eval_samples_per_second": 71.674,
"eval_steps_per_second": 2.294,
"step": 1000
},
{
"epoch": 0.3655034374728048,
"grad_norm": 0.4881975172416428,
"learning_rate": 9.970311725177276e-05,
"loss": 1.4907,
"step": 1050
},
{
"epoch": 0.3829083630667479,
"grad_norm": 0.8770440504616285,
"learning_rate": 9.967272034535975e-05,
"loss": 1.4899,
"step": 1100
},
{
"epoch": 0.3829083630667479,
"eval_loss": 1.4895131587982178,
"eval_runtime": 13.991,
"eval_samples_per_second": 71.475,
"eval_steps_per_second": 2.287,
"step": 1100
},
{
"epoch": 0.400313288660691,
"grad_norm": 0.7914949102985543,
"learning_rate": 9.964084736703232e-05,
"loss": 1.4915,
"step": 1150
},
{
"epoch": 0.41771821425463407,
"grad_norm": 1.401100910547186,
"learning_rate": 9.9607499273557e-05,
"loss": 1.5037,
"step": 1200
},
{
"epoch": 0.41771821425463407,
"eval_loss": 1.4771944284439087,
"eval_runtime": 14.0165,
"eval_samples_per_second": 71.344,
"eval_steps_per_second": 2.283,
"step": 1200
},
{
"epoch": 0.43512313984857715,
"grad_norm": 1.0816751773800846,
"learning_rate": 9.957267706598031e-05,
"loss": 1.5052,
"step": 1250
},
{
"epoch": 0.4525280654425202,
"grad_norm": 0.7646700065689047,
"learning_rate": 9.953638178959896e-05,
"loss": 1.497,
"step": 1300
},
{
"epoch": 0.4525280654425202,
"eval_loss": 1.4796491861343384,
"eval_runtime": 13.9408,
"eval_samples_per_second": 71.732,
"eval_steps_per_second": 2.295,
"step": 1300
},
{
"epoch": 0.4699329910364633,
"grad_norm": 0.7864231088946072,
"learning_rate": 9.949861453392844e-05,
"loss": 1.4817,
"step": 1350
},
{
"epoch": 0.48733791663040643,
"grad_norm": 0.6787872856546141,
"learning_rate": 9.945937643267016e-05,
"loss": 1.489,
"step": 1400
},
{
"epoch": 0.48733791663040643,
"eval_loss": 1.4757252931594849,
"eval_runtime": 14.061,
"eval_samples_per_second": 71.119,
"eval_steps_per_second": 2.276,
"step": 1400
},
{
"epoch": 0.5047428422243495,
"grad_norm": 0.7188134698113986,
"learning_rate": 9.941866866367761e-05,
"loss": 1.4885,
"step": 1450
},
{
"epoch": 0.5221477678182925,
"grad_norm": 0.540769457626735,
"learning_rate": 9.937649244892093e-05,
"loss": 1.4871,
"step": 1500
},
{
"epoch": 0.5221477678182925,
"eval_loss": 1.4772448539733887,
"eval_runtime": 13.9665,
"eval_samples_per_second": 71.6,
"eval_steps_per_second": 2.291,
"step": 1500
},
{
"epoch": 0.5395526934122357,
"grad_norm": 0.7147998900607448,
"learning_rate": 9.933284905445015e-05,
"loss": 1.4889,
"step": 1550
},
{
"epoch": 0.5569576190061788,
"grad_norm": 1.0204620680412557,
"learning_rate": 9.928773979035732e-05,
"loss": 1.4796,
"step": 1600
},
{
"epoch": 0.5569576190061788,
"eval_loss": 1.4816679954528809,
"eval_runtime": 13.9778,
"eval_samples_per_second": 71.542,
"eval_steps_per_second": 2.289,
"step": 1600
},
{
"epoch": 0.5743625446001218,
"grad_norm": 0.7498586170923626,
"learning_rate": 9.924116601073708e-05,
"loss": 1.4763,
"step": 1650
},
{
"epoch": 0.591767470194065,
"grad_norm": 1.001969715758831,
"learning_rate": 9.919312911364608e-05,
"loss": 1.4864,
"step": 1700
},
{
"epoch": 0.591767470194065,
"eval_loss": 1.4769014120101929,
"eval_runtime": 14.2925,
"eval_samples_per_second": 69.967,
"eval_steps_per_second": 2.239,
"step": 1700
},
{
"epoch": 0.609172395788008,
"grad_norm": 0.9109140733322101,
"learning_rate": 9.914363054106097e-05,
"loss": 1.4893,
"step": 1750
},
{
"epoch": 0.6265773213819511,
"grad_norm": 0.6203506870450822,
"learning_rate": 9.909267177883513e-05,
"loss": 1.4688,
"step": 1800
},
{
"epoch": 0.6265773213819511,
"eval_loss": 1.4739837646484375,
"eval_runtime": 14.2323,
"eval_samples_per_second": 70.263,
"eval_steps_per_second": 2.248,
"step": 1800
},
{
"epoch": 0.6439822469758941,
"grad_norm": 0.7146858170328513,
"learning_rate": 9.904025435665407e-05,
"loss": 1.4854,
"step": 1850
},
{
"epoch": 0.6613871725698373,
"grad_norm": 0.8425737717370796,
"learning_rate": 9.898637984798949e-05,
"loss": 1.4726,
"step": 1900
},
{
"epoch": 0.6613871725698373,
"eval_loss": 1.474668025970459,
"eval_runtime": 14.2187,
"eval_samples_per_second": 70.33,
"eval_steps_per_second": 2.251,
"step": 1900
},
{
"epoch": 0.6787920981637804,
"grad_norm": 0.7988368913631474,
"learning_rate": 9.89310498700521e-05,
"loss": 1.4742,
"step": 1950
},
{
"epoch": 0.6961970237577234,
"grad_norm": 0.5619281860924451,
"learning_rate": 9.887426608374303e-05,
"loss": 1.4729,
"step": 2000
},
{
"epoch": 0.6961970237577234,
"eval_loss": 1.4624892473220825,
"eval_runtime": 14.2489,
"eval_samples_per_second": 70.181,
"eval_steps_per_second": 2.246,
"step": 2000
},
{
"epoch": 0.7136019493516665,
"grad_norm": 0.6036809706784098,
"learning_rate": 9.881603019360396e-05,
"loss": 1.4678,
"step": 2050
},
{
"epoch": 0.7310068749456096,
"grad_norm": 0.5585936322285137,
"learning_rate": 9.875634394776601e-05,
"loss": 1.4707,
"step": 2100
},
{
"epoch": 0.7310068749456096,
"eval_loss": 1.4655637741088867,
"eval_runtime": 14.2102,
"eval_samples_per_second": 70.372,
"eval_steps_per_second": 2.252,
"step": 2100
},
{
"epoch": 0.7484118005395527,
"grad_norm": 0.5692019006343436,
"learning_rate": 9.869520913789719e-05,
"loss": 1.463,
"step": 2150
},
{
"epoch": 0.7658167261334958,
"grad_norm": 0.6722557889499563,
"learning_rate": 9.86326275991487e-05,
"loss": 1.462,
"step": 2200
},
{
"epoch": 0.7658167261334958,
"eval_loss": 1.4523816108703613,
"eval_runtime": 14.2588,
"eval_samples_per_second": 70.132,
"eval_steps_per_second": 2.244,
"step": 2200
},
{
"epoch": 0.7832216517274389,
"grad_norm": 0.6537919031919717,
"learning_rate": 9.856860121009977e-05,
"loss": 1.47,
"step": 2250
},
{
"epoch": 0.800626577321382,
"grad_norm": 0.6073431052346562,
"learning_rate": 9.850313189270131e-05,
"loss": 1.4597,
"step": 2300
},
{
"epoch": 0.800626577321382,
"eval_loss": 1.4533107280731201,
"eval_runtime": 14.2406,
"eval_samples_per_second": 70.222,
"eval_steps_per_second": 2.247,
"step": 2300
},
{
"epoch": 0.818031502915325,
"grad_norm": 0.565917048637927,
"learning_rate": 9.843622161221823e-05,
"loss": 1.4668,
"step": 2350
},
{
"epoch": 0.8354364285092681,
"grad_norm": 1.1201286663948173,
"learning_rate": 9.836787237717037e-05,
"loss": 1.4714,
"step": 2400
},
{
"epoch": 0.8354364285092681,
"eval_loss": 1.470242977142334,
"eval_runtime": 14.1434,
"eval_samples_per_second": 70.704,
"eval_steps_per_second": 2.263,
"step": 2400
},
{
"epoch": 1.35416810553215,
"grad_norm": 0.5423017975669697,
"learning_rate": 9.57426783302541e-05,
"loss": 1.3855,
"step": 2450
},
{
"epoch": 1.381794322812349,
"grad_norm": 0.633765102988447,
"learning_rate": 9.55660918060588e-05,
"loss": 1.385,
"step": 2500
},
{
"epoch": 1.381794322812349,
"eval_loss": 1.4470162391662598,
"eval_runtime": 13.9147,
"eval_samples_per_second": 71.867,
"eval_steps_per_second": 2.3,
"step": 2500
},
{
"epoch": 1.409420540092548,
"grad_norm": 0.5284456537306131,
"learning_rate": 9.538608852684625e-05,
"loss": 1.3947,
"step": 2550
},
{
"epoch": 1.4370467573727468,
"grad_norm": 0.5156553380375979,
"learning_rate": 9.520268213984171e-05,
"loss": 1.3898,
"step": 2600
},
{
"epoch": 1.4370467573727468,
"eval_loss": 1.4567538499832153,
"eval_runtime": 14.6689,
"eval_samples_per_second": 68.172,
"eval_steps_per_second": 2.181,
"step": 2600
},
{
"epoch": 1.4646729746529457,
"grad_norm": 0.5868426818010671,
"learning_rate": 9.501588655028233e-05,
"loss": 1.3933,
"step": 2650
},
{
"epoch": 1.4922991919331445,
"grad_norm": 0.49204202163052463,
"learning_rate": 9.482571592036283e-05,
"loss": 1.3911,
"step": 2700
},
{
"epoch": 1.4922991919331445,
"eval_loss": 1.4373167753219604,
"eval_runtime": 13.8667,
"eval_samples_per_second": 72.115,
"eval_steps_per_second": 2.308,
"step": 2700
},
{
"epoch": 1.5199254092133434,
"grad_norm": 0.6706914520167009,
"learning_rate": 9.463218466816181e-05,
"loss": 1.3889,
"step": 2750
},
{
"epoch": 1.5475516264935423,
"grad_norm": 0.5937608414509916,
"learning_rate": 9.44353074665486e-05,
"loss": 1.3973,
"step": 2800
},
{
"epoch": 1.5475516264935423,
"eval_loss": 1.4474902153015137,
"eval_runtime": 13.7287,
"eval_samples_per_second": 72.84,
"eval_steps_per_second": 2.331,
"step": 2800
},
{
"epoch": 1.5751778437737411,
"grad_norm": 0.6571605912290549,
"learning_rate": 9.423509924207087e-05,
"loss": 1.3946,
"step": 2850
},
{
"epoch": 1.6028040610539402,
"grad_norm": 0.49095502872485536,
"learning_rate": 9.403157517382286e-05,
"loss": 1.3798,
"step": 2900
},
{
"epoch": 1.6028040610539402,
"eval_loss": 1.4381753206253052,
"eval_runtime": 13.9041,
"eval_samples_per_second": 71.921,
"eval_steps_per_second": 2.301,
"step": 2900
},
{
"epoch": 1.6304302783341391,
"grad_norm": 0.5694690346337261,
"learning_rate": 9.382475069229462e-05,
"loss": 1.3917,
"step": 2950
},
{
"epoch": 1.658056495614338,
"grad_norm": 0.7592695351020217,
"learning_rate": 9.361464147820214e-05,
"loss": 1.3803,
"step": 3000
},
{
"epoch": 1.658056495614338,
"eval_loss": 1.4414281845092773,
"eval_runtime": 13.8386,
"eval_samples_per_second": 72.262,
"eval_steps_per_second": 2.312,
"step": 3000
},
{
"epoch": 1.6856827128945369,
"grad_norm": 0.6343580907126278,
"learning_rate": 9.340126346129839e-05,
"loss": 1.399,
"step": 3050
},
{
"epoch": 1.713308930174736,
"grad_norm": 0.8990427661264742,
"learning_rate": 9.31846328191657e-05,
"loss": 1.3952,
"step": 3100
},
{
"epoch": 1.713308930174736,
"eval_loss": 1.4493228197097778,
"eval_runtime": 13.7835,
"eval_samples_per_second": 72.551,
"eval_steps_per_second": 2.322,
"step": 3100
},
{
"epoch": 1.7409351474549348,
"grad_norm": 0.6453304011433102,
"learning_rate": 9.296476597598915e-05,
"loss": 1.3912,
"step": 3150
},
{
"epoch": 1.7685613647351337,
"grad_norm": 0.6243500248086706,
"learning_rate": 9.274167960131144e-05,
"loss": 1.381,
"step": 3200
},
{
"epoch": 1.7685613647351337,
"eval_loss": 1.4362107515335083,
"eval_runtime": 13.8362,
"eval_samples_per_second": 72.274,
"eval_steps_per_second": 2.313,
"step": 3200
},
{
"epoch": 1.7961875820153326,
"grad_norm": 0.5285948737566967,
"learning_rate": 9.25153906087689e-05,
"loss": 1.3854,
"step": 3250
},
{
"epoch": 1.8238137992955314,
"grad_norm": 0.7011864733726069,
"learning_rate": 9.228591615480933e-05,
"loss": 1.3907,
"step": 3300
},
{
"epoch": 1.8238137992955314,
"eval_loss": 1.4350640773773193,
"eval_runtime": 13.8727,
"eval_samples_per_second": 72.084,
"eval_steps_per_second": 2.307,
"step": 3300
},
{
"epoch": 1.8514400165757303,
"grad_norm": 0.5192126522121225,
"learning_rate": 9.205327363739116e-05,
"loss": 1.3852,
"step": 3350
},
{
"epoch": 1.8790662338559292,
"grad_norm": 0.43800222607154987,
"learning_rate": 9.181748069466442e-05,
"loss": 1.4035,
"step": 3400
},
{
"epoch": 1.8790662338559292,
"eval_loss": 1.4314343929290771,
"eval_runtime": 13.927,
"eval_samples_per_second": 71.803,
"eval_steps_per_second": 2.298,
"step": 3400
},
{
"epoch": 1.906692451136128,
"grad_norm": 0.9102701559767962,
"learning_rate": 9.157855520363348e-05,
"loss": 1.3868,
"step": 3450
},
{
"epoch": 1.934318668416327,
"grad_norm": 0.4670303544147569,
"learning_rate": 9.133651527880168e-05,
"loss": 1.3886,
"step": 3500
},
{
"epoch": 1.934318668416327,
"eval_loss": 1.424402117729187,
"eval_runtime": 13.9378,
"eval_samples_per_second": 71.747,
"eval_steps_per_second": 2.296,
"step": 3500
},
{
"epoch": 1.961944885696526,
"grad_norm": 0.6495516419509523,
"learning_rate": 9.109137927079793e-05,
"loss": 1.3834,
"step": 3550
},
{
"epoch": 1.989571102976725,
"grad_norm": 0.5093395778911604,
"learning_rate": 9.084316576498545e-05,
"loss": 1.3793,
"step": 3600
},
{
"epoch": 1.989571102976725,
"eval_loss": 1.417135238647461,
"eval_runtime": 13.9961,
"eval_samples_per_second": 71.448,
"eval_steps_per_second": 2.286,
"step": 3600
},
{
"epoch": 2.017680779059327,
"grad_norm": 0.7497872170253476,
"learning_rate": 9.05918935800527e-05,
"loss": 1.3555,
"step": 3650
},
{
"epoch": 2.045306996339526,
"grad_norm": 0.48298967449158703,
"learning_rate": 9.033758176658656e-05,
"loss": 1.2926,
"step": 3700
},
{
"epoch": 2.045306996339526,
"eval_loss": 1.434622049331665,
"eval_runtime": 13.8285,
"eval_samples_per_second": 72.314,
"eval_steps_per_second": 2.314,
"step": 3700
},
{
"epoch": 2.072933213619725,
"grad_norm": 0.5727314238142795,
"learning_rate": 9.00802496056281e-05,
"loss": 1.299,
"step": 3750
},
{
"epoch": 2.1005594308999243,
"grad_norm": 0.6991328817835216,
"learning_rate": 8.981991660721059e-05,
"loss": 1.3088,
"step": 3800
},
{
"epoch": 2.1005594308999243,
"eval_loss": 1.426012396812439,
"eval_runtime": 13.9031,
"eval_samples_per_second": 71.927,
"eval_steps_per_second": 2.302,
"step": 3800
},
{
"epoch": 2.128185648180123,
"grad_norm": 0.4535319425232075,
"learning_rate": 8.955660250888043e-05,
"loss": 1.3139,
"step": 3850
},
{
"epoch": 2.155811865460322,
"grad_norm": 0.5698506378304132,
"learning_rate": 8.929032727420071e-05,
"loss": 1.3004,
"step": 3900
},
{
"epoch": 2.155811865460322,
"eval_loss": 1.4277055263519287,
"eval_runtime": 13.8261,
"eval_samples_per_second": 72.327,
"eval_steps_per_second": 2.314,
"step": 3900
},
{
"epoch": 2.183438082740521,
"grad_norm": 0.5561254900135922,
"learning_rate": 8.902111109123764e-05,
"loss": 1.3143,
"step": 3950
},
{
"epoch": 2.2110643000207197,
"grad_norm": 0.543188902461103,
"learning_rate": 8.874897437102988e-05,
"loss": 1.3155,
"step": 4000
},
{
"epoch": 2.2110643000207197,
"eval_loss": 1.426361083984375,
"eval_runtime": 13.9204,
"eval_samples_per_second": 71.837,
"eval_steps_per_second": 2.299,
"step": 4000
},
{
"epoch": 2.2386905173009186,
"grad_norm": 0.7935812675669448,
"learning_rate": 8.847393774604117e-05,
"loss": 1.3138,
"step": 4050
},
{
"epoch": 2.2663167345811175,
"grad_norm": 0.6177885341195845,
"learning_rate": 8.819602206859597e-05,
"loss": 1.3001,
"step": 4100
},
{
"epoch": 2.2663167345811175,
"eval_loss": 1.4111926555633545,
"eval_runtime": 13.8757,
"eval_samples_per_second": 72.068,
"eval_steps_per_second": 2.306,
"step": 4100
},
{
"epoch": 2.2939429518613164,
"grad_norm": 0.6890852675629292,
"learning_rate": 8.791524840929852e-05,
"loss": 1.3058,
"step": 4150
},
{
"epoch": 2.3215691691415152,
"grad_norm": 0.5232893247490601,
"learning_rate": 8.763163805543534e-05,
"loss": 1.319,
"step": 4200
},
{
"epoch": 2.3215691691415152,
"eval_loss": 1.4131031036376953,
"eval_runtime": 13.8589,
"eval_samples_per_second": 72.156,
"eval_steps_per_second": 2.309,
"step": 4200
},
{
"epoch": 2.349195386421714,
"grad_norm": 0.4907618228196506,
"learning_rate": 8.734521250936136e-05,
"loss": 1.314,
"step": 4250
},
{
"epoch": 2.376821603701913,
"grad_norm": 0.5868482210150553,
"learning_rate": 8.705599348686951e-05,
"loss": 1.3198,
"step": 4300
},
{
"epoch": 2.376821603701913,
"eval_loss": 1.4351857900619507,
"eval_runtime": 13.737,
"eval_samples_per_second": 72.796,
"eval_steps_per_second": 2.329,
"step": 4300
},
{
"epoch": 2.404447820982112,
"grad_norm": 0.5006063254268277,
"learning_rate": 8.676400291554461e-05,
"loss": 1.3065,
"step": 4350
},
{
"epoch": 2.4320740382623107,
"grad_norm": 0.6115354068941561,
"learning_rate": 8.646926293310056e-05,
"loss": 1.3107,
"step": 4400
},
{
"epoch": 2.4320740382623107,
"eval_loss": 1.4137160778045654,
"eval_runtime": 13.9071,
"eval_samples_per_second": 71.906,
"eval_steps_per_second": 2.301,
"step": 4400
},
{
"epoch": 2.4597002555425096,
"grad_norm": 0.4765973912904181,
"learning_rate": 8.617179588570216e-05,
"loss": 1.3133,
"step": 4450
},
{
"epoch": 2.487326472822709,
"grad_norm": 0.6739692432100701,
"learning_rate": 8.587162432627084e-05,
"loss": 1.3207,
"step": 4500
},
{
"epoch": 2.487326472822709,
"eval_loss": 1.4046252965927124,
"eval_runtime": 13.9425,
"eval_samples_per_second": 71.723,
"eval_steps_per_second": 2.295,
"step": 4500
},
{
"epoch": 2.514952690102908,
"grad_norm": 0.4202598968092653,
"learning_rate": 8.55687710127747e-05,
"loss": 1.3241,
"step": 4550
},
{
"epoch": 2.5425789073831067,
"grad_norm": 0.5223268903415091,
"learning_rate": 8.526325890650322e-05,
"loss": 1.3246,
"step": 4600
},
{
"epoch": 2.5425789073831067,
"eval_loss": 1.4036352634429932,
"eval_runtime": 13.8803,
"eval_samples_per_second": 72.044,
"eval_steps_per_second": 2.305,
"step": 4600
},
{
"epoch": 2.5702051246633055,
"grad_norm": 0.4344655061039349,
"learning_rate": 8.49551111703263e-05,
"loss": 1.315,
"step": 4650
},
{
"epoch": 2.5978313419435044,
"grad_norm": 0.5505616366165278,
"learning_rate": 8.46443511669382e-05,
"loss": 1.3118,
"step": 4700
},
{
"epoch": 2.5978313419435044,
"eval_loss": 1.4142063856124878,
"eval_runtime": 13.6869,
"eval_samples_per_second": 73.063,
"eval_steps_per_second": 2.338,
"step": 4700
},
{
"epoch": 2.6254575592237033,
"grad_norm": 0.49791040130089953,
"learning_rate": 8.43310024570862e-05,
"loss": 1.3272,
"step": 4750
},
{
"epoch": 2.653083776503902,
"grad_norm": 0.5654192967435524,
"learning_rate": 8.401508879778437e-05,
"loss": 1.3119,
"step": 4800
},
{
"epoch": 2.653083776503902,
"eval_loss": 1.4022154808044434,
"eval_runtime": 13.8266,
"eval_samples_per_second": 72.324,
"eval_steps_per_second": 2.314,
"step": 4800
},
{
"epoch": 2.680709993784101,
"grad_norm": 0.5241222947369369,
"learning_rate": 8.369663414051235e-05,
"loss": 1.3192,
"step": 4850
},
{
"epoch": 2.7083362110643,
"grad_norm": 0.5829004323666963,
"learning_rate": 8.337566262939944e-05,
"loss": 1.3285,
"step": 4900
},
{
"epoch": 2.7083362110643,
"eval_loss": 1.3964084386825562,
"eval_runtime": 14.0218,
"eval_samples_per_second": 71.318,
"eval_steps_per_second": 2.282,
"step": 4900
},
{
"epoch": 2.7359624283444988,
"grad_norm": 0.5138951715501007,
"learning_rate": 8.30521985993941e-05,
"loss": 1.3141,
"step": 4950
},
{
"epoch": 2.763588645624698,
"grad_norm": 0.49081549233151356,
"learning_rate": 8.272626657441892e-05,
"loss": 1.3023,
"step": 5000
},
{
"epoch": 2.763588645624698,
"eval_loss": 1.4064538478851318,
"eval_runtime": 13.9031,
"eval_samples_per_second": 71.926,
"eval_steps_per_second": 2.302,
"step": 5000
},
{
"epoch": 2.791214862904897,
"grad_norm": 0.4376279695564705,
"learning_rate": 8.239789126551135e-05,
"loss": 1.3175,
"step": 5050
},
{
"epoch": 2.818841080185096,
"grad_norm": 0.5424867772492635,
"learning_rate": 8.206709756895014e-05,
"loss": 1.3206,
"step": 5100
},
{
"epoch": 2.818841080185096,
"eval_loss": 1.3960105180740356,
"eval_runtime": 14.0178,
"eval_samples_per_second": 71.338,
"eval_steps_per_second": 2.283,
"step": 5100
},
{
"epoch": 2.8464672974652947,
"grad_norm": 0.4125557932022418,
"learning_rate": 8.173391056436784e-05,
"loss": 1.326,
"step": 5150
},
{
"epoch": 2.8740935147454936,
"grad_norm": 0.5663252025147304,
"learning_rate": 8.13983555128493e-05,
"loss": 1.311,
"step": 5200
},
{
"epoch": 2.8740935147454936,
"eval_loss": 1.3975000381469727,
"eval_runtime": 13.9076,
"eval_samples_per_second": 71.903,
"eval_steps_per_second": 2.301,
"step": 5200
},
{
"epoch": 2.9017197320256924,
"grad_norm": 0.4661168732170414,
"learning_rate": 8.10604578550165e-05,
"loss": 1.3173,
"step": 5250
},
{
"epoch": 2.9293459493058913,
"grad_norm": 0.5005757522014943,
"learning_rate": 8.072024320909975e-05,
"loss": 1.3109,
"step": 5300
},
{
"epoch": 2.9293459493058913,
"eval_loss": 1.3958256244659424,
"eval_runtime": 13.8724,
"eval_samples_per_second": 72.086,
"eval_steps_per_second": 2.307,
"step": 5300
},
{
"epoch": 2.95697216658609,
"grad_norm": 0.4048035790917549,
"learning_rate": 8.037773736899528e-05,
"loss": 1.3035,
"step": 5350
},
{
"epoch": 2.984598383866289,
"grad_norm": 0.44570187008592616,
"learning_rate": 8.003296630230988e-05,
"loss": 1.3128,
"step": 5400
},
{
"epoch": 2.984598383866289,
"eval_loss": 1.3918607234954834,
"eval_runtime": 13.8508,
"eval_samples_per_second": 72.198,
"eval_steps_per_second": 2.31,
"step": 5400
},
{
"epoch": 3.0127080599488916,
"grad_norm": 0.882823951711692,
"learning_rate": 7.96859561483918e-05,
"loss": 1.2598,
"step": 5450
},
{
"epoch": 3.0403342772290904,
"grad_norm": 0.5502812788616913,
"learning_rate": 7.933673321634928e-05,
"loss": 1.1938,
"step": 5500
},
{
"epoch": 3.0403342772290904,
"eval_loss": 1.3945913314819336,
"eval_runtime": 13.9938,
"eval_samples_per_second": 71.46,
"eval_steps_per_second": 2.287,
"step": 5500
},
{
"epoch": 3.0679604945092893,
"grad_norm": 0.43894367827193864,
"learning_rate": 7.898532398305564e-05,
"loss": 1.1905,
"step": 5550
},
{
"epoch": 3.095586711789488,
"grad_norm": 0.47471158476888814,
"learning_rate": 7.863175509114201e-05,
"loss": 1.1914,
"step": 5600
},
{
"epoch": 3.095586711789488,
"eval_loss": 1.392581820487976,
"eval_runtime": 14.1574,
"eval_samples_per_second": 70.634,
"eval_steps_per_second": 2.26,
"step": 5600
}
],
"logging_steps": 50,
"max_steps": 18090,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5356699724546048.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}