pyamy's picture
Upload DPO LLM Judge fine-tuned model
f582c27 verified
raw
history blame
27.3 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 38.48,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.8,
"grad_norm": 4.544456958770752,
"learning_rate": 0.00016,
"logits/chosen": -0.15711309015750885,
"logits/rejected": -0.17393989861011505,
"logps/chosen": -146.22421264648438,
"logps/rejected": -135.63302612304688,
"loss": 0.7347,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": -0.016495775431394577,
"rewards/margins": -0.05580342561006546,
"rewards/rejected": 0.03930765017867088,
"step": 10
},
{
"epoch": 1.56,
"grad_norm": 3.64050030708313,
"learning_rate": 0.00019673469387755104,
"logits/chosen": 0.22568197548389435,
"logits/rejected": 0.24968074262142181,
"logps/chosen": -140.56317138671875,
"logps/rejected": -136.4517822265625,
"loss": 0.5906,
"rewards/accuracies": 0.6973684430122375,
"rewards/chosen": 0.14059841632843018,
"rewards/margins": 0.35573944449424744,
"rewards/rejected": -0.21514104306697845,
"step": 20
},
{
"epoch": 2.32,
"grad_norm": 2.210195779800415,
"learning_rate": 0.0001926530612244898,
"logits/chosen": 0.3498680293560028,
"logits/rejected": 0.3381582796573639,
"logps/chosen": -139.37893676757812,
"logps/rejected": -140.91236877441406,
"loss": 0.3742,
"rewards/accuracies": 0.9342105388641357,
"rewards/chosen": 0.26392117142677307,
"rewards/margins": 1.149539828300476,
"rewards/rejected": -0.8856186866760254,
"step": 30
},
{
"epoch": 3.08,
"grad_norm": 1.595664143562317,
"learning_rate": 0.00018857142857142857,
"logits/chosen": 0.33004075288772583,
"logits/rejected": 0.3743434250354767,
"logps/chosen": -142.27561950683594,
"logps/rejected": -155.76829528808594,
"loss": 0.2073,
"rewards/accuracies": 0.9736841917037964,
"rewards/chosen": 0.2223319262266159,
"rewards/margins": 2.211127519607544,
"rewards/rejected": -1.9887956380844116,
"step": 40
},
{
"epoch": 3.88,
"grad_norm": 0.8180229663848877,
"learning_rate": 0.00018448979591836735,
"logits/chosen": -0.3215048909187317,
"logits/rejected": -0.17942988872528076,
"logps/chosen": -146.15176391601562,
"logps/rejected": -177.8779296875,
"loss": 0.064,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -0.18169204890727997,
"rewards/margins": 4.198596954345703,
"rewards/rejected": -4.380288600921631,
"step": 50
},
{
"epoch": 4.64,
"grad_norm": 0.2704571485519409,
"learning_rate": 0.00018040816326530615,
"logits/chosen": -0.8548356890678406,
"logits/rejected": -0.7396419644355774,
"logps/chosen": -180.8902130126953,
"logps/rejected": -247.38720703125,
"loss": 0.0253,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -3.894167423248291,
"rewards/margins": 7.3716654777526855,
"rewards/rejected": -11.265832901000977,
"step": 60
},
{
"epoch": 5.4,
"grad_norm": 0.04568086564540863,
"learning_rate": 0.0001763265306122449,
"logits/chosen": -1.0966769456863403,
"logits/rejected": -0.9558103680610657,
"logps/chosen": -216.4236297607422,
"logps/rejected": -299.0456848144531,
"loss": 0.0086,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.186522006988525,
"rewards/margins": 9.163583755493164,
"rewards/rejected": -16.35010528564453,
"step": 70
},
{
"epoch": 6.16,
"grad_norm": 0.030845321714878082,
"learning_rate": 0.00017224489795918368,
"logits/chosen": -0.8765879273414612,
"logits/rejected": -0.710603654384613,
"logps/chosen": -208.9407196044922,
"logps/rejected": -290.63800048828125,
"loss": 0.0222,
"rewards/accuracies": 0.9736841917037964,
"rewards/chosen": -6.255966663360596,
"rewards/margins": 9.707125663757324,
"rewards/rejected": -15.963091850280762,
"step": 80
},
{
"epoch": 6.96,
"grad_norm": 0.06601617485284805,
"learning_rate": 0.00016816326530612246,
"logits/chosen": -0.7680649757385254,
"logits/rejected": -0.5714871287345886,
"logps/chosen": -177.8467254638672,
"logps/rejected": -262.7944030761719,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.312117338180542,
"rewards/margins": 9.123556137084961,
"rewards/rejected": -12.435674667358398,
"step": 90
},
{
"epoch": 7.72,
"grad_norm": 0.02727358043193817,
"learning_rate": 0.00016408163265306124,
"logits/chosen": -0.7908716797828674,
"logits/rejected": -0.5970498919487,
"logps/chosen": -179.80191040039062,
"logps/rejected": -264.28271484375,
"loss": 0.0091,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -3.6998748779296875,
"rewards/margins": 9.295143127441406,
"rewards/rejected": -12.995016098022461,
"step": 100
},
{
"epoch": 8.48,
"grad_norm": 0.052736785262823105,
"learning_rate": 0.00016,
"logits/chosen": -0.6252234578132629,
"logits/rejected": -0.4707661271095276,
"logps/chosen": -185.8507537841797,
"logps/rejected": -273.919677734375,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.9508209228515625,
"rewards/margins": 9.781594276428223,
"rewards/rejected": -13.732414245605469,
"step": 110
},
{
"epoch": 9.24,
"grad_norm": 0.010697088204324245,
"learning_rate": 0.0001559183673469388,
"logits/chosen": -0.9722104072570801,
"logits/rejected": -0.7994766235351562,
"logps/chosen": -193.7563018798828,
"logps/rejected": -286.2286682128906,
"loss": 0.0088,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -5.037867069244385,
"rewards/margins": 10.28415298461914,
"rewards/rejected": -15.322019577026367,
"step": 120
},
{
"epoch": 10.0,
"grad_norm": null,
"learning_rate": 0.00015183673469387757,
"logits/chosen": -0.9392030835151672,
"logits/rejected": -0.7859267592430115,
"logps/chosen": -208.22225952148438,
"logps/rejected": -304.37054443359375,
"loss": 0.0174,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -6.531684875488281,
"rewards/margins": 10.490182876586914,
"rewards/rejected": -17.021865844726562,
"step": 130
},
{
"epoch": 10.8,
"grad_norm": 0.004046889953315258,
"learning_rate": 0.00014816326530612246,
"logits/chosen": -0.9692522883415222,
"logits/rejected": -0.8100675344467163,
"logps/chosen": -207.4010772705078,
"logps/rejected": -305.93890380859375,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -6.667567253112793,
"rewards/margins": 10.732935905456543,
"rewards/rejected": -17.400503158569336,
"step": 140
},
{
"epoch": 11.56,
"grad_norm": 0.0038029830902814865,
"learning_rate": 0.00014408163265306124,
"logits/chosen": -0.9378202557563782,
"logits/rejected": -0.7926595211029053,
"logps/chosen": -211.8084259033203,
"logps/rejected": -317.88360595703125,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -6.8508453369140625,
"rewards/margins": 11.4996337890625,
"rewards/rejected": -18.350479125976562,
"step": 150
},
{
"epoch": 12.32,
"grad_norm": 0.006996906362473965,
"learning_rate": 0.00014,
"logits/chosen": -0.9922997355461121,
"logits/rejected": -0.8554012775421143,
"logps/chosen": -225.67135620117188,
"logps/rejected": -333.3880615234375,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -7.963207244873047,
"rewards/margins": 11.519519805908203,
"rewards/rejected": -19.482725143432617,
"step": 160
},
{
"epoch": 13.08,
"grad_norm": 0.004378203302621841,
"learning_rate": 0.0001359183673469388,
"logits/chosen": -0.9898152351379395,
"logits/rejected": -0.8856151103973389,
"logps/chosen": -217.1047821044922,
"logps/rejected": -322.1737976074219,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.197223663330078,
"rewards/margins": 11.574359893798828,
"rewards/rejected": -18.771583557128906,
"step": 170
},
{
"epoch": 13.88,
"grad_norm": 0.006621514912694693,
"learning_rate": 0.00013183673469387757,
"logits/chosen": -1.0127683877944946,
"logits/rejected": -0.8718019723892212,
"logps/chosen": -220.30392456054688,
"logps/rejected": -324.86114501953125,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.355535984039307,
"rewards/margins": 11.52188777923584,
"rewards/rejected": -18.877422332763672,
"step": 180
},
{
"epoch": 14.64,
"grad_norm": 0.0018915284890681505,
"learning_rate": 0.00012775510204081632,
"logits/chosen": -1.0274461507797241,
"logits/rejected": -0.8911333680152893,
"logps/chosen": -222.65452575683594,
"logps/rejected": -327.3719787597656,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -8.228594779968262,
"rewards/margins": 11.633173942565918,
"rewards/rejected": -19.861770629882812,
"step": 190
},
{
"epoch": 15.4,
"grad_norm": 0.004701931029558182,
"learning_rate": 0.0001236734693877551,
"logits/chosen": -0.9393897652626038,
"logits/rejected": -0.8236327171325684,
"logps/chosen": -233.4442901611328,
"logps/rejected": -337.7091979980469,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.586127281188965,
"rewards/margins": 11.664649963378906,
"rewards/rejected": -20.250778198242188,
"step": 200
},
{
"epoch": 16.16,
"grad_norm": 0.0026641907170414925,
"learning_rate": 0.00011959183673469388,
"logits/chosen": -1.07357656955719,
"logits/rejected": -0.9411842226982117,
"logps/chosen": -217.97422790527344,
"logps/rejected": -331.9872131347656,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -7.6344499588012695,
"rewards/margins": 11.908411026000977,
"rewards/rejected": -19.542861938476562,
"step": 210
},
{
"epoch": 16.96,
"grad_norm": 0.003238040255382657,
"learning_rate": 0.00011551020408163267,
"logits/chosen": -0.9635592699050903,
"logits/rejected": -0.8426879644393921,
"logps/chosen": -231.2787322998047,
"logps/rejected": -343.9873962402344,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.459344863891602,
"rewards/margins": 12.18354320526123,
"rewards/rejected": -20.642887115478516,
"step": 220
},
{
"epoch": 17.72,
"grad_norm": 0.004305878188461065,
"learning_rate": 0.00011142857142857144,
"logits/chosen": -0.9933919310569763,
"logits/rejected": -0.9030781984329224,
"logps/chosen": -227.8687286376953,
"logps/rejected": -337.8186950683594,
"loss": 0.026,
"rewards/accuracies": 0.9736841917037964,
"rewards/chosen": -8.76144790649414,
"rewards/margins": 11.788559913635254,
"rewards/rejected": -20.550006866455078,
"step": 230
},
{
"epoch": 18.48,
"grad_norm": 0.005013110116124153,
"learning_rate": 0.00010734693877551021,
"logits/chosen": -1.106400489807129,
"logits/rejected": -0.9791207313537598,
"logps/chosen": -235.90512084960938,
"logps/rejected": -355.18023681640625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.072104454040527,
"rewards/margins": 12.6239652633667,
"rewards/rejected": -21.696069717407227,
"step": 240
},
{
"epoch": 19.24,
"grad_norm": 0.0031214996706694365,
"learning_rate": 0.00010326530612244899,
"logits/chosen": -0.9300792813301086,
"logits/rejected": -0.8107971549034119,
"logps/chosen": -226.8355712890625,
"logps/rejected": -337.895751953125,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -8.232963562011719,
"rewards/margins": 12.14808177947998,
"rewards/rejected": -20.381046295166016,
"step": 250
},
{
"epoch": 20.0,
"grad_norm": 0.0045935832895338535,
"learning_rate": 9.918367346938776e-05,
"logits/chosen": -1.0340404510498047,
"logits/rejected": -0.9297473430633545,
"logps/chosen": -234.9521942138672,
"logps/rejected": -347.18572998046875,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -9.113563537597656,
"rewards/margins": 12.3864107131958,
"rewards/rejected": -21.499975204467773,
"step": 260
},
{
"epoch": 20.8,
"grad_norm": 0.0013183593982830644,
"learning_rate": 9.510204081632653e-05,
"logits/chosen": -1.018434762954712,
"logits/rejected": -0.9138419032096863,
"logps/chosen": -236.2734832763672,
"logps/rejected": -347.966064453125,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -8.8914794921875,
"rewards/margins": 12.274205207824707,
"rewards/rejected": -21.16568374633789,
"step": 270
},
{
"epoch": 21.56,
"grad_norm": 0.002180259209126234,
"learning_rate": 9.102040816326532e-05,
"logits/chosen": -0.9927906394004822,
"logits/rejected": -0.9210112690925598,
"logps/chosen": -230.18836975097656,
"logps/rejected": -351.64263916015625,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -9.216283798217773,
"rewards/margins": 12.91382884979248,
"rewards/rejected": -22.13011360168457,
"step": 280
},
{
"epoch": 22.32,
"grad_norm": 0.0018169954419136047,
"learning_rate": 8.693877551020408e-05,
"logits/chosen": -1.0399179458618164,
"logits/rejected": -0.9215599894523621,
"logps/chosen": -243.0306854248047,
"logps/rejected": -352.8495178222656,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -9.664202690124512,
"rewards/margins": 12.071381568908691,
"rewards/rejected": -21.73558235168457,
"step": 290
},
{
"epoch": 23.08,
"grad_norm": 0.0032243800815194845,
"learning_rate": 8.285714285714287e-05,
"logits/chosen": -1.0440365076065063,
"logits/rejected": -0.909357488155365,
"logps/chosen": -233.0224609375,
"logps/rejected": -356.0140686035156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.971783638000488,
"rewards/margins": 12.992220878601074,
"rewards/rejected": -21.964004516601562,
"step": 300
},
{
"epoch": 23.88,
"grad_norm": 0.005289977416396141,
"learning_rate": 7.877551020408164e-05,
"logits/chosen": -1.0011231899261475,
"logits/rejected": -0.890425980091095,
"logps/chosen": -247.11978149414062,
"logps/rejected": -363.4911804199219,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -10.250194549560547,
"rewards/margins": 12.585700035095215,
"rewards/rejected": -22.835895538330078,
"step": 310
},
{
"epoch": 24.64,
"grad_norm": 0.0014915637439116836,
"learning_rate": 7.469387755102041e-05,
"logits/chosen": -1.0355056524276733,
"logits/rejected": -0.9172827005386353,
"logps/chosen": -237.4662628173828,
"logps/rejected": -356.5818786621094,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -9.470785140991211,
"rewards/margins": 12.789039611816406,
"rewards/rejected": -22.259824752807617,
"step": 320
},
{
"epoch": 25.4,
"grad_norm": 0.003703987691551447,
"learning_rate": 7.061224489795919e-05,
"logits/chosen": -0.9380186200141907,
"logits/rejected": -0.8530542254447937,
"logps/chosen": -240.56121826171875,
"logps/rejected": -359.228271484375,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -9.604844093322754,
"rewards/margins": 12.866401672363281,
"rewards/rejected": -22.47124481201172,
"step": 330
},
{
"epoch": 26.16,
"grad_norm": 0.007798569742590189,
"learning_rate": 6.653061224489796e-05,
"logits/chosen": -1.1456927061080933,
"logits/rejected": -1.036659598350525,
"logps/chosen": -246.2130126953125,
"logps/rejected": -367.6144714355469,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.008074760437012,
"rewards/margins": 12.975547790527344,
"rewards/rejected": -22.983623504638672,
"step": 340
},
{
"epoch": 26.96,
"grad_norm": 0.0026127954479306936,
"learning_rate": 6.244897959183675e-05,
"logits/chosen": -0.9940476417541504,
"logits/rejected": -0.8720202445983887,
"logps/chosen": -234.91995239257812,
"logps/rejected": -348.8143615722656,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -9.309648513793945,
"rewards/margins": 12.443445205688477,
"rewards/rejected": -21.753093719482422,
"step": 350
},
{
"epoch": 27.72,
"grad_norm": 0.0053157140500843525,
"learning_rate": 5.836734693877551e-05,
"logits/chosen": -0.9642550945281982,
"logits/rejected": -0.8671989440917969,
"logps/chosen": -247.78762817382812,
"logps/rejected": -368.91265869140625,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.423563003540039,
"rewards/margins": 12.920208930969238,
"rewards/rejected": -23.34377098083496,
"step": 360
},
{
"epoch": 28.48,
"grad_norm": 0.0024100164882838726,
"learning_rate": 5.428571428571428e-05,
"logits/chosen": -1.0245031118392944,
"logits/rejected": -0.9381424784660339,
"logps/chosen": -250.5137481689453,
"logps/rejected": -367.24285888671875,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.55148696899414,
"rewards/margins": 13.011177062988281,
"rewards/rejected": -23.562665939331055,
"step": 370
},
{
"epoch": 29.24,
"grad_norm": 0.006509924773126841,
"learning_rate": 5.0204081632653066e-05,
"logits/chosen": -1.0565561056137085,
"logits/rejected": -0.9717170596122742,
"logps/chosen": -231.0986328125,
"logps/rejected": -354.435791015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.792767524719238,
"rewards/margins": 12.750945091247559,
"rewards/rejected": -21.543716430664062,
"step": 380
},
{
"epoch": 30.0,
"grad_norm": 0.005250515416264534,
"learning_rate": 4.612244897959184e-05,
"logits/chosen": -1.0130590200424194,
"logits/rejected": -0.9052179455757141,
"logps/chosen": -254.4619140625,
"logps/rejected": -371.8909606933594,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.911393165588379,
"rewards/margins": 12.962776184082031,
"rewards/rejected": -23.874168395996094,
"step": 390
},
{
"epoch": 30.8,
"grad_norm": 0.003937486559152603,
"learning_rate": 4.2040816326530615e-05,
"logits/chosen": -1.068474531173706,
"logits/rejected": -0.9814627766609192,
"logps/chosen": -243.24130249023438,
"logps/rejected": -363.50213623046875,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -9.871267318725586,
"rewards/margins": 12.944598197937012,
"rewards/rejected": -22.815866470336914,
"step": 400
},
{
"epoch": 31.56,
"grad_norm": 0.0019006684888154268,
"learning_rate": 3.795918367346939e-05,
"logits/chosen": -0.9314201474189758,
"logits/rejected": -0.8595296740531921,
"logps/chosen": -246.16455078125,
"logps/rejected": -366.99896240234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.104809761047363,
"rewards/margins": 13.133968353271484,
"rewards/rejected": -23.2387752532959,
"step": 410
},
{
"epoch": 32.32,
"grad_norm": 0.0005793002783320844,
"learning_rate": 3.387755102040816e-05,
"logits/chosen": -1.098380208015442,
"logits/rejected": -0.9939666390419006,
"logps/chosen": -252.05137634277344,
"logps/rejected": -373.0602722167969,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.78331470489502,
"rewards/margins": 12.870674133300781,
"rewards/rejected": -23.65399169921875,
"step": 420
},
{
"epoch": 33.08,
"grad_norm": 0.0039499541744589806,
"learning_rate": 2.9795918367346944e-05,
"logits/chosen": -0.9925128817558289,
"logits/rejected": -0.8669744729995728,
"logps/chosen": -248.5987548828125,
"logps/rejected": -369.866943359375,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.700756072998047,
"rewards/margins": 13.224184036254883,
"rewards/rejected": -23.924942016601562,
"step": 430
},
{
"epoch": 33.88,
"grad_norm": 0.0036473730579018593,
"learning_rate": 2.5714285714285714e-05,
"logits/chosen": -1.0634641647338867,
"logits/rejected": -0.9340164065361023,
"logps/chosen": -245.3182830810547,
"logps/rejected": -368.24359130859375,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -10.03348445892334,
"rewards/margins": 13.248746871948242,
"rewards/rejected": -23.2822322845459,
"step": 440
},
{
"epoch": 34.64,
"grad_norm": 0.0020049242302775383,
"learning_rate": 2.1632653061224492e-05,
"logits/chosen": -0.9656567573547363,
"logits/rejected": -0.9384468197822571,
"logps/chosen": -248.10145568847656,
"logps/rejected": -374.0014953613281,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.594840049743652,
"rewards/margins": 13.160733222961426,
"rewards/rejected": -23.755571365356445,
"step": 450
},
{
"epoch": 35.4,
"grad_norm": 0.0025986225809901953,
"learning_rate": 1.7551020408163266e-05,
"logits/chosen": -1.0361719131469727,
"logits/rejected": -0.925805926322937,
"logps/chosen": -250.07704162597656,
"logps/rejected": -365.67340087890625,
"loss": 0.0173,
"rewards/accuracies": 0.9736841917037964,
"rewards/chosen": -10.339940071105957,
"rewards/margins": 12.819701194763184,
"rewards/rejected": -23.159643173217773,
"step": 460
},
{
"epoch": 36.16,
"grad_norm": 0.000490883132442832,
"learning_rate": 1.3469387755102042e-05,
"logits/chosen": -1.0601824522018433,
"logits/rejected": -0.9313357472419739,
"logps/chosen": -246.32838439941406,
"logps/rejected": -368.8094787597656,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.99472427368164,
"rewards/margins": 13.403642654418945,
"rewards/rejected": -23.398366928100586,
"step": 470
},
{
"epoch": 36.96,
"grad_norm": 0.0015991979744285345,
"learning_rate": 9.387755102040816e-06,
"logits/chosen": -0.973480224609375,
"logits/rejected": -0.883003830909729,
"logps/chosen": -242.6292724609375,
"logps/rejected": -364.950927734375,
"loss": 0.0087,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -10.180352210998535,
"rewards/margins": 12.991645812988281,
"rewards/rejected": -23.171995162963867,
"step": 480
},
{
"epoch": 37.72,
"grad_norm": 0.0009250708390027285,
"learning_rate": 5.306122448979592e-06,
"logits/chosen": -0.9940276741981506,
"logits/rejected": -0.9116230010986328,
"logps/chosen": -246.79559326171875,
"logps/rejected": -374.49420166015625,
"loss": 0.0087,
"rewards/accuracies": 0.9868420958518982,
"rewards/chosen": -10.589569091796875,
"rewards/margins": 13.223187446594238,
"rewards/rejected": -23.81275749206543,
"step": 490
},
{
"epoch": 38.48,
"grad_norm": 0.001163547858595848,
"learning_rate": 1.2244897959183673e-06,
"logits/chosen": -1.030373215675354,
"logits/rejected": -0.9079036712646484,
"logps/chosen": -249.69741821289062,
"logps/rejected": -373.4053039550781,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.532529830932617,
"rewards/margins": 13.417372703552246,
"rewards/rejected": -23.949905395507812,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 39,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}