{ "best_global_step": 351, "best_metric": 0.08202474, "best_model_checkpoint": "/ckpts/models/wohu_ui_llm/7b/test_dpo_0730_8723_3e-6_beta_005_add_claude2_epoch3/v0-20250801-071953/checkpoint-351", "epoch": 2.9957446808510637, "eval_steps": 100, "global_step": 351, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00851063829787234, "grad_norm": 113.12678527832031, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.26171875, "logits/rejected": -2.27734375, "logps/chosen": -421.0, "logps/rejected": -97.875, "loss": 1.2347412109375, "memory(GiB)": 63.45, "nll_loss": 0.54248046875, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "train_speed(iter/s)": 0.015536 }, { "epoch": 0.0425531914893617, "grad_norm": 116.12273406982422, "learning_rate": 8.333333333333334e-07, "logits/chosen": -2.2197265625, "logits/rejected": -2.2900390625, "logps/chosen": -518.375, "logps/rejected": -105.0625, "loss": 1.295806884765625, "memory(GiB)": 78.07, "nll_loss": 0.623046875, "rewards/accuracies": 0.44921875, "rewards/chosen": 0.04306221008300781, "rewards/margins": 0.03983926773071289, "rewards/rejected": 0.0031530149281024933, "step": 5, "train_speed(iter/s)": 0.020262 }, { "epoch": 0.0851063829787234, "grad_norm": 16.63855743408203, "learning_rate": 1.6666666666666669e-06, "logits/chosen": -2.178906202316284, "logits/rejected": -2.296875, "logps/chosen": -496.29998779296875, "logps/rejected": -100.19999694824219, "loss": 0.902490234375, "memory(GiB)": 78.07, "nll_loss": 0.584765613079071, "rewards/accuracies": 1.0, "rewards/chosen": 1.335205078125, "rewards/margins": 1.311669945716858, "rewards/rejected": 0.024295806884765625, "step": 10, "train_speed(iter/s)": 0.023151 }, { "epoch": 0.1276595744680851, "grad_norm": 3.581925392150879, "learning_rate": 2.5e-06, "logits/chosen": -1.9874999523162842, "logits/rejected": -2.1171875, "logps/chosen": -399.20001220703125, "logps/rejected": -105.57499694824219, "loss": 0.502459716796875, "memory(GiB)": 78.11, "nll_loss": 0.4808593690395355, "rewards/accuracies": 1.0, "rewards/chosen": 6.135937690734863, "rewards/margins": 5.94140625, "rewards/rejected": 0.19367675483226776, "step": 15, "train_speed(iter/s)": 0.02441 }, { "epoch": 0.1702127659574468, "grad_norm": 1.2561246156692505, "learning_rate": 2.99973299484371e-06, "logits/chosen": -0.9429687261581421, "logits/rejected": -1.2861328125, "logps/chosen": -267.79998779296875, "logps/rejected": -103.30000305175781, "loss": 0.3127166748046875, "memory(GiB)": 78.11, "nll_loss": 0.3118652403354645, "rewards/accuracies": 1.0, "rewards/chosen": 12.771875381469727, "rewards/margins": 12.581250190734863, "rewards/rejected": 0.18445129692554474, "step": 20, "train_speed(iter/s)": 0.024676 }, { "epoch": 0.2127659574468085, "grad_norm": 0.6227205395698547, "learning_rate": 2.9967302783835462e-06, "logits/chosen": 0.38526612520217896, "logits/rejected": 0.3094726502895355, "logps/chosen": -193.35000610351562, "logps/rejected": -125.125, "loss": 0.229217529296875, "memory(GiB)": 78.11, "nll_loss": 0.22915038466453552, "rewards/accuracies": 1.0, "rewards/chosen": 16.259374618530273, "rewards/margins": 17.034374237060547, "rewards/rejected": -0.7733398675918579, "step": 25, "train_speed(iter/s)": 0.024962 }, { "epoch": 0.2553191489361702, "grad_norm": 0.41129618883132935, "learning_rate": 2.9903977914295545e-06, "logits/chosen": 0.7916015386581421, "logits/rejected": 1.0089843273162842, "logps/chosen": -168.39999389648438, "logps/rejected": -163.8000030517578, "loss": 0.19467926025390625, "memory(GiB)": 78.11, "nll_loss": 0.19462890923023224, "rewards/accuracies": 1.0, "rewards/chosen": 17.774999618530273, "rewards/margins": 20.4375, "rewards/rejected": -2.672656297683716, "step": 30, "train_speed(iter/s)": 0.025268 }, { "epoch": 0.2978723404255319, "grad_norm": 0.4016430974006653, "learning_rate": 2.9807496218427986e-06, "logits/chosen": 0.7188476324081421, "logits/rejected": 1.051367163658142, "logps/chosen": -147.5749969482422, "logps/rejected": -167.4499969482422, "loss": 0.1833892822265625, "memory(GiB)": 78.11, "nll_loss": 0.18344727158546448, "rewards/accuracies": 1.0, "rewards/chosen": 17.71875, "rewards/margins": 21.081249237060547, "rewards/rejected": -3.3648438453674316, "step": 35, "train_speed(iter/s)": 0.025286 }, { "epoch": 0.3404255319148936, "grad_norm": 0.31658557057380676, "learning_rate": 2.967807233871629e-06, "logits/chosen": 0.45478516817092896, "logits/rejected": 0.924609363079071, "logps/chosen": -145.5749969482422, "logps/rejected": -178.89999389648438, "loss": 0.17320556640625, "memory(GiB)": 78.11, "nll_loss": 0.17319336533546448, "rewards/accuracies": 1.0, "rewards/chosen": 19.15625, "rewards/margins": 22.78125, "rewards/rejected": -3.624218702316284, "step": 40, "train_speed(iter/s)": 0.025482 }, { "epoch": 0.3829787234042553, "grad_norm": 0.27831006050109863, "learning_rate": 2.9515994204002487e-06, "logits/chosen": 0.1028236374258995, "logits/rejected": 0.770703136920929, "logps/chosen": -139.27499389648438, "logps/rejected": -189.64999389648438, "loss": 0.1634765625, "memory(GiB)": 78.11, "nll_loss": 0.16347655653953552, "rewards/accuracies": 1.0, "rewards/chosen": 19.056249618530273, "rewards/margins": 23.15625, "rewards/rejected": -4.110937595367432, "step": 45, "train_speed(iter/s)": 0.025643 }, { "epoch": 0.425531914893617, "grad_norm": 0.25902220606803894, "learning_rate": 2.93216223889328e-06, "logits/chosen": -0.0480804443359375, "logits/rejected": 0.71923828125, "logps/chosen": -133.72500610351562, "logps/rejected": -198.5, "loss": 0.15597991943359374, "memory(GiB)": 78.11, "nll_loss": 0.15595702826976776, "rewards/accuracies": 1.0, "rewards/chosen": 20.168750762939453, "rewards/margins": 24.75, "rewards/rejected": -4.5859375, "step": 50, "train_speed(iter/s)": 0.025474 }, { "epoch": 0.46808510638297873, "grad_norm": 0.27047035098075867, "learning_rate": 2.9095389311788626e-06, "logits/chosen": -0.18197020888328552, "logits/rejected": 0.5546875, "logps/chosen": -125.92500305175781, "logps/rejected": -208.14999389648438, "loss": 0.14620513916015626, "memory(GiB)": 78.11, "nll_loss": 0.14619140326976776, "rewards/accuracies": 1.0, "rewards/chosen": 19.799999237060547, "rewards/margins": 24.693750381469727, "rewards/rejected": -4.910937309265137, "step": 55, "train_speed(iter/s)": 0.025306 }, { "epoch": 0.5106382978723404, "grad_norm": 0.2637348473072052, "learning_rate": 2.8837798272487033e-06, "logits/chosen": -0.17402306199073792, "logits/rejected": 0.616503894329071, "logps/chosen": -130.97500610351562, "logps/rejected": -213.89999389648438, "loss": 0.14871978759765625, "memory(GiB)": 78.11, "nll_loss": 0.14863280951976776, "rewards/accuracies": 1.0, "rewards/chosen": 20.625, "rewards/margins": 25.737499237060547, "rewards/rejected": -5.128125190734863, "step": 60, "train_speed(iter/s)": 0.025358 }, { "epoch": 0.5531914893617021, "grad_norm": 0.2757234573364258, "learning_rate": 2.8549422332891285e-06, "logits/chosen": -0.1392822265625, "logits/rejected": 0.5840820074081421, "logps/chosen": -122.125, "logps/rejected": -214.10000610351562, "loss": 0.1463531494140625, "memory(GiB)": 78.11, "nll_loss": 0.14619140326976776, "rewards/accuracies": 1.0, "rewards/chosen": 20.606250762939453, "rewards/margins": 25.912500381469727, "rewards/rejected": -5.298437595367432, "step": 65, "train_speed(iter/s)": 0.02546 }, { "epoch": 0.5957446808510638, "grad_norm": 0.283324658870697, "learning_rate": 2.823090304192217e-06, "logits/chosen": -0.08134154975414276, "logits/rejected": 0.64697265625, "logps/chosen": -113.42500305175781, "logps/rejected": -220.1999969482422, "loss": 0.13967132568359375, "memory(GiB)": 78.11, "nll_loss": 0.1396484375, "rewards/accuracies": 1.0, "rewards/chosen": 19.106250762939453, "rewards/margins": 24.731250762939453, "rewards/rejected": -5.606249809265137, "step": 70, "train_speed(iter/s)": 0.025562 }, { "epoch": 0.6382978723404256, "grad_norm": 0.25037115812301636, "learning_rate": 2.7882949008306392e-06, "logits/chosen": -0.18788452446460724, "logits/rejected": 0.564404308795929, "logps/chosen": -116.57499694824219, "logps/rejected": -237.60000610351562, "loss": 0.1353363037109375, "memory(GiB)": 78.11, "nll_loss": 0.13532714545726776, "rewards/accuracies": 1.0, "rewards/chosen": 20.306249618530273, "rewards/margins": 26.40625, "rewards/rejected": -6.090624809265137, "step": 75, "train_speed(iter/s)": 0.025539 }, { "epoch": 0.6808510638297872, "grad_norm": 0.2523050308227539, "learning_rate": 2.750633432413728e-06, "logits/chosen": -0.17770537734031677, "logits/rejected": 0.498748779296875, "logps/chosen": -112.0250015258789, "logps/rejected": -236.9499969482422, "loss": 0.13072662353515624, "memory(GiB)": 78.11, "nll_loss": 0.13068847358226776, "rewards/accuracies": 1.0, "rewards/chosen": 19.984375, "rewards/margins": 26.318750381469727, "rewards/rejected": -6.315625190734863, "step": 80, "train_speed(iter/s)": 0.025618 }, { "epoch": 0.723404255319149, "grad_norm": 0.2926895320415497, "learning_rate": 2.7101896842754866e-06, "logits/chosen": -0.23759765923023224, "logits/rejected": 0.45625001192092896, "logps/chosen": -111.875, "logps/rejected": -246.39999389648438, "loss": 0.12862091064453124, "memory(GiB)": 78.11, "nll_loss": 0.128662109375, "rewards/accuracies": 1.0, "rewards/chosen": 20.412500381469727, "rewards/margins": 27.200000762939453, "rewards/rejected": -6.787499904632568, "step": 85, "train_speed(iter/s)": 0.025658 }, { "epoch": 0.7659574468085106, "grad_norm": 0.2679479420185089, "learning_rate": 2.6670536314776595e-06, "logits/chosen": -0.18338623642921448, "logits/rejected": 0.4960083067417145, "logps/chosen": -108.4000015258789, "logps/rejected": -243.9499969482422, "loss": 0.13025360107421874, "memory(GiB)": 78.11, "nll_loss": 0.1302490234375, "rewards/accuracies": 1.0, "rewards/chosen": 20.603124618530273, "rewards/margins": 27.668750762939453, "rewards/rejected": -7.056250095367432, "step": 90, "train_speed(iter/s)": 0.025811 }, { "epoch": 0.8085106382978723, "grad_norm": 0.2539767622947693, "learning_rate": 2.6213212386425304e-06, "logits/chosen": -0.35136717557907104, "logits/rejected": 0.42353516817092896, "logps/chosen": -111.375, "logps/rejected": -262.45001220703125, "loss": 0.12735443115234374, "memory(GiB)": 78.11, "nll_loss": 0.1273193359375, "rewards/accuracies": 1.0, "rewards/chosen": 20.912500381469727, "rewards/margins": 28.243749618530273, "rewards/rejected": -7.326562404632568, "step": 95, "train_speed(iter/s)": 0.025843 }, { "epoch": 0.851063829787234, "grad_norm": 0.2883126735687256, "learning_rate": 2.573094246460773e-06, "logits/chosen": -0.3926025331020355, "logits/rejected": 0.3401855528354645, "logps/chosen": -101.8499984741211, "logps/rejected": -255.14999389648438, "loss": 0.12263336181640624, "memory(GiB)": 78.11, "nll_loss": 0.12258300930261612, "rewards/accuracies": 1.0, "rewards/chosen": 20.381250381469727, "rewards/margins": 27.981250762939453, "rewards/rejected": -7.606249809265137, "step": 100, "train_speed(iter/s)": 0.025837 }, { "epoch": 0.851063829787234, "eval_logits/chosen": -0.3551269471645355, "eval_logits/rejected": 0.3993896543979645, "eval_logps/chosen": -114.19999694824219, "eval_logps/rejected": -283.3999938964844, "eval_loss": 0.12689127027988434, "eval_nll_loss": 0.13129882514476776, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 21.575000762939453, "eval_rewards/margins": 29.962499618530273, "eval_rewards/rejected": -8.393750190734863, "eval_runtime": 16.796, "eval_samples_per_second": 4.465, "eval_steps_per_second": 0.595, "step": 100 }, { "epoch": 0.8936170212765957, "grad_norm": 0.26771247386932373, "learning_rate": 2.5224799453492994e-06, "logits/chosen": -0.45463865995407104, "logits/rejected": 0.3071838319301605, "logps/chosen": -103.67500305175781, "logps/rejected": -268.0, "loss": 0.12566070556640624, "memory(GiB)": 78.11, "nll_loss": 0.12568359076976776, "rewards/accuracies": 1.0, "rewards/chosen": 21.1875, "rewards/margins": 29.40625, "rewards/rejected": -8.196874618530273, "step": 105, "train_speed(iter/s)": 0.025625 }, { "epoch": 0.9361702127659575, "grad_norm": 0.26143553853034973, "learning_rate": 2.469590936762654e-06, "logits/chosen": -0.6083984375, "logits/rejected": 0.19863280653953552, "logps/chosen": -102.7750015258789, "logps/rejected": -276.70001220703125, "loss": 0.11554946899414062, "memory(GiB)": 78.11, "nll_loss": 0.11552734673023224, "rewards/accuracies": 1.0, "rewards/chosen": 21.356250762939453, "rewards/margins": 29.475000381469727, "rewards/rejected": -8.104687690734863, "step": 110, "train_speed(iter/s)": 0.025585 }, { "epoch": 0.9787234042553191, "grad_norm": 0.26631999015808105, "learning_rate": 2.414544882688961e-06, "logits/chosen": -0.5418945550918579, "logits/rejected": 0.3081298768520355, "logps/chosen": -96.5999984741211, "logps/rejected": -275.20001220703125, "loss": 0.11951446533203125, "memory(GiB)": 78.11, "nll_loss": 0.11955566704273224, "rewards/accuracies": 1.0, "rewards/chosen": 20.450000762939453, "rewards/margins": 28.706249237060547, "rewards/rejected": -8.2578125, "step": 115, "train_speed(iter/s)": 0.025519 }, { "epoch": 1.025531914893617, "grad_norm": 0.261348694562912, "learning_rate": 2.3574642438877183e-06, "logits/chosen": -0.6530877947807312, "logits/rejected": 0.1812860369682312, "logps/chosen": -95.26190185546875, "logps/rejected": -286.047607421875, "loss": 0.1169525146484375, "memory(GiB)": 78.11, "nll_loss": 0.1113978773355484, "rewards/accuracies": 1.0, "rewards/chosen": 21.404762268066406, "rewards/margins": 30.095237731933594, "rewards/rejected": -8.691964149475098, "step": 120, "train_speed(iter/s)": 0.025517 }, { "epoch": 1.0680851063829788, "grad_norm": 0.266865998506546, "learning_rate": 2.2984760074517883e-06, "logits/chosen": -0.744921863079071, "logits/rejected": 0.12298583984375, "logps/chosen": -91.2249984741211, "logps/rejected": -293.79998779296875, "loss": 0.10560760498046876, "memory(GiB)": 78.11, "nll_loss": 0.10561523586511612, "rewards/accuracies": 1.0, "rewards/chosen": 21.450000762939453, "rewards/margins": 30.8125, "rewards/rejected": -9.362500190734863, "step": 125, "train_speed(iter/s)": 0.025403 }, { "epoch": 1.1106382978723404, "grad_norm": 0.24889616668224335, "learning_rate": 2.2377114042996625e-06, "logits/chosen": -0.736035168170929, "logits/rejected": 0.17665405571460724, "logps/chosen": -87.07499694824219, "logps/rejected": -295.8999938964844, "loss": 0.10625762939453125, "memory(GiB)": 78.11, "nll_loss": 0.10622558742761612, "rewards/accuracies": 1.0, "rewards/chosen": 20.903125762939453, "rewards/margins": 30.087499618530273, "rewards/rejected": -9.206250190734863, "step": 130, "train_speed(iter/s)": 0.025476 }, { "epoch": 1.1531914893617021, "grad_norm": 0.2701774835586548, "learning_rate": 2.17530561722651e-06, "logits/chosen": -0.8929687738418579, "logits/rejected": 0.006848144344985485, "logps/chosen": -84.75, "logps/rejected": -301.04998779296875, "loss": 0.099554443359375, "memory(GiB)": 78.11, "nll_loss": 0.09956054389476776, "rewards/accuracies": 1.0, "rewards/chosen": 21.587499618530273, "rewards/margins": 31.524999618530273, "rewards/rejected": -9.928125381469727, "step": 135, "train_speed(iter/s)": 0.02541 }, { "epoch": 1.195744680851064, "grad_norm": 0.27444973587989807, "learning_rate": 2.1113974801634947e-06, "logits/chosen": -0.9662109613418579, "logits/rejected": 0.02514800988137722, "logps/chosen": -83.9749984741211, "logps/rejected": -317.3999938964844, "loss": 0.0990264892578125, "memory(GiB)": 78.11, "nll_loss": 0.09904785454273224, "rewards/accuracies": 1.0, "rewards/chosen": 21.65625, "rewards/margins": 31.981250762939453, "rewards/rejected": -10.328125, "step": 140, "train_speed(iter/s)": 0.025373 }, { "epoch": 1.2382978723404254, "grad_norm": 0.29677021503448486, "learning_rate": 2.046129169314426e-06, "logits/chosen": -1.015039086341858, "logits/rejected": -0.07387695461511612, "logps/chosen": -83.88749694824219, "logps/rejected": -319.5, "loss": 0.0983306884765625, "memory(GiB)": 78.11, "nll_loss": 0.09833984076976776, "rewards/accuracies": 1.0, "rewards/chosen": 21.225000381469727, "rewards/margins": 31.84375, "rewards/rejected": -10.637499809265137, "step": 145, "train_speed(iter/s)": 0.025449 }, { "epoch": 1.2808510638297872, "grad_norm": 0.2784733474254608, "learning_rate": 1.979645886856868e-06, "logits/chosen": -1.034570336341858, "logits/rejected": -0.11180724948644638, "logps/chosen": -82.82499694824219, "logps/rejected": -329.3999938964844, "loss": 0.09817352294921874, "memory(GiB)": 78.11, "nll_loss": 0.09816894680261612, "rewards/accuracies": 1.0, "rewards/chosen": 21.762500762939453, "rewards/margins": 32.525001525878906, "rewards/rejected": -10.774999618530273, "step": 150, "train_speed(iter/s)": 0.025502 }, { "epoch": 1.323404255319149, "grad_norm": 0.30967381596565247, "learning_rate": 1.9120955379113745e-06, "logits/chosen": -1.0148437023162842, "logits/rejected": -0.06700439751148224, "logps/chosen": -80.7874984741211, "logps/rejected": -321.0, "loss": 0.0952880859375, "memory(GiB)": 78.11, "nll_loss": 0.09526367485523224, "rewards/accuracies": 1.0, "rewards/chosen": 20.943750381469727, "rewards/margins": 31.568750381469727, "rewards/rejected": -10.643750190734863, "step": 155, "train_speed(iter/s)": 0.025434 }, { "epoch": 1.3659574468085105, "grad_norm": 0.28575485944747925, "learning_rate": 1.843628401497495e-06, "logits/chosen": -1.0359375476837158, "logits/rejected": -0.08933410793542862, "logps/chosen": -82.32499694824219, "logps/rejected": -332.3999938964844, "loss": 0.09707794189453126, "memory(GiB)": 78.11, "nll_loss": 0.09709472954273224, "rewards/accuracies": 1.0, "rewards/chosen": 22.137500762939453, "rewards/margins": 33.33124923706055, "rewards/rejected": -11.215624809265137, "step": 160, "train_speed(iter/s)": 0.02552 }, { "epoch": 1.4085106382978723, "grad_norm": 0.30471956729888916, "learning_rate": 1.7743967962085799e-06, "logits/chosen": -1.074609398841858, "logits/rejected": -0.05779419094324112, "logps/chosen": -77.69999694824219, "logps/rejected": -341.1000061035156, "loss": 0.0906494140625, "memory(GiB)": 78.11, "nll_loss": 0.09067382663488388, "rewards/accuracies": 1.0, "rewards/chosen": 22.637500762939453, "rewards/margins": 34.0625, "rewards/rejected": -11.471875190734863, "step": 165, "train_speed(iter/s)": 0.025578 }, { "epoch": 1.451063829787234, "grad_norm": 0.2834922671318054, "learning_rate": 1.7045547413491502e-06, "logits/chosen": -1.1212890148162842, "logits/rejected": -0.20931701362133026, "logps/chosen": -79.4625015258789, "logps/rejected": -334.29998779296875, "loss": 0.09400634765625, "memory(GiB)": 78.11, "nll_loss": 0.09396972507238388, "rewards/accuracies": 1.0, "rewards/chosen": 21.221874237060547, "rewards/margins": 32.66875076293945, "rewards/rejected": -11.440625190734863, "step": 170, "train_speed(iter/s)": 0.025547 }, { "epoch": 1.4936170212765958, "grad_norm": 0.33174580335617065, "learning_rate": 1.6342576142887001e-06, "logits/chosen": -1.1828124523162842, "logits/rejected": -0.18878021836280823, "logps/chosen": -75.125, "logps/rejected": -336.8999938964844, "loss": 0.09259796142578125, "memory(GiB)": 78.11, "nll_loss": 0.09261474758386612, "rewards/accuracies": 1.0, "rewards/chosen": 22.381250381469727, "rewards/margins": 34.01874923706055, "rewards/rejected": -11.65625, "step": 175, "train_speed(iter/s)": 0.025586 }, { "epoch": 1.5361702127659576, "grad_norm": 0.3023318648338318, "learning_rate": 1.5636618047942224e-06, "logits/chosen": -1.2580077648162842, "logits/rejected": -0.2625732421875, "logps/chosen": -75.9749984741211, "logps/rejected": -345.70001220703125, "loss": 0.08780364990234375, "memory(GiB)": 78.11, "nll_loss": 0.08784179389476776, "rewards/accuracies": 1.0, "rewards/chosen": 22.193750381469727, "rewards/margins": 34.10625076293945, "rewards/rejected": -11.934374809265137, "step": 180, "train_speed(iter/s)": 0.025549 }, { "epoch": 1.578723404255319, "grad_norm": 0.3265296220779419, "learning_rate": 1.492924367110452e-06, "logits/chosen": -1.228124976158142, "logits/rejected": -0.18481139838695526, "logps/chosen": -72.94999694824219, "logps/rejected": -332.3999938964844, "loss": 0.0884490966796875, "memory(GiB)": 78.11, "nll_loss": 0.08845214545726776, "rewards/accuracies": 1.0, "rewards/chosen": 21.912500381469727, "rewards/margins": 33.243751525878906, "rewards/rejected": -11.318750381469727, "step": 185, "train_speed(iter/s)": 0.025616 }, { "epoch": 1.6212765957446809, "grad_norm": 0.3522721529006958, "learning_rate": 1.4222026705618485e-06, "logits/chosen": -1.308984398841858, "logits/rejected": -0.3495544493198395, "logps/chosen": -82.75, "logps/rejected": -339.0, "loss": 0.09106903076171875, "memory(GiB)": 78.11, "nll_loss": 0.09104003757238388, "rewards/accuracies": 1.0, "rewards/chosen": 22.306249618530273, "rewards/margins": 33.86249923706055, "rewards/rejected": -11.59375, "step": 190, "train_speed(iter/s)": 0.025589 }, { "epoch": 1.6638297872340426, "grad_norm": 0.3345926105976105, "learning_rate": 1.3516540494536255e-06, "logits/chosen": -1.242578148841858, "logits/rejected": -0.24477234482765198, "logps/chosen": -80.61250305175781, "logps/rejected": -340.1000061035156, "loss": 0.0941619873046875, "memory(GiB)": 78.11, "nll_loss": 0.0941162109375, "rewards/accuracies": 1.0, "rewards/chosen": 22.962499618530273, "rewards/margins": 34.537498474121094, "rewards/rejected": -11.559374809265137, "step": 195, "train_speed(iter/s)": 0.025589 }, { "epoch": 1.7063829787234042, "grad_norm": 0.28689754009246826, "learning_rate": 1.281435453050683e-06, "logits/chosen": -1.349609375, "logits/rejected": -0.32838135957717896, "logps/chosen": -75.3375015258789, "logps/rejected": -342.29998779296875, "loss": 0.08746185302734374, "memory(GiB)": 78.11, "nll_loss": 0.08747558295726776, "rewards/accuracies": 1.0, "rewards/chosen": 22.237499237060547, "rewards/margins": 34.03125, "rewards/rejected": -11.774999618530273, "step": 200, "train_speed(iter/s)": 0.025601 }, { "epoch": 1.7063829787234042, "eval_logits/chosen": -1.2234375476837158, "eval_logits/rejected": -0.24904784560203552, "eval_logps/chosen": -88.7249984741211, "eval_logps/rejected": -361.3999938964844, "eval_loss": 0.09561848640441895, "eval_nll_loss": 0.10009765625, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 22.875, "eval_rewards/margins": 35.17499923706055, "eval_rewards/rejected": -12.3125, "eval_runtime": 16.8954, "eval_samples_per_second": 4.439, "eval_steps_per_second": 0.592, "step": 200 }, { "epoch": 1.748936170212766, "grad_norm": 0.3166359066963196, "learning_rate": 1.211703096413141e-06, "logits/chosen": -1.329687476158142, "logits/rejected": -0.3259124755859375, "logps/chosen": -77.3125, "logps/rejected": -346.0, "loss": 0.0876129150390625, "memory(GiB)": 78.11, "nll_loss": 0.08759765326976776, "rewards/accuracies": 1.0, "rewards/chosen": 22.806249618530273, "rewards/margins": 34.587501525878906, "rewards/rejected": -11.800000190734863, "step": 205, "train_speed(iter/s)": 0.025467 }, { "epoch": 1.7914893617021277, "grad_norm": 0.3028061091899872, "learning_rate": 1.1426121128652528e-06, "logits/chosen": -1.3445312976837158, "logits/rejected": -0.32697755098342896, "logps/chosen": -74.69999694824219, "logps/rejected": -338.3999938964844, "loss": 0.08412704467773438, "memory(GiB)": 78.11, "nll_loss": 0.08408202975988388, "rewards/accuracies": 1.0, "rewards/chosen": 21.924999237060547, "rewards/margins": 33.65625, "rewards/rejected": -11.725000381469727, "step": 210, "train_speed(iter/s)": 0.025437 }, { "epoch": 1.8340425531914892, "grad_norm": 0.3295106291770935, "learning_rate": 1.0743162088708549e-06, "logits/chosen": -1.3125, "logits/rejected": -0.33466798067092896, "logps/chosen": -72.98750305175781, "logps/rejected": -345.0, "loss": 0.08744964599609376, "memory(GiB)": 78.11, "nll_loss": 0.08747558295726776, "rewards/accuracies": 1.0, "rewards/chosen": 22.737499237060547, "rewards/margins": 34.631248474121094, "rewards/rejected": -11.878125190734863, "step": 215, "train_speed(iter/s)": 0.025475 }, { "epoch": 1.8765957446808512, "grad_norm": 0.35723093152046204, "learning_rate": 1.006967322083147e-06, "logits/chosen": -1.4054687023162842, "logits/rejected": -0.357666015625, "logps/chosen": -67.86250305175781, "logps/rejected": -345.5, "loss": 0.08179931640625, "memory(GiB)": 86.43, "nll_loss": 0.08175048977136612, "rewards/accuracies": 1.0, "rewards/chosen": 22.371875762939453, "rewards/margins": 34.068748474121094, "rewards/rejected": -11.684374809265137, "step": 220, "train_speed(iter/s)": 0.025478 }, { "epoch": 1.9191489361702128, "grad_norm": 0.31711524724960327, "learning_rate": 9.407152833295372e-07, "logits/chosen": -1.478515625, "logits/rejected": -0.43549805879592896, "logps/chosen": -71.625, "logps/rejected": -349.5, "loss": 0.08211593627929688, "memory(GiB)": 86.43, "nll_loss": 0.08212890475988388, "rewards/accuracies": 1.0, "rewards/chosen": 22.71875, "rewards/margins": 34.82500076293945, "rewards/rejected": -12.143750190734863, "step": 225, "train_speed(iter/s)": 0.025434 }, { "epoch": 1.9617021276595743, "grad_norm": 0.33292317390441895, "learning_rate": 8.757074832835386e-07, "logits/chosen": -1.5011718273162842, "logits/rejected": -0.40864259004592896, "logps/chosen": -72.36250305175781, "logps/rejected": -353.0, "loss": 0.081207275390625, "memory(GiB)": 86.43, "nll_loss": 0.08120117336511612, "rewards/accuracies": 1.0, "rewards/chosen": 23.90625, "rewards/margins": 35.91875076293945, "rewards/rejected": -12.018750190734863, "step": 230, "train_speed(iter/s)": 0.025453 }, { "epoch": 2.008510638297872, "grad_norm": 0.32713377475738525, "learning_rate": 8.12088544565264e-07, "logits/chosen": -1.4508928060531616, "logits/rejected": -0.4289202094078064, "logps/chosen": -65.23809814453125, "logps/rejected": -349.047607421875, "loss": 0.08363189697265624, "memory(GiB)": 86.43, "nll_loss": 0.0796247199177742, "rewards/accuracies": 1.0, "rewards/chosen": 21.845237731933594, "rewards/margins": 34.005950927734375, "rewards/rejected": -12.142857551574707, "step": 235, "train_speed(iter/s)": 0.025437 }, { "epoch": 2.051063829787234, "grad_norm": 0.2999580204486847, "learning_rate": 7.500000000000003e-07, "logits/chosen": -1.605078101158142, "logits/rejected": -0.5160156488418579, "logps/chosen": -58.01250076293945, "logps/rejected": -359.3999938964844, "loss": 0.06568679809570313, "memory(GiB)": 86.43, "nll_loss": 0.06569824367761612, "rewards/accuracies": 1.0, "rewards/chosen": 23.456249237060547, "rewards/margins": 35.88750076293945, "rewards/rejected": -12.465624809265137, "step": 240, "train_speed(iter/s)": 0.025403 }, { "epoch": 2.0936170212765957, "grad_norm": 0.3553922772407532, "learning_rate": 6.895799777506399e-07, "logits/chosen": -1.6183593273162842, "logits/rejected": -0.5166015625, "logps/chosen": -61.787498474121094, "logps/rejected": -365.5, "loss": 0.07020797729492187, "memory(GiB)": 86.43, "nll_loss": 0.07022704929113388, "rewards/accuracies": 1.0, "rewards/chosen": 22.4375, "rewards/margins": 35.25, "rewards/rejected": -12.784375190734863, "step": 245, "train_speed(iter/s)": 0.025384 }, { "epoch": 2.1361702127659576, "grad_norm": 0.30416449904441833, "learning_rate": 6.3096289402445e-07, "logits/chosen": -1.5906250476837158, "logits/rejected": -0.5077148675918579, "logps/chosen": -57.6875, "logps/rejected": -379.20001220703125, "loss": 0.06732330322265626, "memory(GiB)": 86.43, "nll_loss": 0.06728515774011612, "rewards/accuracies": 1.0, "rewards/chosen": 22.893749237060547, "rewards/margins": 36.212501525878906, "rewards/rejected": -13.306249618530273, "step": 250, "train_speed(iter/s)": 0.025433 }, { "epoch": 2.178723404255319, "grad_norm": 0.29929059743881226, "learning_rate": 5.742791540378176e-07, "logits/chosen": -1.740234375, "logits/rejected": -0.644824206829071, "logps/chosen": -60.212501525878906, "logps/rejected": -368.1000061035156, "loss": 0.0677215576171875, "memory(GiB)": 86.43, "nll_loss": 0.06768798828125, "rewards/accuracies": 1.0, "rewards/chosen": 22.662500381469727, "rewards/margins": 35.6875, "rewards/rejected": -13.021875381469727, "step": 255, "train_speed(iter/s)": 0.025403 }, { "epoch": 2.2212765957446807, "grad_norm": 0.3583555817604065, "learning_rate": 5.196548619042311e-07, "logits/chosen": -1.7507812976837158, "logits/rejected": -0.62744140625, "logps/chosen": -57.537498474121094, "logps/rejected": -370.5, "loss": 0.06677093505859374, "memory(GiB)": 86.43, "nll_loss": 0.06679687649011612, "rewards/accuracies": 1.0, "rewards/chosen": 22.850000381469727, "rewards/margins": 36.04999923706055, "rewards/rejected": -13.178125381469727, "step": 260, "train_speed(iter/s)": 0.025386 }, { "epoch": 2.2638297872340427, "grad_norm": 0.35006001591682434, "learning_rate": 4.672115400909117e-07, "logits/chosen": -1.760156273841858, "logits/rejected": -0.6236327886581421, "logps/chosen": -55.13750076293945, "logps/rejected": -373.20001220703125, "loss": 0.0646148681640625, "memory(GiB)": 86.43, "nll_loss": 0.06462402641773224, "rewards/accuracies": 1.0, "rewards/chosen": 22.743749618530273, "rewards/margins": 35.89374923706055, "rewards/rejected": -13.184374809265137, "step": 265, "train_speed(iter/s)": 0.02541 }, { "epoch": 2.3063829787234043, "grad_norm": 0.3760732412338257, "learning_rate": 4.170658590682134e-07, "logits/chosen": -1.755859375, "logits/rejected": -0.6622070074081421, "logps/chosen": -54.17499923706055, "logps/rejected": -383.8999938964844, "loss": 0.06567840576171875, "memory(GiB)": 86.43, "nll_loss": 0.06573486328125, "rewards/accuracies": 1.0, "rewards/chosen": 23.168750762939453, "rewards/margins": 36.662498474121094, "rewards/rejected": -13.5, "step": 270, "train_speed(iter/s)": 0.025453 }, { "epoch": 2.348936170212766, "grad_norm": 0.32026490569114685, "learning_rate": 3.6932937775324586e-07, "logits/chosen": -1.851953148841858, "logits/rejected": -0.682812511920929, "logps/chosen": -53.837501525878906, "logps/rejected": -378.70001220703125, "loss": 0.063421630859375, "memory(GiB)": 86.43, "nll_loss": 0.06342773139476776, "rewards/accuracies": 1.0, "rewards/chosen": 23.049999237060547, "rewards/margins": 36.58124923706055, "rewards/rejected": -13.506250381469727, "step": 275, "train_speed(iter/s)": 0.02545 }, { "epoch": 2.391489361702128, "grad_norm": 0.3174494504928589, "learning_rate": 3.2410829532515156e-07, "logits/chosen": -1.8175780773162842, "logits/rejected": -0.7105468511581421, "logps/chosen": -59.625, "logps/rejected": -367.0, "loss": 0.0679901123046875, "memory(GiB)": 86.43, "nll_loss": 0.06796874850988388, "rewards/accuracies": 1.0, "rewards/chosen": 23.868749618530273, "rewards/margins": 37.25, "rewards/rejected": -13.409375190734863, "step": 280, "train_speed(iter/s)": 0.025469 }, { "epoch": 2.4340425531914893, "grad_norm": 0.33984753489494324, "learning_rate": 2.8150321496417134e-07, "logits/chosen": -1.87109375, "logits/rejected": -0.7925781011581421, "logps/chosen": -59.212501525878906, "logps/rejected": -390.3999938964844, "loss": 0.06616058349609374, "memory(GiB)": 86.43, "nll_loss": 0.06617431342601776, "rewards/accuracies": 1.0, "rewards/chosen": 23.549999237060547, "rewards/margins": 37.57500076293945, "rewards/rejected": -14.046875, "step": 285, "train_speed(iter/s)": 0.025429 }, { "epoch": 2.476595744680851, "grad_norm": 0.3459691107273102, "learning_rate": 2.4160892004010924e-07, "logits/chosen": -1.845703125, "logits/rejected": -0.737109363079071, "logps/chosen": -54.82500076293945, "logps/rejected": -380.3999938964844, "loss": 0.06555709838867188, "memory(GiB)": 86.43, "nll_loss": 0.06560058891773224, "rewards/accuracies": 1.0, "rewards/chosen": 23.674999237060547, "rewards/margins": 37.45000076293945, "rewards/rejected": -13.78125, "step": 290, "train_speed(iter/s)": 0.025456 }, { "epoch": 2.519148936170213, "grad_norm": 0.33621102571487427, "learning_rate": 2.0451416324810927e-07, "logits/chosen": -1.851171851158142, "logits/rejected": -0.726367175579071, "logps/chosen": -55.17499923706055, "logps/rejected": -389.0, "loss": 0.06555557250976562, "memory(GiB)": 86.43, "nll_loss": 0.06556396186351776, "rewards/accuracies": 1.0, "rewards/chosen": 22.631250381469727, "rewards/margins": 36.45000076293945, "rewards/rejected": -13.834375381469727, "step": 295, "train_speed(iter/s)": 0.025462 }, { "epoch": 2.5617021276595744, "grad_norm": 0.3450789153575897, "learning_rate": 1.7030146916085187e-07, "logits/chosen": -1.864843726158142, "logits/rejected": -0.7413085699081421, "logps/chosen": -53.76250076293945, "logps/rejected": -380.79998779296875, "loss": 0.0641754150390625, "memory(GiB)": 86.43, "nll_loss": 0.06414794921875, "rewards/accuracies": 1.0, "rewards/chosen": 22.737499237060547, "rewards/margins": 36.493751525878906, "rewards/rejected": -13.712499618530273, "step": 300, "train_speed(iter/s)": 0.025502 }, { "epoch": 2.5617021276595744, "eval_logits/chosen": -1.7765624523162842, "eval_logits/rejected": -0.6953125, "eval_logps/chosen": -78.2750015258789, "eval_logps/rejected": -407.79998779296875, "eval_loss": 0.08268880099058151, "eval_nll_loss": 0.08737792819738388, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 23.412500381469727, "eval_rewards/margins": 38.0, "eval_rewards/rejected": -14.606249809265137, "eval_runtime": 17.0595, "eval_samples_per_second": 4.396, "eval_steps_per_second": 0.586, "step": 300 }, { "epoch": 2.604255319148936, "grad_norm": 0.3593633770942688, "learning_rate": 1.3904695063643336e-07, "logits/chosen": -1.878515601158142, "logits/rejected": -0.77197265625, "logps/chosen": -57.875, "logps/rejected": -386.3999938964844, "loss": 0.064154052734375, "memory(GiB)": 86.43, "nll_loss": 0.06413574516773224, "rewards/accuracies": 1.0, "rewards/chosen": 23.649999618530273, "rewards/margins": 37.650001525878906, "rewards/rejected": -13.984375, "step": 305, "train_speed(iter/s)": 0.025388 }, { "epoch": 2.646808510638298, "grad_norm": 0.36554569005966187, "learning_rate": 1.1082013949036119e-07, "logits/chosen": -1.890625, "logits/rejected": -0.768261730670929, "logps/chosen": -59.54999923706055, "logps/rejected": -386.1000061035156, "loss": 0.06463623046875, "memory(GiB)": 86.43, "nll_loss": 0.06468506157398224, "rewards/accuracies": 1.0, "rewards/chosen": 24.274999618530273, "rewards/margins": 38.20000076293945, "rewards/rejected": -13.925000190734863, "step": 310, "train_speed(iter/s)": 0.025368 }, { "epoch": 2.6893617021276595, "grad_norm": 0.3650209605693817, "learning_rate": 8.568383180837369e-08, "logits/chosen": -1.8703124523162842, "logits/rejected": -0.714648425579071, "logps/chosen": -53.162498474121094, "logps/rejected": -390.29998779296875, "loss": 0.062223052978515624, "memory(GiB)": 86.43, "nll_loss": 0.06223144382238388, "rewards/accuracies": 1.0, "rewards/chosen": 23.568750381469727, "rewards/margins": 37.318748474121094, "rewards/rejected": -13.746874809265137, "step": 315, "train_speed(iter/s)": 0.025378 }, { "epoch": 2.731914893617021, "grad_norm": 0.37099653482437134, "learning_rate": 6.369394824421365e-08, "logits/chosen": -1.875390648841858, "logits/rejected": -0.8294922113418579, "logps/chosen": -60.73749923706055, "logps/rejected": -385.8999938964844, "loss": 0.06792373657226562, "memory(GiB)": 86.43, "nll_loss": 0.06791992485523224, "rewards/accuracies": 1.0, "rewards/chosen": 23.037500381469727, "rewards/margins": 37.11249923706055, "rewards/rejected": -14.065625190734863, "step": 320, "train_speed(iter/s)": 0.02535 }, { "epoch": 2.774468085106383, "grad_norm": 0.35642552375793457, "learning_rate": 4.489940961314881e-08, "logits/chosen": -1.8425781726837158, "logits/rejected": -0.7470703125, "logps/chosen": -60.037498474121094, "logps/rejected": -395.1000061035156, "loss": 0.06998977661132813, "memory(GiB)": 86.43, "nll_loss": 0.06997070461511612, "rewards/accuracies": 1.0, "rewards/chosen": 24.143749237060547, "rewards/margins": 37.962501525878906, "rewards/rejected": -13.831250190734863, "step": 325, "train_speed(iter/s)": 0.025386 }, { "epoch": 2.8170212765957445, "grad_norm": 0.32820969820022583, "learning_rate": 2.9342028058009896e-08, "logits/chosen": -1.870703101158142, "logits/rejected": -0.7451171875, "logps/chosen": -54.76250076293945, "logps/rejected": -387.29998779296875, "loss": 0.06422500610351563, "memory(GiB)": 86.43, "nll_loss": 0.06422118842601776, "rewards/accuracies": 1.0, "rewards/chosen": 22.887500762939453, "rewards/margins": 36.88750076293945, "rewards/rejected": -14.028124809265137, "step": 330, "train_speed(iter/s)": 0.02541 }, { "epoch": 2.8595744680851065, "grad_norm": 0.35879823565483093, "learning_rate": 1.7056414029866018e-08, "logits/chosen": -1.8328125476837158, "logits/rejected": -0.7754882574081421, "logps/chosen": -53.400001525878906, "logps/rejected": -388.29998779296875, "loss": 0.064886474609375, "memory(GiB)": 86.43, "nll_loss": 0.06489257514476776, "rewards/accuracies": 1.0, "rewards/chosen": 22.431249618530273, "rewards/margins": 36.58124923706055, "rewards/rejected": -14.153124809265137, "step": 335, "train_speed(iter/s)": 0.025437 }, { "epoch": 2.902127659574468, "grad_norm": 0.32765164971351624, "learning_rate": 8.069899290277683e-09, "logits/chosen": -1.848046898841858, "logits/rejected": -0.744140625, "logps/chosen": -54.67499923706055, "logps/rejected": -380.5, "loss": 0.0650665283203125, "memory(GiB)": 86.43, "nll_loss": 0.06510009616613388, "rewards/accuracies": 1.0, "rewards/chosen": 22.681249618530273, "rewards/margins": 36.70000076293945, "rewards/rejected": -14.009374618530273, "step": 340, "train_speed(iter/s)": 0.025437 }, { "epoch": 2.94468085106383, "grad_norm": 0.370295912027359, "learning_rate": 2.4024761064254664e-09, "logits/chosen": -1.872656226158142, "logits/rejected": -0.7118164300918579, "logps/chosen": -57.849998474121094, "logps/rejected": -377.5, "loss": 0.06773147583007813, "memory(GiB)": 86.43, "nll_loss": 0.06773681938648224, "rewards/accuracies": 1.0, "rewards/chosen": 23.737499237060547, "rewards/margins": 37.337501525878906, "rewards/rejected": -13.590624809265137, "step": 345, "train_speed(iter/s)": 0.02546 }, { "epoch": 2.9872340425531916, "grad_norm": 0.3416670560836792, "learning_rate": 6.675277438356054e-11, "logits/chosen": -1.8679687976837158, "logits/rejected": -0.7328125238418579, "logps/chosen": -56.537498474121094, "logps/rejected": -390.0, "loss": 0.06469192504882812, "memory(GiB)": 86.43, "nll_loss": 0.06467285007238388, "rewards/accuracies": 1.0, "rewards/chosen": 22.943750381469727, "rewards/margins": 36.98749923706055, "rewards/rejected": -14.0625, "step": 350, "train_speed(iter/s)": 0.025452 }, { "epoch": 2.9957446808510637, "eval_logits/chosen": -1.7937500476837158, "eval_logits/rejected": -0.7124999761581421, "eval_logps/chosen": -77.80000305175781, "eval_logps/rejected": -408.6000061035156, "eval_loss": 0.08202473819255829, "eval_nll_loss": 0.08659668266773224, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 23.399999618530273, "eval_rewards/margins": 38.087501525878906, "eval_rewards/rejected": -14.662500381469727, "eval_runtime": 17.0317, "eval_samples_per_second": 4.404, "eval_steps_per_second": 0.587, "step": 351 } ], "logging_steps": 5, "max_steps": 351, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.038030084192076e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }