{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 10624, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037650602409638554, "grad_norm": 152.2126504758387, "learning_rate": 9.99152861445783e-07, "logits/chosen": -1.9054687023162842, "logits/rejected": -1.970703125, "logps/chosen": -341.75, "logps/rejected": -360.70001220703125, "loss": 0.7138, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.01610565185546875, "rewards/margins": -0.006549072451889515, "rewards/rejected": -0.00953826867043972, "step": 10 }, { "epoch": 0.007530120481927711, "grad_norm": 105.467870848321, "learning_rate": 9.98211596385542e-07, "logits/chosen": -1.939453125, "logits/rejected": -1.81640625, "logps/chosen": -314.3999938964844, "logps/rejected": -366.6499938964844, "loss": 0.7058, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.0841827392578125, "rewards/margins": 0.05379028245806694, "rewards/rejected": -0.13783112168312073, "step": 20 }, { "epoch": 0.011295180722891566, "grad_norm": 102.10996667520865, "learning_rate": 9.972703313253011e-07, "logits/chosen": -1.8914062976837158, "logits/rejected": -1.78515625, "logps/chosen": -319.6499938964844, "logps/rejected": -334.70001220703125, "loss": 0.6707, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.04068603366613388, "rewards/margins": 0.11593017727136612, "rewards/rejected": -0.156524658203125, "step": 30 }, { "epoch": 0.015060240963855422, "grad_norm": 115.87568667881561, "learning_rate": 9.963290662650602e-07, "logits/chosen": -1.9765625, "logits/rejected": -1.9421875476837158, "logps/chosen": -312.57501220703125, "logps/rejected": -347.25, "loss": 0.673, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.07699279487133026, "rewards/margins": 0.09013672173023224, "rewards/rejected": -0.01322021521627903, "step": 40 }, { "epoch": 0.01882530120481928, "grad_norm": 106.28143260337926, "learning_rate": 9.953878012048193e-07, "logits/chosen": -1.999609351158142, "logits/rejected": -1.9480469226837158, "logps/chosen": -289.79998779296875, "logps/rejected": -341.25, "loss": 0.6202, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22335204482078552, "rewards/margins": 0.20418700575828552, "rewards/rejected": 0.01898803748190403, "step": 50 }, { "epoch": 0.022590361445783132, "grad_norm": 94.01890825562822, "learning_rate": 9.944465361445784e-07, "logits/chosen": -2.0035157203674316, "logits/rejected": -1.921875, "logps/chosen": -350.54998779296875, "logps/rejected": -366.79998779296875, "loss": 0.6695, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.09030761569738388, "rewards/margins": 0.13191528618335724, "rewards/rejected": -0.04142456129193306, "step": 60 }, { "epoch": 0.02635542168674699, "grad_norm": 89.23585199047301, "learning_rate": 9.935052710843374e-07, "logits/chosen": -2.007031202316284, "logits/rejected": -1.9832031726837158, "logps/chosen": -307.79998779296875, "logps/rejected": -307.3999938964844, "loss": 0.647, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.12453918159008026, "rewards/margins": 0.19892577826976776, "rewards/rejected": -0.07449035346508026, "step": 70 }, { "epoch": 0.030120481927710843, "grad_norm": 128.4679375110798, "learning_rate": 9.925640060240963e-07, "logits/chosen": -2.0390625, "logits/rejected": -2.08203125, "logps/chosen": -368.70001220703125, "logps/rejected": -393.0, "loss": 0.6244, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.1619873046875, "rewards/margins": 0.25236815214157104, "rewards/rejected": -0.09047241508960724, "step": 80 }, { "epoch": 0.0338855421686747, "grad_norm": 123.24319762975927, "learning_rate": 9.916227409638554e-07, "logits/chosen": -2.057421922683716, "logits/rejected": -2.075000047683716, "logps/chosen": -362.54998779296875, "logps/rejected": -374.0, "loss": 0.6443, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.10191650688648224, "rewards/margins": 0.22462768852710724, "rewards/rejected": -0.12285766750574112, "step": 90 }, { "epoch": 0.03765060240963856, "grad_norm": 125.3404590209247, "learning_rate": 9.906814759036145e-07, "logits/chosen": -2.01953125, "logits/rejected": -1.9269530773162842, "logps/chosen": -322.45001220703125, "logps/rejected": -356.6499938964844, "loss": 0.6793, "rewards/accuracies": 0.53125, "rewards/chosen": 0.17569580674171448, "rewards/margins": 0.18458251655101776, "rewards/rejected": -0.00861511193215847, "step": 100 }, { "epoch": 0.04141566265060241, "grad_norm": 109.37911598668393, "learning_rate": 9.897402108433735e-07, "logits/chosen": -1.9816405773162842, "logits/rejected": -1.935546875, "logps/chosen": -290.8500061035156, "logps/rejected": -344.8999938964844, "loss": 0.5983, "rewards/accuracies": 0.625, "rewards/chosen": 0.23388671875, "rewards/margins": 0.3409423828125, "rewards/rejected": -0.10723876953125, "step": 110 }, { "epoch": 0.045180722891566265, "grad_norm": 122.18121650728303, "learning_rate": 9.887989457831324e-07, "logits/chosen": -2.06640625, "logits/rejected": -2.052734375, "logps/chosen": -347.75, "logps/rejected": -378.54998779296875, "loss": 0.6381, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.13023681938648224, "rewards/margins": 0.2669677734375, "rewards/rejected": -0.13681641221046448, "step": 120 }, { "epoch": 0.04894578313253012, "grad_norm": 132.02852271156584, "learning_rate": 9.878576807228915e-07, "logits/chosen": -2.0464844703674316, "logits/rejected": -2.083984375, "logps/chosen": -359.8500061035156, "logps/rejected": -354.5, "loss": 0.6439, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.01424560509622097, "rewards/margins": 0.26957398653030396, "rewards/rejected": -0.2834716737270355, "step": 130 }, { "epoch": 0.05271084337349398, "grad_norm": 108.72441640933614, "learning_rate": 9.869164156626506e-07, "logits/chosen": -2.034374952316284, "logits/rejected": -2.0082030296325684, "logps/chosen": -363.54998779296875, "logps/rejected": -411.0, "loss": 0.5901, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10563965141773224, "rewards/margins": 0.453643798828125, "rewards/rejected": -0.558544933795929, "step": 140 }, { "epoch": 0.05647590361445783, "grad_norm": 117.42424928053588, "learning_rate": 9.859751506024096e-07, "logits/chosen": -2.0347657203674316, "logits/rejected": -1.8777344226837158, "logps/chosen": -368.29998779296875, "logps/rejected": -387.5, "loss": 0.6536, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11945800483226776, "rewards/margins": 0.32271116971969604, "rewards/rejected": -0.4422851502895355, "step": 150 }, { "epoch": 0.060240963855421686, "grad_norm": 110.59165568249628, "learning_rate": 9.850338855421685e-07, "logits/chosen": -2.090625047683716, "logits/rejected": -2.03125, "logps/chosen": -351.95001220703125, "logps/rejected": -326.0, "loss": 0.6606, "rewards/accuracies": 0.59375, "rewards/chosen": -0.28703612089157104, "rewards/margins": 0.24370117485523224, "rewards/rejected": -0.530444324016571, "step": 160 }, { "epoch": 0.06400602409638555, "grad_norm": 116.47021953706664, "learning_rate": 9.840926204819276e-07, "logits/chosen": -2.060546875, "logits/rejected": -2.064453125, "logps/chosen": -303.54998779296875, "logps/rejected": -354.75, "loss": 0.5902, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24456787109375, "rewards/margins": 0.48518067598342896, "rewards/rejected": -0.729687511920929, "step": 170 }, { "epoch": 0.0677710843373494, "grad_norm": 118.46680336940885, "learning_rate": 9.831513554216867e-07, "logits/chosen": -2.142578125, "logits/rejected": -2.1136717796325684, "logps/chosen": -357.0, "logps/rejected": -380.1000061035156, "loss": 0.5904, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09508667141199112, "rewards/margins": 0.40227049589157104, "rewards/rejected": -0.497140496969223, "step": 180 }, { "epoch": 0.07153614457831325, "grad_norm": 126.80757157527249, "learning_rate": 9.822100903614458e-07, "logits/chosen": -2.231250047683716, "logits/rejected": -2.155078172683716, "logps/chosen": -361.79998779296875, "logps/rejected": -390.29998779296875, "loss": 0.6458, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08977051079273224, "rewards/margins": 0.35755616426467896, "rewards/rejected": -0.4475952088832855, "step": 190 }, { "epoch": 0.07530120481927711, "grad_norm": 177.22764087220844, "learning_rate": 9.812688253012048e-07, "logits/chosen": -2.05859375, "logits/rejected": -2.042187452316284, "logps/chosen": -328.6000061035156, "logps/rejected": -376.5, "loss": 0.6128, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0062927245162427425, "rewards/margins": 0.466522216796875, "rewards/rejected": -0.46076661348342896, "step": 200 }, { "epoch": 0.07906626506024096, "grad_norm": 78.48940662213016, "learning_rate": 9.80327560240964e-07, "logits/chosen": -2.0234375, "logits/rejected": -2.0210938453674316, "logps/chosen": -304.04998779296875, "logps/rejected": -349.25, "loss": 0.6531, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09563140571117401, "rewards/margins": 0.31349486112594604, "rewards/rejected": -0.40861815214157104, "step": 210 }, { "epoch": 0.08283132530120482, "grad_norm": 118.8397006101472, "learning_rate": 9.793862951807228e-07, "logits/chosen": -2.072265625, "logits/rejected": -2.0542969703674316, "logps/chosen": -307.92498779296875, "logps/rejected": -333.3500061035156, "loss": 0.5937, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.11757354438304901, "rewards/margins": 0.39801025390625, "rewards/rejected": -0.28013914823532104, "step": 220 }, { "epoch": 0.08659638554216867, "grad_norm": 81.64030829199463, "learning_rate": 9.784450301204819e-07, "logits/chosen": -2.032031297683716, "logits/rejected": -1.974218726158142, "logps/chosen": -338.1499938964844, "logps/rejected": -365.70001220703125, "loss": 0.5836, "rewards/accuracies": 0.625, "rewards/chosen": 0.19725342094898224, "rewards/margins": 0.4989990293979645, "rewards/rejected": -0.30119019746780396, "step": 230 }, { "epoch": 0.09036144578313253, "grad_norm": 90.09381938382023, "learning_rate": 9.77503765060241e-07, "logits/chosen": -2.1015625, "logits/rejected": -2.132031202316284, "logps/chosen": -313.04998779296875, "logps/rejected": -354.0, "loss": 0.6301, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.14000244438648224, "rewards/margins": 0.4311767518520355, "rewards/rejected": -0.2913818359375, "step": 240 }, { "epoch": 0.09412650602409639, "grad_norm": 100.46304590079562, "learning_rate": 9.765625e-07, "logits/chosen": -2.205859422683716, "logits/rejected": -2.1304688453674316, "logps/chosen": -350.875, "logps/rejected": -364.0249938964844, "loss": 0.6094, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.21094970405101776, "rewards/margins": 0.47480469942092896, "rewards/rejected": -0.26362305879592896, "step": 250 }, { "epoch": 0.09789156626506024, "grad_norm": 135.9506794834279, "learning_rate": 9.75621234939759e-07, "logits/chosen": -2.086718797683716, "logits/rejected": -1.9992187023162842, "logps/chosen": -357.25, "logps/rejected": -412.8999938964844, "loss": 0.6254, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.169921875, "rewards/margins": 0.4170165956020355, "rewards/rejected": -0.247528076171875, "step": 260 }, { "epoch": 0.1016566265060241, "grad_norm": 120.98348473318408, "learning_rate": 9.74679969879518e-07, "logits/chosen": -2.089062452316284, "logits/rejected": -2.0453124046325684, "logps/chosen": -317.20001220703125, "logps/rejected": -382.25, "loss": 0.6197, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.11505126953125, "rewards/margins": 0.43461912870407104, "rewards/rejected": -0.31968384981155396, "step": 270 }, { "epoch": 0.10542168674698796, "grad_norm": 121.03765832238275, "learning_rate": 9.73738704819277e-07, "logits/chosen": -2.085156202316284, "logits/rejected": -2.0757813453674316, "logps/chosen": -373.1000061035156, "logps/rejected": -373.95001220703125, "loss": 0.6263, "rewards/accuracies": 0.625, "rewards/chosen": -0.1055755615234375, "rewards/margins": 0.3779540956020355, "rewards/rejected": -0.48350828886032104, "step": 280 }, { "epoch": 0.1091867469879518, "grad_norm": 108.71147708439034, "learning_rate": 9.727974397590361e-07, "logits/chosen": -2.07421875, "logits/rejected": -2.0835938453674316, "logps/chosen": -367.95001220703125, "logps/rejected": -386.79998779296875, "loss": 0.5979, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.18942871689796448, "rewards/margins": 0.4923461973667145, "rewards/rejected": -0.682202160358429, "step": 290 }, { "epoch": 0.11295180722891567, "grad_norm": 128.71987771820258, "learning_rate": 9.718561746987952e-07, "logits/chosen": -2.091796875, "logits/rejected": -2.0062499046325684, "logps/chosen": -346.5, "logps/rejected": -391.0, "loss": 0.6209, "rewards/accuracies": 0.625, "rewards/chosen": 0.01620788499712944, "rewards/margins": 0.450277715921402, "rewards/rejected": -0.4339233338832855, "step": 300 }, { "epoch": 0.11671686746987951, "grad_norm": 126.2994026000916, "learning_rate": 9.70914909638554e-07, "logits/chosen": -2.119140625, "logits/rejected": -2.07421875, "logps/chosen": -345.75, "logps/rejected": -401.6000061035156, "loss": 0.6622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07279052585363388, "rewards/margins": 0.37922972440719604, "rewards/rejected": -0.45228272676467896, "step": 310 }, { "epoch": 0.12048192771084337, "grad_norm": 142.82946502024313, "learning_rate": 9.699736445783132e-07, "logits/chosen": -2.000781297683716, "logits/rejected": -1.967187523841858, "logps/chosen": -342.1499938964844, "logps/rejected": -332.2250061035156, "loss": 0.6977, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12893065810203552, "rewards/margins": 0.26261597871780396, "rewards/rejected": -0.391387939453125, "step": 320 }, { "epoch": 0.12424698795180723, "grad_norm": 115.10930836927598, "learning_rate": 9.690323795180722e-07, "logits/chosen": -2.100781202316284, "logits/rejected": -2.087890625, "logps/chosen": -312.875, "logps/rejected": -374.1000061035156, "loss": 0.6158, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.23402099311351776, "rewards/margins": 0.4174438416957855, "rewards/rejected": -0.6509917974472046, "step": 330 }, { "epoch": 0.1280120481927711, "grad_norm": 121.82892565578389, "learning_rate": 9.680911144578313e-07, "logits/chosen": -2.0726561546325684, "logits/rejected": -2.0992188453674316, "logps/chosen": -317.5, "logps/rejected": -384.20001220703125, "loss": 0.5975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.40435791015625, "rewards/margins": 0.5442138910293579, "rewards/rejected": -0.948925793170929, "step": 340 }, { "epoch": 0.13177710843373494, "grad_norm": 111.09261866139548, "learning_rate": 9.671498493975904e-07, "logits/chosen": -2.0855469703674316, "logits/rejected": -2.077343702316284, "logps/chosen": -353.3999938964844, "logps/rejected": -356.54998779296875, "loss": 0.6414, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.48663330078125, "rewards/margins": 0.33433836698532104, "rewards/rejected": -0.8211914300918579, "step": 350 }, { "epoch": 0.1355421686746988, "grad_norm": 124.56296496123919, "learning_rate": 9.662085843373493e-07, "logits/chosen": -2.1304688453674316, "logits/rejected": -2.08984375, "logps/chosen": -318.25, "logps/rejected": -365.3999938964844, "loss": 0.6173, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.09855957329273224, "rewards/margins": 0.4584899842739105, "rewards/rejected": -0.556195080280304, "step": 360 }, { "epoch": 0.13930722891566266, "grad_norm": 116.13025772084133, "learning_rate": 9.652673192771083e-07, "logits/chosen": -2.143749952316284, "logits/rejected": -2.069140672683716, "logps/chosen": -311.8999938964844, "logps/rejected": -360.6499938964844, "loss": 0.575, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.30010986328125, "rewards/margins": 0.49876099824905396, "rewards/rejected": -0.19869080185890198, "step": 370 }, { "epoch": 0.1430722891566265, "grad_norm": 82.27758164044418, "learning_rate": 9.643260542168674e-07, "logits/chosen": -2.0765624046325684, "logits/rejected": -1.96875, "logps/chosen": -297.67498779296875, "logps/rejected": -329.04998779296875, "loss": 0.6017, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.569995105266571, "rewards/margins": 0.4827819764614105, "rewards/rejected": 0.087677001953125, "step": 380 }, { "epoch": 0.14683734939759036, "grad_norm": 115.44383902165703, "learning_rate": 9.633847891566265e-07, "logits/chosen": -2.0390625, "logits/rejected": -2.014843702316284, "logps/chosen": -338.1499938964844, "logps/rejected": -339.79998779296875, "loss": 0.5714, "rewards/accuracies": 0.65625, "rewards/chosen": 0.5312134027481079, "rewards/margins": 0.619677722454071, "rewards/rejected": -0.08856201171875, "step": 390 }, { "epoch": 0.15060240963855423, "grad_norm": 95.17508531296836, "learning_rate": 9.624435240963856e-07, "logits/chosen": -2.0738282203674316, "logits/rejected": -2.032031297683716, "logps/chosen": -336.45001220703125, "logps/rejected": -384.75, "loss": 0.5373, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.2725830078125, "rewards/margins": 0.628155529499054, "rewards/rejected": -0.3562560975551605, "step": 400 }, { "epoch": 0.15436746987951808, "grad_norm": 112.3287445021028, "learning_rate": 9.615022590361447e-07, "logits/chosen": -2.110546827316284, "logits/rejected": -2.095703125, "logps/chosen": -326.125, "logps/rejected": -368.20001220703125, "loss": 0.5445, "rewards/accuracies": 0.6875, "rewards/chosen": -0.12425537407398224, "rewards/margins": 0.627124011516571, "rewards/rejected": -0.7508331537246704, "step": 410 }, { "epoch": 0.15813253012048192, "grad_norm": 105.66504407081169, "learning_rate": 9.605609939759035e-07, "logits/chosen": -2.229296922683716, "logits/rejected": -2.10546875, "logps/chosen": -302.5, "logps/rejected": -362.3500061035156, "loss": 0.7066, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.707080066204071, "rewards/margins": 0.4013915956020355, "rewards/rejected": -1.108496069908142, "step": 420 }, { "epoch": 0.16189759036144577, "grad_norm": 117.04895103692287, "learning_rate": 9.596197289156626e-07, "logits/chosen": -2.080859422683716, "logits/rejected": -2.080859422683716, "logps/chosen": -374.3999938964844, "logps/rejected": -447.75, "loss": 0.6618, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.6052612066268921, "rewards/margins": 0.38519287109375, "rewards/rejected": -0.990039050579071, "step": 430 }, { "epoch": 0.16566265060240964, "grad_norm": 90.34352460185451, "learning_rate": 9.586784638554217e-07, "logits/chosen": -2.0296874046325684, "logits/rejected": -2.0152344703674316, "logps/chosen": -354.20001220703125, "logps/rejected": -359.8999938964844, "loss": 0.6371, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07721557468175888, "rewards/margins": 0.37994384765625, "rewards/rejected": -0.45692747831344604, "step": 440 }, { "epoch": 0.1694277108433735, "grad_norm": 131.63512063348136, "learning_rate": 9.577371987951808e-07, "logits/chosen": -2.022265672683716, "logits/rejected": -1.931640625, "logps/chosen": -411.29998779296875, "logps/rejected": -434.1000061035156, "loss": 0.6798, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.272369384765625, "rewards/margins": 0.36540526151657104, "rewards/rejected": -0.6378418207168579, "step": 450 }, { "epoch": 0.17319277108433734, "grad_norm": 95.64203253857268, "learning_rate": 9.567959337349396e-07, "logits/chosen": -2.005078077316284, "logits/rejected": -2.0015625953674316, "logps/chosen": -352.5, "logps/rejected": -343.6499938964844, "loss": 0.586, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10806884616613388, "rewards/margins": 0.4941039979457855, "rewards/rejected": -0.601269543170929, "step": 460 }, { "epoch": 0.1769578313253012, "grad_norm": 98.76283707341581, "learning_rate": 9.558546686746987e-07, "logits/chosen": -1.964453101158142, "logits/rejected": -1.959375023841858, "logps/chosen": -330.75, "logps/rejected": -366.70001220703125, "loss": 0.5802, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.10303954780101776, "rewards/margins": 0.5168670415878296, "rewards/rejected": -0.4135894775390625, "step": 470 }, { "epoch": 0.18072289156626506, "grad_norm": 101.68497941132625, "learning_rate": 9.549134036144578e-07, "logits/chosen": -2.0054688453674316, "logits/rejected": -2.037890672683716, "logps/chosen": -391.04998779296875, "logps/rejected": -423.6000061035156, "loss": 0.6234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07268066704273224, "rewards/margins": 0.48552244901657104, "rewards/rejected": -0.5577331781387329, "step": 480 }, { "epoch": 0.1844879518072289, "grad_norm": 106.45933036128771, "learning_rate": 9.539721385542169e-07, "logits/chosen": -2.063281297683716, "logits/rejected": -2.0140624046325684, "logps/chosen": -343.45001220703125, "logps/rejected": -377.3999938964844, "loss": 0.619, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.28513795137405396, "rewards/margins": 0.3911071717739105, "rewards/rejected": -0.6759277582168579, "step": 490 }, { "epoch": 0.18825301204819278, "grad_norm": 164.01158798809766, "learning_rate": 9.530308734939758e-07, "logits/chosen": -2.0804686546325684, "logits/rejected": -2.047656297683716, "logps/chosen": -359.54998779296875, "logps/rejected": -408.1000061035156, "loss": 0.6634, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.43244630098342896, "rewards/margins": 0.38850098848342896, "rewards/rejected": -0.821179211139679, "step": 500 }, { "epoch": 0.19201807228915663, "grad_norm": 82.36480838526172, "learning_rate": 9.520896084337348e-07, "logits/chosen": -2.041015625, "logits/rejected": -2.0707030296325684, "logps/chosen": -318.04998779296875, "logps/rejected": -376.95001220703125, "loss": 0.6067, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5701659917831421, "rewards/margins": 0.49418944120407104, "rewards/rejected": -1.064453125, "step": 510 }, { "epoch": 0.19578313253012047, "grad_norm": 99.93361470639375, "learning_rate": 9.511483433734939e-07, "logits/chosen": -2.131640672683716, "logits/rejected": -2.0562500953674316, "logps/chosen": -358.0, "logps/rejected": -375.1000061035156, "loss": 0.6151, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.679492175579071, "rewards/margins": 0.4659667909145355, "rewards/rejected": -1.145117163658142, "step": 520 }, { "epoch": 0.19954819277108435, "grad_norm": 105.70687343091096, "learning_rate": 9.50207078313253e-07, "logits/chosen": -2.079296827316284, "logits/rejected": -2.0855469703674316, "logps/chosen": -328.25, "logps/rejected": -341.25, "loss": 0.6186, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.09735107421875, "rewards/margins": 0.5572509765625, "rewards/rejected": -0.6541992425918579, "step": 530 }, { "epoch": 0.2033132530120482, "grad_norm": 113.1246654031252, "learning_rate": 9.492658132530121e-07, "logits/chosen": -1.932031273841858, "logits/rejected": -1.945703148841858, "logps/chosen": -337.0, "logps/rejected": -372.70001220703125, "loss": 0.6007, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.19091796875, "rewards/margins": 0.5262451171875, "rewards/rejected": -0.3353027403354645, "step": 540 }, { "epoch": 0.20707831325301204, "grad_norm": 118.09463702435889, "learning_rate": 9.48324548192771e-07, "logits/chosen": -1.958984375, "logits/rejected": -1.943359375, "logps/chosen": -341.0, "logps/rejected": -409.95001220703125, "loss": 0.5815, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.18023681640625, "rewards/margins": 0.47685545682907104, "rewards/rejected": -0.29686278104782104, "step": 550 }, { "epoch": 0.21084337349397592, "grad_norm": 140.14770563915707, "learning_rate": 9.473832831325301e-07, "logits/chosen": -1.9367187023162842, "logits/rejected": -1.8839843273162842, "logps/chosen": -355.20001220703125, "logps/rejected": -382.79998779296875, "loss": 0.6534, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08446197211742401, "rewards/margins": 0.36376953125, "rewards/rejected": -0.4483886659145355, "step": 560 }, { "epoch": 0.21460843373493976, "grad_norm": 94.36642589161377, "learning_rate": 9.464420180722891e-07, "logits/chosen": -2.0414061546325684, "logits/rejected": -2.0179686546325684, "logps/chosen": -328.45001220703125, "logps/rejected": -365.54998779296875, "loss": 0.5924, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0142822265625, "rewards/margins": 0.530078113079071, "rewards/rejected": -0.5161072015762329, "step": 570 }, { "epoch": 0.2183734939759036, "grad_norm": 121.85693871991664, "learning_rate": 9.455007530120482e-07, "logits/chosen": -2.0703125, "logits/rejected": -2.021484375, "logps/chosen": -327.75, "logps/rejected": -375.54998779296875, "loss": 0.5642, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.07267455756664276, "rewards/margins": 0.609875500202179, "rewards/rejected": -0.5363189578056335, "step": 580 }, { "epoch": 0.22213855421686746, "grad_norm": 110.51459207995428, "learning_rate": 9.445594879518071e-07, "logits/chosen": -2.0257811546325684, "logits/rejected": -2.034374952316284, "logps/chosen": -347.20001220703125, "logps/rejected": -307.95001220703125, "loss": 0.6876, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.21962890028953552, "rewards/margins": 0.2870849668979645, "rewards/rejected": -0.506457507610321, "step": 590 }, { "epoch": 0.22590361445783133, "grad_norm": 111.3448146876888, "learning_rate": 9.436182228915662e-07, "logits/chosen": -2.047656297683716, "logits/rejected": -2.0374999046325684, "logps/chosen": -338.75, "logps/rejected": -358.45001220703125, "loss": 0.638, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.05327758938074112, "rewards/margins": 0.49720460176467896, "rewards/rejected": -0.4437499940395355, "step": 600 }, { "epoch": 0.22966867469879518, "grad_norm": 111.43892455130297, "learning_rate": 9.426769578313253e-07, "logits/chosen": -2.1031250953674316, "logits/rejected": -2.044921875, "logps/chosen": -386.3500061035156, "logps/rejected": -433.54998779296875, "loss": 0.6017, "rewards/accuracies": 0.625, "rewards/chosen": -0.08312378078699112, "rewards/margins": 0.4941650331020355, "rewards/rejected": -0.5772460699081421, "step": 610 }, { "epoch": 0.23343373493975902, "grad_norm": 101.72240932298538, "learning_rate": 9.417356927710844e-07, "logits/chosen": -1.9816405773162842, "logits/rejected": -1.955078125, "logps/chosen": -313.1499938964844, "logps/rejected": -379.6000061035156, "loss": 0.5437, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.18433228135108948, "rewards/margins": 0.6211913824081421, "rewards/rejected": -0.8047851324081421, "step": 620 }, { "epoch": 0.2371987951807229, "grad_norm": 116.98740920483999, "learning_rate": 9.407944277108434e-07, "logits/chosen": -2.15234375, "logits/rejected": -2.0562500953674316, "logps/chosen": -380.0, "logps/rejected": -381.1499938964844, "loss": 0.6673, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5074218511581421, "rewards/margins": 0.3888610899448395, "rewards/rejected": -0.8956054449081421, "step": 630 }, { "epoch": 0.24096385542168675, "grad_norm": 93.62563353867796, "learning_rate": 9.398531626506023e-07, "logits/chosen": -2.0824217796325684, "logits/rejected": -2.1097655296325684, "logps/chosen": -319.6499938964844, "logps/rejected": -373.75, "loss": 0.6015, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5038086175918579, "rewards/margins": 0.58111572265625, "rewards/rejected": -1.08575439453125, "step": 640 }, { "epoch": 0.2447289156626506, "grad_norm": 116.4234278222081, "learning_rate": 9.389118975903614e-07, "logits/chosen": -2.083984375, "logits/rejected": -2.0542969703674316, "logps/chosen": -407.3999938964844, "logps/rejected": -435.8500061035156, "loss": 0.6428, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.637805163860321, "rewards/margins": 0.5348449945449829, "rewards/rejected": -1.1720702648162842, "step": 650 }, { "epoch": 0.24849397590361447, "grad_norm": 119.12271642145033, "learning_rate": 9.379706325301205e-07, "logits/chosen": -2.0640625953674316, "logits/rejected": -2.0394530296325684, "logps/chosen": -382.04998779296875, "logps/rejected": -384.54998779296875, "loss": 0.7568, "rewards/accuracies": 0.5625, "rewards/chosen": -0.676562488079071, "rewards/margins": 0.18610839545726776, "rewards/rejected": -0.8628906011581421, "step": 660 }, { "epoch": 0.2522590361445783, "grad_norm": 119.71000786666006, "learning_rate": 9.370293674698795e-07, "logits/chosen": -2.1089844703674316, "logits/rejected": -2.018359422683716, "logps/chosen": -358.75, "logps/rejected": -387.04998779296875, "loss": 0.6365, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.46558839082717896, "rewards/margins": 0.46014404296875, "rewards/rejected": -0.925585925579071, "step": 670 }, { "epoch": 0.2560240963855422, "grad_norm": 91.91512901654094, "learning_rate": 9.360881024096385e-07, "logits/chosen": -2.0855469703674316, "logits/rejected": -2.0765624046325684, "logps/chosen": -353.5, "logps/rejected": -388.5, "loss": 0.583, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03619994968175888, "rewards/margins": 0.6088622808456421, "rewards/rejected": -0.572265625, "step": 680 }, { "epoch": 0.25978915662650603, "grad_norm": 112.74927535625835, "learning_rate": 9.351468373493976e-07, "logits/chosen": -2.0687499046325684, "logits/rejected": -2.026171922683716, "logps/chosen": -338.1499938964844, "logps/rejected": -369.95001220703125, "loss": 0.6136, "rewards/accuracies": 0.625, "rewards/chosen": 0.13910523056983948, "rewards/margins": 0.4451049864292145, "rewards/rejected": -0.30577391386032104, "step": 690 }, { "epoch": 0.2635542168674699, "grad_norm": 111.59288334854844, "learning_rate": 9.342055722891565e-07, "logits/chosen": -2.0093750953674316, "logits/rejected": -2.022265672683716, "logps/chosen": -342.95001220703125, "logps/rejected": -361.70001220703125, "loss": 0.7004, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.24398192763328552, "rewards/margins": 0.2760376036167145, "rewards/rejected": -0.03184204176068306, "step": 700 }, { "epoch": 0.26731927710843373, "grad_norm": 113.38014043367615, "learning_rate": 9.332643072289156e-07, "logits/chosen": -1.984765648841858, "logits/rejected": -1.9835937023162842, "logps/chosen": -400.79998779296875, "logps/rejected": -405.75, "loss": 0.5903, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.34354859590530396, "rewards/margins": 0.521191418170929, "rewards/rejected": -0.17800292372703552, "step": 710 }, { "epoch": 0.2710843373493976, "grad_norm": 114.52806980921011, "learning_rate": 9.323230421686746e-07, "logits/chosen": -2.1324219703674316, "logits/rejected": -2.1117186546325684, "logps/chosen": -324.54998779296875, "logps/rejected": -350.8500061035156, "loss": 0.633, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.14516906440258026, "rewards/margins": 0.42537230253219604, "rewards/rejected": -0.28013306856155396, "step": 720 }, { "epoch": 0.2748493975903614, "grad_norm": 95.8421629926595, "learning_rate": 9.313817771084337e-07, "logits/chosen": -2.075000047683716, "logits/rejected": -2.0718750953674316, "logps/chosen": -315.75, "logps/rejected": -357.3500061035156, "loss": 0.5961, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.26127928495407104, "rewards/margins": 0.572467029094696, "rewards/rejected": -0.3110595643520355, "step": 730 }, { "epoch": 0.2786144578313253, "grad_norm": 99.15818974010313, "learning_rate": 9.304405120481927e-07, "logits/chosen": -2.071093797683716, "logits/rejected": -2.073437452316284, "logps/chosen": -334.29998779296875, "logps/rejected": -334.79998779296875, "loss": 0.6369, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3306518495082855, "rewards/margins": 0.418731689453125, "rewards/rejected": -0.0881500244140625, "step": 740 }, { "epoch": 0.28237951807228917, "grad_norm": 127.3956691285135, "learning_rate": 9.294992469879518e-07, "logits/chosen": -2.1363282203674316, "logits/rejected": -2.1148438453674316, "logps/chosen": -342.75, "logps/rejected": -371.45001220703125, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": 0.31694334745407104, "rewards/margins": 0.43876951932907104, "rewards/rejected": -0.122161865234375, "step": 750 }, { "epoch": 0.286144578313253, "grad_norm": 148.20299699012207, "learning_rate": 9.285579819277109e-07, "logits/chosen": -2.135546922683716, "logits/rejected": -2.051562547683716, "logps/chosen": -350.0, "logps/rejected": -412.5, "loss": 0.6129, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22220154106616974, "rewards/margins": 0.4360107481479645, "rewards/rejected": -0.21382446587085724, "step": 760 }, { "epoch": 0.28990963855421686, "grad_norm": 105.32673066983808, "learning_rate": 9.276167168674698e-07, "logits/chosen": -2.05078125, "logits/rejected": -1.976953148841858, "logps/chosen": -342.04998779296875, "logps/rejected": -348.79998779296875, "loss": 0.6244, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.07254638522863388, "rewards/margins": 0.49287110567092896, "rewards/rejected": -0.42033690214157104, "step": 770 }, { "epoch": 0.2936746987951807, "grad_norm": 102.46887236014057, "learning_rate": 9.266754518072288e-07, "logits/chosen": -1.980078101158142, "logits/rejected": -2.0511717796325684, "logps/chosen": -303.25, "logps/rejected": -348.6000061035156, "loss": 0.6114, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.14608153700828552, "rewards/margins": 0.548748791217804, "rewards/rejected": -0.4016265869140625, "step": 780 }, { "epoch": 0.29743975903614456, "grad_norm": 109.64027090787577, "learning_rate": 9.257341867469879e-07, "logits/chosen": -2.048046827316284, "logits/rejected": -2.020312547683716, "logps/chosen": -312.1000061035156, "logps/rejected": -333.95001220703125, "loss": 0.5868, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.16099242866039276, "rewards/margins": 0.5184570550918579, "rewards/rejected": -0.3580566346645355, "step": 790 }, { "epoch": 0.30120481927710846, "grad_norm": 87.37711462305195, "learning_rate": 9.24792921686747e-07, "logits/chosen": -1.997656226158142, "logits/rejected": -1.994531273841858, "logps/chosen": -313.79998779296875, "logps/rejected": -346.8999938964844, "loss": 0.6167, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.2213897705078125, "rewards/margins": 0.45489501953125, "rewards/rejected": -0.233123779296875, "step": 800 }, { "epoch": 0.3049698795180723, "grad_norm": 87.12115180114708, "learning_rate": 9.23851656626506e-07, "logits/chosen": -1.962890625, "logits/rejected": -1.990234375, "logps/chosen": -310.95001220703125, "logps/rejected": -360.8999938964844, "loss": 0.5987, "rewards/accuracies": 0.625, "rewards/chosen": 0.44218748807907104, "rewards/margins": 0.641064465045929, "rewards/rejected": -0.19816894829273224, "step": 810 }, { "epoch": 0.30873493975903615, "grad_norm": 141.5359228709243, "learning_rate": 9.22910391566265e-07, "logits/chosen": -2.0492186546325684, "logits/rejected": -1.91015625, "logps/chosen": -333.0, "logps/rejected": -413.79998779296875, "loss": 0.6376, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.5743163824081421, "rewards/margins": 0.47307127714157104, "rewards/rejected": 0.10090331733226776, "step": 820 }, { "epoch": 0.3125, "grad_norm": 118.25886427710564, "learning_rate": 9.219691265060241e-07, "logits/chosen": -2.112109422683716, "logits/rejected": -2.041796922683716, "logps/chosen": -364.54998779296875, "logps/rejected": -407.29998779296875, "loss": 0.5739, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.591632068157196, "rewards/margins": 0.6204589605331421, "rewards/rejected": -0.02885131910443306, "step": 830 }, { "epoch": 0.31626506024096385, "grad_norm": 150.41335178791348, "learning_rate": 9.210278614457831e-07, "logits/chosen": -2.026562452316284, "logits/rejected": -1.963281273841858, "logps/chosen": -377.0, "logps/rejected": -380.45001220703125, "loss": 0.597, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.642285168170929, "rewards/margins": 0.584667980670929, "rewards/rejected": 0.05782470852136612, "step": 840 }, { "epoch": 0.3200301204819277, "grad_norm": 67.35371990003739, "learning_rate": 9.20086596385542e-07, "logits/chosen": -1.9621093273162842, "logits/rejected": -1.9519531726837158, "logps/chosen": -367.1499938964844, "logps/rejected": -407.3999938964844, "loss": 0.6029, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.19569091498851776, "rewards/margins": 0.5557311773300171, "rewards/rejected": -0.36027830839157104, "step": 850 }, { "epoch": 0.32379518072289154, "grad_norm": 103.19480932480253, "learning_rate": 9.191453313253011e-07, "logits/chosen": -2.1195311546325684, "logits/rejected": -2.09765625, "logps/chosen": -359.79998779296875, "logps/rejected": -391.0, "loss": 0.6363, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.05950622633099556, "rewards/margins": 0.4863525331020355, "rewards/rejected": -0.42661744356155396, "step": 860 }, { "epoch": 0.32756024096385544, "grad_norm": 120.91420810883585, "learning_rate": 9.182040662650602e-07, "logits/chosen": -2.0257811546325684, "logits/rejected": -1.955078125, "logps/chosen": -325.95001220703125, "logps/rejected": -350.79998779296875, "loss": 0.6752, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03216857835650444, "rewards/margins": 0.34625244140625, "rewards/rejected": -0.31464844942092896, "step": 870 }, { "epoch": 0.3313253012048193, "grad_norm": 134.6136782306288, "learning_rate": 9.172628012048193e-07, "logits/chosen": -2.055859327316284, "logits/rejected": -2.060546875, "logps/chosen": -321.29998779296875, "logps/rejected": -361.45001220703125, "loss": 0.5981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.11828003078699112, "rewards/margins": 0.5791870355606079, "rewards/rejected": -0.4615280032157898, "step": 880 }, { "epoch": 0.33509036144578314, "grad_norm": 105.28974972182175, "learning_rate": 9.163215361445783e-07, "logits/chosen": -2.033984422683716, "logits/rejected": -2.0152344703674316, "logps/chosen": -382.0, "logps/rejected": -423.04998779296875, "loss": 0.6012, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.2629150450229645, "rewards/margins": 0.642041027545929, "rewards/rejected": -0.37921142578125, "step": 890 }, { "epoch": 0.338855421686747, "grad_norm": 114.27006532026437, "learning_rate": 9.153802710843373e-07, "logits/chosen": -2.0511717796325684, "logits/rejected": -1.9675781726837158, "logps/chosen": -315.8500061035156, "logps/rejected": -361.29998779296875, "loss": 0.5965, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.34998780488967896, "rewards/margins": 0.5501464605331421, "rewards/rejected": -0.19936522841453552, "step": 900 }, { "epoch": 0.34262048192771083, "grad_norm": 102.02580140568612, "learning_rate": 9.144390060240963e-07, "logits/chosen": -2.0355467796325684, "logits/rejected": -1.9539062976837158, "logps/chosen": -319.9750061035156, "logps/rejected": -385.29998779296875, "loss": 0.6499, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.30235594511032104, "rewards/margins": 0.61090087890625, "rewards/rejected": -0.3081298768520355, "step": 910 }, { "epoch": 0.3463855421686747, "grad_norm": 174.86739191684097, "learning_rate": 9.134977409638554e-07, "logits/chosen": -2.044140577316284, "logits/rejected": -2.055468797683716, "logps/chosen": -387.3999938964844, "logps/rejected": -397.25, "loss": 0.5868, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3239990174770355, "rewards/margins": 0.509814441204071, "rewards/rejected": -0.18614502251148224, "step": 920 }, { "epoch": 0.3501506024096386, "grad_norm": 89.08602918653729, "learning_rate": 9.125564759036144e-07, "logits/chosen": -2.032031297683716, "logits/rejected": -1.9601562023162842, "logps/chosen": -321.25, "logps/rejected": -342.75, "loss": 0.6039, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.37680357694625854, "rewards/margins": 0.513928234577179, "rewards/rejected": -0.13718262314796448, "step": 930 }, { "epoch": 0.3539156626506024, "grad_norm": 85.40871075758267, "learning_rate": 9.116152108433734e-07, "logits/chosen": -2.0113282203674316, "logits/rejected": -2.0296874046325684, "logps/chosen": -314.75, "logps/rejected": -372.75, "loss": 0.6122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.405029296875, "rewards/margins": 0.599072277545929, "rewards/rejected": -0.194183349609375, "step": 940 }, { "epoch": 0.35768072289156627, "grad_norm": 136.39625568760795, "learning_rate": 9.106739457831325e-07, "logits/chosen": -2.0015625953674316, "logits/rejected": -1.991796851158142, "logps/chosen": -380.54998779296875, "logps/rejected": -369.45001220703125, "loss": 0.6068, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.3829406797885895, "rewards/margins": 0.5194091796875, "rewards/rejected": -0.13634033501148224, "step": 950 }, { "epoch": 0.3614457831325301, "grad_norm": 129.11234539806668, "learning_rate": 9.097326807228916e-07, "logits/chosen": -2.1214842796325684, "logits/rejected": -2.076171875, "logps/chosen": -339.70001220703125, "logps/rejected": -372.6000061035156, "loss": 0.6093, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5345703363418579, "rewards/margins": 0.46947020292282104, "rewards/rejected": 0.06488647311925888, "step": 960 }, { "epoch": 0.36521084337349397, "grad_norm": 135.24002300345398, "learning_rate": 9.087914156626506e-07, "logits/chosen": -2.030468702316284, "logits/rejected": -1.9617187976837158, "logps/chosen": -308.0, "logps/rejected": -376.79998779296875, "loss": 0.5907, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4830078184604645, "rewards/margins": 0.596606433391571, "rewards/rejected": -0.11273193359375, "step": 970 }, { "epoch": 0.3689759036144578, "grad_norm": 88.93404958990655, "learning_rate": 9.078501506024095e-07, "logits/chosen": -2.047656297683716, "logits/rejected": -2.024609327316284, "logps/chosen": -378.45001220703125, "logps/rejected": -383.79998779296875, "loss": 0.5949, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3139404356479645, "rewards/margins": 0.506390392780304, "rewards/rejected": -0.19216307997703552, "step": 980 }, { "epoch": 0.3727409638554217, "grad_norm": 110.73307373146505, "learning_rate": 9.069088855421686e-07, "logits/chosen": -2.001171827316284, "logits/rejected": -1.929296851158142, "logps/chosen": -338.67498779296875, "logps/rejected": -364.54998779296875, "loss": 0.5443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.28386229276657104, "rewards/margins": 0.7441650629043579, "rewards/rejected": -0.4609619081020355, "step": 990 }, { "epoch": 0.37650602409638556, "grad_norm": 110.99510883752451, "learning_rate": 9.059676204819276e-07, "logits/chosen": -2.037109375, "logits/rejected": -1.94140625, "logps/chosen": -339.3500061035156, "logps/rejected": -386.75, "loss": 0.5924, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.2621093690395355, "rewards/margins": 0.5776122808456421, "rewards/rejected": -0.3167480528354645, "step": 1000 }, { "epoch": 0.3802710843373494, "grad_norm": 104.86492983547298, "learning_rate": 9.050263554216867e-07, "logits/chosen": -1.9988281726837158, "logits/rejected": -1.917578101158142, "logps/chosen": -341.6000061035156, "logps/rejected": -354.1499938964844, "loss": 0.6866, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.08772888034582138, "rewards/margins": 0.44221192598342896, "rewards/rejected": -0.35433655977249146, "step": 1010 }, { "epoch": 0.38403614457831325, "grad_norm": 108.8164749206756, "learning_rate": 9.040850903614458e-07, "logits/chosen": -2.044921875, "logits/rejected": -1.966406226158142, "logps/chosen": -336.1000061035156, "logps/rejected": -392.45001220703125, "loss": 0.6565, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.2729248106479645, "rewards/margins": 0.4714111387729645, "rewards/rejected": -0.19834594428539276, "step": 1020 }, { "epoch": 0.3878012048192771, "grad_norm": 116.04337769686741, "learning_rate": 9.031438253012048e-07, "logits/chosen": -2.0863280296325684, "logits/rejected": -1.928125023841858, "logps/chosen": -324.04998779296875, "logps/rejected": -375.3500061035156, "loss": 0.5881, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.3542114198207855, "rewards/margins": 0.593017578125, "rewards/rejected": -0.23896484076976776, "step": 1030 }, { "epoch": 0.39156626506024095, "grad_norm": 93.82826775039412, "learning_rate": 9.022025602409638e-07, "logits/chosen": -1.943750023841858, "logits/rejected": -1.9480469226837158, "logps/chosen": -338.29998779296875, "logps/rejected": -363.8999938964844, "loss": 0.5539, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3850540220737457, "rewards/margins": 0.6250976324081421, "rewards/rejected": -0.24038085341453552, "step": 1040 }, { "epoch": 0.3953313253012048, "grad_norm": 152.3630339032053, "learning_rate": 9.012612951807228e-07, "logits/chosen": -2.0621094703674316, "logits/rejected": -2.0667967796325684, "logps/chosen": -365.8500061035156, "logps/rejected": -360.1499938964844, "loss": 0.6706, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3727050721645355, "rewards/margins": 0.40118408203125, "rewards/rejected": -0.02875976637005806, "step": 1050 }, { "epoch": 0.3990963855421687, "grad_norm": 118.3501096907846, "learning_rate": 9.003200301204819e-07, "logits/chosen": -2.0609374046325684, "logits/rejected": -2.0859375, "logps/chosen": -403.45001220703125, "logps/rejected": -426.79998779296875, "loss": 0.5946, "rewards/accuracies": 0.625, "rewards/chosen": 0.2962646484375, "rewards/margins": 0.5477294921875, "rewards/rejected": -0.25138550996780396, "step": 1060 }, { "epoch": 0.40286144578313254, "grad_norm": 123.40930803383038, "learning_rate": 8.99378765060241e-07, "logits/chosen": -2.092578172683716, "logits/rejected": -2.0679688453674316, "logps/chosen": -362.75, "logps/rejected": -381.3500061035156, "loss": 0.6248, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20849609375, "rewards/margins": 0.5634399652481079, "rewards/rejected": -0.3543457090854645, "step": 1070 }, { "epoch": 0.4066265060240964, "grad_norm": 85.68560512857552, "learning_rate": 8.984374999999999e-07, "logits/chosen": -1.974609375, "logits/rejected": -2.033984422683716, "logps/chosen": -319.6499938964844, "logps/rejected": -338.29998779296875, "loss": 0.5392, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.37971192598342896, "rewards/margins": 0.663134753704071, "rewards/rejected": -0.283294677734375, "step": 1080 }, { "epoch": 0.41039156626506024, "grad_norm": 104.14749894241699, "learning_rate": 8.97496234939759e-07, "logits/chosen": -2.09375, "logits/rejected": -2.076171875, "logps/chosen": -342.45001220703125, "logps/rejected": -393.79998779296875, "loss": 0.61, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.48540037870407104, "rewards/margins": 0.543164074420929, "rewards/rejected": -0.057708740234375, "step": 1090 }, { "epoch": 0.4141566265060241, "grad_norm": 105.82220309961075, "learning_rate": 8.965549698795181e-07, "logits/chosen": -2.072265625, "logits/rejected": -1.957421898841858, "logps/chosen": -376.8999938964844, "logps/rejected": -449.1499938964844, "loss": 0.573, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5472656488418579, "rewards/margins": 0.6667724847793579, "rewards/rejected": -0.11970214545726776, "step": 1100 }, { "epoch": 0.41792168674698793, "grad_norm": 116.25729589258705, "learning_rate": 8.956137048192772e-07, "logits/chosen": -2.076171875, "logits/rejected": -2.087890625, "logps/chosen": -317.0, "logps/rejected": -420.75, "loss": 0.575, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22136230766773224, "rewards/margins": 0.7445312738418579, "rewards/rejected": -0.523547351360321, "step": 1110 }, { "epoch": 0.42168674698795183, "grad_norm": 135.77099252412688, "learning_rate": 8.94672439759036e-07, "logits/chosen": -2.05859375, "logits/rejected": -1.9484374523162842, "logps/chosen": -322.7250061035156, "logps/rejected": -378.5, "loss": 0.6043, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07266845554113388, "rewards/margins": 0.633313000202179, "rewards/rejected": -0.705517590045929, "step": 1120 }, { "epoch": 0.4254518072289157, "grad_norm": 119.3128309809916, "learning_rate": 8.937311746987951e-07, "logits/chosen": -2.0328125953674316, "logits/rejected": -2.040234327316284, "logps/chosen": -347.6000061035156, "logps/rejected": -372.20001220703125, "loss": 0.569, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.12564697861671448, "rewards/margins": 0.724316418170929, "rewards/rejected": -0.8490234613418579, "step": 1130 }, { "epoch": 0.4292168674698795, "grad_norm": 103.27306834967854, "learning_rate": 8.927899096385542e-07, "logits/chosen": -1.999609351158142, "logits/rejected": -2.0023436546325684, "logps/chosen": -337.29998779296875, "logps/rejected": -385.8999938964844, "loss": 0.6245, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.22938232123851776, "rewards/margins": 0.8289550542831421, "rewards/rejected": -1.057348608970642, "step": 1140 }, { "epoch": 0.4329819277108434, "grad_norm": 69.52772792868703, "learning_rate": 8.918486445783133e-07, "logits/chosen": -2.049999952316284, "logits/rejected": -2.0570311546325684, "logps/chosen": -343.25, "logps/rejected": -347.8999938964844, "loss": 0.5242, "rewards/accuracies": 0.75, "rewards/chosen": 0.04838867112994194, "rewards/margins": 0.7650390863418579, "rewards/rejected": -0.717547595500946, "step": 1150 }, { "epoch": 0.4367469879518072, "grad_norm": 148.4443601909508, "learning_rate": 8.909073795180722e-07, "logits/chosen": -1.990234375, "logits/rejected": -1.927734375, "logps/chosen": -339.8999938964844, "logps/rejected": -400.29998779296875, "loss": 0.716, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3299560546875, "rewards/margins": 0.42500001192092896, "rewards/rejected": -0.7550293207168579, "step": 1160 }, { "epoch": 0.44051204819277107, "grad_norm": 118.04466419687076, "learning_rate": 8.899661144578313e-07, "logits/chosen": -1.994140625, "logits/rejected": -1.9773437976837158, "logps/chosen": -408.6499938964844, "logps/rejected": -394.29998779296875, "loss": 0.6527, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0653533935546875, "rewards/margins": 0.7203003168106079, "rewards/rejected": -0.78564453125, "step": 1170 }, { "epoch": 0.4442771084337349, "grad_norm": 75.62556447864961, "learning_rate": 8.890248493975904e-07, "logits/chosen": -1.9871094226837158, "logits/rejected": -1.8996093273162842, "logps/chosen": -288.57501220703125, "logps/rejected": -363.32501220703125, "loss": 0.6029, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.16551513969898224, "rewards/margins": 0.6287597417831421, "rewards/rejected": -0.795458972454071, "step": 1180 }, { "epoch": 0.4480421686746988, "grad_norm": 120.7488554341746, "learning_rate": 8.880835843373493e-07, "logits/chosen": -2.0804686546325684, "logits/rejected": -2.080078125, "logps/chosen": -352.75, "logps/rejected": -386.04998779296875, "loss": 0.5407, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.06685791164636612, "rewards/margins": 0.773327648639679, "rewards/rejected": -0.8401855230331421, "step": 1190 }, { "epoch": 0.45180722891566266, "grad_norm": 121.06970111771686, "learning_rate": 8.871423192771083e-07, "logits/chosen": -2.106250047683716, "logits/rejected": -1.9796874523162842, "logps/chosen": -323.79998779296875, "logps/rejected": -389.20001220703125, "loss": 0.6525, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3145996034145355, "rewards/margins": 0.5687500238418579, "rewards/rejected": -0.882861316204071, "step": 1200 }, { "epoch": 0.4555722891566265, "grad_norm": 107.21905467915113, "learning_rate": 8.862010542168674e-07, "logits/chosen": -2.1324219703674316, "logits/rejected": -2.056640625, "logps/chosen": -342.3999938964844, "logps/rejected": -374.25, "loss": 0.6403, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.515454113483429, "rewards/margins": 0.5108886957168579, "rewards/rejected": -1.026068091392517, "step": 1210 }, { "epoch": 0.45933734939759036, "grad_norm": 109.66982567044258, "learning_rate": 8.852597891566265e-07, "logits/chosen": -2.108203172683716, "logits/rejected": -2.0855469703674316, "logps/chosen": -365.0, "logps/rejected": -385.1000061035156, "loss": 0.5583, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3344970643520355, "rewards/margins": 0.679882824420929, "rewards/rejected": -1.015039086341858, "step": 1220 }, { "epoch": 0.4631024096385542, "grad_norm": 178.9956974816963, "learning_rate": 8.843185240963855e-07, "logits/chosen": -2.1207032203674316, "logits/rejected": -2.1410155296325684, "logps/chosen": -341.95001220703125, "logps/rejected": -352.6499938964844, "loss": 0.6169, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12770995497703552, "rewards/margins": 0.6989074945449829, "rewards/rejected": -0.8266845941543579, "step": 1230 }, { "epoch": 0.46686746987951805, "grad_norm": 79.38245087967344, "learning_rate": 8.833772590361446e-07, "logits/chosen": -2.1136717796325684, "logits/rejected": -2.193359375, "logps/chosen": -349.17498779296875, "logps/rejected": -375.42498779296875, "loss": 0.587, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.16783447563648224, "rewards/margins": 0.698834240436554, "rewards/rejected": -0.5306762456893921, "step": 1240 }, { "epoch": 0.47063253012048195, "grad_norm": 112.32193578639642, "learning_rate": 8.824359939759036e-07, "logits/chosen": -2.139843702316284, "logits/rejected": -2.11328125, "logps/chosen": -335.3500061035156, "logps/rejected": -390.0, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": 0.1309814453125, "rewards/margins": 0.4751525819301605, "rewards/rejected": -0.3438476622104645, "step": 1250 }, { "epoch": 0.4743975903614458, "grad_norm": 85.82919962102238, "learning_rate": 8.814947289156626e-07, "logits/chosen": -2.014843702316284, "logits/rejected": -1.978124976158142, "logps/chosen": -321.07501220703125, "logps/rejected": -366.45001220703125, "loss": 0.5564, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1534423828125, "rewards/margins": 0.7531585693359375, "rewards/rejected": -0.5997070074081421, "step": 1260 }, { "epoch": 0.47816265060240964, "grad_norm": 97.46728352823442, "learning_rate": 8.805534638554216e-07, "logits/chosen": -2.1585936546325684, "logits/rejected": -2.0953125953674316, "logps/chosen": -319.95001220703125, "logps/rejected": -388.6000061035156, "loss": 0.64, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.04353637620806694, "rewards/margins": 0.537182629108429, "rewards/rejected": -0.494140625, "step": 1270 }, { "epoch": 0.4819277108433735, "grad_norm": 94.96379075340379, "learning_rate": 8.796121987951807e-07, "logits/chosen": -2.1109375953674316, "logits/rejected": -2.1011719703674316, "logps/chosen": -339.25, "logps/rejected": -375.5, "loss": 0.6708, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.06431885063648224, "rewards/margins": 0.49882811307907104, "rewards/rejected": -0.563159167766571, "step": 1280 }, { "epoch": 0.48569277108433734, "grad_norm": 102.40860176332323, "learning_rate": 8.786709337349397e-07, "logits/chosen": -2.0972657203674316, "logits/rejected": -2.0746092796325684, "logps/chosen": -317.57501220703125, "logps/rejected": -351.29998779296875, "loss": 0.599, "rewards/accuracies": 0.625, "rewards/chosen": 0.20717772841453552, "rewards/margins": 0.547192394733429, "rewards/rejected": -0.34022217988967896, "step": 1290 }, { "epoch": 0.4894578313253012, "grad_norm": 123.61184760724956, "learning_rate": 8.777296686746988e-07, "logits/chosen": -2.087109327316284, "logits/rejected": -2.057812452316284, "logps/chosen": -340.8500061035156, "logps/rejected": -373.45001220703125, "loss": 0.6134, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.23900146782398224, "rewards/margins": 0.6482299566268921, "rewards/rejected": -0.4084228575229645, "step": 1300 }, { "epoch": 0.4932228915662651, "grad_norm": 100.7902725074793, "learning_rate": 8.767884036144578e-07, "logits/chosen": -1.9480469226837158, "logits/rejected": -1.974218726158142, "logps/chosen": -351.45001220703125, "logps/rejected": -417.25, "loss": 0.5471, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20999756455421448, "rewards/margins": 0.7735351324081421, "rewards/rejected": -0.563159167766571, "step": 1310 }, { "epoch": 0.49698795180722893, "grad_norm": 136.1813404916251, "learning_rate": 8.758471385542169e-07, "logits/chosen": -2.03125, "logits/rejected": -2.010546922683716, "logps/chosen": -305.8500061035156, "logps/rejected": -341.17498779296875, "loss": 0.6229, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3778076171875, "rewards/margins": 0.5752319097518921, "rewards/rejected": -0.19819335639476776, "step": 1320 }, { "epoch": 0.5007530120481928, "grad_norm": 115.40945360497435, "learning_rate": 8.749058734939759e-07, "logits/chosen": -2.03515625, "logits/rejected": -2.0531249046325684, "logps/chosen": -346.6000061035156, "logps/rejected": -380.6499938964844, "loss": 0.6333, "rewards/accuracies": 0.65625, "rewards/chosen": 0.24533692002296448, "rewards/margins": 0.47966307401657104, "rewards/rejected": -0.23417358100414276, "step": 1330 }, { "epoch": 0.5045180722891566, "grad_norm": 103.7645023868546, "learning_rate": 8.739646084337348e-07, "logits/chosen": -2.1871094703674316, "logits/rejected": -2.069140672683716, "logps/chosen": -323.8999938964844, "logps/rejected": -348.25, "loss": 0.6364, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.25328367948532104, "rewards/margins": 0.45585936307907104, "rewards/rejected": -0.20268554985523224, "step": 1340 }, { "epoch": 0.5082831325301205, "grad_norm": 105.85572050880074, "learning_rate": 8.730233433734939e-07, "logits/chosen": -2.10546875, "logits/rejected": -2.0972657203674316, "logps/chosen": -315.1499938964844, "logps/rejected": -320.45001220703125, "loss": 0.6077, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.23701171576976776, "rewards/margins": 0.533862292766571, "rewards/rejected": -0.296722412109375, "step": 1350 }, { "epoch": 0.5120481927710844, "grad_norm": 106.85505283553186, "learning_rate": 8.72082078313253e-07, "logits/chosen": -2.1136717796325684, "logits/rejected": -2.1226563453674316, "logps/chosen": -345.5, "logps/rejected": -390.20001220703125, "loss": 0.6112, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04832763597369194, "rewards/margins": 0.60302734375, "rewards/rejected": -0.554186999797821, "step": 1360 }, { "epoch": 0.5158132530120482, "grad_norm": 91.30413354430902, "learning_rate": 8.711408132530121e-07, "logits/chosen": -2.1773438453674316, "logits/rejected": -2.1441407203674316, "logps/chosen": -340.1000061035156, "logps/rejected": -346.1499938964844, "loss": 0.6485, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03095092810690403, "rewards/margins": 0.4761718809604645, "rewards/rejected": -0.506909191608429, "step": 1370 }, { "epoch": 0.5195783132530121, "grad_norm": 83.6963660104711, "learning_rate": 8.70199548192771e-07, "logits/chosen": -1.9992187023162842, "logits/rejected": -1.94140625, "logps/chosen": -308.8500061035156, "logps/rejected": -385.1000061035156, "loss": 0.6276, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.08549804985523224, "rewards/margins": 0.555615246295929, "rewards/rejected": -0.640869140625, "step": 1380 }, { "epoch": 0.5233433734939759, "grad_norm": 151.4132004465058, "learning_rate": 8.692582831325301e-07, "logits/chosen": -1.990234375, "logits/rejected": -1.9152343273162842, "logps/chosen": -332.1000061035156, "logps/rejected": -387.8999938964844, "loss": 0.6283, "rewards/accuracies": 0.625, "rewards/chosen": -0.03709106519818306, "rewards/margins": 0.6053100824356079, "rewards/rejected": -0.642773449420929, "step": 1390 }, { "epoch": 0.5271084337349398, "grad_norm": 156.58676917518773, "learning_rate": 8.683170180722891e-07, "logits/chosen": -2.102734327316284, "logits/rejected": -2.080859422683716, "logps/chosen": -398.79998779296875, "logps/rejected": -424.1499938964844, "loss": 0.6086, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.09279785305261612, "rewards/margins": 0.548754870891571, "rewards/rejected": -0.641552746295929, "step": 1400 }, { "epoch": 0.5308734939759037, "grad_norm": 178.2704641666902, "learning_rate": 8.673757530120482e-07, "logits/chosen": -2.033984422683716, "logits/rejected": -2.013671875, "logps/chosen": -329.8500061035156, "logps/rejected": -356.3999938964844, "loss": 0.6149, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.013806152157485485, "rewards/margins": 0.546679675579071, "rewards/rejected": -0.5604522824287415, "step": 1410 }, { "epoch": 0.5346385542168675, "grad_norm": 159.01927071338855, "learning_rate": 8.664344879518071e-07, "logits/chosen": -2.024218797683716, "logits/rejected": -2.0546875, "logps/chosen": -363.25, "logps/rejected": -393.70001220703125, "loss": 0.6608, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09876708686351776, "rewards/margins": 0.4318603575229645, "rewards/rejected": -0.5301147699356079, "step": 1420 }, { "epoch": 0.5384036144578314, "grad_norm": 121.32831523455977, "learning_rate": 8.654932228915662e-07, "logits/chosen": -2.020312547683716, "logits/rejected": -2.075390577316284, "logps/chosen": -338.3999938964844, "logps/rejected": -379.0, "loss": 0.5675, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01300048828125, "rewards/margins": 0.6390136480331421, "rewards/rejected": -0.6519531011581421, "step": 1430 }, { "epoch": 0.5421686746987951, "grad_norm": 119.34425559031715, "learning_rate": 8.645519578313253e-07, "logits/chosen": -2.141406297683716, "logits/rejected": -2.075000047683716, "logps/chosen": -366.95001220703125, "logps/rejected": -404.5, "loss": 0.6402, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1974639892578125, "rewards/margins": 0.5236877202987671, "rewards/rejected": -0.7206054925918579, "step": 1440 }, { "epoch": 0.545933734939759, "grad_norm": 69.21002743426132, "learning_rate": 8.636106927710844e-07, "logits/chosen": -2.0859375, "logits/rejected": -2.02734375, "logps/chosen": -318.6000061035156, "logps/rejected": -368.45001220703125, "loss": 0.5855, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.02966003492474556, "rewards/margins": 0.556689441204071, "rewards/rejected": -0.527355968952179, "step": 1450 }, { "epoch": 0.5496987951807228, "grad_norm": 67.63538127644517, "learning_rate": 8.626694277108434e-07, "logits/chosen": -1.978124976158142, "logits/rejected": -2.0464844703674316, "logps/chosen": -336.07501220703125, "logps/rejected": -382.3999938964844, "loss": 0.5382, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.01627807691693306, "rewards/margins": 0.769482433795929, "rewards/rejected": -0.752636730670929, "step": 1460 }, { "epoch": 0.5534638554216867, "grad_norm": 112.26416655029841, "learning_rate": 8.617281626506023e-07, "logits/chosen": -2.0707030296325684, "logits/rejected": -2.00390625, "logps/chosen": -336.6000061035156, "logps/rejected": -386.0, "loss": 0.5804, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.00990905798971653, "rewards/margins": 0.665087878704071, "rewards/rejected": -0.6541748046875, "step": 1470 }, { "epoch": 0.5572289156626506, "grad_norm": 104.4700697318105, "learning_rate": 8.607868975903614e-07, "logits/chosen": -2.03125, "logits/rejected": -2.042187452316284, "logps/chosen": -315.1499938964844, "logps/rejected": -336.29998779296875, "loss": 0.6156, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.1114501953125, "rewards/margins": 0.673876941204071, "rewards/rejected": -0.5624023675918579, "step": 1480 }, { "epoch": 0.5609939759036144, "grad_norm": 75.7933511076787, "learning_rate": 8.598456325301204e-07, "logits/chosen": -1.989843726158142, "logits/rejected": -2.0003905296325684, "logps/chosen": -348.6000061035156, "logps/rejected": -448.0, "loss": 0.5249, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.19879150390625, "rewards/margins": 0.8953857421875, "rewards/rejected": -0.695751965045929, "step": 1490 }, { "epoch": 0.5647590361445783, "grad_norm": 116.67926678251548, "learning_rate": 8.589043674698795e-07, "logits/chosen": -1.9816405773162842, "logits/rejected": -1.990234375, "logps/chosen": -351.6499938964844, "logps/rejected": -352.70001220703125, "loss": 0.5366, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17684325575828552, "rewards/margins": 0.760986328125, "rewards/rejected": -0.5850585699081421, "step": 1500 }, { "epoch": 0.5685240963855421, "grad_norm": 110.51997711103142, "learning_rate": 8.579631024096385e-07, "logits/chosen": -2.1011719703674316, "logits/rejected": -2.1019530296325684, "logps/chosen": -338.95001220703125, "logps/rejected": -401.95001220703125, "loss": 0.5673, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.3445983827114105, "rewards/margins": 0.689746081829071, "rewards/rejected": -0.34569090604782104, "step": 1510 }, { "epoch": 0.572289156626506, "grad_norm": 124.20420903673059, "learning_rate": 8.570218373493976e-07, "logits/chosen": -1.933203101158142, "logits/rejected": -1.9386718273162842, "logps/chosen": -358.95001220703125, "logps/rejected": -384.3999938964844, "loss": 0.6115, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.33088988065719604, "rewards/margins": 0.653247058391571, "rewards/rejected": -0.32208251953125, "step": 1520 }, { "epoch": 0.5760542168674698, "grad_norm": 97.2429751837015, "learning_rate": 8.560805722891565e-07, "logits/chosen": -1.9480469226837158, "logits/rejected": -1.9152343273162842, "logps/chosen": -295.20001220703125, "logps/rejected": -325.25, "loss": 0.6303, "rewards/accuracies": 0.625, "rewards/chosen": 0.15561524033546448, "rewards/margins": 0.4826904237270355, "rewards/rejected": -0.32734376192092896, "step": 1530 }, { "epoch": 0.5798192771084337, "grad_norm": 137.62875002509705, "learning_rate": 8.551393072289156e-07, "logits/chosen": -1.980859398841858, "logits/rejected": -1.9777343273162842, "logps/chosen": -404.5, "logps/rejected": -387.20001220703125, "loss": 0.6005, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.21307373046875, "rewards/margins": 0.567218005657196, "rewards/rejected": -0.35338133573532104, "step": 1540 }, { "epoch": 0.5835843373493976, "grad_norm": 135.88455225945475, "learning_rate": 8.541980421686747e-07, "logits/chosen": -1.989843726158142, "logits/rejected": -1.9304687976837158, "logps/chosen": -391.3999938964844, "logps/rejected": -419.8999938964844, "loss": 0.5911, "rewards/accuracies": 0.6875, "rewards/chosen": 0.29320067167282104, "rewards/margins": 0.756884753704071, "rewards/rejected": -0.4635681211948395, "step": 1550 }, { "epoch": 0.5873493975903614, "grad_norm": 165.5377066431754, "learning_rate": 8.532567771084337e-07, "logits/chosen": -1.9832031726837158, "logits/rejected": -1.9640624523162842, "logps/chosen": -367.7749938964844, "logps/rejected": -414.8500061035156, "loss": 0.5827, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.14482422173023224, "rewards/margins": 0.722949206829071, "rewards/rejected": -0.57708740234375, "step": 1560 }, { "epoch": 0.5911144578313253, "grad_norm": 139.75236964157529, "learning_rate": 8.523155120481927e-07, "logits/chosen": -1.9894530773162842, "logits/rejected": -1.94140625, "logps/chosen": -333.6000061035156, "logps/rejected": -377.79998779296875, "loss": 0.5734, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.14589843153953552, "rewards/margins": 0.76922607421875, "rewards/rejected": -0.6228271722793579, "step": 1570 }, { "epoch": 0.5948795180722891, "grad_norm": 99.3816325657316, "learning_rate": 8.513742469879518e-07, "logits/chosen": -1.9500000476837158, "logits/rejected": -1.9949219226837158, "logps/chosen": -362.1499938964844, "logps/rejected": -355.25, "loss": 0.537, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.19130858778953552, "rewards/margins": 0.8483520746231079, "rewards/rejected": -0.6573547124862671, "step": 1580 }, { "epoch": 0.598644578313253, "grad_norm": 101.51993335489229, "learning_rate": 8.504329819277109e-07, "logits/chosen": -1.996484398841858, "logits/rejected": -1.9738280773162842, "logps/chosen": -333.75, "logps/rejected": -388.8500061035156, "loss": 0.5691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21005859971046448, "rewards/margins": 0.8822265863418579, "rewards/rejected": -0.6720992922782898, "step": 1590 }, { "epoch": 0.6024096385542169, "grad_norm": 85.1286841796459, "learning_rate": 8.494917168674698e-07, "logits/chosen": -1.976171851158142, "logits/rejected": -2.001953125, "logps/chosen": -347.79998779296875, "logps/rejected": -402.54998779296875, "loss": 0.6725, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07089538872241974, "rewards/margins": 0.576947033405304, "rewards/rejected": -0.505688488483429, "step": 1600 }, { "epoch": 0.6061746987951807, "grad_norm": 95.45366655572853, "learning_rate": 8.485504518072288e-07, "logits/chosen": -1.961328148841858, "logits/rejected": -2.000781297683716, "logps/chosen": -307.1499938964844, "logps/rejected": -348.54998779296875, "loss": 0.5941, "rewards/accuracies": 0.625, "rewards/chosen": -0.01717529259622097, "rewards/margins": 0.6099609136581421, "rewards/rejected": -0.6268066167831421, "step": 1610 }, { "epoch": 0.6099397590361446, "grad_norm": 80.01494851071922, "learning_rate": 8.476091867469879e-07, "logits/chosen": -2.037890672683716, "logits/rejected": -1.9347655773162842, "logps/chosen": -334.3500061035156, "logps/rejected": -357.1499938964844, "loss": 0.6226, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01605529710650444, "rewards/margins": 0.53216552734375, "rewards/rejected": -0.5162109136581421, "step": 1620 }, { "epoch": 0.6137048192771084, "grad_norm": 92.96884175439457, "learning_rate": 8.46667921686747e-07, "logits/chosen": -2.067187547683716, "logits/rejected": -2.039843797683716, "logps/chosen": -272.57501220703125, "logps/rejected": -327.75, "loss": 0.609, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.07647705078125, "rewards/margins": 0.5487426519393921, "rewards/rejected": -0.4728027284145355, "step": 1630 }, { "epoch": 0.6174698795180723, "grad_norm": 155.62189736036177, "learning_rate": 8.457266566265059e-07, "logits/chosen": -1.958593726158142, "logits/rejected": -1.975000023841858, "logps/chosen": -332.5, "logps/rejected": -373.0249938964844, "loss": 0.5903, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.27247315645217896, "rewards/margins": 0.627636730670929, "rewards/rejected": -0.899810791015625, "step": 1640 }, { "epoch": 0.6212349397590361, "grad_norm": 496.2493506227212, "learning_rate": 8.44785391566265e-07, "logits/chosen": -2.043750047683716, "logits/rejected": -2.0992188453674316, "logps/chosen": -351.8500061035156, "logps/rejected": -392.95001220703125, "loss": 0.6273, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09541015326976776, "rewards/margins": 0.6147400140762329, "rewards/rejected": -0.710375964641571, "step": 1650 }, { "epoch": 0.625, "grad_norm": 110.00387051150955, "learning_rate": 8.438441265060241e-07, "logits/chosen": -2.112109422683716, "logits/rejected": -2.064453125, "logps/chosen": -342.29998779296875, "logps/rejected": -367.6000061035156, "loss": 0.6242, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.20378418266773224, "rewards/margins": 0.535137951374054, "rewards/rejected": -0.7386840581893921, "step": 1660 }, { "epoch": 0.6287650602409639, "grad_norm": 74.43695123688704, "learning_rate": 8.429028614457831e-07, "logits/chosen": -1.9503905773162842, "logits/rejected": -1.9140625, "logps/chosen": -332.7749938964844, "logps/rejected": -407.45001220703125, "loss": 0.6087, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.35493165254592896, "rewards/margins": 0.649169921875, "rewards/rejected": -1.0043671131134033, "step": 1670 }, { "epoch": 0.6325301204819277, "grad_norm": 128.99532223938422, "learning_rate": 8.41961596385542e-07, "logits/chosen": -2.15625, "logits/rejected": -2.1214842796325684, "logps/chosen": -408.45001220703125, "logps/rejected": -407.3999938964844, "loss": 0.6898, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.7181640863418579, "rewards/margins": 0.44712525606155396, "rewards/rejected": -1.165283203125, "step": 1680 }, { "epoch": 0.6362951807228916, "grad_norm": 98.36771474877474, "learning_rate": 8.410203313253011e-07, "logits/chosen": -2.047656297683716, "logits/rejected": -2.061718702316284, "logps/chosen": -356.1499938964844, "logps/rejected": -425.3999938964844, "loss": 0.543, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.712207019329071, "rewards/margins": 0.7001953125, "rewards/rejected": -1.412695288658142, "step": 1690 }, { "epoch": 0.6400602409638554, "grad_norm": 147.55782909917593, "learning_rate": 8.400790662650602e-07, "logits/chosen": -2.0628905296325684, "logits/rejected": -2.0699219703674316, "logps/chosen": -339.79998779296875, "logps/rejected": -355.75, "loss": 0.6226, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7344726324081421, "rewards/margins": 0.611132800579071, "rewards/rejected": -1.344482421875, "step": 1700 }, { "epoch": 0.6438253012048193, "grad_norm": 106.42904621043785, "learning_rate": 8.391378012048193e-07, "logits/chosen": -2.0550780296325684, "logits/rejected": -2.0093750953674316, "logps/chosen": -378.54998779296875, "logps/rejected": -418.04998779296875, "loss": 0.6077, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6015380620956421, "rewards/margins": 0.6344024538993835, "rewards/rejected": -1.235742211341858, "step": 1710 }, { "epoch": 0.6475903614457831, "grad_norm": 111.16054023444222, "learning_rate": 8.381965361445783e-07, "logits/chosen": -2.0843749046325684, "logits/rejected": -2.0726561546325684, "logps/chosen": -360.8500061035156, "logps/rejected": -405.1000061035156, "loss": 0.5968, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2641967833042145, "rewards/margins": 0.7218017578125, "rewards/rejected": -0.985644519329071, "step": 1720 }, { "epoch": 0.651355421686747, "grad_norm": 73.78130827555165, "learning_rate": 8.372552710843373e-07, "logits/chosen": -1.9753906726837158, "logits/rejected": -2.014453172683716, "logps/chosen": -331.54998779296875, "logps/rejected": -337.5, "loss": 0.5693, "rewards/accuracies": 0.6875, "rewards/chosen": -0.17960205674171448, "rewards/margins": 0.72113037109375, "rewards/rejected": -0.901171863079071, "step": 1730 }, { "epoch": 0.6551204819277109, "grad_norm": 84.9037726766695, "learning_rate": 8.363140060240963e-07, "logits/chosen": -2.0277342796325684, "logits/rejected": -2.034374952316284, "logps/chosen": -294.5, "logps/rejected": -357.8999938964844, "loss": 0.5835, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15654297173023224, "rewards/margins": 0.70147705078125, "rewards/rejected": -0.857666015625, "step": 1740 }, { "epoch": 0.6588855421686747, "grad_norm": 118.84749141069743, "learning_rate": 8.353727409638554e-07, "logits/chosen": -2.065234422683716, "logits/rejected": -2.057421922683716, "logps/chosen": -371.3500061035156, "logps/rejected": -414.3999938964844, "loss": 0.5527, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.13613586127758026, "rewards/margins": 0.8033813238143921, "rewards/rejected": -0.9396117925643921, "step": 1750 }, { "epoch": 0.6626506024096386, "grad_norm": 99.68263884422323, "learning_rate": 8.344314759036144e-07, "logits/chosen": -1.9796874523162842, "logits/rejected": -1.916406273841858, "logps/chosen": -347.95001220703125, "logps/rejected": -390.3500061035156, "loss": 0.5595, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.22338256239891052, "rewards/margins": 0.7833007574081421, "rewards/rejected": -1.005761742591858, "step": 1760 }, { "epoch": 0.6664156626506024, "grad_norm": 86.50576245040463, "learning_rate": 8.334902108433734e-07, "logits/chosen": -1.951171875, "logits/rejected": -1.8894531726837158, "logps/chosen": -332.5, "logps/rejected": -361.95001220703125, "loss": 0.6483, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17436523735523224, "rewards/margins": 0.686328113079071, "rewards/rejected": -0.8612304925918579, "step": 1770 }, { "epoch": 0.6701807228915663, "grad_norm": 103.2464524855862, "learning_rate": 8.325489457831325e-07, "logits/chosen": -1.898828148841858, "logits/rejected": -1.9191405773162842, "logps/chosen": -353.0, "logps/rejected": -378.04998779296875, "loss": 0.6471, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0626220703125, "rewards/margins": 0.5638672113418579, "rewards/rejected": -0.625872790813446, "step": 1780 }, { "epoch": 0.6739457831325302, "grad_norm": 89.48174249833059, "learning_rate": 8.316076807228916e-07, "logits/chosen": -1.9679687023162842, "logits/rejected": -1.8878905773162842, "logps/chosen": -323.2250061035156, "logps/rejected": -384.8500061035156, "loss": 0.6175, "rewards/accuracies": 0.65625, "rewards/chosen": 0.07190551608800888, "rewards/margins": 0.577441394329071, "rewards/rejected": -0.5052490234375, "step": 1790 }, { "epoch": 0.677710843373494, "grad_norm": 88.45281735557766, "learning_rate": 8.306664156626506e-07, "logits/chosen": -1.925390601158142, "logits/rejected": -1.8976562023162842, "logps/chosen": -322.45001220703125, "logps/rejected": -362.04998779296875, "loss": 0.5739, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.04024047777056694, "rewards/margins": 0.67333984375, "rewards/rejected": -0.6328872442245483, "step": 1800 }, { "epoch": 0.6814759036144579, "grad_norm": 80.28970475755716, "learning_rate": 8.297251506024096e-07, "logits/chosen": -1.9167969226837158, "logits/rejected": -1.8972656726837158, "logps/chosen": -323.1499938964844, "logps/rejected": -373.45001220703125, "loss": 0.6081, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02031249925494194, "rewards/margins": 0.6243041753768921, "rewards/rejected": -0.6448730230331421, "step": 1810 }, { "epoch": 0.6852409638554217, "grad_norm": 107.56150073285325, "learning_rate": 8.287838855421686e-07, "logits/chosen": -1.900781273841858, "logits/rejected": -1.8390624523162842, "logps/chosen": -345.20001220703125, "logps/rejected": -380.3999938964844, "loss": 0.642, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.1109619140625, "rewards/margins": 0.4954589903354645, "rewards/rejected": -0.607421875, "step": 1820 }, { "epoch": 0.6890060240963856, "grad_norm": 109.24270462771929, "learning_rate": 8.278426204819276e-07, "logits/chosen": -1.953515648841858, "logits/rejected": -1.944921851158142, "logps/chosen": -370.3999938964844, "logps/rejected": -404.29998779296875, "loss": 0.5848, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.03303222730755806, "rewards/margins": 0.6614745855331421, "rewards/rejected": -0.629016101360321, "step": 1830 }, { "epoch": 0.6927710843373494, "grad_norm": 99.86434621002039, "learning_rate": 8.269013554216867e-07, "logits/chosen": -1.9074218273162842, "logits/rejected": -1.8976562023162842, "logps/chosen": -324.3500061035156, "logps/rejected": -365.95001220703125, "loss": 0.5538, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.16567687690258026, "rewards/margins": 0.7900635004043579, "rewards/rejected": -0.6241699457168579, "step": 1840 }, { "epoch": 0.6965361445783133, "grad_norm": 91.83455519873013, "learning_rate": 8.259600903614458e-07, "logits/chosen": -2.0347657203674316, "logits/rejected": -2.0093750953674316, "logps/chosen": -366.8500061035156, "logps/rejected": -420.70001220703125, "loss": 0.5788, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02641601487994194, "rewards/margins": 0.618457019329071, "rewards/rejected": -0.591888427734375, "step": 1850 }, { "epoch": 0.7003012048192772, "grad_norm": 94.29413436213645, "learning_rate": 8.250188253012048e-07, "logits/chosen": -1.960546851158142, "logits/rejected": -1.99609375, "logps/chosen": -336.95001220703125, "logps/rejected": -378.70001220703125, "loss": 0.6362, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12960204482078552, "rewards/margins": 0.65869140625, "rewards/rejected": -0.7887207269668579, "step": 1860 }, { "epoch": 0.704066265060241, "grad_norm": 110.01030031448786, "learning_rate": 8.240775602409638e-07, "logits/chosen": -1.9792969226837158, "logits/rejected": -1.894921898841858, "logps/chosen": -333.125, "logps/rejected": -382.1499938964844, "loss": 0.6152, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.17644043266773224, "rewards/margins": 0.61669921875, "rewards/rejected": -0.7923828363418579, "step": 1870 }, { "epoch": 0.7078313253012049, "grad_norm": 63.95093981738905, "learning_rate": 8.231362951807228e-07, "logits/chosen": -1.842187523841858, "logits/rejected": -1.881250023841858, "logps/chosen": -394.6000061035156, "logps/rejected": -421.70001220703125, "loss": 0.5786, "rewards/accuracies": 0.65625, "rewards/chosen": -0.39808350801467896, "rewards/margins": 0.80224609375, "rewards/rejected": -1.199945092201233, "step": 1880 }, { "epoch": 0.7115963855421686, "grad_norm": 96.26192183419808, "learning_rate": 8.221950301204819e-07, "logits/chosen": -2.0718750953674316, "logits/rejected": -2.051953077316284, "logps/chosen": -371.0, "logps/rejected": -369.54998779296875, "loss": 0.7303, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.3468383848667145, "rewards/margins": 0.4249511659145355, "rewards/rejected": -0.772656261920929, "step": 1890 }, { "epoch": 0.7153614457831325, "grad_norm": 121.7220370932885, "learning_rate": 8.21253765060241e-07, "logits/chosen": -2.018359422683716, "logits/rejected": -1.96484375, "logps/chosen": -349.6000061035156, "logps/rejected": -417.79998779296875, "loss": 0.5898, "rewards/accuracies": 0.625, "rewards/chosen": -0.61883544921875, "rewards/margins": 0.6514892578125, "rewards/rejected": -1.271093726158142, "step": 1900 }, { "epoch": 0.7191265060240963, "grad_norm": 88.17668793339645, "learning_rate": 8.203124999999999e-07, "logits/chosen": -1.9636719226837158, "logits/rejected": -1.959375023841858, "logps/chosen": -335.1499938964844, "logps/rejected": -348.25, "loss": 0.5955, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.4750122129917145, "rewards/margins": 0.721142590045929, "rewards/rejected": -1.1962890625, "step": 1910 }, { "epoch": 0.7228915662650602, "grad_norm": 129.98727452707107, "learning_rate": 8.19371234939759e-07, "logits/chosen": -1.9523437023162842, "logits/rejected": -1.916406273841858, "logps/chosen": -324.25, "logps/rejected": -396.75, "loss": 0.5994, "rewards/accuracies": 0.6875, "rewards/chosen": -0.519604504108429, "rewards/margins": 0.6587890386581421, "rewards/rejected": -1.1785156726837158, "step": 1920 }, { "epoch": 0.7266566265060241, "grad_norm": 109.95865154564731, "learning_rate": 8.184299698795181e-07, "logits/chosen": -2.018749952316284, "logits/rejected": -1.993749976158142, "logps/chosen": -356.79998779296875, "logps/rejected": -382.3999938964844, "loss": 0.624, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.48582762479782104, "rewards/margins": 0.5780029296875, "rewards/rejected": -1.063085913658142, "step": 1930 }, { "epoch": 0.7304216867469879, "grad_norm": 71.92195353870521, "learning_rate": 8.174887048192772e-07, "logits/chosen": -1.953515648841858, "logits/rejected": -1.8878905773162842, "logps/chosen": -339.20001220703125, "logps/rejected": -405.20001220703125, "loss": 0.6104, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.4372619688510895, "rewards/margins": 0.7235107421875, "rewards/rejected": -1.161230444908142, "step": 1940 }, { "epoch": 0.7341867469879518, "grad_norm": 117.47995364464965, "learning_rate": 8.16547439759036e-07, "logits/chosen": -1.974218726158142, "logits/rejected": -1.9539062976837158, "logps/chosen": -343.5, "logps/rejected": -366.45001220703125, "loss": 0.5787, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06937865912914276, "rewards/margins": 0.6579223871231079, "rewards/rejected": -0.727124035358429, "step": 1950 }, { "epoch": 0.7379518072289156, "grad_norm": 160.329622139751, "learning_rate": 8.156061746987951e-07, "logits/chosen": -2.061718702316284, "logits/rejected": -2.003124952316284, "logps/chosen": -320.95001220703125, "logps/rejected": -372.8500061035156, "loss": 0.7401, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.22443847358226776, "rewards/margins": 0.3213745057582855, "rewards/rejected": -0.54595947265625, "step": 1960 }, { "epoch": 0.7417168674698795, "grad_norm": 113.9804849843493, "learning_rate": 8.146649096385542e-07, "logits/chosen": -2.01171875, "logits/rejected": -1.971093773841858, "logps/chosen": -327.6499938964844, "logps/rejected": -359.25, "loss": 0.6333, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01732177659869194, "rewards/margins": 0.5553954839706421, "rewards/rejected": -0.5382629632949829, "step": 1970 }, { "epoch": 0.7454819277108434, "grad_norm": 112.03288081352139, "learning_rate": 8.137236445783132e-07, "logits/chosen": -2.0042967796325684, "logits/rejected": -1.9851562976837158, "logps/chosen": -342.70001220703125, "logps/rejected": -374.29998779296875, "loss": 0.5993, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2918334901332855, "rewards/margins": 0.686718761920929, "rewards/rejected": -0.3956291079521179, "step": 1980 }, { "epoch": 0.7492469879518072, "grad_norm": 93.13665965953437, "learning_rate": 8.127823795180722e-07, "logits/chosen": -1.935156226158142, "logits/rejected": -1.9347655773162842, "logps/chosen": -291.75, "logps/rejected": -328.75, "loss": 0.5868, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.5954345464706421, "rewards/margins": 0.6421874761581421, "rewards/rejected": -0.04649658128619194, "step": 1990 }, { "epoch": 0.7530120481927711, "grad_norm": 105.2051002512212, "learning_rate": 8.118411144578313e-07, "logits/chosen": -1.9167969226837158, "logits/rejected": -1.8484375476837158, "logps/chosen": -339.75, "logps/rejected": -359.0, "loss": 0.6168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.41746824979782104, "rewards/margins": 0.6239989995956421, "rewards/rejected": -0.20695190131664276, "step": 2000 }, { "epoch": 0.7567771084337349, "grad_norm": 139.7817239548692, "learning_rate": 8.108998493975904e-07, "logits/chosen": -1.986328125, "logits/rejected": -1.955468773841858, "logps/chosen": -352.70001220703125, "logps/rejected": -349.5, "loss": 0.605, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.42322999238967896, "rewards/margins": 0.740478515625, "rewards/rejected": -0.3171935975551605, "step": 2010 }, { "epoch": 0.7605421686746988, "grad_norm": 111.74459017066806, "learning_rate": 8.099585843373493e-07, "logits/chosen": -2.0667967796325684, "logits/rejected": -2.0101561546325684, "logps/chosen": -316.1000061035156, "logps/rejected": -388.3999938964844, "loss": 0.6224, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.4239746034145355, "rewards/margins": 0.626574695110321, "rewards/rejected": -0.20213623344898224, "step": 2020 }, { "epoch": 0.7643072289156626, "grad_norm": 168.55475521008938, "learning_rate": 8.090173192771084e-07, "logits/chosen": -2.063671827316284, "logits/rejected": -2.044921875, "logps/chosen": -342.79998779296875, "logps/rejected": -381.54998779296875, "loss": 0.5609, "rewards/accuracies": 0.6875, "rewards/chosen": 0.52105712890625, "rewards/margins": 0.7196289300918579, "rewards/rejected": -0.19794921576976776, "step": 2030 }, { "epoch": 0.7680722891566265, "grad_norm": 101.40010861712243, "learning_rate": 8.080760542168674e-07, "logits/chosen": -1.9460937976837158, "logits/rejected": -1.958984375, "logps/chosen": -344.8500061035156, "logps/rejected": -394.8500061035156, "loss": 0.6318, "rewards/accuracies": 0.625, "rewards/chosen": 0.15593262016773224, "rewards/margins": 0.5870116949081421, "rewards/rejected": -0.43116456270217896, "step": 2040 }, { "epoch": 0.7718373493975904, "grad_norm": 116.5082879818785, "learning_rate": 8.071347891566265e-07, "logits/chosen": -2.049999952316284, "logits/rejected": -2.000781297683716, "logps/chosen": -364.8999938964844, "logps/rejected": -364.45001220703125, "loss": 0.6026, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.06257323920726776, "rewards/margins": 0.6379150152206421, "rewards/rejected": -0.5755981206893921, "step": 2050 }, { "epoch": 0.7756024096385542, "grad_norm": 89.73072034449949, "learning_rate": 8.061935240963855e-07, "logits/chosen": -2.007031202316284, "logits/rejected": -1.9601562023162842, "logps/chosen": -281.1499938964844, "logps/rejected": -322.20001220703125, "loss": 0.5997, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.19183349609375, "rewards/margins": 0.6597900390625, "rewards/rejected": -0.46770018339157104, "step": 2060 }, { "epoch": 0.7793674698795181, "grad_norm": 85.44423570365286, "learning_rate": 8.052522590361446e-07, "logits/chosen": -2.0179686546325684, "logits/rejected": -2.0199217796325684, "logps/chosen": -304.5249938964844, "logps/rejected": -350.6499938964844, "loss": 0.5835, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.17133179306983948, "rewards/margins": 0.7066406011581421, "rewards/rejected": -0.5353943109512329, "step": 2070 }, { "epoch": 0.7831325301204819, "grad_norm": 115.28041365634311, "learning_rate": 8.043109939759036e-07, "logits/chosen": -2.016406297683716, "logits/rejected": -1.935546875, "logps/chosen": -321.54998779296875, "logps/rejected": -335.3500061035156, "loss": 0.582, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.007617187686264515, "rewards/margins": 0.6175537109375, "rewards/rejected": -0.625048816204071, "step": 2080 }, { "epoch": 0.7868975903614458, "grad_norm": 109.64634554460712, "learning_rate": 8.033697289156626e-07, "logits/chosen": -2.041796922683716, "logits/rejected": -2.052734375, "logps/chosen": -340.70001220703125, "logps/rejected": -399.8999938964844, "loss": 0.5236, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.01507568359375, "rewards/margins": 0.808764636516571, "rewards/rejected": -0.792675793170929, "step": 2090 }, { "epoch": 0.7906626506024096, "grad_norm": 106.84908045501699, "learning_rate": 8.024284638554216e-07, "logits/chosen": -2.068359375, "logits/rejected": -1.959375023841858, "logps/chosen": -316.75, "logps/rejected": -376.8999938964844, "loss": 0.5466, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.06193847581744194, "rewards/margins": 0.7806152105331421, "rewards/rejected": -0.719250500202179, "step": 2100 }, { "epoch": 0.7944277108433735, "grad_norm": 95.58250717191865, "learning_rate": 8.014871987951807e-07, "logits/chosen": -1.965234398841858, "logits/rejected": -1.9363281726837158, "logps/chosen": -346.8500061035156, "logps/rejected": -362.25, "loss": 0.6186, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20783081650733948, "rewards/margins": 0.739062488079071, "rewards/rejected": -0.5306152105331421, "step": 2110 }, { "epoch": 0.7981927710843374, "grad_norm": 94.93928023095718, "learning_rate": 8.005459337349398e-07, "logits/chosen": -2.100781202316284, "logits/rejected": -2.0726561546325684, "logps/chosen": -354.45001220703125, "logps/rejected": -351.70001220703125, "loss": 0.5885, "rewards/accuracies": 0.65625, "rewards/chosen": 0.42595213651657104, "rewards/margins": 0.753173828125, "rewards/rejected": -0.327383428812027, "step": 2120 }, { "epoch": 0.8019578313253012, "grad_norm": 104.54667359991979, "learning_rate": 7.996046686746987e-07, "logits/chosen": -2.0328125953674316, "logits/rejected": -2.104687452316284, "logps/chosen": -370.29998779296875, "logps/rejected": -423.45001220703125, "loss": 0.6389, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.19820556044578552, "rewards/margins": 0.655688464641571, "rewards/rejected": -0.45769041776657104, "step": 2130 }, { "epoch": 0.8057228915662651, "grad_norm": 99.6571431784662, "learning_rate": 7.986634036144578e-07, "logits/chosen": -2.07421875, "logits/rejected": -2.0218749046325684, "logps/chosen": -354.8500061035156, "logps/rejected": -402.45001220703125, "loss": 0.5845, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.30205076932907104, "rewards/margins": 0.6953979730606079, "rewards/rejected": -0.39335936307907104, "step": 2140 }, { "epoch": 0.8094879518072289, "grad_norm": 104.48978327193952, "learning_rate": 7.977221385542169e-07, "logits/chosen": -2.091015577316284, "logits/rejected": -2.0796875953674316, "logps/chosen": -347.20001220703125, "logps/rejected": -362.8999938964844, "loss": 0.6032, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.2705322206020355, "rewards/margins": 0.5754638910293579, "rewards/rejected": -0.304931640625, "step": 2150 }, { "epoch": 0.8132530120481928, "grad_norm": 130.5332919452141, "learning_rate": 7.967808734939759e-07, "logits/chosen": -1.9796874523162842, "logits/rejected": -1.896875023841858, "logps/chosen": -335.95001220703125, "logps/rejected": -384.3500061035156, "loss": 0.6363, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.04764404147863388, "rewards/margins": 0.6065307855606079, "rewards/rejected": -0.65380859375, "step": 2160 }, { "epoch": 0.8170180722891566, "grad_norm": 90.77039578288294, "learning_rate": 7.958396084337348e-07, "logits/chosen": -2.060546875, "logits/rejected": -2.0374999046325684, "logps/chosen": -372.75, "logps/rejected": -415.70001220703125, "loss": 0.5883, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.10992126166820526, "rewards/margins": 0.6593261957168579, "rewards/rejected": -0.769580066204071, "step": 2170 }, { "epoch": 0.8207831325301205, "grad_norm": 99.25512600308396, "learning_rate": 7.948983433734939e-07, "logits/chosen": -2.0234375, "logits/rejected": -1.9914062023162842, "logps/chosen": -333.1000061035156, "logps/rejected": -337.1499938964844, "loss": 0.6382, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3839355409145355, "rewards/margins": 0.5852416753768921, "rewards/rejected": -0.9693359136581421, "step": 2180 }, { "epoch": 0.8245481927710844, "grad_norm": 140.55109530644233, "learning_rate": 7.93957078313253e-07, "logits/chosen": -2.0757813453674316, "logits/rejected": -2.021484375, "logps/chosen": -335.3999938964844, "logps/rejected": -339.29998779296875, "loss": 0.5441, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20369262993335724, "rewards/margins": 0.7207275629043579, "rewards/rejected": -0.9243408441543579, "step": 2190 }, { "epoch": 0.8283132530120482, "grad_norm": 159.47075636576534, "learning_rate": 7.930158132530121e-07, "logits/chosen": -2.116015672683716, "logits/rejected": -2.020703077316284, "logps/chosen": -338.54998779296875, "logps/rejected": -345.70001220703125, "loss": 0.6487, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19919434189796448, "rewards/margins": 0.561718761920929, "rewards/rejected": -0.7611144781112671, "step": 2200 }, { "epoch": 0.8320783132530121, "grad_norm": 72.7159900187189, "learning_rate": 7.92074548192771e-07, "logits/chosen": -2.0999999046325684, "logits/rejected": -2.0863280296325684, "logps/chosen": -345.8500061035156, "logps/rejected": -410.25, "loss": 0.5691, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009533691219985485, "rewards/margins": 0.904003918170929, "rewards/rejected": -0.8951660394668579, "step": 2210 }, { "epoch": 0.8358433734939759, "grad_norm": 146.63701519095096, "learning_rate": 7.911332831325301e-07, "logits/chosen": -1.9375, "logits/rejected": -1.8914062976837158, "logps/chosen": -333.25, "logps/rejected": -392.8999938964844, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": -0.03007812425494194, "rewards/margins": 0.6097656488418579, "rewards/rejected": -0.6407104730606079, "step": 2220 }, { "epoch": 0.8396084337349398, "grad_norm": 94.42342501874396, "learning_rate": 7.901920180722891e-07, "logits/chosen": -1.974609375, "logits/rejected": -1.9503905773162842, "logps/chosen": -341.20001220703125, "logps/rejected": -394.29998779296875, "loss": 0.6485, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.30247801542282104, "rewards/margins": 0.65966796875, "rewards/rejected": -0.35804444551467896, "step": 2230 }, { "epoch": 0.8433734939759037, "grad_norm": 110.01187338144784, "learning_rate": 7.892507530120482e-07, "logits/chosen": -2.0042967796325684, "logits/rejected": -1.9542968273162842, "logps/chosen": -320.5, "logps/rejected": -381.1000061035156, "loss": 0.5774, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4628051817417145, "rewards/margins": 0.760546863079071, "rewards/rejected": -0.2977050840854645, "step": 2240 }, { "epoch": 0.8471385542168675, "grad_norm": 122.35265191100058, "learning_rate": 7.883094879518072e-07, "logits/chosen": -1.927343726158142, "logits/rejected": -1.937109351158142, "logps/chosen": -347.29998779296875, "logps/rejected": -412.1499938964844, "loss": 0.618, "rewards/accuracies": 0.625, "rewards/chosen": 0.3580322265625, "rewards/margins": 0.624951183795929, "rewards/rejected": -0.2670959532260895, "step": 2250 }, { "epoch": 0.8509036144578314, "grad_norm": 77.33604559642676, "learning_rate": 7.873682228915662e-07, "logits/chosen": -2.020312547683716, "logits/rejected": -2.018359422683716, "logps/chosen": -310.29998779296875, "logps/rejected": -339.95001220703125, "loss": 0.5805, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.311767578125, "rewards/margins": 0.6649414300918579, "rewards/rejected": -0.3525146543979645, "step": 2260 }, { "epoch": 0.8546686746987951, "grad_norm": 138.79885328448842, "learning_rate": 7.864269578313253e-07, "logits/chosen": -2.098437547683716, "logits/rejected": -2.033203125, "logps/chosen": -350.875, "logps/rejected": -359.1499938964844, "loss": 0.6921, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.11075439304113388, "rewards/margins": 0.4532226622104645, "rewards/rejected": -0.34260863065719604, "step": 2270 }, { "epoch": 0.858433734939759, "grad_norm": 82.39937629147859, "learning_rate": 7.854856927710844e-07, "logits/chosen": -1.951171875, "logits/rejected": -2.0859375, "logps/chosen": -388.04998779296875, "logps/rejected": -407.8999938964844, "loss": 0.5114, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.17342528700828552, "rewards/margins": 0.760668933391571, "rewards/rejected": -0.58648681640625, "step": 2280 }, { "epoch": 0.8621987951807228, "grad_norm": 108.15898817768803, "learning_rate": 7.845444277108434e-07, "logits/chosen": -2.02734375, "logits/rejected": -1.933984398841858, "logps/chosen": -308.20001220703125, "logps/rejected": -370.54998779296875, "loss": 0.6446, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.06608887016773224, "rewards/margins": 0.536755383014679, "rewards/rejected": -0.4709716737270355, "step": 2290 }, { "epoch": 0.8659638554216867, "grad_norm": 98.97916062362945, "learning_rate": 7.836031626506023e-07, "logits/chosen": -2.112109422683716, "logits/rejected": -2.048046827316284, "logps/chosen": -286.17498779296875, "logps/rejected": -315.79998779296875, "loss": 0.6191, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.09713134914636612, "rewards/margins": 0.5246948003768921, "rewards/rejected": -0.4272522032260895, "step": 2300 }, { "epoch": 0.8697289156626506, "grad_norm": 105.15500929145286, "learning_rate": 7.826618975903614e-07, "logits/chosen": -2.111328125, "logits/rejected": -2.045703172683716, "logps/chosen": -319.57501220703125, "logps/rejected": -371.75, "loss": 0.5945, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01958007737994194, "rewards/margins": 0.6626220941543579, "rewards/rejected": -0.681506335735321, "step": 2310 }, { "epoch": 0.8734939759036144, "grad_norm": 114.67981855789827, "learning_rate": 7.817206325301204e-07, "logits/chosen": -2.1117186546325684, "logits/rejected": -2.117968797683716, "logps/chosen": -332.32501220703125, "logps/rejected": -351.54998779296875, "loss": 0.5806, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.06323852390050888, "rewards/margins": 0.605419933795929, "rewards/rejected": -0.5413818359375, "step": 2320 }, { "epoch": 0.8772590361445783, "grad_norm": 113.86258870453736, "learning_rate": 7.807793674698795e-07, "logits/chosen": -2.1011719703674316, "logits/rejected": -2.064453125, "logps/chosen": -328.04998779296875, "logps/rejected": -373.1499938964844, "loss": 0.6138, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.02412719652056694, "rewards/margins": 0.6168212890625, "rewards/rejected": -0.592449963092804, "step": 2330 }, { "epoch": 0.8810240963855421, "grad_norm": 148.0864065720678, "learning_rate": 7.798381024096386e-07, "logits/chosen": -2.041015625, "logits/rejected": -2.044140577316284, "logps/chosen": -374.95001220703125, "logps/rejected": -376.5, "loss": 0.6708, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09888305515050888, "rewards/margins": 0.40479737520217896, "rewards/rejected": -0.3060058653354645, "step": 2340 }, { "epoch": 0.884789156626506, "grad_norm": 113.09172945811706, "learning_rate": 7.788968373493976e-07, "logits/chosen": -2.03515625, "logits/rejected": -2.0230469703674316, "logps/chosen": -314.79998779296875, "logps/rejected": -340.6499938964844, "loss": 0.6043, "rewards/accuracies": 0.625, "rewards/chosen": 0.172882080078125, "rewards/margins": 0.592578113079071, "rewards/rejected": -0.4203247129917145, "step": 2350 }, { "epoch": 0.8885542168674698, "grad_norm": 111.91875952957344, "learning_rate": 7.779555722891565e-07, "logits/chosen": -2.0406250953674316, "logits/rejected": -2.009765625, "logps/chosen": -335.6499938964844, "logps/rejected": -376.54998779296875, "loss": 0.6297, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.02894287183880806, "rewards/margins": 0.6014404296875, "rewards/rejected": -0.5726867914199829, "step": 2360 }, { "epoch": 0.8923192771084337, "grad_norm": 98.76349231431999, "learning_rate": 7.770143072289156e-07, "logits/chosen": -2.0277342796325684, "logits/rejected": -1.9699218273162842, "logps/chosen": -309.1499938964844, "logps/rejected": -325.54998779296875, "loss": 0.6512, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.02134399488568306, "rewards/margins": 0.5021728277206421, "rewards/rejected": -0.48086851835250854, "step": 2370 }, { "epoch": 0.8960843373493976, "grad_norm": 82.19477565687869, "learning_rate": 7.760730421686747e-07, "logits/chosen": -2.0625, "logits/rejected": -2.073437452316284, "logps/chosen": -299.0, "logps/rejected": -335.17498779296875, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": 0.45756834745407104, "rewards/margins": 0.790234386920929, "rewards/rejected": -0.3320556581020355, "step": 2380 }, { "epoch": 0.8998493975903614, "grad_norm": 106.03096032534926, "learning_rate": 7.751317771084337e-07, "logits/chosen": -1.9921875, "logits/rejected": -2.015625, "logps/chosen": -357.6000061035156, "logps/rejected": -369.8500061035156, "loss": 0.5623, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3870605528354645, "rewards/margins": 0.7032226324081421, "rewards/rejected": -0.3161682188510895, "step": 2390 }, { "epoch": 0.9036144578313253, "grad_norm": 71.67060940997516, "learning_rate": 7.741905120481927e-07, "logits/chosen": -2.065624952316284, "logits/rejected": -2.0062499046325684, "logps/chosen": -312.45001220703125, "logps/rejected": -364.75, "loss": 0.6398, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.600830078125, "rewards/margins": 0.5738281011581421, "rewards/rejected": 0.02711791917681694, "step": 2400 }, { "epoch": 0.9073795180722891, "grad_norm": 99.50009866369543, "learning_rate": 7.732492469879518e-07, "logits/chosen": -1.9988281726837158, "logits/rejected": -1.9582030773162842, "logps/chosen": -298.6000061035156, "logps/rejected": -335.70001220703125, "loss": 0.6071, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7635742425918579, "rewards/margins": 0.5251830816268921, "rewards/rejected": 0.23895263671875, "step": 2410 }, { "epoch": 0.911144578313253, "grad_norm": 93.63214279302436, "learning_rate": 7.723079819277109e-07, "logits/chosen": -2.0394530296325684, "logits/rejected": -2.096484422683716, "logps/chosen": -412.0, "logps/rejected": -440.3999938964844, "loss": 0.6163, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7749999761581421, "rewards/margins": 0.571459949016571, "rewards/rejected": 0.20292969048023224, "step": 2420 }, { "epoch": 0.9149096385542169, "grad_norm": 121.97701587498119, "learning_rate": 7.713667168674698e-07, "logits/chosen": -1.9910156726837158, "logits/rejected": -1.947265625, "logps/chosen": -304.29998779296875, "logps/rejected": -329.95001220703125, "loss": 0.6197, "rewards/accuracies": 0.625, "rewards/chosen": 0.77734375, "rewards/margins": 0.546984851360321, "rewards/rejected": 0.22976073622703552, "step": 2430 }, { "epoch": 0.9186746987951807, "grad_norm": 179.19829303793202, "learning_rate": 7.704254518072288e-07, "logits/chosen": -2.106250047683716, "logits/rejected": -2.075000047683716, "logps/chosen": -316.8500061035156, "logps/rejected": -352.45001220703125, "loss": 0.5832, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.836621105670929, "rewards/margins": 0.575146496295929, "rewards/rejected": 0.2603698670864105, "step": 2440 }, { "epoch": 0.9224397590361446, "grad_norm": 107.97152335888023, "learning_rate": 7.694841867469879e-07, "logits/chosen": -1.976171851158142, "logits/rejected": -1.9521484375, "logps/chosen": -346.0, "logps/rejected": -404.1000061035156, "loss": 0.5381, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7730468511581421, "rewards/margins": 0.798779308795929, "rewards/rejected": -0.02484130859375, "step": 2450 }, { "epoch": 0.9262048192771084, "grad_norm": 110.88491048319109, "learning_rate": 7.68542921686747e-07, "logits/chosen": -2.0953125953674316, "logits/rejected": -2.013671875, "logps/chosen": -341.6000061035156, "logps/rejected": -346.6000061035156, "loss": 0.5695, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.698559582233429, "rewards/margins": 0.6772094964981079, "rewards/rejected": 0.02128906175494194, "step": 2460 }, { "epoch": 0.9299698795180723, "grad_norm": 115.36645715413955, "learning_rate": 7.67601656626506e-07, "logits/chosen": -1.986718773841858, "logits/rejected": -1.9207031726837158, "logps/chosen": -345.6000061035156, "logps/rejected": -407.5, "loss": 0.683, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.5403808355331421, "rewards/margins": 0.42280274629592896, "rewards/rejected": 0.11800537258386612, "step": 2470 }, { "epoch": 0.9337349397590361, "grad_norm": 105.56291244574126, "learning_rate": 7.66660391566265e-07, "logits/chosen": -2.1312499046325684, "logits/rejected": -2.065234422683716, "logps/chosen": -305.54998779296875, "logps/rejected": -344.04998779296875, "loss": 0.5566, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.724414050579071, "rewards/margins": 0.885119616985321, "rewards/rejected": -0.16188964247703552, "step": 2480 }, { "epoch": 0.9375, "grad_norm": 75.33145137832086, "learning_rate": 7.657191265060241e-07, "logits/chosen": -1.984375, "logits/rejected": -2.071484327316284, "logps/chosen": -388.54998779296875, "logps/rejected": -394.79998779296875, "loss": 0.5727, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.4234375059604645, "rewards/margins": 0.819091796875, "rewards/rejected": -0.39635008573532104, "step": 2490 }, { "epoch": 0.9412650602409639, "grad_norm": 102.9718704187978, "learning_rate": 7.647778614457831e-07, "logits/chosen": -2.063281297683716, "logits/rejected": -1.9914062023162842, "logps/chosen": -326.54998779296875, "logps/rejected": -408.0, "loss": 0.5908, "rewards/accuracies": 0.65625, "rewards/chosen": 0.32768553495407104, "rewards/margins": 0.6773926019668579, "rewards/rejected": -0.34968262910842896, "step": 2500 }, { "epoch": 0.9450301204819277, "grad_norm": 89.7125659389419, "learning_rate": 7.638365963855421e-07, "logits/chosen": -2.0980467796325684, "logits/rejected": -2.145312547683716, "logps/chosen": -362.1000061035156, "logps/rejected": -397.04998779296875, "loss": 0.6527, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.22775879502296448, "rewards/margins": 0.64208984375, "rewards/rejected": -0.41411131620407104, "step": 2510 }, { "epoch": 0.9487951807228916, "grad_norm": 129.71987712476457, "learning_rate": 7.628953313253011e-07, "logits/chosen": -2.05859375, "logits/rejected": -2.041796922683716, "logps/chosen": -337.0, "logps/rejected": -415.3999938964844, "loss": 0.6106, "rewards/accuracies": 0.625, "rewards/chosen": 0.29075318574905396, "rewards/margins": 0.6948486566543579, "rewards/rejected": -0.404296875, "step": 2520 }, { "epoch": 0.9525602409638554, "grad_norm": 136.02821042401942, "learning_rate": 7.619540662650602e-07, "logits/chosen": -1.9765625, "logits/rejected": -1.973046898841858, "logps/chosen": -325.125, "logps/rejected": -362.8500061035156, "loss": 0.5896, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.45011597871780396, "rewards/margins": 0.7677978277206421, "rewards/rejected": -0.3181213438510895, "step": 2530 }, { "epoch": 0.9563253012048193, "grad_norm": 92.35274297113232, "learning_rate": 7.610128012048193e-07, "logits/chosen": -2.0992188453674316, "logits/rejected": -2.0347657203674316, "logps/chosen": -287.70001220703125, "logps/rejected": -339.79998779296875, "loss": 0.5488, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.6620849370956421, "rewards/margins": 0.69134521484375, "rewards/rejected": -0.02873840369284153, "step": 2540 }, { "epoch": 0.9600903614457831, "grad_norm": 125.21766954432819, "learning_rate": 7.600715361445783e-07, "logits/chosen": -2.135937452316284, "logits/rejected": -2.0601563453674316, "logps/chosen": -323.04998779296875, "logps/rejected": -407.6499938964844, "loss": 0.5632, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7773681879043579, "rewards/margins": 0.900317370891571, "rewards/rejected": -0.12232665717601776, "step": 2550 }, { "epoch": 0.963855421686747, "grad_norm": 100.71597860605692, "learning_rate": 7.591302710843373e-07, "logits/chosen": -2.034374952316284, "logits/rejected": -2.0562500953674316, "logps/chosen": -348.6499938964844, "logps/rejected": -374.5, "loss": 0.6022, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.46193236112594604, "rewards/margins": 0.568817138671875, "rewards/rejected": -0.10706786811351776, "step": 2560 }, { "epoch": 0.9676204819277109, "grad_norm": 111.28395648338892, "learning_rate": 7.581890060240963e-07, "logits/chosen": -2.1167969703674316, "logits/rejected": -2.041796922683716, "logps/chosen": -330.04998779296875, "logps/rejected": -337.0, "loss": 0.5739, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.881542980670929, "rewards/margins": 0.7093261480331421, "rewards/rejected": 0.17231445014476776, "step": 2570 }, { "epoch": 0.9713855421686747, "grad_norm": 80.8658339365809, "learning_rate": 7.572477409638554e-07, "logits/chosen": -2.0152344703674316, "logits/rejected": -2.030078172683716, "logps/chosen": -338.1000061035156, "logps/rejected": -349.29998779296875, "loss": 0.5502, "rewards/accuracies": 0.65625, "rewards/chosen": 1.0047607421875, "rewards/margins": 0.7638183832168579, "rewards/rejected": 0.24157103896141052, "step": 2580 }, { "epoch": 0.9751506024096386, "grad_norm": 158.792367793022, "learning_rate": 7.563064759036144e-07, "logits/chosen": -2.1117186546325684, "logits/rejected": -2.055468797683716, "logps/chosen": -281.625, "logps/rejected": -312.70001220703125, "loss": 0.5916, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.9513183832168579, "rewards/margins": 0.6631835699081421, "rewards/rejected": 0.287353515625, "step": 2590 }, { "epoch": 0.9789156626506024, "grad_norm": 144.5787297055879, "learning_rate": 7.553652108433735e-07, "logits/chosen": -2.135937452316284, "logits/rejected": -2.0933594703674316, "logps/chosen": -351.5249938964844, "logps/rejected": -380.25, "loss": 0.6737, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.8975830078125, "rewards/margins": 0.4498657286167145, "rewards/rejected": 0.44810789823532104, "step": 2600 }, { "epoch": 0.9826807228915663, "grad_norm": 73.77152552873693, "learning_rate": 7.544239457831325e-07, "logits/chosen": -2.071484327316284, "logits/rejected": -2.067187547683716, "logps/chosen": -349.1499938964844, "logps/rejected": -353.45001220703125, "loss": 0.6042, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.058496117591858, "rewards/margins": 0.67889404296875, "rewards/rejected": 0.3799682557582855, "step": 2610 }, { "epoch": 0.9864457831325302, "grad_norm": 138.57132172523717, "learning_rate": 7.534826807228915e-07, "logits/chosen": -2.084765672683716, "logits/rejected": -2.0269532203674316, "logps/chosen": -351.8500061035156, "logps/rejected": -345.45001220703125, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": 0.918408215045929, "rewards/margins": 0.3726562559604645, "rewards/rejected": 0.544543445110321, "step": 2620 }, { "epoch": 0.990210843373494, "grad_norm": 137.65032158079325, "learning_rate": 7.525414156626506e-07, "logits/chosen": -2.1343750953674316, "logits/rejected": -2.083203077316284, "logps/chosen": -343.04998779296875, "logps/rejected": -364.29998779296875, "loss": 0.5882, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.988476574420929, "rewards/margins": 0.6869751214981079, "rewards/rejected": 0.302001953125, "step": 2630 }, { "epoch": 0.9939759036144579, "grad_norm": 127.20616610743119, "learning_rate": 7.516001506024096e-07, "logits/chosen": -2.1363282203674316, "logits/rejected": -2.038281202316284, "logps/chosen": -360.6000061035156, "logps/rejected": -427.04998779296875, "loss": 0.543, "rewards/accuracies": 0.71875, "rewards/chosen": 0.6723998785018921, "rewards/margins": 0.890881359577179, "rewards/rejected": -0.217529296875, "step": 2640 }, { "epoch": 0.9977409638554217, "grad_norm": 137.6998510860425, "learning_rate": 7.506588855421686e-07, "logits/chosen": -2.186718702316284, "logits/rejected": -2.0835938453674316, "logps/chosen": -363.1499938964844, "logps/rejected": -370.8999938964844, "loss": 0.5921, "rewards/accuracies": 0.6875, "rewards/chosen": 0.523388683795929, "rewards/margins": 0.7101684808731079, "rewards/rejected": -0.187042236328125, "step": 2650 }, { "epoch": 1.0015060240963856, "grad_norm": 28.668905443059067, "learning_rate": 7.497176204819276e-07, "logits/chosen": -2.1031250953674316, "logits/rejected": -2.135937452316284, "logps/chosen": -369.79998779296875, "logps/rejected": -409.8999938964844, "loss": 0.4383, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.9495605230331421, "rewards/margins": 1.2314453125, "rewards/rejected": -0.2826782166957855, "step": 2660 }, { "epoch": 1.0052710843373494, "grad_norm": 44.19551100137227, "learning_rate": 7.487763554216867e-07, "logits/chosen": -2.094531297683716, "logits/rejected": -2.1644530296325684, "logps/chosen": -356.95001220703125, "logps/rejected": -393.45001220703125, "loss": 0.2077, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.3878905773162842, "rewards/margins": 2.352734327316284, "rewards/rejected": -0.9647461175918579, "step": 2670 }, { "epoch": 1.0090361445783131, "grad_norm": 36.81286358925092, "learning_rate": 7.478350903614458e-07, "logits/chosen": -2.1371092796325684, "logits/rejected": -2.1839842796325684, "logps/chosen": -342.6000061035156, "logps/rejected": -383.6000061035156, "loss": 0.2046, "rewards/accuracies": 0.9375, "rewards/chosen": 1.065039038658142, "rewards/margins": 2.24609375, "rewards/rejected": -1.181298851966858, "step": 2680 }, { "epoch": 1.0128012048192772, "grad_norm": 60.78604348015027, "learning_rate": 7.468938253012049e-07, "logits/chosen": -2.20703125, "logits/rejected": -2.212109327316284, "logps/chosen": -397.6499938964844, "logps/rejected": -390.57501220703125, "loss": 0.2052, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.823803722858429, "rewards/margins": 2.5619139671325684, "rewards/rejected": -1.7383301258087158, "step": 2690 }, { "epoch": 1.016566265060241, "grad_norm": 31.0674281679368, "learning_rate": 7.459525602409638e-07, "logits/chosen": -2.1996092796325684, "logits/rejected": -2.126953125, "logps/chosen": -335.1000061035156, "logps/rejected": -420.45001220703125, "loss": 0.1556, "rewards/accuracies": 0.96875, "rewards/chosen": 1.0182616710662842, "rewards/margins": 2.775390625, "rewards/rejected": -1.754296898841858, "step": 2700 }, { "epoch": 1.0203313253012047, "grad_norm": 26.046280334037327, "learning_rate": 7.450112951807228e-07, "logits/chosen": -2.1441407203674316, "logits/rejected": -2.164843797683716, "logps/chosen": -344.70001220703125, "logps/rejected": -386.0, "loss": 0.1985, "rewards/accuracies": 0.9375, "rewards/chosen": 0.659252941608429, "rewards/margins": 2.7542967796325684, "rewards/rejected": -2.0960936546325684, "step": 2710 }, { "epoch": 1.0240963855421688, "grad_norm": 41.43404224010132, "learning_rate": 7.440700301204819e-07, "logits/chosen": -2.2738280296325684, "logits/rejected": -2.2125000953674316, "logps/chosen": -302.29998779296875, "logps/rejected": -354.1499938964844, "loss": 0.2146, "rewards/accuracies": 0.9375, "rewards/chosen": 0.728564441204071, "rewards/margins": 2.272265672683716, "rewards/rejected": -1.5441405773162842, "step": 2720 }, { "epoch": 1.0278614457831325, "grad_norm": 68.65799710286599, "learning_rate": 7.43128765060241e-07, "logits/chosen": -2.26171875, "logits/rejected": -2.219531297683716, "logps/chosen": -343.79998779296875, "logps/rejected": -402.25, "loss": 0.1859, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.04736328125, "rewards/margins": 2.564453125, "rewards/rejected": -1.5166015625, "step": 2730 }, { "epoch": 1.0316265060240963, "grad_norm": 41.543832062493045, "learning_rate": 7.421874999999999e-07, "logits/chosen": -2.255078077316284, "logits/rejected": -2.1382813453674316, "logps/chosen": -370.6499938964844, "logps/rejected": -428.6000061035156, "loss": 0.2062, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.2082030773162842, "rewards/margins": 2.8023438453674316, "rewards/rejected": -1.590722680091858, "step": 2740 }, { "epoch": 1.0353915662650603, "grad_norm": 40.293428875731905, "learning_rate": 7.41246234939759e-07, "logits/chosen": -2.1839842796325684, "logits/rejected": -2.145312547683716, "logps/chosen": -340.70001220703125, "logps/rejected": -391.3500061035156, "loss": 0.1809, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.3359375, "rewards/margins": 2.8882813453674316, "rewards/rejected": -1.550390601158142, "step": 2750 }, { "epoch": 1.0391566265060241, "grad_norm": 46.675585345955355, "learning_rate": 7.403049698795181e-07, "logits/chosen": -2.2593750953674316, "logits/rejected": -2.318359375, "logps/chosen": -349.29998779296875, "logps/rejected": -390.8500061035156, "loss": 0.2088, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5412842035293579, "rewards/margins": 2.563671827316284, "rewards/rejected": -2.023632764816284, "step": 2760 }, { "epoch": 1.042921686746988, "grad_norm": 42.70522979804127, "learning_rate": 7.393637048192772e-07, "logits/chosen": -2.279296875, "logits/rejected": -2.2964844703674316, "logps/chosen": -361.42498779296875, "logps/rejected": -389.95001220703125, "loss": 0.1952, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.33668214082717896, "rewards/margins": 2.5894532203674316, "rewards/rejected": -2.2515625953674316, "step": 2770 }, { "epoch": 1.0466867469879517, "grad_norm": 35.536642648619676, "learning_rate": 7.38422439759036e-07, "logits/chosen": -2.257031202316284, "logits/rejected": -2.2164063453674316, "logps/chosen": -372.25, "logps/rejected": -397.70001220703125, "loss": 0.1987, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5906311273574829, "rewards/margins": 2.578906297683716, "rewards/rejected": -1.9894530773162842, "step": 2780 }, { "epoch": 1.0504518072289157, "grad_norm": 27.76812978966094, "learning_rate": 7.374811746987951e-07, "logits/chosen": -2.2691407203674316, "logits/rejected": -2.260546922683716, "logps/chosen": -379.0, "logps/rejected": -429.75, "loss": 0.1945, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.37928467988967896, "rewards/margins": 2.701171875, "rewards/rejected": -2.3218750953674316, "step": 2790 }, { "epoch": 1.0542168674698795, "grad_norm": 36.38651927230059, "learning_rate": 7.365399096385542e-07, "logits/chosen": -2.3121094703674316, "logits/rejected": -2.3121094703674316, "logps/chosen": -369.29998779296875, "logps/rejected": -397.0, "loss": 0.2131, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.58544921875, "rewards/margins": 2.518359422683716, "rewards/rejected": -1.935156226158142, "step": 2800 }, { "epoch": 1.0579819277108433, "grad_norm": 33.26761970413593, "learning_rate": 7.355986445783132e-07, "logits/chosen": -2.2734375, "logits/rejected": -2.3453125953674316, "logps/chosen": -348.79998779296875, "logps/rejected": -364.20001220703125, "loss": 0.1972, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.727246105670929, "rewards/margins": 2.7464842796325684, "rewards/rejected": -2.0179686546325684, "step": 2810 }, { "epoch": 1.0617469879518073, "grad_norm": 32.724765396366045, "learning_rate": 7.346573795180723e-07, "logits/chosen": -2.292187452316284, "logits/rejected": -2.3296875953674316, "logps/chosen": -354.1499938964844, "logps/rejected": -370.3500061035156, "loss": 0.198, "rewards/accuracies": 0.9375, "rewards/chosen": 0.698779284954071, "rewards/margins": 2.512500047683716, "rewards/rejected": -1.812109351158142, "step": 2820 }, { "epoch": 1.0655120481927711, "grad_norm": 162.71420433868784, "learning_rate": 7.337161144578313e-07, "logits/chosen": -2.272265672683716, "logits/rejected": -2.2281250953674316, "logps/chosen": -293.8999938964844, "logps/rejected": -358.1000061035156, "loss": 0.2249, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8062499761581421, "rewards/margins": 2.328125, "rewards/rejected": -1.521093726158142, "step": 2830 }, { "epoch": 1.069277108433735, "grad_norm": 30.504948993297294, "learning_rate": 7.327748493975904e-07, "logits/chosen": -2.255859375, "logits/rejected": -2.25390625, "logps/chosen": -379.3500061035156, "logps/rejected": -394.8500061035156, "loss": 0.1632, "rewards/accuracies": 0.96875, "rewards/chosen": 0.961230456829071, "rewards/margins": 2.776171922683716, "rewards/rejected": -1.8142578601837158, "step": 2840 }, { "epoch": 1.0730421686746987, "grad_norm": 47.16848706612897, "learning_rate": 7.318335843373493e-07, "logits/chosen": -2.311328172683716, "logits/rejected": -2.20703125, "logps/chosen": -340.79998779296875, "logps/rejected": -384.29998779296875, "loss": 0.1783, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.93603515625, "rewards/margins": 2.674609422683716, "rewards/rejected": -1.73828125, "step": 2850 }, { "epoch": 1.0768072289156627, "grad_norm": 37.667430967061655, "learning_rate": 7.308923192771084e-07, "logits/chosen": -2.291796922683716, "logits/rejected": -2.323437452316284, "logps/chosen": -339.0249938964844, "logps/rejected": -402.95001220703125, "loss": 0.1701, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7792999148368835, "rewards/margins": 2.5433592796325684, "rewards/rejected": -1.7626953125, "step": 2860 }, { "epoch": 1.0805722891566265, "grad_norm": 63.45250062799562, "learning_rate": 7.299510542168674e-07, "logits/chosen": -2.229687452316284, "logits/rejected": -2.303515672683716, "logps/chosen": -309.6000061035156, "logps/rejected": -380.3500061035156, "loss": 0.1944, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6728515625, "rewards/margins": 2.6253905296325684, "rewards/rejected": -1.952539086341858, "step": 2870 }, { "epoch": 1.0843373493975903, "grad_norm": 31.970623565814254, "learning_rate": 7.290097891566265e-07, "logits/chosen": -2.319531202316284, "logits/rejected": -2.2890625, "logps/chosen": -304.8999938964844, "logps/rejected": -362.1499938964844, "loss": 0.1773, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6253296136856079, "rewards/margins": 2.959765672683716, "rewards/rejected": -2.336132764816284, "step": 2880 }, { "epoch": 1.0881024096385543, "grad_norm": 31.29900498111414, "learning_rate": 7.280685240963855e-07, "logits/chosen": -2.4007811546325684, "logits/rejected": -2.385937452316284, "logps/chosen": -318.54998779296875, "logps/rejected": -369.3999938964844, "loss": 0.2089, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.40108031034469604, "rewards/margins": 2.7265625, "rewards/rejected": -2.324414014816284, "step": 2890 }, { "epoch": 1.091867469879518, "grad_norm": 21.449632131580927, "learning_rate": 7.271272590361446e-07, "logits/chosen": -2.3355469703674316, "logits/rejected": -2.306640625, "logps/chosen": -333.0249938964844, "logps/rejected": -382.75, "loss": 0.186, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.285888671875, "rewards/margins": 2.973828077316284, "rewards/rejected": -2.689453125, "step": 2900 }, { "epoch": 1.095632530120482, "grad_norm": 44.02624954754674, "learning_rate": 7.261859939759037e-07, "logits/chosen": -2.417187452316284, "logits/rejected": -2.337109327316284, "logps/chosen": -300.29998779296875, "logps/rejected": -358.3999938964844, "loss": 0.1968, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3889404237270355, "rewards/margins": 2.717968702316284, "rewards/rejected": -2.327343702316284, "step": 2910 }, { "epoch": 1.0993975903614457, "grad_norm": 36.661311383105364, "learning_rate": 7.252447289156625e-07, "logits/chosen": -2.4554686546325684, "logits/rejected": -2.3812499046325684, "logps/chosen": -323.0, "logps/rejected": -373.95001220703125, "loss": 0.1762, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5566161870956421, "rewards/margins": 3.032031297683716, "rewards/rejected": -2.475781202316284, "step": 2920 }, { "epoch": 1.1031626506024097, "grad_norm": 25.578681679287435, "learning_rate": 7.243034638554216e-07, "logits/chosen": -2.3558592796325684, "logits/rejected": -2.2984375953674316, "logps/chosen": -288.42498779296875, "logps/rejected": -386.04998779296875, "loss": 0.183, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5545409917831421, "rewards/margins": 2.940624952316284, "rewards/rejected": -2.387500047683716, "step": 2930 }, { "epoch": 1.1069277108433735, "grad_norm": 60.81017845149102, "learning_rate": 7.233621987951807e-07, "logits/chosen": -2.342968702316284, "logits/rejected": -2.2984375953674316, "logps/chosen": -321.20001220703125, "logps/rejected": -401.3999938964844, "loss": 0.2471, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.608593761920929, "rewards/margins": 2.793750047683716, "rewards/rejected": -2.183789014816284, "step": 2940 }, { "epoch": 1.1106927710843373, "grad_norm": 34.57607113546994, "learning_rate": 7.224209337349398e-07, "logits/chosen": -2.376953125, "logits/rejected": -2.3031249046325684, "logps/chosen": -306.29998779296875, "logps/rejected": -391.45001220703125, "loss": 0.2327, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.690673828125, "rewards/margins": 2.662890672683716, "rewards/rejected": -1.9727051258087158, "step": 2950 }, { "epoch": 1.1144578313253013, "grad_norm": 24.782655469515376, "learning_rate": 7.214796686746987e-07, "logits/chosen": -2.3667969703674316, "logits/rejected": -2.354687452316284, "logps/chosen": -389.70001220703125, "logps/rejected": -417.29998779296875, "loss": 0.1563, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8321288824081421, "rewards/margins": 2.948046922683716, "rewards/rejected": -2.115039110183716, "step": 2960 }, { "epoch": 1.118222891566265, "grad_norm": 39.030169056218284, "learning_rate": 7.205384036144578e-07, "logits/chosen": -2.48046875, "logits/rejected": -2.3671875, "logps/chosen": -275.6499938964844, "logps/rejected": -363.54998779296875, "loss": 0.1817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7339843511581421, "rewards/margins": 2.8921875953674316, "rewards/rejected": -2.159374952316284, "step": 2970 }, { "epoch": 1.1219879518072289, "grad_norm": 39.81205753701399, "learning_rate": 7.195971385542169e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.469921827316284, "logps/chosen": -350.6000061035156, "logps/rejected": -378.3999938964844, "loss": 0.1634, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7852783203125, "rewards/margins": 2.668750047683716, "rewards/rejected": -1.88427734375, "step": 2980 }, { "epoch": 1.1257530120481927, "grad_norm": 58.622599181959636, "learning_rate": 7.186558734939759e-07, "logits/chosen": -2.428515672683716, "logits/rejected": -2.4242186546325684, "logps/chosen": -356.3999938964844, "logps/rejected": -413.45001220703125, "loss": 0.2159, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.33637696504592896, "rewards/margins": 2.8355469703674316, "rewards/rejected": -2.4976563453674316, "step": 2990 }, { "epoch": 1.1295180722891567, "grad_norm": 33.87837777148823, "learning_rate": 7.177146084337348e-07, "logits/chosen": -2.4546875953674316, "logits/rejected": -2.401171922683716, "logps/chosen": -306.25, "logps/rejected": -397.79998779296875, "loss": 0.1617, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.48264771699905396, "rewards/margins": 3.0355467796325684, "rewards/rejected": -2.5550780296325684, "step": 3000 }, { "epoch": 1.1332831325301205, "grad_norm": 23.702407210978446, "learning_rate": 7.167733433734939e-07, "logits/chosen": -2.419921875, "logits/rejected": -2.368359327316284, "logps/chosen": -385.3500061035156, "logps/rejected": -458.8500061035156, "loss": 0.1422, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01770629920065403, "rewards/margins": 3.055468797683716, "rewards/rejected": -3.0355467796325684, "step": 3010 }, { "epoch": 1.1370481927710843, "grad_norm": 45.32177666756968, "learning_rate": 7.15832078313253e-07, "logits/chosen": -2.3421874046325684, "logits/rejected": -2.3046875, "logps/chosen": -332.6499938964844, "logps/rejected": -383.45001220703125, "loss": 0.1988, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.14765015244483948, "rewards/margins": 2.720703125, "rewards/rejected": -2.572265625, "step": 3020 }, { "epoch": 1.1408132530120483, "grad_norm": 85.99955188507955, "learning_rate": 7.148908132530121e-07, "logits/chosen": -2.5390625, "logits/rejected": -2.499218702316284, "logps/chosen": -319.2749938964844, "logps/rejected": -363.1499938964844, "loss": 0.2556, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.12430725246667862, "rewards/margins": 2.676953077316284, "rewards/rejected": -2.55078125, "step": 3030 }, { "epoch": 1.144578313253012, "grad_norm": 57.56520443203874, "learning_rate": 7.13949548192771e-07, "logits/chosen": -2.368359327316284, "logits/rejected": -2.319531202316284, "logps/chosen": -368.3999938964844, "logps/rejected": -432.75, "loss": 0.1299, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19707031548023224, "rewards/margins": 3.466015577316284, "rewards/rejected": -3.265625, "step": 3040 }, { "epoch": 1.1483433734939759, "grad_norm": 69.51806557496123, "learning_rate": 7.130082831325301e-07, "logits/chosen": -2.371875047683716, "logits/rejected": -2.407031297683716, "logps/chosen": -364.25, "logps/rejected": -395.79998779296875, "loss": 0.1988, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.13143005967140198, "rewards/margins": 2.9019532203674316, "rewards/rejected": -2.771484375, "step": 3050 }, { "epoch": 1.1521084337349397, "grad_norm": 30.354355078149208, "learning_rate": 7.120670180722891e-07, "logits/chosen": -2.477343797683716, "logits/rejected": -2.469531297683716, "logps/chosen": -373.75, "logps/rejected": -406.5, "loss": 0.185, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.16314697265625, "rewards/margins": 2.860156297683716, "rewards/rejected": -2.6976561546325684, "step": 3060 }, { "epoch": 1.1558734939759037, "grad_norm": 67.2379955525595, "learning_rate": 7.111257530120482e-07, "logits/chosen": -2.4945311546325684, "logits/rejected": -2.399218797683716, "logps/chosen": -335.6000061035156, "logps/rejected": -404.1499938964844, "loss": 0.2098, "rewards/accuracies": 0.9375, "rewards/chosen": 0.31807249784469604, "rewards/margins": 3.001953125, "rewards/rejected": -2.68359375, "step": 3070 }, { "epoch": 1.1596385542168675, "grad_norm": 79.25274176395449, "learning_rate": 7.101844879518072e-07, "logits/chosen": -2.390625, "logits/rejected": -2.3765625953674316, "logps/chosen": -343.3500061035156, "logps/rejected": -380.3999938964844, "loss": 0.2189, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4354797303676605, "rewards/margins": 2.528125047683716, "rewards/rejected": -2.0927734375, "step": 3080 }, { "epoch": 1.1634036144578312, "grad_norm": 35.90806472870617, "learning_rate": 7.092432228915662e-07, "logits/chosen": -2.301953077316284, "logits/rejected": -2.272656202316284, "logps/chosen": -363.70001220703125, "logps/rejected": -374.54998779296875, "loss": 0.1708, "rewards/accuracies": 0.9375, "rewards/chosen": 0.662341296672821, "rewards/margins": 2.7691407203674316, "rewards/rejected": -2.107617139816284, "step": 3090 }, { "epoch": 1.1671686746987953, "grad_norm": 84.6348689426645, "learning_rate": 7.083019578313253e-07, "logits/chosen": -2.294140577316284, "logits/rejected": -2.2523436546325684, "logps/chosen": -304.3999938964844, "logps/rejected": -376.95001220703125, "loss": 0.2072, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.57958984375, "rewards/margins": 2.912890672683716, "rewards/rejected": -2.33203125, "step": 3100 }, { "epoch": 1.170933734939759, "grad_norm": 58.38671808463888, "learning_rate": 7.073606927710843e-07, "logits/chosen": -2.2953124046325684, "logits/rejected": -2.3441405296325684, "logps/chosen": -349.29998779296875, "logps/rejected": -401.1000061035156, "loss": 0.1933, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.538464367389679, "rewards/margins": 2.808789014816284, "rewards/rejected": -2.269726514816284, "step": 3110 }, { "epoch": 1.1746987951807228, "grad_norm": 18.281732142974263, "learning_rate": 7.064194277108434e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.366406202316284, "logps/chosen": -353.8999938964844, "logps/rejected": -404.75, "loss": 0.1546, "rewards/accuracies": 0.9375, "rewards/chosen": 0.42463380098342896, "rewards/margins": 3.020703077316284, "rewards/rejected": -2.59765625, "step": 3120 }, { "epoch": 1.1784638554216866, "grad_norm": 32.9694349209941, "learning_rate": 7.054781626506023e-07, "logits/chosen": -2.3695311546325684, "logits/rejected": -2.305859327316284, "logps/chosen": -345.8999938964844, "logps/rejected": -402.79998779296875, "loss": 0.1789, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.39002686738967896, "rewards/margins": 3.069531202316284, "rewards/rejected": -2.6796875, "step": 3130 }, { "epoch": 1.1822289156626506, "grad_norm": 99.42706274941935, "learning_rate": 7.045368975903614e-07, "logits/chosen": -2.276562452316284, "logits/rejected": -2.169140577316284, "logps/chosen": -333.79998779296875, "logps/rejected": -378.79998779296875, "loss": 0.1966, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4982360899448395, "rewards/margins": 2.825000047683716, "rewards/rejected": -2.326953172683716, "step": 3140 }, { "epoch": 1.1859939759036144, "grad_norm": 33.482445613081445, "learning_rate": 7.035956325301204e-07, "logits/chosen": -2.4312500953674316, "logits/rejected": -2.3675780296325684, "logps/chosen": -323.32501220703125, "logps/rejected": -369.1499938964844, "loss": 0.2099, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.3809814453125, "rewards/margins": 2.807812452316284, "rewards/rejected": -2.4261717796325684, "step": 3150 }, { "epoch": 1.1897590361445782, "grad_norm": 60.59748511612262, "learning_rate": 7.026543674698795e-07, "logits/chosen": -2.334765672683716, "logits/rejected": -2.2574219703674316, "logps/chosen": -334.5, "logps/rejected": -382.3999938964844, "loss": 0.1748, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.19656983017921448, "rewards/margins": 2.9242186546325684, "rewards/rejected": -2.729296922683716, "step": 3160 }, { "epoch": 1.1935240963855422, "grad_norm": 77.56104132205125, "learning_rate": 7.017131024096386e-07, "logits/chosen": -2.4175782203674316, "logits/rejected": -2.448437452316284, "logps/chosen": -291.8500061035156, "logps/rejected": -358.95001220703125, "loss": 0.2313, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.28046876192092896, "rewards/margins": 2.501171827316284, "rewards/rejected": -2.220703125, "step": 3170 }, { "epoch": 1.197289156626506, "grad_norm": 52.23230003844356, "learning_rate": 7.007718373493976e-07, "logits/chosen": -2.3882813453674316, "logits/rejected": -2.336718797683716, "logps/chosen": -325.8999938964844, "logps/rejected": -356.3500061035156, "loss": 0.1801, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05131225660443306, "rewards/margins": 2.8070311546325684, "rewards/rejected": -2.754687547683716, "step": 3180 }, { "epoch": 1.2010542168674698, "grad_norm": 61.173650061883414, "learning_rate": 6.998305722891565e-07, "logits/chosen": -2.514843702316284, "logits/rejected": -2.329296827316284, "logps/chosen": -302.8500061035156, "logps/rejected": -399.6000061035156, "loss": 0.1865, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.11018066108226776, "rewards/margins": 2.9203124046325684, "rewards/rejected": -3.0308594703674316, "step": 3190 }, { "epoch": 1.2048192771084336, "grad_norm": 74.63478712354278, "learning_rate": 6.988893072289156e-07, "logits/chosen": -2.373046875, "logits/rejected": -2.3804688453674316, "logps/chosen": -311.75, "logps/rejected": -377.1499938964844, "loss": 0.229, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.09440918266773224, "rewards/margins": 2.575000047683716, "rewards/rejected": -2.478515625, "step": 3200 }, { "epoch": 1.2085843373493976, "grad_norm": 44.150888480387735, "learning_rate": 6.979480421686747e-07, "logits/chosen": -2.428906202316284, "logits/rejected": -2.423828125, "logps/chosen": -361.25, "logps/rejected": -477.3999938964844, "loss": 0.1953, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0318603515625, "rewards/margins": 3.198046922683716, "rewards/rejected": -3.1664061546325684, "step": 3210 }, { "epoch": 1.2123493975903614, "grad_norm": 47.031385256751086, "learning_rate": 6.970067771084337e-07, "logits/chosen": -2.313281297683716, "logits/rejected": -2.287109375, "logps/chosen": -378.8500061035156, "logps/rejected": -451.20001220703125, "loss": 0.1569, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02018432691693306, "rewards/margins": 3.2972655296325684, "rewards/rejected": -3.2757811546325684, "step": 3220 }, { "epoch": 1.2161144578313252, "grad_norm": 52.79673048198422, "learning_rate": 6.960655120481927e-07, "logits/chosen": -2.375, "logits/rejected": -2.3687500953674316, "logps/chosen": -369.29998779296875, "logps/rejected": -403.8999938964844, "loss": 0.206, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04320373386144638, "rewards/margins": 2.797656297683716, "rewards/rejected": -2.755078077316284, "step": 3230 }, { "epoch": 1.2198795180722892, "grad_norm": 50.392870162766314, "learning_rate": 6.951242469879518e-07, "logits/chosen": -2.399218797683716, "logits/rejected": -2.322265625, "logps/chosen": -355.29998779296875, "logps/rejected": -412.04998779296875, "loss": 0.1836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.34569090604782104, "rewards/margins": 2.994140625, "rewards/rejected": -2.64794921875, "step": 3240 }, { "epoch": 1.223644578313253, "grad_norm": 40.52354468764439, "learning_rate": 6.941829819277109e-07, "logits/chosen": -2.34375, "logits/rejected": -2.3304686546325684, "logps/chosen": -365.70001220703125, "logps/rejected": -381.3500061035156, "loss": 0.1782, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.326019287109375, "rewards/margins": 2.689453125, "rewards/rejected": -2.360156297683716, "step": 3250 }, { "epoch": 1.2274096385542168, "grad_norm": 44.672696525303046, "learning_rate": 6.932417168674697e-07, "logits/chosen": -2.337109327316284, "logits/rejected": -2.3421874046325684, "logps/chosen": -358.6000061035156, "logps/rejected": -386.3500061035156, "loss": 0.1943, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.06267090141773224, "rewards/margins": 2.8140625953674316, "rewards/rejected": -2.754687547683716, "step": 3260 }, { "epoch": 1.2311746987951806, "grad_norm": 62.09341126024057, "learning_rate": 6.923004518072288e-07, "logits/chosen": -2.278515577316284, "logits/rejected": -2.305468797683716, "logps/chosen": -315.1000061035156, "logps/rejected": -363.5, "loss": 0.2135, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.09436645358800888, "rewards/margins": 2.6324219703674316, "rewards/rejected": -2.7265625, "step": 3270 }, { "epoch": 1.2349397590361446, "grad_norm": 42.829861719965706, "learning_rate": 6.913591867469879e-07, "logits/chosen": -2.375, "logits/rejected": -2.299999952316284, "logps/chosen": -335.79998779296875, "logps/rejected": -406.6499938964844, "loss": 0.1857, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.09133300930261612, "rewards/margins": 2.871875047683716, "rewards/rejected": -2.960156202316284, "step": 3280 }, { "epoch": 1.2387048192771084, "grad_norm": 38.0769820211466, "learning_rate": 6.90417921686747e-07, "logits/chosen": -2.3125, "logits/rejected": -2.3402342796325684, "logps/chosen": -361.95001220703125, "logps/rejected": -386.6000061035156, "loss": 0.1587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10731200873851776, "rewards/margins": 2.823046922683716, "rewards/rejected": -2.9312500953674316, "step": 3290 }, { "epoch": 1.2424698795180722, "grad_norm": 40.941768881493836, "learning_rate": 6.89476656626506e-07, "logits/chosen": -2.3453125953674316, "logits/rejected": -2.3636717796325684, "logps/chosen": -371.20001220703125, "logps/rejected": -434.45001220703125, "loss": 0.1646, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.13909301161766052, "rewards/margins": 3.137500047683716, "rewards/rejected": -3.27734375, "step": 3300 }, { "epoch": 1.2462349397590362, "grad_norm": 62.70935491397893, "learning_rate": 6.88535391566265e-07, "logits/chosen": -2.4609375, "logits/rejected": -2.4234375953674316, "logps/chosen": -299.3500061035156, "logps/rejected": -381.04998779296875, "loss": 0.1951, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.17241211235523224, "rewards/margins": 2.694531202316284, "rewards/rejected": -2.865234375, "step": 3310 }, { "epoch": 1.25, "grad_norm": 48.67999549641775, "learning_rate": 6.875941265060241e-07, "logits/chosen": -2.3257813453674316, "logits/rejected": -2.291015625, "logps/chosen": -348.6000061035156, "logps/rejected": -411.79998779296875, "loss": 0.208, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.08927001804113388, "rewards/margins": 2.8167967796325684, "rewards/rejected": -2.7281250953674316, "step": 3320 }, { "epoch": 1.2537650602409638, "grad_norm": 51.02552491319701, "learning_rate": 6.866528614457831e-07, "logits/chosen": -2.3082032203674316, "logits/rejected": -2.339062452316284, "logps/chosen": -321.54998779296875, "logps/rejected": -393.6000061035156, "loss": 0.2313, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2904296815395355, "rewards/margins": 2.8462891578674316, "rewards/rejected": -2.553515672683716, "step": 3330 }, { "epoch": 1.2575301204819276, "grad_norm": 58.37624694152142, "learning_rate": 6.857115963855421e-07, "logits/chosen": -2.344921827316284, "logits/rejected": -2.3578124046325684, "logps/chosen": -374.0, "logps/rejected": -440.3999938964844, "loss": 0.1258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5706787109375, "rewards/margins": 3.254687547683716, "rewards/rejected": -2.686328172683716, "step": 3340 }, { "epoch": 1.2612951807228916, "grad_norm": 103.87648489704237, "learning_rate": 6.847703313253011e-07, "logits/chosen": -2.2074217796325684, "logits/rejected": -2.236328125, "logps/chosen": -348.04998779296875, "logps/rejected": -411.25, "loss": 0.1706, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5145202875137329, "rewards/margins": 3.344531297683716, "rewards/rejected": -2.8265624046325684, "step": 3350 }, { "epoch": 1.2650602409638554, "grad_norm": 77.1003055054433, "learning_rate": 6.838290662650602e-07, "logits/chosen": -2.5023436546325684, "logits/rejected": -2.408984422683716, "logps/chosen": -294.95001220703125, "logps/rejected": -349.6499938964844, "loss": 0.2264, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.23416748642921448, "rewards/margins": 2.642578125, "rewards/rejected": -2.408984422683716, "step": 3360 }, { "epoch": 1.2688253012048194, "grad_norm": 44.72248443232538, "learning_rate": 6.828878012048193e-07, "logits/chosen": -2.408984422683716, "logits/rejected": -2.2562499046325684, "logps/chosen": -324.54998779296875, "logps/rejected": -394.0, "loss": 0.1888, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3092712461948395, "rewards/margins": 2.821484327316284, "rewards/rejected": -2.5152344703674316, "step": 3370 }, { "epoch": 1.2725903614457832, "grad_norm": 29.425346147269313, "learning_rate": 6.819465361445783e-07, "logits/chosen": -2.3402342796325684, "logits/rejected": -2.237499952316284, "logps/chosen": -312.8999938964844, "logps/rejected": -426.8500061035156, "loss": 0.1575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.461669921875, "rewards/margins": 3.0445313453674316, "rewards/rejected": -2.5833983421325684, "step": 3380 }, { "epoch": 1.276355421686747, "grad_norm": 32.3834931156063, "learning_rate": 6.810052710843374e-07, "logits/chosen": -2.2906250953674316, "logits/rejected": -2.272265672683716, "logps/chosen": -314.6000061035156, "logps/rejected": -371.95001220703125, "loss": 0.2249, "rewards/accuracies": 0.90625, "rewards/chosen": 0.550048828125, "rewards/margins": 2.6664061546325684, "rewards/rejected": -2.113964796066284, "step": 3390 }, { "epoch": 1.2801204819277108, "grad_norm": 42.07820330307824, "learning_rate": 6.800640060240963e-07, "logits/chosen": -2.371875047683716, "logits/rejected": -2.4300780296325684, "logps/chosen": -302.20001220703125, "logps/rejected": -355.5249938964844, "loss": 0.2242, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6962539553642273, "rewards/margins": 2.673828125, "rewards/rejected": -1.978051781654358, "step": 3400 }, { "epoch": 1.2838855421686746, "grad_norm": 36.82784035769058, "learning_rate": 6.791227409638553e-07, "logits/chosen": -2.369921922683716, "logits/rejected": -2.2769532203674316, "logps/chosen": -319.70001220703125, "logps/rejected": -385.04998779296875, "loss": 0.202, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.595996081829071, "rewards/margins": 2.825390577316284, "rewards/rejected": -2.2320313453674316, "step": 3410 }, { "epoch": 1.2876506024096386, "grad_norm": 63.73325738351356, "learning_rate": 6.781814759036144e-07, "logits/chosen": -2.36328125, "logits/rejected": -2.3421874046325684, "logps/chosen": -303.25, "logps/rejected": -379.8999938964844, "loss": 0.1908, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.608264148235321, "rewards/margins": 2.842578172683716, "rewards/rejected": -2.232421875, "step": 3420 }, { "epoch": 1.2914156626506024, "grad_norm": 60.80179463474883, "learning_rate": 6.772402108433735e-07, "logits/chosen": -2.326171875, "logits/rejected": -2.3277344703674316, "logps/chosen": -338.75, "logps/rejected": -378.79998779296875, "loss": 0.1935, "rewards/accuracies": 0.9375, "rewards/chosen": 0.739672839641571, "rewards/margins": 2.8480467796325684, "rewards/rejected": -2.107617139816284, "step": 3430 }, { "epoch": 1.2951807228915664, "grad_norm": 41.275052230434525, "learning_rate": 6.762989457831325e-07, "logits/chosen": -2.3882813453674316, "logits/rejected": -2.438281297683716, "logps/chosen": -386.45001220703125, "logps/rejected": -433.0, "loss": 0.1523, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8533691167831421, "rewards/margins": 3.030468702316284, "rewards/rejected": -2.1781249046325684, "step": 3440 }, { "epoch": 1.2989457831325302, "grad_norm": 34.18147651611178, "learning_rate": 6.753576807228915e-07, "logits/chosen": -2.3023438453674316, "logits/rejected": -2.3082032203674316, "logps/chosen": -391.6000061035156, "logps/rejected": -391.20001220703125, "loss": 0.1646, "rewards/accuracies": 0.96875, "rewards/chosen": 0.46928709745407104, "rewards/margins": 3.132031202316284, "rewards/rejected": -2.661914110183716, "step": 3450 }, { "epoch": 1.302710843373494, "grad_norm": 59.719250167523505, "learning_rate": 6.744164156626506e-07, "logits/chosen": -2.370312452316284, "logits/rejected": -2.326953172683716, "logps/chosen": -357.75, "logps/rejected": -417.6000061035156, "loss": 0.2173, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.44189453125, "rewards/margins": 2.885937452316284, "rewards/rejected": -2.4408202171325684, "step": 3460 }, { "epoch": 1.3064759036144578, "grad_norm": 43.8120844622312, "learning_rate": 6.734751506024096e-07, "logits/chosen": -2.3433594703674316, "logits/rejected": -2.3421874046325684, "logps/chosen": -314.1499938964844, "logps/rejected": -393.25, "loss": 0.1578, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.250784307718277, "rewards/margins": 3.0453124046325684, "rewards/rejected": -2.793750047683716, "step": 3470 }, { "epoch": 1.3102409638554218, "grad_norm": 29.93441952491013, "learning_rate": 6.725338855421686e-07, "logits/chosen": -2.297656297683716, "logits/rejected": -2.3843750953674316, "logps/chosen": -303.3999938964844, "logps/rejected": -369.70001220703125, "loss": 0.1809, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.27263182401657104, "rewards/margins": 2.788281202316284, "rewards/rejected": -2.5171875953674316, "step": 3480 }, { "epoch": 1.3140060240963856, "grad_norm": 72.9427350553484, "learning_rate": 6.715926204819276e-07, "logits/chosen": -2.432421922683716, "logits/rejected": -2.38671875, "logps/chosen": -351.8999938964844, "logps/rejected": -438.04998779296875, "loss": 0.1848, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02962646447122097, "rewards/margins": 3.0707030296325684, "rewards/rejected": -3.0406250953674316, "step": 3490 }, { "epoch": 1.3177710843373494, "grad_norm": 57.96414058158063, "learning_rate": 6.706513554216867e-07, "logits/chosen": -2.4140625, "logits/rejected": -2.3929686546325684, "logps/chosen": -392.75, "logps/rejected": -453.54998779296875, "loss": 0.1872, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.22613525390625, "rewards/margins": 3.075000047683716, "rewards/rejected": -2.846484422683716, "step": 3500 }, { "epoch": 1.3215361445783134, "grad_norm": 61.50186200976051, "learning_rate": 6.697100903614458e-07, "logits/chosen": -2.371875047683716, "logits/rejected": -2.3160157203674316, "logps/chosen": -382.6499938964844, "logps/rejected": -416.3500061035156, "loss": 0.1882, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.17720337212085724, "rewards/margins": 2.9234375953674316, "rewards/rejected": -2.7457032203674316, "step": 3510 }, { "epoch": 1.3253012048192772, "grad_norm": 65.61868348402031, "learning_rate": 6.687688253012049e-07, "logits/chosen": -2.3726563453674316, "logits/rejected": -2.3050780296325684, "logps/chosen": -317.25, "logps/rejected": -332.79998779296875, "loss": 0.2117, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.2698913514614105, "rewards/margins": 2.565234422683716, "rewards/rejected": -2.29296875, "step": 3520 }, { "epoch": 1.329066265060241, "grad_norm": 53.559532670369585, "learning_rate": 6.678275602409638e-07, "logits/chosen": -2.3265624046325684, "logits/rejected": -2.301953077316284, "logps/chosen": -323.70001220703125, "logps/rejected": -400.79998779296875, "loss": 0.2198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.35627442598342896, "rewards/margins": 2.9287109375, "rewards/rejected": -2.573437452316284, "step": 3530 }, { "epoch": 1.3328313253012047, "grad_norm": 68.26697400198076, "learning_rate": 6.668862951807228e-07, "logits/chosen": -2.1917967796325684, "logits/rejected": -2.217968702316284, "logps/chosen": -366.3500061035156, "logps/rejected": -407.0, "loss": 0.1982, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.527148425579071, "rewards/margins": 2.9664063453674316, "rewards/rejected": -2.4349608421325684, "step": 3540 }, { "epoch": 1.3365963855421688, "grad_norm": 44.791315730536226, "learning_rate": 6.659450301204819e-07, "logits/chosen": -2.2621092796325684, "logits/rejected": -2.280078172683716, "logps/chosen": -327.04998779296875, "logps/rejected": -388.75, "loss": 0.1713, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5293213129043579, "rewards/margins": 2.903515577316284, "rewards/rejected": -2.376171827316284, "step": 3550 }, { "epoch": 1.3403614457831325, "grad_norm": 29.679616171544502, "learning_rate": 6.65003765060241e-07, "logits/chosen": -2.341015577316284, "logits/rejected": -2.374218702316284, "logps/chosen": -319.45001220703125, "logps/rejected": -373.3999938964844, "loss": 0.1999, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4040893614292145, "rewards/margins": 2.9554686546325684, "rewards/rejected": -2.5511717796325684, "step": 3560 }, { "epoch": 1.3441265060240963, "grad_norm": 36.369796803747164, "learning_rate": 6.640624999999999e-07, "logits/chosen": -2.337890625, "logits/rejected": -2.391796827316284, "logps/chosen": -337.1499938964844, "logps/rejected": -362.6499938964844, "loss": 0.2401, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.3198486268520355, "rewards/margins": 3.014453172683716, "rewards/rejected": -2.6962890625, "step": 3570 }, { "epoch": 1.3478915662650603, "grad_norm": 31.832081642517096, "learning_rate": 6.63121234939759e-07, "logits/chosen": -2.333984375, "logits/rejected": -2.258984327316284, "logps/chosen": -348.45001220703125, "logps/rejected": -404.95001220703125, "loss": 0.1957, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.21492919325828552, "rewards/margins": 3.021484375, "rewards/rejected": -2.802734375, "step": 3580 }, { "epoch": 1.3516566265060241, "grad_norm": 27.77362324381603, "learning_rate": 6.621799698795181e-07, "logits/chosen": -2.481250047683716, "logits/rejected": -2.43359375, "logps/chosen": -347.20001220703125, "logps/rejected": -426.04998779296875, "loss": 0.1663, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15952149033546448, "rewards/margins": 2.987499952316284, "rewards/rejected": -2.826171875, "step": 3590 }, { "epoch": 1.355421686746988, "grad_norm": 54.10578373539966, "learning_rate": 6.612387048192771e-07, "logits/chosen": -2.291015625, "logits/rejected": -2.2777342796325684, "logps/chosen": -318.45001220703125, "logps/rejected": -404.45001220703125, "loss": 0.2035, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04522094875574112, "rewards/margins": 3.1058592796325684, "rewards/rejected": -3.0625, "step": 3600 }, { "epoch": 1.3591867469879517, "grad_norm": 37.8349596083777, "learning_rate": 6.60297439759036e-07, "logits/chosen": -2.4507813453674316, "logits/rejected": -2.379687547683716, "logps/chosen": -351.3999938964844, "logps/rejected": -414.8999938964844, "loss": 0.1981, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.076171875, "rewards/margins": 3.018749952316284, "rewards/rejected": -2.9404296875, "step": 3610 }, { "epoch": 1.3629518072289157, "grad_norm": 32.8388499507501, "learning_rate": 6.593561746987951e-07, "logits/chosen": -2.36328125, "logits/rejected": -2.294921875, "logps/chosen": -352.04998779296875, "logps/rejected": -423.0, "loss": 0.2145, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.14450684189796448, "rewards/margins": 2.815234422683716, "rewards/rejected": -2.6714844703674316, "step": 3620 }, { "epoch": 1.3667168674698795, "grad_norm": 53.867253140546815, "learning_rate": 6.584149096385542e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.400390625, "logps/chosen": -327.5, "logps/rejected": -399.04998779296875, "loss": 0.1788, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2857116758823395, "rewards/margins": 3.1796875, "rewards/rejected": -2.8929686546325684, "step": 3630 }, { "epoch": 1.3704819277108433, "grad_norm": 47.739543091007306, "learning_rate": 6.574736445783132e-07, "logits/chosen": -2.395703077316284, "logits/rejected": -2.374218702316284, "logps/chosen": -368.2749938964844, "logps/rejected": -414.6000061035156, "loss": 0.1756, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.27650147676467896, "rewards/margins": 2.62109375, "rewards/rejected": -2.34375, "step": 3640 }, { "epoch": 1.3742469879518073, "grad_norm": 41.49719774201439, "learning_rate": 6.565323795180723e-07, "logits/chosen": -2.375, "logits/rejected": -2.2699217796325684, "logps/chosen": -335.07501220703125, "logps/rejected": -415.8500061035156, "loss": 0.1821, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3193603456020355, "rewards/margins": 3.115234375, "rewards/rejected": -2.795703172683716, "step": 3650 }, { "epoch": 1.3780120481927711, "grad_norm": 23.552486439729932, "learning_rate": 6.555911144578313e-07, "logits/chosen": -2.44921875, "logits/rejected": -2.466796875, "logps/chosen": -343.8999938964844, "logps/rejected": -402.3999938964844, "loss": 0.2044, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.526928722858429, "rewards/margins": 2.823437452316284, "rewards/rejected": -2.2984375953674316, "step": 3660 }, { "epoch": 1.381777108433735, "grad_norm": 51.56441058363433, "learning_rate": 6.546498493975904e-07, "logits/chosen": -2.395312547683716, "logits/rejected": -2.358593702316284, "logps/chosen": -358.92498779296875, "logps/rejected": -414.75, "loss": 0.2115, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.418121337890625, "rewards/margins": 3.009765625, "rewards/rejected": -2.590039014816284, "step": 3670 }, { "epoch": 1.3855421686746987, "grad_norm": 50.875710842702965, "learning_rate": 6.537085843373493e-07, "logits/chosen": -2.4765625, "logits/rejected": -2.403125047683716, "logps/chosen": -310.04998779296875, "logps/rejected": -385.25, "loss": 0.2047, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.23190918564796448, "rewards/margins": 2.9007811546325684, "rewards/rejected": -2.66796875, "step": 3680 }, { "epoch": 1.3893072289156627, "grad_norm": 38.86330028706987, "learning_rate": 6.527673192771084e-07, "logits/chosen": -2.4175782203674316, "logits/rejected": -2.283984422683716, "logps/chosen": -320.0, "logps/rejected": -376.45001220703125, "loss": 0.1847, "rewards/accuracies": 0.9375, "rewards/chosen": 0.257537841796875, "rewards/margins": 2.760937452316284, "rewards/rejected": -2.5042967796325684, "step": 3690 }, { "epoch": 1.3930722891566265, "grad_norm": 49.03371884995939, "learning_rate": 6.518260542168674e-07, "logits/chosen": -2.369140625, "logits/rejected": -2.4632811546325684, "logps/chosen": -394.70001220703125, "logps/rejected": -424.3999938964844, "loss": 0.1868, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.31474608182907104, "rewards/margins": 2.9996094703674316, "rewards/rejected": -2.682812452316284, "step": 3700 }, { "epoch": 1.3968373493975903, "grad_norm": 26.002788544467975, "learning_rate": 6.508847891566265e-07, "logits/chosen": -2.4808592796325684, "logits/rejected": -2.318359375, "logps/chosen": -302.04998779296875, "logps/rejected": -388.29998779296875, "loss": 0.1797, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.19121094048023224, "rewards/margins": 2.8890624046325684, "rewards/rejected": -2.6968750953674316, "step": 3710 }, { "epoch": 1.4006024096385543, "grad_norm": 40.963318911975605, "learning_rate": 6.499435240963855e-07, "logits/chosen": -2.4859375953674316, "logits/rejected": -2.4351563453674316, "logps/chosen": -398.29998779296875, "logps/rejected": -439.45001220703125, "loss": 0.1386, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.01888427697122097, "rewards/margins": 3.2320313453674316, "rewards/rejected": -3.253124952316284, "step": 3720 }, { "epoch": 1.404367469879518, "grad_norm": 51.01758385944105, "learning_rate": 6.490022590361446e-07, "logits/chosen": -2.411328077316284, "logits/rejected": -2.4398436546325684, "logps/chosen": -371.1499938964844, "logps/rejected": -432.5, "loss": 0.1876, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.03703613206744194, "rewards/margins": 2.978515625, "rewards/rejected": -3.014453172683716, "step": 3730 }, { "epoch": 1.408132530120482, "grad_norm": 31.583499316751166, "learning_rate": 6.480609939759037e-07, "logits/chosen": -2.391406297683716, "logits/rejected": -2.367968797683716, "logps/chosen": -299.04998779296875, "logps/rejected": -403.70001220703125, "loss": 0.1799, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.08514709770679474, "rewards/margins": 3.1292967796325684, "rewards/rejected": -3.044921875, "step": 3740 }, { "epoch": 1.4118975903614457, "grad_norm": 39.0112035443209, "learning_rate": 6.471197289156625e-07, "logits/chosen": -2.3812499046325684, "logits/rejected": -2.4039063453674316, "logps/chosen": -326.95001220703125, "logps/rejected": -374.3500061035156, "loss": 0.1865, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16253051161766052, "rewards/margins": 2.9000000953674316, "rewards/rejected": -2.740234375, "step": 3750 }, { "epoch": 1.4156626506024097, "grad_norm": 33.24860106584229, "learning_rate": 6.461784638554216e-07, "logits/chosen": -2.380859375, "logits/rejected": -2.4136719703674316, "logps/chosen": -354.45001220703125, "logps/rejected": -419.5, "loss": 0.1727, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.124603271484375, "rewards/margins": 3.1949219703674316, "rewards/rejected": -3.0726561546325684, "step": 3760 }, { "epoch": 1.4194277108433735, "grad_norm": 38.45569560051199, "learning_rate": 6.452371987951807e-07, "logits/chosen": -2.3382811546325684, "logits/rejected": -2.3765625953674316, "logps/chosen": -331.70001220703125, "logps/rejected": -417.5, "loss": 0.1836, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.19475403428077698, "rewards/margins": 3.0023436546325684, "rewards/rejected": -2.8070311546325684, "step": 3770 }, { "epoch": 1.4231927710843373, "grad_norm": 49.20281241170397, "learning_rate": 6.442959337349398e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.388671875, "logps/chosen": -312.45001220703125, "logps/rejected": -375.75, "loss": 0.2053, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4162353575229645, "rewards/margins": 2.765625, "rewards/rejected": -2.344921827316284, "step": 3780 }, { "epoch": 1.4269578313253013, "grad_norm": 46.20263102277942, "learning_rate": 6.433546686746987e-07, "logits/chosen": -2.506640672683716, "logits/rejected": -2.4644532203674316, "logps/chosen": -355.1000061035156, "logps/rejected": -421.0, "loss": 0.1922, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.504931628704071, "rewards/margins": 2.8453125953674316, "rewards/rejected": -2.3414063453674316, "step": 3790 }, { "epoch": 1.430722891566265, "grad_norm": 36.46348389900767, "learning_rate": 6.424134036144578e-07, "logits/chosen": -2.206249952316284, "logits/rejected": -2.242968797683716, "logps/chosen": -326.75, "logps/rejected": -401.1499938964844, "loss": 0.2131, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.506762683391571, "rewards/margins": 2.850781202316284, "rewards/rejected": -2.3460936546325684, "step": 3800 }, { "epoch": 1.4344879518072289, "grad_norm": 64.19950754465484, "learning_rate": 6.414721385542169e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.3890624046325684, "logps/chosen": -331.95001220703125, "logps/rejected": -347.04998779296875, "loss": 0.2194, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5608123540878296, "rewards/margins": 2.666015625, "rewards/rejected": -2.1041016578674316, "step": 3810 }, { "epoch": 1.4382530120481927, "grad_norm": 49.896886855085015, "learning_rate": 6.405308734939759e-07, "logits/chosen": -2.3671875, "logits/rejected": -2.4175782203674316, "logps/chosen": -394.20001220703125, "logps/rejected": -420.5, "loss": 0.1813, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.6954590082168579, "rewards/margins": 3.081249952316284, "rewards/rejected": -2.388671875, "step": 3820 }, { "epoch": 1.4420180722891567, "grad_norm": 58.58606917648683, "learning_rate": 6.395896084337348e-07, "logits/chosen": -2.436328172683716, "logits/rejected": -2.353515625, "logps/chosen": -322.1000061035156, "logps/rejected": -396.0, "loss": 0.2128, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.37791138887405396, "rewards/margins": 2.6175780296325684, "rewards/rejected": -2.2378907203674316, "step": 3830 }, { "epoch": 1.4457831325301205, "grad_norm": 27.893077852343826, "learning_rate": 6.386483433734939e-07, "logits/chosen": -2.3609375953674316, "logits/rejected": -2.313671827316284, "logps/chosen": -312.1000061035156, "logps/rejected": -386.6499938964844, "loss": 0.1554, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3687988221645355, "rewards/margins": 3.099609375, "rewards/rejected": -2.731640577316284, "step": 3840 }, { "epoch": 1.4495481927710843, "grad_norm": 46.666461581691884, "learning_rate": 6.37707078313253e-07, "logits/chosen": -2.4921875, "logits/rejected": -2.426562547683716, "logps/chosen": -331.1000061035156, "logps/rejected": -441.5, "loss": 0.1765, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.08131103217601776, "rewards/margins": 3.1695313453674316, "rewards/rejected": -3.0882811546325684, "step": 3850 }, { "epoch": 1.4533132530120483, "grad_norm": 38.079994979004454, "learning_rate": 6.367658132530121e-07, "logits/chosen": -2.366406202316284, "logits/rejected": -2.461718797683716, "logps/chosen": -363.6499938964844, "logps/rejected": -410.54998779296875, "loss": 0.1799, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12832030653953552, "rewards/margins": 2.989062547683716, "rewards/rejected": -2.8597655296325684, "step": 3860 }, { "epoch": 1.457078313253012, "grad_norm": 46.07746140012447, "learning_rate": 6.358245481927711e-07, "logits/chosen": -2.5015625953674316, "logits/rejected": -2.5367188453674316, "logps/chosen": -375.70001220703125, "logps/rejected": -389.5, "loss": 0.1818, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.1136474609375, "rewards/margins": 3.067578077316284, "rewards/rejected": -2.951953172683716, "step": 3870 }, { "epoch": 1.4608433734939759, "grad_norm": 35.7663426151683, "learning_rate": 6.348832831325301e-07, "logits/chosen": -2.6304688453674316, "logits/rejected": -2.578906297683716, "logps/chosen": -287.54998779296875, "logps/rejected": -369.0, "loss": 0.2113, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0253753662109375, "rewards/margins": 2.715625047683716, "rewards/rejected": -2.69140625, "step": 3880 }, { "epoch": 1.4646084337349397, "grad_norm": 95.63871788888223, "learning_rate": 6.339420180722891e-07, "logits/chosen": -2.487109422683716, "logits/rejected": -2.45703125, "logps/chosen": -329.29998779296875, "logps/rejected": -397.1499938964844, "loss": 0.2102, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0550537109375, "rewards/margins": 2.8882813453674316, "rewards/rejected": -2.8355469703674316, "step": 3890 }, { "epoch": 1.4683734939759037, "grad_norm": 60.57735484725855, "learning_rate": 6.330007530120481e-07, "logits/chosen": -2.446093797683716, "logits/rejected": -2.3843750953674316, "logps/chosen": -365.1499938964844, "logps/rejected": -446.79998779296875, "loss": 0.1924, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.22150573134422302, "rewards/margins": 3.232421875, "rewards/rejected": -3.457812547683716, "step": 3900 }, { "epoch": 1.4721385542168675, "grad_norm": 45.93976767184252, "learning_rate": 6.320594879518072e-07, "logits/chosen": -2.458984375, "logits/rejected": -2.4175782203674316, "logps/chosen": -332.1000061035156, "logps/rejected": -395.8999938964844, "loss": 0.1505, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10057983547449112, "rewards/margins": 3.1167969703674316, "rewards/rejected": -3.017578125, "step": 3910 }, { "epoch": 1.4759036144578312, "grad_norm": 67.29543549940092, "learning_rate": 6.311182228915662e-07, "logits/chosen": -2.5425782203674316, "logits/rejected": -2.5699219703674316, "logps/chosen": -369.8500061035156, "logps/rejected": -446.5, "loss": 0.1453, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.14327391982078552, "rewards/margins": 3.23046875, "rewards/rejected": -3.084765672683716, "step": 3920 }, { "epoch": 1.4796686746987953, "grad_norm": 40.21715279975144, "learning_rate": 6.301769578313253e-07, "logits/chosen": -2.555468797683716, "logits/rejected": -2.55078125, "logps/chosen": -324.8500061035156, "logps/rejected": -344.1000061035156, "loss": 0.199, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.14621582627296448, "rewards/margins": 2.760937452316284, "rewards/rejected": -2.61328125, "step": 3930 }, { "epoch": 1.483433734939759, "grad_norm": 43.791531978291374, "learning_rate": 6.292356927710843e-07, "logits/chosen": -2.491406202316284, "logits/rejected": -2.46875, "logps/chosen": -329.25, "logps/rejected": -407.0, "loss": 0.16, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.330078125, "rewards/margins": 3.3492188453674316, "rewards/rejected": -3.021484375, "step": 3940 }, { "epoch": 1.4871987951807228, "grad_norm": 46.81747342319922, "learning_rate": 6.282944277108434e-07, "logits/chosen": -2.5003905296325684, "logits/rejected": -2.4000000953674316, "logps/chosen": -331.5, "logps/rejected": -378.95001220703125, "loss": 0.1942, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5872558355331421, "rewards/margins": 2.7417969703674316, "rewards/rejected": -2.155566453933716, "step": 3950 }, { "epoch": 1.4909638554216866, "grad_norm": 35.63731916978665, "learning_rate": 6.273531626506024e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.4097657203674316, "logps/chosen": -321.6000061035156, "logps/rejected": -439.6499938964844, "loss": 0.1752, "rewards/accuracies": 0.9375, "rewards/chosen": 0.46630859375, "rewards/margins": 3.125, "rewards/rejected": -2.66015625, "step": 3960 }, { "epoch": 1.4947289156626506, "grad_norm": 47.26494681061979, "learning_rate": 6.264118975903614e-07, "logits/chosen": -2.442578077316284, "logits/rejected": -2.4507813453674316, "logps/chosen": -326.20001220703125, "logps/rejected": -378.45001220703125, "loss": 0.2068, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.5783630609512329, "rewards/margins": 2.8902344703674316, "rewards/rejected": -2.3140625953674316, "step": 3970 }, { "epoch": 1.4984939759036144, "grad_norm": 49.67367180175573, "learning_rate": 6.254706325301204e-07, "logits/chosen": -2.4898438453674316, "logits/rejected": -2.465625047683716, "logps/chosen": -337.1499938964844, "logps/rejected": -396.0, "loss": 0.2081, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8066161870956421, "rewards/margins": 2.753124952316284, "rewards/rejected": -1.9456055164337158, "step": 3980 }, { "epoch": 1.5022590361445785, "grad_norm": 47.75232448098017, "learning_rate": 6.245293674698795e-07, "logits/chosen": -2.4398436546325684, "logits/rejected": -2.408203125, "logps/chosen": -362.75, "logps/rejected": -411.29998779296875, "loss": 0.1928, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.578125, "rewards/margins": 3.248828172683716, "rewards/rejected": -2.66845703125, "step": 3990 }, { "epoch": 1.5060240963855422, "grad_norm": 49.518580464538715, "learning_rate": 6.235881024096386e-07, "logits/chosen": -2.491406202316284, "logits/rejected": -2.362109422683716, "logps/chosen": -330.6499938964844, "logps/rejected": -414.8500061035156, "loss": 0.1617, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5625976324081421, "rewards/margins": 2.99609375, "rewards/rejected": -2.43359375, "step": 4000 }, { "epoch": 1.509789156626506, "grad_norm": 35.035417239779186, "learning_rate": 6.226468373493976e-07, "logits/chosen": -2.528125047683716, "logits/rejected": -2.465625047683716, "logps/chosen": -343.04998779296875, "logps/rejected": -399.54998779296875, "loss": 0.1848, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4444946348667145, "rewards/margins": 2.905078172683716, "rewards/rejected": -2.461718797683716, "step": 4010 }, { "epoch": 1.5135542168674698, "grad_norm": 68.64065731258175, "learning_rate": 6.217055722891565e-07, "logits/chosen": -2.3753905296325684, "logits/rejected": -2.3863282203674316, "logps/chosen": -332.2250061035156, "logps/rejected": -383.45001220703125, "loss": 0.198, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3897705078125, "rewards/margins": 2.951953172683716, "rewards/rejected": -2.5624022483825684, "step": 4020 }, { "epoch": 1.5173192771084336, "grad_norm": 58.13498516349828, "learning_rate": 6.207643072289156e-07, "logits/chosen": -2.428906202316284, "logits/rejected": -2.410937547683716, "logps/chosen": -331.54998779296875, "logps/rejected": -383.8999938964844, "loss": 0.1796, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.650073230266571, "rewards/margins": 3.0667967796325684, "rewards/rejected": -2.416210889816284, "step": 4030 }, { "epoch": 1.5210843373493976, "grad_norm": 36.20699429526076, "learning_rate": 6.198230421686747e-07, "logits/chosen": -2.4039063453674316, "logits/rejected": -2.432812452316284, "logps/chosen": -342.42498779296875, "logps/rejected": -392.6000061035156, "loss": 0.1655, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.699414074420929, "rewards/margins": 3.085156202316284, "rewards/rejected": -2.3828125, "step": 4040 }, { "epoch": 1.5248493975903614, "grad_norm": 36.038673782913406, "learning_rate": 6.188817771084338e-07, "logits/chosen": -2.4585938453674316, "logits/rejected": -2.462890625, "logps/chosen": -393.29998779296875, "logps/rejected": -418.04998779296875, "loss": 0.1475, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.43626707792282104, "rewards/margins": 3.1976561546325684, "rewards/rejected": -2.762500047683716, "step": 4050 }, { "epoch": 1.5286144578313254, "grad_norm": 50.07294291487463, "learning_rate": 6.179405120481927e-07, "logits/chosen": -2.446093797683716, "logits/rejected": -2.450000047683716, "logps/chosen": -342.20001220703125, "logps/rejected": -406.75, "loss": 0.1482, "rewards/accuracies": 0.96875, "rewards/chosen": 0.481201171875, "rewards/margins": 3.001953125, "rewards/rejected": -2.518749952316284, "step": 4060 }, { "epoch": 1.5323795180722892, "grad_norm": 67.17382204743184, "learning_rate": 6.169992469879518e-07, "logits/chosen": -2.4339842796325684, "logits/rejected": -2.39453125, "logps/chosen": -380.0, "logps/rejected": -419.0, "loss": 0.1688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.1068115234375, "rewards/margins": 3.0933594703674316, "rewards/rejected": -2.989062547683716, "step": 4070 }, { "epoch": 1.536144578313253, "grad_norm": 25.790451967591668, "learning_rate": 6.160579819277109e-07, "logits/chosen": -2.375781297683716, "logits/rejected": -2.3472657203674316, "logps/chosen": -321.1499938964844, "logps/rejected": -431.0, "loss": 0.1832, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.13831177353858948, "rewards/margins": 3.240234375, "rewards/rejected": -3.1031250953674316, "step": 4080 }, { "epoch": 1.5399096385542168, "grad_norm": 36.52031957108004, "learning_rate": 6.151167168674698e-07, "logits/chosen": -2.236328125, "logits/rejected": -2.2867188453674316, "logps/chosen": -277.875, "logps/rejected": -348.1000061035156, "loss": 0.1901, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.45463865995407104, "rewards/margins": 2.837890625, "rewards/rejected": -2.3824219703674316, "step": 4090 }, { "epoch": 1.5436746987951806, "grad_norm": 52.89034952274142, "learning_rate": 6.141754518072288e-07, "logits/chosen": -2.3671875, "logits/rejected": -2.401171922683716, "logps/chosen": -381.3500061035156, "logps/rejected": -436.8999938964844, "loss": 0.1684, "rewards/accuracies": 0.9375, "rewards/chosen": 0.44263917207717896, "rewards/margins": 3.12890625, "rewards/rejected": -2.6875, "step": 4100 }, { "epoch": 1.5474397590361446, "grad_norm": 57.37359254112129, "learning_rate": 6.132341867469879e-07, "logits/chosen": -2.362499952316284, "logits/rejected": -2.332812547683716, "logps/chosen": -356.6499938964844, "logps/rejected": -408.79998779296875, "loss": 0.1785, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.3725219666957855, "rewards/margins": 3.1617188453674316, "rewards/rejected": -2.790234327316284, "step": 4110 }, { "epoch": 1.5512048192771084, "grad_norm": 39.221526281838926, "learning_rate": 6.12292921686747e-07, "logits/chosen": -2.401562452316284, "logits/rejected": -2.3695311546325684, "logps/chosen": -342.0, "logps/rejected": -399.5, "loss": 0.181, "rewards/accuracies": 0.9375, "rewards/chosen": 0.29349976778030396, "rewards/margins": 2.9722657203674316, "rewards/rejected": -2.677734375, "step": 4120 }, { "epoch": 1.5549698795180724, "grad_norm": 43.36537523641982, "learning_rate": 6.11351656626506e-07, "logits/chosen": -2.500781297683716, "logits/rejected": -2.345703125, "logps/chosen": -331.1499938964844, "logps/rejected": -378.04998779296875, "loss": 0.1769, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.23917236924171448, "rewards/margins": 3.2339844703674316, "rewards/rejected": -2.993359327316284, "step": 4130 }, { "epoch": 1.5587349397590362, "grad_norm": 54.80631380351077, "learning_rate": 6.10410391566265e-07, "logits/chosen": -2.2874999046325684, "logits/rejected": -2.3218750953674316, "logps/chosen": -307.8500061035156, "logps/rejected": -368.04998779296875, "loss": 0.2099, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.09589843451976776, "rewards/margins": 2.9921875, "rewards/rejected": -2.897656202316284, "step": 4140 }, { "epoch": 1.5625, "grad_norm": 70.18505691872971, "learning_rate": 6.094691265060241e-07, "logits/chosen": -2.3609375953674316, "logits/rejected": -2.339062452316284, "logps/chosen": -330.79998779296875, "logps/rejected": -417.8999938964844, "loss": 0.1855, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.15239563584327698, "rewards/margins": 3.075000047683716, "rewards/rejected": -2.924609422683716, "step": 4150 }, { "epoch": 1.5662650602409638, "grad_norm": 66.9782161678782, "learning_rate": 6.085278614457831e-07, "logits/chosen": -2.3890624046325684, "logits/rejected": -2.2953124046325684, "logps/chosen": -352.6499938964844, "logps/rejected": -416.29998779296875, "loss": 0.2103, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.21394042670726776, "rewards/margins": 3.133593797683716, "rewards/rejected": -2.91796875, "step": 4160 }, { "epoch": 1.5700301204819276, "grad_norm": 79.92102407057332, "learning_rate": 6.075865963855421e-07, "logits/chosen": -2.4664063453674316, "logits/rejected": -2.4476561546325684, "logps/chosen": -364.6000061035156, "logps/rejected": -407.8500061035156, "loss": 0.2036, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.10620727390050888, "rewards/margins": 2.875, "rewards/rejected": -2.7671875953674316, "step": 4170 }, { "epoch": 1.5737951807228916, "grad_norm": 44.77745886357275, "learning_rate": 6.066453313253012e-07, "logits/chosen": -2.327343702316284, "logits/rejected": -2.328125, "logps/chosen": -385.6000061035156, "logps/rejected": -415.3999938964844, "loss": 0.1319, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1793212890625, "rewards/margins": 3.5269532203674316, "rewards/rejected": -3.3519530296325684, "step": 4180 }, { "epoch": 1.5775602409638554, "grad_norm": 41.45898383369551, "learning_rate": 6.057040662650602e-07, "logits/chosen": -2.3421874046325684, "logits/rejected": -2.325390577316284, "logps/chosen": -331.3500061035156, "logps/rejected": -378.6499938964844, "loss": 0.1396, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24432373046875, "rewards/margins": 3.0703125, "rewards/rejected": -2.827343702316284, "step": 4190 }, { "epoch": 1.5813253012048194, "grad_norm": 24.30664746646458, "learning_rate": 6.047628012048193e-07, "logits/chosen": -2.311328172683716, "logits/rejected": -2.3179688453674316, "logps/chosen": -342.04998779296875, "logps/rejected": -404.1499938964844, "loss": 0.184, "rewards/accuracies": 0.9375, "rewards/chosen": -0.007861328311264515, "rewards/margins": 3.1265625953674316, "rewards/rejected": -3.1328125, "step": 4200 }, { "epoch": 1.5850903614457832, "grad_norm": 45.471145974341155, "learning_rate": 6.038215361445783e-07, "logits/chosen": -2.4457030296325684, "logits/rejected": -2.4273438453674316, "logps/chosen": -331.79998779296875, "logps/rejected": -406.5, "loss": 0.1811, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.2086181640625, "rewards/margins": 3.1539063453674316, "rewards/rejected": -2.946093797683716, "step": 4210 }, { "epoch": 1.588855421686747, "grad_norm": 35.62866881180573, "learning_rate": 6.028802710843374e-07, "logits/chosen": -2.5269532203674316, "logits/rejected": -2.5726561546325684, "logps/chosen": -375.25, "logps/rejected": -419.79998779296875, "loss": 0.2508, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.2797897458076477, "rewards/margins": 2.865234375, "rewards/rejected": -3.143749952316284, "step": 4220 }, { "epoch": 1.5926204819277108, "grad_norm": 55.22929822071815, "learning_rate": 6.019390060240963e-07, "logits/chosen": -2.444531202316284, "logits/rejected": -2.3773436546325684, "logps/chosen": -336.04998779296875, "logps/rejected": -410.20001220703125, "loss": 0.1694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4345703125, "rewards/margins": 3.1171875, "rewards/rejected": -3.55078125, "step": 4230 }, { "epoch": 1.5963855421686746, "grad_norm": 69.15270892857252, "learning_rate": 6.009977409638553e-07, "logits/chosen": -2.5015625953674316, "logits/rejected": -2.48046875, "logps/chosen": -301.25, "logps/rejected": -413.3999938964844, "loss": 0.2503, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.353912353515625, "rewards/margins": 3.02734375, "rewards/rejected": -3.3812499046325684, "step": 4240 }, { "epoch": 1.6001506024096386, "grad_norm": 51.79765282676417, "learning_rate": 6.000564759036144e-07, "logits/chosen": -2.3843750953674316, "logits/rejected": -2.4898438453674316, "logps/chosen": -344.45001220703125, "logps/rejected": -404.1000061035156, "loss": 0.2108, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.735913097858429, "rewards/margins": 2.9996094703674316, "rewards/rejected": -3.733593702316284, "step": 4250 }, { "epoch": 1.6039156626506024, "grad_norm": 36.04219671466811, "learning_rate": 5.991152108433735e-07, "logits/chosen": -2.420703172683716, "logits/rejected": -2.5218749046325684, "logps/chosen": -407.17498779296875, "logps/rejected": -442.79998779296875, "loss": 0.1832, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.4357666075229645, "rewards/margins": 2.950390577316284, "rewards/rejected": -3.3871092796325684, "step": 4260 }, { "epoch": 1.6076807228915664, "grad_norm": 52.21706288878413, "learning_rate": 5.981739457831325e-07, "logits/chosen": -2.508984327316284, "logits/rejected": -2.48046875, "logps/chosen": -359.79998779296875, "logps/rejected": -433.54998779296875, "loss": 0.1828, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2704101502895355, "rewards/margins": 2.9691405296325684, "rewards/rejected": -3.2406249046325684, "step": 4270 }, { "epoch": 1.6114457831325302, "grad_norm": 40.30717298202087, "learning_rate": 5.972326807228915e-07, "logits/chosen": -2.407031297683716, "logits/rejected": -2.4625000953674316, "logps/chosen": -375.75, "logps/rejected": -388.54998779296875, "loss": 0.1963, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05596771091222763, "rewards/margins": 2.7738280296325684, "rewards/rejected": -2.7183594703674316, "step": 4280 }, { "epoch": 1.615210843373494, "grad_norm": 62.99975168362503, "learning_rate": 5.962914156626506e-07, "logits/chosen": -2.452343702316284, "logits/rejected": -2.43359375, "logps/chosen": -330.8999938964844, "logps/rejected": -373.1499938964844, "loss": 0.1958, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.30415040254592896, "rewards/margins": 2.917187452316284, "rewards/rejected": -2.61328125, "step": 4290 }, { "epoch": 1.6189759036144578, "grad_norm": 57.07165238985806, "learning_rate": 5.953501506024096e-07, "logits/chosen": -2.485156297683716, "logits/rejected": -2.4609375, "logps/chosen": -321.25, "logps/rejected": -368.3999938964844, "loss": 0.2001, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12019042670726776, "rewards/margins": 2.895703077316284, "rewards/rejected": -2.7750000953674316, "step": 4300 }, { "epoch": 1.6227409638554215, "grad_norm": 67.73205527720296, "learning_rate": 5.944088855421687e-07, "logits/chosen": -2.3980469703674316, "logits/rejected": -2.357421875, "logps/chosen": -327.6000061035156, "logps/rejected": -376.25, "loss": 0.1894, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.16990967094898224, "rewards/margins": 3.073046922683716, "rewards/rejected": -2.903515577316284, "step": 4310 }, { "epoch": 1.6265060240963856, "grad_norm": 56.24520644044716, "learning_rate": 5.934676204819276e-07, "logits/chosen": -2.3941407203674316, "logits/rejected": -2.448437452316284, "logps/chosen": -336.29998779296875, "logps/rejected": -382.20001220703125, "loss": 0.1729, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.14312744140625, "rewards/margins": 2.958984375, "rewards/rejected": -2.814453125, "step": 4320 }, { "epoch": 1.6302710843373494, "grad_norm": 34.386206697754766, "learning_rate": 5.925263554216867e-07, "logits/chosen": -2.524609327316284, "logits/rejected": -2.40234375, "logps/chosen": -326.04998779296875, "logps/rejected": -402.3999938964844, "loss": 0.2015, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.18476562201976776, "rewards/margins": 3.01953125, "rewards/rejected": -3.1988282203674316, "step": 4330 }, { "epoch": 1.6340361445783134, "grad_norm": 44.60158787692179, "learning_rate": 5.915850903614458e-07, "logits/chosen": -2.4632811546325684, "logits/rejected": -2.4292969703674316, "logps/chosen": -291.70001220703125, "logps/rejected": -368.6000061035156, "loss": 0.2011, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.07120361179113388, "rewards/margins": 2.796093702316284, "rewards/rejected": -2.7261719703674316, "step": 4340 }, { "epoch": 1.6378012048192772, "grad_norm": 50.797811812757296, "learning_rate": 5.906438253012049e-07, "logits/chosen": -2.4332032203674316, "logits/rejected": -2.4320311546325684, "logps/chosen": -341.8500061035156, "logps/rejected": -410.29998779296875, "loss": 0.1981, "rewards/accuracies": 0.90625, "rewards/chosen": -0.14274902641773224, "rewards/margins": 2.854296922683716, "rewards/rejected": -2.9957032203674316, "step": 4350 }, { "epoch": 1.641566265060241, "grad_norm": 47.26223297475457, "learning_rate": 5.897025602409638e-07, "logits/chosen": -2.424999952316284, "logits/rejected": -2.385546922683716, "logps/chosen": -356.95001220703125, "logps/rejected": -427.0, "loss": 0.1622, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.2119140625, "rewards/margins": 3.1070313453674316, "rewards/rejected": -2.89453125, "step": 4360 }, { "epoch": 1.6453313253012047, "grad_norm": 40.57216035515869, "learning_rate": 5.887612951807228e-07, "logits/chosen": -2.4117188453674316, "logits/rejected": -2.46875, "logps/chosen": -343.0, "logps/rejected": -407.04998779296875, "loss": 0.1332, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.13793334364891052, "rewards/margins": 3.3023438453674316, "rewards/rejected": -3.164843797683716, "step": 4370 }, { "epoch": 1.6490963855421685, "grad_norm": 27.442256882316865, "learning_rate": 5.878200301204819e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.4507813453674316, "logps/chosen": -304.54998779296875, "logps/rejected": -361.6499938964844, "loss": 0.2133, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.03057250939309597, "rewards/margins": 2.655078172683716, "rewards/rejected": -2.6253905296325684, "step": 4380 }, { "epoch": 1.6528614457831325, "grad_norm": 45.443803798511034, "learning_rate": 5.868787650602409e-07, "logits/chosen": -2.4789061546325684, "logits/rejected": -2.356250047683716, "logps/chosen": -354.29998779296875, "logps/rejected": -411.0, "loss": 0.1733, "rewards/accuracies": 0.9375, "rewards/chosen": -0.13894042372703552, "rewards/margins": 3.3335938453674316, "rewards/rejected": -3.471874952316284, "step": 4390 }, { "epoch": 1.6566265060240963, "grad_norm": 47.38771633205144, "learning_rate": 5.859375e-07, "logits/chosen": -2.4886717796325684, "logits/rejected": -2.4609375, "logps/chosen": -318.0, "logps/rejected": -357.1000061035156, "loss": 0.2176, "rewards/accuracies": 0.90625, "rewards/chosen": 0.13688965141773224, "rewards/margins": 2.8812499046325684, "rewards/rejected": -2.7457032203674316, "step": 4400 }, { "epoch": 1.6603915662650603, "grad_norm": 57.32135182541953, "learning_rate": 5.84996234939759e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.4664063453674316, "logps/chosen": -302.1499938964844, "logps/rejected": -390.5, "loss": 0.2043, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.02048644982278347, "rewards/margins": 2.969531297683716, "rewards/rejected": -2.990234375, "step": 4410 }, { "epoch": 1.6641566265060241, "grad_norm": 92.70950676874456, "learning_rate": 5.840549698795181e-07, "logits/chosen": -2.4742188453674316, "logits/rejected": -2.41796875, "logps/chosen": -326.45001220703125, "logps/rejected": -368.6499938964844, "loss": 0.2249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14278563857078552, "rewards/margins": 2.7582030296325684, "rewards/rejected": -2.617968797683716, "step": 4420 }, { "epoch": 1.667921686746988, "grad_norm": 24.586879630279654, "learning_rate": 5.831137048192771e-07, "logits/chosen": -2.414843797683716, "logits/rejected": -2.479687452316284, "logps/chosen": -355.1000061035156, "logps/rejected": -435.70001220703125, "loss": 0.1965, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.01726074144244194, "rewards/margins": 3.13671875, "rewards/rejected": -3.1566405296325684, "step": 4430 }, { "epoch": 1.6716867469879517, "grad_norm": 49.075231823596894, "learning_rate": 5.821724397590361e-07, "logits/chosen": -2.4535155296325684, "logits/rejected": -2.453125, "logps/chosen": -265.20001220703125, "logps/rejected": -288.3500061035156, "loss": 0.2197, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.30602723360061646, "rewards/margins": 2.628124952316284, "rewards/rejected": -2.3226561546325684, "step": 4440 }, { "epoch": 1.6754518072289155, "grad_norm": 50.25572613667356, "learning_rate": 5.812311746987951e-07, "logits/chosen": -2.4898438453674316, "logits/rejected": -2.405078172683716, "logps/chosen": -360.3500061035156, "logps/rejected": -424.8999938964844, "loss": 0.191, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05708007887005806, "rewards/margins": 3.1429686546325684, "rewards/rejected": -3.0843749046325684, "step": 4450 }, { "epoch": 1.6792168674698795, "grad_norm": 26.375746138533923, "learning_rate": 5.802899096385542e-07, "logits/chosen": -2.3304686546325684, "logits/rejected": -2.372265577316284, "logps/chosen": -367.8500061035156, "logps/rejected": -425.0, "loss": 0.1767, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.04754028469324112, "rewards/margins": 3.0648436546325684, "rewards/rejected": -3.014843702316284, "step": 4460 }, { "epoch": 1.6829819277108435, "grad_norm": 70.30693599774249, "learning_rate": 5.793486445783132e-07, "logits/chosen": -2.348437547683716, "logits/rejected": -2.373046875, "logps/chosen": -316.45001220703125, "logps/rejected": -367.6499938964844, "loss": 0.2299, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15102538466453552, "rewards/margins": 2.618945360183716, "rewards/rejected": -2.768749952316284, "step": 4470 }, { "epoch": 1.6867469879518073, "grad_norm": 46.193534183309204, "learning_rate": 5.784073795180723e-07, "logits/chosen": -2.360156297683716, "logits/rejected": -2.2679686546325684, "logps/chosen": -344.6499938964844, "logps/rejected": -423.8500061035156, "loss": 0.1922, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02650756761431694, "rewards/margins": 3.1753907203674316, "rewards/rejected": -3.149609327316284, "step": 4480 }, { "epoch": 1.6905120481927711, "grad_norm": 34.3335097690496, "learning_rate": 5.774661144578313e-07, "logits/chosen": -2.38671875, "logits/rejected": -2.317187547683716, "logps/chosen": -304.29998779296875, "logps/rejected": -389.6000061035156, "loss": 0.1666, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.08607177436351776, "rewards/margins": 3.0054688453674316, "rewards/rejected": -3.0902342796325684, "step": 4490 }, { "epoch": 1.694277108433735, "grad_norm": 45.24685159935828, "learning_rate": 5.765248493975904e-07, "logits/chosen": -2.41796875, "logits/rejected": -2.387500047683716, "logps/chosen": -288.29998779296875, "logps/rejected": -380.8500061035156, "loss": 0.1695, "rewards/accuracies": 0.9375, "rewards/chosen": 0.17617186903953552, "rewards/margins": 3.0882811546325684, "rewards/rejected": -2.9117188453674316, "step": 4500 }, { "epoch": 1.6980421686746987, "grad_norm": 65.7702770927805, "learning_rate": 5.755835843373493e-07, "logits/chosen": -2.399218797683716, "logits/rejected": -2.407031297683716, "logps/chosen": -347.75, "logps/rejected": -403.6000061035156, "loss": 0.1903, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.005175781436264515, "rewards/margins": 3.039843797683716, "rewards/rejected": -3.0445313453674316, "step": 4510 }, { "epoch": 1.7018072289156625, "grad_norm": 32.71162943304979, "learning_rate": 5.746423192771084e-07, "logits/chosen": -2.492968797683716, "logits/rejected": -2.4671874046325684, "logps/chosen": -350.20001220703125, "logps/rejected": -427.3500061035156, "loss": 0.1854, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.20025634765625, "rewards/margins": 3.2730469703674316, "rewards/rejected": -3.4730467796325684, "step": 4520 }, { "epoch": 1.7055722891566265, "grad_norm": 64.05484730525515, "learning_rate": 5.737010542168675e-07, "logits/chosen": -2.4156250953674316, "logits/rejected": -2.3121094703674316, "logps/chosen": -327.20001220703125, "logps/rejected": -394.8500061035156, "loss": 0.1736, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.15470580756664276, "rewards/margins": 2.9476561546325684, "rewards/rejected": -3.100781202316284, "step": 4530 }, { "epoch": 1.7093373493975905, "grad_norm": 37.34596440365226, "learning_rate": 5.727597891566265e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.532031297683716, "logps/chosen": -393.1000061035156, "logps/rejected": -422.79998779296875, "loss": 0.1443, "rewards/accuracies": 0.96875, "rewards/chosen": -0.22708435356616974, "rewards/margins": 3.061718702316284, "rewards/rejected": -3.287109375, "step": 4540 }, { "epoch": 1.7131024096385543, "grad_norm": 49.5686946059949, "learning_rate": 5.718185240963855e-07, "logits/chosen": -2.492968797683716, "logits/rejected": -2.487109422683716, "logps/chosen": -309.70001220703125, "logps/rejected": -348.5, "loss": 0.2212, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17326660454273224, "rewards/margins": 2.7574219703674316, "rewards/rejected": -2.9296875, "step": 4550 }, { "epoch": 1.716867469879518, "grad_norm": 39.900105305283056, "learning_rate": 5.708772590361446e-07, "logits/chosen": -2.4535155296325684, "logits/rejected": -2.4371094703674316, "logps/chosen": -352.8500061035156, "logps/rejected": -402.75, "loss": 0.1594, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.20412597060203552, "rewards/margins": 3.024609327316284, "rewards/rejected": -3.228515625, "step": 4560 }, { "epoch": 1.720632530120482, "grad_norm": 62.40235785805452, "learning_rate": 5.699359939759037e-07, "logits/chosen": -2.442187547683716, "logits/rejected": -2.5062499046325684, "logps/chosen": -292.82501220703125, "logps/rejected": -362.20001220703125, "loss": 0.1776, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.225311279296875, "rewards/margins": 3.0121092796325684, "rewards/rejected": -3.236328125, "step": 4570 }, { "epoch": 1.7243975903614457, "grad_norm": 82.33348955156158, "learning_rate": 5.689947289156625e-07, "logits/chosen": -2.477343797683716, "logits/rejected": -2.34765625, "logps/chosen": -329.04998779296875, "logps/rejected": -407.75, "loss": 0.1936, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.22197265923023224, "rewards/margins": 3.0003905296325684, "rewards/rejected": -3.220703125, "step": 4580 }, { "epoch": 1.7281626506024095, "grad_norm": 30.10161786926399, "learning_rate": 5.680534638554216e-07, "logits/chosen": -2.40625, "logits/rejected": -2.4593749046325684, "logps/chosen": -362.79998779296875, "logps/rejected": -364.3999938964844, "loss": 0.1409, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.0435791015625, "rewards/margins": 3.044921875, "rewards/rejected": -3.087890625, "step": 4590 }, { "epoch": 1.7319277108433735, "grad_norm": 49.19045600877121, "learning_rate": 5.671121987951807e-07, "logits/chosen": -2.491406202316284, "logits/rejected": -2.446093797683716, "logps/chosen": -393.3500061035156, "logps/rejected": -434.6499938964844, "loss": 0.1705, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.11672363430261612, "rewards/margins": 3.21875, "rewards/rejected": -3.3359375, "step": 4600 }, { "epoch": 1.7356927710843375, "grad_norm": 43.795557919076415, "learning_rate": 5.661709337349398e-07, "logits/chosen": -2.467968702316284, "logits/rejected": -2.36328125, "logps/chosen": -268.32501220703125, "logps/rejected": -355.6499938964844, "loss": 0.2141, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.17420653998851776, "rewards/margins": 2.662109375, "rewards/rejected": -2.839062452316284, "step": 4610 }, { "epoch": 1.7394578313253013, "grad_norm": 36.83427781982414, "learning_rate": 5.652296686746987e-07, "logits/chosen": -2.537109375, "logits/rejected": -2.43359375, "logps/chosen": -352.25, "logps/rejected": -389.79998779296875, "loss": 0.1758, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.23032227158546448, "rewards/margins": 3.1527342796325684, "rewards/rejected": -2.9222655296325684, "step": 4620 }, { "epoch": 1.743222891566265, "grad_norm": 45.19075038721786, "learning_rate": 5.642884036144578e-07, "logits/chosen": -2.4632811546325684, "logits/rejected": -2.438281297683716, "logps/chosen": -340.1499938964844, "logps/rejected": -397.95001220703125, "loss": 0.1894, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20114746689796448, "rewards/margins": 3.231640577316284, "rewards/rejected": -3.031054735183716, "step": 4630 }, { "epoch": 1.7469879518072289, "grad_norm": 45.28105676558326, "learning_rate": 5.633471385542169e-07, "logits/chosen": -2.4320311546325684, "logits/rejected": -2.430468797683716, "logps/chosen": -366.75, "logps/rejected": -429.0, "loss": 0.1748, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.18436583876609802, "rewards/margins": 3.4828124046325684, "rewards/rejected": -3.3023438453674316, "step": 4640 }, { "epoch": 1.7507530120481927, "grad_norm": 26.558303125873188, "learning_rate": 5.624058734939759e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.453906297683716, "logps/chosen": -365.3500061035156, "logps/rejected": -402.25, "loss": 0.1397, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19379882514476776, "rewards/margins": 3.1273436546325684, "rewards/rejected": -2.934765577316284, "step": 4650 }, { "epoch": 1.7545180722891565, "grad_norm": 61.68849968143313, "learning_rate": 5.614646084337349e-07, "logits/chosen": -2.494140625, "logits/rejected": -2.499218702316284, "logps/chosen": -328.5, "logps/rejected": -368.3500061035156, "loss": 0.2302, "rewards/accuracies": 0.90625, "rewards/chosen": 0.24674072861671448, "rewards/margins": 2.78515625, "rewards/rejected": -2.535937547683716, "step": 4660 }, { "epoch": 1.7582831325301205, "grad_norm": 65.15212413508738, "learning_rate": 5.605233433734939e-07, "logits/chosen": -2.5179686546325684, "logits/rejected": -2.48046875, "logps/chosen": -321.8500061035156, "logps/rejected": -401.79998779296875, "loss": 0.1693, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.25677490234375, "rewards/margins": 3.0609374046325684, "rewards/rejected": -2.8031249046325684, "step": 4670 }, { "epoch": 1.7620481927710845, "grad_norm": 47.28370788347945, "learning_rate": 5.59582078313253e-07, "logits/chosen": -2.499218702316284, "logits/rejected": -2.516406297683716, "logps/chosen": -316.79998779296875, "logps/rejected": -372.95001220703125, "loss": 0.1939, "rewards/accuracies": 0.90625, "rewards/chosen": 0.03486328199505806, "rewards/margins": 2.9765625, "rewards/rejected": -2.940624952316284, "step": 4680 }, { "epoch": 1.7658132530120483, "grad_norm": 41.056801417266236, "learning_rate": 5.586408132530121e-07, "logits/chosen": -2.4476561546325684, "logits/rejected": -2.516406297683716, "logps/chosen": -319.79998779296875, "logps/rejected": -364.5, "loss": 0.18, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3340210020542145, "rewards/margins": 2.984375, "rewards/rejected": -2.6478514671325684, "step": 4690 }, { "epoch": 1.769578313253012, "grad_norm": 56.79897762550142, "learning_rate": 5.576995481927711e-07, "logits/chosen": -2.6484375, "logits/rejected": -2.575000047683716, "logps/chosen": -344.6499938964844, "logps/rejected": -398.1000061035156, "loss": 0.1884, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.37385255098342896, "rewards/margins": 3.029296875, "rewards/rejected": -2.6527342796325684, "step": 4700 }, { "epoch": 1.7733433734939759, "grad_norm": 55.6754488696005, "learning_rate": 5.567582831325301e-07, "logits/chosen": -2.4800782203674316, "logits/rejected": -2.4507813453674316, "logps/chosen": -310.82501220703125, "logps/rejected": -370.5, "loss": 0.204, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.24895019829273224, "rewards/margins": 3.034374952316284, "rewards/rejected": -2.78515625, "step": 4710 }, { "epoch": 1.7771084337349397, "grad_norm": 50.85958067220109, "learning_rate": 5.558170180722891e-07, "logits/chosen": -2.573437452316284, "logits/rejected": -2.5875000953674316, "logps/chosen": -387.79998779296875, "logps/rejected": -416.79998779296875, "loss": 0.1954, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.12893065810203552, "rewards/margins": 2.8203125, "rewards/rejected": -2.692578077316284, "step": 4720 }, { "epoch": 1.7808734939759037, "grad_norm": 51.81605701394221, "learning_rate": 5.548757530120481e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.47265625, "logps/chosen": -361.0, "logps/rejected": -417.29998779296875, "loss": 0.1911, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.05156249925494194, "rewards/margins": 2.7562499046325684, "rewards/rejected": -2.705078125, "step": 4730 }, { "epoch": 1.7846385542168675, "grad_norm": 33.859531089421395, "learning_rate": 5.539344879518072e-07, "logits/chosen": -2.4945311546325684, "logits/rejected": -2.4937500953674316, "logps/chosen": -327.67498779296875, "logps/rejected": -386.20001220703125, "loss": 0.1944, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.10517577826976776, "rewards/margins": 2.905468702316284, "rewards/rejected": -3.0087890625, "step": 4740 }, { "epoch": 1.7884036144578315, "grad_norm": 21.77673092908246, "learning_rate": 5.529932228915663e-07, "logits/chosen": -2.4789061546325684, "logits/rejected": -2.44921875, "logps/chosen": -361.1000061035156, "logps/rejected": -419.5, "loss": 0.1426, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13990478217601776, "rewards/margins": 2.975781202316284, "rewards/rejected": -3.119140625, "step": 4750 }, { "epoch": 1.7921686746987953, "grad_norm": 66.82395390940846, "learning_rate": 5.520519578313253e-07, "logits/chosen": -2.507031202316284, "logits/rejected": -2.4878907203674316, "logps/chosen": -340.45001220703125, "logps/rejected": -413.20001220703125, "loss": 0.2031, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.3250976502895355, "rewards/margins": 2.8753905296325684, "rewards/rejected": -3.2035155296325684, "step": 4760 }, { "epoch": 1.795933734939759, "grad_norm": 63.35274739961596, "learning_rate": 5.511106927710843e-07, "logits/chosen": -2.594531297683716, "logits/rejected": -2.56640625, "logps/chosen": -364.75, "logps/rejected": -411.3500061035156, "loss": 0.2124, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5310424566268921, "rewards/margins": 3.021484375, "rewards/rejected": -3.55078125, "step": 4770 }, { "epoch": 1.7996987951807228, "grad_norm": 121.96287632813242, "learning_rate": 5.501694277108434e-07, "logits/chosen": -2.453906297683716, "logits/rejected": -2.469921827316284, "logps/chosen": -345.6000061035156, "logps/rejected": -409.8999938964844, "loss": 0.1789, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5106140375137329, "rewards/margins": 3.0269532203674316, "rewards/rejected": -3.53515625, "step": 4780 }, { "epoch": 1.8034638554216866, "grad_norm": 44.55877696479236, "learning_rate": 5.492281626506024e-07, "logits/chosen": -2.4742188453674316, "logits/rejected": -2.465625047683716, "logps/chosen": -295.0249938964844, "logps/rejected": -371.3500061035156, "loss": 0.1789, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.372650146484375, "rewards/margins": 3.1175780296325684, "rewards/rejected": -3.4859375953674316, "step": 4790 }, { "epoch": 1.8072289156626506, "grad_norm": 50.5455039237177, "learning_rate": 5.482868975903614e-07, "logits/chosen": -2.58203125, "logits/rejected": -2.516796827316284, "logps/chosen": -371.25, "logps/rejected": -395.75, "loss": 0.1778, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3290039002895355, "rewards/margins": 3.0152344703674316, "rewards/rejected": -3.3421874046325684, "step": 4800 }, { "epoch": 1.8109939759036144, "grad_norm": 63.854057339042775, "learning_rate": 5.473456325301204e-07, "logits/chosen": -2.4664063453674316, "logits/rejected": -2.516406297683716, "logps/chosen": -360.8999938964844, "logps/rejected": -397.1000061035156, "loss": 0.176, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.35120850801467896, "rewards/margins": 3.0152344703674316, "rewards/rejected": -3.367968797683716, "step": 4810 }, { "epoch": 1.8147590361445785, "grad_norm": 39.61373973331794, "learning_rate": 5.464043674698795e-07, "logits/chosen": -2.500781297683716, "logits/rejected": -2.549999952316284, "logps/chosen": -312.3500061035156, "logps/rejected": -386.1499938964844, "loss": 0.1602, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.13795776665210724, "rewards/margins": 2.995312452316284, "rewards/rejected": -3.1343750953674316, "step": 4820 }, { "epoch": 1.8185240963855422, "grad_norm": 40.74201477040741, "learning_rate": 5.454631024096386e-07, "logits/chosen": -2.5492186546325684, "logits/rejected": -2.477343797683716, "logps/chosen": -350.875, "logps/rejected": -416.70001220703125, "loss": 0.1929, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.011828613467514515, "rewards/margins": 3.0082030296325684, "rewards/rejected": -3.01953125, "step": 4830 }, { "epoch": 1.822289156626506, "grad_norm": 44.576194117039165, "learning_rate": 5.445218373493977e-07, "logits/chosen": -2.4351563453674316, "logits/rejected": -2.4765625, "logps/chosen": -338.79998779296875, "logps/rejected": -382.1000061035156, "loss": 0.2422, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.17506103217601776, "rewards/margins": 3.1231446266174316, "rewards/rejected": -2.9505858421325684, "step": 4840 }, { "epoch": 1.8260542168674698, "grad_norm": 25.447763872248306, "learning_rate": 5.435805722891565e-07, "logits/chosen": -2.5074219703674316, "logits/rejected": -2.448437452316284, "logps/chosen": -333.70001220703125, "logps/rejected": -422.79998779296875, "loss": 0.1459, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.23747558891773224, "rewards/margins": 3.239062547683716, "rewards/rejected": -3.001953125, "step": 4850 }, { "epoch": 1.8298192771084336, "grad_norm": 89.09748042436883, "learning_rate": 5.426393072289156e-07, "logits/chosen": -2.418750047683716, "logits/rejected": -2.4984374046325684, "logps/chosen": -335.75, "logps/rejected": -355.04998779296875, "loss": 0.2252, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.12136230617761612, "rewards/margins": 3.005664110183716, "rewards/rejected": -2.882031202316284, "step": 4860 }, { "epoch": 1.8335843373493976, "grad_norm": 26.838983822495546, "learning_rate": 5.416980421686747e-07, "logits/chosen": -2.4371094703674316, "logits/rejected": -2.364453077316284, "logps/chosen": -349.42498779296875, "logps/rejected": -428.79998779296875, "loss": 0.1733, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.1005859375, "rewards/margins": 3.173046827316284, "rewards/rejected": -3.2738280296325684, "step": 4870 }, { "epoch": 1.8373493975903614, "grad_norm": 44.72846589802191, "learning_rate": 5.407567771084337e-07, "logits/chosen": -2.423046827316284, "logits/rejected": -2.3636717796325684, "logps/chosen": -364.0, "logps/rejected": -439.1000061035156, "loss": 0.1867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.08604278415441513, "rewards/margins": 3.149218797683716, "rewards/rejected": -3.0660157203674316, "step": 4880 }, { "epoch": 1.8411144578313254, "grad_norm": 47.12597999034897, "learning_rate": 5.398155120481927e-07, "logits/chosen": -2.427734375, "logits/rejected": -2.4019532203674316, "logps/chosen": -332.1000061035156, "logps/rejected": -445.20001220703125, "loss": 0.1992, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.06169433519244194, "rewards/margins": 3.3031249046325684, "rewards/rejected": -3.3648438453674316, "step": 4890 }, { "epoch": 1.8448795180722892, "grad_norm": 46.8014269772038, "learning_rate": 5.388742469879518e-07, "logits/chosen": -2.39453125, "logits/rejected": -2.3687500953674316, "logps/chosen": -300.95001220703125, "logps/rejected": -364.3999938964844, "loss": 0.2262, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.10998992621898651, "rewards/margins": 2.496875047683716, "rewards/rejected": -2.3890624046325684, "step": 4900 }, { "epoch": 1.848644578313253, "grad_norm": 33.57172683074936, "learning_rate": 5.379329819277109e-07, "logits/chosen": -2.453125, "logits/rejected": -2.353515625, "logps/chosen": -328.54998779296875, "logps/rejected": -402.8999938964844, "loss": 0.173, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.02231445349752903, "rewards/margins": 3.00390625, "rewards/rejected": -2.9828124046325684, "step": 4910 }, { "epoch": 1.8524096385542168, "grad_norm": 56.32506541211211, "learning_rate": 5.369917168674698e-07, "logits/chosen": -2.385937452316284, "logits/rejected": -2.3570313453674316, "logps/chosen": -303.29998779296875, "logps/rejected": -348.95001220703125, "loss": 0.1878, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3298583924770355, "rewards/margins": 2.71875, "rewards/rejected": -2.3902344703674316, "step": 4920 }, { "epoch": 1.8561746987951806, "grad_norm": 35.33804719131636, "learning_rate": 5.360504518072288e-07, "logits/chosen": -2.395312547683716, "logits/rejected": -2.405468702316284, "logps/chosen": -323.70001220703125, "logps/rejected": -371.70001220703125, "loss": 0.183, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.26909178495407104, "rewards/margins": 3.087890625, "rewards/rejected": -2.819140672683716, "step": 4930 }, { "epoch": 1.8599397590361446, "grad_norm": 26.381625063978763, "learning_rate": 5.351091867469879e-07, "logits/chosen": -2.4585938453674316, "logits/rejected": -2.4144530296325684, "logps/chosen": -325.29998779296875, "logps/rejected": -418.45001220703125, "loss": 0.1964, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.06401367485523224, "rewards/margins": 3.1875, "rewards/rejected": -3.123828172683716, "step": 4940 }, { "epoch": 1.8637048192771084, "grad_norm": 56.31142320051511, "learning_rate": 5.34167921686747e-07, "logits/chosen": -2.495312452316284, "logits/rejected": -2.4859375953674316, "logps/chosen": -369.20001220703125, "logps/rejected": -416.1499938964844, "loss": 0.1864, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2508789002895355, "rewards/margins": 3.098828077316284, "rewards/rejected": -2.8492188453674316, "step": 4950 }, { "epoch": 1.8674698795180724, "grad_norm": 57.8495296911186, "learning_rate": 5.33226656626506e-07, "logits/chosen": -2.3910155296325684, "logits/rejected": -2.4476561546325684, "logps/chosen": -331.875, "logps/rejected": -392.29998779296875, "loss": 0.1698, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.288055419921875, "rewards/margins": 3.118359327316284, "rewards/rejected": -2.832812547683716, "step": 4960 }, { "epoch": 1.8712349397590362, "grad_norm": 29.016852082819177, "learning_rate": 5.32285391566265e-07, "logits/chosen": -2.3746094703674316, "logits/rejected": -2.3140625953674316, "logps/chosen": -329.6499938964844, "logps/rejected": -427.1000061035156, "loss": 0.2041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.3080993592739105, "rewards/margins": 3.3046875, "rewards/rejected": -2.9937500953674316, "step": 4970 }, { "epoch": 1.875, "grad_norm": 25.103047910748536, "learning_rate": 5.313441265060241e-07, "logits/chosen": -2.3984375, "logits/rejected": -2.387500047683716, "logps/chosen": -320.79998779296875, "logps/rejected": -389.04998779296875, "loss": 0.1636, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.290365606546402, "rewards/margins": 3.177734375, "rewards/rejected": -2.887500047683716, "step": 4980 }, { "epoch": 1.8787650602409638, "grad_norm": 38.04867756514407, "learning_rate": 5.304028614457831e-07, "logits/chosen": -2.384765625, "logits/rejected": -2.4859375953674316, "logps/chosen": -374.20001220703125, "logps/rejected": -420.20001220703125, "loss": 0.1807, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.4644531309604645, "rewards/margins": 3.3492188453674316, "rewards/rejected": -2.8863282203674316, "step": 4990 }, { "epoch": 1.8825301204819276, "grad_norm": 56.68296698849738, "learning_rate": 5.294615963855421e-07, "logits/chosen": -2.4652342796325684, "logits/rejected": -2.4214844703674316, "logps/chosen": -336.70001220703125, "logps/rejected": -430.95001220703125, "loss": 0.1783, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4267822206020355, "rewards/margins": 3.211718797683716, "rewards/rejected": -2.7835936546325684, "step": 5000 }, { "epoch": 1.8862951807228916, "grad_norm": 31.596349288312616, "learning_rate": 5.285203313253012e-07, "logits/chosen": -2.479687452316284, "logits/rejected": -2.5101561546325684, "logps/chosen": -357.8500061035156, "logps/rejected": -393.79998779296875, "loss": 0.1544, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.52001953125, "rewards/margins": 3.112499952316284, "rewards/rejected": -2.5918946266174316, "step": 5010 }, { "epoch": 1.8900602409638554, "grad_norm": 69.89043004858283, "learning_rate": 5.275790662650602e-07, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.436328172683716, "logps/chosen": -309.20001220703125, "logps/rejected": -442.25, "loss": 0.1345, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3048461973667145, "rewards/margins": 3.737109422683716, "rewards/rejected": -3.4300780296325684, "step": 5020 }, { "epoch": 1.8938253012048194, "grad_norm": 120.37743350027183, "learning_rate": 5.266378012048193e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.51171875, "logps/chosen": -275.25, "logps/rejected": -385.04998779296875, "loss": 0.2344, "rewards/accuracies": 0.90625, "rewards/chosen": 0.12658080458641052, "rewards/margins": 3.1832032203674316, "rewards/rejected": -3.0589842796325684, "step": 5030 }, { "epoch": 1.8975903614457832, "grad_norm": 56.36506002984404, "learning_rate": 5.256965361445783e-07, "logits/chosen": -2.5609374046325684, "logits/rejected": -2.5374999046325684, "logps/chosen": -294.3500061035156, "logps/rejected": -350.95001220703125, "loss": 0.188, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18436889350414276, "rewards/margins": 2.983593702316284, "rewards/rejected": -2.798828125, "step": 5040 }, { "epoch": 1.901355421686747, "grad_norm": 30.639314106943456, "learning_rate": 5.247552710843374e-07, "logits/chosen": -2.6351561546325684, "logits/rejected": -2.5992188453674316, "logps/chosen": -328.75, "logps/rejected": -396.0, "loss": 0.1742, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.26044923067092896, "rewards/margins": 2.9449219703674316, "rewards/rejected": -2.6839842796325684, "step": 5050 }, { "epoch": 1.9051204819277108, "grad_norm": 71.21635766158133, "learning_rate": 5.238140060240963e-07, "logits/chosen": -2.598828077316284, "logits/rejected": -2.5289063453674316, "logps/chosen": -360.54998779296875, "logps/rejected": -421.5, "loss": 0.2073, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.10490722954273224, "rewards/margins": 3.41015625, "rewards/rejected": -3.3046875, "step": 5060 }, { "epoch": 1.9088855421686746, "grad_norm": 42.67463617082377, "learning_rate": 5.228727409638553e-07, "logits/chosen": -2.532031297683716, "logits/rejected": -2.526171922683716, "logps/chosen": -321.8999938964844, "logps/rejected": -387.5, "loss": 0.1962, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.25520020723342896, "rewards/margins": 3.1195311546325684, "rewards/rejected": -2.8628907203674316, "step": 5070 }, { "epoch": 1.9126506024096386, "grad_norm": 65.94077498315153, "learning_rate": 5.219314759036144e-07, "logits/chosen": -2.585156202316284, "logits/rejected": -2.557812452316284, "logps/chosen": -344.0, "logps/rejected": -405.95001220703125, "loss": 0.1696, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0067138671875, "rewards/margins": 3.095703125, "rewards/rejected": -3.088671922683716, "step": 5080 }, { "epoch": 1.9164156626506024, "grad_norm": 41.632055198935284, "learning_rate": 5.209902108433735e-07, "logits/chosen": -2.460156202316284, "logits/rejected": -2.5433592796325684, "logps/chosen": -369.70001220703125, "logps/rejected": -400.5, "loss": 0.1862, "rewards/accuracies": 0.9375, "rewards/chosen": 0.12111816555261612, "rewards/margins": 3.169140577316284, "rewards/rejected": -3.046875, "step": 5090 }, { "epoch": 1.9201807228915664, "grad_norm": 37.67632139903457, "learning_rate": 5.200489457831326e-07, "logits/chosen": -2.4859375953674316, "logits/rejected": -2.489062547683716, "logps/chosen": -356.54998779296875, "logps/rejected": -413.3500061035156, "loss": 0.1886, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.098358154296875, "rewards/margins": 2.9175782203674316, "rewards/rejected": -2.819531202316284, "step": 5100 }, { "epoch": 1.9239457831325302, "grad_norm": 46.11960117942316, "learning_rate": 5.191076807228915e-07, "logits/chosen": -2.5757813453674316, "logits/rejected": -2.5328125953674316, "logps/chosen": -338.0, "logps/rejected": -429.1499938964844, "loss": 0.1842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.20627442002296448, "rewards/margins": 3.030078172683716, "rewards/rejected": -2.8238282203674316, "step": 5110 }, { "epoch": 1.927710843373494, "grad_norm": 34.02955243603665, "learning_rate": 5.181664156626506e-07, "logits/chosen": -2.520703077316284, "logits/rejected": -2.430468797683716, "logps/chosen": -285.45001220703125, "logps/rejected": -371.79998779296875, "loss": 0.2213, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.00032806396484375, "rewards/margins": 2.915234327316284, "rewards/rejected": -2.918750047683716, "step": 5120 }, { "epoch": 1.9314759036144578, "grad_norm": 37.56470740859429, "learning_rate": 5.172251506024096e-07, "logits/chosen": -2.4375, "logits/rejected": -2.391796827316284, "logps/chosen": -313.54998779296875, "logps/rejected": -376.0, "loss": 0.1682, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3154296875, "rewards/margins": 2.9566407203674316, "rewards/rejected": -2.6421875953674316, "step": 5130 }, { "epoch": 1.9352409638554215, "grad_norm": 36.05950315509966, "learning_rate": 5.162838855421687e-07, "logits/chosen": -2.438281297683716, "logits/rejected": -2.48046875, "logps/chosen": -320.67498779296875, "logps/rejected": -377.0, "loss": 0.1831, "rewards/accuracies": 0.9375, "rewards/chosen": 0.015063476748764515, "rewards/margins": 3.091015577316284, "rewards/rejected": -3.076171875, "step": 5140 }, { "epoch": 1.9390060240963856, "grad_norm": 32.62725101178559, "learning_rate": 5.153426204819276e-07, "logits/chosen": -2.462890625, "logits/rejected": -2.3968749046325684, "logps/chosen": -314.54998779296875, "logps/rejected": -396.5, "loss": 0.1497, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.14080810546875, "rewards/margins": 3.266796827316284, "rewards/rejected": -3.125781297683716, "step": 5150 }, { "epoch": 1.9427710843373494, "grad_norm": 43.72606626677966, "learning_rate": 5.144013554216867e-07, "logits/chosen": -2.473437547683716, "logits/rejected": -2.5023436546325684, "logps/chosen": -344.1000061035156, "logps/rejected": -399.6000061035156, "loss": 0.1679, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.2805419862270355, "rewards/margins": 3.392578125, "rewards/rejected": -3.11328125, "step": 5160 }, { "epoch": 1.9465361445783134, "grad_norm": 50.842803961249, "learning_rate": 5.134600903614458e-07, "logits/chosen": -2.35546875, "logits/rejected": -2.3277344703674316, "logps/chosen": -326.29998779296875, "logps/rejected": -398.6000061035156, "loss": 0.2077, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.159912109375, "rewards/margins": 2.882031202316284, "rewards/rejected": -3.043750047683716, "step": 5170 }, { "epoch": 1.9503012048192772, "grad_norm": 69.99625806347039, "learning_rate": 5.125188253012049e-07, "logits/chosen": -2.399218797683716, "logits/rejected": -2.443359375, "logps/chosen": -293.29998779296875, "logps/rejected": -395.5, "loss": 0.2267, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.02254028245806694, "rewards/margins": 3.090625047683716, "rewards/rejected": -3.11328125, "step": 5180 }, { "epoch": 1.954066265060241, "grad_norm": 71.57173919732644, "learning_rate": 5.115775602409638e-07, "logits/chosen": -2.4867186546325684, "logits/rejected": -2.4437499046325684, "logps/chosen": -323.0, "logps/rejected": -393.8999938964844, "loss": 0.2489, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.07631836086511612, "rewards/margins": 2.92578125, "rewards/rejected": -2.848828077316284, "step": 5190 }, { "epoch": 1.9578313253012047, "grad_norm": 47.99718836504726, "learning_rate": 5.106362951807228e-07, "logits/chosen": -2.342968702316284, "logits/rejected": -2.391406297683716, "logps/chosen": -339.0, "logps/rejected": -392.1499938964844, "loss": 0.1919, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3065551817417145, "rewards/margins": 3.0093750953674316, "rewards/rejected": -2.703125, "step": 5200 }, { "epoch": 1.9615963855421685, "grad_norm": 36.14511010918862, "learning_rate": 5.096950301204819e-07, "logits/chosen": -2.373046875, "logits/rejected": -2.4140625, "logps/chosen": -342.20001220703125, "logps/rejected": -391.29998779296875, "loss": 0.1742, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3498474061489105, "rewards/margins": 3.169921875, "rewards/rejected": -2.8203125, "step": 5210 }, { "epoch": 1.9653614457831325, "grad_norm": 38.09400695605831, "learning_rate": 5.087537650602409e-07, "logits/chosen": -2.4476561546325684, "logits/rejected": -2.372265577316284, "logps/chosen": -306.6000061035156, "logps/rejected": -375.6499938964844, "loss": 0.1924, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.48625487089157104, "rewards/margins": 2.946484327316284, "rewards/rejected": -2.4615235328674316, "step": 5220 }, { "epoch": 1.9691265060240963, "grad_norm": 52.27549101588853, "learning_rate": 5.078125e-07, "logits/chosen": -2.3753905296325684, "logits/rejected": -2.2933592796325684, "logps/chosen": -329.20001220703125, "logps/rejected": -386.70001220703125, "loss": 0.1947, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.30859375, "rewards/margins": 2.904296875, "rewards/rejected": -2.598437547683716, "step": 5230 }, { "epoch": 1.9728915662650603, "grad_norm": 50.52721012186175, "learning_rate": 5.06871234939759e-07, "logits/chosen": -2.43359375, "logits/rejected": -2.401562452316284, "logps/chosen": -399.20001220703125, "logps/rejected": -397.3999938964844, "loss": 0.1978, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.02889404259622097, "rewards/margins": 2.845703125, "rewards/rejected": -2.876953125, "step": 5240 }, { "epoch": 1.9766566265060241, "grad_norm": 38.376399782576215, "learning_rate": 5.059299698795181e-07, "logits/chosen": -2.340625047683716, "logits/rejected": -2.3238282203674316, "logps/chosen": -369.04998779296875, "logps/rejected": -385.54998779296875, "loss": 0.1409, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.08535156399011612, "rewards/margins": 3.382031202316284, "rewards/rejected": -3.298046827316284, "step": 5250 }, { "epoch": 1.980421686746988, "grad_norm": 79.9074742868225, "learning_rate": 5.049887048192771e-07, "logits/chosen": -2.436328172683716, "logits/rejected": -2.397265672683716, "logps/chosen": -338.45001220703125, "logps/rejected": -394.29998779296875, "loss": 0.1732, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03252563625574112, "rewards/margins": 3.362499952316284, "rewards/rejected": -3.330859422683716, "step": 5260 }, { "epoch": 1.9841867469879517, "grad_norm": 62.910505419465515, "learning_rate": 5.040474397590361e-07, "logits/chosen": -2.436328172683716, "logits/rejected": -2.428906202316284, "logps/chosen": -341.75, "logps/rejected": -413.8500061035156, "loss": 0.1816, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0016845703357830644, "rewards/margins": 2.9664063453674316, "rewards/rejected": -2.96484375, "step": 5270 }, { "epoch": 1.9879518072289155, "grad_norm": 42.70980275396678, "learning_rate": 5.031061746987951e-07, "logits/chosen": -2.600781202316284, "logits/rejected": -2.422656297683716, "logps/chosen": -340.375, "logps/rejected": -459.0, "loss": 0.1651, "rewards/accuracies": 0.96875, "rewards/chosen": -0.19156494736671448, "rewards/margins": 3.1917967796325684, "rewards/rejected": -3.381640672683716, "step": 5280 }, { "epoch": 1.9917168674698795, "grad_norm": 43.29635423888815, "learning_rate": 5.021649096385542e-07, "logits/chosen": -2.4820313453674316, "logits/rejected": -2.5257811546325684, "logps/chosen": -336.45001220703125, "logps/rejected": -357.79998779296875, "loss": 0.1889, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.13580933213233948, "rewards/margins": 2.942578077316284, "rewards/rejected": -3.079296827316284, "step": 5290 }, { "epoch": 1.9954819277108435, "grad_norm": 62.451710960527656, "learning_rate": 5.012236445783132e-07, "logits/chosen": -2.3960938453674316, "logits/rejected": -2.315234422683716, "logps/chosen": -357.3500061035156, "logps/rejected": -433.20001220703125, "loss": 0.1642, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.08603515475988388, "rewards/margins": 3.184765577316284, "rewards/rejected": -3.272656202316284, "step": 5300 }, { "epoch": 1.9992469879518073, "grad_norm": 80.36224667856804, "learning_rate": 5.002823795180723e-07, "logits/chosen": -2.5062499046325684, "logits/rejected": -2.535937547683716, "logps/chosen": -337.54998779296875, "logps/rejected": -378.3999938964844, "loss": 0.2066, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.205322265625, "rewards/margins": 3.010937452316284, "rewards/rejected": -3.2152342796325684, "step": 5310 }, { "epoch": 2.003012048192771, "grad_norm": 9.30740736116374, "learning_rate": 4.993411144578312e-07, "logits/chosen": -2.473437547683716, "logits/rejected": -2.4183592796325684, "logps/chosen": -316.57501220703125, "logps/rejected": -413.6499938964844, "loss": 0.1027, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.09913329780101776, "rewards/margins": 3.6976561546325684, "rewards/rejected": -3.796875, "step": 5320 }, { "epoch": 2.006777108433735, "grad_norm": 21.196787381052445, "learning_rate": 4.983998493975903e-07, "logits/chosen": -2.633593797683716, "logits/rejected": -2.518749952316284, "logps/chosen": -370.45001220703125, "logps/rejected": -430.95001220703125, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": 0.27135008573532104, "rewards/margins": 3.992968797683716, "rewards/rejected": -3.719531297683716, "step": 5330 }, { "epoch": 2.0105421686746987, "grad_norm": 22.14993954996983, "learning_rate": 4.974585843373494e-07, "logits/chosen": -2.486328125, "logits/rejected": -2.51171875, "logps/chosen": -366.3500061035156, "logps/rejected": -390.1499938964844, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 0.14609985053539276, "rewards/margins": 3.9937500953674316, "rewards/rejected": -3.8499999046325684, "step": 5340 }, { "epoch": 2.0143072289156625, "grad_norm": 11.08629172201386, "learning_rate": 4.965173192771085e-07, "logits/chosen": -2.575390577316284, "logits/rejected": -2.5523438453674316, "logps/chosen": -352.3500061035156, "logps/rejected": -447.79998779296875, "loss": 0.0514, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.11557617038488388, "rewards/margins": 4.439062595367432, "rewards/rejected": -4.557812690734863, "step": 5350 }, { "epoch": 2.0180722891566263, "grad_norm": 18.881406511140465, "learning_rate": 4.955760542168675e-07, "logits/chosen": -2.7281250953674316, "logits/rejected": -2.69921875, "logps/chosen": -326.6499938964844, "logps/rejected": -381.54998779296875, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 0.016632080078125, "rewards/margins": 3.9000000953674316, "rewards/rejected": -3.8843750953674316, "step": 5360 }, { "epoch": 2.0218373493975905, "grad_norm": 23.477440377155137, "learning_rate": 4.946347891566264e-07, "logits/chosen": -2.625781297683716, "logits/rejected": -2.6796875, "logps/chosen": -364.0, "logps/rejected": -419.6000061035156, "loss": 0.0647, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.40472412109375, "rewards/margins": 4.477343559265137, "rewards/rejected": -4.88671875, "step": 5370 }, { "epoch": 2.0256024096385543, "grad_norm": 27.51558156183409, "learning_rate": 4.936935240963855e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.578906297683716, "logps/chosen": -276.54998779296875, "logps/rejected": -352.45001220703125, "loss": 0.0668, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2947998046875, "rewards/margins": 4.082812309265137, "rewards/rejected": -4.376562595367432, "step": 5380 }, { "epoch": 2.029367469879518, "grad_norm": 7.807321157988913, "learning_rate": 4.927522590361445e-07, "logits/chosen": -2.6078124046325684, "logits/rejected": -2.598437547683716, "logps/chosen": -358.8500061035156, "logps/rejected": -429.3500061035156, "loss": 0.0424, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.534954845905304, "rewards/margins": 4.567968845367432, "rewards/rejected": -5.099218845367432, "step": 5390 }, { "epoch": 2.033132530120482, "grad_norm": 22.367005520866673, "learning_rate": 4.918109939759036e-07, "logits/chosen": -2.657031297683716, "logits/rejected": -2.663281202316284, "logps/chosen": -343.5, "logps/rejected": -406.8999938964844, "loss": 0.0591, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.13902588188648224, "rewards/margins": 4.2578125, "rewards/rejected": -4.401562690734863, "step": 5400 }, { "epoch": 2.0368975903614457, "grad_norm": 22.317995252828364, "learning_rate": 4.908697289156626e-07, "logits/chosen": -2.70703125, "logits/rejected": -2.698437452316284, "logps/chosen": -339.95001220703125, "logps/rejected": -447.8500061035156, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.551098644733429, "rewards/margins": 4.688281059265137, "rewards/rejected": -5.241406440734863, "step": 5410 }, { "epoch": 2.0406626506024095, "grad_norm": 11.378065983530437, "learning_rate": 4.899284638554217e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.7265625, "logps/chosen": -306.95001220703125, "logps/rejected": -365.79998779296875, "loss": 0.081, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5763305425643921, "rewards/margins": 4.267187595367432, "rewards/rejected": -4.842187404632568, "step": 5420 }, { "epoch": 2.0444277108433733, "grad_norm": 34.44634012881254, "learning_rate": 4.889871987951807e-07, "logits/chosen": -2.782031297683716, "logits/rejected": -2.727343797683716, "logps/chosen": -355.5, "logps/rejected": -426.20001220703125, "loss": 0.0444, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6290527582168579, "rewards/margins": 4.752343654632568, "rewards/rejected": -5.378125190734863, "step": 5430 }, { "epoch": 2.0481927710843375, "grad_norm": 25.386010879863306, "learning_rate": 4.880459337349398e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.753124952316284, "logps/chosen": -316.5, "logps/rejected": -365.6499938964844, "loss": 0.0538, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7753540277481079, "rewards/margins": 4.51171875, "rewards/rejected": -5.28515625, "step": 5440 }, { "epoch": 2.0519578313253013, "grad_norm": 64.04151824479744, "learning_rate": 4.871046686746988e-07, "logits/chosen": -2.70703125, "logits/rejected": -2.676562547683716, "logps/chosen": -395.0, "logps/rejected": -445.6499938964844, "loss": 0.0648, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.03485107421875, "rewards/margins": 4.834374904632568, "rewards/rejected": -5.87109375, "step": 5450 }, { "epoch": 2.055722891566265, "grad_norm": 39.2357777666957, "learning_rate": 4.861634036144578e-07, "logits/chosen": -2.625, "logits/rejected": -2.6578125953674316, "logps/chosen": -344.54998779296875, "logps/rejected": -477.25, "loss": 0.0584, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.17437744140625, "rewards/margins": 5.057031154632568, "rewards/rejected": -6.229687690734863, "step": 5460 }, { "epoch": 2.059487951807229, "grad_norm": 24.45601367130127, "learning_rate": 4.852221385542168e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.5882811546325684, "logps/chosen": -327.75, "logps/rejected": -374.95001220703125, "loss": 0.0669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.949810802936554, "rewards/margins": 4.461718559265137, "rewards/rejected": -5.41015625, "step": 5470 }, { "epoch": 2.0632530120481927, "grad_norm": 8.504312296438673, "learning_rate": 4.842808734939759e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.698437452316284, "logps/chosen": -346.20001220703125, "logps/rejected": -455.3999938964844, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.77508544921875, "rewards/margins": 4.939843654632568, "rewards/rejected": -5.716406345367432, "step": 5480 }, { "epoch": 2.0670180722891565, "grad_norm": 19.196017347203963, "learning_rate": 4.833396084337349e-07, "logits/chosen": -2.735156297683716, "logits/rejected": -2.7210936546325684, "logps/chosen": -351.5, "logps/rejected": -445.1000061035156, "loss": 0.042, "rewards/accuracies": 1.0, "rewards/chosen": -0.8716491460800171, "rewards/margins": 4.936718940734863, "rewards/rejected": -5.8125, "step": 5490 }, { "epoch": 2.0707831325301207, "grad_norm": 33.294132390034015, "learning_rate": 4.823983433734939e-07, "logits/chosen": -2.502734422683716, "logits/rejected": -2.530078172683716, "logps/chosen": -322.67498779296875, "logps/rejected": -410.04998779296875, "loss": 0.0613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.830517590045929, "rewards/margins": 4.794531345367432, "rewards/rejected": -5.628125190734863, "step": 5500 }, { "epoch": 2.0745481927710845, "grad_norm": 12.31159572113931, "learning_rate": 4.81457078313253e-07, "logits/chosen": -2.692187547683716, "logits/rejected": -2.746875047683716, "logps/chosen": -395.25, "logps/rejected": -459.0, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.021447777748108, "rewards/margins": 4.924218654632568, "rewards/rejected": -5.943749904632568, "step": 5510 }, { "epoch": 2.0783132530120483, "grad_norm": 6.399664565102331, "learning_rate": 4.805158132530121e-07, "logits/chosen": -2.61328125, "logits/rejected": -2.6656250953674316, "logps/chosen": -368.3500061035156, "logps/rejected": -440.6000061035156, "loss": 0.051, "rewards/accuracies": 1.0, "rewards/chosen": -0.7311645746231079, "rewards/margins": 5.117968559265137, "rewards/rejected": -5.8515625, "step": 5520 }, { "epoch": 2.082078313253012, "grad_norm": 26.243249832467374, "learning_rate": 4.795745481927711e-07, "logits/chosen": -2.754687547683716, "logits/rejected": -2.77734375, "logps/chosen": -377.95001220703125, "logps/rejected": -431.54998779296875, "loss": 0.0513, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.52935791015625, "rewards/margins": 4.891406059265137, "rewards/rejected": -5.420312404632568, "step": 5530 }, { "epoch": 2.085843373493976, "grad_norm": 3.6892490288953597, "learning_rate": 4.7863328313253e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.7171874046325684, "logps/chosen": -365.3500061035156, "logps/rejected": -431.04998779296875, "loss": 0.0379, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.574755847454071, "rewards/margins": 5.313281059265137, "rewards/rejected": -5.887499809265137, "step": 5540 }, { "epoch": 2.0896084337349397, "grad_norm": 46.17482675488325, "learning_rate": 4.776920180722891e-07, "logits/chosen": -2.7367186546325684, "logits/rejected": -2.703906297683716, "logps/chosen": -341.6499938964844, "logps/rejected": -420.1499938964844, "loss": 0.0658, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7112182378768921, "rewards/margins": 4.541406154632568, "rewards/rejected": -5.249218940734863, "step": 5550 }, { "epoch": 2.0933734939759034, "grad_norm": 11.96398593310928, "learning_rate": 4.7675075301204815e-07, "logits/chosen": -2.6617188453674316, "logits/rejected": -2.608593702316284, "logps/chosen": -365.3500061035156, "logps/rejected": -441.29998779296875, "loss": 0.0537, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1873047351837158, "rewards/margins": 4.842187404632568, "rewards/rejected": -6.03125, "step": 5560 }, { "epoch": 2.0971385542168677, "grad_norm": 24.837997835742954, "learning_rate": 4.758094879518072e-07, "logits/chosen": -2.667187452316284, "logits/rejected": -2.7289061546325684, "logps/chosen": -395.3500061035156, "logps/rejected": -470.5, "loss": 0.0539, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.246484398841858, "rewards/margins": 4.845312595367432, "rewards/rejected": -6.098437309265137, "step": 5570 }, { "epoch": 2.1009036144578315, "grad_norm": 17.602899152383355, "learning_rate": 4.7486822289156626e-07, "logits/chosen": -2.729687452316284, "logits/rejected": -2.7289061546325684, "logps/chosen": -385.0, "logps/rejected": -468.20001220703125, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": -1.373754858970642, "rewards/margins": 5.211718559265137, "rewards/rejected": -6.5859375, "step": 5580 }, { "epoch": 2.1046686746987953, "grad_norm": 19.146967746897804, "learning_rate": 4.739269578313253e-07, "logits/chosen": -2.73046875, "logits/rejected": -2.7515625953674316, "logps/chosen": -356.75, "logps/rejected": -410.70001220703125, "loss": 0.0567, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7979491949081421, "rewards/margins": 4.663281440734863, "rewards/rejected": -5.462500095367432, "step": 5590 }, { "epoch": 2.108433734939759, "grad_norm": 13.911064783934966, "learning_rate": 4.729856927710843e-07, "logits/chosen": -2.664843797683716, "logits/rejected": -2.625781297683716, "logps/chosen": -355.1000061035156, "logps/rejected": -425.04998779296875, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": -0.518908679485321, "rewards/margins": 4.993750095367432, "rewards/rejected": -5.516406059265137, "step": 5600 }, { "epoch": 2.112198795180723, "grad_norm": 26.328954623831823, "learning_rate": 4.7204442771084334e-07, "logits/chosen": -2.572265625, "logits/rejected": -2.629687547683716, "logps/chosen": -338.20001220703125, "logps/rejected": -415.04998779296875, "loss": 0.0528, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.40397948026657104, "rewards/margins": 4.661718845367432, "rewards/rejected": -5.067187309265137, "step": 5610 }, { "epoch": 2.1159638554216866, "grad_norm": 11.234202450736662, "learning_rate": 4.711031626506024e-07, "logits/chosen": -2.6820311546325684, "logits/rejected": -2.6968750953674316, "logps/chosen": -344.5, "logps/rejected": -419.20001220703125, "loss": 0.0507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.48289793729782104, "rewards/margins": 5.078125, "rewards/rejected": -5.560937404632568, "step": 5620 }, { "epoch": 2.1197289156626504, "grad_norm": 29.44724463600101, "learning_rate": 4.701618975903614e-07, "logits/chosen": -2.62109375, "logits/rejected": -2.6796875, "logps/chosen": -358.6000061035156, "logps/rejected": -413.5, "loss": 0.0536, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.3321777284145355, "rewards/margins": 4.616406440734863, "rewards/rejected": -4.94921875, "step": 5630 }, { "epoch": 2.1234939759036147, "grad_norm": 34.87217210520758, "learning_rate": 4.6922063253012047e-07, "logits/chosen": -2.6773438453674316, "logits/rejected": -2.69140625, "logps/chosen": -323.20001220703125, "logps/rejected": -400.3999938964844, "loss": 0.0597, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.19803467392921448, "rewards/margins": 5.128125190734863, "rewards/rejected": -5.330468654632568, "step": 5640 }, { "epoch": 2.1272590361445785, "grad_norm": 12.305800239373331, "learning_rate": 4.682793674698795e-07, "logits/chosen": -2.6624999046325684, "logits/rejected": -2.742968797683716, "logps/chosen": -359.79998779296875, "logps/rejected": -408.0, "loss": 0.0445, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.8171631097793579, "rewards/margins": 4.849218845367432, "rewards/rejected": -5.66796875, "step": 5650 }, { "epoch": 2.1310240963855422, "grad_norm": 26.073154350239093, "learning_rate": 4.673381024096386e-07, "logits/chosen": -2.69140625, "logits/rejected": -2.725781202316284, "logps/chosen": -349.8500061035156, "logps/rejected": -406.79998779296875, "loss": 0.0581, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.950122058391571, "rewards/margins": 4.9140625, "rewards/rejected": -5.860937595367432, "step": 5660 }, { "epoch": 2.134789156626506, "grad_norm": 20.64934157171303, "learning_rate": 4.6639683734939755e-07, "logits/chosen": -2.745312452316284, "logits/rejected": -2.714062452316284, "logps/chosen": -351.5, "logps/rejected": -414.6000061035156, "loss": 0.0455, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9109863042831421, "rewards/margins": 4.692187309265137, "rewards/rejected": -5.6015625, "step": 5670 }, { "epoch": 2.13855421686747, "grad_norm": 14.581372666275461, "learning_rate": 4.6545557228915663e-07, "logits/chosen": -2.760937452316284, "logits/rejected": -2.77734375, "logps/chosen": -307.6499938964844, "logps/rejected": -384.95001220703125, "loss": 0.0461, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5826416015625, "rewards/margins": 4.713281154632568, "rewards/rejected": -5.293749809265137, "step": 5680 }, { "epoch": 2.1423192771084336, "grad_norm": 44.85027455756815, "learning_rate": 4.6451430722891566e-07, "logits/chosen": -2.608593702316284, "logits/rejected": -2.577343702316284, "logps/chosen": -354.6499938964844, "logps/rejected": -460.6000061035156, "loss": 0.0493, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7823852300643921, "rewards/margins": 4.925000190734863, "rewards/rejected": -5.707812309265137, "step": 5690 }, { "epoch": 2.1460843373493974, "grad_norm": 23.081941024691563, "learning_rate": 4.6357304216867463e-07, "logits/chosen": -2.6703124046325684, "logits/rejected": -2.645312547683716, "logps/chosen": -361.6000061035156, "logps/rejected": -466.29998779296875, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": -0.781689465045929, "rewards/margins": 5.064843654632568, "rewards/rejected": -5.844531059265137, "step": 5700 }, { "epoch": 2.1498493975903616, "grad_norm": 43.84200080219723, "learning_rate": 4.626317771084337e-07, "logits/chosen": -2.6968750953674316, "logits/rejected": -2.73046875, "logps/chosen": -360.0, "logps/rejected": -385.6499938964844, "loss": 0.054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.5315490961074829, "rewards/margins": 4.482031345367432, "rewards/rejected": -5.017187595367432, "step": 5710 }, { "epoch": 2.1536144578313254, "grad_norm": 72.95153332328613, "learning_rate": 4.6169051204819274e-07, "logits/chosen": -2.4976563453674316, "logits/rejected": -2.600781202316284, "logps/chosen": -363.95001220703125, "logps/rejected": -446.0, "loss": 0.0516, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.954760730266571, "rewards/margins": 4.893750190734863, "rewards/rejected": -5.852343559265137, "step": 5720 }, { "epoch": 2.1573795180722892, "grad_norm": 43.20648554041402, "learning_rate": 4.607492469879518e-07, "logits/chosen": -2.6382813453674316, "logits/rejected": -2.64453125, "logps/chosen": -382.45001220703125, "logps/rejected": -460.3999938964844, "loss": 0.0529, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.823345959186554, "rewards/margins": 5.21875, "rewards/rejected": -6.043749809265137, "step": 5730 }, { "epoch": 2.161144578313253, "grad_norm": 15.505406935675827, "learning_rate": 4.598079819277108e-07, "logits/chosen": -2.765625, "logits/rejected": -2.7281250953674316, "logps/chosen": -340.1499938964844, "logps/rejected": -400.75, "loss": 0.0618, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.939208984375, "rewards/margins": 4.61328125, "rewards/rejected": -5.553124904632568, "step": 5740 }, { "epoch": 2.164909638554217, "grad_norm": 20.74022644432084, "learning_rate": 4.5886671686746987e-07, "logits/chosen": -2.6937499046325684, "logits/rejected": -2.6820311546325684, "logps/chosen": -352.6499938964844, "logps/rejected": -426.04998779296875, "loss": 0.0663, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0574219226837158, "rewards/margins": 4.8125, "rewards/rejected": -5.867968559265137, "step": 5750 }, { "epoch": 2.1686746987951806, "grad_norm": 35.20927226735573, "learning_rate": 4.579254518072289e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -2.660937547683716, "logps/chosen": -335.0, "logps/rejected": -410.54998779296875, "loss": 0.0754, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0343506336212158, "rewards/margins": 4.951562404632568, "rewards/rejected": -5.982812404632568, "step": 5760 }, { "epoch": 2.1724397590361444, "grad_norm": 24.03373924700295, "learning_rate": 4.569841867469879e-07, "logits/chosen": -2.772656202316284, "logits/rejected": -2.6953125, "logps/chosen": -372.1499938964844, "logps/rejected": -429.1000061035156, "loss": 0.054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.767578125, "rewards/margins": 4.59765625, "rewards/rejected": -6.364062309265137, "step": 5770 }, { "epoch": 2.1762048192771086, "grad_norm": 8.051405021230375, "learning_rate": 4.5604292168674695e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.71875, "logps/chosen": -330.1499938964844, "logps/rejected": -403.20001220703125, "loss": 0.0438, "rewards/accuracies": 1.0, "rewards/chosen": -1.079492211341858, "rewards/margins": 5.01953125, "rewards/rejected": -6.09375, "step": 5780 }, { "epoch": 2.1799698795180724, "grad_norm": 82.0195483277044, "learning_rate": 4.5510165662650603e-07, "logits/chosen": -2.596874952316284, "logits/rejected": -2.51171875, "logps/chosen": -345.5, "logps/rejected": -444.0, "loss": 0.0615, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7066650390625, "rewards/margins": 4.9453125, "rewards/rejected": -5.651562690734863, "step": 5790 }, { "epoch": 2.183734939759036, "grad_norm": 61.65565416967591, "learning_rate": 4.5416039156626506e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.712890625, "logps/chosen": -334.95001220703125, "logps/rejected": -404.75, "loss": 0.0564, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5393921136856079, "rewards/margins": 4.659375190734863, "rewards/rejected": -5.198437690734863, "step": 5800 }, { "epoch": 2.1875, "grad_norm": 15.265649140266111, "learning_rate": 4.532191265060241e-07, "logits/chosen": -2.58203125, "logits/rejected": -2.6109375953674316, "logps/chosen": -327.75, "logps/rejected": -404.6000061035156, "loss": 0.0481, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.612170398235321, "rewards/margins": 4.850781440734863, "rewards/rejected": -5.471093654632568, "step": 5810 }, { "epoch": 2.191265060240964, "grad_norm": 19.904724357392393, "learning_rate": 4.522778614457831e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.671875, "logps/chosen": -337.1499938964844, "logps/rejected": -445.5, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": -0.868115246295929, "rewards/margins": 5.042187690734863, "rewards/rejected": -5.912499904632568, "step": 5820 }, { "epoch": 2.1950301204819276, "grad_norm": 53.41645000163531, "learning_rate": 4.513365963855422e-07, "logits/chosen": -2.702343702316284, "logits/rejected": -2.678906202316284, "logps/chosen": -345.8999938964844, "logps/rejected": -449.8999938964844, "loss": 0.0653, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1500976085662842, "rewards/margins": 4.854687690734863, "rewards/rejected": -6.003125190734863, "step": 5830 }, { "epoch": 2.1987951807228914, "grad_norm": 10.875092549310137, "learning_rate": 4.5039533132530116e-07, "logits/chosen": -2.733593702316284, "logits/rejected": -2.689453125, "logps/chosen": -335.04998779296875, "logps/rejected": -408.95001220703125, "loss": 0.0508, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.060644507408142, "rewards/margins": 4.785937309265137, "rewards/rejected": -5.84375, "step": 5840 }, { "epoch": 2.2025602409638556, "grad_norm": 16.170580683604186, "learning_rate": 4.494540662650602e-07, "logits/chosen": -2.578125, "logits/rejected": -2.6484375, "logps/chosen": -372.45001220703125, "logps/rejected": -474.5, "loss": 0.0313, "rewards/accuracies": 1.0, "rewards/chosen": -1.1116211414337158, "rewards/margins": 5.033593654632568, "rewards/rejected": -6.140625, "step": 5850 }, { "epoch": 2.2063253012048194, "grad_norm": 38.44731745168649, "learning_rate": 4.4851280120481927e-07, "logits/chosen": -2.7085938453674316, "logits/rejected": -2.698437452316284, "logps/chosen": -340.3500061035156, "logps/rejected": -426.54998779296875, "loss": 0.0674, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.141210913658142, "rewards/margins": 5.081250190734863, "rewards/rejected": -6.2265625, "step": 5860 }, { "epoch": 2.210090361445783, "grad_norm": 11.644984089011244, "learning_rate": 4.4757153614457824e-07, "logits/chosen": -2.649609327316284, "logits/rejected": -2.764843702316284, "logps/chosen": -380.6000061035156, "logps/rejected": -460.5, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": -0.689099133014679, "rewards/margins": 5.107031345367432, "rewards/rejected": -5.800000190734863, "step": 5870 }, { "epoch": 2.213855421686747, "grad_norm": 29.811325466503522, "learning_rate": 4.466302710843373e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.7359375953674316, "logps/chosen": -345.04998779296875, "logps/rejected": -418.8999938964844, "loss": 0.0598, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.032617211341858, "rewards/margins": 4.8515625, "rewards/rejected": -5.88671875, "step": 5880 }, { "epoch": 2.2176204819277108, "grad_norm": 8.634958963037807, "learning_rate": 4.4568900602409635e-07, "logits/chosen": -2.7593750953674316, "logits/rejected": -2.7757811546325684, "logps/chosen": -306.70001220703125, "logps/rejected": -415.5, "loss": 0.0519, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.334375023841858, "rewards/margins": 5.206250190734863, "rewards/rejected": -6.5390625, "step": 5890 }, { "epoch": 2.2213855421686746, "grad_norm": 21.883625682634914, "learning_rate": 4.4474774096385543e-07, "logits/chosen": -2.8257813453674316, "logits/rejected": -2.7289061546325684, "logps/chosen": -341.3500061035156, "logps/rejected": -439.6000061035156, "loss": 0.0561, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.296960473060608, "rewards/margins": 5.05078125, "rewards/rejected": -6.345312595367432, "step": 5900 }, { "epoch": 2.2251506024096384, "grad_norm": 6.76816297720333, "learning_rate": 4.438064759036144e-07, "logits/chosen": -2.739062547683716, "logits/rejected": -2.8148436546325684, "logps/chosen": -355.79998779296875, "logps/rejected": -447.1000061035156, "loss": 0.0543, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.736718773841858, "rewards/margins": 5.442187309265137, "rewards/rejected": -7.173437595367432, "step": 5910 }, { "epoch": 2.2289156626506026, "grad_norm": 21.240289746522397, "learning_rate": 4.428652108433735e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -2.871875047683716, "logps/chosen": -353.95001220703125, "logps/rejected": -415.8999938964844, "loss": 0.0745, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.7314453125, "rewards/margins": 4.810937404632568, "rewards/rejected": -6.5390625, "step": 5920 }, { "epoch": 2.2326807228915664, "grad_norm": 20.561662770111027, "learning_rate": 4.419239457831325e-07, "logits/chosen": -2.655468702316284, "logits/rejected": -2.6617188453674316, "logps/chosen": -354.6000061035156, "logps/rejected": -411.29998779296875, "loss": 0.0625, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5224609375, "rewards/margins": 4.957812309265137, "rewards/rejected": -6.482812404632568, "step": 5930 }, { "epoch": 2.23644578313253, "grad_norm": 45.92139669463513, "learning_rate": 4.4098268072289154e-07, "logits/chosen": -2.7164063453674316, "logits/rejected": -2.7046875953674316, "logps/chosen": -342.8500061035156, "logps/rejected": -432.3999938964844, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -1.4093749523162842, "rewards/margins": 5.21484375, "rewards/rejected": -6.628125190734863, "step": 5940 }, { "epoch": 2.240210843373494, "grad_norm": 20.54317894997648, "learning_rate": 4.4004141566265056e-07, "logits/chosen": -2.7671875953674316, "logits/rejected": -2.7476563453674316, "logps/chosen": -326.3999938964844, "logps/rejected": -441.6000061035156, "loss": 0.0372, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.169677734375, "rewards/margins": 5.491406440734863, "rewards/rejected": -6.657812595367432, "step": 5950 }, { "epoch": 2.2439759036144578, "grad_norm": 21.53051642017506, "learning_rate": 4.3910015060240964e-07, "logits/chosen": -2.73046875, "logits/rejected": -2.762500047683716, "logps/chosen": -355.04998779296875, "logps/rejected": -452.29998779296875, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -1.1763184070587158, "rewards/margins": 5.59375, "rewards/rejected": -6.771874904632568, "step": 5960 }, { "epoch": 2.2477409638554215, "grad_norm": 10.528409711260915, "learning_rate": 4.3815888554216867e-07, "logits/chosen": -2.8515625, "logits/rejected": -2.827343702316284, "logps/chosen": -397.5, "logps/rejected": -471.5, "loss": 0.0427, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.149774193763733, "rewards/margins": 5.293749809265137, "rewards/rejected": -6.449999809265137, "step": 5970 }, { "epoch": 2.2515060240963853, "grad_norm": 19.64428185262646, "learning_rate": 4.372176204819277e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.7359375953674316, "logps/chosen": -364.20001220703125, "logps/rejected": -435.1000061035156, "loss": 0.0469, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4345703125, "rewards/margins": 5.487500190734863, "rewards/rejected": -6.918749809265137, "step": 5980 }, { "epoch": 2.2552710843373496, "grad_norm": 20.094490628213638, "learning_rate": 4.362763554216867e-07, "logits/chosen": -2.7816405296325684, "logits/rejected": -2.7984375953674316, "logps/chosen": -394.1000061035156, "logps/rejected": -474.20001220703125, "loss": 0.0478, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.585058569908142, "rewards/margins": 5.2578125, "rewards/rejected": -6.845312595367432, "step": 5990 }, { "epoch": 2.2590361445783134, "grad_norm": 21.170219062648115, "learning_rate": 4.353350903614458e-07, "logits/chosen": -2.664843797683716, "logits/rejected": -2.7367186546325684, "logps/chosen": -304.5, "logps/rejected": -421.0, "loss": 0.0376, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.44287109375, "rewards/margins": 5.257031440734863, "rewards/rejected": -6.698437690734863, "step": 6000 }, { "epoch": 2.262801204819277, "grad_norm": 5.305612483362943, "learning_rate": 4.343938253012048e-07, "logits/chosen": -2.6976561546325684, "logits/rejected": -2.702343702316284, "logps/chosen": -356.6499938964844, "logps/rejected": -434.70001220703125, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -1.34375, "rewards/margins": 5.041406154632568, "rewards/rejected": -6.387499809265137, "step": 6010 }, { "epoch": 2.266566265060241, "grad_norm": 8.2208881903527, "learning_rate": 4.334525602409638e-07, "logits/chosen": -2.692187547683716, "logits/rejected": -2.7984375953674316, "logps/chosen": -345.1000061035156, "logps/rejected": -411.20001220703125, "loss": 0.065, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.079675316810608, "rewards/margins": 4.911718845367432, "rewards/rejected": -5.992968559265137, "step": 6020 }, { "epoch": 2.2703313253012047, "grad_norm": 10.492540088608944, "learning_rate": 4.325112951807229e-07, "logits/chosen": -2.757031202316284, "logits/rejected": -2.739062547683716, "logps/chosen": -341.8999938964844, "logps/rejected": -427.20001220703125, "loss": 0.066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2712891101837158, "rewards/margins": 5.154687404632568, "rewards/rejected": -6.428124904632568, "step": 6030 }, { "epoch": 2.2740963855421685, "grad_norm": 11.251736007052815, "learning_rate": 4.315700301204819e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.7203125953674316, "logps/chosen": -347.0249938964844, "logps/rejected": -415.79998779296875, "loss": 0.0437, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6003906726837158, "rewards/margins": 5.257031440734863, "rewards/rejected": -6.857812404632568, "step": 6040 }, { "epoch": 2.2778614457831328, "grad_norm": 10.340423214897939, "learning_rate": 4.3062876506024094e-07, "logits/chosen": -2.7523436546325684, "logits/rejected": -2.742968797683716, "logps/chosen": -375.5, "logps/rejected": -455.29998779296875, "loss": 0.0323, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.464453101158142, "rewards/margins": 5.515625, "rewards/rejected": -6.985937595367432, "step": 6050 }, { "epoch": 2.2816265060240966, "grad_norm": 14.905003874514419, "learning_rate": 4.2968749999999996e-07, "logits/chosen": -2.703125, "logits/rejected": -2.6968750953674316, "logps/chosen": -400.54998779296875, "logps/rejected": -450.3500061035156, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -1.568359375, "rewards/margins": 5.45703125, "rewards/rejected": -7.015625, "step": 6060 }, { "epoch": 2.2853915662650603, "grad_norm": 13.326763495404503, "learning_rate": 4.2874623493975904e-07, "logits/chosen": -2.8179688453674316, "logits/rejected": -2.7890625, "logps/chosen": -302.04998779296875, "logps/rejected": -385.8500061035156, "loss": 0.0431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4376952648162842, "rewards/margins": 5.024218559265137, "rewards/rejected": -6.465624809265137, "step": 6070 }, { "epoch": 2.289156626506024, "grad_norm": 21.698460968680838, "learning_rate": 4.27804969879518e-07, "logits/chosen": -2.7046875953674316, "logits/rejected": -2.803906202316284, "logps/chosen": -345.1499938964844, "logps/rejected": -453.70001220703125, "loss": 0.0707, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4153320789337158, "rewards/margins": 5.094531059265137, "rewards/rejected": -6.506249904632568, "step": 6080 }, { "epoch": 2.292921686746988, "grad_norm": 58.743449936483806, "learning_rate": 4.268637048192771e-07, "logits/chosen": -2.731250047683716, "logits/rejected": -2.7125000953674316, "logps/chosen": -317.25, "logps/rejected": -376.3999938964844, "loss": 0.0589, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5107421875, "rewards/margins": 5.0234375, "rewards/rejected": -6.532812595367432, "step": 6090 }, { "epoch": 2.2966867469879517, "grad_norm": 24.857546186231026, "learning_rate": 4.259224397590361e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.7085938453674316, "logps/chosen": -330.6499938964844, "logps/rejected": -437.29998779296875, "loss": 0.0528, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.521093726158142, "rewards/margins": 5.094531059265137, "rewards/rejected": -6.615624904632568, "step": 6100 }, { "epoch": 2.3004518072289155, "grad_norm": 35.16260864635914, "learning_rate": 4.249811746987952e-07, "logits/chosen": -2.6734375953674316, "logits/rejected": -2.7164063453674316, "logps/chosen": -292.75, "logps/rejected": -415.70001220703125, "loss": 0.0616, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5286133289337158, "rewards/margins": 5.220312595367432, "rewards/rejected": -6.753125190734863, "step": 6110 }, { "epoch": 2.3042168674698793, "grad_norm": 15.636049078827346, "learning_rate": 4.240399096385542e-07, "logits/chosen": -2.7125000953674316, "logits/rejected": -2.706249952316284, "logps/chosen": -344.0, "logps/rejected": -409.8999938964844, "loss": 0.0716, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4670898914337158, "rewards/margins": 4.774218559265137, "rewards/rejected": -6.240624904632568, "step": 6120 }, { "epoch": 2.3079819277108435, "grad_norm": 24.537672596372797, "learning_rate": 4.2309864457831325e-07, "logits/chosen": -2.81640625, "logits/rejected": -2.71875, "logps/chosen": -338.29998779296875, "logps/rejected": -434.29998779296875, "loss": 0.0669, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2698242664337158, "rewards/margins": 4.73046875, "rewards/rejected": -5.998437404632568, "step": 6130 }, { "epoch": 2.3117469879518073, "grad_norm": 79.53072566406489, "learning_rate": 4.221573795180723e-07, "logits/chosen": -2.6507811546325684, "logits/rejected": -2.6167969703674316, "logps/chosen": -323.8500061035156, "logps/rejected": -429.0, "loss": 0.0682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.988476574420929, "rewards/margins": 4.981249809265137, "rewards/rejected": -5.967968940734863, "step": 6140 }, { "epoch": 2.315512048192771, "grad_norm": 29.6456006711646, "learning_rate": 4.212161144578313e-07, "logits/chosen": -2.672656297683716, "logits/rejected": -2.7085938453674316, "logps/chosen": -362.29998779296875, "logps/rejected": -413.54998779296875, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -1.040771484375, "rewards/margins": 4.964062690734863, "rewards/rejected": -6.005468845367432, "step": 6150 }, { "epoch": 2.319277108433735, "grad_norm": 11.95950601054181, "learning_rate": 4.2027484939759033e-07, "logits/chosen": -2.807812452316284, "logits/rejected": -2.7328124046325684, "logps/chosen": -375.3999938964844, "logps/rejected": -464.8999938964844, "loss": 0.058, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1868164539337158, "rewards/margins": 5.207812309265137, "rewards/rejected": -6.390625, "step": 6160 }, { "epoch": 2.3230421686746987, "grad_norm": 53.329905463809595, "learning_rate": 4.193335843373494e-07, "logits/chosen": -2.8070311546325684, "logits/rejected": -2.664843797683716, "logps/chosen": -351.54998779296875, "logps/rejected": -477.25, "loss": 0.0645, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1642577648162842, "rewards/margins": 5.01171875, "rewards/rejected": -6.182031154632568, "step": 6170 }, { "epoch": 2.3268072289156625, "grad_norm": 14.724475504619589, "learning_rate": 4.1839231927710844e-07, "logits/chosen": -2.75390625, "logits/rejected": -2.73828125, "logps/chosen": -389.75, "logps/rejected": -502.5, "loss": 0.042, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3386719226837158, "rewards/margins": 5.73828125, "rewards/rejected": -7.076562404632568, "step": 6180 }, { "epoch": 2.3305722891566267, "grad_norm": 20.14890006664389, "learning_rate": 4.174510542168674e-07, "logits/chosen": -2.782031297683716, "logits/rejected": -2.768749952316284, "logps/chosen": -353.3500061035156, "logps/rejected": -418.45001220703125, "loss": 0.0441, "rewards/accuracies": 1.0, "rewards/chosen": -1.506445288658142, "rewards/margins": 4.9296875, "rewards/rejected": -6.434374809265137, "step": 6190 }, { "epoch": 2.3343373493975905, "grad_norm": 78.49008436191302, "learning_rate": 4.165097891566265e-07, "logits/chosen": -2.879687547683716, "logits/rejected": -2.842968702316284, "logps/chosen": -377.79998779296875, "logps/rejected": -456.6000061035156, "loss": 0.0525, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.604882836341858, "rewards/margins": 5.358593940734863, "rewards/rejected": -6.964062690734863, "step": 6200 }, { "epoch": 2.3381024096385543, "grad_norm": 13.097041200307485, "learning_rate": 4.155685240963855e-07, "logits/chosen": -2.780468702316284, "logits/rejected": -2.7945313453674316, "logps/chosen": -348.75, "logps/rejected": -398.5, "loss": 0.0675, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.96527099609375, "rewards/margins": 4.903906345367432, "rewards/rejected": -5.875, "step": 6210 }, { "epoch": 2.341867469879518, "grad_norm": 10.963780704218607, "learning_rate": 4.1462725903614455e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.760937452316284, "logps/chosen": -372.125, "logps/rejected": -410.1000061035156, "loss": 0.0754, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.758923351764679, "rewards/margins": 5.07421875, "rewards/rejected": -5.834374904632568, "step": 6220 }, { "epoch": 2.345632530120482, "grad_norm": 19.956345047336285, "learning_rate": 4.136859939759036e-07, "logits/chosen": -2.760937452316284, "logits/rejected": -2.778125047683716, "logps/chosen": -340.75, "logps/rejected": -395.3999938964844, "loss": 0.0604, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1139647960662842, "rewards/margins": 4.817968845367432, "rewards/rejected": -5.925000190734863, "step": 6230 }, { "epoch": 2.3493975903614457, "grad_norm": 28.16515627173267, "learning_rate": 4.1274472891566265e-07, "logits/chosen": -2.7164063453674316, "logits/rejected": -2.76171875, "logps/chosen": -360.54998779296875, "logps/rejected": -462.29998779296875, "loss": 0.0564, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1496093273162842, "rewards/margins": 5.145312309265137, "rewards/rejected": -6.295312404632568, "step": 6240 }, { "epoch": 2.3531626506024095, "grad_norm": 14.02895555575323, "learning_rate": 4.118034638554217e-07, "logits/chosen": -2.6617188453674316, "logits/rejected": -2.6898436546325684, "logps/chosen": -372.6000061035156, "logps/rejected": -462.29998779296875, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": -1.0604248046875, "rewards/margins": 5.227343559265137, "rewards/rejected": -6.290625095367432, "step": 6250 }, { "epoch": 2.3569277108433733, "grad_norm": 73.13292469840067, "learning_rate": 4.108621987951807e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.7109375, "logps/chosen": -358.04998779296875, "logps/rejected": -431.04998779296875, "loss": 0.0505, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9314941167831421, "rewards/margins": 4.875, "rewards/rejected": -5.807031154632568, "step": 6260 }, { "epoch": 2.3606927710843375, "grad_norm": 7.509577492523318, "learning_rate": 4.0992093373493973e-07, "logits/chosen": -2.6328125, "logits/rejected": -2.6148438453674316, "logps/chosen": -381.75, "logps/rejected": -507.0, "loss": 0.0409, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2693359851837158, "rewards/margins": 5.885937690734863, "rewards/rejected": -7.157812595367432, "step": 6270 }, { "epoch": 2.3644578313253013, "grad_norm": 15.604293872320223, "learning_rate": 4.089796686746988e-07, "logits/chosen": -2.749218702316284, "logits/rejected": -2.682812452316284, "logps/chosen": -343.54998779296875, "logps/rejected": -412.25, "loss": 0.0508, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1336548328399658, "rewards/margins": 4.921093940734863, "rewards/rejected": -6.057031154632568, "step": 6280 }, { "epoch": 2.368222891566265, "grad_norm": 28.947733306972808, "learning_rate": 4.080384036144578e-07, "logits/chosen": -2.6875, "logits/rejected": -2.701171875, "logps/chosen": -389.5, "logps/rejected": -454.75, "loss": 0.0572, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0519530773162842, "rewards/margins": 4.916406154632568, "rewards/rejected": -5.965624809265137, "step": 6290 }, { "epoch": 2.371987951807229, "grad_norm": 13.195391530859713, "learning_rate": 4.0709713855421687e-07, "logits/chosen": -2.899218797683716, "logits/rejected": -2.846874952316284, "logps/chosen": -361.8999938964844, "logps/rejected": -442.70001220703125, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5701172351837158, "rewards/margins": 5.215624809265137, "rewards/rejected": -6.7890625, "step": 6300 }, { "epoch": 2.3757530120481927, "grad_norm": 11.21123152468063, "learning_rate": 4.061558734939759e-07, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.7601561546325684, "logps/chosen": -378.1000061035156, "logps/rejected": -491.54998779296875, "loss": 0.0449, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.835546851158142, "rewards/margins": 5.256249904632568, "rewards/rejected": -7.090624809265137, "step": 6310 }, { "epoch": 2.3795180722891565, "grad_norm": 7.58157037360832, "learning_rate": 4.052146084337349e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.8101563453674316, "logps/chosen": -340.79998779296875, "logps/rejected": -407.1000061035156, "loss": 0.0457, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.734765648841858, "rewards/margins": 5.2890625, "rewards/rejected": -7.025000095367432, "step": 6320 }, { "epoch": 2.3832831325301207, "grad_norm": 33.684580384517915, "learning_rate": 4.0427334337349395e-07, "logits/chosen": -2.7710938453674316, "logits/rejected": -2.7578125, "logps/chosen": -344.45001220703125, "logps/rejected": -442.29998779296875, "loss": 0.0458, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6204102039337158, "rewards/margins": 5.0703125, "rewards/rejected": -6.690625190734863, "step": 6330 }, { "epoch": 2.3870481927710845, "grad_norm": 13.706945117318048, "learning_rate": 4.0333207831325297e-07, "logits/chosen": -2.671093702316284, "logits/rejected": -2.8187499046325684, "logps/chosen": -353.1499938964844, "logps/rejected": -439.6499938964844, "loss": 0.0477, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.453033447265625, "rewards/margins": 5.079687595367432, "rewards/rejected": -6.529687404632568, "step": 6340 }, { "epoch": 2.3908132530120483, "grad_norm": 45.53500987844781, "learning_rate": 4.0239081325301205e-07, "logits/chosen": -2.7421875, "logits/rejected": -2.815624952316284, "logps/chosen": -402.8999938964844, "logps/rejected": -450.45001220703125, "loss": 0.0554, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.80804443359375, "rewards/margins": 5.160937309265137, "rewards/rejected": -6.973437309265137, "step": 6350 }, { "epoch": 2.394578313253012, "grad_norm": 10.720701007836308, "learning_rate": 4.0144954819277103e-07, "logits/chosen": -2.76171875, "logits/rejected": -2.7679686546325684, "logps/chosen": -315.1000061035156, "logps/rejected": -423.5, "loss": 0.0784, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6506073474884033, "rewards/margins": 5.338281154632568, "rewards/rejected": -6.9921875, "step": 6360 }, { "epoch": 2.398343373493976, "grad_norm": 6.842850440291511, "learning_rate": 4.005082831325301e-07, "logits/chosen": -2.7085938453674316, "logits/rejected": -2.7132811546325684, "logps/chosen": -384.20001220703125, "logps/rejected": -439.95001220703125, "loss": 0.0796, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3371093273162842, "rewards/margins": 5.164843559265137, "rewards/rejected": -6.5, "step": 6370 }, { "epoch": 2.4021084337349397, "grad_norm": 43.518804833142, "learning_rate": 3.9956701807228913e-07, "logits/chosen": -2.739062547683716, "logits/rejected": -2.768749952316284, "logps/chosen": -402.8500061035156, "logps/rejected": -432.29998779296875, "loss": 0.0731, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3000977039337158, "rewards/margins": 4.996874809265137, "rewards/rejected": -6.296875, "step": 6380 }, { "epoch": 2.4058734939759034, "grad_norm": 86.60114859578512, "learning_rate": 3.9862575301204816e-07, "logits/chosen": -2.7835936546325684, "logits/rejected": -2.8570313453674316, "logps/chosen": -381.75, "logps/rejected": -473.0, "loss": 0.0424, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3315918445587158, "rewards/margins": 5.525781154632568, "rewards/rejected": -6.856249809265137, "step": 6390 }, { "epoch": 2.4096385542168672, "grad_norm": 36.12052858091622, "learning_rate": 3.976844879518072e-07, "logits/chosen": -2.700000047683716, "logits/rejected": -2.6859374046325684, "logps/chosen": -323.75, "logps/rejected": -381.45001220703125, "loss": 0.0383, "rewards/accuracies": 1.0, "rewards/chosen": -1.315185546875, "rewards/margins": 5.125, "rewards/rejected": -6.439062595367432, "step": 6400 }, { "epoch": 2.4134036144578315, "grad_norm": 38.10335053746824, "learning_rate": 3.9674322289156627e-07, "logits/chosen": -2.7007813453674316, "logits/rejected": -2.8062500953674316, "logps/chosen": -370.6499938964844, "logps/rejected": -471.70001220703125, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.214440941810608, "rewards/margins": 5.42578125, "rewards/rejected": -6.639062404632568, "step": 6410 }, { "epoch": 2.4171686746987953, "grad_norm": 18.528353290722915, "learning_rate": 3.958019578313253e-07, "logits/chosen": -2.6890625953674316, "logits/rejected": -2.852343797683716, "logps/chosen": -357.95001220703125, "logps/rejected": -402.1000061035156, "loss": 0.0737, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.203039526939392, "rewards/margins": 5.185156345367432, "rewards/rejected": -6.385937690734863, "step": 6420 }, { "epoch": 2.420933734939759, "grad_norm": 18.717735930512806, "learning_rate": 3.948606927710843e-07, "logits/chosen": -2.79296875, "logits/rejected": -2.6953125, "logps/chosen": -377.79998779296875, "logps/rejected": -432.25, "loss": 0.065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.709570288658142, "rewards/margins": 5.003125190734863, "rewards/rejected": -6.714062690734863, "step": 6430 }, { "epoch": 2.424698795180723, "grad_norm": 7.6401484979566625, "learning_rate": 3.9391942771084335e-07, "logits/chosen": -2.74609375, "logits/rejected": -2.69140625, "logps/chosen": -323.70001220703125, "logps/rejected": -431.25, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.38232421875, "rewards/margins": 5.092968940734863, "rewards/rejected": -6.474999904632568, "step": 6440 }, { "epoch": 2.4284638554216866, "grad_norm": 32.331114889352904, "learning_rate": 3.929781626506024e-07, "logits/chosen": -2.7992186546325684, "logits/rejected": -2.8179688453674316, "logps/chosen": -338.0, "logps/rejected": -449.29998779296875, "loss": 0.0429, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4337890148162842, "rewards/margins": 5.236718654632568, "rewards/rejected": -6.673437595367432, "step": 6450 }, { "epoch": 2.4322289156626504, "grad_norm": 14.320570713571819, "learning_rate": 3.920368975903614e-07, "logits/chosen": -2.7359375953674316, "logits/rejected": -2.6664061546325684, "logps/chosen": -358.45001220703125, "logps/rejected": -443.70001220703125, "loss": 0.0615, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4599609375, "rewards/margins": 5.03125, "rewards/rejected": -6.493750095367432, "step": 6460 }, { "epoch": 2.4359939759036147, "grad_norm": 12.815355792494842, "learning_rate": 3.910956325301205e-07, "logits/chosen": -2.715625047683716, "logits/rejected": -2.758593797683716, "logps/chosen": -394.32501220703125, "logps/rejected": -476.79998779296875, "loss": 0.0382, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5187499523162842, "rewards/margins": 5.801562309265137, "rewards/rejected": -7.317187309265137, "step": 6470 }, { "epoch": 2.4397590361445785, "grad_norm": 6.689120119743464, "learning_rate": 3.901543674698795e-07, "logits/chosen": -2.760937452316284, "logits/rejected": -2.8257813453674316, "logps/chosen": -373.20001220703125, "logps/rejected": -413.5, "loss": 0.0459, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4119141101837158, "rewards/margins": 4.981249809265137, "rewards/rejected": -6.395312309265137, "step": 6480 }, { "epoch": 2.4435240963855422, "grad_norm": 8.576213895335233, "learning_rate": 3.892131024096386e-07, "logits/chosen": -2.6976561546325684, "logits/rejected": -2.7289061546325684, "logps/chosen": -371.3500061035156, "logps/rejected": -453.8999938964844, "loss": 0.0455, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9296875, "rewards/margins": 5.344531059265137, "rewards/rejected": -7.276562690734863, "step": 6490 }, { "epoch": 2.447289156626506, "grad_norm": 26.30953633542935, "learning_rate": 3.8827183734939756e-07, "logits/chosen": -2.733593702316284, "logits/rejected": -2.796875, "logps/chosen": -354.29998779296875, "logps/rejected": -425.8500061035156, "loss": 0.049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5154297351837158, "rewards/margins": 5.162499904632568, "rewards/rejected": -6.681250095367432, "step": 6500 }, { "epoch": 2.45105421686747, "grad_norm": 36.90849177236238, "learning_rate": 3.873305722891566e-07, "logits/chosen": -2.734375, "logits/rejected": -2.7203125953674316, "logps/chosen": -355.54998779296875, "logps/rejected": -437.29998779296875, "loss": 0.043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.704248070716858, "rewards/margins": 5.307031154632568, "rewards/rejected": -7.006249904632568, "step": 6510 }, { "epoch": 2.4548192771084336, "grad_norm": 41.441758636945735, "learning_rate": 3.8638930722891567e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -2.7289061546325684, "logps/chosen": -324.45001220703125, "logps/rejected": -434.0, "loss": 0.0926, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.9607422351837158, "rewards/margins": 5.194531440734863, "rewards/rejected": -7.154687404632568, "step": 6520 }, { "epoch": 2.4585843373493974, "grad_norm": 7.244224338947816, "learning_rate": 3.8544804216867464e-07, "logits/chosen": -2.8031249046325684, "logits/rejected": -2.78125, "logps/chosen": -351.3500061035156, "logps/rejected": -405.79998779296875, "loss": 0.0582, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8341796398162842, "rewards/margins": 5.088281154632568, "rewards/rejected": -6.920312404632568, "step": 6530 }, { "epoch": 2.462349397590361, "grad_norm": 31.361736844062516, "learning_rate": 3.845067771084337e-07, "logits/chosen": -2.823437452316284, "logits/rejected": -2.799999952316284, "logps/chosen": -428.3999938964844, "logps/rejected": -489.0, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": -2.0171875953674316, "rewards/margins": 5.102343559265137, "rewards/rejected": -7.120312690734863, "step": 6540 }, { "epoch": 2.4661144578313254, "grad_norm": 8.521187328659481, "learning_rate": 3.8356551204819275e-07, "logits/chosen": -2.8648438453674316, "logits/rejected": -2.785937547683716, "logps/chosen": -389.25, "logps/rejected": -490.5, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": -2.0972657203674316, "rewards/margins": 5.84375, "rewards/rejected": -7.948437690734863, "step": 6550 }, { "epoch": 2.4698795180722892, "grad_norm": 33.157690421734515, "learning_rate": 3.826242469879518e-07, "logits/chosen": -2.9085936546325684, "logits/rejected": -2.866406202316284, "logps/chosen": -346.25, "logps/rejected": -442.6000061035156, "loss": 0.049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8603515625, "rewards/margins": 5.30078125, "rewards/rejected": -7.157812595367432, "step": 6560 }, { "epoch": 2.473644578313253, "grad_norm": 32.10733972584366, "learning_rate": 3.816829819277108e-07, "logits/chosen": -2.7113280296325684, "logits/rejected": -2.7125000953674316, "logps/chosen": -363.3500061035156, "logps/rejected": -428.3500061035156, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0160155296325684, "rewards/margins": 5.255468845367432, "rewards/rejected": -7.270312309265137, "step": 6570 }, { "epoch": 2.477409638554217, "grad_norm": 27.942033187433704, "learning_rate": 3.807417168674699e-07, "logits/chosen": -2.895312547683716, "logits/rejected": -2.8343749046325684, "logps/chosen": -323.29998779296875, "logps/rejected": -396.45001220703125, "loss": 0.0583, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.40234375, "rewards/margins": 4.653906345367432, "rewards/rejected": -6.059374809265137, "step": 6580 }, { "epoch": 2.4811746987951806, "grad_norm": 18.322564522670685, "learning_rate": 3.798004518072289e-07, "logits/chosen": -2.823437452316284, "logits/rejected": -2.720703125, "logps/chosen": -364.70001220703125, "logps/rejected": -444.54998779296875, "loss": 0.0344, "rewards/accuracies": 1.0, "rewards/chosen": -1.428125023841858, "rewards/margins": 5.352343559265137, "rewards/rejected": -6.7890625, "step": 6590 }, { "epoch": 2.4849397590361444, "grad_norm": 14.36471981548342, "learning_rate": 3.7885918674698793e-07, "logits/chosen": -2.8765625953674316, "logits/rejected": -2.8304686546325684, "logps/chosen": -370.3999938964844, "logps/rejected": -430.8500061035156, "loss": 0.0378, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1921875476837158, "rewards/margins": 5.424218654632568, "rewards/rejected": -6.618750095367432, "step": 6600 }, { "epoch": 2.4887048192771086, "grad_norm": 8.578930327971731, "learning_rate": 3.7791792168674696e-07, "logits/chosen": -2.731250047683716, "logits/rejected": -2.750781297683716, "logps/chosen": -366.6499938964844, "logps/rejected": -440.29998779296875, "loss": 0.0347, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.693945288658142, "rewards/margins": 5.5703125, "rewards/rejected": -7.267187595367432, "step": 6610 }, { "epoch": 2.4924698795180724, "grad_norm": 13.128307853448632, "learning_rate": 3.7697665662650604e-07, "logits/chosen": -2.8414063453674316, "logits/rejected": -2.788281202316284, "logps/chosen": -364.25, "logps/rejected": -480.6499938964844, "loss": 0.0412, "rewards/accuracies": 1.0, "rewards/chosen": -1.9548828601837158, "rewards/margins": 5.509375095367432, "rewards/rejected": -7.473437309265137, "step": 6620 }, { "epoch": 2.496234939759036, "grad_norm": 49.620232666930946, "learning_rate": 3.7603539156626506e-07, "logits/chosen": -2.926562547683716, "logits/rejected": -2.899218797683716, "logps/chosen": -361.79998779296875, "logps/rejected": -419.3500061035156, "loss": 0.0585, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.783789038658142, "rewards/margins": 5.01953125, "rewards/rejected": -6.803124904632568, "step": 6630 }, { "epoch": 2.5, "grad_norm": 55.702181095016705, "learning_rate": 3.750941265060241e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.8140625953674316, "logps/chosen": -339.42498779296875, "logps/rejected": -392.8999938964844, "loss": 0.0517, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1416137218475342, "rewards/margins": 5.037499904632568, "rewards/rejected": -6.181250095367432, "step": 6640 }, { "epoch": 2.503765060240964, "grad_norm": 6.851421300156694, "learning_rate": 3.741528614457831e-07, "logits/chosen": -2.785937547683716, "logits/rejected": -2.8671875, "logps/chosen": -347.1499938964844, "logps/rejected": -443.0, "loss": 0.045, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0712769031524658, "rewards/margins": 5.393750190734863, "rewards/rejected": -6.469531059265137, "step": 6650 }, { "epoch": 2.5075301204819276, "grad_norm": 38.110957407609945, "learning_rate": 3.732115963855422e-07, "logits/chosen": -2.8578124046325684, "logits/rejected": -2.867968797683716, "logps/chosen": -348.29998779296875, "logps/rejected": -414.04998779296875, "loss": 0.0546, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.166894555091858, "rewards/margins": 5.046093940734863, "rewards/rejected": -6.220312595367432, "step": 6660 }, { "epoch": 2.5112951807228914, "grad_norm": 8.735146762061799, "learning_rate": 3.7227033132530117e-07, "logits/chosen": -2.7710938453674316, "logits/rejected": -2.7046875953674316, "logps/chosen": -364.1499938964844, "logps/rejected": -477.29998779296875, "loss": 0.0553, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.560937523841858, "rewards/margins": 5.415625095367432, "rewards/rejected": -6.979687690734863, "step": 6670 }, { "epoch": 2.515060240963855, "grad_norm": 52.67955229626992, "learning_rate": 3.713290662650602e-07, "logits/chosen": -2.729687452316284, "logits/rejected": -2.703906297683716, "logps/chosen": -373.6000061035156, "logps/rejected": -447.6000061035156, "loss": 0.0485, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.586328148841858, "rewards/margins": 5.415625095367432, "rewards/rejected": -7.003125190734863, "step": 6680 }, { "epoch": 2.5188253012048194, "grad_norm": 23.719413415645217, "learning_rate": 3.703878012048193e-07, "logits/chosen": -2.77734375, "logits/rejected": -2.90234375, "logps/chosen": -385.54998779296875, "logps/rejected": -414.3999938964844, "loss": 0.0673, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.603216528892517, "rewards/margins": 5.114843845367432, "rewards/rejected": -6.720312595367432, "step": 6690 }, { "epoch": 2.522590361445783, "grad_norm": 58.1090253719701, "learning_rate": 3.6944653614457825e-07, "logits/chosen": -2.8695311546325684, "logits/rejected": -2.80859375, "logps/chosen": -320.875, "logps/rejected": -420.5, "loss": 0.0667, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8427734375, "rewards/margins": 5.496874809265137, "rewards/rejected": -7.34375, "step": 6700 }, { "epoch": 2.526355421686747, "grad_norm": 7.040296305824664, "learning_rate": 3.6850527108433733e-07, "logits/chosen": -2.860156297683716, "logits/rejected": -2.856250047683716, "logps/chosen": -381.6499938964844, "logps/rejected": -451.0, "loss": 0.0395, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9763672351837158, "rewards/margins": 5.385937690734863, "rewards/rejected": -7.356249809265137, "step": 6710 }, { "epoch": 2.5301204819277108, "grad_norm": 48.01701218503371, "learning_rate": 3.6756400602409636e-07, "logits/chosen": -2.78515625, "logits/rejected": -2.8203125, "logps/chosen": -375.0, "logps/rejected": -448.6000061035156, "loss": 0.0511, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7593567371368408, "rewards/margins": 5.064843654632568, "rewards/rejected": -6.823437690734863, "step": 6720 }, { "epoch": 2.5338855421686746, "grad_norm": 17.624353473405428, "learning_rate": 3.6662274096385544e-07, "logits/chosen": -2.7289061546325684, "logits/rejected": -2.688281297683716, "logps/chosen": -362.04998779296875, "logps/rejected": -452.0, "loss": 0.0517, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6358642578125, "rewards/margins": 5.290625095367432, "rewards/rejected": -6.928124904632568, "step": 6730 }, { "epoch": 2.537650602409639, "grad_norm": 34.19821676250007, "learning_rate": 3.656814759036144e-07, "logits/chosen": -2.828906297683716, "logits/rejected": -2.7437500953674316, "logps/chosen": -336.0, "logps/rejected": -432.70001220703125, "loss": 0.0805, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.966406226158142, "rewards/margins": 4.853906154632568, "rewards/rejected": -6.814062595367432, "step": 6740 }, { "epoch": 2.5414156626506026, "grad_norm": 20.28198212600134, "learning_rate": 3.647402108433735e-07, "logits/chosen": -2.71484375, "logits/rejected": -2.776562452316284, "logps/chosen": -349.75, "logps/rejected": -418.3500061035156, "loss": 0.0463, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6697266101837158, "rewards/margins": 5.1328125, "rewards/rejected": -6.809374809265137, "step": 6750 }, { "epoch": 2.5451807228915664, "grad_norm": 50.43105001342857, "learning_rate": 3.637989457831325e-07, "logits/chosen": -2.7359375953674316, "logits/rejected": -2.7874999046325684, "logps/chosen": -367.25, "logps/rejected": -438.0, "loss": 0.0561, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.672460913658142, "rewards/margins": 5.260156154632568, "rewards/rejected": -6.935937404632568, "step": 6760 }, { "epoch": 2.54894578313253, "grad_norm": 22.916827552199987, "learning_rate": 3.6285768072289154e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -2.8023438453674316, "logps/chosen": -335.8999938964844, "logps/rejected": -398.29998779296875, "loss": 0.0566, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.256616234779358, "rewards/margins": 4.953125, "rewards/rejected": -6.214062690734863, "step": 6770 }, { "epoch": 2.552710843373494, "grad_norm": 34.52138919015937, "learning_rate": 3.6191641566265057e-07, "logits/chosen": -2.8203125, "logits/rejected": -2.8046875, "logps/chosen": -383.3999938964844, "logps/rejected": -481.0, "loss": 0.0449, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.373998999595642, "rewards/margins": 5.079687595367432, "rewards/rejected": -6.453125, "step": 6780 }, { "epoch": 2.5564759036144578, "grad_norm": 7.360592360606906, "learning_rate": 3.6097515060240965e-07, "logits/chosen": -2.7515625953674316, "logits/rejected": -2.774218797683716, "logps/chosen": -365.6000061035156, "logps/rejected": -471.1000061035156, "loss": 0.0519, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.376953125, "rewards/margins": 5.296875, "rewards/rejected": -6.676562309265137, "step": 6790 }, { "epoch": 2.5602409638554215, "grad_norm": 9.991383106383307, "learning_rate": 3.600338855421687e-07, "logits/chosen": -2.7367186546325684, "logits/rejected": -2.7750000953674316, "logps/chosen": -343.45001220703125, "logps/rejected": -438.3500061035156, "loss": 0.0599, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6611328125, "rewards/margins": 5.361718654632568, "rewards/rejected": -7.020312309265137, "step": 6800 }, { "epoch": 2.5640060240963853, "grad_norm": 25.46053965272162, "learning_rate": 3.590926204819277e-07, "logits/chosen": -2.811718702316284, "logits/rejected": -2.7671875953674316, "logps/chosen": -362.7250061035156, "logps/rejected": -429.8999938964844, "loss": 0.0515, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0501952171325684, "rewards/margins": 5.06640625, "rewards/rejected": -7.1171875, "step": 6810 }, { "epoch": 2.567771084337349, "grad_norm": 5.035519268929968, "learning_rate": 3.5815135542168673e-07, "logits/chosen": -2.852343797683716, "logits/rejected": -2.8226561546325684, "logps/chosen": -353.1499938964844, "logps/rejected": -413.95001220703125, "loss": 0.0729, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.3842041492462158, "rewards/margins": 4.824999809265137, "rewards/rejected": -6.2109375, "step": 6820 }, { "epoch": 2.5715361445783134, "grad_norm": 17.437589145065168, "learning_rate": 3.5721009036144576e-07, "logits/chosen": -2.762890577316284, "logits/rejected": -2.8304686546325684, "logps/chosen": -329.1499938964844, "logps/rejected": -409.8999938964844, "loss": 0.0371, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.745703101158142, "rewards/margins": 5.318749904632568, "rewards/rejected": -7.065625190734863, "step": 6830 }, { "epoch": 2.575301204819277, "grad_norm": 12.19348066723984, "learning_rate": 3.562688253012048e-07, "logits/chosen": -2.8882813453674316, "logits/rejected": -2.901562452316284, "logps/chosen": -367.1000061035156, "logps/rejected": -429.04998779296875, "loss": 0.0841, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7878906726837158, "rewards/margins": 5.143750190734863, "rewards/rejected": -6.928124904632568, "step": 6840 }, { "epoch": 2.579066265060241, "grad_norm": 29.464432198035777, "learning_rate": 3.553275602409638e-07, "logits/chosen": -2.859375, "logits/rejected": -2.94140625, "logps/chosen": -401.29998779296875, "logps/rejected": -469.70001220703125, "loss": 0.0333, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.824609398841858, "rewards/margins": 5.6953125, "rewards/rejected": -7.515625, "step": 6850 }, { "epoch": 2.5828313253012047, "grad_norm": 29.533462990544248, "learning_rate": 3.543862951807229e-07, "logits/chosen": -2.9437499046325684, "logits/rejected": -2.938281297683716, "logps/chosen": -331.95001220703125, "logps/rejected": -408.6499938964844, "loss": 0.0499, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3855712413787842, "rewards/margins": 5.064843654632568, "rewards/rejected": -6.4453125, "step": 6860 }, { "epoch": 2.5865963855421685, "grad_norm": 82.99850004936046, "learning_rate": 3.534450301204819e-07, "logits/chosen": -2.8453125953674316, "logits/rejected": -2.729687452316284, "logps/chosen": -335.6000061035156, "logps/rejected": -459.75, "loss": 0.0693, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9640624523162842, "rewards/margins": 5.117968559265137, "rewards/rejected": -7.084374904632568, "step": 6870 }, { "epoch": 2.5903614457831328, "grad_norm": 95.74863922733556, "learning_rate": 3.5250376506024094e-07, "logits/chosen": -2.7945313453674316, "logits/rejected": -2.8375000953674316, "logps/chosen": -345.8500061035156, "logps/rejected": -405.6000061035156, "loss": 0.0641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7121093273162842, "rewards/margins": 5.30078125, "rewards/rejected": -7.006249904632568, "step": 6880 }, { "epoch": 2.5941265060240966, "grad_norm": 9.62478006761384, "learning_rate": 3.5156249999999997e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.8382811546325684, "logps/chosen": -373.20001220703125, "logps/rejected": -478.25, "loss": 0.0605, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7108886241912842, "rewards/margins": 5.345312595367432, "rewards/rejected": -7.0546875, "step": 6890 }, { "epoch": 2.5978915662650603, "grad_norm": 56.95788557591169, "learning_rate": 3.5062123493975905e-07, "logits/chosen": -2.6871094703674316, "logits/rejected": -2.8304686546325684, "logps/chosen": -315.29998779296875, "logps/rejected": -398.0, "loss": 0.0706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.285742163658142, "rewards/margins": 5.243750095367432, "rewards/rejected": -6.53125, "step": 6900 }, { "epoch": 2.601656626506024, "grad_norm": 15.983506105935584, "learning_rate": 3.49679969879518e-07, "logits/chosen": -2.7945313453674316, "logits/rejected": -2.754687547683716, "logps/chosen": -349.3999938964844, "logps/rejected": -450.70001220703125, "loss": 0.0492, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.43505859375, "rewards/margins": 5.44921875, "rewards/rejected": -6.885937690734863, "step": 6910 }, { "epoch": 2.605421686746988, "grad_norm": 27.153390062082416, "learning_rate": 3.487387048192771e-07, "logits/chosen": -2.674999952316284, "logits/rejected": -2.684375047683716, "logps/chosen": -363.25, "logps/rejected": -432.1000061035156, "loss": 0.0483, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.649023413658142, "rewards/margins": 5.078125, "rewards/rejected": -6.729687690734863, "step": 6920 }, { "epoch": 2.6091867469879517, "grad_norm": 31.99767629662651, "learning_rate": 3.4779743975903613e-07, "logits/chosen": -2.725781202316284, "logits/rejected": -2.714062452316284, "logps/chosen": -337.45001220703125, "logps/rejected": -412.29998779296875, "loss": 0.0609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.493749976158142, "rewards/margins": 5.040625095367432, "rewards/rejected": -6.540625095367432, "step": 6930 }, { "epoch": 2.6129518072289155, "grad_norm": 49.929477798426525, "learning_rate": 3.468561746987952e-07, "logits/chosen": -2.836718797683716, "logits/rejected": -2.819531202316284, "logps/chosen": -364.1499938964844, "logps/rejected": -455.6000061035156, "loss": 0.0484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.800195336341858, "rewards/margins": 5.178906440734863, "rewards/rejected": -6.9765625, "step": 6940 }, { "epoch": 2.6167168674698793, "grad_norm": 30.692890303366863, "learning_rate": 3.459149096385542e-07, "logits/chosen": -2.8382811546325684, "logits/rejected": -2.8671875, "logps/chosen": -391.1000061035156, "logps/rejected": -474.70001220703125, "loss": 0.0325, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.908203125, "rewards/margins": 5.721875190734863, "rewards/rejected": -7.631249904632568, "step": 6950 }, { "epoch": 2.6204819277108435, "grad_norm": 7.525740742437691, "learning_rate": 3.4497364457831326e-07, "logits/chosen": -2.903125047683716, "logits/rejected": -2.850781202316284, "logps/chosen": -352.54998779296875, "logps/rejected": -455.3999938964844, "loss": 0.0432, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.814062476158142, "rewards/margins": 5.256249904632568, "rewards/rejected": -7.074999809265137, "step": 6960 }, { "epoch": 2.6242469879518073, "grad_norm": 9.884666686821117, "learning_rate": 3.440323795180723e-07, "logits/chosen": -2.6429686546325684, "logits/rejected": -2.723437547683716, "logps/chosen": -368.3999938964844, "logps/rejected": -445.3999938964844, "loss": 0.0641, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.911718726158142, "rewards/margins": 5.25390625, "rewards/rejected": -7.165625095367432, "step": 6970 }, { "epoch": 2.628012048192771, "grad_norm": 22.476980812978947, "learning_rate": 3.4309111445783126e-07, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.799999952316284, "logps/chosen": -376.6499938964844, "logps/rejected": -492.0, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.38232421875, "rewards/margins": 5.399218559265137, "rewards/rejected": -6.778124809265137, "step": 6980 }, { "epoch": 2.631777108433735, "grad_norm": 11.467499657462849, "learning_rate": 3.4214984939759034e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -2.8046875, "logps/chosen": -346.3999938964844, "logps/rejected": -422.6000061035156, "loss": 0.0499, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.24267578125, "rewards/margins": 5.211718559265137, "rewards/rejected": -6.45703125, "step": 6990 }, { "epoch": 2.6355421686746987, "grad_norm": 28.405021635971075, "learning_rate": 3.4120858433734937e-07, "logits/chosen": -2.745312452316284, "logits/rejected": -2.832812547683716, "logps/chosen": -388.04998779296875, "logps/rejected": -443.3999938964844, "loss": 0.0658, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.330847144126892, "rewards/margins": 5.091406345367432, "rewards/rejected": -6.42578125, "step": 7000 }, { "epoch": 2.6393072289156625, "grad_norm": 5.878143449557956, "learning_rate": 3.4026731927710845e-07, "logits/chosen": -2.819531202316284, "logits/rejected": -2.764843702316284, "logps/chosen": -329.8999938964844, "logps/rejected": -430.3999938964844, "loss": 0.0557, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6350219249725342, "rewards/margins": 5.143750190734863, "rewards/rejected": -6.776562690734863, "step": 7010 }, { "epoch": 2.6430722891566267, "grad_norm": 24.920063056118867, "learning_rate": 3.393260542168674e-07, "logits/chosen": -2.770312547683716, "logits/rejected": -2.828906297683716, "logps/chosen": -344.1000061035156, "logps/rejected": -429.70001220703125, "loss": 0.0857, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7355468273162842, "rewards/margins": 4.969531059265137, "rewards/rejected": -6.707812309265137, "step": 7020 }, { "epoch": 2.6468373493975905, "grad_norm": 55.90819270136255, "learning_rate": 3.383847891566265e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.7796874046325684, "logps/chosen": -345.6000061035156, "logps/rejected": -420.54998779296875, "loss": 0.0435, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5284423828125, "rewards/margins": 5.546875, "rewards/rejected": -7.071875095367432, "step": 7030 }, { "epoch": 2.6506024096385543, "grad_norm": 18.30837194852063, "learning_rate": 3.3744352409638553e-07, "logits/chosen": -2.8062500953674316, "logits/rejected": -2.825000047683716, "logps/chosen": -329.8500061035156, "logps/rejected": -404.6000061035156, "loss": 0.0588, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.736718773841858, "rewards/margins": 5.178124904632568, "rewards/rejected": -6.907812595367432, "step": 7040 }, { "epoch": 2.654367469879518, "grad_norm": 76.79157431922147, "learning_rate": 3.3650225903614455e-07, "logits/chosen": -2.842968702316284, "logits/rejected": -2.7710938453674316, "logps/chosen": -347.1000061035156, "logps/rejected": -434.6000061035156, "loss": 0.0825, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7978515625, "rewards/margins": 4.977343559265137, "rewards/rejected": -6.771874904632568, "step": 7050 }, { "epoch": 2.658132530120482, "grad_norm": 49.307102378807585, "learning_rate": 3.355609939759036e-07, "logits/chosen": -2.9320311546325684, "logits/rejected": -2.854687452316284, "logps/chosen": -321.3500061035156, "logps/rejected": -428.6499938964844, "loss": 0.1032, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4521484375, "rewards/margins": 5.09375, "rewards/rejected": -6.546093940734863, "step": 7060 }, { "epoch": 2.6618975903614457, "grad_norm": 24.605424127729595, "learning_rate": 3.3461972891566266e-07, "logits/chosen": -2.7484374046325684, "logits/rejected": -2.714062452316284, "logps/chosen": -354.54998779296875, "logps/rejected": -454.1499938964844, "loss": 0.0508, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8693358898162842, "rewards/margins": 5.190625190734863, "rewards/rejected": -7.059374809265137, "step": 7070 }, { "epoch": 2.6656626506024095, "grad_norm": 16.811750694064024, "learning_rate": 3.336784638554217e-07, "logits/chosen": -2.7445311546325684, "logits/rejected": -2.745312452316284, "logps/chosen": -334.6000061035156, "logps/rejected": -444.0, "loss": 0.048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.984960913658142, "rewards/margins": 5.04296875, "rewards/rejected": -7.028124809265137, "step": 7080 }, { "epoch": 2.6694277108433733, "grad_norm": 47.4786827852664, "learning_rate": 3.327371987951807e-07, "logits/chosen": -2.899218797683716, "logits/rejected": -2.9000000953674316, "logps/chosen": -325.04998779296875, "logps/rejected": -408.8999938964844, "loss": 0.0568, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5869140625, "rewards/margins": 5.234375, "rewards/rejected": -6.824999809265137, "step": 7090 }, { "epoch": 2.6731927710843375, "grad_norm": 7.397734027571986, "learning_rate": 3.3179593373493974e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.809375047683716, "logps/chosen": -334.25, "logps/rejected": -434.1000061035156, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -1.7527344226837158, "rewards/margins": 5.140625, "rewards/rejected": -6.8984375, "step": 7100 }, { "epoch": 2.6769578313253013, "grad_norm": 38.811458409028376, "learning_rate": 3.308546686746988e-07, "logits/chosen": -2.7886719703674316, "logits/rejected": -2.8773436546325684, "logps/chosen": -347.3999938964844, "logps/rejected": -422.6000061035156, "loss": 0.0663, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9738280773162842, "rewards/margins": 4.882031440734863, "rewards/rejected": -6.856249809265137, "step": 7110 }, { "epoch": 2.680722891566265, "grad_norm": 38.86737646279825, "learning_rate": 3.299134036144578e-07, "logits/chosen": -2.8335938453674316, "logits/rejected": -2.831249952316284, "logps/chosen": -346.6499938964844, "logps/rejected": -459.8500061035156, "loss": 0.0446, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.752343773841858, "rewards/margins": 5.314843654632568, "rewards/rejected": -7.067187309265137, "step": 7120 }, { "epoch": 2.684487951807229, "grad_norm": 25.444716131867875, "learning_rate": 3.289721385542169e-07, "logits/chosen": -2.918750047683716, "logits/rejected": -2.9039063453674316, "logps/chosen": -373.0, "logps/rejected": -444.1000061035156, "loss": 0.0299, "rewards/accuracies": 1.0, "rewards/chosen": -1.9210937023162842, "rewards/margins": 5.6171875, "rewards/rejected": -7.5390625, "step": 7130 }, { "epoch": 2.6882530120481927, "grad_norm": 14.119119386243995, "learning_rate": 3.280308734939759e-07, "logits/chosen": -2.8148436546325684, "logits/rejected": -2.73046875, "logps/chosen": -415.25, "logps/rejected": -483.29998779296875, "loss": 0.0421, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.85546875, "rewards/margins": 5.292187690734863, "rewards/rejected": -7.146874904632568, "step": 7140 }, { "epoch": 2.6920180722891565, "grad_norm": 12.225395040195759, "learning_rate": 3.270896084337349e-07, "logits/chosen": -2.7734375, "logits/rejected": -2.772656202316284, "logps/chosen": -372.3999938964844, "logps/rejected": -465.70001220703125, "loss": 0.0655, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.985009789466858, "rewards/margins": 5.557031154632568, "rewards/rejected": -7.540625095367432, "step": 7150 }, { "epoch": 2.6957831325301207, "grad_norm": 34.963962822339376, "learning_rate": 3.2614834337349395e-07, "logits/chosen": -2.797656297683716, "logits/rejected": -2.7398438453674316, "logps/chosen": -384.0, "logps/rejected": -456.95001220703125, "loss": 0.0489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.728124976158142, "rewards/margins": 5.194531440734863, "rewards/rejected": -6.920312404632568, "step": 7160 }, { "epoch": 2.6995481927710845, "grad_norm": 10.406097304704234, "learning_rate": 3.25207078313253e-07, "logits/chosen": -2.9046874046325684, "logits/rejected": -2.817187547683716, "logps/chosen": -326.95001220703125, "logps/rejected": -428.5, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": -1.5744140148162842, "rewards/margins": 5.314843654632568, "rewards/rejected": -6.889062404632568, "step": 7170 }, { "epoch": 2.7033132530120483, "grad_norm": 35.402191590555375, "learning_rate": 3.2426581325301206e-07, "logits/chosen": -2.856250047683716, "logits/rejected": -2.8257813453674316, "logps/chosen": -370.79998779296875, "logps/rejected": -426.3999938964844, "loss": 0.0666, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1058592796325684, "rewards/margins": 4.95703125, "rewards/rejected": -7.056250095367432, "step": 7180 }, { "epoch": 2.707078313253012, "grad_norm": 17.78800910713803, "learning_rate": 3.2332454819277103e-07, "logits/chosen": -2.8296875953674316, "logits/rejected": -2.8609375953674316, "logps/chosen": -372.25, "logps/rejected": -440.04998779296875, "loss": 0.0541, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.321093797683716, "rewards/margins": 5.132031440734863, "rewards/rejected": -7.451562404632568, "step": 7190 }, { "epoch": 2.710843373493976, "grad_norm": 55.23221274232277, "learning_rate": 3.223832831325301e-07, "logits/chosen": -2.770312547683716, "logits/rejected": -2.835156202316284, "logps/chosen": -350.1499938964844, "logps/rejected": -423.8500061035156, "loss": 0.0626, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.658349633216858, "rewards/margins": 5.267187595367432, "rewards/rejected": -6.926562309265137, "step": 7200 }, { "epoch": 2.7146084337349397, "grad_norm": 14.201287965322058, "learning_rate": 3.2144201807228914e-07, "logits/chosen": -2.8335938453674316, "logits/rejected": -2.9375, "logps/chosen": -330.1499938964844, "logps/rejected": -401.29998779296875, "loss": 0.0647, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.720703125, "rewards/margins": 4.921875, "rewards/rejected": -6.643750190734863, "step": 7210 }, { "epoch": 2.7183734939759034, "grad_norm": 15.667719696198452, "learning_rate": 3.2050075301204817e-07, "logits/chosen": -2.887500047683716, "logits/rejected": -2.836718797683716, "logps/chosen": -389.04998779296875, "logps/rejected": -519.5, "loss": 0.0455, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5712890625, "rewards/margins": 5.292187690734863, "rewards/rejected": -6.860937595367432, "step": 7220 }, { "epoch": 2.7221385542168672, "grad_norm": 10.346963491862478, "learning_rate": 3.195594879518072e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -2.796875, "logps/chosen": -324.2250061035156, "logps/rejected": -411.25, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.498754858970642, "rewards/margins": 5.360156059265137, "rewards/rejected": -6.854687690734863, "step": 7230 }, { "epoch": 2.7259036144578315, "grad_norm": 10.434219389039715, "learning_rate": 3.1861822289156627e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.8882813453674316, "logps/chosen": -301.6000061035156, "logps/rejected": -381.79998779296875, "loss": 0.0408, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.288964867591858, "rewards/margins": 5.162499904632568, "rewards/rejected": -6.456250190734863, "step": 7240 }, { "epoch": 2.7296686746987953, "grad_norm": 60.420395833531515, "learning_rate": 3.176769578313253e-07, "logits/chosen": -2.8257813453674316, "logits/rejected": -2.825000047683716, "logps/chosen": -343.20001220703125, "logps/rejected": -434.0, "loss": 0.0551, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.724609375, "rewards/margins": 5.620312690734863, "rewards/rejected": -7.345312595367432, "step": 7250 }, { "epoch": 2.733433734939759, "grad_norm": 23.664110803868287, "learning_rate": 3.167356927710843e-07, "logits/chosen": -2.8296875953674316, "logits/rejected": -2.8140625953674316, "logps/chosen": -334.6000061035156, "logps/rejected": -436.3500061035156, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5060546398162842, "rewards/margins": 5.427343845367432, "rewards/rejected": -6.935937404632568, "step": 7260 }, { "epoch": 2.737198795180723, "grad_norm": 9.052112946562985, "learning_rate": 3.1579442771084335e-07, "logits/chosen": -2.7578125, "logits/rejected": -2.720703125, "logps/chosen": -310.6499938964844, "logps/rejected": -443.3999938964844, "loss": 0.0442, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.534570336341858, "rewards/margins": 5.682812690734863, "rewards/rejected": -7.217187404632568, "step": 7270 }, { "epoch": 2.7409638554216866, "grad_norm": 43.838561919609475, "learning_rate": 3.1485316265060243e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.792187452316284, "logps/chosen": -365.8500061035156, "logps/rejected": -441.0, "loss": 0.0718, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2698485851287842, "rewards/margins": 5.010937690734863, "rewards/rejected": -6.28125, "step": 7280 }, { "epoch": 2.744728915662651, "grad_norm": 19.125087485254486, "learning_rate": 3.139118975903614e-07, "logits/chosen": -2.676562547683716, "logits/rejected": -2.8031249046325684, "logps/chosen": -352.0, "logps/rejected": -435.5, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": -1.8642578125, "rewards/margins": 5.309374809265137, "rewards/rejected": -7.178124904632568, "step": 7290 }, { "epoch": 2.7484939759036147, "grad_norm": 7.107818389788362, "learning_rate": 3.129706325301205e-07, "logits/chosen": -2.686718702316284, "logits/rejected": -2.765625, "logps/chosen": -323.8999938964844, "logps/rejected": -449.95001220703125, "loss": 0.0422, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.858496069908142, "rewards/margins": 5.901562690734863, "rewards/rejected": -7.7578125, "step": 7300 }, { "epoch": 2.7522590361445785, "grad_norm": 23.501054975800784, "learning_rate": 3.120293674698795e-07, "logits/chosen": -2.835156202316284, "logits/rejected": -2.8515625, "logps/chosen": -339.0, "logps/rejected": -398.79998779296875, "loss": 0.0578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.832617163658142, "rewards/margins": 5.235937595367432, "rewards/rejected": -7.074999809265137, "step": 7310 }, { "epoch": 2.7560240963855422, "grad_norm": 41.79540601905452, "learning_rate": 3.1108810240963854e-07, "logits/chosen": -2.787890672683716, "logits/rejected": -2.8335938453674316, "logps/chosen": -370.8999938964844, "logps/rejected": -459.6000061035156, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -1.6438477039337158, "rewards/margins": 5.67578125, "rewards/rejected": -7.3203125, "step": 7320 }, { "epoch": 2.759789156626506, "grad_norm": 18.387954804045656, "learning_rate": 3.1014683734939757e-07, "logits/chosen": -2.7210936546325684, "logits/rejected": -2.827343702316284, "logps/chosen": -387.5, "logps/rejected": -413.1000061035156, "loss": 0.0665, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.4072265625, "rewards/margins": 4.878125190734863, "rewards/rejected": -6.282812595367432, "step": 7330 }, { "epoch": 2.76355421686747, "grad_norm": 9.863631239526546, "learning_rate": 3.092055722891566e-07, "logits/chosen": -2.848437547683716, "logits/rejected": -2.778125047683716, "logps/chosen": -325.29998779296875, "logps/rejected": -400.8500061035156, "loss": 0.0515, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4744141101837158, "rewards/margins": 5.006249904632568, "rewards/rejected": -6.4765625, "step": 7340 }, { "epoch": 2.7673192771084336, "grad_norm": 43.82323168636366, "learning_rate": 3.0826430722891567e-07, "logits/chosen": -2.859375, "logits/rejected": -2.782031297683716, "logps/chosen": -385.20001220703125, "logps/rejected": -458.8500061035156, "loss": 0.0812, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.690039038658142, "rewards/margins": 4.854687690734863, "rewards/rejected": -6.548437595367432, "step": 7350 }, { "epoch": 2.7710843373493974, "grad_norm": 15.474192456106875, "learning_rate": 3.0732304216867465e-07, "logits/chosen": -2.792187452316284, "logits/rejected": -2.8125, "logps/chosen": -379.29998779296875, "logps/rejected": -477.0, "loss": 0.0753, "rewards/accuracies": 0.96875, "rewards/chosen": -1.698828101158142, "rewards/margins": 5.438281059265137, "rewards/rejected": -7.143750190734863, "step": 7360 }, { "epoch": 2.774849397590361, "grad_norm": 18.349467754603744, "learning_rate": 3.063817771084337e-07, "logits/chosen": -2.7750000953674316, "logits/rejected": -2.842968702316284, "logps/chosen": -330.3999938964844, "logps/rejected": -425.20001220703125, "loss": 0.0509, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3374207019805908, "rewards/margins": 5.371874809265137, "rewards/rejected": -6.7109375, "step": 7370 }, { "epoch": 2.7786144578313254, "grad_norm": 37.69807183401939, "learning_rate": 3.0544051204819275e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.7328124046325684, "logps/chosen": -337.875, "logps/rejected": -439.3500061035156, "loss": 0.0469, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6494140625, "rewards/margins": 5.371874809265137, "rewards/rejected": -7.021874904632568, "step": 7380 }, { "epoch": 2.7823795180722892, "grad_norm": 17.42664011185642, "learning_rate": 3.0449924698795183e-07, "logits/chosen": -2.796093702316284, "logits/rejected": -2.8414063453674316, "logps/chosen": -366.6000061035156, "logps/rejected": -431.29998779296875, "loss": 0.0513, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.83740234375, "rewards/margins": 5.360156059265137, "rewards/rejected": -7.193749904632568, "step": 7390 }, { "epoch": 2.786144578313253, "grad_norm": 16.724056117311164, "learning_rate": 3.035579819277108e-07, "logits/chosen": -2.7562499046325684, "logits/rejected": -2.819531202316284, "logps/chosen": -364.79998779296875, "logps/rejected": -433.25, "loss": 0.0347, "rewards/accuracies": 1.0, "rewards/chosen": -1.804296851158142, "rewards/margins": 5.1484375, "rewards/rejected": -6.949999809265137, "step": 7400 }, { "epoch": 2.789909638554217, "grad_norm": 64.16234694699402, "learning_rate": 3.026167168674699e-07, "logits/chosen": -2.8046875, "logits/rejected": -2.828906297683716, "logps/chosen": -391.8999938964844, "logps/rejected": -464.79998779296875, "loss": 0.0609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.096874952316284, "rewards/margins": 5.498437404632568, "rewards/rejected": -7.590624809265137, "step": 7410 }, { "epoch": 2.7936746987951806, "grad_norm": 34.723130403070655, "learning_rate": 3.016754518072289e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.7906250953674316, "logps/chosen": -362.79998779296875, "logps/rejected": -469.5, "loss": 0.0763, "rewards/accuracies": 0.96875, "rewards/chosen": -2.2564454078674316, "rewards/margins": 5.285937309265137, "rewards/rejected": -7.545312404632568, "step": 7420 }, { "epoch": 2.797439759036145, "grad_norm": 20.142430954360677, "learning_rate": 3.0073418674698794e-07, "logits/chosen": -2.796875, "logits/rejected": -2.7671875953674316, "logps/chosen": -354.95001220703125, "logps/rejected": -450.29998779296875, "loss": 0.0368, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0899658203125, "rewards/margins": 5.667187690734863, "rewards/rejected": -7.762499809265137, "step": 7430 }, { "epoch": 2.8012048192771086, "grad_norm": 21.15521070486241, "learning_rate": 2.9979292168674696e-07, "logits/chosen": -2.850781202316284, "logits/rejected": -3.026562452316284, "logps/chosen": -381.75, "logps/rejected": -432.6000061035156, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1993165016174316, "rewards/margins": 5.08203125, "rewards/rejected": -7.284375190734863, "step": 7440 }, { "epoch": 2.8049698795180724, "grad_norm": 15.403190689922958, "learning_rate": 2.9885165662650604e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -2.77734375, "logps/chosen": -358.6000061035156, "logps/rejected": -456.1000061035156, "loss": 0.0695, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.7328124046325684, "rewards/margins": 5.413281440734863, "rewards/rejected": -8.149999618530273, "step": 7450 }, { "epoch": 2.808734939759036, "grad_norm": 22.724436080242967, "learning_rate": 2.9791039156626507e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -2.8046875, "logps/chosen": -385.25, "logps/rejected": -465.1499938964844, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5140624046325684, "rewards/margins": 5.327343940734863, "rewards/rejected": -7.840624809265137, "step": 7460 }, { "epoch": 2.8125, "grad_norm": 43.37998313852645, "learning_rate": 2.9696912650602404e-07, "logits/chosen": -2.754687547683716, "logits/rejected": -2.7734375, "logps/chosen": -366.0, "logps/rejected": -427.70001220703125, "loss": 0.0482, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.598828077316284, "rewards/margins": 5.960156440734863, "rewards/rejected": -8.564062118530273, "step": 7470 }, { "epoch": 2.816265060240964, "grad_norm": 38.85002522051201, "learning_rate": 2.960278614457831e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.859375, "logps/chosen": -385.0, "logps/rejected": -431.1000061035156, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -2.4248046875, "rewards/margins": 4.839062690734863, "rewards/rejected": -7.264062404632568, "step": 7480 }, { "epoch": 2.8200301204819276, "grad_norm": 27.113728190697156, "learning_rate": 2.9508659638554215e-07, "logits/chosen": -2.8499999046325684, "logits/rejected": -2.744140625, "logps/chosen": -320.3500061035156, "logps/rejected": -410.8999938964844, "loss": 0.0594, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.23828125, "rewards/margins": 5.017187595367432, "rewards/rejected": -7.251562595367432, "step": 7490 }, { "epoch": 2.8237951807228914, "grad_norm": 33.001018676418916, "learning_rate": 2.941453313253012e-07, "logits/chosen": -2.858593702316284, "logits/rejected": -2.836718797683716, "logps/chosen": -351.6499938964844, "logps/rejected": -413.5, "loss": 0.0596, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4815430641174316, "rewards/margins": 4.871874809265137, "rewards/rejected": -7.353125095367432, "step": 7500 }, { "epoch": 2.827560240963855, "grad_norm": 18.46320720842537, "learning_rate": 2.932040662650602e-07, "logits/chosen": -2.8257813453674316, "logits/rejected": -2.9273438453674316, "logps/chosen": -399.29998779296875, "logps/rejected": -436.6000061035156, "loss": 0.0588, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4566407203674316, "rewards/margins": 5.111718654632568, "rewards/rejected": -7.565625190734863, "step": 7510 }, { "epoch": 2.8313253012048194, "grad_norm": 18.65436470234468, "learning_rate": 2.922628012048193e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9632811546325684, "logps/chosen": -366.3999938964844, "logps/rejected": -404.8500061035156, "loss": 0.0414, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0546875, "rewards/margins": 5.405468940734863, "rewards/rejected": -7.459374904632568, "step": 7520 }, { "epoch": 2.835090361445783, "grad_norm": 12.307702670015226, "learning_rate": 2.9132153614457826e-07, "logits/chosen": -2.80078125, "logits/rejected": -2.856250047683716, "logps/chosen": -384.5, "logps/rejected": -475.1000061035156, "loss": 0.0379, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4007811546325684, "rewards/margins": 5.473437309265137, "rewards/rejected": -7.873437404632568, "step": 7530 }, { "epoch": 2.838855421686747, "grad_norm": 7.5909239043080055, "learning_rate": 2.9038027108433734e-07, "logits/chosen": -2.797656297683716, "logits/rejected": -2.7085938453674316, "logps/chosen": -368.79998779296875, "logps/rejected": -449.1000061035156, "loss": 0.0557, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8923828601837158, "rewards/margins": 5.154687404632568, "rewards/rejected": -7.050000190734863, "step": 7540 }, { "epoch": 2.8426204819277108, "grad_norm": 29.36110905946754, "learning_rate": 2.8943900602409636e-07, "logits/chosen": -2.7757811546325684, "logits/rejected": -2.815624952316284, "logps/chosen": -376.29998779296875, "logps/rejected": -456.6000061035156, "loss": 0.0484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.237109422683716, "rewards/margins": 5.18359375, "rewards/rejected": -7.421875, "step": 7550 }, { "epoch": 2.8463855421686746, "grad_norm": 23.113846291095808, "learning_rate": 2.8849774096385544e-07, "logits/chosen": -2.639843702316284, "logits/rejected": -2.796093702316284, "logps/chosen": -317.70001220703125, "logps/rejected": -372.1499938964844, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": -1.9578125476837158, "rewards/margins": 5.303906440734863, "rewards/rejected": -7.262499809265137, "step": 7560 }, { "epoch": 2.850150602409639, "grad_norm": 36.70359495294979, "learning_rate": 2.875564759036144e-07, "logits/chosen": -2.832812547683716, "logits/rejected": -2.83984375, "logps/chosen": -399.29998779296875, "logps/rejected": -463.3500061035156, "loss": 0.0358, "rewards/accuracies": 1.0, "rewards/chosen": -2.17578125, "rewards/margins": 5.731249809265137, "rewards/rejected": -7.907812595367432, "step": 7570 }, { "epoch": 2.8539156626506026, "grad_norm": 24.80306069127408, "learning_rate": 2.866152108433735e-07, "logits/chosen": -2.7828125953674316, "logits/rejected": -2.7890625, "logps/chosen": -346.6000061035156, "logps/rejected": -424.1000061035156, "loss": 0.0831, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.70654296875, "rewards/margins": 5.209374904632568, "rewards/rejected": -6.910937309265137, "step": 7580 }, { "epoch": 2.8576807228915664, "grad_norm": 18.964469337969092, "learning_rate": 2.856739457831325e-07, "logits/chosen": -2.778125047683716, "logits/rejected": -2.840625047683716, "logps/chosen": -345.3999938964844, "logps/rejected": -426.3500061035156, "loss": 0.0389, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.768945336341858, "rewards/margins": 5.449999809265137, "rewards/rejected": -7.220312595367432, "step": 7590 }, { "epoch": 2.86144578313253, "grad_norm": 29.636316051508718, "learning_rate": 2.8473268072289155e-07, "logits/chosen": -2.8265624046325684, "logits/rejected": -2.768749952316284, "logps/chosen": -359.5, "logps/rejected": -427.3999938964844, "loss": 0.0686, "rewards/accuracies": 0.96875, "rewards/chosen": -2.1061034202575684, "rewards/margins": 4.970312595367432, "rewards/rejected": -7.079687595367432, "step": 7600 }, { "epoch": 2.865210843373494, "grad_norm": 53.007530516969034, "learning_rate": 2.837914156626506e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -2.91796875, "logps/chosen": -389.29998779296875, "logps/rejected": -438.20001220703125, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.98046875, "rewards/margins": 5.246874809265137, "rewards/rejected": -7.221875190734863, "step": 7610 }, { "epoch": 2.8689759036144578, "grad_norm": 20.58338280361923, "learning_rate": 2.8285015060240966e-07, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.82421875, "logps/chosen": -379.3500061035156, "logps/rejected": -443.75, "loss": 0.0509, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.063671827316284, "rewards/margins": 5.075781345367432, "rewards/rejected": -7.1328125, "step": 7620 }, { "epoch": 2.8727409638554215, "grad_norm": 20.609168693410158, "learning_rate": 2.819088855421687e-07, "logits/chosen": -2.7265625, "logits/rejected": -2.772656202316284, "logps/chosen": -364.6000061035156, "logps/rejected": -416.8999938964844, "loss": 0.0591, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.696679711341858, "rewards/margins": 4.948437690734863, "rewards/rejected": -6.6484375, "step": 7630 }, { "epoch": 2.8765060240963853, "grad_norm": 18.51779108252009, "learning_rate": 2.8096762048192766e-07, "logits/chosen": -2.8578124046325684, "logits/rejected": -2.78125, "logps/chosen": -332.5, "logps/rejected": -452.29998779296875, "loss": 0.0661, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.731054663658142, "rewards/margins": 5.046093940734863, "rewards/rejected": -6.776562690734863, "step": 7640 }, { "epoch": 2.880271084337349, "grad_norm": 110.4262654062705, "learning_rate": 2.8002635542168674e-07, "logits/chosen": -2.750781297683716, "logits/rejected": -2.750781297683716, "logps/chosen": -343.75, "logps/rejected": -415.45001220703125, "loss": 0.0859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7130858898162842, "rewards/margins": 4.951562404632568, "rewards/rejected": -6.6640625, "step": 7650 }, { "epoch": 2.8840361445783134, "grad_norm": 9.405904646465823, "learning_rate": 2.7908509036144576e-07, "logits/chosen": -2.866406202316284, "logits/rejected": -2.8125, "logps/chosen": -364.8999938964844, "logps/rejected": -447.8500061035156, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -2.244140625, "rewards/margins": 5.469531059265137, "rewards/rejected": -7.709374904632568, "step": 7660 }, { "epoch": 2.887801204819277, "grad_norm": 36.119904410951015, "learning_rate": 2.781438253012048e-07, "logits/chosen": -2.960156202316284, "logits/rejected": -2.9359374046325684, "logps/chosen": -304.45001220703125, "logps/rejected": -453.1499938964844, "loss": 0.0669, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7253906726837158, "rewards/margins": 5.342968940734863, "rewards/rejected": -7.065625190734863, "step": 7670 }, { "epoch": 2.891566265060241, "grad_norm": 10.347531793491648, "learning_rate": 2.772025602409638e-07, "logits/chosen": -2.842968702316284, "logits/rejected": -2.813281297683716, "logps/chosen": -400.70001220703125, "logps/rejected": -467.8999938964844, "loss": 0.0501, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.74462890625, "rewards/margins": 5.747656345367432, "rewards/rejected": -7.495312690734863, "step": 7680 }, { "epoch": 2.8953313253012047, "grad_norm": 10.509767791112067, "learning_rate": 2.762612951807229e-07, "logits/chosen": -2.796093702316284, "logits/rejected": -2.90234375, "logps/chosen": -322.0, "logps/rejected": -411.45001220703125, "loss": 0.0496, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.618066430091858, "rewards/margins": 5.412499904632568, "rewards/rejected": -7.029687404632568, "step": 7690 }, { "epoch": 2.8990963855421685, "grad_norm": 36.99153183944402, "learning_rate": 2.753200301204819e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -2.8304686546325684, "logps/chosen": -383.29998779296875, "logps/rejected": -460.29998779296875, "loss": 0.0345, "rewards/accuracies": 1.0, "rewards/chosen": -1.614355444908142, "rewards/margins": 5.452343940734863, "rewards/rejected": -7.060937404632568, "step": 7700 }, { "epoch": 2.9028614457831328, "grad_norm": 7.95630608528309, "learning_rate": 2.7437876506024095e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.7320313453674316, "logps/chosen": -358.8500061035156, "logps/rejected": -474.1000061035156, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4646484851837158, "rewards/margins": 6.141406059265137, "rewards/rejected": -7.6015625, "step": 7710 }, { "epoch": 2.9066265060240966, "grad_norm": 9.57700986806421, "learning_rate": 2.734375e-07, "logits/chosen": -2.7890625, "logits/rejected": -2.80078125, "logps/chosen": -385.20001220703125, "logps/rejected": -449.0, "loss": 0.0511, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.876562476158142, "rewards/margins": 5.508593559265137, "rewards/rejected": -7.392187595367432, "step": 7720 }, { "epoch": 2.9103915662650603, "grad_norm": 7.23755016750353, "learning_rate": 2.7249623493975906e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -2.7984375953674316, "logps/chosen": -365.5, "logps/rejected": -489.70001220703125, "loss": 0.044, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0166015625, "rewards/margins": 5.655468940734863, "rewards/rejected": -7.670312404632568, "step": 7730 }, { "epoch": 2.914156626506024, "grad_norm": 6.492626943402005, "learning_rate": 2.7155496987951803e-07, "logits/chosen": -2.819531202316284, "logits/rejected": -2.8515625, "logps/chosen": -363.3500061035156, "logps/rejected": -454.79998779296875, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": -1.576562523841858, "rewards/margins": 5.45703125, "rewards/rejected": -7.03125, "step": 7740 }, { "epoch": 2.917921686746988, "grad_norm": 81.42953684714062, "learning_rate": 2.706137048192771e-07, "logits/chosen": -2.606640577316284, "logits/rejected": -2.7261719703674316, "logps/chosen": -398.3999938964844, "logps/rejected": -443.29998779296875, "loss": 0.0318, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9113280773162842, "rewards/margins": 5.421875, "rewards/rejected": -7.334374904632568, "step": 7750 }, { "epoch": 2.9216867469879517, "grad_norm": 16.0613481107781, "learning_rate": 2.6967243975903614e-07, "logits/chosen": -2.65234375, "logits/rejected": -2.710156202316284, "logps/chosen": -371.6499938964844, "logps/rejected": -463.29998779296875, "loss": 0.0539, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.837499976158142, "rewards/margins": 5.499218940734863, "rewards/rejected": -7.342187404632568, "step": 7760 }, { "epoch": 2.9254518072289155, "grad_norm": 11.375709569943423, "learning_rate": 2.687311746987952e-07, "logits/chosen": -2.7640624046325684, "logits/rejected": -2.7640624046325684, "logps/chosen": -358.45001220703125, "logps/rejected": -464.29998779296875, "loss": 0.0364, "rewards/accuracies": 1.0, "rewards/chosen": -1.553125023841858, "rewards/margins": 5.471093654632568, "rewards/rejected": -7.0234375, "step": 7770 }, { "epoch": 2.9292168674698793, "grad_norm": 43.71989523361243, "learning_rate": 2.677899096385542e-07, "logits/chosen": -2.817187547683716, "logits/rejected": -2.92578125, "logps/chosen": -375.20001220703125, "logps/rejected": -431.04998779296875, "loss": 0.0486, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.639807105064392, "rewards/margins": 5.334374904632568, "rewards/rejected": -6.984375, "step": 7780 }, { "epoch": 2.9329819277108435, "grad_norm": 59.73824710110152, "learning_rate": 2.6684864457831327e-07, "logits/chosen": -2.7367186546325684, "logits/rejected": -2.7484374046325684, "logps/chosen": -399.45001220703125, "logps/rejected": -495.3999938964844, "loss": 0.0415, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.519628882408142, "rewards/margins": 5.701562404632568, "rewards/rejected": -7.21875, "step": 7790 }, { "epoch": 2.9367469879518073, "grad_norm": 3.2708894828461723, "learning_rate": 2.659073795180723e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -2.85546875, "logps/chosen": -346.1499938964844, "logps/rejected": -466.70001220703125, "loss": 0.0769, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.582421898841858, "rewards/margins": 5.495312690734863, "rewards/rejected": -7.077343940734863, "step": 7800 }, { "epoch": 2.940512048192771, "grad_norm": 8.2809037576754, "learning_rate": 2.6496611445783127e-07, "logits/chosen": -2.75390625, "logits/rejected": -2.7906250953674316, "logps/chosen": -347.3500061035156, "logps/rejected": -449.20001220703125, "loss": 0.0675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8699219226837158, "rewards/margins": 5.459374904632568, "rewards/rejected": -7.334374904632568, "step": 7810 }, { "epoch": 2.944277108433735, "grad_norm": 32.26525326409121, "learning_rate": 2.6402484939759035e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.836718797683716, "logps/chosen": -350.8999938964844, "logps/rejected": -404.8999938964844, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.947363257408142, "rewards/margins": 5.172656059265137, "rewards/rejected": -7.120312690734863, "step": 7820 }, { "epoch": 2.9480421686746987, "grad_norm": 16.856478617196036, "learning_rate": 2.630835843373494e-07, "logits/chosen": -2.846874952316284, "logits/rejected": -2.878124952316284, "logps/chosen": -348.6499938964844, "logps/rejected": -444.8999938964844, "loss": 0.0522, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.19140625, "rewards/margins": 5.358593940734863, "rewards/rejected": -7.551562309265137, "step": 7830 }, { "epoch": 2.9518072289156625, "grad_norm": 35.69060094438088, "learning_rate": 2.6214231927710845e-07, "logits/chosen": -2.813281297683716, "logits/rejected": -2.83984375, "logps/chosen": -342.625, "logps/rejected": -444.8500061035156, "loss": 0.0573, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.988671898841858, "rewards/margins": 5.39453125, "rewards/rejected": -7.379687309265137, "step": 7840 }, { "epoch": 2.9555722891566267, "grad_norm": 22.88092325811607, "learning_rate": 2.6120105421686743e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -2.890625, "logps/chosen": -369.54998779296875, "logps/rejected": -442.45001220703125, "loss": 0.0545, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0484375953674316, "rewards/margins": 5.265625, "rewards/rejected": -7.310937404632568, "step": 7850 }, { "epoch": 2.9593373493975905, "grad_norm": 42.931501011094674, "learning_rate": 2.602597891566265e-07, "logits/chosen": -2.852343797683716, "logits/rejected": -2.8531250953674316, "logps/chosen": -360.1499938964844, "logps/rejected": -455.5, "loss": 0.0399, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.873046875, "rewards/margins": 5.603125095367432, "rewards/rejected": -7.4765625, "step": 7860 }, { "epoch": 2.9631024096385543, "grad_norm": 12.382250743247676, "learning_rate": 2.5931852409638553e-07, "logits/chosen": -2.821093797683716, "logits/rejected": -2.7789063453674316, "logps/chosen": -324.1499938964844, "logps/rejected": -401.3999938964844, "loss": 0.065, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.245898485183716, "rewards/margins": 4.780468940734863, "rewards/rejected": -7.026562690734863, "step": 7870 }, { "epoch": 2.966867469879518, "grad_norm": 28.20574314782039, "learning_rate": 2.5837725903614456e-07, "logits/chosen": -2.7796874046325684, "logits/rejected": -2.780468702316284, "logps/chosen": -375.1499938964844, "logps/rejected": -444.45001220703125, "loss": 0.0473, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0570311546325684, "rewards/margins": 5.591406345367432, "rewards/rejected": -7.646874904632568, "step": 7880 }, { "epoch": 2.970632530120482, "grad_norm": 15.421636196622373, "learning_rate": 2.574359939759036e-07, "logits/chosen": -2.9039063453674316, "logits/rejected": -2.878124952316284, "logps/chosen": -318.70001220703125, "logps/rejected": -437.6499938964844, "loss": 0.0455, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0054688453674316, "rewards/margins": 5.08203125, "rewards/rejected": -7.084374904632568, "step": 7890 }, { "epoch": 2.9743975903614457, "grad_norm": 23.318001756831055, "learning_rate": 2.5649472891566267e-07, "logits/chosen": -2.7601561546325684, "logits/rejected": -2.9140625, "logps/chosen": -385.04998779296875, "logps/rejected": -443.29998779296875, "loss": 0.0398, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.387500047683716, "rewards/margins": 5.292187690734863, "rewards/rejected": -7.681250095367432, "step": 7900 }, { "epoch": 2.9781626506024095, "grad_norm": 19.967798038441746, "learning_rate": 2.555534638554217e-07, "logits/chosen": -2.819531202316284, "logits/rejected": -2.76171875, "logps/chosen": -387.1499938964844, "logps/rejected": -514.7000122070312, "loss": 0.0417, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.203125, "rewards/margins": 5.8671875, "rewards/rejected": -8.068750381469727, "step": 7910 }, { "epoch": 2.9819277108433733, "grad_norm": 25.13845289255712, "learning_rate": 2.546121987951807e-07, "logits/chosen": -2.797656297683716, "logits/rejected": -2.83984375, "logps/chosen": -359.75, "logps/rejected": -442.6000061035156, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.226367235183716, "rewards/margins": 5.478125095367432, "rewards/rejected": -7.704687595367432, "step": 7920 }, { "epoch": 2.9856927710843375, "grad_norm": 33.93857541732993, "learning_rate": 2.5367093373493975e-07, "logits/chosen": -2.878124952316284, "logits/rejected": -2.893749952316284, "logps/chosen": -391.5, "logps/rejected": -465.6000061035156, "loss": 0.0419, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1656250953674316, "rewards/margins": 5.364062309265137, "rewards/rejected": -7.5234375, "step": 7930 }, { "epoch": 2.9894578313253013, "grad_norm": 27.86639940371873, "learning_rate": 2.5272966867469883e-07, "logits/chosen": -2.71484375, "logits/rejected": -2.780468702316284, "logps/chosen": -361.1499938964844, "logps/rejected": -431.54998779296875, "loss": 0.0519, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.223437547683716, "rewards/margins": 5.234375, "rewards/rejected": -7.454687595367432, "step": 7940 }, { "epoch": 2.993222891566265, "grad_norm": 43.451034820152344, "learning_rate": 2.517884036144578e-07, "logits/chosen": -2.67578125, "logits/rejected": -2.75, "logps/chosen": -377.54998779296875, "logps/rejected": -492.1000061035156, "loss": 0.0527, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0257811546325684, "rewards/margins": 5.354687690734863, "rewards/rejected": -7.379687309265137, "step": 7950 }, { "epoch": 2.996987951807229, "grad_norm": 53.71459879670919, "learning_rate": 2.5084713855421683e-07, "logits/chosen": -2.792187452316284, "logits/rejected": -2.8765625953674316, "logps/chosen": -354.8500061035156, "logps/rejected": -451.3500061035156, "loss": 0.0627, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.298828125, "rewards/margins": 5.298437595367432, "rewards/rejected": -7.596875190734863, "step": 7960 }, { "epoch": 3.0007530120481927, "grad_norm": 10.821576465873422, "learning_rate": 2.499058734939759e-07, "logits/chosen": -2.875, "logits/rejected": -2.8804688453674316, "logps/chosen": -334.25, "logps/rejected": -439.0, "loss": 0.0527, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.245312452316284, "rewards/margins": 5.452343940734863, "rewards/rejected": -7.701562404632568, "step": 7970 }, { "epoch": 3.0045180722891565, "grad_norm": 15.037678844319375, "learning_rate": 2.4896460843373493e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -2.742968797683716, "logps/chosen": -343.75, "logps/rejected": -454.45001220703125, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -2.0951170921325684, "rewards/margins": 5.948437690734863, "rewards/rejected": -8.043749809265137, "step": 7980 }, { "epoch": 3.0082831325301207, "grad_norm": 7.377414890919427, "learning_rate": 2.4802334337349396e-07, "logits/chosen": -2.741015672683716, "logits/rejected": -2.8179688453674316, "logps/chosen": -395.54998779296875, "logps/rejected": -455.70001220703125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -1.593652367591858, "rewards/margins": 6.035937309265137, "rewards/rejected": -7.625, "step": 7990 }, { "epoch": 3.0120481927710845, "grad_norm": 7.629671841945206, "learning_rate": 2.47082078313253e-07, "logits/chosen": -2.805468797683716, "logits/rejected": -2.8187499046325684, "logps/chosen": -370.1000061035156, "logps/rejected": -455.1000061035156, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -2.064648389816284, "rewards/margins": 6.087500095367432, "rewards/rejected": -8.15625, "step": 8000 }, { "epoch": 3.0158132530120483, "grad_norm": 9.705142563236343, "learning_rate": 2.46140813253012e-07, "logits/chosen": -2.7503905296325684, "logits/rejected": -2.882031202316284, "logps/chosen": -340.29998779296875, "logps/rejected": -421.29998779296875, "loss": 0.0278, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3203125, "rewards/margins": 6.120312690734863, "rewards/rejected": -8.435937881469727, "step": 8010 }, { "epoch": 3.019578313253012, "grad_norm": 5.097216648689681, "learning_rate": 2.451995481927711e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.8226561546325684, "logps/chosen": -332.75, "logps/rejected": -449.04998779296875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.188671827316284, "rewards/margins": 6.278124809265137, "rewards/rejected": -8.470312118530273, "step": 8020 }, { "epoch": 3.023343373493976, "grad_norm": 16.548891011720887, "learning_rate": 2.442582831325301e-07, "logits/chosen": -2.846874952316284, "logits/rejected": -2.9195313453674316, "logps/chosen": -337.0, "logps/rejected": -404.8999938964844, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -1.933007836341858, "rewards/margins": 6.160937309265137, "rewards/rejected": -8.092187881469727, "step": 8030 }, { "epoch": 3.0271084337349397, "grad_norm": 27.09347469799971, "learning_rate": 2.4331701807228915e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.8125, "logps/chosen": -342.8500061035156, "logps/rejected": -423.0, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -2.12548828125, "rewards/margins": 6.328125, "rewards/rejected": -8.449999809265137, "step": 8040 }, { "epoch": 3.0308734939759034, "grad_norm": 7.636087581855699, "learning_rate": 2.4237575301204817e-07, "logits/chosen": -2.8023438453674316, "logits/rejected": -2.7484374046325684, "logps/chosen": -367.3500061035156, "logps/rejected": -471.54998779296875, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -2.661328077316284, "rewards/margins": 6.365624904632568, "rewards/rejected": -9.03125, "step": 8050 }, { "epoch": 3.0346385542168677, "grad_norm": 9.793045174613333, "learning_rate": 2.414344879518072e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -2.891406297683716, "logps/chosen": -432.1499938964844, "logps/rejected": -500.5, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.594531297683716, "rewards/margins": 6.3984375, "rewards/rejected": -8.990625381469727, "step": 8060 }, { "epoch": 3.0384036144578315, "grad_norm": 9.544755156566099, "learning_rate": 2.404932228915663e-07, "logits/chosen": -2.75, "logits/rejected": -2.8687500953674316, "logps/chosen": -387.6000061035156, "logps/rejected": -469.3999938964844, "loss": 0.0177, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4662108421325684, "rewards/margins": 6.484375, "rewards/rejected": -8.956250190734863, "step": 8070 }, { "epoch": 3.0421686746987953, "grad_norm": 15.136975072006207, "learning_rate": 2.395519578313253e-07, "logits/chosen": -2.862499952316284, "logits/rejected": -2.897656202316284, "logps/chosen": -409.54998779296875, "logps/rejected": -494.29998779296875, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.284374952316284, "rewards/margins": 6.479687690734863, "rewards/rejected": -8.759374618530273, "step": 8080 }, { "epoch": 3.045933734939759, "grad_norm": 5.947915669741454, "learning_rate": 2.3861069277108433e-07, "logits/chosen": -2.80078125, "logits/rejected": -2.8265624046325684, "logps/chosen": -354.04998779296875, "logps/rejected": -421.79998779296875, "loss": 0.0183, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6781249046325684, "rewards/margins": 6.364062309265137, "rewards/rejected": -9.034375190734863, "step": 8090 }, { "epoch": 3.049698795180723, "grad_norm": 5.323969062156925, "learning_rate": 2.3766942771084336e-07, "logits/chosen": -2.839062452316284, "logits/rejected": -2.9515624046325684, "logps/chosen": -356.3999938964844, "logps/rejected": -483.70001220703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -3.46875, "rewards/margins": 6.839062690734863, "rewards/rejected": -10.307812690734863, "step": 8100 }, { "epoch": 3.0534638554216866, "grad_norm": 6.2064405087434755, "learning_rate": 2.3672816265060239e-07, "logits/chosen": -2.9359374046325684, "logits/rejected": -2.9546875953674316, "logps/chosen": -387.54998779296875, "logps/rejected": -514.7999877929688, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -3.39453125, "rewards/margins": 6.9296875, "rewards/rejected": -10.324999809265137, "step": 8110 }, { "epoch": 3.0572289156626504, "grad_norm": 6.839018616655108, "learning_rate": 2.3578689759036144e-07, "logits/chosen": -2.766406297683716, "logits/rejected": -2.75, "logps/chosen": -386.5, "logps/rejected": -525.8499755859375, "loss": 0.0214, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.608203172683716, "rewards/margins": 6.401562690734863, "rewards/rejected": -9.012499809265137, "step": 8120 }, { "epoch": 3.0609939759036147, "grad_norm": 4.229293586308938, "learning_rate": 2.3484563253012047e-07, "logits/chosen": -2.9140625, "logits/rejected": -2.8843750953674316, "logps/chosen": -434.70001220703125, "logps/rejected": -498.3999938964844, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.9447264671325684, "rewards/margins": 6.682812690734863, "rewards/rejected": -9.623437881469727, "step": 8130 }, { "epoch": 3.0647590361445785, "grad_norm": 5.096587902932893, "learning_rate": 2.3390436746987952e-07, "logits/chosen": -2.7093749046325684, "logits/rejected": -2.9085936546325684, "logps/chosen": -357.95001220703125, "logps/rejected": -424.0, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.581249952316284, "rewards/margins": 6.420312404632568, "rewards/rejected": -8.996874809265137, "step": 8140 }, { "epoch": 3.0685240963855422, "grad_norm": 5.359416774705301, "learning_rate": 2.3296310240963855e-07, "logits/chosen": -2.8070311546325684, "logits/rejected": -2.805468797683716, "logps/chosen": -383.95001220703125, "logps/rejected": -496.29998779296875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -2.4957032203674316, "rewards/margins": 6.4453125, "rewards/rejected": -8.946874618530273, "step": 8150 }, { "epoch": 3.072289156626506, "grad_norm": 14.390967570248185, "learning_rate": 2.320218373493976e-07, "logits/chosen": -2.895703077316284, "logits/rejected": -2.893749952316284, "logps/chosen": -340.54998779296875, "logps/rejected": -446.20001220703125, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -2.4691405296325684, "rewards/margins": 6.498437404632568, "rewards/rejected": -8.970312118530273, "step": 8160 }, { "epoch": 3.07605421686747, "grad_norm": 10.45014186935602, "learning_rate": 2.3108057228915663e-07, "logits/chosen": -2.907031297683716, "logits/rejected": -2.957812547683716, "logps/chosen": -373.70001220703125, "logps/rejected": -414.95001220703125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -2.618359327316284, "rewards/margins": 6.242968559265137, "rewards/rejected": -8.854687690734863, "step": 8170 }, { "epoch": 3.0798192771084336, "grad_norm": 3.6207468547323027, "learning_rate": 2.3013930722891565e-07, "logits/chosen": -2.9359374046325684, "logits/rejected": -2.98828125, "logps/chosen": -382.6499938964844, "logps/rejected": -492.29998779296875, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -2.9800782203674316, "rewards/margins": 6.420312404632568, "rewards/rejected": -9.399999618530273, "step": 8180 }, { "epoch": 3.0835843373493974, "grad_norm": 10.82597503049925, "learning_rate": 2.291980421686747e-07, "logits/chosen": -2.961718797683716, "logits/rejected": -2.9859375953674316, "logps/chosen": -380.1000061035156, "logps/rejected": -461.70001220703125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -3.141796827316284, "rewards/margins": 6.234375, "rewards/rejected": -9.381250381469727, "step": 8190 }, { "epoch": 3.0873493975903616, "grad_norm": 4.635000328768958, "learning_rate": 2.282567771084337e-07, "logits/chosen": -2.921093702316284, "logits/rejected": -2.9859375953674316, "logps/chosen": -373.20001220703125, "logps/rejected": -480.1000061035156, "loss": 0.0216, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.317187547683716, "rewards/margins": 6.439062595367432, "rewards/rejected": -9.756250381469727, "step": 8200 }, { "epoch": 3.0911144578313254, "grad_norm": 6.858430175805578, "learning_rate": 2.2731551204819276e-07, "logits/chosen": -2.8023438453674316, "logits/rejected": -2.86328125, "logps/chosen": -350.6000061035156, "logps/rejected": -470.1000061035156, "loss": 0.0335, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.294921875, "rewards/margins": 6.568749904632568, "rewards/rejected": -9.857812881469727, "step": 8210 }, { "epoch": 3.0948795180722892, "grad_norm": 21.99696648065775, "learning_rate": 2.2637424698795179e-07, "logits/chosen": -2.8921875953674316, "logits/rejected": -2.890625, "logps/chosen": -312.45001220703125, "logps/rejected": -440.6000061035156, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.046093702316284, "rewards/margins": 6.887499809265137, "rewards/rejected": -9.932812690734863, "step": 8220 }, { "epoch": 3.098644578313253, "grad_norm": 2.7524648027821548, "learning_rate": 2.2543298192771084e-07, "logits/chosen": -2.7953124046325684, "logits/rejected": -2.9273438453674316, "logps/chosen": -335.20001220703125, "logps/rejected": -472.29998779296875, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -2.40673828125, "rewards/margins": 7.109375, "rewards/rejected": -9.5078125, "step": 8230 }, { "epoch": 3.102409638554217, "grad_norm": 1.8569811529064681, "learning_rate": 2.2449171686746987e-07, "logits/chosen": -2.9585938453674316, "logits/rejected": -2.9000000953674316, "logps/chosen": -354.45001220703125, "logps/rejected": -491.20001220703125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -2.794140577316284, "rewards/margins": 7.3359375, "rewards/rejected": -10.1328125, "step": 8240 }, { "epoch": 3.1061746987951806, "grad_norm": 13.680414550360327, "learning_rate": 2.235504518072289e-07, "logits/chosen": -2.848437547683716, "logits/rejected": -2.8984375, "logps/chosen": -346.75, "logps/rejected": -455.3999938964844, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -2.964062452316284, "rewards/margins": 6.587500095367432, "rewards/rejected": -9.548437118530273, "step": 8250 }, { "epoch": 3.1099397590361444, "grad_norm": 8.853724437787067, "learning_rate": 2.2260918674698795e-07, "logits/chosen": -2.964062452316284, "logits/rejected": -2.9609375, "logps/chosen": -408.6499938964844, "logps/rejected": -488.0, "loss": 0.0207, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.850830078125, "rewards/margins": 6.381249904632568, "rewards/rejected": -9.2265625, "step": 8260 }, { "epoch": 3.1137048192771086, "grad_norm": 4.229843163140124, "learning_rate": 2.2166792168674697e-07, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.9609375, "logps/chosen": -351.8999938964844, "logps/rejected": -474.3999938964844, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -2.874218702316284, "rewards/margins": 6.806250095367432, "rewards/rejected": -9.684374809265137, "step": 8270 }, { "epoch": 3.1174698795180724, "grad_norm": 9.566731865615697, "learning_rate": 2.2072665662650602e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -2.78515625, "logps/chosen": -396.79998779296875, "logps/rejected": -504.8999938964844, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -2.953906297683716, "rewards/margins": 6.537499904632568, "rewards/rejected": -9.495312690734863, "step": 8280 }, { "epoch": 3.121234939759036, "grad_norm": 2.358890361367117, "learning_rate": 2.1978539156626505e-07, "logits/chosen": -2.83984375, "logits/rejected": -2.8218750953674316, "logps/chosen": -412.6000061035156, "logps/rejected": -498.79998779296875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.9175782203674316, "rewards/margins": 6.915625095367432, "rewards/rejected": -9.8359375, "step": 8290 }, { "epoch": 3.125, "grad_norm": 77.22790270478953, "learning_rate": 2.1884412650602408e-07, "logits/chosen": -2.9281249046325684, "logits/rejected": -2.9984374046325684, "logps/chosen": -367.75, "logps/rejected": -485.20001220703125, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -3.5589842796325684, "rewards/margins": 6.389062404632568, "rewards/rejected": -9.948437690734863, "step": 8300 }, { "epoch": 3.128765060240964, "grad_norm": 5.025323592911707, "learning_rate": 2.1790286144578313e-07, "logits/chosen": -2.953906297683716, "logits/rejected": -2.893749952316284, "logps/chosen": -343.5, "logps/rejected": -449.95001220703125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -3.0394530296325684, "rewards/margins": 6.626562595367432, "rewards/rejected": -9.670312881469727, "step": 8310 }, { "epoch": 3.1325301204819276, "grad_norm": 30.724092627478814, "learning_rate": 2.1696159638554216e-07, "logits/chosen": -2.9085936546325684, "logits/rejected": -2.899218797683716, "logps/chosen": -343.29998779296875, "logps/rejected": -444.70001220703125, "loss": 0.0244, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1695313453674316, "rewards/margins": 6.448437690734863, "rewards/rejected": -9.618749618530273, "step": 8320 }, { "epoch": 3.1362951807228914, "grad_norm": 8.212538376786464, "learning_rate": 2.160203313253012e-07, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.8179688453674316, "logps/chosen": -364.5, "logps/rejected": -464.45001220703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.4644532203674316, "rewards/margins": 6.6015625, "rewards/rejected": -9.067187309265137, "step": 8330 }, { "epoch": 3.1400602409638556, "grad_norm": 2.203749556515757, "learning_rate": 2.1507906626506024e-07, "logits/chosen": -2.9593749046325684, "logits/rejected": -2.918750047683716, "logps/chosen": -375.04998779296875, "logps/rejected": -490.1000061035156, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -2.959765672683716, "rewards/margins": 7.079687595367432, "rewards/rejected": -10.037500381469727, "step": 8340 }, { "epoch": 3.1438253012048194, "grad_norm": 10.933725278170083, "learning_rate": 2.141378012048193e-07, "logits/chosen": -2.8089842796325684, "logits/rejected": -2.8414063453674316, "logps/chosen": -368.70001220703125, "logps/rejected": -464.5, "loss": 0.0179, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7984375953674316, "rewards/margins": 6.540625095367432, "rewards/rejected": -9.337499618530273, "step": 8350 }, { "epoch": 3.147590361445783, "grad_norm": 57.77257436662636, "learning_rate": 2.131965361445783e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.893749952316284, "logps/chosen": -372.1000061035156, "logps/rejected": -459.79998779296875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.802734375, "rewards/margins": 6.837500095367432, "rewards/rejected": -9.637499809265137, "step": 8360 }, { "epoch": 3.151355421686747, "grad_norm": 6.570308355645981, "learning_rate": 2.1225527108433732e-07, "logits/chosen": -2.938281297683716, "logits/rejected": -2.9437499046325684, "logps/chosen": -331.75, "logps/rejected": -433.5, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.934375047683716, "rewards/margins": 6.734375, "rewards/rejected": -9.662500381469727, "step": 8370 }, { "epoch": 3.1551204819277108, "grad_norm": 3.3543988225696872, "learning_rate": 2.1131400602409637e-07, "logits/chosen": -2.858593702316284, "logits/rejected": -2.9476561546325684, "logps/chosen": -374.45001220703125, "logps/rejected": -477.70001220703125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.53125, "rewards/margins": 6.7109375, "rewards/rejected": -10.246874809265137, "step": 8380 }, { "epoch": 3.1588855421686746, "grad_norm": 2.3977043013238037, "learning_rate": 2.103727409638554e-07, "logits/chosen": -2.922656297683716, "logits/rejected": -2.879687547683716, "logps/chosen": -386.70001220703125, "logps/rejected": -500.3999938964844, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5171875953674316, "rewards/margins": 7.1953125, "rewards/rejected": -10.712499618530273, "step": 8390 }, { "epoch": 3.1626506024096384, "grad_norm": 32.74699669727631, "learning_rate": 2.0943147590361445e-07, "logits/chosen": -2.987499952316284, "logits/rejected": -3.010937452316284, "logps/chosen": -375.75, "logps/rejected": -497.1000061035156, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.270312547683716, "rewards/margins": 6.971875190734863, "rewards/rejected": -10.243749618530273, "step": 8400 }, { "epoch": 3.1664156626506026, "grad_norm": 6.873661301552943, "learning_rate": 2.0849021084337348e-07, "logits/chosen": -2.910937547683716, "logits/rejected": -2.9593749046325684, "logps/chosen": -371.25, "logps/rejected": -446.1499938964844, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -3.2476563453674316, "rewards/margins": 6.782812595367432, "rewards/rejected": -10.03125, "step": 8410 }, { "epoch": 3.1701807228915664, "grad_norm": 14.603532811101706, "learning_rate": 2.0754894578313253e-07, "logits/chosen": -2.9789061546325684, "logits/rejected": -3.0492186546325684, "logps/chosen": -362.2250061035156, "logps/rejected": -453.29998779296875, "loss": 0.019, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.950000047683716, "rewards/margins": 6.785937309265137, "rewards/rejected": -9.732812881469727, "step": 8420 }, { "epoch": 3.17394578313253, "grad_norm": 92.81114748632639, "learning_rate": 2.0660768072289156e-07, "logits/chosen": -2.8578124046325684, "logits/rejected": -2.9585938453674316, "logps/chosen": -371.95001220703125, "logps/rejected": -474.0, "loss": 0.0273, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.848437547683716, "rewards/margins": 6.849999904632568, "rewards/rejected": -9.699999809265137, "step": 8430 }, { "epoch": 3.177710843373494, "grad_norm": 5.9182188343522295, "learning_rate": 2.0566641566265058e-07, "logits/chosen": -2.899218797683716, "logits/rejected": -3.0445313453674316, "logps/chosen": -373.3999938964844, "logps/rejected": -438.70001220703125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -2.720703125, "rewards/margins": 6.620312690734863, "rewards/rejected": -9.334375381469727, "step": 8440 }, { "epoch": 3.1814759036144578, "grad_norm": 6.519935031723902, "learning_rate": 2.0472515060240964e-07, "logits/chosen": -2.85546875, "logits/rejected": -2.8414063453674316, "logps/chosen": -379.8999938964844, "logps/rejected": -480.6000061035156, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -3.06640625, "rewards/margins": 7.154687404632568, "rewards/rejected": -10.215624809265137, "step": 8450 }, { "epoch": 3.1852409638554215, "grad_norm": 12.03556144541563, "learning_rate": 2.0378388554216866e-07, "logits/chosen": -2.9273438453674316, "logits/rejected": -2.9453125, "logps/chosen": -407.25, "logps/rejected": -470.1000061035156, "loss": 0.0285, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.524218797683716, "rewards/margins": 6.634375095367432, "rewards/rejected": -10.160937309265137, "step": 8460 }, { "epoch": 3.1890060240963853, "grad_norm": 14.824667819777341, "learning_rate": 2.0284262048192772e-07, "logits/chosen": -2.8890624046325684, "logits/rejected": -2.828906297683716, "logps/chosen": -335.8500061035156, "logps/rejected": -472.75, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.1328125, "rewards/margins": 6.729687690734863, "rewards/rejected": -9.860937118530273, "step": 8470 }, { "epoch": 3.1927710843373496, "grad_norm": 4.122536718872049, "learning_rate": 2.0190135542168674e-07, "logits/chosen": -3.102343797683716, "logits/rejected": -2.9664063453674316, "logps/chosen": -356.29998779296875, "logps/rejected": -466.8999938964844, "loss": 0.0212, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.524218797683716, "rewards/margins": 6.2734375, "rewards/rejected": -9.795312881469727, "step": 8480 }, { "epoch": 3.1965361445783134, "grad_norm": 4.826314329970964, "learning_rate": 2.0096009036144577e-07, "logits/chosen": -2.8499999046325684, "logits/rejected": -2.9453125, "logps/chosen": -373.70001220703125, "logps/rejected": -443.6499938964844, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -3.4292969703674316, "rewards/margins": 6.698437690734863, "rewards/rejected": -10.134374618530273, "step": 8490 }, { "epoch": 3.200301204819277, "grad_norm": 4.682833278450023, "learning_rate": 2.0001882530120482e-07, "logits/chosen": -2.996875047683716, "logits/rejected": -3.000781297683716, "logps/chosen": -370.20001220703125, "logps/rejected": -492.3500061035156, "loss": 0.0129, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.08203125, "rewards/margins": 6.798437595367432, "rewards/rejected": -10.887499809265137, "step": 8500 }, { "epoch": 3.204066265060241, "grad_norm": 29.392964828798725, "learning_rate": 1.9907756024096385e-07, "logits/chosen": -2.8804688453674316, "logits/rejected": -2.9710936546325684, "logps/chosen": -386.3999938964844, "logps/rejected": -494.3999938964844, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -3.1468749046325684, "rewards/margins": 6.907812595367432, "rewards/rejected": -10.053125381469727, "step": 8510 }, { "epoch": 3.2078313253012047, "grad_norm": 6.369896141620953, "learning_rate": 1.981362951807229e-07, "logits/chosen": -2.946093797683716, "logits/rejected": -3.0289063453674316, "logps/chosen": -360.45001220703125, "logps/rejected": -451.29998779296875, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -3.1480469703674316, "rewards/margins": 6.974999904632568, "rewards/rejected": -10.121874809265137, "step": 8520 }, { "epoch": 3.2115963855421685, "grad_norm": 12.030186384638213, "learning_rate": 1.971950301204819e-07, "logits/chosen": -2.9765625, "logits/rejected": -2.987499952316284, "logps/chosen": -361.45001220703125, "logps/rejected": -475.29998779296875, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -3.020312547683716, "rewards/margins": 6.853125095367432, "rewards/rejected": -9.876562118530273, "step": 8530 }, { "epoch": 3.2153614457831328, "grad_norm": 4.837908402238901, "learning_rate": 1.9625376506024096e-07, "logits/chosen": -2.8695311546325684, "logits/rejected": -2.901562452316284, "logps/chosen": -338.625, "logps/rejected": -460.1499938964844, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.041015625, "rewards/margins": 7.268750190734863, "rewards/rejected": -10.306249618530273, "step": 8540 }, { "epoch": 3.2191265060240966, "grad_norm": 5.092306026026517, "learning_rate": 1.9531249999999998e-07, "logits/chosen": -2.860156297683716, "logits/rejected": -2.8578124046325684, "logps/chosen": -380.45001220703125, "logps/rejected": -489.70001220703125, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -3.131640672683716, "rewards/margins": 6.807812690734863, "rewards/rejected": -9.934374809265137, "step": 8550 }, { "epoch": 3.2228915662650603, "grad_norm": 7.361828293862332, "learning_rate": 1.94371234939759e-07, "logits/chosen": -2.7847657203674316, "logits/rejected": -2.8695311546325684, "logps/chosen": -361.8999938964844, "logps/rejected": -448.20001220703125, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -2.8525390625, "rewards/margins": 6.676562309265137, "rewards/rejected": -9.543749809265137, "step": 8560 }, { "epoch": 3.226656626506024, "grad_norm": 10.88063982049563, "learning_rate": 1.9342996987951806e-07, "logits/chosen": -2.9515624046325684, "logits/rejected": -2.9195313453674316, "logps/chosen": -384.6000061035156, "logps/rejected": -474.6000061035156, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": -2.9136719703674316, "rewards/margins": 6.845312595367432, "rewards/rejected": -9.759374618530273, "step": 8570 }, { "epoch": 3.230421686746988, "grad_norm": 9.041067581243027, "learning_rate": 1.924887048192771e-07, "logits/chosen": -2.928906202316284, "logits/rejected": -2.9398436546325684, "logps/chosen": -352.25, "logps/rejected": -462.29998779296875, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -3.1109375953674316, "rewards/margins": 6.515625, "rewards/rejected": -9.625, "step": 8580 }, { "epoch": 3.2341867469879517, "grad_norm": 86.05798076457265, "learning_rate": 1.9154743975903614e-07, "logits/chosen": -2.8140625953674316, "logits/rejected": -2.9609375, "logps/chosen": -363.6499938964844, "logps/rejected": -426.3500061035156, "loss": 0.0155, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7699217796325684, "rewards/margins": 6.985937595367432, "rewards/rejected": -9.751562118530273, "step": 8590 }, { "epoch": 3.2379518072289155, "grad_norm": 18.836452821023308, "learning_rate": 1.9060617469879517e-07, "logits/chosen": -2.9546875953674316, "logits/rejected": -2.967968702316284, "logps/chosen": -380.5, "logps/rejected": -493.8999938964844, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.001171827316284, "rewards/margins": 6.298437595367432, "rewards/rejected": -9.295312881469727, "step": 8600 }, { "epoch": 3.2417168674698793, "grad_norm": 3.441410176965049, "learning_rate": 1.8966490963855422e-07, "logits/chosen": -2.9710936546325684, "logits/rejected": -2.973437547683716, "logps/chosen": -349.95001220703125, "logps/rejected": -464.6499938964844, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -2.6376953125, "rewards/margins": 6.59375, "rewards/rejected": -9.229687690734863, "step": 8610 }, { "epoch": 3.2454819277108435, "grad_norm": 5.959921347692062, "learning_rate": 1.8872364457831325e-07, "logits/chosen": -2.8734374046325684, "logits/rejected": -2.984375, "logps/chosen": -419.6000061035156, "logps/rejected": -484.70001220703125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -2.8824219703674316, "rewards/margins": 7.025000095367432, "rewards/rejected": -9.909375190734863, "step": 8620 }, { "epoch": 3.2492469879518073, "grad_norm": 13.083251301953121, "learning_rate": 1.8778237951807228e-07, "logits/chosen": -2.8382811546325684, "logits/rejected": -2.946093797683716, "logps/chosen": -351.79998779296875, "logps/rejected": -417.20001220703125, "loss": 0.0198, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4761719703674316, "rewards/margins": 6.5625, "rewards/rejected": -9.037500381469727, "step": 8630 }, { "epoch": 3.253012048192771, "grad_norm": 6.964507117273818, "learning_rate": 1.8684111445783133e-07, "logits/chosen": -2.893749952316284, "logits/rejected": -2.883593797683716, "logps/chosen": -356.3500061035156, "logps/rejected": -464.5, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.103515625, "rewards/margins": 6.564062595367432, "rewards/rejected": -9.667187690734863, "step": 8640 }, { "epoch": 3.256777108433735, "grad_norm": 14.04261966685205, "learning_rate": 1.8589984939759036e-07, "logits/chosen": -2.811718702316284, "logits/rejected": -2.9546875953674316, "logps/chosen": -334.6000061035156, "logps/rejected": -445.5, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -2.8910155296325684, "rewards/margins": 7.050000190734863, "rewards/rejected": -9.943750381469727, "step": 8650 }, { "epoch": 3.2605421686746987, "grad_norm": 7.514191398759485, "learning_rate": 1.849585843373494e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -2.9437499046325684, "logps/chosen": -343.3500061035156, "logps/rejected": -442.5, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -2.75390625, "rewards/margins": 7.015625, "rewards/rejected": -9.771875381469727, "step": 8660 }, { "epoch": 3.2643072289156625, "grad_norm": 15.829726897721322, "learning_rate": 1.8401731927710844e-07, "logits/chosen": -2.922656297683716, "logits/rejected": -2.953125, "logps/chosen": -376.6000061035156, "logps/rejected": -450.79998779296875, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -3.12109375, "rewards/margins": 6.848437309265137, "rewards/rejected": -9.978124618530273, "step": 8670 }, { "epoch": 3.2680722891566267, "grad_norm": 3.1582606670064886, "learning_rate": 1.8307605421686744e-07, "logits/chosen": -2.8617186546325684, "logits/rejected": -3.0054688453674316, "logps/chosen": -364.29998779296875, "logps/rejected": -463.5, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9453125, "rewards/margins": 6.832812309265137, "rewards/rejected": -9.771875381469727, "step": 8680 }, { "epoch": 3.2718373493975905, "grad_norm": 6.379196106266893, "learning_rate": 1.821347891566265e-07, "logits/chosen": -2.9039063453674316, "logits/rejected": -2.9312500953674316, "logps/chosen": -397.04998779296875, "logps/rejected": -486.20001220703125, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.380078077316284, "rewards/margins": 7.076562404632568, "rewards/rejected": -10.446874618530273, "step": 8690 }, { "epoch": 3.2756024096385543, "grad_norm": 6.372822004282907, "learning_rate": 1.8119352409638552e-07, "logits/chosen": -3.0960936546325684, "logits/rejected": -2.9937500953674316, "logps/chosen": -350.20001220703125, "logps/rejected": -497.5, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7671875953674316, "rewards/margins": 7.203125, "rewards/rejected": -10.96875, "step": 8700 }, { "epoch": 3.279367469879518, "grad_norm": 11.58886187370764, "learning_rate": 1.8025225903614457e-07, "logits/chosen": -2.948437452316284, "logits/rejected": -2.92578125, "logps/chosen": -366.20001220703125, "logps/rejected": -439.0, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -2.6953125, "rewards/margins": 6.465624809265137, "rewards/rejected": -9.160937309265137, "step": 8710 }, { "epoch": 3.283132530120482, "grad_norm": 48.89805339887949, "learning_rate": 1.793109939759036e-07, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.938281297683716, "logps/chosen": -404.79998779296875, "logps/rejected": -476.3999938964844, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -3.1605467796325684, "rewards/margins": 6.751562595367432, "rewards/rejected": -9.909375190734863, "step": 8720 }, { "epoch": 3.2868975903614457, "grad_norm": 2.9538352002428168, "learning_rate": 1.7836972891566265e-07, "logits/chosen": -2.9351563453674316, "logits/rejected": -2.953906297683716, "logps/chosen": -384.3999938964844, "logps/rejected": -437.95001220703125, "loss": 0.0131, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1480469703674316, "rewards/margins": 6.501562595367432, "rewards/rejected": -9.65625, "step": 8730 }, { "epoch": 3.2906626506024095, "grad_norm": 10.576816818880408, "learning_rate": 1.7742846385542167e-07, "logits/chosen": -2.8921875953674316, "logits/rejected": -2.938281297683716, "logps/chosen": -361.79998779296875, "logps/rejected": -452.8999938964844, "loss": 0.017, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.046093702316284, "rewards/margins": 6.610937595367432, "rewards/rejected": -9.660937309265137, "step": 8740 }, { "epoch": 3.2944277108433733, "grad_norm": 16.54374733317796, "learning_rate": 1.764871987951807e-07, "logits/chosen": -3.00390625, "logits/rejected": -3.004687547683716, "logps/chosen": -353.3999938964844, "logps/rejected": -468.95001220703125, "loss": 0.0188, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6058592796325684, "rewards/margins": 6.665625095367432, "rewards/rejected": -10.268750190734863, "step": 8750 }, { "epoch": 3.2981927710843375, "grad_norm": 26.344487585196827, "learning_rate": 1.7554593373493975e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.890625, "logps/chosen": -342.6000061035156, "logps/rejected": -460.3999938964844, "loss": 0.0187, "rewards/accuracies": 1.0, "rewards/chosen": -3.766406297683716, "rewards/margins": 6.3984375, "rewards/rejected": -10.168749809265137, "step": 8760 }, { "epoch": 3.3019578313253013, "grad_norm": 23.68581877312385, "learning_rate": 1.7460466867469878e-07, "logits/chosen": -2.903125047683716, "logits/rejected": -2.977343797683716, "logps/chosen": -396.3500061035156, "logps/rejected": -469.29998779296875, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -3.8695311546325684, "rewards/margins": 6.5546875, "rewards/rejected": -10.428125381469727, "step": 8770 }, { "epoch": 3.305722891566265, "grad_norm": 52.28199434643924, "learning_rate": 1.7366340361445783e-07, "logits/chosen": -2.942187547683716, "logits/rejected": -2.964062452316284, "logps/chosen": -352.7250061035156, "logps/rejected": -434.8500061035156, "loss": 0.0241, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2476563453674316, "rewards/margins": 6.417187690734863, "rewards/rejected": -9.668749809265137, "step": 8780 }, { "epoch": 3.309487951807229, "grad_norm": 15.51151637410237, "learning_rate": 1.7272213855421686e-07, "logits/chosen": -2.991406202316284, "logits/rejected": -2.9585938453674316, "logps/chosen": -368.1000061035156, "logps/rejected": -467.20001220703125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -3.4410157203674316, "rewards/margins": 7.0, "rewards/rejected": -10.440625190734863, "step": 8790 }, { "epoch": 3.3132530120481927, "grad_norm": 8.979714335639104, "learning_rate": 1.7178087349397591e-07, "logits/chosen": -2.9742188453674316, "logits/rejected": -2.953125, "logps/chosen": -351.29998779296875, "logps/rejected": -456.3999938964844, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.482421875, "rewards/margins": 6.9765625, "rewards/rejected": -10.457812309265137, "step": 8800 }, { "epoch": 3.3170180722891565, "grad_norm": 9.683707296044199, "learning_rate": 1.7083960843373494e-07, "logits/chosen": -2.8984375, "logits/rejected": -2.98046875, "logps/chosen": -385.29998779296875, "logps/rejected": -508.3999938964844, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -3.557812452316284, "rewards/margins": 6.831250190734863, "rewards/rejected": -10.387499809265137, "step": 8810 }, { "epoch": 3.3207831325301207, "grad_norm": 21.697322200734657, "learning_rate": 1.6989834337349397e-07, "logits/chosen": -2.9437499046325684, "logits/rejected": -2.9945311546325684, "logps/chosen": -354.04998779296875, "logps/rejected": -448.25, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": -3.4332032203674316, "rewards/margins": 6.782812595367432, "rewards/rejected": -10.215624809265137, "step": 8820 }, { "epoch": 3.3245481927710845, "grad_norm": 18.698405293680043, "learning_rate": 1.6895707831325302e-07, "logits/chosen": -2.831249952316284, "logits/rejected": -2.9453125, "logps/chosen": -428.79998779296875, "logps/rejected": -524.5999755859375, "loss": 0.0195, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.793750047683716, "rewards/margins": 6.8125, "rewards/rejected": -10.612500190734863, "step": 8830 }, { "epoch": 3.3283132530120483, "grad_norm": 6.088746815519931, "learning_rate": 1.6801581325301205e-07, "logits/chosen": -2.9351563453674316, "logits/rejected": -2.887500047683716, "logps/chosen": -364.25, "logps/rejected": -462.79998779296875, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -3.100390672683716, "rewards/margins": 6.831250190734863, "rewards/rejected": -9.934374809265137, "step": 8840 }, { "epoch": 3.332078313253012, "grad_norm": 3.6223677919632156, "learning_rate": 1.6707454819277107e-07, "logits/chosen": -2.8984375, "logits/rejected": -2.9859375953674316, "logps/chosen": -401.8500061035156, "logps/rejected": -502.8999938964844, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -3.57421875, "rewards/margins": 7.060937404632568, "rewards/rejected": -10.639062881469727, "step": 8850 }, { "epoch": 3.335843373493976, "grad_norm": 6.412417972347709, "learning_rate": 1.661332831325301e-07, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.957812547683716, "logps/chosen": -401.8500061035156, "logps/rejected": -463.20001220703125, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -3.0199217796325684, "rewards/margins": 6.790625095367432, "rewards/rejected": -9.809374809265137, "step": 8860 }, { "epoch": 3.3396084337349397, "grad_norm": 13.60302667584383, "learning_rate": 1.6519201807228913e-07, "logits/chosen": -2.9937500953674316, "logits/rejected": -3.026562452316284, "logps/chosen": -311.6000061035156, "logps/rejected": -400.29998779296875, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -2.5972657203674316, "rewards/margins": 6.28125, "rewards/rejected": -8.876562118530273, "step": 8870 }, { "epoch": 3.3433734939759034, "grad_norm": 2.6720012005641953, "learning_rate": 1.6425075301204818e-07, "logits/chosen": -2.999218702316284, "logits/rejected": -3.046875, "logps/chosen": -424.20001220703125, "logps/rejected": -454.20001220703125, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -3.0640625953674316, "rewards/margins": 6.568749904632568, "rewards/rejected": -9.6328125, "step": 8880 }, { "epoch": 3.3471385542168672, "grad_norm": 16.416683078876932, "learning_rate": 1.633094879518072e-07, "logits/chosen": -2.87890625, "logits/rejected": -2.88671875, "logps/chosen": -370.0, "logps/rejected": -459.79998779296875, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.232421875, "rewards/margins": 6.420312404632568, "rewards/rejected": -9.6484375, "step": 8890 }, { "epoch": 3.3509036144578315, "grad_norm": 7.749944427179234, "learning_rate": 1.6236822289156626e-07, "logits/chosen": -2.8695311546325684, "logits/rejected": -2.940624952316284, "logps/chosen": -331.54998779296875, "logps/rejected": -420.29998779296875, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.0707030296325684, "rewards/margins": 6.810937404632568, "rewards/rejected": -9.8828125, "step": 8900 }, { "epoch": 3.3546686746987953, "grad_norm": 3.596543182416036, "learning_rate": 1.6142695783132529e-07, "logits/chosen": -2.8734374046325684, "logits/rejected": -2.9156250953674316, "logps/chosen": -379.3999938964844, "logps/rejected": -474.3999938964844, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -3.39453125, "rewards/margins": 6.8671875, "rewards/rejected": -10.260937690734863, "step": 8910 }, { "epoch": 3.358433734939759, "grad_norm": 5.662215624262561, "learning_rate": 1.6048569277108434e-07, "logits/chosen": -2.9078125953674316, "logits/rejected": -2.9546875953674316, "logps/chosen": -367.95001220703125, "logps/rejected": -479.1000061035156, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.567187547683716, "rewards/margins": 6.704687595367432, "rewards/rejected": -10.276562690734863, "step": 8920 }, { "epoch": 3.362198795180723, "grad_norm": 5.667599222753553, "learning_rate": 1.5954442771084337e-07, "logits/chosen": -3.012500047683716, "logits/rejected": -3.1109375953674316, "logps/chosen": -342.6499938964844, "logps/rejected": -448.6499938964844, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -3.432421922683716, "rewards/margins": 6.529687404632568, "rewards/rejected": -9.962499618530273, "step": 8930 }, { "epoch": 3.3659638554216866, "grad_norm": 6.809275235993305, "learning_rate": 1.586031626506024e-07, "logits/chosen": -2.999218702316284, "logits/rejected": -3.0453124046325684, "logps/chosen": -384.1499938964844, "logps/rejected": -489.3999938964844, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -3.142578125, "rewards/margins": 6.690625190734863, "rewards/rejected": -9.834375381469727, "step": 8940 }, { "epoch": 3.3697289156626504, "grad_norm": 32.43141493188418, "learning_rate": 1.5766189759036145e-07, "logits/chosen": -2.9867186546325684, "logits/rejected": -3.0289063453674316, "logps/chosen": -411.70001220703125, "logps/rejected": -487.1000061035156, "loss": 0.0156, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6820311546325684, "rewards/margins": 7.095312595367432, "rewards/rejected": -10.7734375, "step": 8950 }, { "epoch": 3.3734939759036147, "grad_norm": 21.598311139159872, "learning_rate": 1.5672063253012047e-07, "logits/chosen": -2.9906249046325684, "logits/rejected": -3.164843797683716, "logps/chosen": -413.1499938964844, "logps/rejected": -495.0, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -3.4515624046325684, "rewards/margins": 7.021874904632568, "rewards/rejected": -10.478124618530273, "step": 8960 }, { "epoch": 3.3772590361445785, "grad_norm": 20.411842168870095, "learning_rate": 1.5577936746987953e-07, "logits/chosen": -2.9234375953674316, "logits/rejected": -2.989062547683716, "logps/chosen": -381.79998779296875, "logps/rejected": -455.0, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -3.921875, "rewards/margins": 6.7109375, "rewards/rejected": -10.637499809265137, "step": 8970 }, { "epoch": 3.3810240963855422, "grad_norm": 5.3487783935746664, "learning_rate": 1.5483810240963855e-07, "logits/chosen": -3.0374999046325684, "logits/rejected": -3.0062499046325684, "logps/chosen": -379.1000061035156, "logps/rejected": -504.0, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -3.5335936546325684, "rewards/margins": 7.203125, "rewards/rejected": -10.734375, "step": 8980 }, { "epoch": 3.384789156626506, "grad_norm": 16.492064750254553, "learning_rate": 1.538968373493976e-07, "logits/chosen": -2.922656297683716, "logits/rejected": -2.992968797683716, "logps/chosen": -378.25, "logps/rejected": -478.3999938964844, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": -3.389843702316284, "rewards/margins": 7.185937404632568, "rewards/rejected": -10.574999809265137, "step": 8990 }, { "epoch": 3.38855421686747, "grad_norm": 2.5775670313293246, "learning_rate": 1.5295557228915663e-07, "logits/chosen": -2.9625000953674316, "logits/rejected": -3.0179686546325684, "logps/chosen": -366.8999938964844, "logps/rejected": -449.8999938964844, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3148436546325684, "rewards/margins": 6.753125190734863, "rewards/rejected": -10.0703125, "step": 9000 }, { "epoch": 3.3923192771084336, "grad_norm": 2.4958766972316075, "learning_rate": 1.5201430722891563e-07, "logits/chosen": -2.9828124046325684, "logits/rejected": -2.964062452316284, "logps/chosen": -365.6499938964844, "logps/rejected": -475.29998779296875, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -3.2265625, "rewards/margins": 6.925000190734863, "rewards/rejected": -10.149999618530273, "step": 9010 }, { "epoch": 3.3960843373493974, "grad_norm": 6.49368730213055, "learning_rate": 1.5107304216867469e-07, "logits/chosen": -2.9046874046325684, "logits/rejected": -3.028125047683716, "logps/chosen": -353.6000061035156, "logps/rejected": -437.79998779296875, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.5492186546325684, "rewards/margins": 7.057812690734863, "rewards/rejected": -10.603124618530273, "step": 9020 }, { "epoch": 3.399849397590361, "grad_norm": 3.9360485883311105, "learning_rate": 1.501317771084337e-07, "logits/chosen": -3.124218702316284, "logits/rejected": -3.0523438453674316, "logps/chosen": -433.0, "logps/rejected": -574.0, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.547656297683716, "rewards/margins": 7.6875, "rewards/rejected": -11.240625381469727, "step": 9030 }, { "epoch": 3.4036144578313254, "grad_norm": 7.900597383735534, "learning_rate": 1.4919051204819277e-07, "logits/chosen": -3.059375047683716, "logits/rejected": -3.149218797683716, "logps/chosen": -381.25, "logps/rejected": -481.95001220703125, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -3.858593702316284, "rewards/margins": 7.1328125, "rewards/rejected": -11.0, "step": 9040 }, { "epoch": 3.4073795180722892, "grad_norm": 10.657196054709473, "learning_rate": 1.482492469879518e-07, "logits/chosen": -2.870312452316284, "logits/rejected": -2.967968702316284, "logps/chosen": -335.75, "logps/rejected": -427.3500061035156, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -3.961718797683716, "rewards/margins": 6.881249904632568, "rewards/rejected": -10.850000381469727, "step": 9050 }, { "epoch": 3.411144578313253, "grad_norm": 34.55418535571039, "learning_rate": 1.4730798192771085e-07, "logits/chosen": -3.0289063453674316, "logits/rejected": -3.1015625, "logps/chosen": -385.04998779296875, "logps/rejected": -494.5, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.840625047683716, "rewards/margins": 7.228125095367432, "rewards/rejected": -11.0625, "step": 9060 }, { "epoch": 3.414909638554217, "grad_norm": 13.222980212319845, "learning_rate": 1.4636671686746987e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -3.0640625953674316, "logps/chosen": -369.8500061035156, "logps/rejected": -439.1000061035156, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.375, "rewards/margins": 6.790625095367432, "rewards/rejected": -10.171875, "step": 9070 }, { "epoch": 3.4186746987951806, "grad_norm": 1.5727717447152576, "learning_rate": 1.454254518072289e-07, "logits/chosen": -2.9468750953674316, "logits/rejected": -2.9625000953674316, "logps/chosen": -363.5, "logps/rejected": -456.6000061035156, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -3.42578125, "rewards/margins": 7.254687309265137, "rewards/rejected": -10.682812690734863, "step": 9080 }, { "epoch": 3.4224397590361444, "grad_norm": 8.51462544376336, "learning_rate": 1.4448418674698795e-07, "logits/chosen": -2.9859375953674316, "logits/rejected": -3.03515625, "logps/chosen": -399.95001220703125, "logps/rejected": -524.5999755859375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -3.788281202316284, "rewards/margins": 7.318749904632568, "rewards/rejected": -11.109375, "step": 9090 }, { "epoch": 3.4262048192771086, "grad_norm": 10.126706925633835, "learning_rate": 1.4354292168674698e-07, "logits/chosen": -2.9671874046325684, "logits/rejected": -3.047656297683716, "logps/chosen": -364.3999938964844, "logps/rejected": -465.1000061035156, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.5804686546325684, "rewards/margins": 6.9296875, "rewards/rejected": -10.5, "step": 9100 }, { "epoch": 3.4299698795180724, "grad_norm": 4.598515652712798, "learning_rate": 1.4260165662650603e-07, "logits/chosen": -2.98828125, "logits/rejected": -3.0453124046325684, "logps/chosen": -376.8500061035156, "logps/rejected": -464.70001220703125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -4.115624904632568, "rewards/margins": 6.707812309265137, "rewards/rejected": -10.831250190734863, "step": 9110 }, { "epoch": 3.433734939759036, "grad_norm": 3.989946065087293, "learning_rate": 1.4166039156626506e-07, "logits/chosen": -2.98046875, "logits/rejected": -2.9281249046325684, "logps/chosen": -345.6499938964844, "logps/rejected": -442.1499938964844, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.74609375, "rewards/margins": 7.237500190734863, "rewards/rejected": -10.987500190734863, "step": 9120 }, { "epoch": 3.4375, "grad_norm": 16.698688865713397, "learning_rate": 1.4071912650602409e-07, "logits/chosen": -3.0015625953674316, "logits/rejected": -3.12890625, "logps/chosen": -415.375, "logps/rejected": -463.20001220703125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.6234374046325684, "rewards/margins": 6.954687595367432, "rewards/rejected": -10.574999809265137, "step": 9130 }, { "epoch": 3.441265060240964, "grad_norm": 7.615325322367894, "learning_rate": 1.3977786144578314e-07, "logits/chosen": -2.999218702316284, "logits/rejected": -3.075000047683716, "logps/chosen": -390.20001220703125, "logps/rejected": -527.7000122070312, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -4.167187690734863, "rewards/margins": 6.932812690734863, "rewards/rejected": -11.109375, "step": 9140 }, { "epoch": 3.4450301204819276, "grad_norm": 27.693546896035695, "learning_rate": 1.3883659638554216e-07, "logits/chosen": -3.0414061546325684, "logits/rejected": -3.112499952316284, "logps/chosen": -384.75, "logps/rejected": -474.29998779296875, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -3.6031250953674316, "rewards/margins": 6.928124904632568, "rewards/rejected": -10.537500381469727, "step": 9150 }, { "epoch": 3.4487951807228914, "grad_norm": 6.0997759490755135, "learning_rate": 1.3789533132530122e-07, "logits/chosen": -2.9585938453674316, "logits/rejected": -2.85546875, "logps/chosen": -354.0, "logps/rejected": -460.8999938964844, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -3.499218702316284, "rewards/margins": 6.762499809265137, "rewards/rejected": -10.262499809265137, "step": 9160 }, { "epoch": 3.4525602409638556, "grad_norm": 2.8250213904529065, "learning_rate": 1.3695406626506022e-07, "logits/chosen": -2.9085936546325684, "logits/rejected": -2.977343797683716, "logps/chosen": -390.29998779296875, "logps/rejected": -454.0, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": -2.9058594703674316, "rewards/margins": 7.014062404632568, "rewards/rejected": -9.915624618530273, "step": 9170 }, { "epoch": 3.4563253012048194, "grad_norm": 7.712550065810043, "learning_rate": 1.3601280120481927e-07, "logits/chosen": -2.91796875, "logits/rejected": -2.9632811546325684, "logps/chosen": -364.45001220703125, "logps/rejected": -460.3999938964844, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -3.3773436546325684, "rewards/margins": 7.342187404632568, "rewards/rejected": -10.721875190734863, "step": 9180 }, { "epoch": 3.460090361445783, "grad_norm": 9.062614125586212, "learning_rate": 1.350715361445783e-07, "logits/chosen": -2.983593702316284, "logits/rejected": -3.0062499046325684, "logps/chosen": -336.3500061035156, "logps/rejected": -429.29998779296875, "loss": 0.0265, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.414843797683716, "rewards/margins": 6.846875190734863, "rewards/rejected": -10.264062881469727, "step": 9190 }, { "epoch": 3.463855421686747, "grad_norm": 10.211246808640446, "learning_rate": 1.3413027108433732e-07, "logits/chosen": -2.9781250953674316, "logits/rejected": -3.010937452316284, "logps/chosen": -395.75, "logps/rejected": -459.8999938964844, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.5703125, "rewards/margins": 7.201562404632568, "rewards/rejected": -10.776562690734863, "step": 9200 }, { "epoch": 3.4676204819277108, "grad_norm": 8.091059060975327, "learning_rate": 1.3318900602409638e-07, "logits/chosen": -2.9398436546325684, "logits/rejected": -2.971874952316284, "logps/chosen": -360.1499938964844, "logps/rejected": -438.20001220703125, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": -3.334765672683716, "rewards/margins": 6.715624809265137, "rewards/rejected": -10.046875, "step": 9210 }, { "epoch": 3.4713855421686746, "grad_norm": 13.842588339434508, "learning_rate": 1.322477409638554e-07, "logits/chosen": -3.0250000953674316, "logits/rejected": -3.0492186546325684, "logps/chosen": -388.95001220703125, "logps/rejected": -496.8999938964844, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -3.960156202316284, "rewards/margins": 6.8671875, "rewards/rejected": -10.829687118530273, "step": 9220 }, { "epoch": 3.475150602409639, "grad_norm": 44.872726904784436, "learning_rate": 1.3130647590361446e-07, "logits/chosen": -2.957812547683716, "logits/rejected": -3.03125, "logps/chosen": -359.79998779296875, "logps/rejected": -452.29998779296875, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -3.405468702316284, "rewards/margins": 6.684374809265137, "rewards/rejected": -10.082812309265137, "step": 9230 }, { "epoch": 3.4789156626506026, "grad_norm": 15.20256135566823, "learning_rate": 1.3036521084337348e-07, "logits/chosen": -2.9437499046325684, "logits/rejected": -2.938281297683716, "logps/chosen": -353.04998779296875, "logps/rejected": -485.6000061035156, "loss": 0.0234, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2660155296325684, "rewards/margins": 6.817187309265137, "rewards/rejected": -10.090624809265137, "step": 9240 }, { "epoch": 3.4826807228915664, "grad_norm": 18.00842258629746, "learning_rate": 1.2942394578313254e-07, "logits/chosen": -3.039843797683716, "logits/rejected": -3.057812452316284, "logps/chosen": -387.0, "logps/rejected": -451.0, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": -3.57421875, "rewards/margins": 6.454687595367432, "rewards/rejected": -10.029687881469727, "step": 9250 }, { "epoch": 3.48644578313253, "grad_norm": 5.80079806795706, "learning_rate": 1.2848268072289156e-07, "logits/chosen": -2.936718702316284, "logits/rejected": -2.8968749046325684, "logps/chosen": -393.29998779296875, "logps/rejected": -502.8999938964844, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -3.65234375, "rewards/margins": 7.084374904632568, "rewards/rejected": -10.731249809265137, "step": 9260 }, { "epoch": 3.490210843373494, "grad_norm": 15.007300412529665, "learning_rate": 1.275414156626506e-07, "logits/chosen": -2.969531297683716, "logits/rejected": -2.9593749046325684, "logps/chosen": -394.45001220703125, "logps/rejected": -475.6000061035156, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": -3.232421875, "rewards/margins": 7.279687404632568, "rewards/rejected": -10.512499809265137, "step": 9270 }, { "epoch": 3.4939759036144578, "grad_norm": 7.230867885277974, "learning_rate": 1.2660015060240964e-07, "logits/chosen": -3.016406297683716, "logits/rejected": -2.9156250953674316, "logps/chosen": -350.79998779296875, "logps/rejected": -487.0, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -3.415234327316284, "rewards/margins": 7.165625095367432, "rewards/rejected": -10.584375381469727, "step": 9280 }, { "epoch": 3.4977409638554215, "grad_norm": 10.37386247423026, "learning_rate": 1.2565888554216867e-07, "logits/chosen": -2.9437499046325684, "logits/rejected": -2.983593702316284, "logps/chosen": -391.8500061035156, "logps/rejected": -487.0, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -3.5390625, "rewards/margins": 6.809374809265137, "rewards/rejected": -10.350000381469727, "step": 9290 }, { "epoch": 3.5015060240963853, "grad_norm": 29.520801828684508, "learning_rate": 1.247176204819277e-07, "logits/chosen": -2.8843750953674316, "logits/rejected": -2.987499952316284, "logps/chosen": -420.5, "logps/rejected": -504.8999938964844, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -3.774218797683716, "rewards/margins": 7.073437690734863, "rewards/rejected": -10.837499618530273, "step": 9300 }, { "epoch": 3.505271084337349, "grad_norm": 2.083103068600227, "learning_rate": 1.2377635542168675e-07, "logits/chosen": -2.909374952316284, "logits/rejected": -2.9898438453674316, "logps/chosen": -342.1000061035156, "logps/rejected": -436.75, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -3.6429686546325684, "rewards/margins": 6.721875190734863, "rewards/rejected": -10.371874809265137, "step": 9310 }, { "epoch": 3.5090361445783134, "grad_norm": 56.099918970031204, "learning_rate": 1.2283509036144578e-07, "logits/chosen": -2.9507813453674316, "logits/rejected": -2.9546875953674316, "logps/chosen": -401.8500061035156, "logps/rejected": -503.70001220703125, "loss": 0.0192, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3550782203674316, "rewards/margins": 6.967187404632568, "rewards/rejected": -10.3125, "step": 9320 }, { "epoch": 3.512801204819277, "grad_norm": 49.70713916631745, "learning_rate": 1.2189382530120483e-07, "logits/chosen": -2.9273438453674316, "logits/rejected": -3.046875, "logps/chosen": -397.8999938964844, "logps/rejected": -463.79998779296875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": -3.457812547683716, "rewards/margins": 7.207812309265137, "rewards/rejected": -10.670312881469727, "step": 9330 }, { "epoch": 3.516566265060241, "grad_norm": 3.059696295925203, "learning_rate": 1.2095256024096386e-07, "logits/chosen": -3.0640625953674316, "logits/rejected": -3.07421875, "logps/chosen": -330.32501220703125, "logps/rejected": -441.5, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2386717796325684, "rewards/margins": 6.970312595367432, "rewards/rejected": -10.212499618530273, "step": 9340 }, { "epoch": 3.5203313253012047, "grad_norm": 4.477833537314691, "learning_rate": 1.2001129518072288e-07, "logits/chosen": -2.918750047683716, "logits/rejected": -2.9593749046325684, "logps/chosen": -357.8999938964844, "logps/rejected": -486.1000061035156, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.3843750953674316, "rewards/margins": 7.003125190734863, "rewards/rejected": -10.389062881469727, "step": 9350 }, { "epoch": 3.5240963855421685, "grad_norm": 8.733366316263007, "learning_rate": 1.1907003012048192e-07, "logits/chosen": -2.9742188453674316, "logits/rejected": -2.9867186546325684, "logps/chosen": -343.1000061035156, "logps/rejected": -466.8999938964844, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -3.401171922683716, "rewards/margins": 7.217187404632568, "rewards/rejected": -10.614062309265137, "step": 9360 }, { "epoch": 3.5278614457831328, "grad_norm": 5.038520664154948, "learning_rate": 1.1812876506024095e-07, "logits/chosen": -3.015625, "logits/rejected": -2.952343702316284, "logps/chosen": -379.29998779296875, "logps/rejected": -490.54998779296875, "loss": 0.0163, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.223828077316284, "rewards/margins": 6.682812690734863, "rewards/rejected": -9.907812118530273, "step": 9370 }, { "epoch": 3.5316265060240966, "grad_norm": 4.330696792139867, "learning_rate": 1.1718749999999999e-07, "logits/chosen": -2.907031297683716, "logits/rejected": -2.9945311546325684, "logps/chosen": -336.8999938964844, "logps/rejected": -490.6000061035156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.1957030296325684, "rewards/margins": 7.479687690734863, "rewards/rejected": -10.668749809265137, "step": 9380 }, { "epoch": 3.5353915662650603, "grad_norm": 8.46454724991022, "learning_rate": 1.1624623493975903e-07, "logits/chosen": -2.9046874046325684, "logits/rejected": -2.9585938453674316, "logps/chosen": -370.3500061035156, "logps/rejected": -473.1000061035156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.55078125, "rewards/margins": 6.981249809265137, "rewards/rejected": -10.528124809265137, "step": 9390 }, { "epoch": 3.539156626506024, "grad_norm": 18.13846939165654, "learning_rate": 1.1530496987951807e-07, "logits/chosen": -2.918750047683716, "logits/rejected": -2.8617186546325684, "logps/chosen": -365.04998779296875, "logps/rejected": -505.20001220703125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.430468797683716, "rewards/margins": 7.345312595367432, "rewards/rejected": -10.787500381469727, "step": 9400 }, { "epoch": 3.542921686746988, "grad_norm": 11.070599089044228, "learning_rate": 1.1436370481927711e-07, "logits/chosen": -2.90234375, "logits/rejected": -2.9039063453674316, "logps/chosen": -357.45001220703125, "logps/rejected": -439.1000061035156, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.2890625, "rewards/margins": 6.815625190734863, "rewards/rejected": -10.107812881469727, "step": 9410 }, { "epoch": 3.5466867469879517, "grad_norm": 46.05303817743823, "learning_rate": 1.1342243975903614e-07, "logits/chosen": -2.940624952316284, "logits/rejected": -2.9546875953674316, "logps/chosen": -345.79998779296875, "logps/rejected": -444.0, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3414063453674316, "rewards/margins": 7.168749809265137, "rewards/rejected": -10.509374618530273, "step": 9420 }, { "epoch": 3.5504518072289155, "grad_norm": 13.209323351597215, "learning_rate": 1.1248117469879518e-07, "logits/chosen": -2.88671875, "logits/rejected": -2.95703125, "logps/chosen": -358.1499938964844, "logps/rejected": -446.8999938964844, "loss": 0.0222, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1787109375, "rewards/margins": 6.779687404632568, "rewards/rejected": -9.953125, "step": 9430 }, { "epoch": 3.5542168674698793, "grad_norm": 3.0210082157214795, "learning_rate": 1.1153990963855422e-07, "logits/chosen": -2.996875047683716, "logits/rejected": -2.999218702316284, "logps/chosen": -333.8500061035156, "logps/rejected": -423.70001220703125, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -3.382031202316284, "rewards/margins": 7.003125190734863, "rewards/rejected": -10.389062881469727, "step": 9440 }, { "epoch": 3.5579819277108435, "grad_norm": 10.657406283337089, "learning_rate": 1.1059864457831326e-07, "logits/chosen": -3.0960936546325684, "logits/rejected": -3.077343702316284, "logps/chosen": -386.5, "logps/rejected": -477.5, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -3.5257811546325684, "rewards/margins": 6.462500095367432, "rewards/rejected": -9.984375, "step": 9450 }, { "epoch": 3.5617469879518073, "grad_norm": 4.136566438764674, "learning_rate": 1.0965737951807228e-07, "logits/chosen": -2.9632811546325684, "logits/rejected": -3.035937547683716, "logps/chosen": -333.1499938964844, "logps/rejected": -432.04998779296875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -3.344921827316284, "rewards/margins": 6.582812309265137, "rewards/rejected": -9.923437118530273, "step": 9460 }, { "epoch": 3.565512048192771, "grad_norm": 34.53161459263672, "learning_rate": 1.0871611445783132e-07, "logits/chosen": -2.8167967796325684, "logits/rejected": -2.921875, "logps/chosen": -345.8500061035156, "logps/rejected": -472.6000061035156, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.2171874046325684, "rewards/margins": 7.120312690734863, "rewards/rejected": -10.346875190734863, "step": 9470 }, { "epoch": 3.569277108433735, "grad_norm": 7.625445808462877, "learning_rate": 1.0777484939759035e-07, "logits/chosen": -2.9898438453674316, "logits/rejected": -3.035937547683716, "logps/chosen": -389.95001220703125, "logps/rejected": -488.25, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -2.794140577316284, "rewards/margins": 7.293749809265137, "rewards/rejected": -10.090624809265137, "step": 9480 }, { "epoch": 3.5730421686746987, "grad_norm": 15.634765725753903, "learning_rate": 1.0683358433734939e-07, "logits/chosen": -2.9242186546325684, "logits/rejected": -2.9867186546325684, "logps/chosen": -414.20001220703125, "logps/rejected": -500.0, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.3671875, "rewards/margins": 6.901562690734863, "rewards/rejected": -10.265625, "step": 9490 }, { "epoch": 3.5768072289156625, "grad_norm": 27.91619224258925, "learning_rate": 1.0589231927710843e-07, "logits/chosen": -2.913281202316284, "logits/rejected": -2.90625, "logps/chosen": -329.95001220703125, "logps/rejected": -470.70001220703125, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -3.3929686546325684, "rewards/margins": 7.123437404632568, "rewards/rejected": -10.512499809265137, "step": 9500 }, { "epoch": 3.5805722891566267, "grad_norm": 10.270142642199762, "learning_rate": 1.0495105421686747e-07, "logits/chosen": -2.9312500953674316, "logits/rejected": -2.928906202316284, "logps/chosen": -385.6499938964844, "logps/rejected": -502.79998779296875, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": -3.389453172683716, "rewards/margins": 6.790625095367432, "rewards/rejected": -10.170312881469727, "step": 9510 }, { "epoch": 3.5843373493975905, "grad_norm": 7.016880631193724, "learning_rate": 1.0400978915662651e-07, "logits/chosen": -3.065624952316284, "logits/rejected": -3.01171875, "logps/chosen": -346.2749938964844, "logps/rejected": -446.54998779296875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.53125, "rewards/margins": 6.815625190734863, "rewards/rejected": -10.346875190734863, "step": 9520 }, { "epoch": 3.5881024096385543, "grad_norm": 22.07202254674331, "learning_rate": 1.0306852409638555e-07, "logits/chosen": -2.9703125953674316, "logits/rejected": -2.94921875, "logps/chosen": -393.3999938964844, "logps/rejected": -527.5, "loss": 0.0203, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7203125953674316, "rewards/margins": 7.115624904632568, "rewards/rejected": -10.831250190734863, "step": 9530 }, { "epoch": 3.591867469879518, "grad_norm": 10.26812192867134, "learning_rate": 1.0212725903614456e-07, "logits/chosen": -2.870312452316284, "logits/rejected": -2.867968797683716, "logps/chosen": -344.95001220703125, "logps/rejected": -439.5, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.3656249046325684, "rewards/margins": 7.079687595367432, "rewards/rejected": -10.440625190734863, "step": 9540 }, { "epoch": 3.595632530120482, "grad_norm": 11.155801595228278, "learning_rate": 1.011859939759036e-07, "logits/chosen": -2.9515624046325684, "logits/rejected": -3.063281297683716, "logps/chosen": -356.29998779296875, "logps/rejected": -458.29998779296875, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.578125, "rewards/margins": 6.817187309265137, "rewards/rejected": -10.403124809265137, "step": 9550 }, { "epoch": 3.5993975903614457, "grad_norm": 3.516471924909327, "learning_rate": 1.0024472891566264e-07, "logits/chosen": -2.9164061546325684, "logits/rejected": -2.965625047683716, "logps/chosen": -356.8999938964844, "logps/rejected": -470.6000061035156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.2718749046325684, "rewards/margins": 6.901562690734863, "rewards/rejected": -10.171875, "step": 9560 }, { "epoch": 3.6031626506024095, "grad_norm": 10.470791093770439, "learning_rate": 9.930346385542168e-08, "logits/chosen": -2.8921875953674316, "logits/rejected": -3.06640625, "logps/chosen": -405.54998779296875, "logps/rejected": -438.79998779296875, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -3.2635741233825684, "rewards/margins": 6.590624809265137, "rewards/rejected": -9.854687690734863, "step": 9570 }, { "epoch": 3.6069277108433733, "grad_norm": 6.699699150476468, "learning_rate": 9.836219879518072e-08, "logits/chosen": -2.983593702316284, "logits/rejected": -3.071093797683716, "logps/chosen": -388.0, "logps/rejected": -475.79998779296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.5625, "rewards/margins": 6.957812309265137, "rewards/rejected": -10.520312309265137, "step": 9580 }, { "epoch": 3.6106927710843375, "grad_norm": 16.012446310632427, "learning_rate": 9.742093373493976e-08, "logits/chosen": -2.8218750953674316, "logits/rejected": -3.0335936546325684, "logps/chosen": -392.75, "logps/rejected": -477.20001220703125, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -3.350390672683716, "rewards/margins": 6.885937690734863, "rewards/rejected": -10.234375, "step": 9590 }, { "epoch": 3.6144578313253013, "grad_norm": 6.31472172650331, "learning_rate": 9.64796686746988e-08, "logits/chosen": -2.965625047683716, "logits/rejected": -2.961718797683716, "logps/chosen": -401.04998779296875, "logps/rejected": -519.9000244140625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.228515625, "rewards/margins": 7.573437690734863, "rewards/rejected": -10.796875, "step": 9600 }, { "epoch": 3.618222891566265, "grad_norm": 5.423217765634736, "learning_rate": 9.553840361445783e-08, "logits/chosen": -2.9164061546325684, "logits/rejected": -2.871875047683716, "logps/chosen": -386.6000061035156, "logps/rejected": -515.0999755859375, "loss": 0.0216, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.700000047683716, "rewards/margins": 6.875, "rewards/rejected": -10.565625190734863, "step": 9610 }, { "epoch": 3.621987951807229, "grad_norm": 2.5003690695081238, "learning_rate": 9.459713855421685e-08, "logits/chosen": -2.9554686546325684, "logits/rejected": -2.9593749046325684, "logps/chosen": -394.0, "logps/rejected": -494.70001220703125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -3.23828125, "rewards/margins": 6.809374809265137, "rewards/rejected": -10.045312881469727, "step": 9620 }, { "epoch": 3.6257530120481927, "grad_norm": 49.52752049601757, "learning_rate": 9.36558734939759e-08, "logits/chosen": -2.9320311546325684, "logits/rejected": -2.9937500953674316, "logps/chosen": -416.6000061035156, "logps/rejected": -488.5, "loss": 0.0157, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.747851610183716, "rewards/margins": 6.790625095367432, "rewards/rejected": -9.534375190734863, "step": 9630 }, { "epoch": 3.6295180722891565, "grad_norm": 1.9745380218888964, "learning_rate": 9.271460843373493e-08, "logits/chosen": -2.9859375953674316, "logits/rejected": -2.9976563453674316, "logps/chosen": -419.04998779296875, "logps/rejected": -532.7000122070312, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -3.1695313453674316, "rewards/margins": 7.321875095367432, "rewards/rejected": -10.493749618530273, "step": 9640 }, { "epoch": 3.6332831325301207, "grad_norm": 14.228377590494329, "learning_rate": 9.177334337349397e-08, "logits/chosen": -2.984375, "logits/rejected": -2.9781250953674316, "logps/chosen": -377.3999938964844, "logps/rejected": -483.70001220703125, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.308398485183716, "rewards/margins": 6.893750190734863, "rewards/rejected": -10.198437690734863, "step": 9650 }, { "epoch": 3.6370481927710845, "grad_norm": 1.3834373710001902, "learning_rate": 9.083207831325301e-08, "logits/chosen": -2.883593797683716, "logits/rejected": -2.944531202316284, "logps/chosen": -355.8999938964844, "logps/rejected": -468.79998779296875, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.337890625, "rewards/margins": 7.478125095367432, "rewards/rejected": -10.818750381469727, "step": 9660 }, { "epoch": 3.6408132530120483, "grad_norm": 6.67109752992545, "learning_rate": 8.989081325301204e-08, "logits/chosen": -3.0171875953674316, "logits/rejected": -3.0687499046325684, "logps/chosen": -329.54998779296875, "logps/rejected": -396.6000061035156, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -3.496875047683716, "rewards/margins": 6.470312595367432, "rewards/rejected": -9.9609375, "step": 9670 }, { "epoch": 3.644578313253012, "grad_norm": 14.103335959715594, "learning_rate": 8.894954819277108e-08, "logits/chosen": -2.848437547683716, "logits/rejected": -2.9546875953674316, "logps/chosen": -439.8500061035156, "logps/rejected": -548.4000244140625, "loss": 0.0159, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5796875953674316, "rewards/margins": 6.959374904632568, "rewards/rejected": -10.543749809265137, "step": 9680 }, { "epoch": 3.648343373493976, "grad_norm": 5.089837145865527, "learning_rate": 8.800828313253012e-08, "logits/chosen": -2.8539061546325684, "logits/rejected": -2.8984375, "logps/chosen": -342.20001220703125, "logps/rejected": -450.1000061035156, "loss": 0.0299, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3343749046325684, "rewards/margins": 6.729687690734863, "rewards/rejected": -10.053125381469727, "step": 9690 }, { "epoch": 3.6521084337349397, "grad_norm": 5.714474072650423, "learning_rate": 8.706701807228915e-08, "logits/chosen": -2.94921875, "logits/rejected": -3.046093702316284, "logps/chosen": -376.79998779296875, "logps/rejected": -451.04998779296875, "loss": 0.0157, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2828125953674316, "rewards/margins": 6.832812309265137, "rewards/rejected": -10.110937118530273, "step": 9700 }, { "epoch": 3.6558734939759034, "grad_norm": 1.0544844673187264, "learning_rate": 8.612575301204819e-08, "logits/chosen": -2.9242186546325684, "logits/rejected": -2.905078172683716, "logps/chosen": -349.20001220703125, "logps/rejected": -486.5, "loss": 0.0188, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.630859375, "rewards/margins": 7.571875095367432, "rewards/rejected": -11.203125, "step": 9710 }, { "epoch": 3.6596385542168672, "grad_norm": 3.848178535301144, "learning_rate": 8.518448795180723e-08, "logits/chosen": -2.995312452316284, "logits/rejected": -2.953125, "logps/chosen": -363.1499938964844, "logps/rejected": -500.1000061035156, "loss": 0.0141, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.721874952316284, "rewards/margins": 6.909375190734863, "rewards/rejected": -10.628125190734863, "step": 9720 }, { "epoch": 3.6634036144578315, "grad_norm": 31.967948175676135, "learning_rate": 8.424322289156627e-08, "logits/chosen": -2.9078125953674316, "logits/rejected": -2.9609375, "logps/chosen": -400.6000061035156, "logps/rejected": -475.5, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.4976563453674316, "rewards/margins": 6.704687595367432, "rewards/rejected": -10.199999809265137, "step": 9730 }, { "epoch": 3.6671686746987953, "grad_norm": 1.7900615967745617, "learning_rate": 8.33019578313253e-08, "logits/chosen": -2.946093797683716, "logits/rejected": -3.020312547683716, "logps/chosen": -434.6000061035156, "logps/rejected": -535.0, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -3.728515625, "rewards/margins": 7.473437309265137, "rewards/rejected": -11.203125, "step": 9740 }, { "epoch": 3.670933734939759, "grad_norm": 3.2015821987455064, "learning_rate": 8.236069277108433e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -2.9281249046325684, "logps/chosen": -363.04998779296875, "logps/rejected": -475.8999938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.606250047683716, "rewards/margins": 7.431250095367432, "rewards/rejected": -11.028124809265137, "step": 9750 }, { "epoch": 3.674698795180723, "grad_norm": 19.594655145858997, "learning_rate": 8.141942771084337e-08, "logits/chosen": -2.8960938453674316, "logits/rejected": -3.02734375, "logps/chosen": -377.29998779296875, "logps/rejected": -422.5, "loss": 0.0222, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5546875, "rewards/margins": 6.589062690734863, "rewards/rejected": -10.139062881469727, "step": 9760 }, { "epoch": 3.6784638554216866, "grad_norm": 7.893108752244451, "learning_rate": 8.047816265060241e-08, "logits/chosen": -2.967968702316284, "logits/rejected": -3.0023436546325684, "logps/chosen": -368.75, "logps/rejected": -460.54998779296875, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.571093797683716, "rewards/margins": 6.5859375, "rewards/rejected": -10.159375190734863, "step": 9770 }, { "epoch": 3.682228915662651, "grad_norm": 3.429019845354725, "learning_rate": 7.953689759036144e-08, "logits/chosen": -3.0804686546325684, "logits/rejected": -3.0445313453674316, "logps/chosen": -417.5, "logps/rejected": -516.0999755859375, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.994140625, "rewards/margins": 6.831250190734863, "rewards/rejected": -10.817187309265137, "step": 9780 }, { "epoch": 3.6859939759036147, "grad_norm": 21.85940240160595, "learning_rate": 7.859563253012048e-08, "logits/chosen": -3.034374952316284, "logits/rejected": -3.043750047683716, "logps/chosen": -395.1000061035156, "logps/rejected": -471.20001220703125, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -3.419921875, "rewards/margins": 7.09375, "rewards/rejected": -10.521875381469727, "step": 9790 }, { "epoch": 3.6897590361445785, "grad_norm": 36.73361632228114, "learning_rate": 7.76543674698795e-08, "logits/chosen": -2.9828124046325684, "logits/rejected": -3.0414061546325684, "logps/chosen": -361.95001220703125, "logps/rejected": -485.0, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.641796827316284, "rewards/margins": 6.981249809265137, "rewards/rejected": -10.623437881469727, "step": 9800 }, { "epoch": 3.6935240963855422, "grad_norm": 28.11727531872651, "learning_rate": 7.671310240963855e-08, "logits/chosen": -2.9749999046325684, "logits/rejected": -2.9390625953674316, "logps/chosen": -381.20001220703125, "logps/rejected": -475.79998779296875, "loss": 0.0238, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5093750953674316, "rewards/margins": 6.809374809265137, "rewards/rejected": -10.3203125, "step": 9810 }, { "epoch": 3.697289156626506, "grad_norm": 19.783992366329457, "learning_rate": 7.577183734939759e-08, "logits/chosen": -2.8828125, "logits/rejected": -3.05078125, "logps/chosen": -402.8999938964844, "logps/rejected": -481.70001220703125, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.4964842796325684, "rewards/margins": 6.895312309265137, "rewards/rejected": -10.393750190734863, "step": 9820 }, { "epoch": 3.70105421686747, "grad_norm": 10.11270769897374, "learning_rate": 7.483057228915663e-08, "logits/chosen": -2.9078125953674316, "logits/rejected": -2.9554686546325684, "logps/chosen": -344.8500061035156, "logps/rejected": -427.95001220703125, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -3.4429688453674316, "rewards/margins": 6.168749809265137, "rewards/rejected": -9.607812881469727, "step": 9830 }, { "epoch": 3.7048192771084336, "grad_norm": 17.817996213674604, "learning_rate": 7.388930722891567e-08, "logits/chosen": -2.964062452316284, "logits/rejected": -2.969531297683716, "logps/chosen": -376.6499938964844, "logps/rejected": -513.2000122070312, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -3.6624999046325684, "rewards/margins": 7.104687690734863, "rewards/rejected": -10.765625, "step": 9840 }, { "epoch": 3.7085843373493974, "grad_norm": 8.08442492747352, "learning_rate": 7.29480421686747e-08, "logits/chosen": -2.901562452316284, "logits/rejected": -2.936718702316284, "logps/chosen": -374.45001220703125, "logps/rejected": -477.70001220703125, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -3.0492186546325684, "rewards/margins": 7.057812690734863, "rewards/rejected": -10.100000381469727, "step": 9850 }, { "epoch": 3.712349397590361, "grad_norm": 10.757354082109297, "learning_rate": 7.200677710843372e-08, "logits/chosen": -2.9585938453674316, "logits/rejected": -3.042187452316284, "logps/chosen": -362.1499938964844, "logps/rejected": -449.1000061035156, "loss": 0.0154, "rewards/accuracies": 1.0, "rewards/chosen": -3.200000047683716, "rewards/margins": 7.403124809265137, "rewards/rejected": -10.600000381469727, "step": 9860 }, { "epoch": 3.7161144578313254, "grad_norm": 10.832835024643611, "learning_rate": 7.106551204819276e-08, "logits/chosen": -2.969531297683716, "logits/rejected": -2.984375, "logps/chosen": -328.6499938964844, "logps/rejected": -424.8999938964844, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -3.151171922683716, "rewards/margins": 6.735937595367432, "rewards/rejected": -9.887499809265137, "step": 9870 }, { "epoch": 3.7198795180722892, "grad_norm": 3.27243600732778, "learning_rate": 7.01242469879518e-08, "logits/chosen": -3.071093797683716, "logits/rejected": -3.098437547683716, "logps/chosen": -360.8500061035156, "logps/rejected": -495.20001220703125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -4.126953125, "rewards/margins": 6.871874809265137, "rewards/rejected": -10.987500190734863, "step": 9880 }, { "epoch": 3.723644578313253, "grad_norm": 3.073338937968815, "learning_rate": 6.918298192771084e-08, "logits/chosen": -3.055468797683716, "logits/rejected": -3.0843749046325684, "logps/chosen": -370.45001220703125, "logps/rejected": -511.20001220703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -3.767578125, "rewards/margins": 7.560937404632568, "rewards/rejected": -11.321874618530273, "step": 9890 }, { "epoch": 3.727409638554217, "grad_norm": 1.6467402116053247, "learning_rate": 6.824171686746988e-08, "logits/chosen": -3.018749952316284, "logits/rejected": -3.0484375953674316, "logps/chosen": -356.45001220703125, "logps/rejected": -473.20001220703125, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.617968797683716, "rewards/margins": 7.446875095367432, "rewards/rejected": -11.0625, "step": 9900 }, { "epoch": 3.7311746987951806, "grad_norm": 3.613637049624932, "learning_rate": 6.730045180722892e-08, "logits/chosen": -2.9609375, "logits/rejected": -2.91015625, "logps/chosen": -386.54998779296875, "logps/rejected": -515.4000244140625, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.592968702316284, "rewards/margins": 7.337500095367432, "rewards/rejected": -10.923437118530273, "step": 9910 }, { "epoch": 3.734939759036145, "grad_norm": 36.48152298020133, "learning_rate": 6.635918674698796e-08, "logits/chosen": -2.9390625953674316, "logits/rejected": -3.0570311546325684, "logps/chosen": -363.70001220703125, "logps/rejected": -487.29998779296875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.645312547683716, "rewards/margins": 7.324999809265137, "rewards/rejected": -10.96875, "step": 9920 }, { "epoch": 3.7387048192771086, "grad_norm": 31.394146323649082, "learning_rate": 6.541792168674699e-08, "logits/chosen": -3.05859375, "logits/rejected": -3.0570311546325684, "logps/chosen": -383.04998779296875, "logps/rejected": -478.04998779296875, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.41796875, "rewards/margins": 6.551562309265137, "rewards/rejected": -9.96875, "step": 9930 }, { "epoch": 3.7424698795180724, "grad_norm": 3.1957093767160822, "learning_rate": 6.447665662650601e-08, "logits/chosen": -2.870312452316284, "logits/rejected": -3.0687499046325684, "logps/chosen": -366.29998779296875, "logps/rejected": -468.6000061035156, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -3.4039063453674316, "rewards/margins": 6.832812309265137, "rewards/rejected": -10.231249809265137, "step": 9940 }, { "epoch": 3.746234939759036, "grad_norm": 15.25097760004395, "learning_rate": 6.353539156626505e-08, "logits/chosen": -3.0335936546325684, "logits/rejected": -3.090625047683716, "logps/chosen": -405.6000061035156, "logps/rejected": -490.8999938964844, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -3.641406297683716, "rewards/margins": 7.140625, "rewards/rejected": -10.78125, "step": 9950 }, { "epoch": 3.75, "grad_norm": 3.483456207696156, "learning_rate": 6.259412650602409e-08, "logits/chosen": -3.0140624046325684, "logits/rejected": -3.1507811546325684, "logps/chosen": -378.1000061035156, "logps/rejected": -482.5, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.715625047683716, "rewards/margins": 7.1796875, "rewards/rejected": -10.890625, "step": 9960 }, { "epoch": 3.753765060240964, "grad_norm": 15.94281481847458, "learning_rate": 6.165286144578313e-08, "logits/chosen": -2.8187499046325684, "logits/rejected": -2.934375047683716, "logps/chosen": -390.79998779296875, "logps/rejected": -482.6000061035156, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -3.3597655296325684, "rewards/margins": 6.939062595367432, "rewards/rejected": -10.303125381469727, "step": 9970 }, { "epoch": 3.7575301204819276, "grad_norm": 20.91620815611824, "learning_rate": 6.071159638554216e-08, "logits/chosen": -2.9789061546325684, "logits/rejected": -2.938281297683716, "logps/chosen": -345.1499938964844, "logps/rejected": -449.20001220703125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -3.4730467796325684, "rewards/margins": 6.90625, "rewards/rejected": -10.384374618530273, "step": 9980 }, { "epoch": 3.7612951807228914, "grad_norm": 3.1029185562480865, "learning_rate": 5.97703313253012e-08, "logits/chosen": -2.934375047683716, "logits/rejected": -3.059375047683716, "logps/chosen": -382.29998779296875, "logps/rejected": -518.9500122070312, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.08203125, "rewards/margins": 7.2578125, "rewards/rejected": -10.342187881469727, "step": 9990 }, { "epoch": 3.765060240963855, "grad_norm": 19.11912284529666, "learning_rate": 5.882906626506024e-08, "logits/chosen": -2.930468797683716, "logits/rejected": -3.0914063453674316, "logps/chosen": -336.70001220703125, "logps/rejected": -434.20001220703125, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.369140625, "rewards/margins": 6.814062595367432, "rewards/rejected": -10.181249618530273, "step": 10000 }, { "epoch": 3.7688253012048194, "grad_norm": 4.733037361552664, "learning_rate": 5.788780120481927e-08, "logits/chosen": -2.8515625, "logits/rejected": -2.98828125, "logps/chosen": -412.45001220703125, "logps/rejected": -494.70001220703125, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.4164061546325684, "rewards/margins": 7.400000095367432, "rewards/rejected": -10.817187309265137, "step": 10010 }, { "epoch": 3.772590361445783, "grad_norm": 43.585174235285905, "learning_rate": 5.694653614457831e-08, "logits/chosen": -2.9554686546325684, "logits/rejected": -3.0015625953674316, "logps/chosen": -421.25, "logps/rejected": -489.8999938964844, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -3.2484374046325684, "rewards/margins": 7.025000095367432, "rewards/rejected": -10.271875381469727, "step": 10020 }, { "epoch": 3.776355421686747, "grad_norm": 1.9332886665612852, "learning_rate": 5.600527108433735e-08, "logits/chosen": -2.92578125, "logits/rejected": -2.9789061546325684, "logps/chosen": -433.3999938964844, "logps/rejected": -511.20001220703125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -3.3843750953674316, "rewards/margins": 7.243750095367432, "rewards/rejected": -10.628125190734863, "step": 10030 }, { "epoch": 3.7801204819277108, "grad_norm": 18.97276897327046, "learning_rate": 5.506400602409638e-08, "logits/chosen": -2.917187452316284, "logits/rejected": -2.914843797683716, "logps/chosen": -421.79998779296875, "logps/rejected": -576.5999755859375, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -4.286718845367432, "rewards/margins": 7.989062309265137, "rewards/rejected": -12.268750190734863, "step": 10040 }, { "epoch": 3.7838855421686746, "grad_norm": 1.7609559012267022, "learning_rate": 5.412274096385542e-08, "logits/chosen": -2.9828124046325684, "logits/rejected": -2.9906249046325684, "logps/chosen": -385.3999938964844, "logps/rejected": -464.6000061035156, "loss": 0.0196, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.716015577316284, "rewards/margins": 6.864062309265137, "rewards/rejected": -10.574999809265137, "step": 10050 }, { "epoch": 3.787650602409639, "grad_norm": 1.8206858695043522, "learning_rate": 5.318147590361446e-08, "logits/chosen": -2.938281297683716, "logits/rejected": -2.9390625953674316, "logps/chosen": -392.1000061035156, "logps/rejected": -508.1000061035156, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": -3.589062452316284, "rewards/margins": 7.268750190734863, "rewards/rejected": -10.856249809265137, "step": 10060 }, { "epoch": 3.7914156626506026, "grad_norm": 1.8061371793221015, "learning_rate": 5.224021084337349e-08, "logits/chosen": -2.938281297683716, "logits/rejected": -2.98828125, "logps/chosen": -374.95001220703125, "logps/rejected": -492.70001220703125, "loss": 0.0119, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.83984375, "rewards/margins": 7.2578125, "rewards/rejected": -11.09375, "step": 10070 }, { "epoch": 3.7951807228915664, "grad_norm": 4.883847009730281, "learning_rate": 5.1298945783132524e-08, "logits/chosen": -2.9945311546325684, "logits/rejected": -3.0328125953674316, "logps/chosen": -356.6000061035156, "logps/rejected": -479.0, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -3.44921875, "rewards/margins": 6.868750095367432, "rewards/rejected": -10.318750381469727, "step": 10080 }, { "epoch": 3.79894578313253, "grad_norm": 6.005079606417416, "learning_rate": 5.0357680722891564e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -3.0062499046325684, "logps/chosen": -341.04998779296875, "logps/rejected": -468.79998779296875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.6171875, "rewards/margins": 6.954687595367432, "rewards/rejected": -10.571874618530273, "step": 10090 }, { "epoch": 3.802710843373494, "grad_norm": 10.950140532987039, "learning_rate": 4.94164156626506e-08, "logits/chosen": -2.9007811546325684, "logits/rejected": -2.9195313453674316, "logps/chosen": -364.04998779296875, "logps/rejected": -467.20001220703125, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -3.466796875, "rewards/margins": 7.287499904632568, "rewards/rejected": -10.756250381469727, "step": 10100 }, { "epoch": 3.8064759036144578, "grad_norm": 3.536485419760468, "learning_rate": 4.847515060240964e-08, "logits/chosen": -2.96484375, "logits/rejected": -3.0210938453674316, "logps/chosen": -364.8500061035156, "logps/rejected": -432.79998779296875, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -3.421093702316284, "rewards/margins": 6.823437690734863, "rewards/rejected": -10.248437881469727, "step": 10110 }, { "epoch": 3.8102409638554215, "grad_norm": 22.155928024108217, "learning_rate": 4.753388554216867e-08, "logits/chosen": -3.012500047683716, "logits/rejected": -2.9742188453674316, "logps/chosen": -385.79998779296875, "logps/rejected": -487.29998779296875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -3.3550782203674316, "rewards/margins": 7.15625, "rewards/rejected": -10.506250381469727, "step": 10120 }, { "epoch": 3.8140060240963853, "grad_norm": 28.150562132403202, "learning_rate": 4.659262048192771e-08, "logits/chosen": -3.0328125953674316, "logits/rejected": -3.092968702316284, "logps/chosen": -396.70001220703125, "logps/rejected": -473.20001220703125, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -3.3421874046325684, "rewards/margins": 6.853125095367432, "rewards/rejected": -10.196874618530273, "step": 10130 }, { "epoch": 3.817771084337349, "grad_norm": 50.49135970963374, "learning_rate": 4.5651355421686744e-08, "logits/chosen": -2.9609375, "logits/rejected": -3.0875000953674316, "logps/chosen": -363.79998779296875, "logps/rejected": -459.8999938964844, "loss": 0.021, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.524609327316284, "rewards/margins": 6.753125190734863, "rewards/rejected": -10.2734375, "step": 10140 }, { "epoch": 3.8215361445783134, "grad_norm": 1.547171171834634, "learning_rate": 4.4710090361445784e-08, "logits/chosen": -2.9515624046325684, "logits/rejected": -2.932812452316284, "logps/chosen": -401.8500061035156, "logps/rejected": -512.0, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -3.6015625, "rewards/margins": 7.564062595367432, "rewards/rejected": -11.168749809265137, "step": 10150 }, { "epoch": 3.825301204819277, "grad_norm": 8.675544839861143, "learning_rate": 4.3768825301204824e-08, "logits/chosen": -2.96875, "logits/rejected": -2.9710936546325684, "logps/chosen": -412.25, "logps/rejected": -502.70001220703125, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -3.8929686546325684, "rewards/margins": 7.467187404632568, "rewards/rejected": -11.356249809265137, "step": 10160 }, { "epoch": 3.829066265060241, "grad_norm": 47.537711575226794, "learning_rate": 4.282756024096385e-08, "logits/chosen": -3.016406297683716, "logits/rejected": -2.9585938453674316, "logps/chosen": -384.8999938964844, "logps/rejected": -458.95001220703125, "loss": 0.0127, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4937500953674316, "rewards/margins": 6.939062595367432, "rewards/rejected": -10.4375, "step": 10170 }, { "epoch": 3.8328313253012047, "grad_norm": 2.3185176467066904, "learning_rate": 4.188629518072289e-08, "logits/chosen": -2.98828125, "logits/rejected": -3.046093702316284, "logps/chosen": -392.95001220703125, "logps/rejected": -490.45001220703125, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -3.446484327316284, "rewards/margins": 7.515625, "rewards/rejected": -10.959375381469727, "step": 10180 }, { "epoch": 3.8365963855421685, "grad_norm": 3.235327138017543, "learning_rate": 4.094503012048193e-08, "logits/chosen": -2.9195313453674316, "logits/rejected": -2.948437452316284, "logps/chosen": -372.0, "logps/rejected": -472.6499938964844, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -3.395703077316284, "rewards/margins": 7.139062404632568, "rewards/rejected": -10.532812118530273, "step": 10190 }, { "epoch": 3.8403614457831328, "grad_norm": 6.971140568295343, "learning_rate": 4.0003765060240957e-08, "logits/chosen": -2.996875047683716, "logits/rejected": -3.0718750953674316, "logps/chosen": -372.75, "logps/rejected": -461.29998779296875, "loss": 0.0177, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3304686546325684, "rewards/margins": 7.053124904632568, "rewards/rejected": -10.389062881469727, "step": 10200 }, { "epoch": 3.8441265060240966, "grad_norm": 11.297204570302151, "learning_rate": 3.9062499999999997e-08, "logits/chosen": -2.9398436546325684, "logits/rejected": -3.038281202316284, "logps/chosen": -399.79998779296875, "logps/rejected": -483.20001220703125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.6429686546325684, "rewards/margins": 7.134375095367432, "rewards/rejected": -10.771875381469727, "step": 10210 }, { "epoch": 3.8478915662650603, "grad_norm": 30.030441551841303, "learning_rate": 3.8121234939759036e-08, "logits/chosen": -2.967968702316284, "logits/rejected": -2.953906297683716, "logps/chosen": -366.0, "logps/rejected": -500.20001220703125, "loss": 0.0195, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3746094703674316, "rewards/margins": 6.979687690734863, "rewards/rejected": -10.346875190734863, "step": 10220 }, { "epoch": 3.851656626506024, "grad_norm": 6.406369612443275, "learning_rate": 3.717996987951807e-08, "logits/chosen": -2.9546875953674316, "logits/rejected": -3.043750047683716, "logps/chosen": -390.04998779296875, "logps/rejected": -476.29998779296875, "loss": 0.0132, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.412109375, "rewards/margins": 7.081250190734863, "rewards/rejected": -10.5, "step": 10230 }, { "epoch": 3.855421686746988, "grad_norm": 2.0036096482388053, "learning_rate": 3.623870481927711e-08, "logits/chosen": -2.97265625, "logits/rejected": -2.975781202316284, "logps/chosen": -360.75, "logps/rejected": -490.5, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.2503905296325684, "rewards/margins": 7.446875095367432, "rewards/rejected": -10.701562881469727, "step": 10240 }, { "epoch": 3.8591867469879517, "grad_norm": 4.594107230338675, "learning_rate": 3.529743975903614e-08, "logits/chosen": -2.96875, "logits/rejected": -3.034374952316284, "logps/chosen": -361.1000061035156, "logps/rejected": -479.1499938964844, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -3.5667967796325684, "rewards/margins": 7.084374904632568, "rewards/rejected": -10.649999618530273, "step": 10250 }, { "epoch": 3.8629518072289155, "grad_norm": 13.257304028070132, "learning_rate": 3.4356174698795176e-08, "logits/chosen": -3.00390625, "logits/rejected": -3.010937452316284, "logps/chosen": -383.20001220703125, "logps/rejected": -526.7999877929688, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -3.6050782203674316, "rewards/margins": 7.518750190734863, "rewards/rejected": -11.114062309265137, "step": 10260 }, { "epoch": 3.8667168674698793, "grad_norm": 18.93035863753128, "learning_rate": 3.3414909638554216e-08, "logits/chosen": -2.9937500953674316, "logits/rejected": -3.0023436546325684, "logps/chosen": -411.3999938964844, "logps/rejected": -540.7999877929688, "loss": 0.0254, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.582812547683716, "rewards/margins": 6.839062690734863, "rewards/rejected": -10.425000190734863, "step": 10270 }, { "epoch": 3.8704819277108435, "grad_norm": 12.740243103607307, "learning_rate": 3.2473644578313256e-08, "logits/chosen": -2.91796875, "logits/rejected": -2.953906297683716, "logps/chosen": -363.1499938964844, "logps/rejected": -429.3999938964844, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -3.66455078125, "rewards/margins": 6.860937595367432, "rewards/rejected": -10.521875381469727, "step": 10280 }, { "epoch": 3.8742469879518073, "grad_norm": 44.50285846787039, "learning_rate": 3.153237951807228e-08, "logits/chosen": -2.9546875953674316, "logits/rejected": -3.018749952316284, "logps/chosen": -398.1499938964844, "logps/rejected": -485.8500061035156, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.6156249046325684, "rewards/margins": 6.932812690734863, "rewards/rejected": -10.5390625, "step": 10290 }, { "epoch": 3.878012048192771, "grad_norm": 9.11413204527144, "learning_rate": 3.059111445783132e-08, "logits/chosen": -2.9710936546325684, "logits/rejected": -2.9906249046325684, "logps/chosen": -367.1499938964844, "logps/rejected": -478.29998779296875, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.4828124046325684, "rewards/margins": 7.120312690734863, "rewards/rejected": -10.606249809265137, "step": 10300 }, { "epoch": 3.881777108433735, "grad_norm": 8.507339661476522, "learning_rate": 2.9649849397590362e-08, "logits/chosen": -2.9722657203674316, "logits/rejected": -3.108593702316284, "logps/chosen": -363.29998779296875, "logps/rejected": -416.70001220703125, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -3.2242188453674316, "rewards/margins": 6.7109375, "rewards/rejected": -9.9375, "step": 10310 }, { "epoch": 3.8855421686746987, "grad_norm": 40.117787591025056, "learning_rate": 2.8708584337349396e-08, "logits/chosen": -3.0648436546325684, "logits/rejected": -2.9453125, "logps/chosen": -344.5, "logps/rejected": -474.0, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -4.067187309265137, "rewards/margins": 7.057812690734863, "rewards/rejected": -11.131250381469727, "step": 10320 }, { "epoch": 3.8893072289156625, "grad_norm": 9.327017599650718, "learning_rate": 2.7767319277108432e-08, "logits/chosen": -2.8812499046325684, "logits/rejected": -2.996875047683716, "logps/chosen": -319.0, "logps/rejected": -436.1000061035156, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.359375, "rewards/margins": 6.956250190734863, "rewards/rejected": -10.318750381469727, "step": 10330 }, { "epoch": 3.8930722891566267, "grad_norm": 9.118854221355882, "learning_rate": 2.682605421686747e-08, "logits/chosen": -2.983593702316284, "logits/rejected": -2.948437452316284, "logps/chosen": -370.20001220703125, "logps/rejected": -437.0, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": -3.467578172683716, "rewards/margins": 6.34375, "rewards/rejected": -9.815625190734863, "step": 10340 }, { "epoch": 3.8968373493975905, "grad_norm": 145.75645167260004, "learning_rate": 2.5884789156626505e-08, "logits/chosen": -2.948437452316284, "logits/rejected": -3.032031297683716, "logps/chosen": -377.3999938964844, "logps/rejected": -463.70001220703125, "loss": 0.0226, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.321484327316284, "rewards/margins": 6.8828125, "rewards/rejected": -10.207812309265137, "step": 10350 }, { "epoch": 3.9006024096385543, "grad_norm": 2.7196341398939556, "learning_rate": 2.4943524096385542e-08, "logits/chosen": -2.924999952316284, "logits/rejected": -2.985156297683716, "logps/chosen": -360.1499938964844, "logps/rejected": -438.54998779296875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -3.5703125, "rewards/margins": 6.998437404632568, "rewards/rejected": -10.568750381469727, "step": 10360 }, { "epoch": 3.904367469879518, "grad_norm": 2.59448652923292, "learning_rate": 2.400225903614458e-08, "logits/chosen": -2.9828124046325684, "logits/rejected": -3.010937452316284, "logps/chosen": -422.8999938964844, "logps/rejected": -475.6000061035156, "loss": 0.0235, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.563281297683716, "rewards/margins": 7.189062595367432, "rewards/rejected": -10.75, "step": 10370 }, { "epoch": 3.908132530120482, "grad_norm": 73.71512019796718, "learning_rate": 2.3060993975903612e-08, "logits/chosen": -2.764843702316284, "logits/rejected": -2.9906249046325684, "logps/chosen": -372.6499938964844, "logps/rejected": -428.45001220703125, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -3.3960938453674316, "rewards/margins": 6.348437309265137, "rewards/rejected": -9.753125190734863, "step": 10380 }, { "epoch": 3.9118975903614457, "grad_norm": 16.014656936379115, "learning_rate": 2.2119728915662652e-08, "logits/chosen": -2.9625000953674316, "logits/rejected": -3.014843702316284, "logps/chosen": -376.0, "logps/rejected": -454.3500061035156, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.418750047683716, "rewards/margins": 6.668749809265137, "rewards/rejected": -10.090624809265137, "step": 10390 }, { "epoch": 3.9156626506024095, "grad_norm": 2.7803144039293044, "learning_rate": 2.1178463855421685e-08, "logits/chosen": -2.918750047683716, "logits/rejected": -3.035937547683716, "logps/chosen": -404.70001220703125, "logps/rejected": -509.1000061035156, "loss": 0.0286, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8046875, "rewards/margins": 7.074999809265137, "rewards/rejected": -10.875, "step": 10400 }, { "epoch": 3.9194277108433733, "grad_norm": 8.711249909687115, "learning_rate": 2.0237198795180722e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -3.0914063453674316, "logps/chosen": -404.20001220703125, "logps/rejected": -492.20001220703125, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -3.6068358421325684, "rewards/margins": 7.271874904632568, "rewards/rejected": -10.8828125, "step": 10410 }, { "epoch": 3.9231927710843375, "grad_norm": 1.565411973708353, "learning_rate": 1.9295933734939758e-08, "logits/chosen": -2.967968702316284, "logits/rejected": -2.9984374046325684, "logps/chosen": -356.70001220703125, "logps/rejected": -493.79998779296875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.4976563453674316, "rewards/margins": 7.185937404632568, "rewards/rejected": -10.684374809265137, "step": 10420 }, { "epoch": 3.9269578313253013, "grad_norm": 6.409164188528361, "learning_rate": 1.8354668674698795e-08, "logits/chosen": -2.9625000953674316, "logits/rejected": -2.987499952316284, "logps/chosen": -344.8999938964844, "logps/rejected": -460.5, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": -3.551953077316284, "rewards/margins": 7.287499904632568, "rewards/rejected": -10.856249809265137, "step": 10430 }, { "epoch": 3.930722891566265, "grad_norm": 61.1184814210395, "learning_rate": 1.741340361445783e-08, "logits/chosen": -2.885937452316284, "logits/rejected": -2.9789061546325684, "logps/chosen": -407.6499938964844, "logps/rejected": -484.8999938964844, "loss": 0.0445, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.321093797683716, "rewards/margins": 6.956250190734863, "rewards/rejected": -10.274999618530273, "step": 10440 }, { "epoch": 3.934487951807229, "grad_norm": 3.109706488457313, "learning_rate": 1.6472138554216868e-08, "logits/chosen": -2.8960938453674316, "logits/rejected": -2.9515624046325684, "logps/chosen": -353.70001220703125, "logps/rejected": -439.3999938964844, "loss": 0.0137, "rewards/accuracies": 1.0, "rewards/chosen": -3.510937452316284, "rewards/margins": 6.556250095367432, "rewards/rejected": -10.068750381469727, "step": 10450 }, { "epoch": 3.9382530120481927, "grad_norm": 10.36974954063179, "learning_rate": 1.5530873493975905e-08, "logits/chosen": -2.9703125953674316, "logits/rejected": -2.964062452316284, "logps/chosen": -358.6499938964844, "logps/rejected": -444.6000061035156, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": -3.7828125953674316, "rewards/margins": 7.053124904632568, "rewards/rejected": -10.840624809265137, "step": 10460 }, { "epoch": 3.9420180722891565, "grad_norm": 1.5346147816907378, "learning_rate": 1.4589608433734938e-08, "logits/chosen": -2.9195313453674316, "logits/rejected": -2.967968702316284, "logps/chosen": -374.0, "logps/rejected": -451.0, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": -3.318359375, "rewards/margins": 7.199999809265137, "rewards/rejected": -10.518750190734863, "step": 10470 }, { "epoch": 3.9457831325301207, "grad_norm": 2.509699334533227, "learning_rate": 1.3648343373493974e-08, "logits/chosen": -2.921875, "logits/rejected": -2.952343702316284, "logps/chosen": -395.04998779296875, "logps/rejected": -509.29998779296875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -3.5992188453674316, "rewards/margins": 7.168749809265137, "rewards/rejected": -10.771875381469727, "step": 10480 }, { "epoch": 3.9495481927710845, "grad_norm": 4.375624991144258, "learning_rate": 1.2707078313253011e-08, "logits/chosen": -2.8359375, "logits/rejected": -2.944531202316284, "logps/chosen": -410.20001220703125, "logps/rejected": -475.0, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -3.77734375, "rewards/margins": 6.778124809265137, "rewards/rejected": -10.556249618530273, "step": 10490 }, { "epoch": 3.9533132530120483, "grad_norm": 8.37128005383685, "learning_rate": 1.1765813253012048e-08, "logits/chosen": -2.9281249046325684, "logits/rejected": -2.9664063453674316, "logps/chosen": -416.70001220703125, "logps/rejected": -499.54998779296875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -3.5171875953674316, "rewards/margins": 6.860937595367432, "rewards/rejected": -10.3828125, "step": 10500 }, { "epoch": 3.957078313253012, "grad_norm": 9.242575279843917, "learning_rate": 1.0824548192771083e-08, "logits/chosen": -2.991406202316284, "logits/rejected": -3.0445313453674316, "logps/chosen": -423.0, "logps/rejected": -508.5, "loss": 0.0334, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.537890672683716, "rewards/margins": 6.673437595367432, "rewards/rejected": -10.212499618530273, "step": 10510 }, { "epoch": 3.960843373493976, "grad_norm": 6.9518446873414055, "learning_rate": 9.883283132530119e-09, "logits/chosen": -2.9281249046325684, "logits/rejected": -2.973437547683716, "logps/chosen": -389.20001220703125, "logps/rejected": -475.3999938964844, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -3.500781297683716, "rewards/margins": 7.053124904632568, "rewards/rejected": -10.5625, "step": 10520 }, { "epoch": 3.9646084337349397, "grad_norm": 10.059363406618143, "learning_rate": 8.942018072289156e-09, "logits/chosen": -2.98828125, "logits/rejected": -3.03515625, "logps/chosen": -363.04998779296875, "logps/rejected": -458.0, "loss": 0.0161, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.69921875, "rewards/margins": 6.925000190734863, "rewards/rejected": -10.623437881469727, "step": 10530 }, { "epoch": 3.9683734939759034, "grad_norm": 4.8524081079211365, "learning_rate": 8.000753012048192e-09, "logits/chosen": -2.936718702316284, "logits/rejected": -2.97265625, "logps/chosen": -358.1000061035156, "logps/rejected": -447.8999938964844, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.154296875, "rewards/margins": 6.928124904632568, "rewards/rejected": -10.082812309265137, "step": 10540 }, { "epoch": 3.9721385542168672, "grad_norm": 8.69292896204975, "learning_rate": 7.059487951807229e-09, "logits/chosen": -2.90234375, "logits/rejected": -2.9820313453674316, "logps/chosen": -355.54998779296875, "logps/rejected": -428.6000061035156, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.170703172683716, "rewards/margins": 6.993750095367432, "rewards/rejected": -10.1640625, "step": 10550 }, { "epoch": 3.9759036144578315, "grad_norm": 10.44213955168897, "learning_rate": 6.118222891566265e-09, "logits/chosen": -3.016406297683716, "logits/rejected": -2.9312500953674316, "logps/chosen": -380.5, "logps/rejected": -491.70001220703125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -3.243359327316284, "rewards/margins": 6.657812595367432, "rewards/rejected": -9.901562690734863, "step": 10560 }, { "epoch": 3.9796686746987953, "grad_norm": 3.0086244423273727, "learning_rate": 5.176957831325301e-09, "logits/chosen": -3.014843702316284, "logits/rejected": -3.1109375953674316, "logps/chosen": -373.29998779296875, "logps/rejected": -445.8999938964844, "loss": 0.0143, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.567187547683716, "rewards/margins": 7.0546875, "rewards/rejected": -10.625, "step": 10570 }, { "epoch": 3.983433734939759, "grad_norm": 22.9898840351717, "learning_rate": 4.235692771084337e-09, "logits/chosen": -2.936718702316284, "logits/rejected": -2.973437547683716, "logps/chosen": -383.29998779296875, "logps/rejected": -479.3999938964844, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.5621094703674316, "rewards/margins": 7.243750095367432, "rewards/rejected": -10.8125, "step": 10580 }, { "epoch": 3.987198795180723, "grad_norm": 3.181036249887907, "learning_rate": 3.2944277108433736e-09, "logits/chosen": -3.039843797683716, "logits/rejected": -3.1617188453674316, "logps/chosen": -387.1499938964844, "logps/rejected": -466.54998779296875, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.319531202316284, "rewards/margins": 6.7265625, "rewards/rejected": -10.043749809265137, "step": 10590 }, { "epoch": 3.9909638554216866, "grad_norm": 1.095175769363068, "learning_rate": 2.3531626506024098e-09, "logits/chosen": -3.03515625, "logits/rejected": -3.137500047683716, "logps/chosen": -429.79998779296875, "logps/rejected": -490.79998779296875, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": -3.507031202316284, "rewards/margins": 6.981249809265137, "rewards/rejected": -10.4921875, "step": 10600 }, { "epoch": 3.994728915662651, "grad_norm": 3.0334459305326784, "learning_rate": 1.4118975903614457e-09, "logits/chosen": -2.95703125, "logits/rejected": -2.982421875, "logps/chosen": -389.25, "logps/rejected": -520.3499755859375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -3.339062452316284, "rewards/margins": 7.610937595367432, "rewards/rejected": -10.943750381469727, "step": 10610 }, { "epoch": 3.9984939759036147, "grad_norm": 6.163406615172941, "learning_rate": 4.706325301204819e-10, "logits/chosen": -2.9625000953674316, "logits/rejected": -3.01171875, "logps/chosen": -386.1499938964844, "logps/rejected": -515.2000122070312, "loss": 0.0122, "rewards/accuracies": 1.0, "rewards/chosen": -3.5703125, "rewards/margins": 6.896874904632568, "rewards/rejected": -10.481249809265137, "step": 10620 } ], "logging_steps": 10, "max_steps": 10624, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }