{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9997175939000282, "eval_steps": 500, "global_step": 10620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003765414666290125, "grad_norm": 67.7432198319446, "learning_rate": 9.991525423728812e-07, "logits/chosen": -0.9952728152275085, "logits/rejected": -0.718493640422821, "logps/chosen": -310.29998779296875, "logps/rejected": -289.7875061035156, "loss": 0.6864, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.007350921630859375, "rewards/margins": 0.02181548997759819, "rewards/rejected": -0.014499664306640625, "step": 10 }, { "epoch": 0.00753082933258025, "grad_norm": 60.81643595225716, "learning_rate": 9.98210922787194e-07, "logits/chosen": -0.979968249797821, "logits/rejected": -0.8651489019393921, "logps/chosen": -354.2875061035156, "logps/rejected": -317.6499938964844, "loss": 0.6677, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.10319213569164276, "rewards/margins": 0.060343168675899506, "rewards/rejected": 0.04284515231847763, "step": 20 }, { "epoch": 0.011296243998870376, "grad_norm": 75.73986824584787, "learning_rate": 9.972693032015067e-07, "logits/chosen": -0.981945812702179, "logits/rejected": -0.8436737060546875, "logps/chosen": -315.9750061035156, "logps/rejected": -298.5874938964844, "loss": 0.6797, "rewards/accuracies": 0.4375, "rewards/chosen": 0.1348831206560135, "rewards/margins": 0.04076080396771431, "rewards/rejected": 0.09417724609375, "step": 30 }, { "epoch": 0.0150616586651605, "grad_norm": 60.532467465020176, "learning_rate": 9.963276836158193e-07, "logits/chosen": -0.9978882074356079, "logits/rejected": -0.875927746295929, "logps/chosen": -319.76251220703125, "logps/rejected": -297.5375061035156, "loss": 0.6335, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.18035125732421875, "rewards/margins": 0.14685669541358948, "rewards/rejected": 0.03354339674115181, "step": 40 }, { "epoch": 0.018827073331450627, "grad_norm": 63.22057049205004, "learning_rate": 9.953860640301318e-07, "logits/chosen": -1.083398461341858, "logits/rejected": -0.898486316204071, "logps/chosen": -325.11248779296875, "logps/rejected": -309.17498779296875, "loss": 0.6443, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.10948791354894638, "rewards/margins": 0.15833130478858948, "rewards/rejected": -0.04886169359087944, "step": 50 }, { "epoch": 0.022592487997740753, "grad_norm": 63.42097585017889, "learning_rate": 9.944444444444444e-07, "logits/chosen": -0.9825439453125, "logits/rejected": -0.962329089641571, "logps/chosen": -330.82501220703125, "logps/rejected": -280.7250061035156, "loss": 0.6542, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.13559111952781677, "rewards/margins": 0.14374694228172302, "rewards/rejected": -0.008077239617705345, "step": 60 }, { "epoch": 0.026357902664030878, "grad_norm": 69.63475969254995, "learning_rate": 9.935028248587571e-07, "logits/chosen": -1.0872802734375, "logits/rejected": -0.937695324420929, "logps/chosen": -314.2875061035156, "logps/rejected": -303.2124938964844, "loss": 0.6282, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.20128783583641052, "rewards/margins": 0.23421630263328552, "rewards/rejected": -0.03332214429974556, "step": 70 }, { "epoch": 0.030123317330321, "grad_norm": 52.56952278035122, "learning_rate": 9.925612052730697e-07, "logits/chosen": -0.9498535394668579, "logits/rejected": -0.7635558843612671, "logps/chosen": -322.0, "logps/rejected": -319.04998779296875, "loss": 0.6052, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2769638001918793, "rewards/margins": 0.30173951387405396, "rewards/rejected": -0.02459716796875, "step": 80 }, { "epoch": 0.033888731996611125, "grad_norm": 66.53508465879354, "learning_rate": 9.916195856873822e-07, "logits/chosen": -1.001123070716858, "logits/rejected": -0.952880859375, "logps/chosen": -378.6625061035156, "logps/rejected": -342.79998779296875, "loss": 0.6181, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.31887513399124146, "rewards/margins": 0.2580322325229645, "rewards/rejected": 0.06081237643957138, "step": 90 }, { "epoch": 0.037654146662901254, "grad_norm": 64.63737707906591, "learning_rate": 9.906779661016948e-07, "logits/chosen": -1.094702124595642, "logits/rejected": -0.931469738483429, "logps/chosen": -360.375, "logps/rejected": -327.17498779296875, "loss": 0.6059, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.40128785371780396, "rewards/margins": 0.30000001192092896, "rewards/rejected": 0.10114135593175888, "step": 100 }, { "epoch": 0.041419561329191376, "grad_norm": 74.08107269585282, "learning_rate": 9.897363465160075e-07, "logits/chosen": -1.080322265625, "logits/rejected": -1.0352783203125, "logps/chosen": -383.88751220703125, "logps/rejected": -342.61248779296875, "loss": 0.5634, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.6162353754043579, "rewards/margins": 0.4089721739292145, "rewards/rejected": 0.20726318657398224, "step": 110 }, { "epoch": 0.045184975995481505, "grad_norm": 67.30401221006436, "learning_rate": 9.8879472693032e-07, "logits/chosen": -1.031103491783142, "logits/rejected": -1.0171020030975342, "logps/chosen": -310.8374938964844, "logps/rejected": -289.5249938964844, "loss": 0.6077, "rewards/accuracies": 0.625, "rewards/chosen": 0.514324963092804, "rewards/margins": 0.36048585176467896, "rewards/rejected": 0.153900146484375, "step": 120 }, { "epoch": 0.04895039066177163, "grad_norm": 56.34235961605615, "learning_rate": 9.878531073446327e-07, "logits/chosen": -0.972949206829071, "logits/rejected": -0.851061999797821, "logps/chosen": -348.5375061035156, "logps/rejected": -319.6000061035156, "loss": 0.5861, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.21090087294578552, "rewards/margins": 0.45532530546188354, "rewards/rejected": -0.24466553330421448, "step": 130 }, { "epoch": 0.052715805328061756, "grad_norm": 57.85792030946294, "learning_rate": 9.869114877589454e-07, "logits/chosen": -1.090972900390625, "logits/rejected": -0.977862536907196, "logps/chosen": -279.92498779296875, "logps/rejected": -302.23748779296875, "loss": 0.6544, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.07686920464038849, "rewards/margins": 0.4113830626010895, "rewards/rejected": -0.334747314453125, "step": 140 }, { "epoch": 0.05648121999435188, "grad_norm": 73.6051542299407, "learning_rate": 9.85969868173258e-07, "logits/chosen": -1.1359374523162842, "logits/rejected": -0.9763549566268921, "logps/chosen": -301.73748779296875, "logps/rejected": -309.9375, "loss": 0.6504, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.16049499809741974, "rewards/margins": 0.31151121854782104, "rewards/rejected": -0.15115050971508026, "step": 150 }, { "epoch": 0.060246634660642, "grad_norm": 50.702666534248024, "learning_rate": 9.850282485875705e-07, "logits/chosen": -1.297460913658142, "logits/rejected": -1.0850830078125, "logps/chosen": -289.6875, "logps/rejected": -272.3374938964844, "loss": 0.5715, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.499267578125, "rewards/margins": 0.447579950094223, "rewards/rejected": 0.05181274563074112, "step": 160 }, { "epoch": 0.06401204932693212, "grad_norm": 57.83502675621121, "learning_rate": 9.840866290018833e-07, "logits/chosen": -1.1868377923965454, "logits/rejected": -1.0643310546875, "logps/chosen": -323.20001220703125, "logps/rejected": -310.0874938964844, "loss": 0.5665, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.6664794683456421, "rewards/margins": 0.4915267825126648, "rewards/rejected": 0.1750946044921875, "step": 170 }, { "epoch": 0.06777746399322225, "grad_norm": 52.13358138342031, "learning_rate": 9.831450094161958e-07, "logits/chosen": -1.1559569835662842, "logits/rejected": -1.043920874595642, "logps/chosen": -301.2875061035156, "logps/rejected": -303.29998779296875, "loss": 0.533, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.67822265625, "rewards/margins": 0.6084839105606079, "rewards/rejected": 0.07015838474035263, "step": 180 }, { "epoch": 0.07154287865951238, "grad_norm": 56.34962609647144, "learning_rate": 9.822033898305084e-07, "logits/chosen": -1.2628905773162842, "logits/rejected": -1.1959960460662842, "logps/chosen": -318.48748779296875, "logps/rejected": -256.2250061035156, "loss": 0.5605, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2950683534145355, "rewards/margins": 0.6148681640625, "rewards/rejected": -0.319357305765152, "step": 190 }, { "epoch": 0.07530829332580251, "grad_norm": 64.31894964873136, "learning_rate": 9.81261770244821e-07, "logits/chosen": -1.3201172351837158, "logits/rejected": -1.0859375, "logps/chosen": -310.36248779296875, "logps/rejected": -286.0625, "loss": 0.5332, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.06758423149585724, "rewards/margins": 0.7124099731445312, "rewards/rejected": -0.779650866985321, "step": 200 }, { "epoch": 0.07907370799209262, "grad_norm": 121.65098809667658, "learning_rate": 9.803201506591337e-07, "logits/chosen": -1.1337401866912842, "logits/rejected": -1.058349609375, "logps/chosen": -336.67498779296875, "logps/rejected": -317.79998779296875, "loss": 0.6204, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.010418700985610485, "rewards/margins": 0.5572448968887329, "rewards/rejected": -0.5681518316268921, "step": 210 }, { "epoch": 0.08283912265838275, "grad_norm": 70.9193303154546, "learning_rate": 9.793785310734463e-07, "logits/chosen": -1.182519555091858, "logits/rejected": -1.015356421470642, "logps/chosen": -347.67498779296875, "logps/rejected": -312.8999938964844, "loss": 0.5271, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2569740414619446, "rewards/margins": 0.6869751214981079, "rewards/rejected": -0.4292968809604645, "step": 220 }, { "epoch": 0.08660453732467288, "grad_norm": 52.94943642259272, "learning_rate": 9.784369114877588e-07, "logits/chosen": -1.1904296875, "logits/rejected": -1.027978539466858, "logps/chosen": -333.16876220703125, "logps/rejected": -323.70001220703125, "loss": 0.5169, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.32177734375, "rewards/margins": 0.664074718952179, "rewards/rejected": -0.3416809141635895, "step": 230 }, { "epoch": 0.09036995199096301, "grad_norm": 80.14480258162332, "learning_rate": 9.774952919020714e-07, "logits/chosen": -1.1249511241912842, "logits/rejected": -1.018945336341858, "logps/chosen": -366.7875061035156, "logps/rejected": -320.2875061035156, "loss": 0.5885, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.48036497831344604, "rewards/margins": 0.6225830316543579, "rewards/rejected": -0.14215393364429474, "step": 240 }, { "epoch": 0.09413536665725313, "grad_norm": 72.88526870483943, "learning_rate": 9.765536723163841e-07, "logits/chosen": -1.2603027820587158, "logits/rejected": -1.097143530845642, "logps/chosen": -326.57501220703125, "logps/rejected": -309.01251220703125, "loss": 0.5162, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.621289074420929, "rewards/margins": 0.7651122808456421, "rewards/rejected": -0.14398804306983948, "step": 250 }, { "epoch": 0.09790078132354325, "grad_norm": 45.76629288906599, "learning_rate": 9.756120527306967e-07, "logits/chosen": -1.086035132408142, "logits/rejected": -1.0494873523712158, "logps/chosen": -342.1875, "logps/rejected": -306.6875, "loss": 0.5989, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.5513824224472046, "rewards/margins": 0.5939697027206421, "rewards/rejected": -0.042755126953125, "step": 260 }, { "epoch": 0.10166619598983338, "grad_norm": 53.64636191724752, "learning_rate": 9.746704331450095e-07, "logits/chosen": -1.3335449695587158, "logits/rejected": -1.1886718273162842, "logps/chosen": -322.8999938964844, "logps/rejected": -293.4750061035156, "loss": 0.5868, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.5032768249511719, "rewards/margins": 0.575939953327179, "rewards/rejected": -0.07274170219898224, "step": 270 }, { "epoch": 0.10543161065612351, "grad_norm": 56.80530416478003, "learning_rate": 9.73728813559322e-07, "logits/chosen": -1.2702147960662842, "logits/rejected": -1.2205810546875, "logps/chosen": -354.5, "logps/rejected": -353.8999938964844, "loss": 0.6047, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.12821045517921448, "rewards/margins": 0.557861328125, "rewards/rejected": -0.42974853515625, "step": 280 }, { "epoch": 0.10919702532241363, "grad_norm": 51.47063240456252, "learning_rate": 9.727871939736346e-07, "logits/chosen": -1.2412109375, "logits/rejected": -1.133813500404358, "logps/chosen": -320.57501220703125, "logps/rejected": -307.875, "loss": 0.6249, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.24738159775733948, "rewards/margins": 0.4513305723667145, "rewards/rejected": -0.6987549066543579, "step": 290 }, { "epoch": 0.11296243998870376, "grad_norm": 71.56506182510057, "learning_rate": 9.718455743879473e-07, "logits/chosen": -1.1805908679962158, "logits/rejected": -1.0539062023162842, "logps/chosen": -322.8374938964844, "logps/rejected": -311.9624938964844, "loss": 0.6459, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3472534120082855, "rewards/margins": 0.340362548828125, "rewards/rejected": -0.6879943609237671, "step": 300 }, { "epoch": 0.11672785465499388, "grad_norm": 76.10189768931424, "learning_rate": 9.709039548022599e-07, "logits/chosen": -1.1149413585662842, "logits/rejected": -1.1630859375, "logps/chosen": -374.36248779296875, "logps/rejected": -316.0, "loss": 0.6208, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.32673341035842896, "rewards/margins": 0.49467772245407104, "rewards/rejected": -0.8206787109375, "step": 310 }, { "epoch": 0.120493269321284, "grad_norm": 62.76298563467578, "learning_rate": 9.699623352165724e-07, "logits/chosen": -1.0282714366912842, "logits/rejected": -1.003149390220642, "logps/chosen": -359.7124938964844, "logps/rejected": -327.9624938964844, "loss": 0.5843, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.15433350205421448, "rewards/margins": 0.508892834186554, "rewards/rejected": -0.6633666753768921, "step": 320 }, { "epoch": 0.12425868398757413, "grad_norm": 62.262934581428496, "learning_rate": 9.69020715630885e-07, "logits/chosen": -1.1897461414337158, "logits/rejected": -1.0333740711212158, "logps/chosen": -314.2749938964844, "logps/rejected": -293.75, "loss": 0.6039, "rewards/accuracies": 0.6875, "rewards/chosen": -0.10505981743335724, "rewards/margins": 0.42225342988967896, "rewards/rejected": -0.5272461175918579, "step": 330 }, { "epoch": 0.12802409865386424, "grad_norm": 44.35650937143971, "learning_rate": 9.680790960451978e-07, "logits/chosen": -1.201635718345642, "logits/rejected": -1.005517601966858, "logps/chosen": -376.3125, "logps/rejected": -336.17498779296875, "loss": 0.5156, "rewards/accuracies": 0.71875, "rewards/chosen": 0.02421874925494194, "rewards/margins": 0.7228759527206421, "rewards/rejected": -0.6984008550643921, "step": 340 }, { "epoch": 0.13178951332015437, "grad_norm": 93.23954518778305, "learning_rate": 9.671374764595103e-07, "logits/chosen": -1.2033202648162842, "logits/rejected": -1.0687072277069092, "logps/chosen": -358.07501220703125, "logps/rejected": -341.1000061035156, "loss": 0.605, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07807616889476776, "rewards/margins": 0.503735363483429, "rewards/rejected": -0.5820251703262329, "step": 350 }, { "epoch": 0.1355549279864445, "grad_norm": 53.9487291584966, "learning_rate": 9.661958568738229e-07, "logits/chosen": -1.3254883289337158, "logits/rejected": -1.1331055164337158, "logps/chosen": -325.8999938964844, "logps/rejected": -298.36248779296875, "loss": 0.6162, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1240234375, "rewards/margins": 0.514892578125, "rewards/rejected": -0.3914596438407898, "step": 360 }, { "epoch": 0.13932034265273463, "grad_norm": 61.82420157061729, "learning_rate": 9.652542372881356e-07, "logits/chosen": -1.2808105945587158, "logits/rejected": -1.1416137218475342, "logps/chosen": -347.70001220703125, "logps/rejected": -319.7124938964844, "loss": 0.5761, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0775909423828125, "rewards/margins": 0.568157970905304, "rewards/rejected": -0.49092406034469604, "step": 370 }, { "epoch": 0.14308575731902476, "grad_norm": 47.07372589056406, "learning_rate": 9.643126177024482e-07, "logits/chosen": -1.3065917491912842, "logits/rejected": -1.033105492591858, "logps/chosen": -292.4125061035156, "logps/rejected": -307.82501220703125, "loss": 0.633, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11763610690832138, "rewards/margins": 0.44011229276657104, "rewards/rejected": -0.557342529296875, "step": 380 }, { "epoch": 0.1468511719853149, "grad_norm": 44.28679525986681, "learning_rate": 9.633709981167607e-07, "logits/chosen": -1.337499976158142, "logits/rejected": -1.373632788658142, "logps/chosen": -308.9437561035156, "logps/rejected": -273.9375, "loss": 0.5394, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08088989555835724, "rewards/margins": 0.6612304449081421, "rewards/rejected": -0.74273681640625, "step": 390 }, { "epoch": 0.15061658665160502, "grad_norm": 57.71378438044312, "learning_rate": 9.624293785310735e-07, "logits/chosen": -1.2761719226837158, "logits/rejected": -1.0947265625, "logps/chosen": -336.9750061035156, "logps/rejected": -332.75, "loss": 0.5491, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.0008789062267169356, "rewards/margins": 0.71124267578125, "rewards/rejected": -0.712451159954071, "step": 400 }, { "epoch": 0.15438200131789515, "grad_norm": 57.80176607190309, "learning_rate": 9.61487758945386e-07, "logits/chosen": -1.141992211341858, "logits/rejected": -1.0800292491912842, "logps/chosen": -337.0625, "logps/rejected": -327.8500061035156, "loss": 0.5587, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.3149780333042145, "rewards/margins": 0.60662841796875, "rewards/rejected": -0.2921386659145355, "step": 410 }, { "epoch": 0.15814741598418525, "grad_norm": 87.51668642138263, "learning_rate": 9.605461393596986e-07, "logits/chosen": -1.2880127429962158, "logits/rejected": -1.102362036705017, "logps/chosen": -336.6000061035156, "logps/rejected": -339.3125, "loss": 0.5796, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2772506773471832, "rewards/margins": 0.601428210735321, "rewards/rejected": -0.32389527559280396, "step": 420 }, { "epoch": 0.16191283065047538, "grad_norm": 47.52254344110622, "learning_rate": 9.596045197740112e-07, "logits/chosen": -1.2498290538787842, "logits/rejected": -1.117456078529358, "logps/chosen": -365.0, "logps/rejected": -319.8500061035156, "loss": 0.5213, "rewards/accuracies": 0.71875, "rewards/chosen": 0.3717102110385895, "rewards/margins": 0.820202648639679, "rewards/rejected": -0.44853514432907104, "step": 430 }, { "epoch": 0.1656782453167655, "grad_norm": 56.26284637430883, "learning_rate": 9.58662900188324e-07, "logits/chosen": -1.328759789466858, "logits/rejected": -1.096044898033142, "logps/chosen": -304.3374938964844, "logps/rejected": -304.3500061035156, "loss": 0.5964, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.27747803926467896, "rewards/margins": 0.5682373046875, "rewards/rejected": -0.291085809469223, "step": 440 }, { "epoch": 0.16944365998305563, "grad_norm": 58.83375211620542, "learning_rate": 9.577212806026365e-07, "logits/chosen": -1.315673828125, "logits/rejected": -1.186914086341858, "logps/chosen": -325.73748779296875, "logps/rejected": -304.8500061035156, "loss": 0.6132, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.37255859375, "rewards/margins": 0.5526367425918579, "rewards/rejected": -0.17986755073070526, "step": 450 }, { "epoch": 0.17320907464934576, "grad_norm": 55.71302278902864, "learning_rate": 9.56779661016949e-07, "logits/chosen": -1.307470679283142, "logits/rejected": -1.18798828125, "logps/chosen": -332.29998779296875, "logps/rejected": -319.76251220703125, "loss": 0.5764, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.41672974824905396, "rewards/margins": 0.51116943359375, "rewards/rejected": -0.09440307319164276, "step": 460 }, { "epoch": 0.1769744893156359, "grad_norm": 59.48415736161866, "learning_rate": 9.558380414312616e-07, "logits/chosen": -1.2122802734375, "logits/rejected": -1.1159179210662842, "logps/chosen": -361.4750061035156, "logps/rejected": -321.04998779296875, "loss": 0.5691, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.30621033906936646, "rewards/margins": 0.6288207769393921, "rewards/rejected": -0.32291871309280396, "step": 470 }, { "epoch": 0.18073990398192602, "grad_norm": 62.779254196182784, "learning_rate": 9.548964218455744e-07, "logits/chosen": -1.166259765625, "logits/rejected": -1.115234375, "logps/chosen": -332.57501220703125, "logps/rejected": -298.5249938964844, "loss": 0.5644, "rewards/accuracies": 0.71875, "rewards/chosen": 0.25802308320999146, "rewards/margins": 0.608172595500946, "rewards/rejected": -0.350167840719223, "step": 480 }, { "epoch": 0.18450531864821612, "grad_norm": 47.2724446393742, "learning_rate": 9.53954802259887e-07, "logits/chosen": -1.333984375, "logits/rejected": -1.202490210533142, "logps/chosen": -340.4750061035156, "logps/rejected": -310.92498779296875, "loss": 0.5702, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.33430176973342896, "rewards/margins": 0.6546386480331421, "rewards/rejected": -0.32002562284469604, "step": 490 }, { "epoch": 0.18827073331450625, "grad_norm": 57.7963536971741, "learning_rate": 9.530131826741996e-07, "logits/chosen": -1.3118164539337158, "logits/rejected": -1.0532410144805908, "logps/chosen": -347.75, "logps/rejected": -301.8125, "loss": 0.4845, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.10901794582605362, "rewards/margins": 0.9565185308456421, "rewards/rejected": -0.846545398235321, "step": 500 }, { "epoch": 0.19203614798079638, "grad_norm": 77.7824635577585, "learning_rate": 9.520715630885122e-07, "logits/chosen": -1.2794921398162842, "logits/rejected": -1.1529541015625, "logps/chosen": -338.6000061035156, "logps/rejected": -304.73748779296875, "loss": 0.6328, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.168548583984375, "rewards/margins": 0.6254593133926392, "rewards/rejected": -0.7943878173828125, "step": 510 }, { "epoch": 0.1958015626470865, "grad_norm": 60.60001324507701, "learning_rate": 9.511299435028248e-07, "logits/chosen": -1.2001464366912842, "logits/rejected": -1.158056616783142, "logps/chosen": -337.2124938964844, "logps/rejected": -294.2250061035156, "loss": 0.5173, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.08378295600414276, "rewards/margins": 0.8429321050643921, "rewards/rejected": -0.758953869342804, "step": 520 }, { "epoch": 0.19956697731337664, "grad_norm": 42.587829936388765, "learning_rate": 9.501883239171374e-07, "logits/chosen": -1.199072241783142, "logits/rejected": -1.1140258312225342, "logps/chosen": -313.32501220703125, "logps/rejected": -346.0625, "loss": 0.5277, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11332397162914276, "rewards/margins": 0.8187500238418579, "rewards/rejected": -0.705920398235321, "step": 530 }, { "epoch": 0.20333239197966677, "grad_norm": 44.05219230418422, "learning_rate": 9.492467043314501e-07, "logits/chosen": -1.31591796875, "logits/rejected": -1.2682616710662842, "logps/chosen": -312.4750061035156, "logps/rejected": -295.3374938964844, "loss": 0.5749, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13365402817726135, "rewards/margins": 0.6903442144393921, "rewards/rejected": -0.5563720464706421, "step": 540 }, { "epoch": 0.2070978066459569, "grad_norm": 114.10661687528746, "learning_rate": 9.483050847457626e-07, "logits/chosen": -1.337011694908142, "logits/rejected": -1.221093773841858, "logps/chosen": -298.01251220703125, "logps/rejected": -290.61248779296875, "loss": 0.6171, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.17236022651195526, "rewards/margins": 0.7229339480400085, "rewards/rejected": -0.5505615472793579, "step": 550 }, { "epoch": 0.21086322131224702, "grad_norm": 70.72842716901373, "learning_rate": 9.473634651600753e-07, "logits/chosen": -1.3826172351837158, "logits/rejected": -1.258154273033142, "logps/chosen": -356.9750061035156, "logps/rejected": -303.75, "loss": 0.5177, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.828112781047821, "rewards/margins": 0.892578125, "rewards/rejected": -0.06376342475414276, "step": 560 }, { "epoch": 0.21462863597853712, "grad_norm": 57.78906457881488, "learning_rate": 9.464218455743879e-07, "logits/chosen": -1.202392578125, "logits/rejected": -1.0363280773162842, "logps/chosen": -316.6312561035156, "logps/rejected": -332.6000061035156, "loss": 0.6195, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.557299792766571, "rewards/margins": 0.604327380657196, "rewards/rejected": -0.04627380520105362, "step": 570 }, { "epoch": 0.21839405064482725, "grad_norm": 51.60506321784956, "learning_rate": 9.454802259887005e-07, "logits/chosen": -1.1885864734649658, "logits/rejected": -1.1045410633087158, "logps/chosen": -342.70001220703125, "logps/rejected": -298.9375, "loss": 0.5346, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.450897216796875, "rewards/margins": 0.7425781488418579, "rewards/rejected": -0.2915405333042145, "step": 580 }, { "epoch": 0.22215946531111738, "grad_norm": 56.43761631615041, "learning_rate": 9.445386064030132e-07, "logits/chosen": -1.1663086414337158, "logits/rejected": -1.064306616783142, "logps/chosen": -331.6625061035156, "logps/rejected": -303.875, "loss": 0.5358, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.464150995016098, "rewards/margins": 0.8133544921875, "rewards/rejected": -0.3491149842739105, "step": 590 }, { "epoch": 0.2259248799774075, "grad_norm": 50.887343073293565, "learning_rate": 9.435969868173258e-07, "logits/chosen": -1.167138695716858, "logits/rejected": -1.1481201648712158, "logps/chosen": -359.4750061035156, "logps/rejected": -309.1000061035156, "loss": 0.5279, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15134887397289276, "rewards/margins": 0.9344116449356079, "rewards/rejected": -0.783215343952179, "step": 600 }, { "epoch": 0.22969029464369764, "grad_norm": 75.74546186150988, "learning_rate": 9.426553672316384e-07, "logits/chosen": -1.126562476158142, "logits/rejected": -1.040277123451233, "logps/chosen": -315.29998779296875, "logps/rejected": -302.38751220703125, "loss": 0.5833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01245727576315403, "rewards/margins": 0.653857409954071, "rewards/rejected": -0.6404663324356079, "step": 610 }, { "epoch": 0.23345570930998777, "grad_norm": 63.418112570536344, "learning_rate": 9.41713747645951e-07, "logits/chosen": -1.097558617591858, "logits/rejected": -0.9658752679824829, "logps/chosen": -336.42498779296875, "logps/rejected": -299.2875061035156, "loss": 0.6043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.07614898681640625, "rewards/margins": 0.577197253704071, "rewards/rejected": -0.6533569097518921, "step": 620 }, { "epoch": 0.2372211239762779, "grad_norm": 60.336254961247754, "learning_rate": 9.407721280602636e-07, "logits/chosen": -1.055273413658142, "logits/rejected": -0.966412365436554, "logps/chosen": -398.11248779296875, "logps/rejected": -328.8374938964844, "loss": 0.5201, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.11302032321691513, "rewards/margins": 0.8218994140625, "rewards/rejected": -0.7095702886581421, "step": 630 }, { "epoch": 0.240986538642568, "grad_norm": 53.06540737951244, "learning_rate": 9.398305084745763e-07, "logits/chosen": -1.337304711341858, "logits/rejected": -1.068017601966858, "logps/chosen": -344.70001220703125, "logps/rejected": -323.04998779296875, "loss": 0.5217, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.07797546684741974, "rewards/margins": 0.8014465570449829, "rewards/rejected": -0.8796142339706421, "step": 640 }, { "epoch": 0.24475195330885813, "grad_norm": 49.53042879938056, "learning_rate": 9.388888888888888e-07, "logits/chosen": -1.231665015220642, "logits/rejected": -1.040185570716858, "logps/chosen": -324.76251220703125, "logps/rejected": -294.23748779296875, "loss": 0.4822, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.014892578125, "rewards/margins": 0.960009753704071, "rewards/rejected": -0.94476318359375, "step": 650 }, { "epoch": 0.24851736797514826, "grad_norm": 44.248471389931694, "learning_rate": 9.379472693032015e-07, "logits/chosen": -1.2609374523162842, "logits/rejected": -1.1388061046600342, "logps/chosen": -343.0249938964844, "logps/rejected": -320.7124938964844, "loss": 0.5744, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.18967589735984802, "rewards/margins": 0.7291504144668579, "rewards/rejected": -0.540142834186554, "step": 660 }, { "epoch": 0.2522827826414384, "grad_norm": 75.30241738349395, "learning_rate": 9.37005649717514e-07, "logits/chosen": -1.2489745616912842, "logits/rejected": -1.18438720703125, "logps/chosen": -296.3999938964844, "logps/rejected": -292.29998779296875, "loss": 0.5869, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.24228516221046448, "rewards/margins": 0.670764148235321, "rewards/rejected": -0.4290939271450043, "step": 670 }, { "epoch": 0.2560481973077285, "grad_norm": 65.41506659981937, "learning_rate": 9.360640301318267e-07, "logits/chosen": -1.256250023841858, "logits/rejected": -1.045068383216858, "logps/chosen": -294.51251220703125, "logps/rejected": -289.57501220703125, "loss": 0.5273, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.45602720975875854, "rewards/margins": 0.763836681842804, "rewards/rejected": -0.308004766702652, "step": 680 }, { "epoch": 0.25981361197401864, "grad_norm": 59.203580998784055, "learning_rate": 9.351224105461392e-07, "logits/chosen": -1.125830054283142, "logits/rejected": -1.077783226966858, "logps/chosen": -369.75, "logps/rejected": -291.38751220703125, "loss": 0.524, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.449981689453125, "rewards/margins": 0.8723999261856079, "rewards/rejected": -0.42216795682907104, "step": 690 }, { "epoch": 0.26357902664030874, "grad_norm": 48.72951703826048, "learning_rate": 9.341807909604519e-07, "logits/chosen": -1.260595679283142, "logits/rejected": -1.162451148033142, "logps/chosen": -318.82501220703125, "logps/rejected": -296.625, "loss": 0.5633, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.312347412109375, "rewards/margins": 0.753662109375, "rewards/rejected": -0.4418090879917145, "step": 700 }, { "epoch": 0.2673444413065989, "grad_norm": 69.88184204474227, "learning_rate": 9.332391713747646e-07, "logits/chosen": -1.1165771484375, "logits/rejected": -1.1187317371368408, "logps/chosen": -346.6000061035156, "logps/rejected": -296.7875061035156, "loss": 0.602, "rewards/accuracies": 0.65625, "rewards/chosen": 0.47235107421875, "rewards/margins": 0.784350574016571, "rewards/rejected": -0.31283873319625854, "step": 710 }, { "epoch": 0.271109855972889, "grad_norm": 40.22531161062541, "learning_rate": 9.322975517890772e-07, "logits/chosen": -1.322265625, "logits/rejected": -1.153906226158142, "logps/chosen": -285.1000061035156, "logps/rejected": -281.1812438964844, "loss": 0.572, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.475607305765152, "rewards/margins": 0.7274169921875, "rewards/rejected": -0.25169676542282104, "step": 720 }, { "epoch": 0.27487527063917916, "grad_norm": 48.73024225260636, "learning_rate": 9.313559322033898e-07, "logits/chosen": -1.15374755859375, "logits/rejected": -1.095025658607483, "logps/chosen": -334.3687438964844, "logps/rejected": -281.07501220703125, "loss": 0.5303, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.37897950410842896, "rewards/margins": 0.9101318120956421, "rewards/rejected": -0.532135009765625, "step": 730 }, { "epoch": 0.27864068530546926, "grad_norm": 51.05934153124105, "learning_rate": 9.304143126177024e-07, "logits/chosen": -1.162817358970642, "logits/rejected": -1.0538513660430908, "logps/chosen": -350.625, "logps/rejected": -314.07501220703125, "loss": 0.4898, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03034057654440403, "rewards/margins": 0.91546630859375, "rewards/rejected": -0.884521484375, "step": 740 }, { "epoch": 0.2824060999717594, "grad_norm": 49.813376599952235, "learning_rate": 9.29472693032015e-07, "logits/chosen": -1.19873046875, "logits/rejected": -1.053863525390625, "logps/chosen": -360.6000061035156, "logps/rejected": -316.23748779296875, "loss": 0.5357, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03046875074505806, "rewards/margins": 0.900738537311554, "rewards/rejected": -0.8697754144668579, "step": 750 }, { "epoch": 0.2861715146380495, "grad_norm": 68.57401382081026, "learning_rate": 9.285310734463276e-07, "logits/chosen": -1.1609375476837158, "logits/rejected": -1.1505858898162842, "logps/chosen": -357.32501220703125, "logps/rejected": -294.67498779296875, "loss": 0.5288, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03902893140912056, "rewards/margins": 0.873364269733429, "rewards/rejected": -0.9125396609306335, "step": 760 }, { "epoch": 0.2899369293043396, "grad_norm": 56.3542114640027, "learning_rate": 9.275894538606402e-07, "logits/chosen": -1.2700927257537842, "logits/rejected": -1.098974585533142, "logps/chosen": -336.42498779296875, "logps/rejected": -343.57501220703125, "loss": 0.5226, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.03896331787109375, "rewards/margins": 0.9751220941543579, "rewards/rejected": -1.014044165611267, "step": 770 }, { "epoch": 0.2937023439706298, "grad_norm": 46.84295761407464, "learning_rate": 9.266478342749529e-07, "logits/chosen": -1.232666015625, "logits/rejected": -1.074493408203125, "logps/chosen": -330.26251220703125, "logps/rejected": -319.625, "loss": 0.5372, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.02793579176068306, "rewards/margins": 0.997875988483429, "rewards/rejected": -1.0271422863006592, "step": 780 }, { "epoch": 0.2974677586369199, "grad_norm": 44.321248244743785, "learning_rate": 9.257062146892654e-07, "logits/chosen": -1.1725585460662842, "logits/rejected": -1.0822265148162842, "logps/chosen": -327.8374938964844, "logps/rejected": -292.5874938964844, "loss": 0.6103, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.12764587998390198, "rewards/margins": 0.77557373046875, "rewards/rejected": -0.648419201374054, "step": 790 }, { "epoch": 0.30123317330321003, "grad_norm": 83.11480965082718, "learning_rate": 9.247645951035781e-07, "logits/chosen": -1.0137207508087158, "logits/rejected": -0.9642578363418579, "logps/chosen": -343.01251220703125, "logps/rejected": -291.82501220703125, "loss": 0.561, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4403930604457855, "rewards/margins": 0.8373168706893921, "rewards/rejected": -0.39698487520217896, "step": 800 }, { "epoch": 0.30499858796950013, "grad_norm": 69.8215334816492, "learning_rate": 9.238229755178907e-07, "logits/chosen": -1.135839819908142, "logits/rejected": -0.994384765625, "logps/chosen": -319.2124938964844, "logps/rejected": -339.9624938964844, "loss": 0.5681, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.12104034423828125, "rewards/margins": 0.794604480266571, "rewards/rejected": -0.6738036870956421, "step": 810 }, { "epoch": 0.3087640026357903, "grad_norm": 77.4672035218517, "learning_rate": 9.228813559322034e-07, "logits/chosen": -1.0670897960662842, "logits/rejected": -0.97509765625, "logps/chosen": -331.75, "logps/rejected": -304.20001220703125, "loss": 0.553, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.13225403428077698, "rewards/margins": 0.7711181640625, "rewards/rejected": -0.6392486691474915, "step": 820 }, { "epoch": 0.3125294173020804, "grad_norm": 40.082595738995764, "learning_rate": 9.21939736346516e-07, "logits/chosen": -1.413964867591858, "logits/rejected": -1.288964867591858, "logps/chosen": -282.40625, "logps/rejected": -279.625, "loss": 0.6089, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.29186707735061646, "rewards/margins": 0.6807861328125, "rewards/rejected": -0.3889404237270355, "step": 830 }, { "epoch": 0.3162948319683705, "grad_norm": 64.0942124437234, "learning_rate": 9.209981167608286e-07, "logits/chosen": -1.2793700695037842, "logits/rejected": -1.150976538658142, "logps/chosen": -365.5249938964844, "logps/rejected": -369.17498779296875, "loss": 0.5762, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.5195220708847046, "rewards/margins": 0.7163330316543579, "rewards/rejected": -0.19708862900733948, "step": 840 }, { "epoch": 0.32006024663466065, "grad_norm": 57.702896809826846, "learning_rate": 9.200564971751413e-07, "logits/chosen": -1.1767089366912842, "logits/rejected": -1.0697753429412842, "logps/chosen": -315.7875061035156, "logps/rejected": -278.67498779296875, "loss": 0.4948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.4289093017578125, "rewards/margins": 0.836560070514679, "rewards/rejected": -0.40764158964157104, "step": 850 }, { "epoch": 0.32382566130095075, "grad_norm": 91.90580116961952, "learning_rate": 9.191148775894538e-07, "logits/chosen": -1.209716796875, "logits/rejected": -1.114892601966858, "logps/chosen": -391.76251220703125, "logps/rejected": -347.54998779296875, "loss": 0.535, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15660400688648224, "rewards/margins": 0.857495129108429, "rewards/rejected": -0.700671374797821, "step": 860 }, { "epoch": 0.3275910759672409, "grad_norm": 67.42423024731741, "learning_rate": 9.181732580037665e-07, "logits/chosen": -1.237451195716858, "logits/rejected": -1.179931640625, "logps/chosen": -305.4375, "logps/rejected": -282.75, "loss": 0.6587, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.09675903618335724, "rewards/margins": 0.5741943120956421, "rewards/rejected": -0.47753602266311646, "step": 870 }, { "epoch": 0.331356490633531, "grad_norm": 53.14310138901821, "learning_rate": 9.17231638418079e-07, "logits/chosen": -1.246826171875, "logits/rejected": -1.11358642578125, "logps/chosen": -323.3999938964844, "logps/rejected": -310.125, "loss": 0.5252, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.03656310960650444, "rewards/margins": 0.8984619379043579, "rewards/rejected": -0.935498058795929, "step": 880 }, { "epoch": 0.33512190529982117, "grad_norm": 44.257760113626965, "learning_rate": 9.162900188323917e-07, "logits/chosen": -1.160589575767517, "logits/rejected": -1.0225098133087158, "logps/chosen": -319.875, "logps/rejected": -276.6000061035156, "loss": 0.5908, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.19160766899585724, "rewards/margins": 0.6745666265487671, "rewards/rejected": -0.8660644292831421, "step": 890 }, { "epoch": 0.33888731996611127, "grad_norm": 52.821806347200486, "learning_rate": 9.153483992467042e-07, "logits/chosen": -1.1565430164337158, "logits/rejected": -1.154882788658142, "logps/chosen": -356.17498779296875, "logps/rejected": -325.92498779296875, "loss": 0.4991, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.15374755859375, "rewards/margins": 0.800030529499054, "rewards/rejected": -0.6458495855331421, "step": 900 }, { "epoch": 0.34265273463240137, "grad_norm": 52.99855769914897, "learning_rate": 9.144067796610169e-07, "logits/chosen": -1.186010718345642, "logits/rejected": -1.118798851966858, "logps/chosen": -341.6625061035156, "logps/rejected": -322.88751220703125, "loss": 0.5083, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07716064155101776, "rewards/margins": 0.915478527545929, "rewards/rejected": -0.9923340082168579, "step": 910 }, { "epoch": 0.3464181492986915, "grad_norm": 42.75289338127359, "learning_rate": 9.134651600753295e-07, "logits/chosen": -1.2173340320587158, "logits/rejected": -1.1384398937225342, "logps/chosen": -350.6499938964844, "logps/rejected": -318.45001220703125, "loss": 0.5187, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0019500732887536287, "rewards/margins": 0.892578125, "rewards/rejected": -0.8902832269668579, "step": 920 }, { "epoch": 0.3501835639649816, "grad_norm": 53.765417882393365, "learning_rate": 9.125235404896422e-07, "logits/chosen": -1.2715332508087158, "logits/rejected": -1.1755492687225342, "logps/chosen": -335.0625, "logps/rejected": -298.8125, "loss": 0.5351, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.40308839082717896, "rewards/margins": 0.745532214641571, "rewards/rejected": -1.148193359375, "step": 930 }, { "epoch": 0.3539489786312718, "grad_norm": 62.64140287069352, "learning_rate": 9.115819209039548e-07, "logits/chosen": -1.2268555164337158, "logits/rejected": -1.1227538585662842, "logps/chosen": -344.7749938964844, "logps/rejected": -319.9750061035156, "loss": 0.6157, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6789184808731079, "rewards/margins": 0.7367919683456421, "rewards/rejected": -1.416357398033142, "step": 940 }, { "epoch": 0.3577143932975619, "grad_norm": 44.27868386490319, "learning_rate": 9.106403013182674e-07, "logits/chosen": -1.113378882408142, "logits/rejected": -1.01953125, "logps/chosen": -368.11248779296875, "logps/rejected": -359.8374938964844, "loss": 0.5628, "rewards/accuracies": 0.6875, "rewards/chosen": -0.621600329875946, "rewards/margins": 0.9186035394668579, "rewards/rejected": -1.539770483970642, "step": 950 }, { "epoch": 0.36147980796385204, "grad_norm": 54.94704181079034, "learning_rate": 9.0969868173258e-07, "logits/chosen": -1.14794921875, "logits/rejected": -1.0332520008087158, "logps/chosen": -325.36248779296875, "logps/rejected": -305.45001220703125, "loss": 0.495, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.584454357624054, "rewards/margins": 0.9576690793037415, "rewards/rejected": -1.5427734851837158, "step": 960 }, { "epoch": 0.36524522263014214, "grad_norm": 60.7433867611468, "learning_rate": 9.087570621468926e-07, "logits/chosen": -1.1739013195037842, "logits/rejected": -1.1054198741912842, "logps/chosen": -332.3374938964844, "logps/rejected": -323.5249938964844, "loss": 0.5339, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4964965879917145, "rewards/margins": 0.954418957233429, "rewards/rejected": -1.4517090320587158, "step": 970 }, { "epoch": 0.36901063729643224, "grad_norm": 48.48272925277439, "learning_rate": 9.078154425612052e-07, "logits/chosen": -1.132470726966858, "logits/rejected": -0.9149414300918579, "logps/chosen": -364.75, "logps/rejected": -352.0, "loss": 0.5106, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5073486566543579, "rewards/margins": 1.138037085533142, "rewards/rejected": -1.64599609375, "step": 980 }, { "epoch": 0.3727760519627224, "grad_norm": 79.70126046423111, "learning_rate": 9.068738229755179e-07, "logits/chosen": -1.1689453125, "logits/rejected": -0.9771728515625, "logps/chosen": -378.75, "logps/rejected": -349.5249938964844, "loss": 0.5652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7266174554824829, "rewards/margins": 0.843005359172821, "rewards/rejected": -1.569238305091858, "step": 990 }, { "epoch": 0.3765414666290125, "grad_norm": 40.25479750627583, "learning_rate": 9.059322033898304e-07, "logits/chosen": -1.224853515625, "logits/rejected": -1.225976586341858, "logps/chosen": -358.5375061035156, "logps/rejected": -294.875, "loss": 0.5163, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.39445191621780396, "rewards/margins": 0.8917297124862671, "rewards/rejected": -1.2864258289337158, "step": 1000 }, { "epoch": 0.38030688129530266, "grad_norm": 53.24232996530205, "learning_rate": 9.049905838041431e-07, "logits/chosen": -1.3005859851837158, "logits/rejected": -1.1137206554412842, "logps/chosen": -338.7250061035156, "logps/rejected": -315.70001220703125, "loss": 0.5121, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.510607898235321, "rewards/margins": 0.910998523235321, "rewards/rejected": -1.4208984375, "step": 1010 }, { "epoch": 0.38407229596159276, "grad_norm": 45.70650742007789, "learning_rate": 9.040489642184556e-07, "logits/chosen": -1.243066430091858, "logits/rejected": -1.063574194908142, "logps/chosen": -315.29998779296875, "logps/rejected": -311.76251220703125, "loss": 0.5461, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.45687562227249146, "rewards/margins": 0.892260730266571, "rewards/rejected": -1.348358154296875, "step": 1020 }, { "epoch": 0.3878377106278829, "grad_norm": 53.181762981896, "learning_rate": 9.031073446327683e-07, "logits/chosen": -1.198876976966858, "logits/rejected": -1.111572265625, "logps/chosen": -348.5625, "logps/rejected": -283.23748779296875, "loss": 0.5269, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11013183742761612, "rewards/margins": 0.9209228754043579, "rewards/rejected": -1.0304443836212158, "step": 1030 }, { "epoch": 0.391603125294173, "grad_norm": 54.01230866856104, "learning_rate": 9.021657250470808e-07, "logits/chosen": -1.2483398914337158, "logits/rejected": -1.050195336341858, "logps/chosen": -307.7250061035156, "logps/rejected": -293.67498779296875, "loss": 0.5813, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01894378662109375, "rewards/margins": 0.776684582233429, "rewards/rejected": -0.756787121295929, "step": 1040 }, { "epoch": 0.3953685399604632, "grad_norm": 44.051066253856746, "learning_rate": 9.012241054613936e-07, "logits/chosen": -1.3581054210662842, "logits/rejected": -1.1930663585662842, "logps/chosen": -271.7124938964844, "logps/rejected": -270.57501220703125, "loss": 0.4725, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.09050293266773224, "rewards/margins": 1.0639159679412842, "rewards/rejected": -0.9722564816474915, "step": 1050 }, { "epoch": 0.3991339546267533, "grad_norm": 51.945445309998426, "learning_rate": 9.002824858757063e-07, "logits/chosen": -1.219335913658142, "logits/rejected": -1.212744116783142, "logps/chosen": -344.3999938964844, "logps/rejected": -262.20001220703125, "loss": 0.5332, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.12247314304113388, "rewards/margins": 0.942626953125, "rewards/rejected": -0.8203125, "step": 1060 }, { "epoch": 0.4028993692930434, "grad_norm": 65.84533736905239, "learning_rate": 8.993408662900188e-07, "logits/chosen": -1.3394043445587158, "logits/rejected": -1.3256347179412842, "logps/chosen": -316.9750061035156, "logps/rejected": -305.38751220703125, "loss": 0.5659, "rewards/accuracies": 0.71875, "rewards/chosen": 0.201385498046875, "rewards/margins": 0.961621105670929, "rewards/rejected": -0.7594940066337585, "step": 1070 }, { "epoch": 0.40666478395933353, "grad_norm": 68.82083604885162, "learning_rate": 8.983992467043315e-07, "logits/chosen": -1.2788574695587158, "logits/rejected": -1.1342041492462158, "logps/chosen": -321.4375, "logps/rejected": -329.3999938964844, "loss": 0.5312, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.29741209745407104, "rewards/margins": 0.965100109577179, "rewards/rejected": -0.6680389642715454, "step": 1080 }, { "epoch": 0.41043019862562363, "grad_norm": 60.35094883595223, "learning_rate": 8.97457627118644e-07, "logits/chosen": -1.171057105064392, "logits/rejected": -1.120019555091858, "logps/chosen": -373.6000061035156, "logps/rejected": -312.5249938964844, "loss": 0.5642, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5713866949081421, "rewards/margins": 0.778076171875, "rewards/rejected": -0.20686034858226776, "step": 1090 }, { "epoch": 0.4141956132919138, "grad_norm": 55.645292098096654, "learning_rate": 8.965160075329567e-07, "logits/chosen": -1.3798828125, "logits/rejected": -1.19903564453125, "logps/chosen": -321.7437438964844, "logps/rejected": -315.07501220703125, "loss": 0.488, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.520458996295929, "rewards/margins": 0.9463745355606079, "rewards/rejected": -0.426034539937973, "step": 1100 }, { "epoch": 0.4179610279582039, "grad_norm": 68.2917217711993, "learning_rate": 8.955743879472692e-07, "logits/chosen": -1.2252686023712158, "logits/rejected": -1.167871117591858, "logps/chosen": -331.4125061035156, "logps/rejected": -338.57501220703125, "loss": 0.4924, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.500353991985321, "rewards/margins": 1.08111572265625, "rewards/rejected": -0.5808349847793579, "step": 1110 }, { "epoch": 0.42172644262449405, "grad_norm": 32.37202706577371, "learning_rate": 8.946327683615819e-07, "logits/chosen": -1.168066382408142, "logits/rejected": -0.969531238079071, "logps/chosen": -366.25, "logps/rejected": -306.6625061035156, "loss": 0.6191, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.22347411513328552, "rewards/margins": 0.817431628704071, "rewards/rejected": -0.594348132610321, "step": 1120 }, { "epoch": 0.42549185729078415, "grad_norm": 69.01716903892432, "learning_rate": 8.936911487758944e-07, "logits/chosen": -1.268041968345642, "logits/rejected": -1.131646752357483, "logps/chosen": -352.17498779296875, "logps/rejected": -306.1499938964844, "loss": 0.4941, "rewards/accuracies": 0.75, "rewards/chosen": 0.21611633896827698, "rewards/margins": 0.916961669921875, "rewards/rejected": -0.7012237310409546, "step": 1130 }, { "epoch": 0.42925727195707425, "grad_norm": 59.13621299403754, "learning_rate": 8.927495291902071e-07, "logits/chosen": -1.244970679283142, "logits/rejected": -1.235449194908142, "logps/chosen": -302.54998779296875, "logps/rejected": -278.6875, "loss": 0.5869, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11154785007238388, "rewards/margins": 0.833819568157196, "rewards/rejected": -0.722485363483429, "step": 1140 }, { "epoch": 0.4330226866233644, "grad_norm": 57.69410600972973, "learning_rate": 8.918079096045197e-07, "logits/chosen": -1.3175780773162842, "logits/rejected": -1.1737792491912842, "logps/chosen": -300.25, "logps/rejected": -275.86248779296875, "loss": 0.5084, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.3555358946323395, "rewards/margins": 0.9267212152481079, "rewards/rejected": -0.5705322027206421, "step": 1150 }, { "epoch": 0.4367881012896545, "grad_norm": 69.3755643217806, "learning_rate": 8.908662900188324e-07, "logits/chosen": -1.2648437023162842, "logits/rejected": -1.099853515625, "logps/chosen": -326.04998779296875, "logps/rejected": -317.36248779296875, "loss": 0.57, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3463806211948395, "rewards/margins": 0.8364623785018921, "rewards/rejected": -0.4898437559604645, "step": 1160 }, { "epoch": 0.44055351595594466, "grad_norm": 59.80678045776294, "learning_rate": 8.89924670433145e-07, "logits/chosen": -1.245019555091858, "logits/rejected": -1.0145752429962158, "logps/chosen": -323.75, "logps/rejected": -299.7124938964844, "loss": 0.5683, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.62945556640625, "rewards/margins": 0.757495105266571, "rewards/rejected": -0.12870177626609802, "step": 1170 }, { "epoch": 0.44431893062223476, "grad_norm": 52.06196488434708, "learning_rate": 8.889830508474576e-07, "logits/chosen": -1.1889556646347046, "logits/rejected": -0.9948974847793579, "logps/chosen": -308.98748779296875, "logps/rejected": -304.1625061035156, "loss": 0.5223, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.6082397699356079, "rewards/margins": 0.885498046875, "rewards/rejected": -0.27752685546875, "step": 1180 }, { "epoch": 0.4480843452885249, "grad_norm": 41.59033573189127, "learning_rate": 8.880414312617702e-07, "logits/chosen": -1.335205078125, "logits/rejected": -1.131982445716858, "logps/chosen": -354.1625061035156, "logps/rejected": -316.25, "loss": 0.5276, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.5937865972518921, "rewards/margins": 0.905010998249054, "rewards/rejected": -0.3113769590854645, "step": 1190 }, { "epoch": 0.451849759954815, "grad_norm": 49.351831537204454, "learning_rate": 8.870998116760829e-07, "logits/chosen": -1.166357398033142, "logits/rejected": -1.07122802734375, "logps/chosen": -351.7250061035156, "logps/rejected": -296.3999938964844, "loss": 0.5102, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.4165802001953125, "rewards/margins": 0.937573254108429, "rewards/rejected": -0.520397961139679, "step": 1200 }, { "epoch": 0.4556151746211051, "grad_norm": 50.292265057505105, "learning_rate": 8.861581920903954e-07, "logits/chosen": -1.3796875476837158, "logits/rejected": -1.352294921875, "logps/chosen": -327.75, "logps/rejected": -299.8125, "loss": 0.6109, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09303741157054901, "rewards/margins": 0.778454601764679, "rewards/rejected": -0.6853393316268921, "step": 1210 }, { "epoch": 0.4593805892873953, "grad_norm": 62.28815784421993, "learning_rate": 8.852165725047081e-07, "logits/chosen": -1.28515625, "logits/rejected": -1.1781737804412842, "logps/chosen": -338.45623779296875, "logps/rejected": -294.32501220703125, "loss": 0.477, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12276306003332138, "rewards/margins": 1.1885254383087158, "rewards/rejected": -1.0653808116912842, "step": 1220 }, { "epoch": 0.4631460039536854, "grad_norm": 63.71738637218158, "learning_rate": 8.842749529190206e-07, "logits/chosen": -1.187353491783142, "logits/rejected": -1.1921875476837158, "logps/chosen": -355.67498779296875, "logps/rejected": -314.48748779296875, "loss": 0.6094, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13222046196460724, "rewards/margins": 0.913281261920929, "rewards/rejected": -1.044946312904358, "step": 1230 }, { "epoch": 0.46691141861997554, "grad_norm": 68.40385442172993, "learning_rate": 8.833333333333333e-07, "logits/chosen": -1.143579125404358, "logits/rejected": -1.044677734375, "logps/chosen": -349.1000061035156, "logps/rejected": -327.79998779296875, "loss": 0.578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15977172553539276, "rewards/margins": 0.865771472454071, "rewards/rejected": -1.025390625, "step": 1240 }, { "epoch": 0.47067683328626564, "grad_norm": 72.43449839756417, "learning_rate": 8.823917137476458e-07, "logits/chosen": -1.298583984375, "logits/rejected": -1.186181664466858, "logps/chosen": -375.26251220703125, "logps/rejected": -332.6499938964844, "loss": 0.5664, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13370056450366974, "rewards/margins": 0.8942626714706421, "rewards/rejected": -1.0278809070587158, "step": 1250 }, { "epoch": 0.4744422479525558, "grad_norm": 91.79601983295129, "learning_rate": 8.814500941619585e-07, "logits/chosen": -1.2264893054962158, "logits/rejected": -1.1642577648162842, "logps/chosen": -384.54998779296875, "logps/rejected": -333.1000061035156, "loss": 0.5872, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.13266602158546448, "rewards/margins": 0.80322265625, "rewards/rejected": -0.9355102777481079, "step": 1260 }, { "epoch": 0.4782076626188459, "grad_norm": 49.297948536372175, "learning_rate": 8.805084745762711e-07, "logits/chosen": -1.1335937976837158, "logits/rejected": -1.009057641029358, "logps/chosen": -286.5625, "logps/rejected": -302.8125, "loss": 0.5565, "rewards/accuracies": 0.6875, "rewards/chosen": -0.08525390923023224, "rewards/margins": 0.715710461139679, "rewards/rejected": -0.80029296875, "step": 1270 }, { "epoch": 0.481973077285136, "grad_norm": 57.121214252405, "learning_rate": 8.795668549905838e-07, "logits/chosen": -1.15966796875, "logits/rejected": -0.98663330078125, "logps/chosen": -345.5249938964844, "logps/rejected": -321.75, "loss": 0.5808, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.01614074781537056, "rewards/margins": 0.73394775390625, "rewards/rejected": -0.749987781047821, "step": 1280 }, { "epoch": 0.48573849195142615, "grad_norm": 67.10476892639106, "learning_rate": 8.786252354048965e-07, "logits/chosen": -1.355566382408142, "logits/rejected": -1.159082055091858, "logps/chosen": -319.2250061035156, "logps/rejected": -299.67498779296875, "loss": 0.5314, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.03278503566980362, "rewards/margins": 0.7641235589981079, "rewards/rejected": -0.731249988079071, "step": 1290 }, { "epoch": 0.48950390661771626, "grad_norm": 56.53982601807659, "learning_rate": 8.77683615819209e-07, "logits/chosen": -1.25537109375, "logits/rejected": -1.152978539466858, "logps/chosen": -309.48748779296875, "logps/rejected": -289.36248779296875, "loss": 0.5159, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.04414062574505806, "rewards/margins": 0.8474365472793579, "rewards/rejected": -0.891070544719696, "step": 1300 }, { "epoch": 0.4932693212840064, "grad_norm": 75.5266688499085, "learning_rate": 8.767419962335217e-07, "logits/chosen": -1.068212866783142, "logits/rejected": -0.9343932867050171, "logps/chosen": -359.63751220703125, "logps/rejected": -342.4624938964844, "loss": 0.6505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2824249267578125, "rewards/margins": 0.764086902141571, "rewards/rejected": -1.047094702720642, "step": 1310 }, { "epoch": 0.4970347359502965, "grad_norm": 57.142261918756795, "learning_rate": 8.758003766478342e-07, "logits/chosen": -1.118554711341858, "logits/rejected": -1.006317138671875, "logps/chosen": -332.21875, "logps/rejected": -334.2124938964844, "loss": 0.5246, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14900512993335724, "rewards/margins": 0.9176269769668579, "rewards/rejected": -1.0669677257537842, "step": 1320 }, { "epoch": 0.5008001506165867, "grad_norm": 53.07438033347404, "learning_rate": 8.748587570621469e-07, "logits/chosen": -1.160302758216858, "logits/rejected": -1.042382836341858, "logps/chosen": -360.9375, "logps/rejected": -312.2749938964844, "loss": 0.5169, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.13225555419921875, "rewards/margins": 0.9079834222793579, "rewards/rejected": -1.0405151844024658, "step": 1330 }, { "epoch": 0.5045655652828768, "grad_norm": 61.120352100447214, "learning_rate": 8.739171374764594e-07, "logits/chosen": -1.1972167491912842, "logits/rejected": -0.9951171875, "logps/chosen": -299.2124938964844, "logps/rejected": -268.04998779296875, "loss": 0.5567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.20554199814796448, "rewards/margins": 0.8849242925643921, "rewards/rejected": -1.0901367664337158, "step": 1340 }, { "epoch": 0.5083309799491669, "grad_norm": 48.615297224379255, "learning_rate": 8.729755178907721e-07, "logits/chosen": -1.308984398841858, "logits/rejected": -1.1498534679412842, "logps/chosen": -346.07501220703125, "logps/rejected": -317.54998779296875, "loss": 0.5066, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.02910766564309597, "rewards/margins": 0.9386841058731079, "rewards/rejected": -0.967663586139679, "step": 1350 }, { "epoch": 0.512096394615457, "grad_norm": 49.08555044792431, "learning_rate": 8.720338983050847e-07, "logits/chosen": -1.329199194908142, "logits/rejected": -1.1901123523712158, "logps/chosen": -318.9125061035156, "logps/rejected": -318.3374938964844, "loss": 0.4659, "rewards/accuracies": 0.75, "rewards/chosen": -0.005679321475327015, "rewards/margins": 1.1212646961212158, "rewards/rejected": -1.127648949623108, "step": 1360 }, { "epoch": 0.5158618092817472, "grad_norm": 46.78922748149193, "learning_rate": 8.710922787193973e-07, "logits/chosen": -1.19580078125, "logits/rejected": -1.12744140625, "logps/chosen": -283.7749938964844, "logps/rejected": -287.7875061035156, "loss": 0.5032, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.21900025010108948, "rewards/margins": 0.9527832269668579, "rewards/rejected": -1.1717407703399658, "step": 1370 }, { "epoch": 0.5196272239480373, "grad_norm": 85.60409488767218, "learning_rate": 8.7015065913371e-07, "logits/chosen": -1.300439476966858, "logits/rejected": -1.1732361316680908, "logps/chosen": -362.0, "logps/rejected": -324.6000061035156, "loss": 0.5257, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.192474365234375, "rewards/margins": 0.9425414800643921, "rewards/rejected": -1.13507080078125, "step": 1380 }, { "epoch": 0.5233926386143274, "grad_norm": 75.54128957908766, "learning_rate": 8.692090395480226e-07, "logits/chosen": -1.2165038585662842, "logits/rejected": -1.0014770030975342, "logps/chosen": -316.3500061035156, "logps/rejected": -324.92498779296875, "loss": 0.5546, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.49151611328125, "rewards/margins": 0.81646728515625, "rewards/rejected": -1.308068871498108, "step": 1390 }, { "epoch": 0.5271580532806175, "grad_norm": 72.82218632239822, "learning_rate": 8.682674199623352e-07, "logits/chosen": -1.22796630859375, "logits/rejected": -1.170751929283142, "logps/chosen": -368.5062561035156, "logps/rejected": -295.10626220703125, "loss": 0.5576, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.285400390625, "rewards/margins": 0.865710437297821, "rewards/rejected": -1.150781273841858, "step": 1400 }, { "epoch": 0.5309234679469077, "grad_norm": 59.55348887077688, "learning_rate": 8.673258003766478e-07, "logits/chosen": -1.2433593273162842, "logits/rejected": -1.0966308116912842, "logps/chosen": -334.32501220703125, "logps/rejected": -312.6000061035156, "loss": 0.4932, "rewards/accuracies": 0.75, "rewards/chosen": -0.659655749797821, "rewards/margins": 1.0566527843475342, "rewards/rejected": -1.7148253917694092, "step": 1410 }, { "epoch": 0.5346888826131978, "grad_norm": 46.573001773788135, "learning_rate": 8.663841807909604e-07, "logits/chosen": -1.257177710533142, "logits/rejected": -1.092193603515625, "logps/chosen": -332.25, "logps/rejected": -322.54998779296875, "loss": 0.5378, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.889251708984375, "rewards/margins": 1.0422484874725342, "rewards/rejected": -1.93212890625, "step": 1420 }, { "epoch": 0.5384542972794879, "grad_norm": 58.94004710976964, "learning_rate": 8.654425612052731e-07, "logits/chosen": -1.2023437023162842, "logits/rejected": -1.036865234375, "logps/chosen": -318.3374938964844, "logps/rejected": -325.3125, "loss": 0.5982, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.857983410358429, "rewards/margins": 0.8272705078125, "rewards/rejected": -1.68505859375, "step": 1430 }, { "epoch": 0.542219711945778, "grad_norm": 51.331951524988256, "learning_rate": 8.645009416195856e-07, "logits/chosen": -1.150634765625, "logits/rejected": -1.084326148033142, "logps/chosen": -309.0625, "logps/rejected": -306.54998779296875, "loss": 0.5832, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2645019590854645, "rewards/margins": 0.7969329953193665, "rewards/rejected": -1.0619049072265625, "step": 1440 }, { "epoch": 0.5459851266120681, "grad_norm": 47.787870962834, "learning_rate": 8.635593220338983e-07, "logits/chosen": -1.16015625, "logits/rejected": -1.052587866783142, "logps/chosen": -310.2124938964844, "logps/rejected": -294.48748779296875, "loss": 0.5111, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.06785278022289276, "rewards/margins": 1.007470726966858, "rewards/rejected": -1.0754883289337158, "step": 1450 }, { "epoch": 0.5497505412783583, "grad_norm": 87.75402170679877, "learning_rate": 8.626177024482108e-07, "logits/chosen": -1.0187256336212158, "logits/rejected": -1.0277831554412842, "logps/chosen": -365.9750061035156, "logps/rejected": -311.3999938964844, "loss": 0.5553, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15187987685203552, "rewards/margins": 0.896533191204071, "rewards/rejected": -1.048925757408142, "step": 1460 }, { "epoch": 0.5535159559446484, "grad_norm": 72.88875391590557, "learning_rate": 8.616760828625235e-07, "logits/chosen": -1.0178344249725342, "logits/rejected": -0.93585205078125, "logps/chosen": -380.6000061035156, "logps/rejected": -354.75, "loss": 0.5158, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.162139892578125, "rewards/margins": 0.9379333257675171, "rewards/rejected": -1.100067138671875, "step": 1470 }, { "epoch": 0.5572813706109385, "grad_norm": 66.83559764426029, "learning_rate": 8.60734463276836e-07, "logits/chosen": -1.265222191810608, "logits/rejected": -1.0319092273712158, "logps/chosen": -330.67498779296875, "logps/rejected": -298.1499938964844, "loss": 0.5058, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3393310606479645, "rewards/margins": 1.1429932117462158, "rewards/rejected": -1.48162841796875, "step": 1480 }, { "epoch": 0.5610467852772286, "grad_norm": 53.63255027081545, "learning_rate": 8.597928436911487e-07, "logits/chosen": -1.017187476158142, "logits/rejected": -0.917492687702179, "logps/chosen": -368.82501220703125, "logps/rejected": -342.9937438964844, "loss": 0.4973, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.14740295708179474, "rewards/margins": 1.0641601085662842, "rewards/rejected": -1.2119140625, "step": 1490 }, { "epoch": 0.5648121999435188, "grad_norm": 47.140969919856964, "learning_rate": 8.588512241054614e-07, "logits/chosen": -1.206787109375, "logits/rejected": -0.996826171875, "logps/chosen": -277.3125, "logps/rejected": -307.8500061035156, "loss": 0.5638, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005145263858139515, "rewards/margins": 0.7794860601425171, "rewards/rejected": -0.774273693561554, "step": 1500 }, { "epoch": 0.5685776146098089, "grad_norm": 64.21674139577348, "learning_rate": 8.57909604519774e-07, "logits/chosen": -1.21875, "logits/rejected": -1.0322387218475342, "logps/chosen": -359.7124938964844, "logps/rejected": -318.29998779296875, "loss": 0.471, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.0999298095703125, "rewards/margins": 1.1138916015625, "rewards/rejected": -1.013671875, "step": 1510 }, { "epoch": 0.572343029276099, "grad_norm": 48.66979183048351, "learning_rate": 8.569679849340866e-07, "logits/chosen": -1.208642601966858, "logits/rejected": -1.125146508216858, "logps/chosen": -335.48748779296875, "logps/rejected": -329.7250061035156, "loss": 0.4889, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.08710937201976776, "rewards/margins": 0.9697510004043579, "rewards/rejected": -1.0573852062225342, "step": 1520 }, { "epoch": 0.5761084439423891, "grad_norm": 53.89202084397055, "learning_rate": 8.560263653483992e-07, "logits/chosen": -1.1488769054412842, "logits/rejected": -1.114990234375, "logps/chosen": -343.0874938964844, "logps/rejected": -325.4375, "loss": 0.4883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.28901368379592896, "rewards/margins": 1.0382201671600342, "rewards/rejected": -1.327734351158142, "step": 1530 }, { "epoch": 0.5798738586086792, "grad_norm": 43.76534481525038, "learning_rate": 8.550847457627118e-07, "logits/chosen": -1.171240210533142, "logits/rejected": -0.98876953125, "logps/chosen": -331.4125061035156, "logps/rejected": -296.70001220703125, "loss": 0.4376, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.3695617616176605, "rewards/margins": 1.239892601966858, "rewards/rejected": -1.6101562976837158, "step": 1540 }, { "epoch": 0.5836392732749695, "grad_norm": 71.36917983477866, "learning_rate": 8.541431261770244e-07, "logits/chosen": -1.226660132408142, "logits/rejected": -1.141088843345642, "logps/chosen": -367.1000061035156, "logps/rejected": -323.67498779296875, "loss": 0.5199, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.49100953340530396, "rewards/margins": 1.089575171470642, "rewards/rejected": -1.580786108970642, "step": 1550 }, { "epoch": 0.5874046879412596, "grad_norm": 59.53685581870847, "learning_rate": 8.532015065913371e-07, "logits/chosen": -1.0729491710662842, "logits/rejected": -1.120690941810608, "logps/chosen": -367.0625, "logps/rejected": -301.3999938964844, "loss": 0.5997, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.48888856172561646, "rewards/margins": 0.91650390625, "rewards/rejected": -1.4041869640350342, "step": 1560 }, { "epoch": 0.5911701026075497, "grad_norm": 49.541854576874634, "learning_rate": 8.522598870056497e-07, "logits/chosen": -1.111474633216858, "logits/rejected": -0.9537597894668579, "logps/chosen": -338.9125061035156, "logps/rejected": -308.95001220703125, "loss": 0.52, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.10115966945886612, "rewards/margins": 0.920422375202179, "rewards/rejected": -1.02203369140625, "step": 1570 }, { "epoch": 0.5949355172738398, "grad_norm": 53.925396419368, "learning_rate": 8.513182674199623e-07, "logits/chosen": -1.1218750476837158, "logits/rejected": -1.026757836341858, "logps/chosen": -302.5874938964844, "logps/rejected": -269.7124938964844, "loss": 0.5613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.12767943739891052, "rewards/margins": 0.8435424566268921, "rewards/rejected": -0.7152557373046875, "step": 1580 }, { "epoch": 0.5987009319401299, "grad_norm": 80.67775875064109, "learning_rate": 8.503766478342749e-07, "logits/chosen": -1.127050757408142, "logits/rejected": -0.9379333257675171, "logps/chosen": -353.0375061035156, "logps/rejected": -321.42498779296875, "loss": 0.5713, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08250121772289276, "rewards/margins": 0.7477782964706421, "rewards/rejected": -0.6650635004043579, "step": 1590 }, { "epoch": 0.6024663466064201, "grad_norm": 79.34082880099425, "learning_rate": 8.494350282485875e-07, "logits/chosen": -1.117040991783142, "logits/rejected": -0.994091808795929, "logps/chosen": -332.57501220703125, "logps/rejected": -329.7875061035156, "loss": 0.5021, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.01601562462747097, "rewards/margins": 1.08660888671875, "rewards/rejected": -1.1024901866912842, "step": 1600 }, { "epoch": 0.6062317612727102, "grad_norm": 52.873717746184695, "learning_rate": 8.484934086629002e-07, "logits/chosen": -1.146386742591858, "logits/rejected": -1.130126953125, "logps/chosen": -345.1000061035156, "logps/rejected": -273.0375061035156, "loss": 0.4686, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.12197265774011612, "rewards/margins": 1.2021973133087158, "rewards/rejected": -1.080816626548767, "step": 1610 }, { "epoch": 0.6099971759390003, "grad_norm": 53.67242998301258, "learning_rate": 8.475517890772128e-07, "logits/chosen": -1.1964843273162842, "logits/rejected": -0.8931884765625, "logps/chosen": -294.4750061035156, "logps/rejected": -280.375, "loss": 0.4768, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.13176879286766052, "rewards/margins": 1.1825439929962158, "rewards/rejected": -1.313873291015625, "step": 1620 }, { "epoch": 0.6137625906052904, "grad_norm": 51.725216316070984, "learning_rate": 8.466101694915254e-07, "logits/chosen": -1.0912597179412842, "logits/rejected": -1.00048828125, "logps/chosen": -328.3999938964844, "logps/rejected": -317.625, "loss": 0.5418, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6514251828193665, "rewards/margins": 1.005639672279358, "rewards/rejected": -1.658483862876892, "step": 1630 }, { "epoch": 0.6175280052715806, "grad_norm": 51.48227316347932, "learning_rate": 8.456685499058381e-07, "logits/chosen": -1.139550805091858, "logits/rejected": -1.0480468273162842, "logps/chosen": -374.70001220703125, "logps/rejected": -337.4750061035156, "loss": 0.4986, "rewards/accuracies": 0.75, "rewards/chosen": -0.8118835687637329, "rewards/margins": 1.097387671470642, "rewards/rejected": -1.90869140625, "step": 1640 }, { "epoch": 0.6212934199378707, "grad_norm": 98.67410251036817, "learning_rate": 8.447269303201506e-07, "logits/chosen": -1.2092773914337158, "logits/rejected": -0.9532715082168579, "logps/chosen": -360.5375061035156, "logps/rejected": -338.07501220703125, "loss": 0.6032, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.361914038658142, "rewards/margins": 0.990374743938446, "rewards/rejected": -2.3526368141174316, "step": 1650 }, { "epoch": 0.6250588346041608, "grad_norm": 61.18490283405885, "learning_rate": 8.437853107344633e-07, "logits/chosen": -1.2089111804962158, "logits/rejected": -1.1356933116912842, "logps/chosen": -355.29998779296875, "logps/rejected": -328.2250061035156, "loss": 0.6009, "rewards/accuracies": 0.6875, "rewards/chosen": -1.387841820716858, "rewards/margins": 0.8280029296875, "rewards/rejected": -2.2173829078674316, "step": 1660 }, { "epoch": 0.6288242492704509, "grad_norm": 58.22247605272227, "learning_rate": 8.428436911487758e-07, "logits/chosen": -1.227148413658142, "logits/rejected": -1.0777587890625, "logps/chosen": -334.25, "logps/rejected": -318.36248779296875, "loss": 0.4952, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.193701148033142, "rewards/margins": 0.946704089641571, "rewards/rejected": -2.1412110328674316, "step": 1670 }, { "epoch": 0.632589663936741, "grad_norm": 66.64271140272947, "learning_rate": 8.419020715630885e-07, "logits/chosen": -1.345117211341858, "logits/rejected": -1.2078125476837158, "logps/chosen": -308.25, "logps/rejected": -303.1000061035156, "loss": 0.5426, "rewards/accuracies": 0.71875, "rewards/chosen": -1.049652099609375, "rewards/margins": 0.9511474370956421, "rewards/rejected": -2.000537157058716, "step": 1680 }, { "epoch": 0.6363550786030312, "grad_norm": 35.925060706537835, "learning_rate": 8.40960451977401e-07, "logits/chosen": -1.1923828125, "logits/rejected": -1.0118896961212158, "logps/chosen": -327.6000061035156, "logps/rejected": -341.6499938964844, "loss": 0.4757, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7756103277206421, "rewards/margins": 1.245019555091858, "rewards/rejected": -2.021240234375, "step": 1690 }, { "epoch": 0.6401204932693213, "grad_norm": 67.32493770503069, "learning_rate": 8.400188323917137e-07, "logits/chosen": -1.2946288585662842, "logits/rejected": -1.228515625, "logps/chosen": -355.6000061035156, "logps/rejected": -334.0625, "loss": 0.5662, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7938873171806335, "rewards/margins": 0.9300781488418579, "rewards/rejected": -1.7236328125, "step": 1700 }, { "epoch": 0.6438859079356114, "grad_norm": 70.00015246642344, "learning_rate": 8.390772128060262e-07, "logits/chosen": -1.193115234375, "logits/rejected": -1.056494116783142, "logps/chosen": -376.4125061035156, "logps/rejected": -322.7250061035156, "loss": 0.4726, "rewards/accuracies": 0.75, "rewards/chosen": -0.3187255859375, "rewards/margins": 1.407934546470642, "rewards/rejected": -1.726660132408142, "step": 1710 }, { "epoch": 0.6476513226019015, "grad_norm": 69.42507705707126, "learning_rate": 8.38135593220339e-07, "logits/chosen": -1.281982421875, "logits/rejected": -1.226159691810608, "logps/chosen": -327.125, "logps/rejected": -321.54998779296875, "loss": 0.5574, "rewards/accuracies": 0.71875, "rewards/chosen": 0.16914062201976776, "rewards/margins": 0.946362316608429, "rewards/rejected": -0.777099609375, "step": 1720 }, { "epoch": 0.6514167372681917, "grad_norm": 54.95652396881109, "learning_rate": 8.371939736346516e-07, "logits/chosen": -1.2732422351837158, "logits/rejected": -1.12939453125, "logps/chosen": -354.75, "logps/rejected": -320.67498779296875, "loss": 0.5202, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.39808350801467896, "rewards/margins": 1.000982642173767, "rewards/rejected": -0.602752685546875, "step": 1730 }, { "epoch": 0.6551821519344818, "grad_norm": 92.33106074305749, "learning_rate": 8.362523540489642e-07, "logits/chosen": -1.3794434070587158, "logits/rejected": -1.2755858898162842, "logps/chosen": -324.7749938964844, "logps/rejected": -316.29998779296875, "loss": 0.5323, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.193328857421875, "rewards/margins": 0.967541515827179, "rewards/rejected": -0.775311291217804, "step": 1740 }, { "epoch": 0.6589475666007719, "grad_norm": 82.12757338534196, "learning_rate": 8.353107344632768e-07, "logits/chosen": -1.189428687095642, "logits/rejected": -1.148584008216858, "logps/chosen": -379.92498779296875, "logps/rejected": -352.82501220703125, "loss": 0.5584, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.23882445693016052, "rewards/margins": 1.0003540515899658, "rewards/rejected": -0.7607513666152954, "step": 1750 }, { "epoch": 0.662712981267062, "grad_norm": 49.72280262532852, "learning_rate": 8.343691148775894e-07, "logits/chosen": -1.2202637195587158, "logits/rejected": -1.1426270008087158, "logps/chosen": -365.23748779296875, "logps/rejected": -317.2250061035156, "loss": 0.525, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.29163819551467896, "rewards/margins": 0.908496081829071, "rewards/rejected": -0.6171935796737671, "step": 1760 }, { "epoch": 0.6664783959333521, "grad_norm": 68.67597699097001, "learning_rate": 8.33427495291902e-07, "logits/chosen": -1.1787841320037842, "logits/rejected": -1.194482445716858, "logps/chosen": -370.04998779296875, "logps/rejected": -339.92498779296875, "loss": 0.6386, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03621826320886612, "rewards/margins": 0.595751941204071, "rewards/rejected": -0.559643566608429, "step": 1770 }, { "epoch": 0.6702438105996423, "grad_norm": 56.04162596762439, "learning_rate": 8.324858757062147e-07, "logits/chosen": -1.205322265625, "logits/rejected": -1.0492446422576904, "logps/chosen": -359.6499938964844, "logps/rejected": -330.3500061035156, "loss": 0.5728, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.15562744438648224, "rewards/margins": 1.040557861328125, "rewards/rejected": -0.8843017816543579, "step": 1780 }, { "epoch": 0.6740092252659324, "grad_norm": 68.76303947510783, "learning_rate": 8.315442561205272e-07, "logits/chosen": -1.103662133216858, "logits/rejected": -1.0371825695037842, "logps/chosen": -316.23748779296875, "logps/rejected": -331.7749938964844, "loss": 0.5675, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.14304199814796448, "rewards/margins": 0.7821807861328125, "rewards/rejected": -0.6397705078125, "step": 1790 }, { "epoch": 0.6777746399322225, "grad_norm": 37.475160497521415, "learning_rate": 8.306026365348399e-07, "logits/chosen": -1.18701171875, "logits/rejected": -0.984423816204071, "logps/chosen": -306.76251220703125, "logps/rejected": -293.38751220703125, "loss": 0.4556, "rewards/accuracies": 0.75, "rewards/chosen": 0.21506348252296448, "rewards/margins": 1.137841820716858, "rewards/rejected": -0.922100841999054, "step": 1800 }, { "epoch": 0.6815400545985126, "grad_norm": 67.16081243439695, "learning_rate": 8.296610169491525e-07, "logits/chosen": -1.148535132408142, "logits/rejected": -1.0633056163787842, "logps/chosen": -345.70001220703125, "logps/rejected": -314.70001220703125, "loss": 0.5425, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24470214545726776, "rewards/margins": 0.858959972858429, "rewards/rejected": -0.614453136920929, "step": 1810 }, { "epoch": 0.6853054692648027, "grad_norm": 46.412065682887274, "learning_rate": 8.287193973634651e-07, "logits/chosen": -1.1133301258087158, "logits/rejected": -0.8109649419784546, "logps/chosen": -342.86248779296875, "logps/rejected": -358.125, "loss": 0.4781, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.07938232272863388, "rewards/margins": 1.172119140625, "rewards/rejected": -1.0923950672149658, "step": 1820 }, { "epoch": 0.689070883931093, "grad_norm": 70.83086461436096, "learning_rate": 8.277777777777777e-07, "logits/chosen": -1.16015625, "logits/rejected": -0.927990734577179, "logps/chosen": -311.3999938964844, "logps/rejected": -310.8374938964844, "loss": 0.5087, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22904053330421448, "rewards/margins": 1.0361328125, "rewards/rejected": -0.80767822265625, "step": 1830 }, { "epoch": 0.692836298597383, "grad_norm": 63.532200613108586, "learning_rate": 8.268361581920904e-07, "logits/chosen": -1.1808593273162842, "logits/rejected": -1.023168921470642, "logps/chosen": -308.75, "logps/rejected": -315.38751220703125, "loss": 0.5354, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.09272994846105576, "rewards/margins": 0.932873547077179, "rewards/rejected": -0.839892566204071, "step": 1840 }, { "epoch": 0.6966017132636732, "grad_norm": 36.32201273487466, "learning_rate": 8.25894538606403e-07, "logits/chosen": -1.195214867591858, "logits/rejected": -1.083154320716858, "logps/chosen": -344.61248779296875, "logps/rejected": -297.1875, "loss": 0.4821, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.11377563327550888, "rewards/margins": 1.1287841796875, "rewards/rejected": -1.014032006263733, "step": 1850 }, { "epoch": 0.7003671279299633, "grad_norm": 47.75028071396728, "learning_rate": 8.249529190207156e-07, "logits/chosen": -1.163427710533142, "logits/rejected": -1.03369140625, "logps/chosen": -351.1625061035156, "logps/rejected": -323.29998779296875, "loss": 0.4713, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0870208740234375, "rewards/margins": 1.217431664466858, "rewards/rejected": -1.305017113685608, "step": 1860 }, { "epoch": 0.7041325425962535, "grad_norm": 75.40785932929641, "learning_rate": 8.240112994350283e-07, "logits/chosen": -1.1240723133087158, "logits/rejected": -0.982739269733429, "logps/chosen": -331.7749938964844, "logps/rejected": -344.88751220703125, "loss": 0.6382, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.38048094511032104, "rewards/margins": 0.9729248285293579, "rewards/rejected": -1.3531372547149658, "step": 1870 }, { "epoch": 0.7078979572625436, "grad_norm": 75.35903050325551, "learning_rate": 8.230696798493408e-07, "logits/chosen": -1.123632788658142, "logits/rejected": -1.005126953125, "logps/chosen": -324.6499938964844, "logps/rejected": -300.75, "loss": 0.5263, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.44011229276657104, "rewards/margins": 1.058190941810608, "rewards/rejected": -1.497583031654358, "step": 1880 }, { "epoch": 0.7116633719288337, "grad_norm": 67.38378911644158, "learning_rate": 8.221280602636535e-07, "logits/chosen": -1.1787598133087158, "logits/rejected": -0.9417785406112671, "logps/chosen": -314.88751220703125, "logps/rejected": -313.01251220703125, "loss": 0.5332, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.26176148653030396, "rewards/margins": 1.113226294517517, "rewards/rejected": -1.374548316001892, "step": 1890 }, { "epoch": 0.7154287865951238, "grad_norm": 75.1220968123658, "learning_rate": 8.21186440677966e-07, "logits/chosen": -1.1277587413787842, "logits/rejected": -0.9554198980331421, "logps/chosen": -357.04998779296875, "logps/rejected": -314.67498779296875, "loss": 0.545, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24080200493335724, "rewards/margins": 0.910754382610321, "rewards/rejected": -1.151123046875, "step": 1900 }, { "epoch": 0.7191942012614139, "grad_norm": 51.41193616956627, "learning_rate": 8.202448210922787e-07, "logits/chosen": -1.124609351158142, "logits/rejected": -0.94189453125, "logps/chosen": -328.4750061035156, "logps/rejected": -327.9125061035156, "loss": 0.5862, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00836181640625, "rewards/margins": 0.7149292230606079, "rewards/rejected": -0.707165539264679, "step": 1910 }, { "epoch": 0.7229596159277041, "grad_norm": 47.35838855238533, "learning_rate": 8.193032015065912e-07, "logits/chosen": -1.0875976085662842, "logits/rejected": -1.0997436046600342, "logps/chosen": -338.54998779296875, "logps/rejected": -294.8999938964844, "loss": 0.5005, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21718139946460724, "rewards/margins": 0.922198474407196, "rewards/rejected": -0.7050415277481079, "step": 1920 }, { "epoch": 0.7267250305939942, "grad_norm": 55.84001726550569, "learning_rate": 8.183615819209039e-07, "logits/chosen": -1.2140624523162842, "logits/rejected": -0.9531615972518921, "logps/chosen": -320.875, "logps/rejected": -341.1000061035156, "loss": 0.5392, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08634033054113388, "rewards/margins": 1.1215331554412842, "rewards/rejected": -1.035192847251892, "step": 1930 }, { "epoch": 0.7304904452602843, "grad_norm": 49.14061031363928, "learning_rate": 8.174199623352165e-07, "logits/chosen": -1.158300757408142, "logits/rejected": -0.9710754156112671, "logps/chosen": -332.51873779296875, "logps/rejected": -349.9375, "loss": 0.5148, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.06694336235523224, "rewards/margins": 0.948803722858429, "rewards/rejected": -1.015655517578125, "step": 1940 }, { "epoch": 0.7342558599265744, "grad_norm": 104.35445301936622, "learning_rate": 8.164783427495292e-07, "logits/chosen": -1.164453148841858, "logits/rejected": -1.0612061023712158, "logps/chosen": -372.1625061035156, "logps/rejected": -341.6499938964844, "loss": 0.5925, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.06764526665210724, "rewards/margins": 1.0041992664337158, "rewards/rejected": -1.0726470947265625, "step": 1950 }, { "epoch": 0.7380212745928645, "grad_norm": 47.575532390690405, "learning_rate": 8.155367231638418e-07, "logits/chosen": -1.2060546875, "logits/rejected": -1.0026428699493408, "logps/chosen": -289.01251220703125, "logps/rejected": -307.45001220703125, "loss": 0.5038, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.18352051079273224, "rewards/margins": 1.0661132335662842, "rewards/rejected": -0.8821808099746704, "step": 1960 }, { "epoch": 0.7417866892591547, "grad_norm": 41.987996233567436, "learning_rate": 8.145951035781544e-07, "logits/chosen": -1.2834961414337158, "logits/rejected": -1.0763671398162842, "logps/chosen": -295.83123779296875, "logps/rejected": -292.625, "loss": 0.5254, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.03079834021627903, "rewards/margins": 0.9804321527481079, "rewards/rejected": -0.948840320110321, "step": 1970 }, { "epoch": 0.7455521039254448, "grad_norm": 58.06748177008894, "learning_rate": 8.13653483992467e-07, "logits/chosen": -1.236822485923767, "logits/rejected": -1.083276391029358, "logps/chosen": -295.2250061035156, "logps/rejected": -306.42498779296875, "loss": 0.5214, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.162841796875, "rewards/margins": 0.9004882574081421, "rewards/rejected": -0.737866222858429, "step": 1980 }, { "epoch": 0.7493175185917349, "grad_norm": 68.25947181954159, "learning_rate": 8.127118644067796e-07, "logits/chosen": -1.302343726158142, "logits/rejected": -1.182373046875, "logps/chosen": -333.92498779296875, "logps/rejected": -307.45001220703125, "loss": 0.5181, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.08859863132238388, "rewards/margins": 1.03509521484375, "rewards/rejected": -1.124169945716858, "step": 1990 }, { "epoch": 0.753082933258025, "grad_norm": 59.050914091924284, "learning_rate": 8.117702448210922e-07, "logits/chosen": -1.354101538658142, "logits/rejected": -1.1455810070037842, "logps/chosen": -324.9624938964844, "logps/rejected": -326.57501220703125, "loss": 0.5411, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.008007812313735485, "rewards/margins": 1.070288062095642, "rewards/rejected": -1.07806396484375, "step": 2000 }, { "epoch": 0.7568483479243152, "grad_norm": 55.269834999412005, "learning_rate": 8.108286252354049e-07, "logits/chosen": -1.381445288658142, "logits/rejected": -1.2558104991912842, "logps/chosen": -319.98748779296875, "logps/rejected": -310.6000061035156, "loss": 0.4841, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.12664183974266052, "rewards/margins": 0.933154284954071, "rewards/rejected": -1.059423804283142, "step": 2010 }, { "epoch": 0.7606137625906053, "grad_norm": 72.82030643317208, "learning_rate": 8.098870056497174e-07, "logits/chosen": -1.1968262195587158, "logits/rejected": -1.0991699695587158, "logps/chosen": -388.6875, "logps/rejected": -329.125, "loss": 0.4749, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.298187255859375, "rewards/margins": 1.062036156654358, "rewards/rejected": -1.3594238758087158, "step": 2020 }, { "epoch": 0.7643791772568954, "grad_norm": 46.94766295186256, "learning_rate": 8.089453860640301e-07, "logits/chosen": -1.310644507408142, "logits/rejected": -1.250585913658142, "logps/chosen": -347.5249938964844, "logps/rejected": -315.4375, "loss": 0.4431, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2414703369140625, "rewards/margins": 1.1535857915878296, "rewards/rejected": -1.3937256336212158, "step": 2030 }, { "epoch": 0.7681445919231855, "grad_norm": 46.24934303447658, "learning_rate": 8.080037664783426e-07, "logits/chosen": -1.2819335460662842, "logits/rejected": -1.1426270008087158, "logps/chosen": -359.5375061035156, "logps/rejected": -325.54998779296875, "loss": 0.6614, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5689147710800171, "rewards/margins": 0.9695373773574829, "rewards/rejected": -1.53857421875, "step": 2040 }, { "epoch": 0.7719100065894756, "grad_norm": 44.93247266958999, "learning_rate": 8.070621468926553e-07, "logits/chosen": -1.242773413658142, "logits/rejected": -1.194433569908142, "logps/chosen": -395.5249938964844, "logps/rejected": -332.5375061035156, "loss": 0.5296, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6788574457168579, "rewards/margins": 1.0670654773712158, "rewards/rejected": -1.746240258216858, "step": 2050 }, { "epoch": 0.7756754212557658, "grad_norm": 76.37478010886988, "learning_rate": 8.06120527306968e-07, "logits/chosen": -1.336084008216858, "logits/rejected": -1.138330101966858, "logps/chosen": -340.0, "logps/rejected": -316.0874938964844, "loss": 0.4973, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.763598620891571, "rewards/margins": 1.2666137218475342, "rewards/rejected": -2.030468702316284, "step": 2060 }, { "epoch": 0.7794408359220559, "grad_norm": 44.2500689064603, "learning_rate": 8.051789077212806e-07, "logits/chosen": -1.1508300304412842, "logits/rejected": -1.208715796470642, "logps/chosen": -363.61248779296875, "logps/rejected": -335.7124938964844, "loss": 0.4956, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7221924066543579, "rewards/margins": 1.1898193359375, "rewards/rejected": -1.9125244617462158, "step": 2070 }, { "epoch": 0.783206250588346, "grad_norm": 54.89533478316895, "learning_rate": 8.042372881355933e-07, "logits/chosen": -1.2829101085662842, "logits/rejected": -1.221533179283142, "logps/chosen": -356.4750061035156, "logps/rejected": -329.875, "loss": 0.5154, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8681701421737671, "rewards/margins": 1.064123511314392, "rewards/rejected": -1.9319336414337158, "step": 2080 }, { "epoch": 0.7869716652546361, "grad_norm": 47.100074612292275, "learning_rate": 8.032956685499058e-07, "logits/chosen": -1.1594727039337158, "logits/rejected": -1.09716796875, "logps/chosen": -327.79376220703125, "logps/rejected": -296.7437438964844, "loss": 0.4974, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4554504454135895, "rewards/margins": 1.1765625476837158, "rewards/rejected": -1.630883812904358, "step": 2090 }, { "epoch": 0.7907370799209263, "grad_norm": 66.89329987850586, "learning_rate": 8.023540489642185e-07, "logits/chosen": -1.310449242591858, "logits/rejected": -1.034692406654358, "logps/chosen": -353.26251220703125, "logps/rejected": -334.3125, "loss": 0.5298, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.37642210721969604, "rewards/margins": 1.0270507335662842, "rewards/rejected": -1.4032714366912842, "step": 2100 }, { "epoch": 0.7945024945872164, "grad_norm": 50.983987998656026, "learning_rate": 8.01412429378531e-07, "logits/chosen": -1.0324218273162842, "logits/rejected": -1.034326195716858, "logps/chosen": -379.6499938964844, "logps/rejected": -345.17498779296875, "loss": 0.5542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.42543333768844604, "rewards/margins": 1.0201416015625, "rewards/rejected": -1.4468810558319092, "step": 2110 }, { "epoch": 0.7982679092535065, "grad_norm": 51.223206538007155, "learning_rate": 8.004708097928437e-07, "logits/chosen": -1.205224633216858, "logits/rejected": -1.0399169921875, "logps/chosen": -357.48748779296875, "logps/rejected": -342.3500061035156, "loss": 0.5316, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.443939208984375, "rewards/margins": 1.081945776939392, "rewards/rejected": -1.5243408679962158, "step": 2120 }, { "epoch": 0.8020333239197966, "grad_norm": 48.0545342273388, "learning_rate": 7.995291902071562e-07, "logits/chosen": -1.185449242591858, "logits/rejected": -1.121972680091858, "logps/chosen": -347.08123779296875, "logps/rejected": -328.04998779296875, "loss": 0.5258, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4208740293979645, "rewards/margins": 1.100341796875, "rewards/rejected": -1.5206298828125, "step": 2130 }, { "epoch": 0.8057987385860867, "grad_norm": 53.795233046735945, "learning_rate": 7.985875706214689e-07, "logits/chosen": -1.3123047351837158, "logits/rejected": -1.123266577720642, "logps/chosen": -300.8125, "logps/rejected": -288.875, "loss": 0.5188, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.49323731660842896, "rewards/margins": 1.032006859779358, "rewards/rejected": -1.5255126953125, "step": 2140 }, { "epoch": 0.809564153252377, "grad_norm": 54.573028513993, "learning_rate": 7.976459510357815e-07, "logits/chosen": -1.2018311023712158, "logits/rejected": -1.0329711437225342, "logps/chosen": -373.7250061035156, "logps/rejected": -387.13751220703125, "loss": 0.532, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.624523937702179, "rewards/margins": 1.1019408702850342, "rewards/rejected": -1.72705078125, "step": 2150 }, { "epoch": 0.8133295679186671, "grad_norm": 65.79836207310922, "learning_rate": 7.967043314500941e-07, "logits/chosen": -1.2322509288787842, "logits/rejected": -1.0648925304412842, "logps/chosen": -365.2749938964844, "logps/rejected": -340.42498779296875, "loss": 0.5174, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.24797973036766052, "rewards/margins": 1.1090576648712158, "rewards/rejected": -1.357366919517517, "step": 2160 }, { "epoch": 0.8170949825849572, "grad_norm": 39.23270047982163, "learning_rate": 7.957627118644067e-07, "logits/chosen": -1.138696312904358, "logits/rejected": -0.9664551019668579, "logps/chosen": -320.25, "logps/rejected": -293.17498779296875, "loss": 0.5545, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6270385980606079, "rewards/margins": 1.1199462413787842, "rewards/rejected": -1.746240258216858, "step": 2170 }, { "epoch": 0.8208603972512473, "grad_norm": 79.83307307663144, "learning_rate": 7.948210922787194e-07, "logits/chosen": -1.159277319908142, "logits/rejected": -1.065527319908142, "logps/chosen": -370.04998779296875, "logps/rejected": -324.5, "loss": 0.5541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.508209228515625, "rewards/margins": 1.1807861328125, "rewards/rejected": -1.6887938976287842, "step": 2180 }, { "epoch": 0.8246258119175374, "grad_norm": 68.91697003143956, "learning_rate": 7.93879472693032e-07, "logits/chosen": -1.140380859375, "logits/rejected": -1.2205078601837158, "logps/chosen": -360.57501220703125, "logps/rejected": -300.29998779296875, "loss": 0.5067, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.43835753202438354, "rewards/margins": 1.118310570716858, "rewards/rejected": -1.5568115711212158, "step": 2190 }, { "epoch": 0.8283912265838276, "grad_norm": 57.24126815662822, "learning_rate": 7.929378531073446e-07, "logits/chosen": -1.306249976158142, "logits/rejected": -1.2109863758087158, "logps/chosen": -339.0249938964844, "logps/rejected": -314.4750061035156, "loss": 0.5511, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.429421991109848, "rewards/margins": 1.1176025867462158, "rewards/rejected": -1.5482909679412842, "step": 2200 }, { "epoch": 0.8321566412501177, "grad_norm": 73.9459293333842, "learning_rate": 7.919962335216572e-07, "logits/chosen": -1.2291991710662842, "logits/rejected": -1.0842773914337158, "logps/chosen": -324.4125061035156, "logps/rejected": -294.76251220703125, "loss": 0.4922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.40397948026657104, "rewards/margins": 0.982495129108429, "rewards/rejected": -1.385351538658142, "step": 2210 }, { "epoch": 0.8359220559164078, "grad_norm": 45.73119434490944, "learning_rate": 7.910546139359699e-07, "logits/chosen": -1.355712890625, "logits/rejected": -1.198706030845642, "logps/chosen": -335.1000061035156, "logps/rejected": -294.2250061035156, "loss": 0.4383, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.32658690214157104, "rewards/margins": 1.362829566001892, "rewards/rejected": -1.6903197765350342, "step": 2220 }, { "epoch": 0.8396874705826979, "grad_norm": 63.17817313155051, "learning_rate": 7.901129943502824e-07, "logits/chosen": -1.2641112804412842, "logits/rejected": -1.0893065929412842, "logps/chosen": -328.79998779296875, "logps/rejected": -320.57501220703125, "loss": 0.5068, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5682403445243835, "rewards/margins": 1.1194579601287842, "rewards/rejected": -1.6880371570587158, "step": 2230 }, { "epoch": 0.8434528852489881, "grad_norm": 47.27760589371626, "learning_rate": 7.891713747645951e-07, "logits/chosen": -1.208837866783142, "logits/rejected": -0.982128918170929, "logps/chosen": -334.4750061035156, "logps/rejected": -335.2875061035156, "loss": 0.5367, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6488891839981079, "rewards/margins": 1.126184105873108, "rewards/rejected": -1.775292992591858, "step": 2240 }, { "epoch": 0.8472182999152782, "grad_norm": 41.69402484234185, "learning_rate": 7.882297551789076e-07, "logits/chosen": -1.1330077648162842, "logits/rejected": -1.0285766124725342, "logps/chosen": -301.125, "logps/rejected": -294.5375061035156, "loss": 0.5509, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.50408935546875, "rewards/margins": 1.047949194908142, "rewards/rejected": -1.5524413585662842, "step": 2250 }, { "epoch": 0.8509837145815683, "grad_norm": 43.0726247855285, "learning_rate": 7.872881355932203e-07, "logits/chosen": -1.0606505870819092, "logits/rejected": -0.8490844964981079, "logps/chosen": -375.0, "logps/rejected": -363.29998779296875, "loss": 0.5294, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15156860649585724, "rewards/margins": 1.033105492591858, "rewards/rejected": -1.18359375, "step": 2260 }, { "epoch": 0.8547491292478584, "grad_norm": 56.17479306450865, "learning_rate": 7.863465160075328e-07, "logits/chosen": -1.2126281261444092, "logits/rejected": -0.934277355670929, "logps/chosen": -291.5249938964844, "logps/rejected": -295.32501220703125, "loss": 0.4546, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.07402344048023224, "rewards/margins": 1.234533667564392, "rewards/rejected": -1.1591308116912842, "step": 2270 }, { "epoch": 0.8585145439141485, "grad_norm": 62.75039787766592, "learning_rate": 7.854048964218455e-07, "logits/chosen": -1.1236572265625, "logits/rejected": -1.0951659679412842, "logps/chosen": -331.75, "logps/rejected": -300.5, "loss": 0.5594, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.03276977688074112, "rewards/margins": 1.036901831626892, "rewards/rejected": -1.004418969154358, "step": 2280 }, { "epoch": 0.8622799585804387, "grad_norm": 50.3903994420224, "learning_rate": 7.844632768361582e-07, "logits/chosen": -1.0828125476837158, "logits/rejected": -0.9519012570381165, "logps/chosen": -330.9750061035156, "logps/rejected": -337.70001220703125, "loss": 0.5244, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.04697570949792862, "rewards/margins": 1.165771484375, "rewards/rejected": -1.1190674304962158, "step": 2290 }, { "epoch": 0.8660453732467288, "grad_norm": 57.736988147754545, "learning_rate": 7.835216572504708e-07, "logits/chosen": -1.114404320716858, "logits/rejected": -1.026910424232483, "logps/chosen": -325.6625061035156, "logps/rejected": -327.36248779296875, "loss": 0.5301, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.04006347805261612, "rewards/margins": 1.043554663658142, "rewards/rejected": -1.003570556640625, "step": 2300 }, { "epoch": 0.8698107879130189, "grad_norm": 33.52099537468075, "learning_rate": 7.825800376647835e-07, "logits/chosen": -1.004858374595642, "logits/rejected": -0.906420886516571, "logps/chosen": -324.5625, "logps/rejected": -315.92498779296875, "loss": 0.5049, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.27778321504592896, "rewards/margins": 1.0272216796875, "rewards/rejected": -0.748687744140625, "step": 2310 }, { "epoch": 0.873576202579309, "grad_norm": 51.02633502176133, "learning_rate": 7.81638418079096e-07, "logits/chosen": -0.937670886516571, "logits/rejected": -0.9818359613418579, "logps/chosen": -421.51251220703125, "logps/rejected": -344.45001220703125, "loss": 0.506, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03509521484375, "rewards/margins": 1.1328856945037842, "rewards/rejected": -1.098486304283142, "step": 2320 }, { "epoch": 0.8773416172455992, "grad_norm": 71.70827000291709, "learning_rate": 7.806967984934087e-07, "logits/chosen": -1.039038062095642, "logits/rejected": -0.902539074420929, "logps/chosen": -376.82501220703125, "logps/rejected": -355.07501220703125, "loss": 0.5689, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13546141982078552, "rewards/margins": 1.052252173423767, "rewards/rejected": -1.188452124595642, "step": 2330 }, { "epoch": 0.8811070319118893, "grad_norm": 63.98209605591222, "learning_rate": 7.797551789077212e-07, "logits/chosen": -1.0664551258087158, "logits/rejected": -0.887280285358429, "logps/chosen": -298.75, "logps/rejected": -289.625, "loss": 0.5171, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01492919959127903, "rewards/margins": 0.931835949420929, "rewards/rejected": -0.946868896484375, "step": 2340 }, { "epoch": 0.8848724465781794, "grad_norm": 45.00412503567729, "learning_rate": 7.788135593220339e-07, "logits/chosen": -1.032934546470642, "logits/rejected": -0.8917480707168579, "logps/chosen": -319.36248779296875, "logps/rejected": -317.20001220703125, "loss": 0.476, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1675872802734375, "rewards/margins": 1.251257300376892, "rewards/rejected": -1.0844542980194092, "step": 2350 }, { "epoch": 0.8886378612444695, "grad_norm": 31.637360977800828, "learning_rate": 7.778719397363464e-07, "logits/chosen": -1.1088440418243408, "logits/rejected": -1.0145690441131592, "logps/chosen": -325.76251220703125, "logps/rejected": -320.875, "loss": 0.5244, "rewards/accuracies": 0.75, "rewards/chosen": 0.071746826171875, "rewards/margins": 1.254309058189392, "rewards/rejected": -1.1822388172149658, "step": 2360 }, { "epoch": 0.8924032759107596, "grad_norm": 56.79943852881527, "learning_rate": 7.769303201506591e-07, "logits/chosen": -1.094842553138733, "logits/rejected": -0.8989013433456421, "logps/chosen": -358.6499938964844, "logps/rejected": -341.6000061035156, "loss": 0.5502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.21119384467601776, "rewards/margins": 1.1129639148712158, "rewards/rejected": -0.9025634527206421, "step": 2370 }, { "epoch": 0.8961686905770498, "grad_norm": 65.09302581641369, "learning_rate": 7.759887005649717e-07, "logits/chosen": -1.221643090248108, "logits/rejected": -1.1564133167266846, "logps/chosen": -300.7124938964844, "logps/rejected": -251.66250610351562, "loss": 0.5246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.11306152492761612, "rewards/margins": 1.1724121570587158, "rewards/rejected": -1.0591309070587158, "step": 2380 }, { "epoch": 0.8999341052433399, "grad_norm": 71.40943575370588, "learning_rate": 7.750470809792843e-07, "logits/chosen": -1.1144530773162842, "logits/rejected": -1.029229760169983, "logps/chosen": -338.04998779296875, "logps/rejected": -291.3125, "loss": 0.6304, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0455169677734375, "rewards/margins": 1.004186987876892, "rewards/rejected": -1.05035400390625, "step": 2390 }, { "epoch": 0.90369951990963, "grad_norm": 55.95844110663838, "learning_rate": 7.74105461393597e-07, "logits/chosen": -1.180810570716858, "logits/rejected": -1.055761694908142, "logps/chosen": -297.29998779296875, "logps/rejected": -286.76251220703125, "loss": 0.5783, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.13447265326976776, "rewards/margins": 0.8954833745956421, "rewards/rejected": -1.0300445556640625, "step": 2400 }, { "epoch": 0.9074649345759201, "grad_norm": 52.44015981142328, "learning_rate": 7.731638418079096e-07, "logits/chosen": -1.1845214366912842, "logits/rejected": -1.0676147937774658, "logps/chosen": -333.63751220703125, "logps/rejected": -311.1499938964844, "loss": 0.5149, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.10113525390625, "rewards/margins": 0.9699341058731079, "rewards/rejected": -1.0721924304962158, "step": 2410 }, { "epoch": 0.9112303492422102, "grad_norm": 48.973757400739636, "learning_rate": 7.722222222222222e-07, "logits/chosen": -1.190820336341858, "logits/rejected": -1.0363891124725342, "logps/chosen": -358.36248779296875, "logps/rejected": -344.42498779296875, "loss": 0.4587, "rewards/accuracies": 0.78125, "rewards/chosen": 0.18305663764476776, "rewards/margins": 1.156518578529358, "rewards/rejected": -0.973620593547821, "step": 2420 }, { "epoch": 0.9149957639085005, "grad_norm": 74.5211698235463, "learning_rate": 7.712806026365349e-07, "logits/chosen": -1.17822265625, "logits/rejected": -1.10589599609375, "logps/chosen": -349.79998779296875, "logps/rejected": -335.875, "loss": 0.6066, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.254446417093277, "rewards/margins": 0.9467986822128296, "rewards/rejected": -1.201818823814392, "step": 2430 }, { "epoch": 0.9187611785747906, "grad_norm": 50.124853772599096, "learning_rate": 7.703389830508474e-07, "logits/chosen": -1.269189476966858, "logits/rejected": -1.183801293373108, "logps/chosen": -334.375, "logps/rejected": -309.67498779296875, "loss": 0.5584, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.17111817002296448, "rewards/margins": 0.8713744878768921, "rewards/rejected": -1.0421264171600342, "step": 2440 }, { "epoch": 0.9225265932410807, "grad_norm": 74.30811100749132, "learning_rate": 7.693973634651601e-07, "logits/chosen": -1.2238280773162842, "logits/rejected": -1.19732666015625, "logps/chosen": -353.125, "logps/rejected": -324.3500061035156, "loss": 0.5451, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18871155381202698, "rewards/margins": 0.9139159917831421, "rewards/rejected": -1.103051781654358, "step": 2450 }, { "epoch": 0.9262920079073708, "grad_norm": 63.05419552142683, "learning_rate": 7.684557438794726e-07, "logits/chosen": -1.126220703125, "logits/rejected": -1.060571312904358, "logps/chosen": -346.5, "logps/rejected": -284.92498779296875, "loss": 0.5878, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11181030422449112, "rewards/margins": 0.838391125202179, "rewards/rejected": -0.949749767780304, "step": 2460 }, { "epoch": 0.930057422573661, "grad_norm": 61.32142707079708, "learning_rate": 7.675141242937853e-07, "logits/chosen": -1.3088867664337158, "logits/rejected": -1.0632140636444092, "logps/chosen": -306.875, "logps/rejected": -339.7250061035156, "loss": 0.4693, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.07565917819738388, "rewards/margins": 1.1574218273162842, "rewards/rejected": -1.0827147960662842, "step": 2470 }, { "epoch": 0.9338228372399511, "grad_norm": 63.08490568257269, "learning_rate": 7.665725047080978e-07, "logits/chosen": -1.1621825695037842, "logits/rejected": -1.100976586341858, "logps/chosen": -345.01251220703125, "logps/rejected": -309.32501220703125, "loss": 0.4913, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.04829101637005806, "rewards/margins": 1.093774437904358, "rewards/rejected": -1.14129638671875, "step": 2480 }, { "epoch": 0.9375882519062412, "grad_norm": 42.624555402291534, "learning_rate": 7.656308851224105e-07, "logits/chosen": -1.26953125, "logits/rejected": -1.02783203125, "logps/chosen": -358.13751220703125, "logps/rejected": -312.3125, "loss": 0.5029, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.16295775771141052, "rewards/margins": 1.1054198741912842, "rewards/rejected": -1.268457055091858, "step": 2490 }, { "epoch": 0.9413536665725313, "grad_norm": 31.68513196969711, "learning_rate": 7.64689265536723e-07, "logits/chosen": -1.206323266029358, "logits/rejected": -1.16632080078125, "logps/chosen": -376.45001220703125, "logps/rejected": -314.7250061035156, "loss": 0.465, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3503662049770355, "rewards/margins": 1.2098877429962158, "rewards/rejected": -1.56103515625, "step": 2500 }, { "epoch": 0.9451190812388214, "grad_norm": 95.68787058352456, "learning_rate": 7.637476459510358e-07, "logits/chosen": -1.182519555091858, "logits/rejected": -1.086828589439392, "logps/chosen": -345.5249938964844, "logps/rejected": -328.79998779296875, "loss": 0.5263, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5566040277481079, "rewards/margins": 1.110498070716858, "rewards/rejected": -1.6681396961212158, "step": 2510 }, { "epoch": 0.9488844959051116, "grad_norm": 48.89644129977044, "learning_rate": 7.628060263653484e-07, "logits/chosen": -1.1362793445587158, "logits/rejected": -1.0662109851837158, "logps/chosen": -356.5249938964844, "logps/rejected": -312.67498779296875, "loss": 0.4552, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.36240845918655396, "rewards/margins": 1.1914551258087158, "rewards/rejected": -1.553369164466858, "step": 2520 }, { "epoch": 0.9526499105714017, "grad_norm": 46.8312493385338, "learning_rate": 7.61864406779661e-07, "logits/chosen": -1.2631103992462158, "logits/rejected": -1.1154769659042358, "logps/chosen": -337.6625061035156, "logps/rejected": -291.29998779296875, "loss": 0.5158, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4964538514614105, "rewards/margins": 1.174291968345642, "rewards/rejected": -1.670373558998108, "step": 2530 }, { "epoch": 0.9564153252376918, "grad_norm": 81.3348591049344, "learning_rate": 7.609227871939736e-07, "logits/chosen": -1.3320801258087158, "logits/rejected": -1.2479736804962158, "logps/chosen": -325.625, "logps/rejected": -334.3125, "loss": 0.6004, "rewards/accuracies": 0.625, "rewards/chosen": -0.418923944234848, "rewards/margins": 0.8903473019599915, "rewards/rejected": -1.309545874595642, "step": 2540 }, { "epoch": 0.9601807399039819, "grad_norm": 46.16642066107827, "learning_rate": 7.599811676082862e-07, "logits/chosen": -1.3301270008087158, "logits/rejected": -1.251611351966858, "logps/chosen": -361.7250061035156, "logps/rejected": -337.8999938964844, "loss": 0.577, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5355590581893921, "rewards/margins": 1.0679199695587158, "rewards/rejected": -1.603247046470642, "step": 2550 }, { "epoch": 0.963946154570272, "grad_norm": 71.30358301364899, "learning_rate": 7.590395480225989e-07, "logits/chosen": -1.3466796875, "logits/rejected": -1.2634766101837158, "logps/chosen": -316.1499938964844, "logps/rejected": -299.2250061035156, "loss": 0.5636, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8147827386856079, "rewards/margins": 0.842639148235321, "rewards/rejected": -1.656835913658142, "step": 2560 }, { "epoch": 0.9677115692365622, "grad_norm": 44.911579309704116, "learning_rate": 7.580979284369114e-07, "logits/chosen": -1.2655761241912842, "logits/rejected": -1.129907250404358, "logps/chosen": -340.23748779296875, "logps/rejected": -335.54998779296875, "loss": 0.5508, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.613116443157196, "rewards/margins": 1.051904320716858, "rewards/rejected": -1.6652343273162842, "step": 2570 }, { "epoch": 0.9714769839028523, "grad_norm": 83.79706918858975, "learning_rate": 7.571563088512241e-07, "logits/chosen": -1.337060570716858, "logits/rejected": -1.1611328125, "logps/chosen": -314.2875061035156, "logps/rejected": -306.8999938964844, "loss": 0.5058, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19997863471508026, "rewards/margins": 1.0658447742462158, "rewards/rejected": -1.2671630382537842, "step": 2580 }, { "epoch": 0.9752423985691424, "grad_norm": 67.78113959461925, "learning_rate": 7.562146892655367e-07, "logits/chosen": -1.2433593273162842, "logits/rejected": -1.151513695716858, "logps/chosen": -325.23126220703125, "logps/rejected": -300.51251220703125, "loss": 0.4826, "rewards/accuracies": 0.75, "rewards/chosen": -0.01760253868997097, "rewards/margins": 1.1220703125, "rewards/rejected": -1.139184594154358, "step": 2590 }, { "epoch": 0.9790078132354325, "grad_norm": 61.13784049577483, "learning_rate": 7.552730696798493e-07, "logits/chosen": -1.294921875, "logits/rejected": -1.178955078125, "logps/chosen": -363.125, "logps/rejected": -326.92498779296875, "loss": 0.5233, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.06473693996667862, "rewards/margins": 1.0222899913787842, "rewards/rejected": -0.957531750202179, "step": 2600 }, { "epoch": 0.9827732279017227, "grad_norm": 30.29615978098796, "learning_rate": 7.543314500941619e-07, "logits/chosen": -1.296972632408142, "logits/rejected": -1.1790039539337158, "logps/chosen": -292.29998779296875, "logps/rejected": -311.0375061035156, "loss": 0.5074, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.17583617568016052, "rewards/margins": 1.127233862876892, "rewards/rejected": -1.302880883216858, "step": 2610 }, { "epoch": 0.9865386425680128, "grad_norm": 51.458751321606655, "learning_rate": 7.533898305084745e-07, "logits/chosen": -1.352929711341858, "logits/rejected": -1.359716773033142, "logps/chosen": -341.375, "logps/rejected": -323.5874938964844, "loss": 0.5228, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16978760063648224, "rewards/margins": 1.0509521961212158, "rewards/rejected": -1.220117211341858, "step": 2620 }, { "epoch": 0.9903040572343029, "grad_norm": 49.72318769857057, "learning_rate": 7.524482109227872e-07, "logits/chosen": -1.292578101158142, "logits/rejected": -1.255859375, "logps/chosen": -310.82501220703125, "logps/rejected": -285.95001220703125, "loss": 0.5201, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.261270135641098, "rewards/margins": 0.955578625202179, "rewards/rejected": -1.216406226158142, "step": 2630 }, { "epoch": 0.994069471900593, "grad_norm": 47.97592716007591, "learning_rate": 7.515065913370998e-07, "logits/chosen": -1.270117163658142, "logits/rejected": -1.025732398033142, "logps/chosen": -307.63751220703125, "logps/rejected": -338.7749938964844, "loss": 0.5444, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29643553495407104, "rewards/margins": 0.9876953363418579, "rewards/rejected": -1.2844116687774658, "step": 2640 }, { "epoch": 0.9978348865668831, "grad_norm": 79.57442642543985, "learning_rate": 7.505649717514124e-07, "logits/chosen": -1.3327147960662842, "logits/rejected": -1.2426879405975342, "logps/chosen": -310.63751220703125, "logps/rejected": -305.38751220703125, "loss": 0.5668, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.24123534560203552, "rewards/margins": 0.883374035358429, "rewards/rejected": -1.1243407726287842, "step": 2650 }, { "epoch": 1.001882707333145, "grad_norm": 20.543932384495356, "learning_rate": 7.496233521657251e-07, "logits/chosen": -1.1664689779281616, "logits/rejected": -1.1752697229385376, "logps/chosen": -348.8035583496094, "logps/rejected": -317.8690490722656, "loss": 0.3981, "rewards/accuracies": 0.8154761791229248, "rewards/chosen": 0.0391264408826828, "rewards/margins": 1.5112013816833496, "rewards/rejected": -1.4718453884124756, "step": 2660 }, { "epoch": 1.005648121999435, "grad_norm": 33.31966520529445, "learning_rate": 7.486817325800376e-07, "logits/chosen": -1.299414038658142, "logits/rejected": -1.242089867591858, "logps/chosen": -301.875, "logps/rejected": -305.4750061035156, "loss": 0.1793, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.44725340604782104, "rewards/margins": 2.503466844558716, "rewards/rejected": -2.0558104515075684, "step": 2670 }, { "epoch": 1.0094135366657253, "grad_norm": 16.29678430099621, "learning_rate": 7.477401129943503e-07, "logits/chosen": -1.326757788658142, "logits/rejected": -1.2341187000274658, "logps/chosen": -358.0249938964844, "logps/rejected": -340.2749938964844, "loss": 0.163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.20684051513671875, "rewards/margins": 2.6934571266174316, "rewards/rejected": -2.4867186546325684, "step": 2680 }, { "epoch": 1.0131789513320155, "grad_norm": 13.3115652293274, "learning_rate": 7.467984934086628e-07, "logits/chosen": -1.289892554283142, "logits/rejected": -1.078466773033142, "logps/chosen": -365.8999938964844, "logps/rejected": -383.0625, "loss": 0.1483, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07449951022863388, "rewards/margins": 2.959033250808716, "rewards/rejected": -2.884960889816284, "step": 2690 }, { "epoch": 1.0169443659983055, "grad_norm": 32.62893548215821, "learning_rate": 7.458568738229755e-07, "logits/chosen": -1.3898437023162842, "logits/rejected": -1.260644555091858, "logps/chosen": -328.1499938964844, "logps/rejected": -316.13751220703125, "loss": 0.1682, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.33466798067092896, "rewards/margins": 2.9136719703674316, "rewards/rejected": -2.578125, "step": 2700 }, { "epoch": 1.0207097806645957, "grad_norm": 31.708755166480465, "learning_rate": 7.44915254237288e-07, "logits/chosen": -1.360986351966858, "logits/rejected": -1.3599761724472046, "logps/chosen": -345.63751220703125, "logps/rejected": -299.7875061035156, "loss": 0.1704, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.36212158203125, "rewards/margins": 2.819140672683716, "rewards/rejected": -2.455859422683716, "step": 2710 }, { "epoch": 1.0244751953308857, "grad_norm": 12.656562014485546, "learning_rate": 7.439736346516007e-07, "logits/chosen": -1.395410180091858, "logits/rejected": -1.399072289466858, "logps/chosen": -326.9125061035156, "logps/rejected": -305.45001220703125, "loss": 0.1537, "rewards/accuracies": 0.96875, "rewards/chosen": 0.567883312702179, "rewards/margins": 2.837695360183716, "rewards/rejected": -2.27001953125, "step": 2720 }, { "epoch": 1.028240609997176, "grad_norm": 19.956500672621296, "learning_rate": 7.430320150659133e-07, "logits/chosen": -1.4485352039337158, "logits/rejected": -1.32421875, "logps/chosen": -282.67498779296875, "logps/rejected": -307.4125061035156, "loss": 0.145, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4218551516532898, "rewards/margins": 3.0986328125, "rewards/rejected": -2.67919921875, "step": 2730 }, { "epoch": 1.0320060246634661, "grad_norm": 18.65792079336215, "learning_rate": 7.42090395480226e-07, "logits/chosen": -1.45556640625, "logits/rejected": -1.3791015148162842, "logps/chosen": -328.7250061035156, "logps/rejected": -334.29998779296875, "loss": 0.1859, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.3257080018520355, "rewards/margins": 2.904296875, "rewards/rejected": -2.576953172683716, "step": 2740 }, { "epoch": 1.0357714393297561, "grad_norm": 32.445658218031, "learning_rate": 7.411487758945386e-07, "logits/chosen": -1.476953148841858, "logits/rejected": -1.313867211341858, "logps/chosen": -316.0, "logps/rejected": -344.51251220703125, "loss": 0.1551, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8947814702987671, "rewards/margins": 2.90234375, "rewards/rejected": -2.007519483566284, "step": 2750 }, { "epoch": 1.0395368539960463, "grad_norm": 20.802823900235232, "learning_rate": 7.402071563088512e-07, "logits/chosen": -1.5908203125, "logits/rejected": -1.4631836414337158, "logps/chosen": -305.59375, "logps/rejected": -303.6000061035156, "loss": 0.1636, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8214172124862671, "rewards/margins": 2.8179688453674316, "rewards/rejected": -1.996557593345642, "step": 2760 }, { "epoch": 1.0433022686623366, "grad_norm": 21.02906128833692, "learning_rate": 7.392655367231638e-07, "logits/chosen": -1.527563452720642, "logits/rejected": -1.604589819908142, "logps/chosen": -329.51251220703125, "logps/rejected": -340.45001220703125, "loss": 0.2025, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5435791015625, "rewards/margins": 2.70751953125, "rewards/rejected": -2.166015625, "step": 2770 }, { "epoch": 1.0470676833286265, "grad_norm": 15.86558922655178, "learning_rate": 7.383239171374764e-07, "logits/chosen": -1.59375, "logits/rejected": -1.5476562976837158, "logps/chosen": -324.38751220703125, "logps/rejected": -344.6875, "loss": 0.1354, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4000244140625, "rewards/margins": 3.078906297683716, "rewards/rejected": -2.6806640625, "step": 2780 }, { "epoch": 1.0508330979949168, "grad_norm": 13.36151413119395, "learning_rate": 7.37382297551789e-07, "logits/chosen": -1.6237304210662842, "logits/rejected": -1.4515380859375, "logps/chosen": -296.5375061035156, "logps/rejected": -355.7749938964844, "loss": 0.1846, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16383667290210724, "rewards/margins": 3.042773485183716, "rewards/rejected": -2.87841796875, "step": 2790 }, { "epoch": 1.0545985126612067, "grad_norm": 15.584376142644613, "learning_rate": 7.364406779661017e-07, "logits/chosen": -1.7667968273162842, "logits/rejected": -1.6100585460662842, "logps/chosen": -342.6000061035156, "logps/rejected": -334.2875061035156, "loss": 0.1397, "rewards/accuracies": 0.96875, "rewards/chosen": -0.07529296725988388, "rewards/margins": 3.220703125, "rewards/rejected": -3.2943358421325684, "step": 2800 }, { "epoch": 1.058363927327497, "grad_norm": 39.5261851071324, "learning_rate": 7.354990583804142e-07, "logits/chosen": -1.745019555091858, "logits/rejected": -1.572265625, "logps/chosen": -306.0, "logps/rejected": -341.5375061035156, "loss": 0.1879, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2530761659145355, "rewards/margins": 3.0185546875, "rewards/rejected": -3.2724609375, "step": 2810 }, { "epoch": 1.0621293419937872, "grad_norm": 40.63956688924452, "learning_rate": 7.345574387947269e-07, "logits/chosen": -1.65966796875, "logits/rejected": -1.6064453125, "logps/chosen": -378.5249938964844, "logps/rejected": -352.32501220703125, "loss": 0.1809, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.12680359184741974, "rewards/margins": 3.081835985183716, "rewards/rejected": -3.207226514816284, "step": 2820 }, { "epoch": 1.0658947566600772, "grad_norm": 22.389531402870297, "learning_rate": 7.336158192090395e-07, "logits/chosen": -1.690332055091858, "logits/rejected": -1.61279296875, "logps/chosen": -340.5625, "logps/rejected": -354.6499938964844, "loss": 0.1703, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.03168945387005806, "rewards/margins": 2.820019483566284, "rewards/rejected": -2.8529295921325684, "step": 2830 }, { "epoch": 1.0696601713263674, "grad_norm": 16.184663294879055, "learning_rate": 7.326741996233521e-07, "logits/chosen": -1.729101538658142, "logits/rejected": -1.6007812023162842, "logps/chosen": -343.4624938964844, "logps/rejected": -354.6000061035156, "loss": 0.149, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06309814751148224, "rewards/margins": 3.0484375953674316, "rewards/rejected": -2.984570264816284, "step": 2840 }, { "epoch": 1.0734255859926574, "grad_norm": 23.48773797201156, "learning_rate": 7.317325800376648e-07, "logits/chosen": -1.6238281726837158, "logits/rejected": -1.592187523841858, "logps/chosen": -300.0, "logps/rejected": -292.26251220703125, "loss": 0.1476, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.24991455674171448, "rewards/margins": 3.1615233421325684, "rewards/rejected": -2.911816358566284, "step": 2850 }, { "epoch": 1.0771910006589476, "grad_norm": 10.569373005936349, "learning_rate": 7.307909604519774e-07, "logits/chosen": -1.563085913658142, "logits/rejected": -1.59130859375, "logps/chosen": -379.7124938964844, "logps/rejected": -312.2250061035156, "loss": 0.1712, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.09613037109375, "rewards/margins": 2.7847657203674316, "rewards/rejected": -2.689453125, "step": 2860 }, { "epoch": 1.0809564153252378, "grad_norm": 36.797374759858045, "learning_rate": 7.298493408662901e-07, "logits/chosen": -1.671289086341858, "logits/rejected": -1.560937523841858, "logps/chosen": -338.8999938964844, "logps/rejected": -355.1499938964844, "loss": 0.1346, "rewards/accuracies": 0.96875, "rewards/chosen": 0.50933837890625, "rewards/margins": 3.24609375, "rewards/rejected": -2.7372069358825684, "step": 2870 }, { "epoch": 1.0847218299915278, "grad_norm": 19.32620606929358, "learning_rate": 7.289077212806026e-07, "logits/chosen": -1.636083960533142, "logits/rejected": -1.4365966320037842, "logps/chosen": -316.70623779296875, "logps/rejected": -330.8500061035156, "loss": 0.1172, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.097381591796875, "rewards/margins": 3.3072266578674316, "rewards/rejected": -3.207812547683716, "step": 2880 }, { "epoch": 1.088487244657818, "grad_norm": 13.681454929793857, "learning_rate": 7.279661016949153e-07, "logits/chosen": -1.57861328125, "logits/rejected": -1.6240234375, "logps/chosen": -315.48748779296875, "logps/rejected": -331.6625061035156, "loss": 0.1799, "rewards/accuracies": 0.9375, "rewards/chosen": -0.273733526468277, "rewards/margins": 3.1607422828674316, "rewards/rejected": -3.434375047683716, "step": 2890 }, { "epoch": 1.092252659324108, "grad_norm": 20.67017477576611, "learning_rate": 7.270244821092278e-07, "logits/chosen": -1.6687500476837158, "logits/rejected": -1.63330078125, "logps/chosen": -366.54998779296875, "logps/rejected": -355.6875, "loss": 0.1334, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6272338628768921, "rewards/margins": 3.5374999046325684, "rewards/rejected": -4.162499904632568, "step": 2900 }, { "epoch": 1.0960180739903982, "grad_norm": 27.39321451015341, "learning_rate": 7.260828625235405e-07, "logits/chosen": -1.712060570716858, "logits/rejected": -1.692480444908142, "logps/chosen": -357.3999938964844, "logps/rejected": -340.3999938964844, "loss": 0.1467, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9510253667831421, "rewards/margins": 3.2005858421325684, "rewards/rejected": -4.151562690734863, "step": 2910 }, { "epoch": 1.0997834886566884, "grad_norm": 19.953698798463545, "learning_rate": 7.25141242937853e-07, "logits/chosen": -1.7829101085662842, "logits/rejected": -1.7417480945587158, "logps/chosen": -347.5375061035156, "logps/rejected": -372.82501220703125, "loss": 0.1382, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.95623779296875, "rewards/margins": 3.324414014816284, "rewards/rejected": -4.28125, "step": 2920 }, { "epoch": 1.1035489033229784, "grad_norm": 26.15139295124515, "learning_rate": 7.241996233521657e-07, "logits/chosen": -1.7877929210662842, "logits/rejected": -1.70458984375, "logps/chosen": -352.8125, "logps/rejected": -396.45001220703125, "loss": 0.1532, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.650646984577179, "rewards/margins": 3.478710889816284, "rewards/rejected": -4.133203029632568, "step": 2930 }, { "epoch": 1.1073143179892686, "grad_norm": 24.95711851183801, "learning_rate": 7.232580037664782e-07, "logits/chosen": -1.666894555091858, "logits/rejected": -1.639746069908142, "logps/chosen": -367.6000061035156, "logps/rejected": -347.54998779296875, "loss": 0.1393, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2812866270542145, "rewards/margins": 3.1807618141174316, "rewards/rejected": -3.4634766578674316, "step": 2940 }, { "epoch": 1.1110797326555586, "grad_norm": 15.191328002471687, "learning_rate": 7.223163841807909e-07, "logits/chosen": -1.804589867591858, "logits/rejected": -1.57080078125, "logps/chosen": -329.625, "logps/rejected": -327.875, "loss": 0.1548, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.16782836616039276, "rewards/margins": 3.2632813453674316, "rewards/rejected": -3.4297852516174316, "step": 2950 }, { "epoch": 1.1148451473218488, "grad_norm": 4.982315835979355, "learning_rate": 7.213747645951035e-07, "logits/chosen": -1.611328125, "logits/rejected": -1.531103491783142, "logps/chosen": -364.4750061035156, "logps/rejected": -350.5, "loss": 0.1308, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.01088867150247097, "rewards/margins": 3.4749999046325684, "rewards/rejected": -3.4658203125, "step": 2960 }, { "epoch": 1.118610561988139, "grad_norm": 32.27253787376418, "learning_rate": 7.204331450094162e-07, "logits/chosen": -1.668359398841858, "logits/rejected": -1.543554663658142, "logps/chosen": -301.63751220703125, "logps/rejected": -335.82501220703125, "loss": 0.2001, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.39274901151657104, "rewards/margins": 3.343945264816284, "rewards/rejected": -3.7372069358825684, "step": 2970 }, { "epoch": 1.122375976654429, "grad_norm": 23.972423994019692, "learning_rate": 7.194915254237288e-07, "logits/chosen": -1.596582055091858, "logits/rejected": -1.635156273841858, "logps/chosen": -346.125, "logps/rejected": -337.3374938964844, "loss": 0.1283, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.23884277045726776, "rewards/margins": 3.55078125, "rewards/rejected": -3.795703172683716, "step": 2980 }, { "epoch": 1.1261413913207192, "grad_norm": 21.062447878400523, "learning_rate": 7.185499058380414e-07, "logits/chosen": -1.554785132408142, "logits/rejected": -1.506616234779358, "logps/chosen": -378.38751220703125, "logps/rejected": -351.8999938964844, "loss": 0.1125, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3648681640625, "rewards/margins": 3.6742186546325684, "rewards/rejected": -4.039258003234863, "step": 2990 }, { "epoch": 1.1299068059870092, "grad_norm": 40.104584635789195, "learning_rate": 7.17608286252354e-07, "logits/chosen": -1.724218726158142, "logits/rejected": -1.466406226158142, "logps/chosen": -317.84375, "logps/rejected": -329.0, "loss": 0.1425, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.46485596895217896, "rewards/margins": 3.3204102516174316, "rewards/rejected": -3.7855467796325684, "step": 3000 }, { "epoch": 1.1336722206532994, "grad_norm": 40.73745041807942, "learning_rate": 7.166666666666667e-07, "logits/chosen": -1.6082031726837158, "logits/rejected": -1.6443359851837158, "logps/chosen": -346.92498779296875, "logps/rejected": -326.29998779296875, "loss": 0.1215, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2775634825229645, "rewards/margins": 3.389453172683716, "rewards/rejected": -3.6675782203674316, "step": 3010 }, { "epoch": 1.1374376353195896, "grad_norm": 19.944750606727915, "learning_rate": 7.157250470809792e-07, "logits/chosen": -1.8499023914337158, "logits/rejected": -1.599755883216858, "logps/chosen": -332.54998779296875, "logps/rejected": -381.375, "loss": 0.1678, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.25871580839157104, "rewards/margins": 3.40625, "rewards/rejected": -3.664257764816284, "step": 3020 }, { "epoch": 1.1412030499858796, "grad_norm": 28.19217742608714, "learning_rate": 7.147834274952919e-07, "logits/chosen": -1.694921851158142, "logits/rejected": -1.5910155773162842, "logps/chosen": -300.76251220703125, "logps/rejected": -328.01251220703125, "loss": 0.1571, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.06530151516199112, "rewards/margins": 3.4720702171325684, "rewards/rejected": -3.5379395484924316, "step": 3030 }, { "epoch": 1.1449684646521698, "grad_norm": 34.38175851206615, "learning_rate": 7.138418079096044e-07, "logits/chosen": -1.5986328125, "logits/rejected": -1.566308617591858, "logps/chosen": -353.4750061035156, "logps/rejected": -341.4750061035156, "loss": 0.1515, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.21384277939796448, "rewards/margins": 3.2787108421325684, "rewards/rejected": -3.06494140625, "step": 3040 }, { "epoch": 1.1487338793184598, "grad_norm": 29.707015130031085, "learning_rate": 7.129001883239171e-07, "logits/chosen": -1.603613257408142, "logits/rejected": -1.4221680164337158, "logps/chosen": -332.25, "logps/rejected": -383.32501220703125, "loss": 0.1358, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.15817871689796448, "rewards/margins": 3.379101514816284, "rewards/rejected": -3.2236328125, "step": 3050 }, { "epoch": 1.15249929398475, "grad_norm": 15.215453567227181, "learning_rate": 7.119585687382296e-07, "logits/chosen": -1.484277367591858, "logits/rejected": -1.45703125, "logps/chosen": -356.42498779296875, "logps/rejected": -347.70001220703125, "loss": 0.1489, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.1737060546875, "rewards/margins": 3.2255859375, "rewards/rejected": -3.0511717796325684, "step": 3060 }, { "epoch": 1.1562647086510403, "grad_norm": 21.698336752368824, "learning_rate": 7.110169491525423e-07, "logits/chosen": -1.7018554210662842, "logits/rejected": -1.678613305091858, "logps/chosen": -298.2749938964844, "logps/rejected": -294.17498779296875, "loss": 0.1435, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.08116455376148224, "rewards/margins": 3.214648485183716, "rewards/rejected": -3.2962403297424316, "step": 3070 }, { "epoch": 1.1600301233173302, "grad_norm": 23.798876319862654, "learning_rate": 7.100753295668551e-07, "logits/chosen": -1.53857421875, "logits/rejected": -1.5255858898162842, "logps/chosen": -392.2749938964844, "logps/rejected": -370.0, "loss": 0.1288, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06093750149011612, "rewards/margins": 3.5259766578674316, "rewards/rejected": -3.463671922683716, "step": 3080 }, { "epoch": 1.1637955379836205, "grad_norm": 29.528991343213487, "learning_rate": 7.091337099811676e-07, "logits/chosen": -1.788476586341858, "logits/rejected": -1.661035180091858, "logps/chosen": -336.4375, "logps/rejected": -341.3374938964844, "loss": 0.196, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.40613096952438354, "rewards/margins": 3.0687499046325684, "rewards/rejected": -3.4751954078674316, "step": 3090 }, { "epoch": 1.1675609526499107, "grad_norm": 20.501217364875924, "learning_rate": 7.081920903954803e-07, "logits/chosen": -1.720605492591858, "logits/rejected": -1.589941382408142, "logps/chosen": -329.3999938964844, "logps/rejected": -305.4750061035156, "loss": 0.1128, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.23262634873390198, "rewards/margins": 3.56640625, "rewards/rejected": -3.795117139816284, "step": 3100 }, { "epoch": 1.1713263673162007, "grad_norm": 15.931676973036325, "learning_rate": 7.072504708097928e-07, "logits/chosen": -1.832617163658142, "logits/rejected": -1.7917969226837158, "logps/chosen": -372.20001220703125, "logps/rejected": -334.0249938964844, "loss": 0.1392, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5976196527481079, "rewards/margins": 3.365234375, "rewards/rejected": -3.9632811546325684, "step": 3110 }, { "epoch": 1.1750917819824909, "grad_norm": 20.183788869845248, "learning_rate": 7.063088512241055e-07, "logits/chosen": -1.68896484375, "logits/rejected": -1.664941430091858, "logps/chosen": -354.57501220703125, "logps/rejected": -380.8999938964844, "loss": 0.1855, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9339507818222046, "rewards/margins": 3.616406202316284, "rewards/rejected": -4.550976753234863, "step": 3120 }, { "epoch": 1.1788571966487809, "grad_norm": 10.258850744866285, "learning_rate": 7.05367231638418e-07, "logits/chosen": -1.85791015625, "logits/rejected": -1.7029297351837158, "logps/chosen": -313.6625061035156, "logps/rejected": -346.82501220703125, "loss": 0.1264, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6472717523574829, "rewards/margins": 3.5814452171325684, "rewards/rejected": -4.2265625, "step": 3130 }, { "epoch": 1.182622611315071, "grad_norm": 45.16257361832549, "learning_rate": 7.044256120527307e-07, "logits/chosen": -1.812890648841858, "logits/rejected": -1.687744140625, "logps/chosen": -301.45001220703125, "logps/rejected": -311.4375, "loss": 0.1346, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5633910894393921, "rewards/margins": 3.2513670921325684, "rewards/rejected": -3.814453125, "step": 3140 }, { "epoch": 1.1863880259813613, "grad_norm": 27.68787309161926, "learning_rate": 7.034839924670432e-07, "logits/chosen": -1.7332031726837158, "logits/rejected": -1.7086913585662842, "logps/chosen": -328.92498779296875, "logps/rejected": -309.38751220703125, "loss": 0.1955, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2682556211948395, "rewards/margins": 3.1163086891174316, "rewards/rejected": -3.384472608566284, "step": 3150 }, { "epoch": 1.1901534406476513, "grad_norm": 18.254702663256616, "learning_rate": 7.025423728813559e-07, "logits/chosen": -1.642187476158142, "logits/rejected": -1.7160155773162842, "logps/chosen": -346.5, "logps/rejected": -349.2250061035156, "loss": 0.1543, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.299652099609375, "rewards/margins": 3.078320264816284, "rewards/rejected": -3.3779296875, "step": 3160 }, { "epoch": 1.1939188553139415, "grad_norm": 22.10772486748456, "learning_rate": 7.016007532956685e-07, "logits/chosen": -1.6028320789337158, "logits/rejected": -1.480249047279358, "logps/chosen": -370.76251220703125, "logps/rejected": -377.11248779296875, "loss": 0.1248, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.599316418170929, "rewards/margins": 3.45556640625, "rewards/rejected": -4.053124904632568, "step": 3170 }, { "epoch": 1.1976842699802317, "grad_norm": 24.720952694234466, "learning_rate": 7.006591337099811e-07, "logits/chosen": -1.623046875, "logits/rejected": -1.5082519054412842, "logps/chosen": -339.29376220703125, "logps/rejected": -329.92498779296875, "loss": 0.1392, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8509765863418579, "rewards/margins": 3.279003858566284, "rewards/rejected": -4.130566596984863, "step": 3180 }, { "epoch": 1.2014496846465217, "grad_norm": 14.276231734648428, "learning_rate": 6.997175141242938e-07, "logits/chosen": -1.8673827648162842, "logits/rejected": -1.7966797351837158, "logps/chosen": -339.20001220703125, "logps/rejected": -332.9750061035156, "loss": 0.1716, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0404541492462158, "rewards/margins": 3.287890672683716, "rewards/rejected": -4.327734470367432, "step": 3190 }, { "epoch": 1.205215099312812, "grad_norm": 36.493179145897784, "learning_rate": 6.987758945386064e-07, "logits/chosen": -1.8369140625, "logits/rejected": -1.725488305091858, "logps/chosen": -328.3125, "logps/rejected": -349.1000061035156, "loss": 0.1543, "rewards/accuracies": 0.9375, "rewards/chosen": -1.064855933189392, "rewards/margins": 3.3349609375, "rewards/rejected": -4.398046970367432, "step": 3200 }, { "epoch": 1.208980513979102, "grad_norm": 33.15116247588805, "learning_rate": 6.97834274952919e-07, "logits/chosen": -1.745214819908142, "logits/rejected": -1.796289086341858, "logps/chosen": -365.0375061035156, "logps/rejected": -375.8500061035156, "loss": 0.1213, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1397216320037842, "rewards/margins": 3.6605467796325684, "rewards/rejected": -4.800390720367432, "step": 3210 }, { "epoch": 1.212745928645392, "grad_norm": 44.23839224277966, "learning_rate": 6.968926553672316e-07, "logits/chosen": -1.8478515148162842, "logits/rejected": -1.705957055091858, "logps/chosen": -354.0249938964844, "logps/rejected": -365.5625, "loss": 0.1727, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.231591820716858, "rewards/margins": 3.530566453933716, "rewards/rejected": -4.7587890625, "step": 3220 }, { "epoch": 1.2165113433116823, "grad_norm": 26.117292865212825, "learning_rate": 6.959510357815442e-07, "logits/chosen": -1.883691430091858, "logits/rejected": -1.752050757408142, "logps/chosen": -348.3125, "logps/rejected": -336.8500061035156, "loss": 0.1238, "rewards/accuracies": 0.96875, "rewards/chosen": -1.426184058189392, "rewards/margins": 3.4994139671325684, "rewards/rejected": -4.925000190734863, "step": 3230 }, { "epoch": 1.2202767579779723, "grad_norm": 25.546307239214062, "learning_rate": 6.950094161958569e-07, "logits/chosen": -1.8083007335662842, "logits/rejected": -1.694433569908142, "logps/chosen": -355.92498779296875, "logps/rejected": -373.4750061035156, "loss": 0.1385, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.052209496498108, "rewards/margins": 3.5833983421325684, "rewards/rejected": -4.63671875, "step": 3240 }, { "epoch": 1.2240421726442625, "grad_norm": 43.673318110325944, "learning_rate": 6.940677966101694e-07, "logits/chosen": -1.8825194835662842, "logits/rejected": -1.746826171875, "logps/chosen": -335.2124938964844, "logps/rejected": -350.01251220703125, "loss": 0.1461, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.078405737876892, "rewards/margins": 3.3470702171325684, "rewards/rejected": -4.42578125, "step": 3250 }, { "epoch": 1.2278075873105525, "grad_norm": 19.026854414676038, "learning_rate": 6.931261770244821e-07, "logits/chosen": -1.701171875, "logits/rejected": -1.6740233898162842, "logps/chosen": -334.3500061035156, "logps/rejected": -331.4624938964844, "loss": 0.1267, "rewards/accuracies": 0.96875, "rewards/chosen": -1.202337622642517, "rewards/margins": 3.539843797683716, "rewards/rejected": -4.741015434265137, "step": 3260 }, { "epoch": 1.2315730019768427, "grad_norm": 19.35064758214762, "learning_rate": 6.921845574387946e-07, "logits/chosen": -1.71142578125, "logits/rejected": -1.74072265625, "logps/chosen": -351.0, "logps/rejected": -327.11248779296875, "loss": 0.164, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.26324462890625, "rewards/margins": 3.402148485183716, "rewards/rejected": -4.664453029632568, "step": 3270 }, { "epoch": 1.235338416643133, "grad_norm": 17.971116121292795, "learning_rate": 6.912429378531073e-07, "logits/chosen": -1.822265625, "logits/rejected": -1.648535132408142, "logps/chosen": -341.75, "logps/rejected": -359.9750061035156, "loss": 0.1227, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1258423328399658, "rewards/margins": 3.477343797683716, "rewards/rejected": -4.604296684265137, "step": 3280 }, { "epoch": 1.239103831309423, "grad_norm": 25.834558249643656, "learning_rate": 6.903013182674198e-07, "logits/chosen": -1.8405272960662842, "logits/rejected": -1.6970703601837158, "logps/chosen": -383.4375, "logps/rejected": -360.625, "loss": 0.1463, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.125860571861267, "rewards/margins": 3.3101563453674316, "rewards/rejected": -4.436718940734863, "step": 3290 }, { "epoch": 1.2428692459757131, "grad_norm": 18.697788154200374, "learning_rate": 6.893596986817326e-07, "logits/chosen": -1.8037109375, "logits/rejected": -1.771093726158142, "logps/chosen": -337.4375, "logps/rejected": -344.5249938964844, "loss": 0.1459, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7681976556777954, "rewards/margins": 3.5064454078674316, "rewards/rejected": -4.275781154632568, "step": 3300 }, { "epoch": 1.2466346606420031, "grad_norm": 17.544979157429044, "learning_rate": 6.884180790960453e-07, "logits/chosen": -1.834082007408142, "logits/rejected": -1.651269555091858, "logps/chosen": -339.5625, "logps/rejected": -372.63751220703125, "loss": 0.1405, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.251708984375, "rewards/margins": 3.66015625, "rewards/rejected": -4.912109375, "step": 3310 }, { "epoch": 1.2504000753082933, "grad_norm": 19.956031987900875, "learning_rate": 6.874764595103578e-07, "logits/chosen": -1.7736327648162842, "logits/rejected": -1.673730492591858, "logps/chosen": -375.6000061035156, "logps/rejected": -361.75, "loss": 0.1436, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0316345691680908, "rewards/margins": 3.489990234375, "rewards/rejected": -4.521874904632568, "step": 3320 }, { "epoch": 1.2541654899745835, "grad_norm": 24.80195497702995, "learning_rate": 6.865348399246705e-07, "logits/chosen": -1.836328148841858, "logits/rejected": -1.669531226158142, "logps/chosen": -331.29998779296875, "logps/rejected": -363.7875061035156, "loss": 0.1867, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.17547607421875, "rewards/margins": 3.215039014816284, "rewards/rejected": -4.38671875, "step": 3330 }, { "epoch": 1.2579309046408735, "grad_norm": 19.10357533733567, "learning_rate": 6.85593220338983e-07, "logits/chosen": -1.742285132408142, "logits/rejected": -1.764550805091858, "logps/chosen": -391.11248779296875, "logps/rejected": -367.04998779296875, "loss": 0.134, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7732909917831421, "rewards/margins": 3.559375047683716, "rewards/rejected": -4.332421779632568, "step": 3340 }, { "epoch": 1.2616963193071637, "grad_norm": 27.300936705330052, "learning_rate": 6.846516007532957e-07, "logits/chosen": -1.755859375, "logits/rejected": -1.5790526866912842, "logps/chosen": -326.2749938964844, "logps/rejected": -329.23748779296875, "loss": 0.1913, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.1297485828399658, "rewards/margins": 3.144726514816284, "rewards/rejected": -4.273828029632568, "step": 3350 }, { "epoch": 1.2654617339734537, "grad_norm": 17.162215658238573, "learning_rate": 6.837099811676082e-07, "logits/chosen": -1.6613280773162842, "logits/rejected": -1.510888695716858, "logps/chosen": -343.2250061035156, "logps/rejected": -335.2250061035156, "loss": 0.1593, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9496215581893921, "rewards/margins": 3.272265672683716, "rewards/rejected": -4.22265625, "step": 3360 }, { "epoch": 1.269227148639744, "grad_norm": 24.782169846870694, "learning_rate": 6.827683615819209e-07, "logits/chosen": -1.853515625, "logits/rejected": -1.7490234375, "logps/chosen": -328.67498779296875, "logps/rejected": -329.0, "loss": 0.1171, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8594604730606079, "rewards/margins": 3.396289110183716, "rewards/rejected": -4.255468845367432, "step": 3370 }, { "epoch": 1.2729925633060342, "grad_norm": 65.61944326337925, "learning_rate": 6.818267419962335e-07, "logits/chosen": -1.74365234375, "logits/rejected": -1.580957055091858, "logps/chosen": -338.3374938964844, "logps/rejected": -323.29998779296875, "loss": 0.194, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.117334008216858, "rewards/margins": 3.1439452171325684, "rewards/rejected": -4.260546684265137, "step": 3380 }, { "epoch": 1.2767579779723242, "grad_norm": 32.50138433477934, "learning_rate": 6.808851224105461e-07, "logits/chosen": -1.732519507408142, "logits/rejected": -1.6276366710662842, "logps/chosen": -346.4375, "logps/rejected": -363.8374938964844, "loss": 0.1756, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7668212652206421, "rewards/margins": 3.5482420921325684, "rewards/rejected": -4.316601753234863, "step": 3390 }, { "epoch": 1.2805233926386144, "grad_norm": 28.232736802237923, "learning_rate": 6.799435028248587e-07, "logits/chosen": -1.8235352039337158, "logits/rejected": -1.7521483898162842, "logps/chosen": -288.4375, "logps/rejected": -329.95001220703125, "loss": 0.1595, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5691253542900085, "rewards/margins": 3.376953125, "rewards/rejected": -3.945507764816284, "step": 3400 }, { "epoch": 1.2842888073049044, "grad_norm": 15.582104246739457, "learning_rate": 6.790018832391713e-07, "logits/chosen": -1.8115234375, "logits/rejected": -1.661230444908142, "logps/chosen": -346.82501220703125, "logps/rejected": -374.4125061035156, "loss": 0.1252, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6659515500068665, "rewards/margins": 3.5103516578674316, "rewards/rejected": -4.175585746765137, "step": 3410 }, { "epoch": 1.2880542219711946, "grad_norm": 21.559239417962118, "learning_rate": 6.78060263653484e-07, "logits/chosen": -1.573828101158142, "logits/rejected": -1.6259765625, "logps/chosen": -349.1499938964844, "logps/rejected": -322.6499938964844, "loss": 0.1229, "rewards/accuracies": 0.96875, "rewards/chosen": -0.50396728515625, "rewards/margins": 3.3482422828674316, "rewards/rejected": -3.8511719703674316, "step": 3420 }, { "epoch": 1.2918196366374848, "grad_norm": 30.760791197143302, "learning_rate": 6.771186440677966e-07, "logits/chosen": -1.830957055091858, "logits/rejected": -1.7747070789337158, "logps/chosen": -340.45001220703125, "logps/rejected": -327.20001220703125, "loss": 0.1408, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7962310910224915, "rewards/margins": 3.3978514671325684, "rewards/rejected": -4.192578315734863, "step": 3430 }, { "epoch": 1.2955850513037748, "grad_norm": 28.437571687207754, "learning_rate": 6.761770244821092e-07, "logits/chosen": -1.7136719226837158, "logits/rejected": -1.7213866710662842, "logps/chosen": -380.6875, "logps/rejected": -345.75, "loss": 0.1487, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8062744140625, "rewards/margins": 3.5396485328674316, "rewards/rejected": -4.345703125, "step": 3440 }, { "epoch": 1.299350465970065, "grad_norm": 43.94566461004294, "learning_rate": 6.752354048964219e-07, "logits/chosen": -1.753515601158142, "logits/rejected": -1.6711914539337158, "logps/chosen": -338.75, "logps/rejected": -323.70001220703125, "loss": 0.1383, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.7685180902481079, "rewards/margins": 3.3880858421325684, "rewards/rejected": -4.155859470367432, "step": 3450 }, { "epoch": 1.303115880636355, "grad_norm": 33.79891724607883, "learning_rate": 6.742937853107344e-07, "logits/chosen": -1.8662109375, "logits/rejected": -1.7746093273162842, "logps/chosen": -368.0375061035156, "logps/rejected": -318.54998779296875, "loss": 0.107, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0400512218475342, "rewards/margins": 3.479687452316284, "rewards/rejected": -4.51953125, "step": 3460 }, { "epoch": 1.3068812953026452, "grad_norm": 14.922709321710325, "learning_rate": 6.733521657250471e-07, "logits/chosen": -1.956152319908142, "logits/rejected": -1.714257836341858, "logps/chosen": -300.45001220703125, "logps/rejected": -347.0249938964844, "loss": 0.145, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.0894286632537842, "rewards/margins": 3.459765672683716, "rewards/rejected": -4.547265529632568, "step": 3470 }, { "epoch": 1.3106467099689354, "grad_norm": 27.71436737139531, "learning_rate": 6.724105461393596e-07, "logits/chosen": -1.9119141101837158, "logits/rejected": -1.751074194908142, "logps/chosen": -315.5375061035156, "logps/rejected": -313.29998779296875, "loss": 0.14, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.668261706829071, "rewards/margins": 3.262500047683716, "rewards/rejected": -3.928906202316284, "step": 3480 }, { "epoch": 1.3144121246352254, "grad_norm": 19.360536768793786, "learning_rate": 6.714689265536723e-07, "logits/chosen": -1.836523413658142, "logits/rejected": -1.8210937976837158, "logps/chosen": -377.125, "logps/rejected": -335.6499938964844, "loss": 0.1929, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.53045654296875, "rewards/margins": 3.257617235183716, "rewards/rejected": -3.787109375, "step": 3490 }, { "epoch": 1.3181775393015156, "grad_norm": 9.75859100496894, "learning_rate": 6.705273069679848e-07, "logits/chosen": -1.85595703125, "logits/rejected": -1.779296875, "logps/chosen": -357.70001220703125, "logps/rejected": -315.125, "loss": 0.1403, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.45077210664749146, "rewards/margins": 3.5396971702575684, "rewards/rejected": -3.9908204078674316, "step": 3500 }, { "epoch": 1.3219429539678056, "grad_norm": 30.083828176761134, "learning_rate": 6.695856873822975e-07, "logits/chosen": -1.798486351966858, "logits/rejected": -1.699121117591858, "logps/chosen": -359.1000061035156, "logps/rejected": -386.0, "loss": 0.1298, "rewards/accuracies": 0.9375, "rewards/chosen": -0.46308594942092896, "rewards/margins": 3.7857422828674316, "rewards/rejected": -4.246289253234863, "step": 3510 }, { "epoch": 1.3257083686340958, "grad_norm": 13.960934779757741, "learning_rate": 6.6864406779661e-07, "logits/chosen": -1.924414038658142, "logits/rejected": -1.7521483898162842, "logps/chosen": -331.3125, "logps/rejected": -361.20001220703125, "loss": 0.0916, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4074462950229645, "rewards/margins": 4.060546875, "rewards/rejected": -4.465234279632568, "step": 3520 }, { "epoch": 1.329473783300386, "grad_norm": 21.27656608087164, "learning_rate": 6.677024482109228e-07, "logits/chosen": -1.835546851158142, "logits/rejected": -1.9001953601837158, "logps/chosen": -351.5, "logps/rejected": -343.8999938964844, "loss": 0.1056, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.677844226360321, "rewards/margins": 3.6771483421325684, "rewards/rejected": -4.353906154632568, "step": 3530 }, { "epoch": 1.333239197966676, "grad_norm": 39.78041285996303, "learning_rate": 6.667608286252354e-07, "logits/chosen": -1.954980492591858, "logits/rejected": -1.896386742591858, "logps/chosen": -345.5625, "logps/rejected": -374.86248779296875, "loss": 0.1875, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.826855480670929, "rewards/margins": 3.76953125, "rewards/rejected": -4.596093654632568, "step": 3540 }, { "epoch": 1.3370046126329662, "grad_norm": 37.089383764736965, "learning_rate": 6.65819209039548e-07, "logits/chosen": -1.9235351085662842, "logits/rejected": -1.997656226158142, "logps/chosen": -330.61248779296875, "logps/rejected": -346.20001220703125, "loss": 0.1555, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.2440185546875, "rewards/margins": 3.623046875, "rewards/rejected": -4.869140625, "step": 3550 }, { "epoch": 1.3407700272992562, "grad_norm": 36.304683594309736, "learning_rate": 6.648775894538606e-07, "logits/chosen": -1.878515601158142, "logits/rejected": -1.849609375, "logps/chosen": -332.7749938964844, "logps/rejected": -328.875, "loss": 0.1259, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.748297095298767, "rewards/margins": 3.699414014816284, "rewards/rejected": -5.449999809265137, "step": 3560 }, { "epoch": 1.3445354419655464, "grad_norm": 21.111665450212808, "learning_rate": 6.639359698681732e-07, "logits/chosen": -1.9792969226837158, "logits/rejected": -1.839453101158142, "logps/chosen": -325.38751220703125, "logps/rejected": -338.07501220703125, "loss": 0.1786, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.959912121295929, "rewards/margins": 3.6756348609924316, "rewards/rejected": -4.635156154632568, "step": 3570 }, { "epoch": 1.3483008566318366, "grad_norm": 55.23733848910727, "learning_rate": 6.629943502824859e-07, "logits/chosen": -1.8860352039337158, "logits/rejected": -1.687890648841858, "logps/chosen": -354.26251220703125, "logps/rejected": -353.79998779296875, "loss": 0.1613, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.2619476318359375, "rewards/margins": 3.585156202316284, "rewards/rejected": -4.847265720367432, "step": 3580 }, { "epoch": 1.3520662712981268, "grad_norm": 25.905197637822937, "learning_rate": 6.620527306967985e-07, "logits/chosen": -1.7682616710662842, "logits/rejected": -1.669580101966858, "logps/chosen": -317.26251220703125, "logps/rejected": -322.8374938964844, "loss": 0.1171, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8970092535018921, "rewards/margins": 3.778515577316284, "rewards/rejected": -4.676562309265137, "step": 3590 }, { "epoch": 1.3558316859644168, "grad_norm": 20.333541774590767, "learning_rate": 6.611111111111111e-07, "logits/chosen": -1.614355444908142, "logits/rejected": -1.557714819908142, "logps/chosen": -367.70001220703125, "logps/rejected": -333.8500061035156, "loss": 0.155, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.40812987089157104, "rewards/margins": 3.472460985183716, "rewards/rejected": -3.8818359375, "step": 3600 }, { "epoch": 1.3595971006307068, "grad_norm": 8.780423955023492, "learning_rate": 6.601694915254237e-07, "logits/chosen": -1.672265648841858, "logits/rejected": -1.625634789466858, "logps/chosen": -331.8374938964844, "logps/rejected": -367.3500061035156, "loss": 0.1252, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.14476318657398224, "rewards/margins": 3.982714891433716, "rewards/rejected": -4.125781059265137, "step": 3610 }, { "epoch": 1.363362515296997, "grad_norm": 42.224656119883726, "learning_rate": 6.592278719397363e-07, "logits/chosen": -1.7257812023162842, "logits/rejected": -1.605859398841858, "logps/chosen": -330.3187561035156, "logps/rejected": -324.54998779296875, "loss": 0.1485, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.02410888671875, "rewards/margins": 2.953125, "rewards/rejected": -2.978515625, "step": 3620 }, { "epoch": 1.3671279299632872, "grad_norm": 34.44528341187513, "learning_rate": 6.582862523540489e-07, "logits/chosen": -1.744238257408142, "logits/rejected": -1.7187988758087158, "logps/chosen": -347.92498779296875, "logps/rejected": -343.1499938964844, "loss": 0.1405, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.044708251953125, "rewards/margins": 3.412109375, "rewards/rejected": -3.456835985183716, "step": 3630 }, { "epoch": 1.3708933446295775, "grad_norm": 29.34177219539941, "learning_rate": 6.573446327683616e-07, "logits/chosen": -1.8494141101837158, "logits/rejected": -1.7392578125, "logps/chosen": -335.88751220703125, "logps/rejected": -338.6625061035156, "loss": 0.177, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17036132514476776, "rewards/margins": 3.201367139816284, "rewards/rejected": -3.370800733566284, "step": 3640 }, { "epoch": 1.3746587592958674, "grad_norm": 18.376906054800372, "learning_rate": 6.564030131826742e-07, "logits/chosen": -1.7400391101837158, "logits/rejected": -1.6315429210662842, "logps/chosen": -370.17498779296875, "logps/rejected": -364.82501220703125, "loss": 0.156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.010035705752670765, "rewards/margins": 3.4710936546325684, "rewards/rejected": -3.4789061546325684, "step": 3650 }, { "epoch": 1.3784241739621577, "grad_norm": 23.06591615509829, "learning_rate": 6.554613935969869e-07, "logits/chosen": -1.847070336341858, "logits/rejected": -1.803808569908142, "logps/chosen": -313.92498779296875, "logps/rejected": -311.73748779296875, "loss": 0.1405, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.31416016817092896, "rewards/margins": 3.3084959983825684, "rewards/rejected": -3.6253905296325684, "step": 3660 }, { "epoch": 1.3821895886284477, "grad_norm": 23.461926274209493, "learning_rate": 6.545197740112994e-07, "logits/chosen": -1.788964867591858, "logits/rejected": -1.5905272960662842, "logps/chosen": -322.75, "logps/rejected": -340.6000061035156, "loss": 0.118, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5001220703125, "rewards/margins": 3.4583983421325684, "rewards/rejected": -3.9593749046325684, "step": 3670 }, { "epoch": 1.3859550032947379, "grad_norm": 25.092167639593654, "learning_rate": 6.535781544256121e-07, "logits/chosen": -1.6577637195587158, "logits/rejected": -1.619726538658142, "logps/chosen": -351.125, "logps/rejected": -332.45001220703125, "loss": 0.1671, "rewards/accuracies": 0.9375, "rewards/chosen": -0.479592889547348, "rewards/margins": 3.2362303733825684, "rewards/rejected": -3.7138671875, "step": 3680 }, { "epoch": 1.389720417961028, "grad_norm": 32.0196821226414, "learning_rate": 6.526365348399246e-07, "logits/chosen": -1.854589819908142, "logits/rejected": -1.6123046875, "logps/chosen": -334.54998779296875, "logps/rejected": -387.29998779296875, "loss": 0.1894, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9334472417831421, "rewards/margins": 3.25439453125, "rewards/rejected": -4.189843654632568, "step": 3690 }, { "epoch": 1.393485832627318, "grad_norm": 25.005514528117207, "learning_rate": 6.516949152542373e-07, "logits/chosen": -1.8512694835662842, "logits/rejected": -1.873046875, "logps/chosen": -341.36248779296875, "logps/rejected": -312.17498779296875, "loss": 0.1762, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5751708745956421, "rewards/margins": 3.3150391578674316, "rewards/rejected": -3.8902344703674316, "step": 3700 }, { "epoch": 1.3972512472936083, "grad_norm": 36.612679051649934, "learning_rate": 6.507532956685498e-07, "logits/chosen": -1.8673827648162842, "logits/rejected": -1.793066382408142, "logps/chosen": -351.2749938964844, "logps/rejected": -340.38751220703125, "loss": 0.1491, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.712603747844696, "rewards/margins": 3.5308594703674316, "rewards/rejected": -4.2431640625, "step": 3710 }, { "epoch": 1.4010166619598983, "grad_norm": 45.44979702250484, "learning_rate": 6.498116760828625e-07, "logits/chosen": -1.8400390148162842, "logits/rejected": -1.912109375, "logps/chosen": -382.8500061035156, "logps/rejected": -354.6000061035156, "loss": 0.1319, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.6321365237236023, "rewards/margins": 3.685351610183716, "rewards/rejected": -4.315625190734863, "step": 3720 }, { "epoch": 1.4047820766261885, "grad_norm": 33.6854332365387, "learning_rate": 6.48870056497175e-07, "logits/chosen": -1.8291015625, "logits/rejected": -1.639892578125, "logps/chosen": -354.04998779296875, "logps/rejected": -370.70001220703125, "loss": 0.1435, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.745776355266571, "rewards/margins": 3.807812452316284, "rewards/rejected": -4.552734375, "step": 3730 }, { "epoch": 1.4085474912924787, "grad_norm": 16.047377273207655, "learning_rate": 6.479284369114877e-07, "logits/chosen": -1.9304687976837158, "logits/rejected": -1.8036620616912842, "logps/chosen": -326.5625, "logps/rejected": -331.1499938964844, "loss": 0.1711, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.755291759967804, "rewards/margins": 3.3294434547424316, "rewards/rejected": -4.083984375, "step": 3740 }, { "epoch": 1.4123129059587687, "grad_norm": 23.211007117040882, "learning_rate": 6.469868173258003e-07, "logits/chosen": -1.9289062023162842, "logits/rejected": -1.9091796875, "logps/chosen": -324.0249938964844, "logps/rejected": -341.13751220703125, "loss": 0.1481, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.651110827922821, "rewards/margins": 3.3935546875, "rewards/rejected": -4.044726371765137, "step": 3750 }, { "epoch": 1.416078320625059, "grad_norm": 27.040648962548758, "learning_rate": 6.46045197740113e-07, "logits/chosen": -1.7370116710662842, "logits/rejected": -1.589453101158142, "logps/chosen": -369.95001220703125, "logps/rejected": -363.5249938964844, "loss": 0.1025, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2643798887729645, "rewards/margins": 3.7793946266174316, "rewards/rejected": -4.046484470367432, "step": 3760 }, { "epoch": 1.4198437352913489, "grad_norm": 37.48746075607661, "learning_rate": 6.451035781544256e-07, "logits/chosen": -1.981835961341858, "logits/rejected": -1.744042992591858, "logps/chosen": -298.0249938964844, "logps/rejected": -369.6499938964844, "loss": 0.17, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8374267816543579, "rewards/margins": 3.5673828125, "rewards/rejected": -4.40625, "step": 3770 }, { "epoch": 1.423609149957639, "grad_norm": 11.989167551318168, "learning_rate": 6.441619585687382e-07, "logits/chosen": -1.93359375, "logits/rejected": -1.76611328125, "logps/chosen": -337.73748779296875, "logps/rejected": -345.45001220703125, "loss": 0.1615, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.898358166217804, "rewards/margins": 3.7738280296325684, "rewards/rejected": -4.673437595367432, "step": 3780 }, { "epoch": 1.4273745646239293, "grad_norm": 14.001811515198112, "learning_rate": 6.432203389830508e-07, "logits/chosen": -1.790429711341858, "logits/rejected": -1.7080078125, "logps/chosen": -371.0, "logps/rejected": -356.67498779296875, "loss": 0.1682, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.1973755359649658, "rewards/margins": 3.318554639816284, "rewards/rejected": -4.516797065734863, "step": 3790 }, { "epoch": 1.4311399792902193, "grad_norm": 23.334666909570636, "learning_rate": 6.422787193973634e-07, "logits/chosen": -1.994726538658142, "logits/rejected": -1.893164038658142, "logps/chosen": -331.8374938964844, "logps/rejected": -343.0249938964844, "loss": 0.1551, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.4774901866912842, "rewards/margins": 3.2484374046325684, "rewards/rejected": -4.727343559265137, "step": 3800 }, { "epoch": 1.4349053939565095, "grad_norm": 35.63219734820654, "learning_rate": 6.41337099811676e-07, "logits/chosen": -1.743749976158142, "logits/rejected": -1.8307616710662842, "logps/chosen": -366.2250061035156, "logps/rejected": -361.82501220703125, "loss": 0.1482, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.1598694324493408, "rewards/margins": 3.3377928733825684, "rewards/rejected": -4.497656345367432, "step": 3810 }, { "epoch": 1.4386708086227995, "grad_norm": 35.84511504302666, "learning_rate": 6.403954802259887e-07, "logits/chosen": -1.847509741783142, "logits/rejected": -1.765527367591858, "logps/chosen": -340.26251220703125, "logps/rejected": -332.76251220703125, "loss": 0.1217, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7197510004043579, "rewards/margins": 3.7822265625, "rewards/rejected": -4.50390625, "step": 3820 }, { "epoch": 1.4424362232890897, "grad_norm": 12.304333733940389, "learning_rate": 6.394538606403013e-07, "logits/chosen": -1.683007836341858, "logits/rejected": -1.540624976158142, "logps/chosen": -333.54998779296875, "logps/rejected": -332.45001220703125, "loss": 0.1326, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.9931885004043579, "rewards/margins": 3.611328125, "rewards/rejected": -4.602734565734863, "step": 3830 }, { "epoch": 1.44620163795538, "grad_norm": 13.186603222151332, "learning_rate": 6.385122410546139e-07, "logits/chosen": -1.79931640625, "logits/rejected": -1.6970703601837158, "logps/chosen": -337.29998779296875, "logps/rejected": -352.11248779296875, "loss": 0.1585, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.613800048828125, "rewards/margins": 3.533398389816284, "rewards/rejected": -4.146093845367432, "step": 3840 }, { "epoch": 1.44996705262167, "grad_norm": 22.49714260935584, "learning_rate": 6.375706214689265e-07, "logits/chosen": -1.798730492591858, "logits/rejected": -1.74560546875, "logps/chosen": -316.7875061035156, "logps/rejected": -320.3999938964844, "loss": 0.1277, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6992126703262329, "rewards/margins": 3.4359374046325684, "rewards/rejected": -4.134765625, "step": 3850 }, { "epoch": 1.4537324672879601, "grad_norm": 28.670350677260103, "learning_rate": 6.366290018832391e-07, "logits/chosen": -1.865136742591858, "logits/rejected": -1.6085937023162842, "logps/chosen": -354.38751220703125, "logps/rejected": -384.75, "loss": 0.1316, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8417907953262329, "rewards/margins": 3.830078125, "rewards/rejected": -4.671875, "step": 3860 }, { "epoch": 1.4574978819542501, "grad_norm": 23.6341024887183, "learning_rate": 6.356873822975519e-07, "logits/chosen": -1.7683594226837158, "logits/rejected": -1.710839867591858, "logps/chosen": -405.1000061035156, "logps/rejected": -399.54998779296875, "loss": 0.1375, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.198095679283142, "rewards/margins": 3.615039110183716, "rewards/rejected": -4.814453125, "step": 3870 }, { "epoch": 1.4612632966205403, "grad_norm": 14.774547210751305, "learning_rate": 6.347457627118644e-07, "logits/chosen": -1.867773413658142, "logits/rejected": -1.7677733898162842, "logps/chosen": -327.375, "logps/rejected": -322.3999938964844, "loss": 0.1531, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.496240258216858, "rewards/margins": 3.45703125, "rewards/rejected": -4.951952934265137, "step": 3880 }, { "epoch": 1.4650287112868305, "grad_norm": 38.24582841763419, "learning_rate": 6.338041431261771e-07, "logits/chosen": -1.8019530773162842, "logits/rejected": -1.667871117591858, "logps/chosen": -352.5, "logps/rejected": -348.1499938964844, "loss": 0.1179, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.2002044916152954, "rewards/margins": 3.7318358421325684, "rewards/rejected": -4.9296875, "step": 3890 }, { "epoch": 1.4687941259531205, "grad_norm": 32.23170351918425, "learning_rate": 6.328625235404896e-07, "logits/chosen": -1.75, "logits/rejected": -1.6057617664337158, "logps/chosen": -369.4375, "logps/rejected": -400.20001220703125, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.204614281654358, "rewards/margins": 3.591601610183716, "rewards/rejected": -4.795702934265137, "step": 3900 }, { "epoch": 1.4725595406194107, "grad_norm": 16.0625994043934, "learning_rate": 6.319209039548023e-07, "logits/chosen": -1.7365233898162842, "logits/rejected": -1.748046875, "logps/chosen": -389.7250061035156, "logps/rejected": -354.57501220703125, "loss": 0.1516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8347717523574829, "rewards/margins": 3.740527391433716, "rewards/rejected": -4.574999809265137, "step": 3910 }, { "epoch": 1.4763249552857007, "grad_norm": 19.93272083191731, "learning_rate": 6.309792843691148e-07, "logits/chosen": -1.939843773841858, "logits/rejected": -1.695410132408142, "logps/chosen": -334.79998779296875, "logps/rejected": -365.45001220703125, "loss": 0.1597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1515624523162842, "rewards/margins": 3.4263672828674316, "rewards/rejected": -4.579297065734863, "step": 3920 }, { "epoch": 1.480090369951991, "grad_norm": 8.51248619535511, "learning_rate": 6.300376647834275e-07, "logits/chosen": -1.8997070789337158, "logits/rejected": -1.818457007408142, "logps/chosen": -329.51251220703125, "logps/rejected": -323.75, "loss": 0.124, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.734234631061554, "rewards/margins": 3.6631836891174316, "rewards/rejected": -4.397265434265137, "step": 3930 }, { "epoch": 1.4838557846182812, "grad_norm": 16.50372190890688, "learning_rate": 6.2909604519774e-07, "logits/chosen": -1.9533202648162842, "logits/rejected": -1.76416015625, "logps/chosen": -319.48126220703125, "logps/rejected": -325.29998779296875, "loss": 0.1468, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8883301019668579, "rewards/margins": 3.6923828125, "rewards/rejected": -4.580468654632568, "step": 3940 }, { "epoch": 1.4876211992845711, "grad_norm": 20.90756873087298, "learning_rate": 6.281544256120527e-07, "logits/chosen": -1.71875, "logits/rejected": -1.840917944908142, "logps/chosen": -406.98748779296875, "logps/rejected": -332.07501220703125, "loss": 0.1634, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.998321533203125, "rewards/margins": 3.42578125, "rewards/rejected": -4.423437595367432, "step": 3950 }, { "epoch": 1.4913866139508614, "grad_norm": 44.03654257179281, "learning_rate": 6.272128060263653e-07, "logits/chosen": -1.863867163658142, "logits/rejected": -1.793554663658142, "logps/chosen": -324.61248779296875, "logps/rejected": -321.79998779296875, "loss": 0.152, "rewards/accuracies": 0.9375, "rewards/chosen": -0.702954113483429, "rewards/margins": 3.55859375, "rewards/rejected": -4.261914253234863, "step": 3960 }, { "epoch": 1.4951520286171514, "grad_norm": 40.270847021751365, "learning_rate": 6.262711864406779e-07, "logits/chosen": -1.81005859375, "logits/rejected": -1.7771484851837158, "logps/chosen": -371.2749938964844, "logps/rejected": -352.67498779296875, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.596997082233429, "rewards/margins": 3.7210936546325684, "rewards/rejected": -4.317187309265137, "step": 3970 }, { "epoch": 1.4989174432834416, "grad_norm": 32.04165371153839, "learning_rate": 6.253295668549906e-07, "logits/chosen": -1.88623046875, "logits/rejected": -1.787695288658142, "logps/chosen": -345.57501220703125, "logps/rejected": -359.2749938964844, "loss": 0.126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6420654058456421, "rewards/margins": 3.533398389816284, "rewards/rejected": -4.173047065734863, "step": 3980 }, { "epoch": 1.5026828579497318, "grad_norm": 8.734664107118178, "learning_rate": 6.243879472693032e-07, "logits/chosen": -1.7311522960662842, "logits/rejected": -1.7197265625, "logps/chosen": -365.3500061035156, "logps/rejected": -358.3999938964844, "loss": 0.1591, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6334747076034546, "rewards/margins": 3.5814452171325684, "rewards/rejected": -4.215624809265137, "step": 3990 }, { "epoch": 1.506448272616022, "grad_norm": 17.31996856342497, "learning_rate": 6.234463276836158e-07, "logits/chosen": -1.771875023841858, "logits/rejected": -1.77197265625, "logps/chosen": -358.3374938964844, "logps/rejected": -376.82501220703125, "loss": 0.1269, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6421753168106079, "rewards/margins": 3.5736327171325684, "rewards/rejected": -4.216210842132568, "step": 4000 }, { "epoch": 1.510213687282312, "grad_norm": 29.939868938085805, "learning_rate": 6.225047080979284e-07, "logits/chosen": -1.9246094226837158, "logits/rejected": -1.8215820789337158, "logps/chosen": -339.6000061035156, "logps/rejected": -346.04998779296875, "loss": 0.1624, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.015234351158142, "rewards/margins": 3.4720702171325684, "rewards/rejected": -4.485156059265137, "step": 4010 }, { "epoch": 1.513979101948602, "grad_norm": 17.326891321996648, "learning_rate": 6.21563088512241e-07, "logits/chosen": -1.9494140148162842, "logits/rejected": -1.8466796875, "logps/chosen": -336.9750061035156, "logps/rejected": -355.54998779296875, "loss": 0.1287, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.126379370689392, "rewards/margins": 3.8460936546325684, "rewards/rejected": -4.969922065734863, "step": 4020 }, { "epoch": 1.5177445166148922, "grad_norm": 13.989920110946413, "learning_rate": 6.206214689265537e-07, "logits/chosen": -1.9689452648162842, "logits/rejected": -1.8361327648162842, "logps/chosen": -348.625, "logps/rejected": -379.17498779296875, "loss": 0.1811, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.722509741783142, "rewards/margins": 3.458691358566284, "rewards/rejected": -5.183203220367432, "step": 4030 }, { "epoch": 1.5215099312811824, "grad_norm": 56.570093290655, "learning_rate": 6.196798493408662e-07, "logits/chosen": -1.77197265625, "logits/rejected": -1.783789038658142, "logps/chosen": -355.79998779296875, "logps/rejected": -345.17498779296875, "loss": 0.1639, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.0039551258087158, "rewards/margins": 3.504101514816284, "rewards/rejected": -4.507031440734863, "step": 4040 }, { "epoch": 1.5252753459474726, "grad_norm": 21.524513461988647, "learning_rate": 6.187382297551789e-07, "logits/chosen": -1.787500023841858, "logits/rejected": -1.7239258289337158, "logps/chosen": -345.5, "logps/rejected": -373.6499938964844, "loss": 0.1237, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.947265625, "rewards/margins": 3.5855469703674316, "rewards/rejected": -4.534765720367432, "step": 4050 }, { "epoch": 1.5290407606137626, "grad_norm": 23.159350397040843, "learning_rate": 6.177966101694914e-07, "logits/chosen": -1.8494141101837158, "logits/rejected": -1.668359398841858, "logps/chosen": -305.7250061035156, "logps/rejected": -350.75, "loss": 0.1311, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.010717749595642, "rewards/margins": 3.465625047683716, "rewards/rejected": -4.475390434265137, "step": 4060 }, { "epoch": 1.5328061752800526, "grad_norm": 22.493354011580884, "learning_rate": 6.168549905838041e-07, "logits/chosen": -1.830468773841858, "logits/rejected": -1.7595703601837158, "logps/chosen": -318.0375061035156, "logps/rejected": -339.2250061035156, "loss": 0.1706, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.758197009563446, "rewards/margins": 3.408886671066284, "rewards/rejected": -4.165234565734863, "step": 4070 }, { "epoch": 1.5365715899463428, "grad_norm": 15.244329076848462, "learning_rate": 6.159133709981166e-07, "logits/chosen": -1.7136719226837158, "logits/rejected": -1.6869628429412842, "logps/chosen": -343.75, "logps/rejected": -348.7250061035156, "loss": 0.1909, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.939135730266571, "rewards/margins": 3.197949171066284, "rewards/rejected": -4.135546684265137, "step": 4080 }, { "epoch": 1.540337004612633, "grad_norm": 10.720802305816708, "learning_rate": 6.149717514124293e-07, "logits/chosen": -1.845800757408142, "logits/rejected": -1.68017578125, "logps/chosen": -333.6625061035156, "logps/rejected": -334.9750061035156, "loss": 0.1379, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.787579357624054, "rewards/margins": 3.298632860183716, "rewards/rejected": -4.083788871765137, "step": 4090 }, { "epoch": 1.5441024192789232, "grad_norm": 32.05333063326745, "learning_rate": 6.140301318267421e-07, "logits/chosen": -1.786718726158142, "logits/rejected": -1.6183593273162842, "logps/chosen": -329.9375, "logps/rejected": -327.2124938964844, "loss": 0.1761, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.0382568836212158, "rewards/margins": 3.283203125, "rewards/rejected": -4.323046684265137, "step": 4100 }, { "epoch": 1.5478678339452132, "grad_norm": 21.710716061834475, "learning_rate": 6.130885122410546e-07, "logits/chosen": -1.8458983898162842, "logits/rejected": -1.784082055091858, "logps/chosen": -313.125, "logps/rejected": -351.875, "loss": 0.1479, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.2201263904571533, "rewards/margins": 3.4593749046325684, "rewards/rejected": -4.6796875, "step": 4110 }, { "epoch": 1.5516332486115032, "grad_norm": 11.243705626336432, "learning_rate": 6.121468926553673e-07, "logits/chosen": -1.8840820789337158, "logits/rejected": -1.628759741783142, "logps/chosen": -320.75, "logps/rejected": -376.04998779296875, "loss": 0.1304, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9298156499862671, "rewards/margins": 3.647656202316284, "rewards/rejected": -4.578906059265137, "step": 4120 }, { "epoch": 1.5553986632777934, "grad_norm": 27.361236428747084, "learning_rate": 6.112052730696798e-07, "logits/chosen": -1.816796898841858, "logits/rejected": -1.7470703125, "logps/chosen": -338.7749938964844, "logps/rejected": -335.92498779296875, "loss": 0.1503, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.6135498285293579, "rewards/margins": 3.581249952316284, "rewards/rejected": -4.196679592132568, "step": 4130 }, { "epoch": 1.5591640779440836, "grad_norm": 15.312204267720912, "learning_rate": 6.102636534839925e-07, "logits/chosen": -1.8147461414337158, "logits/rejected": -1.7527344226837158, "logps/chosen": -359.04998779296875, "logps/rejected": -339.4375, "loss": 0.1123, "rewards/accuracies": 0.96875, "rewards/chosen": -0.699719250202179, "rewards/margins": 3.727246046066284, "rewards/rejected": -4.426171779632568, "step": 4140 }, { "epoch": 1.5629294926103738, "grad_norm": 44.316587991092206, "learning_rate": 6.09322033898305e-07, "logits/chosen": -1.811621069908142, "logits/rejected": -1.7463867664337158, "logps/chosen": -318.8500061035156, "logps/rejected": -340.625, "loss": 0.1483, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5370391607284546, "rewards/margins": 3.7183594703674316, "rewards/rejected": -4.256249904632568, "step": 4150 }, { "epoch": 1.5666949072766638, "grad_norm": 12.429455246453346, "learning_rate": 6.083804143126177e-07, "logits/chosen": -1.927148461341858, "logits/rejected": -1.834570288658142, "logps/chosen": -322.79998779296875, "logps/rejected": -324.45001220703125, "loss": 0.1242, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5932251214981079, "rewards/margins": 3.5150389671325684, "rewards/rejected": -4.106640815734863, "step": 4160 }, { "epoch": 1.5704603219429538, "grad_norm": 22.150663969522878, "learning_rate": 6.074387947269303e-07, "logits/chosen": -2.019335985183716, "logits/rejected": -1.865869164466858, "logps/chosen": -303.45001220703125, "logps/rejected": -358.57501220703125, "loss": 0.1205, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7226898074150085, "rewards/margins": 3.642578125, "rewards/rejected": -4.366015434265137, "step": 4170 }, { "epoch": 1.574225736609244, "grad_norm": 38.77502829737456, "learning_rate": 6.064971751412429e-07, "logits/chosen": -1.73388671875, "logits/rejected": -1.730078101158142, "logps/chosen": -347.9624938964844, "logps/rejected": -335.7749938964844, "loss": 0.1412, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7664276361465454, "rewards/margins": 3.644726514816284, "rewards/rejected": -4.409765720367432, "step": 4180 }, { "epoch": 1.5779911512755342, "grad_norm": 10.723847542408851, "learning_rate": 6.055555555555555e-07, "logits/chosen": -1.935156226158142, "logits/rejected": -1.807031273841858, "logps/chosen": -341.38751220703125, "logps/rejected": -415.67498779296875, "loss": 0.1618, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.9057983160018921, "rewards/margins": 3.8760743141174316, "rewards/rejected": -4.780859470367432, "step": 4190 }, { "epoch": 1.5817565659418245, "grad_norm": 18.282298414458722, "learning_rate": 6.046139359698681e-07, "logits/chosen": -1.823144555091858, "logits/rejected": -1.772070288658142, "logps/chosen": -326.2250061035156, "logps/rejected": -324.45001220703125, "loss": 0.1418, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6585327386856079, "rewards/margins": 3.6478514671325684, "rewards/rejected": -4.3046875, "step": 4200 }, { "epoch": 1.5855219806081144, "grad_norm": 27.472379607640814, "learning_rate": 6.036723163841808e-07, "logits/chosen": -1.880468726158142, "logits/rejected": -1.806640625, "logps/chosen": -342.82501220703125, "logps/rejected": -310.1499938964844, "loss": 0.1516, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.605804443359375, "rewards/margins": 3.669921875, "rewards/rejected": -4.273828029632568, "step": 4210 }, { "epoch": 1.5892873952744044, "grad_norm": 23.884946376544654, "learning_rate": 6.027306967984934e-07, "logits/chosen": -1.8349609375, "logits/rejected": -1.874609351158142, "logps/chosen": -310.7124938964844, "logps/rejected": -337.75, "loss": 0.1434, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6624511480331421, "rewards/margins": 3.697070360183716, "rewards/rejected": -4.358202934265137, "step": 4220 }, { "epoch": 1.5930528099406946, "grad_norm": 24.36814585528149, "learning_rate": 6.01789077212806e-07, "logits/chosen": -1.8195312023162842, "logits/rejected": -1.8134765625, "logps/chosen": -353.57501220703125, "logps/rejected": -348.7250061035156, "loss": 0.1356, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7187164425849915, "rewards/margins": 3.730273485183716, "rewards/rejected": -4.445703029632568, "step": 4230 }, { "epoch": 1.5968182246069849, "grad_norm": 39.58216108256215, "learning_rate": 6.008474576271187e-07, "logits/chosen": -1.79833984375, "logits/rejected": -1.668359398841858, "logps/chosen": -335.6000061035156, "logps/rejected": -350.875, "loss": 0.1484, "rewards/accuracies": 0.9375, "rewards/chosen": -0.623120129108429, "rewards/margins": 3.2847657203674316, "rewards/rejected": -3.9095702171325684, "step": 4240 }, { "epoch": 1.600583639273275, "grad_norm": 15.893775386410876, "learning_rate": 5.999058380414312e-07, "logits/chosen": -1.865625023841858, "logits/rejected": -1.8507812023162842, "logps/chosen": -352.6875, "logps/rejected": -331.32501220703125, "loss": 0.1448, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7280639410018921, "rewards/margins": 3.4873046875, "rewards/rejected": -4.213671684265137, "step": 4250 }, { "epoch": 1.604349053939565, "grad_norm": 32.29641687137846, "learning_rate": 5.989642184557439e-07, "logits/chosen": -1.8693358898162842, "logits/rejected": -1.821874976158142, "logps/chosen": -321.5874938964844, "logps/rejected": -332.04998779296875, "loss": 0.1247, "rewards/accuracies": 0.96875, "rewards/chosen": -0.696368396282196, "rewards/margins": 3.667187452316284, "rewards/rejected": -4.364843845367432, "step": 4260 }, { "epoch": 1.6081144686058553, "grad_norm": 20.255741402439785, "learning_rate": 5.980225988700564e-07, "logits/chosen": -1.8430664539337158, "logits/rejected": -1.745703101158142, "logps/chosen": -359.25, "logps/rejected": -360.0249938964844, "loss": 0.1365, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.854663074016571, "rewards/margins": 3.8570313453674316, "rewards/rejected": -4.711328029632568, "step": 4270 }, { "epoch": 1.6118798832721453, "grad_norm": 22.64897785819965, "learning_rate": 5.970809792843691e-07, "logits/chosen": -1.774804711341858, "logits/rejected": -1.6848633289337158, "logps/chosen": -352.92498779296875, "logps/rejected": -370.75, "loss": 0.122, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.940704345703125, "rewards/margins": 3.7669920921325684, "rewards/rejected": -4.705078125, "step": 4280 }, { "epoch": 1.6156452979384355, "grad_norm": 26.7173232490466, "learning_rate": 5.961393596986816e-07, "logits/chosen": -1.7667968273162842, "logits/rejected": -1.7033202648162842, "logps/chosen": -369.3999938964844, "logps/rejected": -366.5625, "loss": 0.0992, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6143432855606079, "rewards/margins": 4.03515625, "rewards/rejected": -4.650781154632568, "step": 4290 }, { "epoch": 1.6194107126047257, "grad_norm": 38.841860286328526, "learning_rate": 5.951977401129943e-07, "logits/chosen": -1.921875, "logits/rejected": -1.8349609375, "logps/chosen": -315.875, "logps/rejected": -372.04998779296875, "loss": 0.1634, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.131445288658142, "rewards/margins": 3.678906202316284, "rewards/rejected": -4.808984279632568, "step": 4300 }, { "epoch": 1.6231761272710157, "grad_norm": 15.80376073594496, "learning_rate": 5.942561205273068e-07, "logits/chosen": -2.0399413108825684, "logits/rejected": -1.79638671875, "logps/chosen": -347.04998779296875, "logps/rejected": -367.4750061035156, "loss": 0.1194, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7464355230331421, "rewards/margins": 3.890331983566284, "rewards/rejected": -4.636914253234863, "step": 4310 }, { "epoch": 1.626941541937306, "grad_norm": 27.453869159206548, "learning_rate": 5.933145009416196e-07, "logits/chosen": -1.948632836341858, "logits/rejected": -1.9578125476837158, "logps/chosen": -335.79998779296875, "logps/rejected": -323.92498779296875, "loss": 0.1153, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5896240472793579, "rewards/margins": 3.694140672683716, "rewards/rejected": -4.28515625, "step": 4320 }, { "epoch": 1.6307069566035959, "grad_norm": 22.32284617433, "learning_rate": 5.923728813559323e-07, "logits/chosen": -1.8878905773162842, "logits/rejected": -1.8689453601837158, "logps/chosen": -347.1499938964844, "logps/rejected": -368.1499938964844, "loss": 0.1284, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5417388677597046, "rewards/margins": 3.85546875, "rewards/rejected": -4.396484375, "step": 4330 }, { "epoch": 1.634472371269886, "grad_norm": 30.322497920494996, "learning_rate": 5.914312617702448e-07, "logits/chosen": -1.926660180091858, "logits/rejected": -1.642675757408142, "logps/chosen": -298.75, "logps/rejected": -365.0, "loss": 0.1552, "rewards/accuracies": 0.9375, "rewards/chosen": -0.671313464641571, "rewards/margins": 3.3534178733825684, "rewards/rejected": -4.0224609375, "step": 4340 }, { "epoch": 1.6382377859361763, "grad_norm": 18.94805529358045, "learning_rate": 5.904896421845575e-07, "logits/chosen": -1.9542968273162842, "logits/rejected": -2.0166015625, "logps/chosen": -355.7749938964844, "logps/rejected": -319.5249938964844, "loss": 0.1314, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5640594363212585, "rewards/margins": 3.3218750953674316, "rewards/rejected": -3.885546922683716, "step": 4350 }, { "epoch": 1.6420032006024663, "grad_norm": 17.418135617763696, "learning_rate": 5.8954802259887e-07, "logits/chosen": -1.914160132408142, "logits/rejected": -1.862695336341858, "logps/chosen": -325.7124938964844, "logps/rejected": -332.57501220703125, "loss": 0.1435, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8386383056640625, "rewards/margins": 3.483593702316284, "rewards/rejected": -4.3203125, "step": 4360 }, { "epoch": 1.6457686152687565, "grad_norm": 8.57594152594666, "learning_rate": 5.886064030131827e-07, "logits/chosen": -1.901757836341858, "logits/rejected": -1.8798828125, "logps/chosen": -377.0, "logps/rejected": -382.6000061035156, "loss": 0.1104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.953125, "rewards/margins": 3.662890672683716, "rewards/rejected": -4.615624904632568, "step": 4370 }, { "epoch": 1.6495340299350465, "grad_norm": 10.100604006840312, "learning_rate": 5.876647834274952e-07, "logits/chosen": -1.9968750476837158, "logits/rejected": -1.8123047351837158, "logps/chosen": -312.25, "logps/rejected": -338.6499938964844, "loss": 0.1533, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.047570824623108, "rewards/margins": 3.472460985183716, "rewards/rejected": -4.519335746765137, "step": 4380 }, { "epoch": 1.6532994446013367, "grad_norm": 32.75652106410717, "learning_rate": 5.867231638418079e-07, "logits/chosen": -1.785058617591858, "logits/rejected": -1.753027319908142, "logps/chosen": -360.5249938964844, "logps/rejected": -350.0249938964844, "loss": 0.1154, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7515197992324829, "rewards/margins": 3.8646483421325684, "rewards/rejected": -4.618945121765137, "step": 4390 }, { "epoch": 1.657064859267627, "grad_norm": 29.00613707779619, "learning_rate": 5.857815442561205e-07, "logits/chosen": -1.8416016101837158, "logits/rejected": -1.7038085460662842, "logps/chosen": -358.42498779296875, "logps/rejected": -358.45001220703125, "loss": 0.1386, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.976061999797821, "rewards/margins": 3.7171874046325684, "rewards/rejected": -4.693749904632568, "step": 4400 }, { "epoch": 1.6608302739339171, "grad_norm": 22.438171210601983, "learning_rate": 5.848399246704331e-07, "logits/chosen": -1.9972655773162842, "logits/rejected": -1.873632788658142, "logps/chosen": -347.57501220703125, "logps/rejected": -372.5, "loss": 0.0909, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.125695824623108, "rewards/margins": 4.091796875, "rewards/rejected": -5.216796875, "step": 4410 }, { "epoch": 1.6645956886002071, "grad_norm": 22.24345783974691, "learning_rate": 5.838983050847457e-07, "logits/chosen": -1.8849608898162842, "logits/rejected": -1.891210913658142, "logps/chosen": -363.6499938964844, "logps/rejected": -372.32501220703125, "loss": 0.1697, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.465246558189392, "rewards/margins": 3.609668016433716, "rewards/rejected": -5.076171875, "step": 4420 }, { "epoch": 1.6683611032664971, "grad_norm": 26.20262932679582, "learning_rate": 5.829566854990584e-07, "logits/chosen": -1.79638671875, "logits/rejected": -1.833105444908142, "logps/chosen": -402.70001220703125, "logps/rejected": -385.67498779296875, "loss": 0.1321, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.101782202720642, "rewards/margins": 3.72265625, "rewards/rejected": -4.826562404632568, "step": 4430 }, { "epoch": 1.6721265179327873, "grad_norm": 43.88489710548497, "learning_rate": 5.82015065913371e-07, "logits/chosen": -1.76953125, "logits/rejected": -1.770117163658142, "logps/chosen": -348.20001220703125, "logps/rejected": -320.7749938964844, "loss": 0.1042, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.178503394126892, "rewards/margins": 3.9066405296325684, "rewards/rejected": -5.0859375, "step": 4440 }, { "epoch": 1.6758919325990775, "grad_norm": 11.138723150803186, "learning_rate": 5.810734463276837e-07, "logits/chosen": -1.8525390625, "logits/rejected": -1.615136742591858, "logps/chosen": -382.25, "logps/rejected": -394.82501220703125, "loss": 0.0992, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9792236089706421, "rewards/margins": 4.121289253234863, "rewards/rejected": -5.100781440734863, "step": 4450 }, { "epoch": 1.6796573472653678, "grad_norm": 20.616251278432784, "learning_rate": 5.801318267419962e-07, "logits/chosen": -1.9109375476837158, "logits/rejected": -1.678125023841858, "logps/chosen": -327.61248779296875, "logps/rejected": -354.82501220703125, "loss": 0.1265, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0007202625274658, "rewards/margins": 3.867480516433716, "rewards/rejected": -4.866406440734863, "step": 4460 }, { "epoch": 1.6834227619316577, "grad_norm": 25.130165947789155, "learning_rate": 5.791902071563089e-07, "logits/chosen": -2.0111327171325684, "logits/rejected": -1.7742187976837158, "logps/chosen": -372.1625061035156, "logps/rejected": -370.9750061035156, "loss": 0.1542, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.116918921470642, "rewards/margins": 3.843945264816284, "rewards/rejected": -4.960546970367432, "step": 4470 }, { "epoch": 1.6871881765979477, "grad_norm": 40.73903582417902, "learning_rate": 5.782485875706214e-07, "logits/chosen": -1.8943359851837158, "logits/rejected": -1.9345703125, "logps/chosen": -320.5625, "logps/rejected": -320.70001220703125, "loss": 0.1539, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.042456030845642, "rewards/margins": 3.546093702316284, "rewards/rejected": -4.591406345367432, "step": 4480 }, { "epoch": 1.690953591264238, "grad_norm": 45.61916986381256, "learning_rate": 5.773069679849341e-07, "logits/chosen": -1.9853515625, "logits/rejected": -1.9231445789337158, "logps/chosen": -327.95001220703125, "logps/rejected": -333.54998779296875, "loss": 0.1312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.69024658203125, "rewards/margins": 3.6294922828674316, "rewards/rejected": -4.319921970367432, "step": 4490 }, { "epoch": 1.6947190059305282, "grad_norm": 36.95286421717757, "learning_rate": 5.763653483992466e-07, "logits/chosen": -2.0357422828674316, "logits/rejected": -1.8909180164337158, "logps/chosen": -346.7250061035156, "logps/rejected": -362.1499938964844, "loss": 0.1477, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0513427257537842, "rewards/margins": 3.8935546875, "rewards/rejected": -4.945703029632568, "step": 4500 }, { "epoch": 1.6984844205968184, "grad_norm": 32.29752469260365, "learning_rate": 5.754237288135593e-07, "logits/chosen": -1.8860352039337158, "logits/rejected": -1.9816405773162842, "logps/chosen": -369.7875061035156, "logps/rejected": -358.67498779296875, "loss": 0.1238, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8447021245956421, "rewards/margins": 3.5785155296325684, "rewards/rejected": -4.423828125, "step": 4510 }, { "epoch": 1.7022498352631084, "grad_norm": 12.529349010378095, "learning_rate": 5.744821092278718e-07, "logits/chosen": -1.985742211341858, "logits/rejected": -1.893945336341858, "logps/chosen": -337.86248779296875, "logps/rejected": -334.3999938964844, "loss": 0.1572, "rewards/accuracies": 0.9375, "rewards/chosen": -1.131689429283142, "rewards/margins": 3.409374952316284, "rewards/rejected": -4.54296875, "step": 4520 }, { "epoch": 1.7060152499293983, "grad_norm": 22.262506735880628, "learning_rate": 5.735404896421845e-07, "logits/chosen": -2.044140577316284, "logits/rejected": -1.9089844226837158, "logps/chosen": -325.5, "logps/rejected": -372.1000061035156, "loss": 0.1317, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.987591564655304, "rewards/margins": 3.689648389816284, "rewards/rejected": -4.6748046875, "step": 4530 }, { "epoch": 1.7097806645956886, "grad_norm": 10.401412901230211, "learning_rate": 5.725988700564971e-07, "logits/chosen": -1.8224608898162842, "logits/rejected": -1.877539038658142, "logps/chosen": -352.88751220703125, "logps/rejected": -354.20001220703125, "loss": 0.1335, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8591064214706421, "rewards/margins": 3.585742235183716, "rewards/rejected": -4.444531440734863, "step": 4540 }, { "epoch": 1.7135460792619788, "grad_norm": 21.310797699535083, "learning_rate": 5.716572504708098e-07, "logits/chosen": -1.9045898914337158, "logits/rejected": -1.8278319835662842, "logps/chosen": -371.29998779296875, "logps/rejected": -333.36248779296875, "loss": 0.1802, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0136291980743408, "rewards/margins": 3.41015625, "rewards/rejected": -4.422265529632568, "step": 4550 }, { "epoch": 1.717311493928269, "grad_norm": 18.24923128504647, "learning_rate": 5.707156308851224e-07, "logits/chosen": -1.8079102039337158, "logits/rejected": -1.7559082508087158, "logps/chosen": -404.8125, "logps/rejected": -341.82501220703125, "loss": 0.1671, "rewards/accuracies": 0.9375, "rewards/chosen": -1.018286108970642, "rewards/margins": 3.4193358421325684, "rewards/rejected": -4.436718940734863, "step": 4560 }, { "epoch": 1.721076908594559, "grad_norm": 40.216281806546306, "learning_rate": 5.69774011299435e-07, "logits/chosen": -1.8678710460662842, "logits/rejected": -1.69873046875, "logps/chosen": -350.1875, "logps/rejected": -379.79998779296875, "loss": 0.1485, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.4173705577850342, "rewards/margins": 3.526660203933716, "rewards/rejected": -4.9453125, "step": 4570 }, { "epoch": 1.724842323260849, "grad_norm": 8.308722758917813, "learning_rate": 5.688323917137477e-07, "logits/chosen": -1.8359375, "logits/rejected": -1.789453148841858, "logps/chosen": -324.0, "logps/rejected": -347.98748779296875, "loss": 0.1587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.707482933998108, "rewards/margins": 3.5478515625, "rewards/rejected": -5.252734184265137, "step": 4580 }, { "epoch": 1.7286077379271392, "grad_norm": 19.05183889443206, "learning_rate": 5.678907721280602e-07, "logits/chosen": -1.8369140625, "logits/rejected": -1.8122069835662842, "logps/chosen": -356.92498779296875, "logps/rejected": -342.17498779296875, "loss": 0.1347, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.2540040016174316, "rewards/margins": 3.6004881858825684, "rewards/rejected": -5.857031345367432, "step": 4590 }, { "epoch": 1.7323731525934294, "grad_norm": 30.162928786214596, "learning_rate": 5.669491525423729e-07, "logits/chosen": -1.76953125, "logits/rejected": -1.647070288658142, "logps/chosen": -344.5249938964844, "logps/rejected": -336.57501220703125, "loss": 0.1659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2269043922424316, "rewards/margins": 3.5210938453674316, "rewards/rejected": -5.750781059265137, "step": 4600 }, { "epoch": 1.7361385672597196, "grad_norm": 29.125320968169238, "learning_rate": 5.660075329566855e-07, "logits/chosen": -1.98193359375, "logits/rejected": -1.87744140625, "logps/chosen": -326.8374938964844, "logps/rejected": -343.2749938964844, "loss": 0.1303, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7661621570587158, "rewards/margins": 3.676953077316284, "rewards/rejected": -5.444531440734863, "step": 4610 }, { "epoch": 1.7399039819260096, "grad_norm": 29.670514340126875, "learning_rate": 5.650659133709981e-07, "logits/chosen": -1.785253882408142, "logits/rejected": -1.7838866710662842, "logps/chosen": -380.2250061035156, "logps/rejected": -371.3500061035156, "loss": 0.1686, "rewards/accuracies": 0.9375, "rewards/chosen": -1.758154273033142, "rewards/margins": 3.7740235328674316, "rewards/rejected": -5.532422065734863, "step": 4620 }, { "epoch": 1.7436693965922996, "grad_norm": 24.63376514965193, "learning_rate": 5.641242937853107e-07, "logits/chosen": -1.8196289539337158, "logits/rejected": -1.8615233898162842, "logps/chosen": -373.11248779296875, "logps/rejected": -340.7250061035156, "loss": 0.167, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.618048071861267, "rewards/margins": 3.7198243141174316, "rewards/rejected": -5.337500095367432, "step": 4630 }, { "epoch": 1.7474348112585898, "grad_norm": 41.33498747194505, "learning_rate": 5.631826741996233e-07, "logits/chosen": -1.837890625, "logits/rejected": -1.884765625, "logps/chosen": -347.7749938964844, "logps/rejected": -356.70001220703125, "loss": 0.1969, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.686303734779358, "rewards/margins": 3.256542921066284, "rewards/rejected": -4.947265625, "step": 4640 }, { "epoch": 1.75120022592488, "grad_norm": 48.742989624565155, "learning_rate": 5.622410546139359e-07, "logits/chosen": -1.855371117591858, "logits/rejected": -1.784570336341858, "logps/chosen": -346.26251220703125, "logps/rejected": -331.7250061035156, "loss": 0.1858, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.978857398033142, "rewards/margins": 3.277539014816284, "rewards/rejected": -5.256249904632568, "step": 4650 }, { "epoch": 1.7549656405911702, "grad_norm": 22.677668187837213, "learning_rate": 5.612994350282486e-07, "logits/chosen": -1.8489258289337158, "logits/rejected": -1.714453101158142, "logps/chosen": -362.32501220703125, "logps/rejected": -341.25, "loss": 0.1536, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.6924316883087158, "rewards/margins": 3.1728515625, "rewards/rejected": -4.863671779632568, "step": 4660 }, { "epoch": 1.7587310552574602, "grad_norm": 18.156112899303167, "learning_rate": 5.603578154425612e-07, "logits/chosen": -1.9970703125, "logits/rejected": -1.7482421398162842, "logps/chosen": -315.9375, "logps/rejected": -343.04998779296875, "loss": 0.1335, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.319091796875, "rewards/margins": 3.508007764816284, "rewards/rejected": -4.826562404632568, "step": 4670 }, { "epoch": 1.7624964699237502, "grad_norm": 20.743616967003227, "learning_rate": 5.594161958568739e-07, "logits/chosen": -1.783789038658142, "logits/rejected": -1.7600586414337158, "logps/chosen": -387.75, "logps/rejected": -343.25, "loss": 0.1567, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.3787841796875, "rewards/margins": 3.504101514816284, "rewards/rejected": -4.884765625, "step": 4680 }, { "epoch": 1.7662618845900404, "grad_norm": 12.327054205900724, "learning_rate": 5.584745762711864e-07, "logits/chosen": -1.9059569835662842, "logits/rejected": -1.871191382408142, "logps/chosen": -345.92498779296875, "logps/rejected": -376.7749938964844, "loss": 0.1409, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3887450695037842, "rewards/margins": 3.617968797683716, "rewards/rejected": -5.006640434265137, "step": 4690 }, { "epoch": 1.7700272992563306, "grad_norm": 26.085122621843272, "learning_rate": 5.575329566854991e-07, "logits/chosen": -1.9172852039337158, "logits/rejected": -1.930078148841858, "logps/chosen": -359.625, "logps/rejected": -345.2749938964844, "loss": 0.0867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4093139171600342, "rewards/margins": 3.9925780296325684, "rewards/rejected": -5.399218559265137, "step": 4700 }, { "epoch": 1.7737927139226208, "grad_norm": 19.412255596101915, "learning_rate": 5.565913370998116e-07, "logits/chosen": -1.9873046875, "logits/rejected": -1.884667992591858, "logps/chosen": -346.625, "logps/rejected": -342.20001220703125, "loss": 0.1207, "rewards/accuracies": 0.96875, "rewards/chosen": -1.457189917564392, "rewards/margins": 3.8824219703674316, "rewards/rejected": -5.338476657867432, "step": 4710 }, { "epoch": 1.7775581285889108, "grad_norm": 32.04808103195865, "learning_rate": 5.556497175141243e-07, "logits/chosen": -1.801416039466858, "logits/rejected": -1.892480492591858, "logps/chosen": -380.7875061035156, "logps/rejected": -351.63751220703125, "loss": 0.1688, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.256079077720642, "rewards/margins": 3.3349609375, "rewards/rejected": -4.590624809265137, "step": 4720 }, { "epoch": 1.7813235432552008, "grad_norm": 18.341452029837797, "learning_rate": 5.547080979284368e-07, "logits/chosen": -1.9485352039337158, "logits/rejected": -1.8610351085662842, "logps/chosen": -332.95001220703125, "logps/rejected": -313.8687438964844, "loss": 0.1439, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.84844970703125, "rewards/margins": 3.4697265625, "rewards/rejected": -4.320703029632568, "step": 4730 }, { "epoch": 1.785088957921491, "grad_norm": 36.61616194652542, "learning_rate": 5.537664783427495e-07, "logits/chosen": -1.8083984851837158, "logits/rejected": -1.8190429210662842, "logps/chosen": -375.9750061035156, "logps/rejected": -354.42498779296875, "loss": 0.1492, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.709216296672821, "rewards/margins": 3.9554686546325684, "rewards/rejected": -4.665625095367432, "step": 4740 }, { "epoch": 1.7888543725877812, "grad_norm": 31.921691397159798, "learning_rate": 5.52824858757062e-07, "logits/chosen": -1.8225586414337158, "logits/rejected": -1.8669922351837158, "logps/chosen": -375.6499938964844, "logps/rejected": -375.82501220703125, "loss": 0.1101, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0612914562225342, "rewards/margins": 3.676562547683716, "rewards/rejected": -4.735156059265137, "step": 4750 }, { "epoch": 1.7926197872540715, "grad_norm": 29.464711551357308, "learning_rate": 5.518832391713747e-07, "logits/chosen": -1.71533203125, "logits/rejected": -1.6760742664337158, "logps/chosen": -330.11248779296875, "logps/rejected": -331.07501220703125, "loss": 0.1549, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.090887427330017, "rewards/margins": 3.632031202316284, "rewards/rejected": -4.725195407867432, "step": 4760 }, { "epoch": 1.7963852019203614, "grad_norm": 30.664971262239046, "learning_rate": 5.509416195856874e-07, "logits/chosen": -1.7434570789337158, "logits/rejected": -1.599023461341858, "logps/chosen": -386.61248779296875, "logps/rejected": -383.11248779296875, "loss": 0.1606, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.1012084484100342, "rewards/margins": 3.7958984375, "rewards/rejected": -4.8984375, "step": 4770 }, { "epoch": 1.8001506165866517, "grad_norm": 20.558262978324144, "learning_rate": 5.5e-07, "logits/chosen": -1.6948730945587158, "logits/rejected": -1.61328125, "logps/chosen": -377.3125, "logps/rejected": -383.2250061035156, "loss": 0.1222, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.2862548828125, "rewards/margins": 3.998046875, "rewards/rejected": -5.286718845367432, "step": 4780 }, { "epoch": 1.8039160312529416, "grad_norm": 45.37993695187757, "learning_rate": 5.490583804143126e-07, "logits/chosen": -1.853613257408142, "logits/rejected": -1.693457007408142, "logps/chosen": -353.3125, "logps/rejected": -346.76251220703125, "loss": 0.1838, "rewards/accuracies": 0.9375, "rewards/chosen": -0.993334949016571, "rewards/margins": 3.578808546066284, "rewards/rejected": -4.574999809265137, "step": 4790 }, { "epoch": 1.8076814459192319, "grad_norm": 18.972661251817104, "learning_rate": 5.481167608286252e-07, "logits/chosen": -1.733984351158142, "logits/rejected": -1.542138695716858, "logps/chosen": -384.54998779296875, "logps/rejected": -377.45001220703125, "loss": 0.1164, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.88140869140625, "rewards/margins": 3.7939453125, "rewards/rejected": -4.677734375, "step": 4800 }, { "epoch": 1.811446860585522, "grad_norm": 37.87850657251627, "learning_rate": 5.471751412429378e-07, "logits/chosen": -1.907324194908142, "logits/rejected": -1.8645508289337158, "logps/chosen": -325.3125, "logps/rejected": -320.86248779296875, "loss": 0.1685, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.7500671148300171, "rewards/margins": 3.2940430641174316, "rewards/rejected": -4.041015625, "step": 4810 }, { "epoch": 1.815212275251812, "grad_norm": 19.569285186124446, "learning_rate": 5.462335216572505e-07, "logits/chosen": -1.9533202648162842, "logits/rejected": -1.7649414539337158, "logps/chosen": -287.88751220703125, "logps/rejected": -339.6499938964844, "loss": 0.1633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.965576171875, "rewards/margins": 3.360595703125, "rewards/rejected": -4.325976371765137, "step": 4820 }, { "epoch": 1.8189776899181023, "grad_norm": 31.607857445141146, "learning_rate": 5.45291902071563e-07, "logits/chosen": -1.8523437976837158, "logits/rejected": -1.849609375, "logps/chosen": -342.36248779296875, "logps/rejected": -339.32501220703125, "loss": 0.1372, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.907238781452179, "rewards/margins": 3.396679639816284, "rewards/rejected": -4.303515434265137, "step": 4830 }, { "epoch": 1.8227431045843923, "grad_norm": 31.767028500075263, "learning_rate": 5.443502824858757e-07, "logits/chosen": -1.8307616710662842, "logits/rejected": -1.913671851158142, "logps/chosen": -342.25, "logps/rejected": -329.61248779296875, "loss": 0.1615, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1688629388809204, "rewards/margins": 3.386523485183716, "rewards/rejected": -4.5556640625, "step": 4840 }, { "epoch": 1.8265085192506825, "grad_norm": 21.131624855558197, "learning_rate": 5.434086629001883e-07, "logits/chosen": -1.7711913585662842, "logits/rejected": -1.696874976158142, "logps/chosen": -336.0625, "logps/rejected": -314.8500061035156, "loss": 0.1341, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.607617199420929, "rewards/margins": 3.4925780296325684, "rewards/rejected": -4.101171970367432, "step": 4850 }, { "epoch": 1.8302739339169727, "grad_norm": 25.32971664474488, "learning_rate": 5.424670433145009e-07, "logits/chosen": -1.82861328125, "logits/rejected": -1.73876953125, "logps/chosen": -372.2749938964844, "logps/rejected": -345.57501220703125, "loss": 0.1219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5577453374862671, "rewards/margins": 3.6080079078674316, "rewards/rejected": -4.166796684265137, "step": 4860 }, { "epoch": 1.834039348583263, "grad_norm": 43.532791864471044, "learning_rate": 5.415254237288135e-07, "logits/chosen": -1.827050805091858, "logits/rejected": -1.69775390625, "logps/chosen": -302.9312438964844, "logps/rejected": -349.0874938964844, "loss": 0.1267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7402588129043579, "rewards/margins": 3.820507764816284, "rewards/rejected": -4.563672065734863, "step": 4870 }, { "epoch": 1.8378047632495529, "grad_norm": 18.00452198993154, "learning_rate": 5.405838041431261e-07, "logits/chosen": -1.7750976085662842, "logits/rejected": -1.6295897960662842, "logps/chosen": -353.9125061035156, "logps/rejected": -343.98748779296875, "loss": 0.2035, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.013830542564392, "rewards/margins": 3.3790526390075684, "rewards/rejected": -4.396093845367432, "step": 4880 }, { "epoch": 1.8415701779158429, "grad_norm": 25.928943196340825, "learning_rate": 5.396421845574389e-07, "logits/chosen": -1.9500000476837158, "logits/rejected": -1.796484351158142, "logps/chosen": -315.1000061035156, "logps/rejected": -328.0, "loss": 0.1564, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.232153296470642, "rewards/margins": 3.3956055641174316, "rewards/rejected": -4.628125190734863, "step": 4890 }, { "epoch": 1.845335592582133, "grad_norm": 37.183205970966945, "learning_rate": 5.387005649717514e-07, "logits/chosen": -1.78271484375, "logits/rejected": -1.5797851085662842, "logps/chosen": -334.8374938964844, "logps/rejected": -391.82501220703125, "loss": 0.1166, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.344018578529358, "rewards/margins": 3.907421827316284, "rewards/rejected": -5.247656345367432, "step": 4900 }, { "epoch": 1.8491010072484233, "grad_norm": 26.244880892872533, "learning_rate": 5.377589453860641e-07, "logits/chosen": -1.8191406726837158, "logits/rejected": -1.7204101085662842, "logps/chosen": -324.3125, "logps/rejected": -328.82501220703125, "loss": 0.1189, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.1067016124725342, "rewards/margins": 3.78515625, "rewards/rejected": -4.890625, "step": 4910 }, { "epoch": 1.8528664219147135, "grad_norm": 26.235766617377884, "learning_rate": 5.368173258003766e-07, "logits/chosen": -1.852929711341858, "logits/rejected": -1.7759277820587158, "logps/chosen": -395.13751220703125, "logps/rejected": -385.875, "loss": 0.1275, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9435058832168579, "rewards/margins": 3.662109375, "rewards/rejected": -4.603906154632568, "step": 4920 }, { "epoch": 1.8566318365810035, "grad_norm": 35.807234565547574, "learning_rate": 5.358757062146893e-07, "logits/chosen": -1.847509741783142, "logits/rejected": -1.7517578601837158, "logps/chosen": -352.98748779296875, "logps/rejected": -359.29998779296875, "loss": 0.1903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.052972435951233, "rewards/margins": 3.5716795921325684, "rewards/rejected": -4.620898246765137, "step": 4930 }, { "epoch": 1.8603972512472935, "grad_norm": 13.381854563343166, "learning_rate": 5.349340866290018e-07, "logits/chosen": -1.7316405773162842, "logits/rejected": -1.558569312095642, "logps/chosen": -320.57501220703125, "logps/rejected": -349.54998779296875, "loss": 0.1403, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8792968988418579, "rewards/margins": 4.022705078125, "rewards/rejected": -4.90234375, "step": 4940 }, { "epoch": 1.8641626659135837, "grad_norm": 46.32574861017387, "learning_rate": 5.339924670433145e-07, "logits/chosen": -1.79296875, "logits/rejected": -1.802636742591858, "logps/chosen": -362.14373779296875, "logps/rejected": -323.04998779296875, "loss": 0.1701, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8315674066543579, "rewards/margins": 3.5113282203674316, "rewards/rejected": -4.343163967132568, "step": 4950 }, { "epoch": 1.867928080579874, "grad_norm": 31.176177550859425, "learning_rate": 5.33050847457627e-07, "logits/chosen": -1.879492163658142, "logits/rejected": -1.700292944908142, "logps/chosen": -323.75, "logps/rejected": -362.23748779296875, "loss": 0.1716, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8976684808731079, "rewards/margins": 3.47314453125, "rewards/rejected": -4.369531154632568, "step": 4960 }, { "epoch": 1.8716934952461641, "grad_norm": 37.34182641083443, "learning_rate": 5.321092278719397e-07, "logits/chosen": -1.690527319908142, "logits/rejected": -1.638671875, "logps/chosen": -356.20001220703125, "logps/rejected": -352.92498779296875, "loss": 0.1761, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9358886480331421, "rewards/margins": 3.538769483566284, "rewards/rejected": -4.474218845367432, "step": 4970 }, { "epoch": 1.8754589099124541, "grad_norm": 12.932821292361496, "learning_rate": 5.311676082862523e-07, "logits/chosen": -1.906152367591858, "logits/rejected": -1.727636694908142, "logps/chosen": -353.1499938964844, "logps/rejected": -365.1000061035156, "loss": 0.1142, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.1389281749725342, "rewards/margins": 3.794921875, "rewards/rejected": -4.934374809265137, "step": 4980 }, { "epoch": 1.879224324578744, "grad_norm": 29.964351295990493, "learning_rate": 5.302259887005649e-07, "logits/chosen": -1.8386719226837158, "logits/rejected": -1.677734375, "logps/chosen": -368.7749938964844, "logps/rejected": -421.5375061035156, "loss": 0.1468, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.389367699623108, "rewards/margins": 3.817333936691284, "rewards/rejected": -5.2080078125, "step": 4990 }, { "epoch": 1.8829897392450343, "grad_norm": 11.56322355880577, "learning_rate": 5.292843691148776e-07, "logits/chosen": -1.9246094226837158, "logits/rejected": -1.8478271961212158, "logps/chosen": -335.92498779296875, "logps/rejected": -331.2250061035156, "loss": 0.1454, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.431970238685608, "rewards/margins": 3.7369141578674316, "rewards/rejected": -5.171484470367432, "step": 5000 }, { "epoch": 1.8867551539113245, "grad_norm": 28.3826186573507, "learning_rate": 5.283427495291902e-07, "logits/chosen": -1.888574242591858, "logits/rejected": -1.8139159679412842, "logps/chosen": -395.26251220703125, "logps/rejected": -393.1499938964844, "loss": 0.2036, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.644018530845642, "rewards/margins": 3.711718797683716, "rewards/rejected": -5.355859279632568, "step": 5010 }, { "epoch": 1.8905205685776147, "grad_norm": 30.210081750255053, "learning_rate": 5.274011299435028e-07, "logits/chosen": -1.899316430091858, "logits/rejected": -1.798437476158142, "logps/chosen": -316.9750061035156, "logps/rejected": -329.6499938964844, "loss": 0.1671, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.448632836341858, "rewards/margins": 3.2822265625, "rewards/rejected": -4.734375, "step": 5020 }, { "epoch": 1.8942859832439047, "grad_norm": 22.444387575609365, "learning_rate": 5.264595103578154e-07, "logits/chosen": -1.874609351158142, "logits/rejected": -1.7067382335662842, "logps/chosen": -377.8999938964844, "logps/rejected": -329.45001220703125, "loss": 0.1299, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.3123290538787842, "rewards/margins": 3.5240235328674316, "rewards/rejected": -4.837500095367432, "step": 5030 }, { "epoch": 1.8980513979101947, "grad_norm": 15.15809730010531, "learning_rate": 5.25517890772128e-07, "logits/chosen": -1.7688477039337158, "logits/rejected": -1.738183617591858, "logps/chosen": -395.875, "logps/rejected": -382.57501220703125, "loss": 0.1104, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0874512195587158, "rewards/margins": 3.541796922683716, "rewards/rejected": -4.628125190734863, "step": 5040 }, { "epoch": 1.901816812576485, "grad_norm": 39.942975501475225, "learning_rate": 5.245762711864407e-07, "logits/chosen": -1.9050781726837158, "logits/rejected": -1.7565429210662842, "logps/chosen": -341.2250061035156, "logps/rejected": -345.6499938964844, "loss": 0.1395, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5479247570037842, "rewards/margins": 3.678906202316284, "rewards/rejected": -5.227734565734863, "step": 5050 }, { "epoch": 1.9055822272427752, "grad_norm": 41.039501483146616, "learning_rate": 5.236346516007532e-07, "logits/chosen": -1.914160132408142, "logits/rejected": -1.820556640625, "logps/chosen": -356.3500061035156, "logps/rejected": -389.67498779296875, "loss": 0.1312, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2764892578125, "rewards/margins": 3.580078125, "rewards/rejected": -4.856640815734863, "step": 5060 }, { "epoch": 1.9093476419090654, "grad_norm": 24.77245941034832, "learning_rate": 5.226930320150659e-07, "logits/chosen": -1.8258788585662842, "logits/rejected": -1.792382836341858, "logps/chosen": -367.54998779296875, "logps/rejected": -350.17498779296875, "loss": 0.1419, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.19378662109375, "rewards/margins": 3.866406202316284, "rewards/rejected": -5.059765815734863, "step": 5070 }, { "epoch": 1.9131130565753554, "grad_norm": 52.07862842677906, "learning_rate": 5.217514124293784e-07, "logits/chosen": -1.86474609375, "logits/rejected": -1.818359375, "logps/chosen": -357.25, "logps/rejected": -355.3500061035156, "loss": 0.1424, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.962249755859375, "rewards/margins": 3.65185546875, "rewards/rejected": -4.614062309265137, "step": 5080 }, { "epoch": 1.9168784712416453, "grad_norm": 14.872638407363585, "learning_rate": 5.208097928436911e-07, "logits/chosen": -1.7658202648162842, "logits/rejected": -1.8201172351837158, "logps/chosen": -379.92498779296875, "logps/rejected": -366.57501220703125, "loss": 0.1289, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.7805541753768921, "rewards/margins": 3.9136719703674316, "rewards/rejected": -4.693359375, "step": 5090 }, { "epoch": 1.9206438859079356, "grad_norm": 12.066632639228219, "learning_rate": 5.198681732580037e-07, "logits/chosen": -2.08837890625, "logits/rejected": -1.89794921875, "logps/chosen": -318.73748779296875, "logps/rejected": -358.32501220703125, "loss": 0.1807, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.206140160560608, "rewards/margins": 3.760986328125, "rewards/rejected": -4.96875, "step": 5100 }, { "epoch": 1.9244093005742258, "grad_norm": 24.444124352999147, "learning_rate": 5.189265536723164e-07, "logits/chosen": -1.912109375, "logits/rejected": -1.863183617591858, "logps/chosen": -346.6875, "logps/rejected": -343.6000061035156, "loss": 0.1543, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.5135986804962158, "rewards/margins": 3.4908204078674316, "rewards/rejected": -5.004296779632568, "step": 5110 }, { "epoch": 1.928174715240516, "grad_norm": 17.16923179821838, "learning_rate": 5.179849340866291e-07, "logits/chosen": -1.955664038658142, "logits/rejected": -1.93603515625, "logps/chosen": -353.7250061035156, "logps/rejected": -368.75, "loss": 0.1056, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2987854480743408, "rewards/margins": 4.2138671875, "rewards/rejected": -5.512890815734863, "step": 5120 }, { "epoch": 1.931940129906806, "grad_norm": 49.63552748934161, "learning_rate": 5.170433145009416e-07, "logits/chosen": -2.0420899391174316, "logits/rejected": -1.889550805091858, "logps/chosen": -300.79998779296875, "logps/rejected": -339.7749938964844, "loss": 0.1688, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.7137572765350342, "rewards/margins": 3.6119141578674316, "rewards/rejected": -5.327343940734863, "step": 5130 }, { "epoch": 1.935705544573096, "grad_norm": 46.251422548079134, "learning_rate": 5.161016949152543e-07, "logits/chosen": -2.030468702316284, "logits/rejected": -1.8913085460662842, "logps/chosen": -327.63751220703125, "logps/rejected": -332.95001220703125, "loss": 0.1448, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.2496337890625, "rewards/margins": 3.4296875, "rewards/rejected": -4.6796875, "step": 5140 }, { "epoch": 1.9394709592393862, "grad_norm": 25.962383753780827, "learning_rate": 5.151600753295668e-07, "logits/chosen": -2.0423827171325684, "logits/rejected": -1.85595703125, "logps/chosen": -351.29998779296875, "logps/rejected": -349.2250061035156, "loss": 0.1623, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5505249500274658, "rewards/margins": 3.725781202316284, "rewards/rejected": -5.278124809265137, "step": 5150 }, { "epoch": 1.9432363739056764, "grad_norm": 10.715370282873733, "learning_rate": 5.142184557438795e-07, "logits/chosen": -1.830175757408142, "logits/rejected": -1.8288085460662842, "logps/chosen": -386.63751220703125, "logps/rejected": -346.5, "loss": 0.1466, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.6439208984375, "rewards/margins": 3.2640624046325684, "rewards/rejected": -4.913671970367432, "step": 5160 }, { "epoch": 1.9470017885719666, "grad_norm": 22.806914590265816, "learning_rate": 5.13276836158192e-07, "logits/chosen": -1.8776366710662842, "logits/rejected": -1.7444336414337158, "logps/chosen": -332.7749938964844, "logps/rejected": -353.25, "loss": 0.1747, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.388830542564392, "rewards/margins": 3.379687547683716, "rewards/rejected": -4.770312309265137, "step": 5170 }, { "epoch": 1.9507672032382566, "grad_norm": 24.625563577944064, "learning_rate": 5.123352165725047e-07, "logits/chosen": -2.0337891578674316, "logits/rejected": -1.7677733898162842, "logps/chosen": -317.70001220703125, "logps/rejected": -353.70001220703125, "loss": 0.1369, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.513403296470642, "rewards/margins": 3.5191407203674316, "rewards/rejected": -5.028906345367432, "step": 5180 }, { "epoch": 1.9545326179045466, "grad_norm": 32.09519788009678, "learning_rate": 5.113935969868173e-07, "logits/chosen": -2.124316453933716, "logits/rejected": -2.0474610328674316, "logps/chosen": -325.25, "logps/rejected": -345.0249938964844, "loss": 0.122, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.63031005859375, "rewards/margins": 3.614062547683716, "rewards/rejected": -5.249609470367432, "step": 5190 }, { "epoch": 1.9582980325708368, "grad_norm": 31.611006501312, "learning_rate": 5.104519774011299e-07, "logits/chosen": -1.9000976085662842, "logits/rejected": -1.909765601158142, "logps/chosen": -356.45001220703125, "logps/rejected": -371.45001220703125, "loss": 0.1113, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.508569359779358, "rewards/margins": 3.8033204078674316, "rewards/rejected": -5.309374809265137, "step": 5200 }, { "epoch": 1.962063447237127, "grad_norm": 8.33353807473751, "learning_rate": 5.095103578154425e-07, "logits/chosen": -1.9558594226837158, "logits/rejected": -1.974023461341858, "logps/chosen": -307.1499938964844, "logps/rejected": -304.3500061035156, "loss": 0.1588, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.4042847156524658, "rewards/margins": 3.4970703125, "rewards/rejected": -4.901562690734863, "step": 5210 }, { "epoch": 1.9658288619034172, "grad_norm": 26.452094944534657, "learning_rate": 5.085687382297551e-07, "logits/chosen": -1.9059569835662842, "logits/rejected": -1.887109398841858, "logps/chosen": -342.0874938964844, "logps/rejected": -375.1000061035156, "loss": 0.1058, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.330957055091858, "rewards/margins": 3.987109422683716, "rewards/rejected": -5.3203125, "step": 5220 }, { "epoch": 1.9695942765697072, "grad_norm": 15.68470261346708, "learning_rate": 5.076271186440678e-07, "logits/chosen": -1.912988305091858, "logits/rejected": -1.8234374523162842, "logps/chosen": -363.8374938964844, "logps/rejected": -337.7250061035156, "loss": 0.1822, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.6479736566543579, "rewards/margins": 3.8277344703674316, "rewards/rejected": -4.474999904632568, "step": 5230 }, { "epoch": 1.9733596912359974, "grad_norm": 26.802241853304245, "learning_rate": 5.066854990583804e-07, "logits/chosen": -1.9308593273162842, "logits/rejected": -1.737646460533142, "logps/chosen": -309.20001220703125, "logps/rejected": -366.125, "loss": 0.1146, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5946410894393921, "rewards/margins": 3.8958983421325684, "rewards/rejected": -4.492578029632568, "step": 5240 }, { "epoch": 1.9771251059022874, "grad_norm": 41.266281062465744, "learning_rate": 5.05743879472693e-07, "logits/chosen": -1.9587891101837158, "logits/rejected": -1.88525390625, "logps/chosen": -348.32501220703125, "logps/rejected": -345.1000061035156, "loss": 0.148, "rewards/accuracies": 0.96875, "rewards/chosen": -0.931530773639679, "rewards/margins": 3.411328077316284, "rewards/rejected": -4.342187404632568, "step": 5250 }, { "epoch": 1.9808905205685776, "grad_norm": 11.352574586890492, "learning_rate": 5.048022598870057e-07, "logits/chosen": -1.914160132408142, "logits/rejected": -1.722070336341858, "logps/chosen": -359.4750061035156, "logps/rejected": -370.2250061035156, "loss": 0.1129, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.96484375, "rewards/margins": 3.9996094703674316, "rewards/rejected": -4.965624809265137, "step": 5260 }, { "epoch": 1.9846559352348678, "grad_norm": 21.303104560738525, "learning_rate": 5.038606403013182e-07, "logits/chosen": -1.8762695789337158, "logits/rejected": -1.882177710533142, "logps/chosen": -335.3999938964844, "logps/rejected": -345.6499938964844, "loss": 0.1449, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1388671398162842, "rewards/margins": 3.7328124046325684, "rewards/rejected": -4.868750095367432, "step": 5270 }, { "epoch": 1.9884213499011578, "grad_norm": 16.258555048870477, "learning_rate": 5.029190207156309e-07, "logits/chosen": -2.0283203125, "logits/rejected": -2.007031202316284, "logps/chosen": -320.5249938964844, "logps/rejected": -312.73748779296875, "loss": 0.14, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8846985101699829, "rewards/margins": 3.6859374046325684, "rewards/rejected": -4.569531440734863, "step": 5280 }, { "epoch": 1.992186764567448, "grad_norm": 34.90533022212825, "learning_rate": 5.019774011299434e-07, "logits/chosen": -1.8806641101837158, "logits/rejected": -1.8469727039337158, "logps/chosen": -384.2250061035156, "logps/rejected": -369.79998779296875, "loss": 0.1496, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.788684070110321, "rewards/margins": 3.690234422683716, "rewards/rejected": -4.475781440734863, "step": 5290 }, { "epoch": 1.995952179233738, "grad_norm": 21.852346617531552, "learning_rate": 5.010357815442561e-07, "logits/chosen": -2.013671875, "logits/rejected": -1.8966796398162842, "logps/chosen": -314.26251220703125, "logps/rejected": -348.42498779296875, "loss": 0.1887, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.1990966796875, "rewards/margins": 3.4271483421325684, "rewards/rejected": -4.626172065734863, "step": 5300 }, { "epoch": 1.9997175939000282, "grad_norm": 17.610672664468378, "learning_rate": 5.000941619585686e-07, "logits/chosen": -2.004687547683716, "logits/rejected": -1.9337890148162842, "logps/chosen": -366.29998779296875, "logps/rejected": -374.0249938964844, "loss": 0.098, "rewards/accuracies": 0.96875, "rewards/chosen": -1.123266577720642, "rewards/margins": 4.166015625, "rewards/rejected": -5.291406154632568, "step": 5310 }, { "epoch": 2.00376541466629, "grad_norm": 3.660817416063245, "learning_rate": 4.991525423728813e-07, "logits/chosen": -1.9832589626312256, "logits/rejected": -1.839099645614624, "logps/chosen": -328.03570556640625, "logps/rejected": -371.75, "loss": 0.0412, "rewards/accuracies": 0.9940476417541504, "rewards/chosen": -0.713506817817688, "rewards/margins": 4.827753067016602, "rewards/rejected": -5.540550708770752, "step": 5320 }, { "epoch": 2.0075308293325804, "grad_norm": 5.623617588474347, "learning_rate": 4.98210922787194e-07, "logits/chosen": -2.152636766433716, "logits/rejected": -2.010058641433716, "logps/chosen": -291.13751220703125, "logps/rejected": -332.88751220703125, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0252563953399658, "rewards/margins": 4.694140434265137, "rewards/rejected": -5.715624809265137, "step": 5330 }, { "epoch": 2.01129624399887, "grad_norm": 6.356391747164878, "learning_rate": 4.972693032015066e-07, "logits/chosen": -2.203906297683716, "logits/rejected": -2.042407274246216, "logps/chosen": -308.73748779296875, "logps/rejected": -345.7749938964844, "loss": 0.0426, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.126708984375, "rewards/margins": 5.012109279632568, "rewards/rejected": -6.140234470367432, "step": 5340 }, { "epoch": 2.0150616586651604, "grad_norm": 5.547693542736835, "learning_rate": 4.963276836158192e-07, "logits/chosen": -2.1719727516174316, "logits/rejected": -2.246875047683716, "logps/chosen": -311.45001220703125, "logps/rejected": -315.76251220703125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": -0.8362182378768921, "rewards/margins": 4.985937595367432, "rewards/rejected": -5.818749904632568, "step": 5350 }, { "epoch": 2.0188270733314506, "grad_norm": 11.393458189092376, "learning_rate": 4.953860640301318e-07, "logits/chosen": -2.2035155296325684, "logits/rejected": -2.1426758766174316, "logps/chosen": -344.2250061035156, "logps/rejected": -374.32501220703125, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": -0.778149425983429, "rewards/margins": 5.276171684265137, "rewards/rejected": -6.053515434265137, "step": 5360 }, { "epoch": 2.022592487997741, "grad_norm": 9.340545447220643, "learning_rate": 4.944444444444445e-07, "logits/chosen": -2.235156297683716, "logits/rejected": -2.127685546875, "logps/chosen": -332.4125061035156, "logps/rejected": -360.82501220703125, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -1.0587646961212158, "rewards/margins": 5.460156440734863, "rewards/rejected": -6.523828029632568, "step": 5370 }, { "epoch": 2.026357902664031, "grad_norm": 8.6897859051519, "learning_rate": 4.93502824858757e-07, "logits/chosen": -2.2876954078674316, "logits/rejected": -2.221972703933716, "logps/chosen": -327.07501220703125, "logps/rejected": -372.1499938964844, "loss": 0.0351, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2680175304412842, "rewards/margins": 5.308984279632568, "rewards/rejected": -6.578711032867432, "step": 5380 }, { "epoch": 2.030123317330321, "grad_norm": 11.27251464693851, "learning_rate": 4.925612052730697e-07, "logits/chosen": -2.2813963890075684, "logits/rejected": -2.188281297683716, "logps/chosen": -333.92498779296875, "logps/rejected": -387.8999938964844, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -1.595483422279358, "rewards/margins": 5.569531440734863, "rewards/rejected": -7.164843559265137, "step": 5390 }, { "epoch": 2.033888731996611, "grad_norm": 7.312277978132687, "learning_rate": 4.916195856873823e-07, "logits/chosen": -2.2470703125, "logits/rejected": -2.2494139671325684, "logps/chosen": -358.3999938964844, "logps/rejected": -369.8500061035156, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -1.767822265625, "rewards/margins": 5.524218559265137, "rewards/rejected": -7.293749809265137, "step": 5400 }, { "epoch": 2.0376541466629012, "grad_norm": 18.53604107163307, "learning_rate": 4.906779661016949e-07, "logits/chosen": -2.263671875, "logits/rejected": -2.272167921066284, "logps/chosen": -356.875, "logps/rejected": -384.4750061035156, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1076903343200684, "rewards/margins": 5.710546970367432, "rewards/rejected": -7.818749904632568, "step": 5410 }, { "epoch": 2.0414195613291914, "grad_norm": 7.409516788776852, "learning_rate": 4.897363465160076e-07, "logits/chosen": -2.3443360328674316, "logits/rejected": -2.129101514816284, "logps/chosen": -347.95001220703125, "logps/rejected": -394.6499938964844, "loss": 0.0279, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7798340320587158, "rewards/margins": 5.958203315734863, "rewards/rejected": -7.739062309265137, "step": 5420 }, { "epoch": 2.0451849759954817, "grad_norm": 4.902543823582157, "learning_rate": 4.887947269303201e-07, "logits/chosen": -2.3382811546325684, "logits/rejected": -2.2115235328674316, "logps/chosen": -321.57501220703125, "logps/rejected": -385.3500061035156, "loss": 0.056, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1461424827575684, "rewards/margins": 5.803124904632568, "rewards/rejected": -7.950781345367432, "step": 5430 }, { "epoch": 2.0489503906617714, "grad_norm": 3.030976038546493, "learning_rate": 4.878531073446328e-07, "logits/chosen": -2.298535108566284, "logits/rejected": -2.2548828125, "logps/chosen": -367.38751220703125, "logps/rejected": -392.67498779296875, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.19189453125, "rewards/margins": 6.329297065734863, "rewards/rejected": -8.52734375, "step": 5440 }, { "epoch": 2.0527158053280616, "grad_norm": 3.7951467619026116, "learning_rate": 4.869114877589453e-07, "logits/chosen": -2.239453077316284, "logits/rejected": -2.29345703125, "logps/chosen": -362.7250061035156, "logps/rejected": -387.88751220703125, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -2.047070264816284, "rewards/margins": 5.782812595367432, "rewards/rejected": -7.828906059265137, "step": 5450 }, { "epoch": 2.056481219994352, "grad_norm": 15.83947722679052, "learning_rate": 4.85969868173258e-07, "logits/chosen": -2.196484327316284, "logits/rejected": -2.166210889816284, "logps/chosen": -350.3500061035156, "logps/rejected": -397.3500061035156, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": -1.8560791015625, "rewards/margins": 5.569140434265137, "rewards/rejected": -7.422656059265137, "step": 5460 }, { "epoch": 2.060246634660642, "grad_norm": 5.274225888498379, "learning_rate": 4.850282485875705e-07, "logits/chosen": -2.436328172683716, "logits/rejected": -2.2896485328674316, "logps/chosen": -343.3999938964844, "logps/rejected": -402.75, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": -2.18212890625, "rewards/margins": 6.098046779632568, "rewards/rejected": -8.283594131469727, "step": 5470 }, { "epoch": 2.0640120493269323, "grad_norm": 29.81253158418771, "learning_rate": 4.840866290018832e-07, "logits/chosen": -2.4327149391174316, "logits/rejected": -2.3426756858825684, "logps/chosen": -360.4750061035156, "logps/rejected": -374.8999938964844, "loss": 0.0209, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8791015148162842, "rewards/margins": 6.1171875, "rewards/rejected": -7.998437404632568, "step": 5480 }, { "epoch": 2.067777463993222, "grad_norm": 8.34967100936806, "learning_rate": 4.831450094161959e-07, "logits/chosen": -2.2349610328674316, "logits/rejected": -2.3038086891174316, "logps/chosen": -364.2749938964844, "logps/rejected": -385.0249938964844, "loss": 0.0356, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.036206007003784, "rewards/margins": 6.031640529632568, "rewards/rejected": -8.071093559265137, "step": 5490 }, { "epoch": 2.0715428786595123, "grad_norm": 5.6951144686900115, "learning_rate": 4.822033898305084e-07, "logits/chosen": -2.219531297683716, "logits/rejected": -2.3470702171325684, "logps/chosen": -379.8374938964844, "logps/rejected": -416.0249938964844, "loss": 0.0232, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7483398914337158, "rewards/margins": 5.994531154632568, "rewards/rejected": -7.7421875, "step": 5500 }, { "epoch": 2.0753082933258025, "grad_norm": 20.987900171601016, "learning_rate": 4.812617702448211e-07, "logits/chosen": -2.302929639816284, "logits/rejected": -2.2503905296325684, "logps/chosen": -331.5, "logps/rejected": -341.625, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4202392101287842, "rewards/margins": 5.964453220367432, "rewards/rejected": -7.383593559265137, "step": 5510 }, { "epoch": 2.0790737079920927, "grad_norm": 11.169379964589021, "learning_rate": 4.803201506591336e-07, "logits/chosen": -2.2216796875, "logits/rejected": -2.1800780296325684, "logps/chosen": -369.45001220703125, "logps/rejected": -440.125, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -1.2989990711212158, "rewards/margins": 6.311718940734863, "rewards/rejected": -7.612890720367432, "step": 5520 }, { "epoch": 2.082839122658383, "grad_norm": 5.407957533896539, "learning_rate": 4.793785310734463e-07, "logits/chosen": -2.1426758766174316, "logits/rejected": -2.291699171066284, "logps/chosen": -381.51251220703125, "logps/rejected": -390.9750061035156, "loss": 0.024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.223291039466858, "rewards/margins": 5.962500095367432, "rewards/rejected": -7.190625190734863, "step": 5530 }, { "epoch": 2.086604537324673, "grad_norm": 11.393298089474603, "learning_rate": 4.78436911487759e-07, "logits/chosen": -2.3677735328674316, "logits/rejected": -2.4253907203674316, "logps/chosen": -340.9125061035156, "logps/rejected": -387.76251220703125, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -1.7642943859100342, "rewards/margins": 6.099218845367432, "rewards/rejected": -7.869531154632568, "step": 5540 }, { "epoch": 2.090369951990963, "grad_norm": 13.748294171300165, "learning_rate": 4.774952919020715e-07, "logits/chosen": -2.400195360183716, "logits/rejected": -2.370312452316284, "logps/chosen": -358.13751220703125, "logps/rejected": -378.95001220703125, "loss": 0.0286, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5150635242462158, "rewards/margins": 5.852734565734863, "rewards/rejected": -7.365234375, "step": 5550 }, { "epoch": 2.094135366657253, "grad_norm": 4.421504264583442, "learning_rate": 4.7655367231638416e-07, "logits/chosen": -2.345703125, "logits/rejected": -2.3480467796325684, "logps/chosen": -352.625, "logps/rejected": -372.5, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.990545630455017, "rewards/margins": 6.129687309265137, "rewards/rejected": -8.1171875, "step": 5560 }, { "epoch": 2.0979007813235433, "grad_norm": 35.095429283361646, "learning_rate": 4.7561205273069677e-07, "logits/chosen": -2.205078125, "logits/rejected": -2.1302733421325684, "logps/chosen": -408.54998779296875, "logps/rejected": -475.2749938964844, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.741406202316284, "rewards/margins": 6.526953220367432, "rewards/rejected": -9.263280868530273, "step": 5570 }, { "epoch": 2.1016661959898335, "grad_norm": 3.586040891527408, "learning_rate": 4.746704331450094e-07, "logits/chosen": -2.4154295921325684, "logits/rejected": -2.2484374046325684, "logps/chosen": -342.1499938964844, "logps/rejected": -383.20001220703125, "loss": 0.0503, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9268555641174316, "rewards/margins": 5.823437690734863, "rewards/rejected": -8.745312690734863, "step": 5580 }, { "epoch": 2.1054316106561237, "grad_norm": 2.704843043004244, "learning_rate": 4.7372881355932204e-07, "logits/chosen": -2.2523436546325684, "logits/rejected": -2.2291016578674316, "logps/chosen": -331.5375061035156, "logps/rejected": -396.82501220703125, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -2.0853028297424316, "rewards/margins": 5.927343845367432, "rewards/rejected": -8.004687309265137, "step": 5590 }, { "epoch": 2.1091970253224135, "grad_norm": 5.279693710569951, "learning_rate": 4.7278719397363464e-07, "logits/chosen": -2.234081983566284, "logits/rejected": -2.2451171875, "logps/chosen": -357.86248779296875, "logps/rejected": -385.79998779296875, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": -2.274951219558716, "rewards/margins": 6.442187309265137, "rewards/rejected": -8.717187881469727, "step": 5600 }, { "epoch": 2.1129624399887037, "grad_norm": 4.549682402743766, "learning_rate": 4.7184557438794725e-07, "logits/chosen": -2.155468702316284, "logits/rejected": -2.2474608421325684, "logps/chosen": -393.75, "logps/rejected": -431.67498779296875, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": -1.979150414466858, "rewards/margins": 6.544531345367432, "rewards/rejected": -8.532031059265137, "step": 5610 }, { "epoch": 2.116727854654994, "grad_norm": 8.279946237140829, "learning_rate": 4.7090395480225986e-07, "logits/chosen": -2.3802733421325684, "logits/rejected": -2.2826170921325684, "logps/chosen": -347.8999938964844, "logps/rejected": -385.11248779296875, "loss": 0.0344, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.238818407058716, "rewards/margins": 6.131249904632568, "rewards/rejected": -8.37109375, "step": 5620 }, { "epoch": 2.120493269321284, "grad_norm": 33.827987907043784, "learning_rate": 4.6996233521657246e-07, "logits/chosen": -2.2582030296325684, "logits/rejected": -2.359570264816284, "logps/chosen": -383.5, "logps/rejected": -405.6499938964844, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -2.607525587081909, "rewards/margins": 6.366406440734863, "rewards/rejected": -8.971094131469727, "step": 5630 }, { "epoch": 2.1242586839875743, "grad_norm": 5.871363259131933, "learning_rate": 4.6902071563088507e-07, "logits/chosen": -2.1585936546325684, "logits/rejected": -2.2958984375, "logps/chosen": -361.88751220703125, "logps/rejected": -370.625, "loss": 0.028, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.226489305496216, "rewards/margins": 5.905468940734863, "rewards/rejected": -8.132031440734863, "step": 5640 }, { "epoch": 2.128024098653864, "grad_norm": 5.249479307688626, "learning_rate": 4.6807909604519773e-07, "logits/chosen": -2.246777296066284, "logits/rejected": -2.2015624046325684, "logps/chosen": -350.6000061035156, "logps/rejected": -395.4750061035156, "loss": 0.034, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0657715797424316, "rewards/margins": 6.162890434265137, "rewards/rejected": -8.23046875, "step": 5650 }, { "epoch": 2.1317895133201543, "grad_norm": 1.8097333235133441, "learning_rate": 4.6713747645951033e-07, "logits/chosen": -2.265429735183716, "logits/rejected": -2.09423828125, "logps/chosen": -323.36248779296875, "logps/rejected": -382.125, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.171679735183716, "rewards/margins": 6.260156154632568, "rewards/rejected": -8.432812690734863, "step": 5660 }, { "epoch": 2.1355549279864445, "grad_norm": 15.824400341358444, "learning_rate": 4.6619585687382294e-07, "logits/chosen": -2.284960985183716, "logits/rejected": -2.223339796066284, "logps/chosen": -354.07501220703125, "logps/rejected": -380.51251220703125, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": -1.792810082435608, "rewards/margins": 5.832812309265137, "rewards/rejected": -7.625781059265137, "step": 5670 }, { "epoch": 2.1393203426527347, "grad_norm": 7.225867228994377, "learning_rate": 4.6525423728813555e-07, "logits/chosen": -2.104296922683716, "logits/rejected": -2.101367235183716, "logps/chosen": -358.29998779296875, "logps/rejected": -374.3500061035156, "loss": 0.02, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9950439929962158, "rewards/margins": 6.262109279632568, "rewards/rejected": -8.258593559265137, "step": 5680 }, { "epoch": 2.143085757319025, "grad_norm": 9.768332118277801, "learning_rate": 4.6431261770244815e-07, "logits/chosen": -2.4857420921325684, "logits/rejected": -2.339062452316284, "logps/chosen": -334.6875, "logps/rejected": -411.3999938964844, "loss": 0.0292, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.326367139816284, "rewards/margins": 6.370312690734863, "rewards/rejected": -8.696093559265137, "step": 5690 }, { "epoch": 2.1468511719853147, "grad_norm": 9.57680924156168, "learning_rate": 4.6337099811676076e-07, "logits/chosen": -2.378124952316284, "logits/rejected": -2.403125047683716, "logps/chosen": -381.0, "logps/rejected": -389.04998779296875, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4412598609924316, "rewards/margins": 5.87109375, "rewards/rejected": -8.310937881469727, "step": 5700 }, { "epoch": 2.150616586651605, "grad_norm": 7.145243252440076, "learning_rate": 4.624293785310734e-07, "logits/chosen": -2.39453125, "logits/rejected": -2.3822264671325684, "logps/chosen": -330.8999938964844, "logps/rejected": -380.7749938964844, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": -2.2994384765625, "rewards/margins": 6.551953315734863, "rewards/rejected": -8.854687690734863, "step": 5710 }, { "epoch": 2.154382001317895, "grad_norm": 1.2010377711795042, "learning_rate": 4.614877589453861e-07, "logits/chosen": -2.172070264816284, "logits/rejected": -2.261523485183716, "logps/chosen": -419.6000061035156, "logps/rejected": -427.8500061035156, "loss": 0.0274, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6565794944763184, "rewards/margins": 6.349609375, "rewards/rejected": -9.005468368530273, "step": 5720 }, { "epoch": 2.1581474159841854, "grad_norm": 5.253883999538228, "learning_rate": 4.605461393596987e-07, "logits/chosen": -2.4287109375, "logits/rejected": -2.443554639816284, "logps/chosen": -385.1499938964844, "logps/rejected": -418.25, "loss": 0.0408, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.7079100608825684, "rewards/margins": 6.198828220367432, "rewards/rejected": -8.90234375, "step": 5730 }, { "epoch": 2.1619128306504756, "grad_norm": 5.4961635995846, "learning_rate": 4.596045197740113e-07, "logits/chosen": -2.170703172683716, "logits/rejected": -2.333789110183716, "logps/chosen": -405.01251220703125, "logps/rejected": -427.20001220703125, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6236815452575684, "rewards/margins": 6.329297065734863, "rewards/rejected": -8.950780868530273, "step": 5740 }, { "epoch": 2.1656782453167653, "grad_norm": 22.684292481333976, "learning_rate": 4.586629001883239e-07, "logits/chosen": -2.234179735183716, "logits/rejected": -2.267578125, "logps/chosen": -377.04998779296875, "logps/rejected": -392.2250061035156, "loss": 0.036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.103271484375, "rewards/margins": 6.628515720367432, "rewards/rejected": -8.727343559265137, "step": 5750 }, { "epoch": 2.1694436599830556, "grad_norm": 5.081471512050982, "learning_rate": 4.5772128060263656e-07, "logits/chosen": -2.408007860183716, "logits/rejected": -2.289843797683716, "logps/chosen": -303.17498779296875, "logps/rejected": -377.75, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": -1.815454125404358, "rewards/margins": 6.322656154632568, "rewards/rejected": -8.137499809265137, "step": 5760 }, { "epoch": 2.1732090746493458, "grad_norm": 9.806462901656124, "learning_rate": 4.5677966101694916e-07, "logits/chosen": -2.318554639816284, "logits/rejected": -2.256054639816284, "logps/chosen": -340.79998779296875, "logps/rejected": -386.8999938964844, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": -2.3221435546875, "rewards/margins": 6.169531345367432, "rewards/rejected": -8.490625381469727, "step": 5770 }, { "epoch": 2.176974489315636, "grad_norm": 1.7992723397396782, "learning_rate": 4.5583804143126177e-07, "logits/chosen": -2.377734422683716, "logits/rejected": -2.3490233421325684, "logps/chosen": -341.17498779296875, "logps/rejected": -358.75, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -2.4615111351013184, "rewards/margins": 6.588281154632568, "rewards/rejected": -9.05078125, "step": 5780 }, { "epoch": 2.180739903981926, "grad_norm": 2.765710592258824, "learning_rate": 4.548964218455744e-07, "logits/chosen": -2.166699171066284, "logits/rejected": -2.3169922828674316, "logps/chosen": -359.75, "logps/rejected": -373.5, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -2.235119581222534, "rewards/margins": 6.369531154632568, "rewards/rejected": -8.607812881469727, "step": 5790 }, { "epoch": 2.184505318648216, "grad_norm": 2.851877012370796, "learning_rate": 4.53954802259887e-07, "logits/chosen": -2.312304735183716, "logits/rejected": -2.3358397483825684, "logps/chosen": -365.42498779296875, "logps/rejected": -389.5, "loss": 0.0251, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.967993140220642, "rewards/margins": 6.241796970367432, "rewards/rejected": -8.206250190734863, "step": 5800 }, { "epoch": 2.188270733314506, "grad_norm": 1.5181138190273058, "learning_rate": 4.530131826741996e-07, "logits/chosen": -2.505175828933716, "logits/rejected": -2.4605469703674316, "logps/chosen": -332.1499938964844, "logps/rejected": -385.375, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": -2.3902831077575684, "rewards/margins": 6.336328029632568, "rewards/rejected": -8.729687690734863, "step": 5810 }, { "epoch": 2.1920361479807964, "grad_norm": 3.728889045228137, "learning_rate": 4.5207156308851225e-07, "logits/chosen": -2.2933592796325684, "logits/rejected": -2.378710985183716, "logps/chosen": -356.7124938964844, "logps/rejected": -387.4750061035156, "loss": 0.0214, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9626343250274658, "rewards/margins": 6.229296684265137, "rewards/rejected": -8.193750381469727, "step": 5820 }, { "epoch": 2.1958015626470866, "grad_norm": 5.431466460587323, "learning_rate": 4.5112994350282485e-07, "logits/chosen": -2.332226514816284, "logits/rejected": -2.2466797828674316, "logps/chosen": -365.29998779296875, "logps/rejected": -416.625, "loss": 0.0572, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5453124046325684, "rewards/margins": 6.146093845367432, "rewards/rejected": -8.698437690734863, "step": 5830 }, { "epoch": 2.199566977313377, "grad_norm": 31.91402340824485, "learning_rate": 4.5018832391713746e-07, "logits/chosen": -2.3504881858825684, "logits/rejected": -2.4205079078674316, "logps/chosen": -367.4624938964844, "logps/rejected": -392.54998779296875, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -2.759570360183716, "rewards/margins": 6.357812404632568, "rewards/rejected": -9.123437881469727, "step": 5840 }, { "epoch": 2.2033323919796666, "grad_norm": 6.651021493084366, "learning_rate": 4.4924670433145006e-07, "logits/chosen": -2.373828172683716, "logits/rejected": -2.278515577316284, "logps/chosen": -359.26251220703125, "logps/rejected": -407.9750061035156, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": -2.33251953125, "rewards/margins": 6.479296684265137, "rewards/rejected": -8.81640625, "step": 5850 }, { "epoch": 2.207097806645957, "grad_norm": 5.561762732973763, "learning_rate": 4.4830508474576267e-07, "logits/chosen": -2.318164110183716, "logits/rejected": -2.445117235183716, "logps/chosen": -399.8999938964844, "logps/rejected": -380.375, "loss": 0.0341, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.936914086341858, "rewards/margins": 5.848437309265137, "rewards/rejected": -7.782031059265137, "step": 5860 }, { "epoch": 2.210863221312247, "grad_norm": 29.00746243209693, "learning_rate": 4.473634651600753e-07, "logits/chosen": -2.397656202316284, "logits/rejected": -2.3223633766174316, "logps/chosen": -356.8374938964844, "logps/rejected": -390.29998779296875, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -2.298815965652466, "rewards/margins": 6.257031440734863, "rewards/rejected": -8.552343368530273, "step": 5870 }, { "epoch": 2.214628635978537, "grad_norm": 6.1164202528754785, "learning_rate": 4.4642184557438794e-07, "logits/chosen": -2.397265672683716, "logits/rejected": -2.525195360183716, "logps/chosen": -372.82501220703125, "logps/rejected": -402.82501220703125, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5243163108825684, "rewards/margins": 6.25390625, "rewards/rejected": -8.776562690734863, "step": 5880 }, { "epoch": 2.2183940506448274, "grad_norm": 6.019520804119123, "learning_rate": 4.4548022598870054e-07, "logits/chosen": -2.3641600608825684, "logits/rejected": -2.239550828933716, "logps/chosen": -345.875, "logps/rejected": -397.4750061035156, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.019238233566284, "rewards/margins": 5.982031345367432, "rewards/rejected": -8.00390625, "step": 5890 }, { "epoch": 2.222159465311117, "grad_norm": 27.23400676209143, "learning_rate": 4.4453860640301315e-07, "logits/chosen": -2.3804688453674316, "logits/rejected": -2.445507764816284, "logps/chosen": -376.79998779296875, "logps/rejected": -397.8500061035156, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": -2.251513719558716, "rewards/margins": 6.649218559265137, "rewards/rejected": -8.895312309265137, "step": 5900 }, { "epoch": 2.2259248799774074, "grad_norm": 5.529100292203322, "learning_rate": 4.4359698681732576e-07, "logits/chosen": -2.2421875, "logits/rejected": -2.3041014671325684, "logps/chosen": -367.98748779296875, "logps/rejected": -422.625, "loss": 0.0271, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.728515625, "rewards/margins": 6.6328125, "rewards/rejected": -9.361719131469727, "step": 5910 }, { "epoch": 2.2296902946436976, "grad_norm": 5.350655602808785, "learning_rate": 4.4265536723163836e-07, "logits/chosen": -2.3042969703674316, "logits/rejected": -2.5220704078674316, "logps/chosen": -376.79998779296875, "logps/rejected": -357.95001220703125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.4564452171325684, "rewards/margins": 6.328906059265137, "rewards/rejected": -8.794530868530273, "step": 5920 }, { "epoch": 2.233455709309988, "grad_norm": 2.8574433566750246, "learning_rate": 4.41713747645951e-07, "logits/chosen": -2.296875, "logits/rejected": -2.396289110183716, "logps/chosen": -341.6875, "logps/rejected": -355.125, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": -2.211132764816284, "rewards/margins": 6.193359375, "rewards/rejected": -8.400781631469727, "step": 5930 }, { "epoch": 2.237221123976278, "grad_norm": 42.04239988607671, "learning_rate": 4.4077212806026363e-07, "logits/chosen": -2.3519530296325684, "logits/rejected": -2.4228515625, "logps/chosen": -357.2749938964844, "logps/rejected": -371.125, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.881805419921875, "rewards/margins": 6.141406059265137, "rewards/rejected": -8.0234375, "step": 5940 }, { "epoch": 2.240986538642568, "grad_norm": 9.92151547564227, "learning_rate": 4.3983050847457623e-07, "logits/chosen": -2.287304639816284, "logits/rejected": -2.2503905296325684, "logps/chosen": -386.0, "logps/rejected": -382.29998779296875, "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.026684522628784, "rewards/margins": 6.300000190734863, "rewards/rejected": -8.331250190734863, "step": 5950 }, { "epoch": 2.244751953308858, "grad_norm": 18.631044345628663, "learning_rate": 4.3888888888888884e-07, "logits/chosen": -2.336132764816284, "logits/rejected": -2.139453172683716, "logps/chosen": -374.67498779296875, "logps/rejected": -434.42498779296875, "loss": 0.0172, "rewards/accuracies": 1.0, "rewards/chosen": -1.8689453601837158, "rewards/margins": 6.025000095367432, "rewards/rejected": -7.892187595367432, "step": 5960 }, { "epoch": 2.2485173679751482, "grad_norm": 4.763902081208366, "learning_rate": 4.3794726930320145e-07, "logits/chosen": -2.4380860328674316, "logits/rejected": -2.509960889816284, "logps/chosen": -364.73748779296875, "logps/rejected": -381.54998779296875, "loss": 0.0324, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.746557593345642, "rewards/margins": 6.0, "rewards/rejected": -7.75, "step": 5970 }, { "epoch": 2.2522827826414384, "grad_norm": 17.084545952528547, "learning_rate": 4.370056497175141e-07, "logits/chosen": -2.3251953125, "logits/rejected": -2.3521485328674316, "logps/chosen": -373.0249938964844, "logps/rejected": -377.1000061035156, "loss": 0.0402, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.142871141433716, "rewards/margins": 6.089062690734863, "rewards/rejected": -8.231249809265137, "step": 5980 }, { "epoch": 2.2560481973077287, "grad_norm": 13.616167927435905, "learning_rate": 4.3606403013182676e-07, "logits/chosen": -2.3109374046325684, "logits/rejected": -2.379687547683716, "logps/chosen": -359.13751220703125, "logps/rejected": -385.375, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -1.944726586341858, "rewards/margins": 6.123046875, "rewards/rejected": -8.071093559265137, "step": 5990 }, { "epoch": 2.2598136119740184, "grad_norm": 13.532113066568094, "learning_rate": 4.3512241054613937e-07, "logits/chosen": -2.211718797683716, "logits/rejected": -2.17578125, "logps/chosen": -365.2749938964844, "logps/rejected": -396.0249938964844, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.176806688308716, "rewards/margins": 6.446093559265137, "rewards/rejected": -8.625391006469727, "step": 6000 }, { "epoch": 2.2635790266403086, "grad_norm": 13.17434516587738, "learning_rate": 4.34180790960452e-07, "logits/chosen": -2.352734327316284, "logits/rejected": -2.383984327316284, "logps/chosen": -322.20001220703125, "logps/rejected": -373.57501220703125, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -2.845703125, "rewards/margins": 5.97265625, "rewards/rejected": -8.821093559265137, "step": 6010 }, { "epoch": 2.267344441306599, "grad_norm": 25.52097854980064, "learning_rate": 4.332391713747646e-07, "logits/chosen": -2.3529295921325684, "logits/rejected": -2.558398485183716, "logps/chosen": -336.2875061035156, "logps/rejected": -321.75, "loss": 0.0332, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.27490234375, "rewards/margins": 6.075781345367432, "rewards/rejected": -9.3515625, "step": 6020 }, { "epoch": 2.271109855972889, "grad_norm": 1.6056364427113183, "learning_rate": 4.322975517890772e-07, "logits/chosen": -2.3349609375, "logits/rejected": -2.4097657203674316, "logps/chosen": -351.42498779296875, "logps/rejected": -387.57501220703125, "loss": 0.0277, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.035888671875, "rewards/margins": 6.581250190734863, "rewards/rejected": -9.619531631469727, "step": 6030 }, { "epoch": 2.2748752706391793, "grad_norm": 45.82016284804068, "learning_rate": 4.3135593220338985e-07, "logits/chosen": -2.4078125953674316, "logits/rejected": -2.421679735183716, "logps/chosen": -346.13751220703125, "logps/rejected": -415.4750061035156, "loss": 0.0231, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.584277391433716, "rewards/margins": 6.61328125, "rewards/rejected": -9.201562881469727, "step": 6040 }, { "epoch": 2.2786406853054695, "grad_norm": 10.723706587610058, "learning_rate": 4.3041431261770245e-07, "logits/chosen": -2.389453172683716, "logits/rejected": -2.525390625, "logps/chosen": -406.20001220703125, "logps/rejected": -409.32501220703125, "loss": 0.0364, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2955079078674316, "rewards/margins": 6.441015720367432, "rewards/rejected": -8.736719131469727, "step": 6050 }, { "epoch": 2.2824060999717593, "grad_norm": 10.840705980280998, "learning_rate": 4.2947269303201506e-07, "logits/chosen": -2.2509765625, "logits/rejected": -2.51953125, "logps/chosen": -389.7250061035156, "logps/rejected": -399.875, "loss": 0.0243, "rewards/accuracies": 1.0, "rewards/chosen": -3.055371046066284, "rewards/margins": 6.513281345367432, "rewards/rejected": -9.575780868530273, "step": 6060 }, { "epoch": 2.2861715146380495, "grad_norm": 9.91463466955974, "learning_rate": 4.2853107344632767e-07, "logits/chosen": -2.4029297828674316, "logits/rejected": -2.5296874046325684, "logps/chosen": -359.625, "logps/rejected": -360.3500061035156, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.267382860183716, "rewards/margins": 6.173437595367432, "rewards/rejected": -9.4453125, "step": 6070 }, { "epoch": 2.2899369293043397, "grad_norm": 7.362312101366391, "learning_rate": 4.275894538606403e-07, "logits/chosen": -2.341992139816284, "logits/rejected": -2.315722703933716, "logps/chosen": -327.42498779296875, "logps/rejected": -380.0249938964844, "loss": 0.0245, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7938475608825684, "rewards/margins": 6.45703125, "rewards/rejected": -9.25390625, "step": 6080 }, { "epoch": 2.29370234397063, "grad_norm": 6.335222814193443, "learning_rate": 4.266478342749529e-07, "logits/chosen": -2.423046827316284, "logits/rejected": -2.395312547683716, "logps/chosen": -360.2250061035156, "logps/rejected": -403.4750061035156, "loss": 0.0219, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4781494140625, "rewards/margins": 6.693749904632568, "rewards/rejected": -9.170312881469727, "step": 6090 }, { "epoch": 2.2974677586369197, "grad_norm": 9.420367139773736, "learning_rate": 4.2570621468926554e-07, "logits/chosen": -2.381054639816284, "logits/rejected": -2.391796827316284, "logps/chosen": -397.875, "logps/rejected": -415.42498779296875, "loss": 0.0192, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1119751930236816, "rewards/margins": 6.669921875, "rewards/rejected": -8.785937309265137, "step": 6100 }, { "epoch": 2.30123317330321, "grad_norm": 2.058574358486619, "learning_rate": 4.2476459510357815e-07, "logits/chosen": -2.4595704078674316, "logits/rejected": -2.5191407203674316, "logps/chosen": -349.88751220703125, "logps/rejected": -375.79998779296875, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8531005382537842, "rewards/margins": 6.544921875, "rewards/rejected": -8.40234375, "step": 6110 }, { "epoch": 2.3049985879695, "grad_norm": 17.41185979486115, "learning_rate": 4.2382297551789075e-07, "logits/chosen": -2.4623045921325684, "logits/rejected": -2.4837889671325684, "logps/chosen": -366.20001220703125, "logps/rejected": -391.7749938964844, "loss": 0.0314, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3921875953674316, "rewards/margins": 6.318749904632568, "rewards/rejected": -8.710156440734863, "step": 6120 }, { "epoch": 2.3087640026357903, "grad_norm": 13.138854375409752, "learning_rate": 4.2288135593220336e-07, "logits/chosen": -2.341503858566284, "logits/rejected": -2.3306641578674316, "logps/chosen": -400.04998779296875, "logps/rejected": -428.04998779296875, "loss": 0.0256, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6751341819763184, "rewards/margins": 6.743750095367432, "rewards/rejected": -9.414843559265137, "step": 6130 }, { "epoch": 2.3125294173020805, "grad_norm": 2.9076944256896193, "learning_rate": 4.2193973634651596e-07, "logits/chosen": -2.255859375, "logits/rejected": -2.3931641578674316, "logps/chosen": -401.625, "logps/rejected": -406.8500061035156, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": -2.2168946266174316, "rewards/margins": 6.532812595367432, "rewards/rejected": -8.749218940734863, "step": 6140 }, { "epoch": 2.3162948319683707, "grad_norm": 29.606625280575923, "learning_rate": 4.2099811676082857e-07, "logits/chosen": -2.366406202316284, "logits/rejected": -2.498828172683716, "logps/chosen": -365.7250061035156, "logps/rejected": -394.7250061035156, "loss": 0.0328, "rewards/accuracies": 1.0, "rewards/chosen": -2.320690870285034, "rewards/margins": 6.385937690734863, "rewards/rejected": -8.705469131469727, "step": 6150 }, { "epoch": 2.3200602466346605, "grad_norm": 5.687831225804184, "learning_rate": 4.2005649717514123e-07, "logits/chosen": -2.3529295921325684, "logits/rejected": -2.271289110183716, "logps/chosen": -381.6000061035156, "logps/rejected": -420.25, "loss": 0.025, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.713085889816284, "rewards/margins": 6.410937309265137, "rewards/rejected": -9.124218940734863, "step": 6160 }, { "epoch": 2.3238256613009507, "grad_norm": 3.0224178495709317, "learning_rate": 4.1911487758945384e-07, "logits/chosen": -2.4623045921325684, "logits/rejected": -2.4712891578674316, "logps/chosen": -369.29998779296875, "logps/rejected": -382.0, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": -2.2755370140075684, "rewards/margins": 6.76953125, "rewards/rejected": -9.046875, "step": 6170 }, { "epoch": 2.327591075967241, "grad_norm": 4.139677856445872, "learning_rate": 4.1817325800376644e-07, "logits/chosen": -2.275390625, "logits/rejected": -2.380664110183716, "logps/chosen": -348.2749938964844, "logps/rejected": -391.8500061035156, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -2.6363282203674316, "rewards/margins": 6.48828125, "rewards/rejected": -9.119531631469727, "step": 6180 }, { "epoch": 2.331356490633531, "grad_norm": 6.72001374671472, "learning_rate": 4.1723163841807905e-07, "logits/chosen": -2.3023438453674316, "logits/rejected": -2.41552734375, "logps/chosen": -400.875, "logps/rejected": -431.4750061035156, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -2.989501953125, "rewards/margins": 6.796093940734863, "rewards/rejected": -9.788281440734863, "step": 6190 }, { "epoch": 2.3351219052998213, "grad_norm": 6.452192374689596, "learning_rate": 4.1629001883239166e-07, "logits/chosen": -2.3773436546325684, "logits/rejected": -2.3912110328674316, "logps/chosen": -348.2124938964844, "logps/rejected": -404.7749938964844, "loss": 0.0224, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.59912109375, "rewards/margins": 6.833593845367432, "rewards/rejected": -9.432812690734863, "step": 6200 }, { "epoch": 2.338887319966111, "grad_norm": 14.141189995882225, "learning_rate": 4.153483992467043e-07, "logits/chosen": -2.3642578125, "logits/rejected": -2.367968797683716, "logps/chosen": -381.92498779296875, "logps/rejected": -418.8500061035156, "loss": 0.05, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5815186500549316, "rewards/margins": 6.425000190734863, "rewards/rejected": -9.007031440734863, "step": 6210 }, { "epoch": 2.3426527346324013, "grad_norm": 32.553399192321145, "learning_rate": 4.1440677966101697e-07, "logits/chosen": -2.20654296875, "logits/rejected": -2.3589844703674316, "logps/chosen": -377.42498779296875, "logps/rejected": -409.2749938964844, "loss": 0.0339, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2342286109924316, "rewards/margins": 7.025781154632568, "rewards/rejected": -9.258593559265137, "step": 6220 }, { "epoch": 2.3464181492986915, "grad_norm": 21.58627375231656, "learning_rate": 4.134651600753296e-07, "logits/chosen": -2.4126954078674316, "logits/rejected": -2.452929735183716, "logps/chosen": -356.2875061035156, "logps/rejected": -362.45001220703125, "loss": 0.049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1580810546875, "rewards/margins": 6.321875095367432, "rewards/rejected": -8.479687690734863, "step": 6230 }, { "epoch": 2.3501835639649817, "grad_norm": 5.835254302945561, "learning_rate": 4.125235404896422e-07, "logits/chosen": -2.3720703125, "logits/rejected": -2.323046922683716, "logps/chosen": -392.2250061035156, "logps/rejected": -408.625, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.320605516433716, "rewards/margins": 6.608593940734863, "rewards/rejected": -8.930468559265137, "step": 6240 }, { "epoch": 2.353948978631272, "grad_norm": 6.109320813157088, "learning_rate": 4.115819209039548e-07, "logits/chosen": -2.375, "logits/rejected": -2.3041014671325684, "logps/chosen": -353.9750061035156, "logps/rejected": -387.6000061035156, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -2.003124952316284, "rewards/margins": 6.37890625, "rewards/rejected": -8.378125190734863, "step": 6250 }, { "epoch": 2.3577143932975617, "grad_norm": 10.444468156389638, "learning_rate": 4.106403013182674e-07, "logits/chosen": -2.2962889671325684, "logits/rejected": -2.435351610183716, "logps/chosen": -362.57501220703125, "logps/rejected": -422.2250061035156, "loss": 0.0393, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.922900438308716, "rewards/margins": 6.567187309265137, "rewards/rejected": -9.486719131469727, "step": 6260 }, { "epoch": 2.361479807963852, "grad_norm": 21.467695152963817, "learning_rate": 4.0969868173258006e-07, "logits/chosen": -2.4896483421325684, "logits/rejected": -2.552539110183716, "logps/chosen": -330.67498779296875, "logps/rejected": -354.7250061035156, "loss": 0.0378, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.843945264816284, "rewards/margins": 6.157031059265137, "rewards/rejected": -9.000781059265137, "step": 6270 }, { "epoch": 2.365245222630142, "grad_norm": 6.170955969746833, "learning_rate": 4.0875706214689266e-07, "logits/chosen": -2.3755860328674316, "logits/rejected": -2.5142579078674316, "logps/chosen": -378.32501220703125, "logps/rejected": -361.2749938964844, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -2.152783155441284, "rewards/margins": 6.545312404632568, "rewards/rejected": -8.69140625, "step": 6280 }, { "epoch": 2.3690106372964324, "grad_norm": 16.78907522405875, "learning_rate": 4.0781544256120527e-07, "logits/chosen": -2.1456055641174316, "logits/rejected": -2.254101514816284, "logps/chosen": -408.8999938964844, "logps/rejected": -407.1499938964844, "loss": 0.0507, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.436474561691284, "rewards/margins": 6.219531059265137, "rewards/rejected": -8.659375190734863, "step": 6290 }, { "epoch": 2.3727760519627226, "grad_norm": 4.183344788887618, "learning_rate": 4.068738229755179e-07, "logits/chosen": -2.4964842796325684, "logits/rejected": -2.4828124046325684, "logps/chosen": -321.45001220703125, "logps/rejected": -356.6625061035156, "loss": 0.0703, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8482422828674316, "rewards/margins": 6.612500190734863, "rewards/rejected": -9.458593368530273, "step": 6300 }, { "epoch": 2.3765414666290123, "grad_norm": 8.151392576296361, "learning_rate": 4.059322033898305e-07, "logits/chosen": -2.401562452316284, "logits/rejected": -2.3648438453674316, "logps/chosen": -378.0, "logps/rejected": -407.29998779296875, "loss": 0.0456, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4512696266174316, "rewards/margins": 6.375, "rewards/rejected": -8.828125, "step": 6310 }, { "epoch": 2.3803068812953025, "grad_norm": 1.9465210196876601, "learning_rate": 4.049905838041431e-07, "logits/chosen": -2.1131834983825684, "logits/rejected": -2.160937547683716, "logps/chosen": -397.1000061035156, "logps/rejected": -421.7749938964844, "loss": 0.0143, "rewards/accuracies": 1.0, "rewards/chosen": -2.4229493141174316, "rewards/margins": 6.330078125, "rewards/rejected": -8.755468368530273, "step": 6320 }, { "epoch": 2.3840722959615928, "grad_norm": 13.204634756016686, "learning_rate": 4.0404896421845575e-07, "logits/chosen": -2.220996141433716, "logits/rejected": -2.297656297683716, "logps/chosen": -382.2749938964844, "logps/rejected": -396.2749938964844, "loss": 0.0518, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0047607421875, "rewards/margins": 6.387109279632568, "rewards/rejected": -8.391406059265137, "step": 6330 }, { "epoch": 2.387837710627883, "grad_norm": 24.186717360307355, "learning_rate": 4.0310734463276835e-07, "logits/chosen": -2.287304639816284, "logits/rejected": -2.5228514671325684, "logps/chosen": -330.1625061035156, "logps/rejected": -371.45001220703125, "loss": 0.0164, "rewards/accuracies": 1.0, "rewards/chosen": -1.5111815929412842, "rewards/margins": 6.390234470367432, "rewards/rejected": -7.897656440734863, "step": 6340 }, { "epoch": 2.391603125294173, "grad_norm": 4.739045818572183, "learning_rate": 4.0216572504708096e-07, "logits/chosen": -2.328418016433716, "logits/rejected": -2.327343702316284, "logps/chosen": -374.9750061035156, "logps/rejected": -411.6499938964844, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -2.2420897483825684, "rewards/margins": 6.54296875, "rewards/rejected": -8.788281440734863, "step": 6350 }, { "epoch": 2.3953685399604634, "grad_norm": 5.2354098071619, "learning_rate": 4.0122410546139357e-07, "logits/chosen": -2.509765625, "logits/rejected": -2.541210889816284, "logps/chosen": -342.32501220703125, "logps/rejected": -388.7749938964844, "loss": 0.0442, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.636425733566284, "rewards/margins": 6.146093845367432, "rewards/rejected": -8.785937309265137, "step": 6360 }, { "epoch": 2.399133954626753, "grad_norm": 10.18781570755564, "learning_rate": 4.002824858757062e-07, "logits/chosen": -2.397265672683716, "logits/rejected": -2.2676758766174316, "logps/chosen": -350.5625, "logps/rejected": -406.1000061035156, "loss": 0.0238, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.828418016433716, "rewards/margins": 6.533593654632568, "rewards/rejected": -9.367968559265137, "step": 6370 }, { "epoch": 2.4028993692930434, "grad_norm": 3.1924354190889948, "learning_rate": 3.9934086629001883e-07, "logits/chosen": -2.402148485183716, "logits/rejected": -2.5882811546325684, "logps/chosen": -351.38751220703125, "logps/rejected": -355.125, "loss": 0.0176, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.458203077316284, "rewards/margins": 6.188281059265137, "rewards/rejected": -8.646093368530273, "step": 6380 }, { "epoch": 2.4066647839593336, "grad_norm": 9.568593005246232, "learning_rate": 3.9839924670433144e-07, "logits/chosen": -2.243359327316284, "logits/rejected": -2.3626952171325684, "logps/chosen": -374.79998779296875, "logps/rejected": -361.42498779296875, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.239306688308716, "rewards/margins": 6.364843845367432, "rewards/rejected": -8.602343559265137, "step": 6390 }, { "epoch": 2.410430198625624, "grad_norm": 5.047486743077962, "learning_rate": 3.9745762711864405e-07, "logits/chosen": -2.267382860183716, "logits/rejected": -2.4164061546325684, "logps/chosen": -400.3999938964844, "logps/rejected": -407.6000061035156, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -2.8213133811950684, "rewards/margins": 6.360156059265137, "rewards/rejected": -9.18359375, "step": 6400 }, { "epoch": 2.4141956132919136, "grad_norm": 3.7644519205990963, "learning_rate": 3.9651600753295665e-07, "logits/chosen": -2.366015672683716, "logits/rejected": -2.341796875, "logps/chosen": -387.7250061035156, "logps/rejected": -427.2749938964844, "loss": 0.0297, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.126757860183716, "rewards/margins": 6.489062309265137, "rewards/rejected": -9.62109375, "step": 6410 }, { "epoch": 2.417961027958204, "grad_norm": 7.2527218651659915, "learning_rate": 3.9557438794726926e-07, "logits/chosen": -2.3094725608825684, "logits/rejected": -2.369921922683716, "logps/chosen": -332.7250061035156, "logps/rejected": -409.7250061035156, "loss": 0.013, "rewards/accuracies": 1.0, "rewards/chosen": -2.9242186546325684, "rewards/margins": 6.432812690734863, "rewards/rejected": -9.357030868530273, "step": 6420 }, { "epoch": 2.421726442624494, "grad_norm": 10.268917957711958, "learning_rate": 3.9463276836158186e-07, "logits/chosen": -2.372851610183716, "logits/rejected": -2.459179639816284, "logps/chosen": -397.3500061035156, "logps/rejected": -416.82501220703125, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -2.6199951171875, "rewards/margins": 6.348437309265137, "rewards/rejected": -8.967968940734863, "step": 6430 }, { "epoch": 2.425491857290784, "grad_norm": 1.0175455628472267, "learning_rate": 3.936911487758945e-07, "logits/chosen": -2.593554735183716, "logits/rejected": -2.59765625, "logps/chosen": -316.3500061035156, "logps/rejected": -333.3500061035156, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": -2.578076124191284, "rewards/margins": 6.622265815734863, "rewards/rejected": -9.193750381469727, "step": 6440 }, { "epoch": 2.4292572719570744, "grad_norm": 8.07633529312297, "learning_rate": 3.9274952919020713e-07, "logits/chosen": -2.38525390625, "logits/rejected": -2.5673828125, "logps/chosen": -431.2250061035156, "logps/rejected": -432.79998779296875, "loss": 0.043, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.167895555496216, "rewards/margins": 6.407812595367432, "rewards/rejected": -9.577343940734863, "step": 6450 }, { "epoch": 2.4330226866233646, "grad_norm": 8.23655527012357, "learning_rate": 3.9180790960451974e-07, "logits/chosen": -2.4136719703674316, "logits/rejected": -2.368359327316284, "logps/chosen": -383.79998779296875, "logps/rejected": -416.51251220703125, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -2.6753907203674316, "rewards/margins": 7.006249904632568, "rewards/rejected": -9.684374809265137, "step": 6460 }, { "epoch": 2.4367881012896544, "grad_norm": 43.257740389803224, "learning_rate": 3.9086629001883234e-07, "logits/chosen": -2.41015625, "logits/rejected": -2.5005860328674316, "logps/chosen": -383.2749938964844, "logps/rejected": -393.3999938964844, "loss": 0.0594, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.3384766578674316, "rewards/margins": 6.172265529632568, "rewards/rejected": -9.508593559265137, "step": 6470 }, { "epoch": 2.4405535159559446, "grad_norm": 44.359679725718465, "learning_rate": 3.89924670433145e-07, "logits/chosen": -2.5009765625, "logits/rejected": -2.5472655296325684, "logps/chosen": -372.5375061035156, "logps/rejected": -407.3500061035156, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.654296875, "rewards/margins": 6.464062690734863, "rewards/rejected": -10.114062309265137, "step": 6480 }, { "epoch": 2.444318930622235, "grad_norm": 1.6420565207002815, "learning_rate": 3.889830508474576e-07, "logits/chosen": -2.428906202316284, "logits/rejected": -2.478710889816284, "logps/chosen": -385.9750061035156, "logps/rejected": -416.29998779296875, "loss": 0.0443, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.6253905296325684, "rewards/margins": 6.880468845367432, "rewards/rejected": -10.509374618530273, "step": 6490 }, { "epoch": 2.448084345288525, "grad_norm": 6.888964553923304, "learning_rate": 3.8804143126177027e-07, "logits/chosen": -2.656445264816284, "logits/rejected": -2.5230469703674316, "logps/chosen": -327.625, "logps/rejected": -395.2250061035156, "loss": 0.0237, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.284716844558716, "rewards/margins": 7.002734184265137, "rewards/rejected": -10.289843559265137, "step": 6500 }, { "epoch": 2.451849759954815, "grad_norm": 24.234256528129833, "learning_rate": 3.8709981167608287e-07, "logits/chosen": -2.4740233421325684, "logits/rejected": -2.3456053733825684, "logps/chosen": -342.7250061035156, "logps/rejected": -396.7749938964844, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -3.743457078933716, "rewards/margins": 6.352343559265137, "rewards/rejected": -10.096094131469727, "step": 6510 }, { "epoch": 2.455615174621105, "grad_norm": 26.559371373193798, "learning_rate": 3.861581920903955e-07, "logits/chosen": -2.367968797683716, "logits/rejected": -2.4814453125, "logps/chosen": -356.5874938964844, "logps/rejected": -377.125, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": -2.716015577316284, "rewards/margins": 6.993750095367432, "rewards/rejected": -9.70703125, "step": 6520 }, { "epoch": 2.4593805892873952, "grad_norm": 5.330607189832248, "learning_rate": 3.852165725047081e-07, "logits/chosen": -2.501171827316284, "logits/rejected": -2.508593797683716, "logps/chosen": -376.7250061035156, "logps/rejected": -402.17498779296875, "loss": 0.0198, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.624706983566284, "rewards/margins": 6.642968654632568, "rewards/rejected": -9.272656440734863, "step": 6530 }, { "epoch": 2.4631460039536854, "grad_norm": 3.4301475265743475, "learning_rate": 3.842749529190207e-07, "logits/chosen": -2.432812452316284, "logits/rejected": -2.5853514671325684, "logps/chosen": -373.32501220703125, "logps/rejected": -388.70001220703125, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.575976610183716, "rewards/margins": 6.317187309265137, "rewards/rejected": -8.89453125, "step": 6540 }, { "epoch": 2.4669114186199756, "grad_norm": 3.866234286422724, "learning_rate": 3.8333333333333335e-07, "logits/chosen": -2.5814452171325684, "logits/rejected": -2.475781202316284, "logps/chosen": -352.45001220703125, "logps/rejected": -408.7250061035156, "loss": 0.017, "rewards/accuracies": 1.0, "rewards/chosen": -2.7052245140075684, "rewards/margins": 6.946484565734863, "rewards/rejected": -9.649218559265137, "step": 6550 }, { "epoch": 2.470676833286266, "grad_norm": 5.9063250955024085, "learning_rate": 3.8239171374764596e-07, "logits/chosen": -2.447460889816284, "logits/rejected": -2.5726561546325684, "logps/chosen": -370.3999938964844, "logps/rejected": -387.57501220703125, "loss": 0.0409, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.667773485183716, "rewards/margins": 6.50390625, "rewards/rejected": -9.178125381469727, "step": 6560 }, { "epoch": 2.4744422479525556, "grad_norm": 8.824186212002518, "learning_rate": 3.8145009416195856e-07, "logits/chosen": -2.5248045921325684, "logits/rejected": -2.531054735183716, "logps/chosen": -321.1875, "logps/rejected": -404.1499938964844, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": -2.966552734375, "rewards/margins": 7.435937404632568, "rewards/rejected": -10.40234375, "step": 6570 }, { "epoch": 2.478207662618846, "grad_norm": 7.659146107372641, "learning_rate": 3.8050847457627117e-07, "logits/chosen": -2.461132764816284, "logits/rejected": -2.5638670921325684, "logps/chosen": -414.51251220703125, "logps/rejected": -418.32501220703125, "loss": 0.0162, "rewards/accuracies": 1.0, "rewards/chosen": -3.0782227516174316, "rewards/margins": 6.880468845367432, "rewards/rejected": -9.960156440734863, "step": 6580 }, { "epoch": 2.481973077285136, "grad_norm": 0.5097983295095814, "learning_rate": 3.795668549905838e-07, "logits/chosen": -2.5810546875, "logits/rejected": -2.538867235183716, "logps/chosen": -347.54998779296875, "logps/rejected": -416.7749938964844, "loss": 0.027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.068359375, "rewards/margins": 7.141406059265137, "rewards/rejected": -11.208593368530273, "step": 6590 }, { "epoch": 2.4857384919514263, "grad_norm": 48.390077435042734, "learning_rate": 3.786252354048964e-07, "logits/chosen": -2.4048829078674316, "logits/rejected": -2.256640672683716, "logps/chosen": -402.61248779296875, "logps/rejected": -399.79998779296875, "loss": 0.0341, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.227560520172119, "rewards/margins": 6.733202934265137, "rewards/rejected": -9.960156440734863, "step": 6600 }, { "epoch": 2.489503906617716, "grad_norm": 14.646491906084586, "learning_rate": 3.7768361581920904e-07, "logits/chosen": -2.486132860183716, "logits/rejected": -2.486328125, "logps/chosen": -342.7749938964844, "logps/rejected": -408.7250061035156, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.582714796066284, "rewards/margins": 6.71484375, "rewards/rejected": -10.297656059265137, "step": 6610 }, { "epoch": 2.4932693212840062, "grad_norm": 1.4426308347547572, "learning_rate": 3.7674199623352165e-07, "logits/chosen": -2.436718702316284, "logits/rejected": -2.4212889671325684, "logps/chosen": -370.125, "logps/rejected": -420.3500061035156, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -3.264404296875, "rewards/margins": 7.125781059265137, "rewards/rejected": -10.385937690734863, "step": 6620 }, { "epoch": 2.4970347359502965, "grad_norm": 1.4817257473358545, "learning_rate": 3.7580037664783425e-07, "logits/chosen": -2.5015625953674316, "logits/rejected": -2.590625047683716, "logps/chosen": -355.8999938964844, "logps/rejected": -381.6499938964844, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -3.3230223655700684, "rewards/margins": 7.244531154632568, "rewards/rejected": -10.571874618530273, "step": 6630 }, { "epoch": 2.5008001506165867, "grad_norm": 10.585272090166935, "learning_rate": 3.7485875706214686e-07, "logits/chosen": -2.3248047828674316, "logits/rejected": -2.423632860183716, "logps/chosen": -381.45001220703125, "logps/rejected": -404.45001220703125, "loss": 0.028, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.215136766433716, "rewards/margins": 6.898046970367432, "rewards/rejected": -10.110156059265137, "step": 6640 }, { "epoch": 2.504565565282877, "grad_norm": 2.472522716091627, "learning_rate": 3.7391713747645947e-07, "logits/chosen": -2.5845704078674316, "logits/rejected": -2.5726561546325684, "logps/chosen": -348.48748779296875, "logps/rejected": -398.45001220703125, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -2.955273389816284, "rewards/margins": 6.782812595367432, "rewards/rejected": -9.732812881469727, "step": 6650 }, { "epoch": 2.508330979949167, "grad_norm": 13.747474475308707, "learning_rate": 3.7297551789077207e-07, "logits/chosen": -2.5697264671325684, "logits/rejected": -2.591992139816284, "logps/chosen": -350.11248779296875, "logps/rejected": -384.1499938964844, "loss": 0.0395, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.300830125808716, "rewards/margins": 6.641015529632568, "rewards/rejected": -9.935937881469727, "step": 6660 }, { "epoch": 2.512096394615457, "grad_norm": 5.8525618494529885, "learning_rate": 3.7203389830508473e-07, "logits/chosen": -2.508593797683716, "logits/rejected": -2.5150389671325684, "logps/chosen": -342.2250061035156, "logps/rejected": -384.125, "loss": 0.0336, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2353453636169434, "rewards/margins": 6.927734375, "rewards/rejected": -10.1640625, "step": 6670 }, { "epoch": 2.515861809281747, "grad_norm": 5.527694139943655, "learning_rate": 3.7109227871939734e-07, "logits/chosen": -2.4751954078674316, "logits/rejected": -2.3939452171325684, "logps/chosen": -357.07501220703125, "logps/rejected": -438.4750061035156, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.969970703125, "rewards/margins": 6.687890529632568, "rewards/rejected": -9.665624618530273, "step": 6680 }, { "epoch": 2.5196272239480373, "grad_norm": 2.999676862015879, "learning_rate": 3.7015065913370995e-07, "logits/chosen": -2.439648389816284, "logits/rejected": -2.411328077316284, "logps/chosen": -365.75, "logps/rejected": -398.0, "loss": 0.0368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.395800828933716, "rewards/margins": 7.091796875, "rewards/rejected": -9.48828125, "step": 6690 }, { "epoch": 2.5233926386143275, "grad_norm": 4.574401060244411, "learning_rate": 3.6920903954802255e-07, "logits/chosen": -2.449414014816284, "logits/rejected": -2.476757764816284, "logps/chosen": -347.70001220703125, "logps/rejected": -396.82501220703125, "loss": 0.0341, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8355469703674316, "rewards/margins": 6.548437595367432, "rewards/rejected": -9.379687309265137, "step": 6700 }, { "epoch": 2.5271580532806173, "grad_norm": 15.86339765849825, "learning_rate": 3.6826741996233516e-07, "logits/chosen": -2.45703125, "logits/rejected": -2.3910155296325684, "logps/chosen": -377.5249938964844, "logps/rejected": -423.0249938964844, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -3.1776123046875, "rewards/margins": 6.678906440734863, "rewards/rejected": -9.8515625, "step": 6710 }, { "epoch": 2.5309234679469075, "grad_norm": 5.537229255423815, "learning_rate": 3.673258003766478e-07, "logits/chosen": -2.4478516578674316, "logits/rejected": -2.427929639816284, "logps/chosen": -349.125, "logps/rejected": -393.82501220703125, "loss": 0.0301, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.992919921875, "rewards/margins": 6.442578315734863, "rewards/rejected": -9.436718940734863, "step": 6720 }, { "epoch": 2.5346888826131977, "grad_norm": 38.54306231671961, "learning_rate": 3.663841807909605e-07, "logits/chosen": -2.3832030296325684, "logits/rejected": -2.489453077316284, "logps/chosen": -377.1499938964844, "logps/rejected": -372.29998779296875, "loss": 0.0439, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.747607469558716, "rewards/margins": 6.498827934265137, "rewards/rejected": -9.246874809265137, "step": 6730 }, { "epoch": 2.538454297279488, "grad_norm": 5.232612971645867, "learning_rate": 3.654425612052731e-07, "logits/chosen": -2.451171875, "logits/rejected": -2.368847608566284, "logps/chosen": -384.82501220703125, "logps/rejected": -440.54998779296875, "loss": 0.0328, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.910449266433716, "rewards/margins": 6.862500190734863, "rewards/rejected": -9.76953125, "step": 6740 }, { "epoch": 2.542219711945778, "grad_norm": 8.632566451030387, "learning_rate": 3.645009416195857e-07, "logits/chosen": -2.4027342796325684, "logits/rejected": -2.4458985328674316, "logps/chosen": -367.42498779296875, "logps/rejected": -420.8999938964844, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -3.101806640625, "rewards/margins": 6.6171875, "rewards/rejected": -9.72265625, "step": 6750 }, { "epoch": 2.5459851266120683, "grad_norm": 2.6320373471397245, "learning_rate": 3.635593220338983e-07, "logits/chosen": -2.391406297683716, "logits/rejected": -2.4541015625, "logps/chosen": -383.875, "logps/rejected": -423.5249938964844, "loss": 0.0336, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.268749952316284, "rewards/margins": 7.079687595367432, "rewards/rejected": -10.345312118530273, "step": 6760 }, { "epoch": 2.5497505412783585, "grad_norm": 14.543066528080951, "learning_rate": 3.626177024482109e-07, "logits/chosen": -2.5859375, "logits/rejected": -2.536914110183716, "logps/chosen": -372.07501220703125, "logps/rejected": -398.29998779296875, "loss": 0.0454, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.399121046066284, "rewards/margins": 6.319140434265137, "rewards/rejected": -9.717187881469727, "step": 6770 }, { "epoch": 2.5535159559446483, "grad_norm": 13.69018809583001, "learning_rate": 3.6167608286252356e-07, "logits/chosen": -2.366992235183716, "logits/rejected": -2.5498046875, "logps/chosen": -385.67498779296875, "logps/rejected": -389.7749938964844, "loss": 0.0434, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2779297828674316, "rewards/margins": 6.151562690734863, "rewards/rejected": -9.432031631469727, "step": 6780 }, { "epoch": 2.5572813706109385, "grad_norm": 10.784377952456115, "learning_rate": 3.6073446327683617e-07, "logits/chosen": -2.4356446266174316, "logits/rejected": -2.467578172683716, "logps/chosen": -348.20001220703125, "logps/rejected": -426.75, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5647215843200684, "rewards/margins": 7.021093845367432, "rewards/rejected": -10.579687118530273, "step": 6790 }, { "epoch": 2.5610467852772287, "grad_norm": 5.538295751007449, "learning_rate": 3.5979284369114877e-07, "logits/chosen": -2.421093702316284, "logits/rejected": -2.4322266578674316, "logps/chosen": -372.2250061035156, "logps/rejected": -414.95001220703125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -3.0322265625, "rewards/margins": 7.302343845367432, "rewards/rejected": -10.336718559265137, "step": 6800 }, { "epoch": 2.564812199943519, "grad_norm": 6.836730353900834, "learning_rate": 3.588512241054614e-07, "logits/chosen": -2.4652342796325684, "logits/rejected": -2.4820313453674316, "logps/chosen": -344.07501220703125, "logps/rejected": -385.1499938964844, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -2.3873047828674316, "rewards/margins": 6.603906154632568, "rewards/rejected": -8.989062309265137, "step": 6810 }, { "epoch": 2.5685776146098087, "grad_norm": 8.418037348402157, "learning_rate": 3.57909604519774e-07, "logits/chosen": -2.537890672683716, "logits/rejected": -2.5404295921325684, "logps/chosen": -339.7875061035156, "logps/rejected": -379.70001220703125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -2.77294921875, "rewards/margins": 6.721093654632568, "rewards/rejected": -9.495312690734863, "step": 6820 }, { "epoch": 2.572343029276099, "grad_norm": 92.3268486896674, "learning_rate": 3.569679849340866e-07, "logits/chosen": -2.482226610183716, "logits/rejected": -2.4306640625, "logps/chosen": -324.4125061035156, "logps/rejected": -385.0, "loss": 0.0439, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.473925828933716, "rewards/margins": 6.655077934265137, "rewards/rejected": -9.130468368530273, "step": 6830 }, { "epoch": 2.576108443942389, "grad_norm": 13.538300241542498, "learning_rate": 3.5602636534839925e-07, "logits/chosen": -2.371875047683716, "logits/rejected": -2.3984375, "logps/chosen": -369.79998779296875, "logps/rejected": -390.0249938964844, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.600512742996216, "rewards/margins": 6.617968559265137, "rewards/rejected": -9.2109375, "step": 6840 }, { "epoch": 2.5798738586086793, "grad_norm": 42.64415706516517, "learning_rate": 3.5508474576271186e-07, "logits/chosen": -2.421679735183716, "logits/rejected": -2.5472655296325684, "logps/chosen": -389.5, "logps/rejected": -390.125, "loss": 0.0496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.811962842941284, "rewards/margins": 6.436718940734863, "rewards/rejected": -9.253125190734863, "step": 6850 }, { "epoch": 2.5836392732749696, "grad_norm": 4.315002992782844, "learning_rate": 3.5414312617702446e-07, "logits/chosen": -2.4496092796325684, "logits/rejected": -2.3666014671325684, "logps/chosen": -357.95001220703125, "logps/rejected": -418.67498779296875, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -2.944531202316284, "rewards/margins": 6.513671875, "rewards/rejected": -9.459375381469727, "step": 6860 }, { "epoch": 2.5874046879412598, "grad_norm": 56.158499706514725, "learning_rate": 3.5320150659133707e-07, "logits/chosen": -2.422656297683716, "logits/rejected": -2.356640577316284, "logps/chosen": -329.45001220703125, "logps/rejected": -364.8500061035156, "loss": 0.0387, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.692675828933716, "rewards/margins": 6.373827934265137, "rewards/rejected": -9.06640625, "step": 6870 }, { "epoch": 2.5911701026075495, "grad_norm": 27.830162022787547, "learning_rate": 3.522598870056497e-07, "logits/chosen": -2.4332032203674316, "logits/rejected": -2.365039110183716, "logps/chosen": -376.20001220703125, "logps/rejected": -389.25, "loss": 0.0384, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.326171875, "rewards/margins": 6.43359375, "rewards/rejected": -9.758593559265137, "step": 6880 }, { "epoch": 2.5949355172738398, "grad_norm": 9.81104329904005, "learning_rate": 3.5131826741996234e-07, "logits/chosen": -2.458984375, "logits/rejected": -2.6128907203674316, "logps/chosen": -368.29998779296875, "logps/rejected": -397.79998779296875, "loss": 0.0261, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9771485328674316, "rewards/margins": 6.542187690734863, "rewards/rejected": -9.513280868530273, "step": 6890 }, { "epoch": 2.59870093194013, "grad_norm": 2.4606994579199766, "learning_rate": 3.5037664783427494e-07, "logits/chosen": -2.5830078125, "logits/rejected": -2.460156202316284, "logps/chosen": -370.29998779296875, "logps/rejected": -408.875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -3.0389647483825684, "rewards/margins": 6.526562690734863, "rewards/rejected": -9.564062118530273, "step": 6900 }, { "epoch": 2.60246634660642, "grad_norm": 18.701282848177552, "learning_rate": 3.4943502824858755e-07, "logits/chosen": -2.4027342796325684, "logits/rejected": -2.4927735328674316, "logps/chosen": -400.0, "logps/rejected": -384.6000061035156, "loss": 0.0316, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6758790016174316, "rewards/margins": 7.047656059265137, "rewards/rejected": -9.725781440734863, "step": 6910 }, { "epoch": 2.60623176127271, "grad_norm": 28.283627213175517, "learning_rate": 3.4849340866290015e-07, "logits/chosen": -2.4126954078674316, "logits/rejected": -2.358593702316284, "logps/chosen": -361.7250061035156, "logps/rejected": -386.25, "loss": 0.0296, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.711474657058716, "rewards/margins": 7.112500190734863, "rewards/rejected": -9.823437690734863, "step": 6920 }, { "epoch": 2.609997175939, "grad_norm": 23.808408514672543, "learning_rate": 3.4755178907721276e-07, "logits/chosen": -2.457812547683716, "logits/rejected": -2.3370118141174316, "logps/chosen": -353.0, "logps/rejected": -425.625, "loss": 0.037, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.91259765625, "rewards/margins": 6.92578125, "rewards/rejected": -9.84375, "step": 6930 }, { "epoch": 2.6137625906052904, "grad_norm": 2.941062065360318, "learning_rate": 3.4661016949152537e-07, "logits/chosen": -2.4566407203674316, "logits/rejected": -2.4892578125, "logps/chosen": -357.32501220703125, "logps/rejected": -379.7749938964844, "loss": 0.0221, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0718750953674316, "rewards/margins": 6.430468559265137, "rewards/rejected": -9.502344131469727, "step": 6940 }, { "epoch": 2.6175280052715806, "grad_norm": 1.9690896905958744, "learning_rate": 3.45668549905838e-07, "logits/chosen": -2.462890625, "logits/rejected": -2.466992139816284, "logps/chosen": -335.7250061035156, "logps/rejected": -377.4750061035156, "loss": 0.0236, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1019043922424316, "rewards/margins": 6.365234375, "rewards/rejected": -9.469531059265137, "step": 6950 }, { "epoch": 2.621293419937871, "grad_norm": 1.7567638996950288, "learning_rate": 3.4472693032015063e-07, "logits/chosen": -2.377148389816284, "logits/rejected": -2.4124999046325684, "logps/chosen": -347.32501220703125, "logps/rejected": -388.125, "loss": 0.0307, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.107617139816284, "rewards/margins": 6.233202934265137, "rewards/rejected": -9.340624809265137, "step": 6960 }, { "epoch": 2.625058834604161, "grad_norm": 69.31643121646039, "learning_rate": 3.4378531073446324e-07, "logits/chosen": -2.440624952316284, "logits/rejected": -2.327929735183716, "logps/chosen": -354.2749938964844, "logps/rejected": -406.6499938964844, "loss": 0.0387, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.711132764816284, "rewards/margins": 6.17578125, "rewards/rejected": -8.889062881469727, "step": 6970 }, { "epoch": 2.6288242492704508, "grad_norm": 9.952515109123823, "learning_rate": 3.4284369114877584e-07, "logits/chosen": -2.4541015625, "logits/rejected": -2.408984422683716, "logps/chosen": -355.51251220703125, "logps/rejected": -408.6499938964844, "loss": 0.0278, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8121581077575684, "rewards/margins": 6.217968940734863, "rewards/rejected": -9.029687881469727, "step": 6980 }, { "epoch": 2.632589663936741, "grad_norm": 1.9385902683773069, "learning_rate": 3.419020715630885e-07, "logits/chosen": -2.1703124046325684, "logits/rejected": -2.2132811546325684, "logps/chosen": -396.5, "logps/rejected": -414.5, "loss": 0.0314, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5462403297424316, "rewards/margins": 6.806250095367432, "rewards/rejected": -9.353906631469727, "step": 6990 }, { "epoch": 2.636355078603031, "grad_norm": 54.10298071402341, "learning_rate": 3.409604519774011e-07, "logits/chosen": -2.4306640625, "logits/rejected": -2.4482421875, "logps/chosen": -376.42498779296875, "logps/rejected": -409.9750061035156, "loss": 0.0293, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.28533935546875, "rewards/margins": 6.782422065734863, "rewards/rejected": -10.071093559265137, "step": 7000 }, { "epoch": 2.6401204932693214, "grad_norm": 3.078554088201743, "learning_rate": 3.4001883239171377e-07, "logits/chosen": -2.420117139816284, "logits/rejected": -2.395312547683716, "logps/chosen": -374.20001220703125, "logps/rejected": -404.54998779296875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -2.955371141433716, "rewards/margins": 6.784375190734863, "rewards/rejected": -9.7421875, "step": 7010 }, { "epoch": 2.643885907935611, "grad_norm": 4.410201026957016, "learning_rate": 3.390772128060264e-07, "logits/chosen": -2.4417967796325684, "logits/rejected": -2.447460889816284, "logps/chosen": -337.67498779296875, "logps/rejected": -406.2250061035156, "loss": 0.0333, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4298338890075684, "rewards/margins": 6.569140434265137, "rewards/rejected": -10.0, "step": 7020 }, { "epoch": 2.6476513226019014, "grad_norm": 7.491972126785826, "learning_rate": 3.38135593220339e-07, "logits/chosen": -2.494335889816284, "logits/rejected": -2.477343797683716, "logps/chosen": -379.32501220703125, "logps/rejected": -401.57501220703125, "loss": 0.0369, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.421679735183716, "rewards/margins": 6.478515625, "rewards/rejected": -9.896875381469727, "step": 7030 }, { "epoch": 2.6514167372681916, "grad_norm": 3.8316013442686745, "learning_rate": 3.371939736346516e-07, "logits/chosen": -2.5455079078674316, "logits/rejected": -2.6109375953674316, "logps/chosen": -330.3125, "logps/rejected": -375.8999938964844, "loss": 0.0274, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5250487327575684, "rewards/margins": 6.842577934265137, "rewards/rejected": -10.3671875, "step": 7040 }, { "epoch": 2.655182151934482, "grad_norm": 26.53523046711157, "learning_rate": 3.362523540489642e-07, "logits/chosen": -2.575000047683716, "logits/rejected": -2.5259766578674316, "logps/chosen": -353.3999938964844, "logps/rejected": -395.375, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -3.2013182640075684, "rewards/margins": 6.811718940734863, "rewards/rejected": -10.01171875, "step": 7050 }, { "epoch": 2.658947566600772, "grad_norm": 14.443175886776295, "learning_rate": 3.3531073446327685e-07, "logits/chosen": -2.594921827316284, "logits/rejected": -2.726367235183716, "logps/chosen": -353.29998779296875, "logps/rejected": -383.7749938964844, "loss": 0.0333, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.621875047683716, "rewards/margins": 6.850390434265137, "rewards/rejected": -10.471875190734863, "step": 7060 }, { "epoch": 2.6627129812670622, "grad_norm": 12.926550840194048, "learning_rate": 3.3436911487758946e-07, "logits/chosen": -2.4056639671325684, "logits/rejected": -2.5478515625, "logps/chosen": -372.1625061035156, "logps/rejected": -389.20001220703125, "loss": 0.0274, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1607422828674316, "rewards/margins": 6.440234184265137, "rewards/rejected": -9.600781440734863, "step": 7070 }, { "epoch": 2.666478395933352, "grad_norm": 5.774139970874191, "learning_rate": 3.3342749529190207e-07, "logits/chosen": -2.5230469703674316, "logits/rejected": -2.5630860328674316, "logps/chosen": -330.2875061035156, "logps/rejected": -366.54998779296875, "loss": 0.0341, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.33013916015625, "rewards/margins": 6.505468845367432, "rewards/rejected": -9.833593368530273, "step": 7080 }, { "epoch": 2.670243810599642, "grad_norm": 24.571857446836663, "learning_rate": 3.3248587570621467e-07, "logits/chosen": -2.508984327316284, "logits/rejected": -2.4732422828674316, "logps/chosen": -343.375, "logps/rejected": -410.45001220703125, "loss": 0.035, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.218945264816284, "rewards/margins": 6.842577934265137, "rewards/rejected": -10.064062118530273, "step": 7090 }, { "epoch": 2.6740092252659324, "grad_norm": 2.946401549644335, "learning_rate": 3.315442561205273e-07, "logits/chosen": -2.406054735183716, "logits/rejected": -2.419140577316284, "logps/chosen": -371.95001220703125, "logps/rejected": -451.5, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.827441453933716, "rewards/margins": 7.403124809265137, "rewards/rejected": -10.240625381469727, "step": 7100 }, { "epoch": 2.6777746399322226, "grad_norm": 8.255963221794984, "learning_rate": 3.306026365348399e-07, "logits/chosen": -2.626171827316284, "logits/rejected": -2.613085985183716, "logps/chosen": -394.9750061035156, "logps/rejected": -426.54998779296875, "loss": 0.0495, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.087500095367432, "rewards/margins": 7.001562595367432, "rewards/rejected": -11.086718559265137, "step": 7110 }, { "epoch": 2.6815400545985124, "grad_norm": 42.601715481000646, "learning_rate": 3.2966101694915254e-07, "logits/chosen": -2.6910157203674316, "logits/rejected": -2.629101514816284, "logps/chosen": -363.63751220703125, "logps/rejected": -406.75, "loss": 0.0418, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.789257764816284, "rewards/margins": 6.538281440734863, "rewards/rejected": -10.332812309265137, "step": 7120 }, { "epoch": 2.6853054692648026, "grad_norm": 8.444678662858863, "learning_rate": 3.2871939736346515e-07, "logits/chosen": -2.4527344703674316, "logits/rejected": -2.594921827316284, "logps/chosen": -370.2250061035156, "logps/rejected": -413.75, "loss": 0.0173, "rewards/accuracies": 1.0, "rewards/chosen": -3.0318360328674316, "rewards/margins": 7.250390529632568, "rewards/rejected": -10.285937309265137, "step": 7130 }, { "epoch": 2.689070883931093, "grad_norm": 20.937951169106753, "learning_rate": 3.2777777777777776e-07, "logits/chosen": -2.607421875, "logits/rejected": -2.6263670921325684, "logps/chosen": -383.5249938964844, "logps/rejected": -406.5249938964844, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.018359422683716, "rewards/margins": 6.389843940734863, "rewards/rejected": -9.409375190734863, "step": 7140 }, { "epoch": 2.692836298597383, "grad_norm": 14.589390487226096, "learning_rate": 3.2683615819209036e-07, "logits/chosen": -2.5169920921325684, "logits/rejected": -2.4867186546325684, "logps/chosen": -343.79998779296875, "logps/rejected": -400.8500061035156, "loss": 0.0419, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.23486328125, "rewards/margins": 6.843359470367432, "rewards/rejected": -10.077343940734863, "step": 7150 }, { "epoch": 2.6966017132636733, "grad_norm": 11.696076511892707, "learning_rate": 3.2589453860640297e-07, "logits/chosen": -2.436718702316284, "logits/rejected": -2.375195264816284, "logps/chosen": -402.875, "logps/rejected": -426.7749938964844, "loss": 0.02, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.016308546066284, "rewards/margins": 6.64453125, "rewards/rejected": -9.666406631469727, "step": 7160 }, { "epoch": 2.7003671279299635, "grad_norm": 5.745511047477004, "learning_rate": 3.2495291902071563e-07, "logits/chosen": -2.475878953933716, "logits/rejected": -2.5316405296325684, "logps/chosen": -396.0625, "logps/rejected": -400.79998779296875, "loss": 0.0167, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.874218702316284, "rewards/margins": 7.009375095367432, "rewards/rejected": -9.88671875, "step": 7170 }, { "epoch": 2.7041325425962537, "grad_norm": 27.23205638364505, "learning_rate": 3.2401129943502824e-07, "logits/chosen": -2.5296874046325684, "logits/rejected": -2.489062547683716, "logps/chosen": -320.82501220703125, "logps/rejected": -399.2250061035156, "loss": 0.0386, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.594470262527466, "rewards/margins": 6.953515529632568, "rewards/rejected": -9.553906440734863, "step": 7180 }, { "epoch": 2.7078979572625435, "grad_norm": 39.21912574688416, "learning_rate": 3.2306967984934084e-07, "logits/chosen": -2.4903321266174316, "logits/rejected": -2.4281249046325684, "logps/chosen": -355.7250061035156, "logps/rejected": -419.875, "loss": 0.0287, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.726452589035034, "rewards/margins": 6.472265720367432, "rewards/rejected": -9.205469131469727, "step": 7190 }, { "epoch": 2.7116633719288337, "grad_norm": 14.695533064846392, "learning_rate": 3.2212806026365345e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.380078077316284, "logps/chosen": -358.0, "logps/rejected": -420.54998779296875, "loss": 0.0227, "rewards/accuracies": 1.0, "rewards/chosen": -3.4052734375, "rewards/margins": 6.714062690734863, "rewards/rejected": -10.121874809265137, "step": 7200 }, { "epoch": 2.715428786595124, "grad_norm": 9.044870071126066, "learning_rate": 3.2118644067796605e-07, "logits/chosen": -2.5576171875, "logits/rejected": -2.5582032203674316, "logps/chosen": -358.6499938964844, "logps/rejected": -394.7749938964844, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.719043016433716, "rewards/margins": 6.813672065734863, "rewards/rejected": -10.532031059265137, "step": 7210 }, { "epoch": 2.7191942012614136, "grad_norm": 19.852988118608028, "learning_rate": 3.2024482109227866e-07, "logits/chosen": -2.463085889816284, "logits/rejected": -2.537402391433716, "logps/chosen": -404.67498779296875, "logps/rejected": -400.3999938964844, "loss": 0.0294, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8744139671325684, "rewards/margins": 7.150781154632568, "rewards/rejected": -11.028905868530273, "step": 7220 }, { "epoch": 2.722959615927704, "grad_norm": 15.229593907082322, "learning_rate": 3.1930320150659137e-07, "logits/chosen": -2.5240235328674316, "logits/rejected": -2.6748046875, "logps/chosen": -375.5, "logps/rejected": -382.7749938964844, "loss": 0.0344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1603026390075684, "rewards/margins": 6.534765720367432, "rewards/rejected": -9.694531440734863, "step": 7230 }, { "epoch": 2.726725030593994, "grad_norm": 4.087395268038336, "learning_rate": 3.18361581920904e-07, "logits/chosen": -2.6015625, "logits/rejected": -2.7158203125, "logps/chosen": -348.8500061035156, "logps/rejected": -361.125, "loss": 0.02, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.412792921066284, "rewards/margins": 7.111718654632568, "rewards/rejected": -10.52734375, "step": 7240 }, { "epoch": 2.7304904452602843, "grad_norm": 8.438973801522256, "learning_rate": 3.174199623352166e-07, "logits/chosen": -2.58984375, "logits/rejected": -2.57763671875, "logps/chosen": -328.7250061035156, "logps/rejected": -377.04998779296875, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": -3.5431885719299316, "rewards/margins": 6.734375, "rewards/rejected": -10.274218559265137, "step": 7250 }, { "epoch": 2.7342558599265745, "grad_norm": 13.3533471084429, "learning_rate": 3.164783427495292e-07, "logits/chosen": -2.538281202316284, "logits/rejected": -2.5541014671325684, "logps/chosen": -371.6625061035156, "logps/rejected": -404.67498779296875, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2457032203674316, "rewards/margins": 6.95703125, "rewards/rejected": -10.203906059265137, "step": 7260 }, { "epoch": 2.7380212745928647, "grad_norm": 0.80137703167706, "learning_rate": 3.155367231638418e-07, "logits/chosen": -2.576171875, "logits/rejected": -2.4449219703674316, "logps/chosen": -347.04998779296875, "logps/rejected": -409.04998779296875, "loss": 0.0321, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.323046922683716, "rewards/margins": 6.958593845367432, "rewards/rejected": -10.280468940734863, "step": 7270 }, { "epoch": 2.741786689259155, "grad_norm": 18.699325807939523, "learning_rate": 3.145951035781544e-07, "logits/chosen": -2.4375, "logits/rejected": -2.470507860183716, "logps/chosen": -371.26251220703125, "logps/rejected": -407.70001220703125, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -3.5565428733825684, "rewards/margins": 7.303124904632568, "rewards/rejected": -10.864843368530273, "step": 7280 }, { "epoch": 2.7455521039254447, "grad_norm": 56.97677743156309, "learning_rate": 3.1365348399246706e-07, "logits/chosen": -2.4791016578674316, "logits/rejected": -2.59375, "logps/chosen": -379.4750061035156, "logps/rejected": -418.7749938964844, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2213134765625, "rewards/margins": 7.197656154632568, "rewards/rejected": -10.419530868530273, "step": 7290 }, { "epoch": 2.749317518591735, "grad_norm": 51.65042998134157, "learning_rate": 3.1271186440677967e-07, "logits/chosen": -2.4751954078674316, "logits/rejected": -2.4916014671325684, "logps/chosen": -379.5249938964844, "logps/rejected": -408.8500061035156, "loss": 0.0348, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.409472703933716, "rewards/margins": 6.73046875, "rewards/rejected": -10.135937690734863, "step": 7300 }, { "epoch": 2.753082933258025, "grad_norm": 1.522995587944264, "learning_rate": 3.117702448210923e-07, "logits/chosen": -2.615234375, "logits/rejected": -2.635937452316284, "logps/chosen": -345.7250061035156, "logps/rejected": -384.1000061035156, "loss": 0.0201, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.383544921875, "rewards/margins": 6.567968845367432, "rewards/rejected": -9.94921875, "step": 7310 }, { "epoch": 2.7568483479243153, "grad_norm": 4.350541880292792, "learning_rate": 3.108286252354049e-07, "logits/chosen": -2.357128858566284, "logits/rejected": -2.4039063453674316, "logps/chosen": -381.32501220703125, "logps/rejected": -420.0249938964844, "loss": 0.0233, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5838379859924316, "rewards/margins": 6.608593940734863, "rewards/rejected": -9.192968368530273, "step": 7320 }, { "epoch": 2.760613762590605, "grad_norm": 12.880653539191796, "learning_rate": 3.098870056497175e-07, "logits/chosen": -2.5283203125, "logits/rejected": -2.5556640625, "logps/chosen": -375.54998779296875, "logps/rejected": -412.1499938964844, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -2.823193311691284, "rewards/margins": 7.095703125, "rewards/rejected": -9.921093940734863, "step": 7330 }, { "epoch": 2.7643791772568953, "grad_norm": 16.392943865132036, "learning_rate": 3.0894538606403015e-07, "logits/chosen": -2.6292967796325684, "logits/rejected": -2.6419920921325684, "logps/chosen": -344.1000061035156, "logps/rejected": -382.17498779296875, "loss": 0.0292, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.20068359375, "rewards/margins": 6.676562309265137, "rewards/rejected": -9.877344131469727, "step": 7340 }, { "epoch": 2.7681445919231855, "grad_norm": 4.461080029310208, "learning_rate": 3.0800376647834275e-07, "logits/chosen": -2.4857420921325684, "logits/rejected": -2.729687452316284, "logps/chosen": -344.8374938964844, "logps/rejected": -356.1499938964844, "loss": 0.0341, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.754589796066284, "rewards/margins": 6.438672065734863, "rewards/rejected": -9.1875, "step": 7350 }, { "epoch": 2.7719100065894757, "grad_norm": 17.57360583408183, "learning_rate": 3.0706214689265536e-07, "logits/chosen": -2.5361328125, "logits/rejected": -2.5416016578674316, "logps/chosen": -359.2749938964844, "logps/rejected": -410.29998779296875, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -2.9820313453674316, "rewards/margins": 6.852734565734863, "rewards/rejected": -9.845312118530273, "step": 7360 }, { "epoch": 2.775675421255766, "grad_norm": 51.662269560389916, "learning_rate": 3.0612052730696797e-07, "logits/chosen": -2.559375047683716, "logits/rejected": -2.5423827171325684, "logps/chosen": -349.875, "logps/rejected": -415.6000061035156, "loss": 0.0334, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.815136671066284, "rewards/margins": 6.974218845367432, "rewards/rejected": -9.7890625, "step": 7370 }, { "epoch": 2.779440835922056, "grad_norm": 5.002915573390412, "learning_rate": 3.0517890772128057e-07, "logits/chosen": -2.39501953125, "logits/rejected": -2.4095702171325684, "logps/chosen": -362.70001220703125, "logps/rejected": -394.70001220703125, "loss": 0.0524, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.125195264816284, "rewards/margins": 7.044921875, "rewards/rejected": -10.167187690734863, "step": 7380 }, { "epoch": 2.783206250588346, "grad_norm": 25.31068956105325, "learning_rate": 3.042372881355932e-07, "logits/chosen": -2.5464844703674316, "logits/rejected": -2.5796875953674316, "logps/chosen": -380.6000061035156, "logps/rejected": -392.8999938964844, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5111083984375, "rewards/margins": 6.900390625, "rewards/rejected": -9.409375190734863, "step": 7390 }, { "epoch": 2.786971665254636, "grad_norm": 1.67411418167338, "learning_rate": 3.0329566854990584e-07, "logits/chosen": -2.627734422683716, "logits/rejected": -2.5283203125, "logps/chosen": -320.54998779296875, "logps/rejected": -367.7749938964844, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -3.3388428688049316, "rewards/margins": 6.893359184265137, "rewards/rejected": -10.232030868530273, "step": 7400 }, { "epoch": 2.7907370799209263, "grad_norm": 12.427576156437697, "learning_rate": 3.0235404896421844e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.4755859375, "logps/chosen": -367.1499938964844, "logps/rejected": -424.9750061035156, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4852538108825684, "rewards/margins": 6.686718940734863, "rewards/rejected": -10.176562309265137, "step": 7410 }, { "epoch": 2.7945024945872166, "grad_norm": 32.077724454573826, "learning_rate": 3.0141242937853105e-07, "logits/chosen": -2.399218797683716, "logits/rejected": -2.331249952316284, "logps/chosen": -416.7250061035156, "logps/rejected": -459.6000061035156, "loss": 0.0271, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.435107469558716, "rewards/margins": 6.955468654632568, "rewards/rejected": -10.389843940734863, "step": 7420 }, { "epoch": 2.7982679092535063, "grad_norm": 11.293643187410655, "learning_rate": 3.0047080979284366e-07, "logits/chosen": -2.60986328125, "logits/rejected": -2.572070360183716, "logps/chosen": -371.5375061035156, "logps/rejected": -428.875, "loss": 0.0345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.007031440734863, "rewards/margins": 6.921484470367432, "rewards/rejected": -10.931249618530273, "step": 7430 }, { "epoch": 2.8020333239197965, "grad_norm": 97.8448425858044, "learning_rate": 2.9952919020715626e-07, "logits/chosen": -2.4422850608825684, "logits/rejected": -2.429492235183716, "logps/chosen": -385.2124938964844, "logps/rejected": -433.70001220703125, "loss": 0.0275, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7281250953674316, "rewards/margins": 6.76171875, "rewards/rejected": -10.487500190734863, "step": 7440 }, { "epoch": 2.8057987385860867, "grad_norm": 40.50090695028622, "learning_rate": 2.9858757062146887e-07, "logits/chosen": -2.3705077171325684, "logits/rejected": -2.4673829078674316, "logps/chosen": -411.9125061035156, "logps/rejected": -412.7250061035156, "loss": 0.0423, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3301758766174316, "rewards/margins": 6.879687309265137, "rewards/rejected": -10.207812309265137, "step": 7450 }, { "epoch": 2.809564153252377, "grad_norm": 8.956994731664585, "learning_rate": 2.9764595103578153e-07, "logits/chosen": -2.488085985183716, "logits/rejected": -2.517578125, "logps/chosen": -400.5, "logps/rejected": -434.125, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": -3.2049317359924316, "rewards/margins": 6.648828029632568, "rewards/rejected": -9.850781440734863, "step": 7460 }, { "epoch": 2.813329567918667, "grad_norm": 14.140119132964438, "learning_rate": 2.9670433145009413e-07, "logits/chosen": -2.4564452171325684, "logits/rejected": -2.472460985183716, "logps/chosen": -398.5249938964844, "logps/rejected": -417.7749938964844, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -3.434765577316284, "rewards/margins": 6.736328125, "rewards/rejected": -10.168749809265137, "step": 7470 }, { "epoch": 2.8170949825849574, "grad_norm": 3.2473123085005144, "learning_rate": 2.9576271186440674e-07, "logits/chosen": -2.510546922683716, "logits/rejected": -2.6162109375, "logps/chosen": -360.92498779296875, "logps/rejected": -368.8500061035156, "loss": 0.0455, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.581249952316284, "rewards/margins": 6.620312690734863, "rewards/rejected": -10.204687118530273, "step": 7480 }, { "epoch": 2.820860397251247, "grad_norm": 5.24206623507758, "learning_rate": 2.948210922787194e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.68359375, "logps/chosen": -375.8999938964844, "logps/rejected": -399.2250061035156, "loss": 0.0424, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6332030296325684, "rewards/margins": 6.833593845367432, "rewards/rejected": -10.473437309265137, "step": 7490 }, { "epoch": 2.8246258119175374, "grad_norm": 2.8519592628554253, "learning_rate": 2.93879472693032e-07, "logits/chosen": -2.4927735328674316, "logits/rejected": -2.566601514816284, "logps/chosen": -363.8999938964844, "logps/rejected": -397.8500061035156, "loss": 0.0569, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.7903809547424316, "rewards/margins": 6.565625190734863, "rewards/rejected": -10.356249809265137, "step": 7500 }, { "epoch": 2.8283912265838276, "grad_norm": 5.851607272654398, "learning_rate": 2.9293785310734467e-07, "logits/chosen": -2.5044922828674316, "logits/rejected": -2.622265577316284, "logps/chosen": -393.3999938964844, "logps/rejected": -395.0, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -3.683398485183716, "rewards/margins": 6.821875095367432, "rewards/rejected": -10.504687309265137, "step": 7510 }, { "epoch": 2.832156641250118, "grad_norm": 2.4957182329606455, "learning_rate": 2.9199623352165727e-07, "logits/chosen": -2.5835938453674316, "logits/rejected": -2.688281297683716, "logps/chosen": -327.6499938964844, "logps/rejected": -348.8500061035156, "loss": 0.054, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.293652296066284, "rewards/margins": 6.432031154632568, "rewards/rejected": -9.725781440734863, "step": 7520 }, { "epoch": 2.8359220559164076, "grad_norm": 27.14546995786111, "learning_rate": 2.910546139359699e-07, "logits/chosen": -2.51416015625, "logits/rejected": -2.6778321266174316, "logps/chosen": -347.7749938964844, "logps/rejected": -363.2749938964844, "loss": 0.0276, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4200196266174316, "rewards/margins": 6.994921684265137, "rewards/rejected": -10.416406631469727, "step": 7530 }, { "epoch": 2.8396874705826978, "grad_norm": 0.7323604221859311, "learning_rate": 2.901129943502825e-07, "logits/chosen": -2.472851514816284, "logits/rejected": -2.425585985183716, "logps/chosen": -385.7250061035156, "logps/rejected": -397.9750061035156, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": -2.9994139671325684, "rewards/margins": 6.755468845367432, "rewards/rejected": -9.763280868530273, "step": 7540 }, { "epoch": 2.843452885248988, "grad_norm": 3.1928744996361176, "learning_rate": 2.891713747645951e-07, "logits/chosen": -2.4720702171325684, "logits/rejected": -2.4996094703674316, "logps/chosen": -344.2875061035156, "logps/rejected": -362.1000061035156, "loss": 0.0269, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5884766578674316, "rewards/margins": 6.918749809265137, "rewards/rejected": -9.500781059265137, "step": 7550 }, { "epoch": 2.847218299915278, "grad_norm": 1.4059241553855397, "learning_rate": 2.882297551789077e-07, "logits/chosen": -2.4664063453674316, "logits/rejected": -2.6039061546325684, "logps/chosen": -355.5, "logps/rejected": -369.79998779296875, "loss": 0.045, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1041016578674316, "rewards/margins": 7.025390625, "rewards/rejected": -10.132031440734863, "step": 7560 }, { "epoch": 2.8509837145815684, "grad_norm": 11.810429773859262, "learning_rate": 2.8728813559322036e-07, "logits/chosen": -2.3082032203674316, "logits/rejected": -2.346874952316284, "logps/chosen": -413.1499938964844, "logps/rejected": -425.6000061035156, "loss": 0.0224, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.214404344558716, "rewards/margins": 6.884375095367432, "rewards/rejected": -10.09765625, "step": 7570 }, { "epoch": 2.8547491292478586, "grad_norm": 3.8254479284523075, "learning_rate": 2.8634651600753296e-07, "logits/chosen": -2.436328172683716, "logits/rejected": -2.601367235183716, "logps/chosen": -390.6875, "logps/rejected": -429.5249938964844, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -3.797070264816284, "rewards/margins": 7.064843654632568, "rewards/rejected": -10.86328125, "step": 7580 }, { "epoch": 2.8585145439141484, "grad_norm": 8.244507041631035, "learning_rate": 2.8540489642184557e-07, "logits/chosen": -2.6519532203674316, "logits/rejected": -2.503124952316284, "logps/chosen": -395.2250061035156, "logps/rejected": -471.82501220703125, "loss": 0.0186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4810547828674316, "rewards/margins": 7.2109375, "rewards/rejected": -10.690625190734863, "step": 7590 }, { "epoch": 2.8622799585804386, "grad_norm": 39.322484525438, "learning_rate": 2.844632768361582e-07, "logits/chosen": -2.4599609375, "logits/rejected": -2.490429639816284, "logps/chosen": -382.79998779296875, "logps/rejected": -419.5249938964844, "loss": 0.0213, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5580077171325684, "rewards/margins": 7.267968654632568, "rewards/rejected": -10.824999809265137, "step": 7600 }, { "epoch": 2.866045373246729, "grad_norm": 3.620344204593331, "learning_rate": 2.835216572504708e-07, "logits/chosen": -2.5986328125, "logits/rejected": -2.660351514816284, "logps/chosen": -383.3999938964844, "logps/rejected": -429.8500061035156, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -3.416210889816284, "rewards/margins": 7.055468559265137, "rewards/rejected": -10.475781440734863, "step": 7610 }, { "epoch": 2.869810787913019, "grad_norm": 12.17779101096709, "learning_rate": 2.825800376647834e-07, "logits/chosen": -2.5894532203674316, "logits/rejected": -2.626953125, "logps/chosen": -362.8500061035156, "logps/rejected": -397.54998779296875, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.932177782058716, "rewards/margins": 6.839062690734863, "rewards/rejected": -9.767969131469727, "step": 7620 }, { "epoch": 2.873576202579309, "grad_norm": 9.90899033595215, "learning_rate": 2.8163841807909605e-07, "logits/chosen": -2.474609375, "logits/rejected": -2.6656250953674316, "logps/chosen": -351.1000061035156, "logps/rejected": -373.7250061035156, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -3.013671875, "rewards/margins": 6.578125, "rewards/rejected": -9.592187881469727, "step": 7630 }, { "epoch": 2.877341617245599, "grad_norm": 13.47531205506839, "learning_rate": 2.8069679849340865e-07, "logits/chosen": -2.455078125, "logits/rejected": -2.5357422828674316, "logps/chosen": -344.70001220703125, "logps/rejected": -394.8500061035156, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5048828125, "rewards/margins": 6.762109279632568, "rewards/rejected": -10.275781631469727, "step": 7640 }, { "epoch": 2.881107031911889, "grad_norm": 5.694498053677785, "learning_rate": 2.7975517890772126e-07, "logits/chosen": -2.3326172828674316, "logits/rejected": -2.3504881858825684, "logps/chosen": -404.1499938964844, "logps/rejected": -449.7749938964844, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.409960985183716, "rewards/margins": 6.739843845367432, "rewards/rejected": -10.150781631469727, "step": 7650 }, { "epoch": 2.8848724465781794, "grad_norm": 3.4255106223166782, "learning_rate": 2.7881355932203387e-07, "logits/chosen": -2.5708985328674316, "logits/rejected": -2.6513671875, "logps/chosen": -338.25, "logps/rejected": -393.7250061035156, "loss": 0.03, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.275585889816284, "rewards/margins": 6.5703125, "rewards/rejected": -9.846094131469727, "step": 7660 }, { "epoch": 2.8886378612444696, "grad_norm": 8.755782435206804, "learning_rate": 2.7787193973634647e-07, "logits/chosen": -2.4971680641174316, "logits/rejected": -2.5853514671325684, "logps/chosen": -359.92498779296875, "logps/rejected": -377.82501220703125, "loss": 0.0241, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.588085889816284, "rewards/margins": 6.556250095367432, "rewards/rejected": -9.146875381469727, "step": 7670 }, { "epoch": 2.89240327591076, "grad_norm": 22.409434215071183, "learning_rate": 2.7693032015065913e-07, "logits/chosen": -2.4873046875, "logits/rejected": -2.5005860328674316, "logps/chosen": -383.76251220703125, "logps/rejected": -415.57501220703125, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": -3.0866026878356934, "rewards/margins": 6.859765529632568, "rewards/rejected": -9.954687118530273, "step": 7680 }, { "epoch": 2.89616869057705, "grad_norm": 4.881900169037461, "learning_rate": 2.7598870056497174e-07, "logits/chosen": -2.334179639816284, "logits/rejected": -2.487597703933716, "logps/chosen": -421.17498779296875, "logps/rejected": -460.92498779296875, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": -2.599316358566284, "rewards/margins": 6.838281154632568, "rewards/rejected": -9.446093559265137, "step": 7690 }, { "epoch": 2.89993410524334, "grad_norm": 3.832733900764205, "learning_rate": 2.7504708097928434e-07, "logits/chosen": -2.4365234375, "logits/rejected": -2.568359375, "logps/chosen": -347.82501220703125, "logps/rejected": -384.32501220703125, "loss": 0.0211, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.054980516433716, "rewards/margins": 6.628515720367432, "rewards/rejected": -9.682812690734863, "step": 7700 }, { "epoch": 2.90369951990963, "grad_norm": 1.2790235013971571, "learning_rate": 2.7410546139359695e-07, "logits/chosen": -2.5904297828674316, "logits/rejected": -2.5083985328674316, "logps/chosen": -351.38751220703125, "logps/rejected": -415.625, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.53515625, "rewards/margins": 7.10546875, "rewards/rejected": -10.637499809265137, "step": 7710 }, { "epoch": 2.9074649345759203, "grad_norm": 6.595080745380579, "learning_rate": 2.7316384180790956e-07, "logits/chosen": -2.383984327316284, "logits/rejected": -2.4857420921325684, "logps/chosen": -360.7749938964844, "logps/rejected": -434.70001220703125, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.65234375, "rewards/margins": 6.673047065734863, "rewards/rejected": -10.328906059265137, "step": 7720 }, { "epoch": 2.91123034924221, "grad_norm": 28.380742774780163, "learning_rate": 2.7222222222222216e-07, "logits/chosen": -2.5982422828674316, "logits/rejected": -2.490234375, "logps/chosen": -378.2250061035156, "logps/rejected": -414.9750061035156, "loss": 0.0213, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8880858421325684, "rewards/margins": 6.7890625, "rewards/rejected": -10.674219131469727, "step": 7730 }, { "epoch": 2.9149957639085002, "grad_norm": 2.451573228847488, "learning_rate": 2.712806026365349e-07, "logits/chosen": -2.4677734375, "logits/rejected": -2.450390577316284, "logps/chosen": -366.6875, "logps/rejected": -424.125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -3.1810545921325684, "rewards/margins": 7.041406154632568, "rewards/rejected": -10.2265625, "step": 7740 }, { "epoch": 2.9187611785747904, "grad_norm": 9.242623804792935, "learning_rate": 2.703389830508475e-07, "logits/chosen": -2.476757764816284, "logits/rejected": -2.5751953125, "logps/chosen": -369.0, "logps/rejected": -415.5, "loss": 0.0345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1507811546325684, "rewards/margins": 6.630078315734863, "rewards/rejected": -9.78125, "step": 7750 }, { "epoch": 2.9225265932410807, "grad_norm": 2.4472986365619085, "learning_rate": 2.693973634651601e-07, "logits/chosen": -2.6380858421325684, "logits/rejected": -2.620312452316284, "logps/chosen": -337.0625, "logps/rejected": -390.9750061035156, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -3.57080078125, "rewards/margins": 7.345703125, "rewards/rejected": -10.914843559265137, "step": 7760 }, { "epoch": 2.926292007907371, "grad_norm": 50.287395985478746, "learning_rate": 2.684557438794727e-07, "logits/chosen": -2.546093702316284, "logits/rejected": -2.5804686546325684, "logps/chosen": -374.0, "logps/rejected": -408.0249938964844, "loss": 0.0352, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5611329078674316, "rewards/margins": 6.638281345367432, "rewards/rejected": -10.200780868530273, "step": 7770 }, { "epoch": 2.930057422573661, "grad_norm": 7.7292572550281085, "learning_rate": 2.675141242937853e-07, "logits/chosen": -2.7007813453674316, "logits/rejected": -2.550976514816284, "logps/chosen": -334.5249938964844, "logps/rejected": -412.875, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -3.668701171875, "rewards/margins": 7.016015529632568, "rewards/rejected": -10.684374809265137, "step": 7780 }, { "epoch": 2.9338228372399513, "grad_norm": 12.821384164151334, "learning_rate": 2.665725047080979e-07, "logits/chosen": -2.4912109375, "logits/rejected": -2.5494141578674316, "logps/chosen": -398.5249938964844, "logps/rejected": -408.42498779296875, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.538134813308716, "rewards/margins": 6.896093845367432, "rewards/rejected": -10.439062118530273, "step": 7790 }, { "epoch": 2.937588251906241, "grad_norm": 5.55036984569003, "learning_rate": 2.6563088512241057e-07, "logits/chosen": -2.4037108421325684, "logits/rejected": -2.6800780296325684, "logps/chosen": -384.92498779296875, "logps/rejected": -414.5, "loss": 0.0348, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.299609422683716, "rewards/margins": 7.642187595367432, "rewards/rejected": -10.940625190734863, "step": 7800 }, { "epoch": 2.9413536665725313, "grad_norm": 4.4235174138486855, "learning_rate": 2.6468926553672317e-07, "logits/chosen": -2.6201171875, "logits/rejected": -2.685546875, "logps/chosen": -324.0, "logps/rejected": -415.20001220703125, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": -3.554980516433716, "rewards/margins": 6.717187404632568, "rewards/rejected": -10.2734375, "step": 7810 }, { "epoch": 2.9451190812388215, "grad_norm": 35.22379329492294, "learning_rate": 2.637476459510358e-07, "logits/chosen": -2.639453172683716, "logits/rejected": -2.692578077316284, "logps/chosen": -320.32501220703125, "logps/rejected": -352.2250061035156, "loss": 0.0324, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0445313453674316, "rewards/margins": 6.667578220367432, "rewards/rejected": -9.711718559265137, "step": 7820 }, { "epoch": 2.9488844959051117, "grad_norm": 19.91989737430842, "learning_rate": 2.628060263653484e-07, "logits/chosen": -2.5416016578674316, "logits/rejected": -2.541210889816284, "logps/chosen": -385.3374938964844, "logps/rejected": -410.3999938964844, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.247314453125, "rewards/margins": 6.696093559265137, "rewards/rejected": -9.942968368530273, "step": 7830 }, { "epoch": 2.9526499105714015, "grad_norm": 2.4988662908422383, "learning_rate": 2.61864406779661e-07, "logits/chosen": -2.6263670921325684, "logits/rejected": -2.5914063453674316, "logps/chosen": -340.70001220703125, "logps/rejected": -421.9750061035156, "loss": 0.026, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7093749046325684, "rewards/margins": 6.811327934265137, "rewards/rejected": -10.53125, "step": 7840 }, { "epoch": 2.9564153252376917, "grad_norm": 10.365282211377592, "learning_rate": 2.6092278719397365e-07, "logits/chosen": -2.583984375, "logits/rejected": -2.664257764816284, "logps/chosen": -380.54998779296875, "logps/rejected": -406.875, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -3.927734375, "rewards/margins": 6.959374904632568, "rewards/rejected": -10.885937690734863, "step": 7850 }, { "epoch": 2.960180739903982, "grad_norm": 3.221599808895403, "learning_rate": 2.5998116760828626e-07, "logits/chosen": -2.590039014816284, "logits/rejected": -2.757617235183716, "logps/chosen": -374.375, "logps/rejected": -395.75, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -4.08984375, "rewards/margins": 6.773046970367432, "rewards/rejected": -10.864062309265137, "step": 7860 }, { "epoch": 2.963946154570272, "grad_norm": 6.1092567530022315, "learning_rate": 2.5903954802259886e-07, "logits/chosen": -2.6576170921325684, "logits/rejected": -2.638476610183716, "logps/chosen": -372.70001220703125, "logps/rejected": -418.70001220703125, "loss": 0.0227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6025390625, "rewards/margins": 6.928124904632568, "rewards/rejected": -10.532031059265137, "step": 7870 }, { "epoch": 2.9677115692365623, "grad_norm": 4.746637914886942, "learning_rate": 2.5809792843691147e-07, "logits/chosen": -2.5728516578674316, "logits/rejected": -2.607617139816284, "logps/chosen": -396.0, "logps/rejected": -448.67498779296875, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.627734422683716, "rewards/margins": 6.65625, "rewards/rejected": -10.286718368530273, "step": 7880 }, { "epoch": 2.9714769839028525, "grad_norm": 33.31769347064665, "learning_rate": 2.571563088512241e-07, "logits/chosen": -2.7392578125, "logits/rejected": -2.739453077316284, "logps/chosen": -355.32501220703125, "logps/rejected": -410.82501220703125, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.844921827316284, "rewards/margins": 6.928124904632568, "rewards/rejected": -10.775781631469727, "step": 7890 }, { "epoch": 2.9752423985691423, "grad_norm": 22.16988609009356, "learning_rate": 2.562146892655367e-07, "logits/chosen": -2.703906297683716, "logits/rejected": -2.745898485183716, "logps/chosen": -347.79998779296875, "logps/rejected": -385.6000061035156, "loss": 0.0373, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2538084983825684, "rewards/margins": 6.708593845367432, "rewards/rejected": -9.955469131469727, "step": 7900 }, { "epoch": 2.9790078132354325, "grad_norm": 5.948249132231057, "learning_rate": 2.5527306967984934e-07, "logits/chosen": -2.659374952316284, "logits/rejected": -2.7901368141174316, "logps/chosen": -380.2250061035156, "logps/rejected": -393.1000061035156, "loss": 0.023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7060546875, "rewards/margins": 6.962109565734863, "rewards/rejected": -10.671875, "step": 7910 }, { "epoch": 2.9827732279017227, "grad_norm": 5.007124909196879, "learning_rate": 2.5433145009416195e-07, "logits/chosen": -2.5171875953674316, "logits/rejected": -2.4452147483825684, "logps/chosen": -373.67498779296875, "logps/rejected": -409.42498779296875, "loss": 0.0236, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.4742674827575684, "rewards/margins": 7.284375190734863, "rewards/rejected": -10.753125190734863, "step": 7920 }, { "epoch": 2.986538642568013, "grad_norm": 2.1361643645294164, "learning_rate": 2.5338983050847455e-07, "logits/chosen": -2.4507813453674316, "logits/rejected": -2.632031202316284, "logps/chosen": -381.5, "logps/rejected": -395.29998779296875, "loss": 0.0157, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.259082078933716, "rewards/margins": 7.254296779632568, "rewards/rejected": -10.517187118530273, "step": 7930 }, { "epoch": 2.9903040572343027, "grad_norm": 2.9196505301310762, "learning_rate": 2.5244821092278716e-07, "logits/chosen": -2.4571290016174316, "logits/rejected": -2.5341796875, "logps/chosen": -428.4750061035156, "logps/rejected": -435.32501220703125, "loss": 0.0334, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5687499046325684, "rewards/margins": 7.055468559265137, "rewards/rejected": -10.624218940734863, "step": 7940 }, { "epoch": 2.994069471900593, "grad_norm": 13.44006505916467, "learning_rate": 2.5150659133709977e-07, "logits/chosen": -2.698046922683716, "logits/rejected": -2.6705079078674316, "logps/chosen": -357.2124938964844, "logps/rejected": -404.25, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": -3.9300780296325684, "rewards/margins": 7.150781154632568, "rewards/rejected": -11.080469131469727, "step": 7950 }, { "epoch": 2.997834886566883, "grad_norm": 0.788567322257285, "learning_rate": 2.5056497175141237e-07, "logits/chosen": -2.529296875, "logits/rejected": -2.599609375, "logps/chosen": -360.45001220703125, "logps/rejected": -396.2749938964844, "loss": 0.0173, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.783007860183716, "rewards/margins": 6.967187404632568, "rewards/rejected": -10.752344131469727, "step": 7960 }, { "epoch": 3.001882707333145, "grad_norm": 2.7819882729267573, "learning_rate": 2.4962335216572503e-07, "logits/chosen": -2.546130895614624, "logits/rejected": -2.717261791229248, "logps/chosen": -356.0714416503906, "logps/rejected": -392.19049072265625, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": -3.3711867332458496, "rewards/margins": 7.316964149475098, "rewards/rejected": -10.684523582458496, "step": 7970 }, { "epoch": 3.0056481219994353, "grad_norm": 0.44816534233459077, "learning_rate": 2.4868173258003764e-07, "logits/chosen": -2.599804639816284, "logits/rejected": -2.611132860183716, "logps/chosen": -365.95001220703125, "logps/rejected": -416.7250061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -3.2244629859924316, "rewards/margins": 7.591406345367432, "rewards/rejected": -10.815625190734863, "step": 7980 }, { "epoch": 3.0094135366657255, "grad_norm": 1.4580336954380175, "learning_rate": 2.4774011299435024e-07, "logits/chosen": -2.645312547683716, "logits/rejected": -2.6851563453674316, "logps/chosen": -355.54998779296875, "logps/rejected": -391.42498779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.2076172828674316, "rewards/margins": 7.674218654632568, "rewards/rejected": -10.878125190734863, "step": 7990 }, { "epoch": 3.0131789513320153, "grad_norm": 0.6370576678636246, "learning_rate": 2.467984934086629e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.75390625, "logps/chosen": -399.11248779296875, "logps/rejected": -403.45001220703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.8832030296325684, "rewards/margins": 7.65234375, "rewards/rejected": -10.530468940734863, "step": 8000 }, { "epoch": 3.0169443659983055, "grad_norm": 7.071365536854872, "learning_rate": 2.458568738229755e-07, "logits/chosen": -2.682421922683716, "logits/rejected": -2.615039110183716, "logps/chosen": -348.23748779296875, "logps/rejected": -455.1499938964844, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.5278077125549316, "rewards/margins": 8.547656059265137, "rewards/rejected": -12.08203125, "step": 8010 }, { "epoch": 3.0207097806645957, "grad_norm": 1.8441450238778365, "learning_rate": 2.449152542372881e-07, "logits/chosen": -2.7115235328674316, "logits/rejected": -2.819531202316284, "logps/chosen": -395.75, "logps/rejected": -451.7749938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.731640577316284, "rewards/margins": 8.888280868530273, "rewards/rejected": -12.618749618530273, "step": 8020 }, { "epoch": 3.024475195330886, "grad_norm": 2.1759411637859607, "learning_rate": 2.439736346516007e-07, "logits/chosen": -2.6328125, "logits/rejected": -2.714648485183716, "logps/chosen": -420.32501220703125, "logps/rejected": -452.0, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.4619140625, "rewards/margins": 8.310937881469727, "rewards/rejected": -12.7734375, "step": 8030 }, { "epoch": 3.028240609997176, "grad_norm": 0.41207806552746545, "learning_rate": 2.430320150659134e-07, "logits/chosen": -2.6304688453674316, "logits/rejected": -2.8296875953674316, "logps/chosen": -379.51251220703125, "logps/rejected": -392.9750061035156, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.400683403015137, "rewards/margins": 8.276562690734863, "rewards/rejected": -12.678125381469727, "step": 8040 }, { "epoch": 3.032006024663466, "grad_norm": 2.730913202823694, "learning_rate": 2.42090395480226e-07, "logits/chosen": -2.7142577171325684, "logits/rejected": -2.7574219703674316, "logps/chosen": -372.92498779296875, "logps/rejected": -454.75, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -3.8765625953674316, "rewards/margins": 8.22265625, "rewards/rejected": -12.10546875, "step": 8050 }, { "epoch": 3.035771439329756, "grad_norm": 2.5736603507427365, "learning_rate": 2.411487758945386e-07, "logits/chosen": -2.6644530296325684, "logits/rejected": -2.67578125, "logps/chosen": -360.42498779296875, "logps/rejected": -435.125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.631054639816284, "rewards/margins": 8.328906059265137, "rewards/rejected": -11.952343940734863, "step": 8060 }, { "epoch": 3.0395368539960463, "grad_norm": 0.65364453635947, "learning_rate": 2.402071563088512e-07, "logits/chosen": -2.7734375, "logits/rejected": -2.9361329078674316, "logps/chosen": -362.1000061035156, "logps/rejected": -428.5249938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.626757860183716, "rewards/margins": 8.795312881469727, "rewards/rejected": -12.423437118530273, "step": 8070 }, { "epoch": 3.0433022686623366, "grad_norm": 1.3508510093940227, "learning_rate": 2.392655367231638e-07, "logits/chosen": -2.577441453933716, "logits/rejected": -2.6685547828674316, "logps/chosen": -420.2250061035156, "logps/rejected": -475.45001220703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.7691407203674316, "rewards/margins": 8.6640625, "rewards/rejected": -12.4296875, "step": 8080 }, { "epoch": 3.0470676833286268, "grad_norm": 0.7214929689513986, "learning_rate": 2.3832391713747647e-07, "logits/chosen": -2.6513671875, "logits/rejected": -2.8076171875, "logps/chosen": -382.23748779296875, "logps/rejected": -418.875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -4.000878810882568, "rewards/margins": 8.410937309265137, "rewards/rejected": -12.409375190734863, "step": 8090 }, { "epoch": 3.0508330979949165, "grad_norm": 9.240162755146695, "learning_rate": 2.3738229755178907e-07, "logits/chosen": -2.716015577316284, "logits/rejected": -2.830273389816284, "logps/chosen": -396.57501220703125, "logps/rejected": -434.75, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -4.606054782867432, "rewards/margins": 8.003125190734863, "rewards/rejected": -12.610937118530273, "step": 8100 }, { "epoch": 3.0545985126612067, "grad_norm": 2.0321624801880334, "learning_rate": 2.3644067796610168e-07, "logits/chosen": -2.7289061546325684, "logits/rejected": -2.719921827316284, "logps/chosen": -359.2250061035156, "logps/rejected": -412.70001220703125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.54052734375, "rewards/margins": 8.625, "rewards/rejected": -13.168749809265137, "step": 8110 }, { "epoch": 3.058363927327497, "grad_norm": 1.125186324735733, "learning_rate": 2.354990583804143e-07, "logits/chosen": -2.7142577171325684, "logits/rejected": -2.7447266578674316, "logps/chosen": -385.67498779296875, "logps/rejected": -429.95001220703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -5.000781059265137, "rewards/margins": 8.647656440734863, "rewards/rejected": -13.6484375, "step": 8120 }, { "epoch": 3.062129341993787, "grad_norm": 3.1001299963159195, "learning_rate": 2.3455743879472692e-07, "logits/chosen": -2.5785155296325684, "logits/rejected": -2.8548827171325684, "logps/chosen": -413.3999938964844, "logps/rejected": -411.29998779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.432324409484863, "rewards/margins": 8.481249809265137, "rewards/rejected": -12.917187690734863, "step": 8130 }, { "epoch": 3.0658947566600774, "grad_norm": 0.8018403199548889, "learning_rate": 2.3361581920903952e-07, "logits/chosen": -2.506054639816284, "logits/rejected": -2.668652296066284, "logps/chosen": -380.57501220703125, "logps/rejected": -418.5, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.2294921875, "rewards/margins": 8.883593559265137, "rewards/rejected": -13.104687690734863, "step": 8140 }, { "epoch": 3.069660171326367, "grad_norm": 0.5231238014977423, "learning_rate": 2.3267419962335216e-07, "logits/chosen": -2.7470703125, "logits/rejected": -2.604687452316284, "logps/chosen": -377.07501220703125, "logps/rejected": -435.4750061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.270312309265137, "rewards/margins": 8.36328125, "rewards/rejected": -12.633593559265137, "step": 8150 }, { "epoch": 3.0734255859926574, "grad_norm": 1.1904182616255508, "learning_rate": 2.3173258003766476e-07, "logits/chosen": -2.7642579078674316, "logits/rejected": -2.795703172683716, "logps/chosen": -386.5625, "logps/rejected": -452.95001220703125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.186327934265137, "rewards/margins": 8.413281440734863, "rewards/rejected": -12.6015625, "step": 8160 }, { "epoch": 3.0771910006589476, "grad_norm": 15.927093437643038, "learning_rate": 2.3079096045197737e-07, "logits/chosen": -2.741015672683716, "logits/rejected": -2.774707078933716, "logps/chosen": -381.1000061035156, "logps/rejected": -444.7749938964844, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.453515529632568, "rewards/margins": 8.546875, "rewards/rejected": -12.995312690734863, "step": 8170 }, { "epoch": 3.080956415325238, "grad_norm": 2.067540886970012, "learning_rate": 2.2984934086629003e-07, "logits/chosen": -2.643749952316284, "logits/rejected": -2.6861329078674316, "logps/chosen": -396.8999938964844, "logps/rejected": -467.7250061035156, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.625390529632568, "rewards/margins": 8.897656440734863, "rewards/rejected": -13.521875381469727, "step": 8180 }, { "epoch": 3.084721829991528, "grad_norm": 2.1405598261984675, "learning_rate": 2.2890772128060263e-07, "logits/chosen": -2.6166014671325684, "logits/rejected": -2.7783203125, "logps/chosen": -404.67498779296875, "logps/rejected": -441.25, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.524609565734863, "rewards/margins": 8.526562690734863, "rewards/rejected": -13.059374809265137, "step": 8190 }, { "epoch": 3.0884872446578178, "grad_norm": 2.4013952747243317, "learning_rate": 2.2796610169491524e-07, "logits/chosen": -2.6478514671325684, "logits/rejected": -2.765429735183716, "logps/chosen": -395.9750061035156, "logps/rejected": -452.54998779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.977343797683716, "rewards/margins": 8.8125, "rewards/rejected": -12.787500381469727, "step": 8200 }, { "epoch": 3.092252659324108, "grad_norm": 5.30026391980462, "learning_rate": 2.2702448210922787e-07, "logits/chosen": -2.5951170921325684, "logits/rejected": -2.6382813453674316, "logps/chosen": -404.8999938964844, "logps/rejected": -470.5874938964844, "loss": 0.0174, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.293554782867432, "rewards/margins": 8.508593559265137, "rewards/rejected": -12.794530868530273, "step": 8210 }, { "epoch": 3.096018073990398, "grad_norm": 0.41952848290381156, "learning_rate": 2.2608286252354048e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -2.7552733421325684, "logps/chosen": -356.75, "logps/rejected": -444.625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -4.106884956359863, "rewards/margins": 8.081250190734863, "rewards/rejected": -12.1875, "step": 8220 }, { "epoch": 3.0997834886566884, "grad_norm": 0.3835023641924963, "learning_rate": 2.251412429378531e-07, "logits/chosen": -2.529101610183716, "logits/rejected": -2.7621092796325684, "logps/chosen": -389.70001220703125, "logps/rejected": -439.82501220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.30108642578125, "rewards/margins": 8.091405868530273, "rewards/rejected": -11.395312309265137, "step": 8230 }, { "epoch": 3.1035489033229786, "grad_norm": 1.2128279837417748, "learning_rate": 2.2419962335216572e-07, "logits/chosen": -2.689257860183716, "logits/rejected": -2.6869139671325684, "logps/chosen": -352.8500061035156, "logps/rejected": -453.7250061035156, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.8294920921325684, "rewards/margins": 8.625781059265137, "rewards/rejected": -12.453906059265137, "step": 8240 }, { "epoch": 3.1073143179892684, "grad_norm": 0.594246712799787, "learning_rate": 2.2325800376647832e-07, "logits/chosen": -2.604296922683716, "logits/rejected": -2.8089842796325684, "logps/chosen": -393.1499938964844, "logps/rejected": -423.29998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.501171827316284, "rewards/margins": 8.842968940734863, "rewards/rejected": -12.339062690734863, "step": 8250 }, { "epoch": 3.1110797326555586, "grad_norm": 0.8175603330527009, "learning_rate": 2.2231638418079096e-07, "logits/chosen": -2.822070360183716, "logits/rejected": -2.8623046875, "logps/chosen": -365.5, "logps/rejected": -435.32501220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -4.325146675109863, "rewards/margins": 8.55078125, "rewards/rejected": -12.870312690734863, "step": 8260 }, { "epoch": 3.114845147321849, "grad_norm": 56.573722514396856, "learning_rate": 2.2137476459510356e-07, "logits/chosen": -2.6666016578674316, "logits/rejected": -2.7294921875, "logps/chosen": -387.3500061035156, "logps/rejected": -447.8999938964844, "loss": 0.0451, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.381054878234863, "rewards/margins": 8.967968940734863, "rewards/rejected": -13.345312118530273, "step": 8270 }, { "epoch": 3.118610561988139, "grad_norm": 0.5702536444750999, "learning_rate": 2.2043314500941617e-07, "logits/chosen": -2.7376952171325684, "logits/rejected": -2.849609375, "logps/chosen": -376.8999938964844, "logps/rejected": -453.17498779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.609570503234863, "rewards/margins": 8.568750381469727, "rewards/rejected": -13.181249618530273, "step": 8280 }, { "epoch": 3.1223759766544292, "grad_norm": 4.1371767388700125, "learning_rate": 2.194915254237288e-07, "logits/chosen": -2.660351514816284, "logits/rejected": -2.7855467796325684, "logps/chosen": -390.20001220703125, "logps/rejected": -423.8500061035156, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -4.241015434265137, "rewards/margins": 8.208593368530273, "rewards/rejected": -12.443750381469727, "step": 8290 }, { "epoch": 3.126141391320719, "grad_norm": 3.9775025260394514, "learning_rate": 2.185499058380414e-07, "logits/chosen": -2.829882860183716, "logits/rejected": -2.887500047683716, "logps/chosen": -363.86248779296875, "logps/rejected": -413.57501220703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.387793064117432, "rewards/margins": 8.68359375, "rewards/rejected": -13.071093559265137, "step": 8300 }, { "epoch": 3.129906805987009, "grad_norm": 3.05171044772741, "learning_rate": 2.1760828625235404e-07, "logits/chosen": -2.585156202316284, "logits/rejected": -2.696484327316284, "logps/chosen": -396.1875, "logps/rejected": -466.0, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.266015529632568, "rewards/margins": 9.447656631469727, "rewards/rejected": -13.717187881469727, "step": 8310 }, { "epoch": 3.1336722206532994, "grad_norm": 1.224723775459871, "learning_rate": 2.1666666666666667e-07, "logits/chosen": -2.83984375, "logits/rejected": -3.030468702316284, "logps/chosen": -355.2250061035156, "logps/rejected": -418.45001220703125, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -4.278124809265137, "rewards/margins": 8.996874809265137, "rewards/rejected": -13.285937309265137, "step": 8320 }, { "epoch": 3.1374376353195896, "grad_norm": 1.1370100191203558, "learning_rate": 2.1572504708097928e-07, "logits/chosen": -2.779101610183716, "logits/rejected": -2.8150391578674316, "logps/chosen": -387.54998779296875, "logps/rejected": -463.25, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.265429496765137, "rewards/margins": 9.232812881469727, "rewards/rejected": -13.501562118530273, "step": 8330 }, { "epoch": 3.14120304998588, "grad_norm": 0.43536115550217436, "learning_rate": 2.147834274952919e-07, "logits/chosen": -2.704296827316284, "logits/rejected": -2.843945264816284, "logps/chosen": -423.8500061035156, "logps/rejected": -458.95001220703125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.101366996765137, "rewards/margins": 8.385937690734863, "rewards/rejected": -12.485937118530273, "step": 8340 }, { "epoch": 3.1449684646521696, "grad_norm": 2.2486428675532815, "learning_rate": 2.1384180790960452e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.844921827316284, "logps/chosen": -386.57501220703125, "logps/rejected": -411.2250061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.65673828125, "rewards/margins": 8.419530868530273, "rewards/rejected": -13.075780868530273, "step": 8350 }, { "epoch": 3.14873387931846, "grad_norm": 2.688519519759049, "learning_rate": 2.1290018832391713e-07, "logits/chosen": -2.8121094703674316, "logits/rejected": -2.9400391578674316, "logps/chosen": -364.8125, "logps/rejected": -414.2749938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.986132621765137, "rewards/margins": 8.214062690734863, "rewards/rejected": -13.205469131469727, "step": 8360 }, { "epoch": 3.15249929398475, "grad_norm": 1.112897106646457, "learning_rate": 2.1195856873822976e-07, "logits/chosen": -2.7406249046325684, "logits/rejected": -2.7767577171325684, "logps/chosen": -345.6875, "logps/rejected": -419.20001220703125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -3.731738328933716, "rewards/margins": 8.846094131469727, "rewards/rejected": -12.578906059265137, "step": 8370 }, { "epoch": 3.1562647086510403, "grad_norm": 1.6022982928162435, "learning_rate": 2.1101694915254237e-07, "logits/chosen": -2.7142577171325684, "logits/rejected": -2.755078077316284, "logps/chosen": -351.6000061035156, "logps/rejected": -440.625, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -3.8570313453674316, "rewards/margins": 8.547656059265137, "rewards/rejected": -12.409375190734863, "step": 8380 }, { "epoch": 3.1600301233173305, "grad_norm": 0.8399992712972693, "learning_rate": 2.1007532956685497e-07, "logits/chosen": -2.739062547683716, "logits/rejected": -2.914843797683716, "logps/chosen": -354.04998779296875, "logps/rejected": -401.8500061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -3.5580077171325684, "rewards/margins": 8.846094131469727, "rewards/rejected": -12.407812118530273, "step": 8390 }, { "epoch": 3.1637955379836207, "grad_norm": 4.178137946154618, "learning_rate": 2.091337099811676e-07, "logits/chosen": -2.743945360183716, "logits/rejected": -2.8291015625, "logps/chosen": -385.3999938964844, "logps/rejected": -427.07501220703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.281836032867432, "rewards/margins": 9.137499809265137, "rewards/rejected": -13.419530868530273, "step": 8400 }, { "epoch": 3.1675609526499104, "grad_norm": 0.9462810922263535, "learning_rate": 2.081920903954802e-07, "logits/chosen": -2.6597657203674316, "logits/rejected": -2.8675780296325684, "logps/chosen": -376.0249938964844, "logps/rejected": -402.8999938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.9740233421325684, "rewards/margins": 8.442968368530273, "rewards/rejected": -12.412500381469727, "step": 8410 }, { "epoch": 3.1713263673162007, "grad_norm": 14.187644853766374, "learning_rate": 2.0725047080979282e-07, "logits/chosen": -2.6832032203674316, "logits/rejected": -2.731250047683716, "logps/chosen": -354.13751220703125, "logps/rejected": -420.375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.225976467132568, "rewards/margins": 8.629687309265137, "rewards/rejected": -12.862500190734863, "step": 8420 }, { "epoch": 3.175091781982491, "grad_norm": 0.5042376668588574, "learning_rate": 2.0630885122410545e-07, "logits/chosen": -2.5882811546325684, "logits/rejected": -2.709765672683716, "logps/chosen": -394.20001220703125, "logps/rejected": -437.75, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.511328220367432, "rewards/margins": 8.252344131469727, "rewards/rejected": -12.758593559265137, "step": 8430 }, { "epoch": 3.178857196648781, "grad_norm": 0.7049770394686645, "learning_rate": 2.0536723163841808e-07, "logits/chosen": -2.757617235183716, "logits/rejected": -2.875781297683716, "logps/chosen": -377.4750061035156, "logps/rejected": -403.6000061035156, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.8408203125, "rewards/margins": 8.64453125, "rewards/rejected": -13.482812881469727, "step": 8440 }, { "epoch": 3.182622611315071, "grad_norm": 0.5220553687522633, "learning_rate": 2.044256120527307e-07, "logits/chosen": -2.7490234375, "logits/rejected": -2.784960985183716, "logps/chosen": -351.54998779296875, "logps/rejected": -439.20001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.008984565734863, "rewards/margins": 8.629687309265137, "rewards/rejected": -12.634374618530273, "step": 8450 }, { "epoch": 3.186388025981361, "grad_norm": 2.1893494525653745, "learning_rate": 2.0348399246704332e-07, "logits/chosen": -2.664257764816284, "logits/rejected": -2.805468797683716, "logps/chosen": -385.625, "logps/rejected": -452.875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.809375047683716, "rewards/margins": 8.7734375, "rewards/rejected": -12.592187881469727, "step": 8460 }, { "epoch": 3.1901534406476513, "grad_norm": 2.003329655038442, "learning_rate": 2.0254237288135593e-07, "logits/chosen": -2.7808594703674316, "logits/rejected": -2.7412109375, "logps/chosen": -308.79998779296875, "logps/rejected": -382.95001220703125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -3.4913573265075684, "rewards/margins": 8.506250381469727, "rewards/rejected": -11.993749618530273, "step": 8470 }, { "epoch": 3.1939188553139415, "grad_norm": 1.5007987968752539, "learning_rate": 2.0160075329566853e-07, "logits/chosen": -2.6302733421325684, "logits/rejected": -2.6861329078674316, "logps/chosen": -392.38751220703125, "logps/rejected": -483.54998779296875, "loss": 0.0148, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.575390815734863, "rewards/margins": 8.706250190734863, "rewards/rejected": -13.28125, "step": 8480 }, { "epoch": 3.1976842699802317, "grad_norm": 0.9486939540610907, "learning_rate": 2.0065913370998117e-07, "logits/chosen": -2.7650389671325684, "logits/rejected": -2.7310547828674316, "logps/chosen": -369.3125, "logps/rejected": -456.75, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.7442383766174316, "rewards/margins": 9.024999618530273, "rewards/rejected": -12.765625, "step": 8490 }, { "epoch": 3.201449684646522, "grad_norm": 6.008260302731578, "learning_rate": 1.9971751412429377e-07, "logits/chosen": -2.6615233421325684, "logits/rejected": -2.8128905296325684, "logps/chosen": -358.32501220703125, "logps/rejected": -395.2749938964844, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.065039157867432, "rewards/margins": 8.157812118530273, "rewards/rejected": -12.220312118530273, "step": 8500 }, { "epoch": 3.2052150993128117, "grad_norm": 2.083769019643562, "learning_rate": 1.9877589453860638e-07, "logits/chosen": -2.863476514816284, "logits/rejected": -2.846484422683716, "logps/chosen": -345.75, "logps/rejected": -419.20001220703125, "loss": 0.0094, "rewards/accuracies": 1.0, "rewards/chosen": -4.032617092132568, "rewards/margins": 7.996874809265137, "rewards/rejected": -12.024218559265137, "step": 8510 }, { "epoch": 3.208980513979102, "grad_norm": 1.7684179445460873, "learning_rate": 1.97834274952919e-07, "logits/chosen": -2.7974610328674316, "logits/rejected": -2.970703125, "logps/chosen": -363.0375061035156, "logps/rejected": -397.2749938964844, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -3.6817383766174316, "rewards/margins": 8.309374809265137, "rewards/rejected": -11.985937118530273, "step": 8520 }, { "epoch": 3.212745928645392, "grad_norm": 0.8301633317640076, "learning_rate": 1.9689265536723162e-07, "logits/chosen": -2.73046875, "logits/rejected": -2.9462890625, "logps/chosen": -361.5625, "logps/rejected": -418.7749938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.425390720367432, "rewards/margins": 8.792187690734863, "rewards/rejected": -13.2109375, "step": 8530 }, { "epoch": 3.2165113433116823, "grad_norm": 0.5046695652938228, "learning_rate": 1.9595103578154425e-07, "logits/chosen": -2.7431640625, "logits/rejected": -2.863085985183716, "logps/chosen": -397.54998779296875, "logps/rejected": -430.3999938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.197900295257568, "rewards/margins": 8.539843559265137, "rewards/rejected": -12.737500190734863, "step": 8540 }, { "epoch": 3.2202767579779725, "grad_norm": 2.9062362483178235, "learning_rate": 1.9500941619585686e-07, "logits/chosen": -2.6763672828674316, "logits/rejected": -2.7660155296325684, "logps/chosen": -447.875, "logps/rejected": -495.125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.241991996765137, "rewards/margins": 8.450780868530273, "rewards/rejected": -12.6875, "step": 8550 }, { "epoch": 3.2240421726442623, "grad_norm": 2.7449213348609613, "learning_rate": 1.9406779661016946e-07, "logits/chosen": -2.8304686546325684, "logits/rejected": -2.925976514816284, "logps/chosen": -382.4750061035156, "logps/rejected": -411.57501220703125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -3.600878953933716, "rewards/margins": 8.448437690734863, "rewards/rejected": -12.043749809265137, "step": 8560 }, { "epoch": 3.2278075873105525, "grad_norm": 1.432378938730968, "learning_rate": 1.9312617702448212e-07, "logits/chosen": -2.677539110183716, "logits/rejected": -2.8343749046325684, "logps/chosen": -392.125, "logps/rejected": -436.42498779296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -3.8511719703674316, "rewards/margins": 8.090624809265137, "rewards/rejected": -11.940625190734863, "step": 8570 }, { "epoch": 3.2315730019768427, "grad_norm": 4.822621651854341, "learning_rate": 1.9218455743879473e-07, "logits/chosen": -2.787890672683716, "logits/rejected": -2.6806640625, "logps/chosen": -353.92498779296875, "logps/rejected": -438.42498779296875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -4.073144435882568, "rewards/margins": 8.521875381469727, "rewards/rejected": -12.59375, "step": 8580 }, { "epoch": 3.235338416643133, "grad_norm": 16.78509923819022, "learning_rate": 1.9124293785310734e-07, "logits/chosen": -2.7525391578674316, "logits/rejected": -2.8583984375, "logps/chosen": -386.42498779296875, "logps/rejected": -429.20001220703125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -3.832226514816284, "rewards/margins": 8.2890625, "rewards/rejected": -12.126562118530273, "step": 8590 }, { "epoch": 3.239103831309423, "grad_norm": 3.4423949346865137, "learning_rate": 1.9030131826741997e-07, "logits/chosen": -2.6773438453674316, "logits/rejected": -2.8984375, "logps/chosen": -387.04998779296875, "logps/rejected": -426.125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.433007717132568, "rewards/margins": 8.574999809265137, "rewards/rejected": -13.006250381469727, "step": 8600 }, { "epoch": 3.242869245975713, "grad_norm": 0.8263049261339659, "learning_rate": 1.8935969868173257e-07, "logits/chosen": -2.6626954078674316, "logits/rejected": -2.9208984375, "logps/chosen": -418.82501220703125, "logps/rejected": -443.375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.393359184265137, "rewards/margins": 9.317187309265137, "rewards/rejected": -13.717187881469727, "step": 8610 }, { "epoch": 3.246634660642003, "grad_norm": 2.8777479664100394, "learning_rate": 1.8841807909604518e-07, "logits/chosen": -2.7466797828674316, "logits/rejected": -2.9326171875, "logps/chosen": -384.0874938964844, "logps/rejected": -410.6499938964844, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -4.215234279632568, "rewards/margins": 8.458593368530273, "rewards/rejected": -12.675000190734863, "step": 8620 }, { "epoch": 3.2504000753082933, "grad_norm": 0.45943728489027535, "learning_rate": 1.8747645951035781e-07, "logits/chosen": -2.7630858421325684, "logits/rejected": -2.802929639816284, "logps/chosen": -391.1000061035156, "logps/rejected": -450.79998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.195898532867432, "rewards/margins": 8.799219131469727, "rewards/rejected": -12.990625381469727, "step": 8630 }, { "epoch": 3.2541654899745835, "grad_norm": 2.4790981961386573, "learning_rate": 1.8653483992467042e-07, "logits/chosen": -2.795703172683716, "logits/rejected": -2.721484422683716, "logps/chosen": -379.45001220703125, "logps/rejected": -453.125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.633886814117432, "rewards/margins": 8.735937118530273, "rewards/rejected": -13.3671875, "step": 8640 }, { "epoch": 3.2579309046408738, "grad_norm": 21.944116196304826, "learning_rate": 1.8559322033898303e-07, "logits/chosen": -2.696093797683716, "logits/rejected": -2.8453125953674316, "logps/chosen": -381.875, "logps/rejected": -434.45001220703125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -5.046875, "rewards/margins": 8.998437881469727, "rewards/rejected": -14.050000190734863, "step": 8650 }, { "epoch": 3.2616963193071635, "grad_norm": 0.8198291133311267, "learning_rate": 1.8465160075329566e-07, "logits/chosen": -2.7025389671325684, "logits/rejected": -2.8052735328674316, "logps/chosen": -369.79998779296875, "logps/rejected": -441.1000061035156, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -4.375, "rewards/margins": 8.83984375, "rewards/rejected": -13.223437309265137, "step": 8660 }, { "epoch": 3.2654617339734537, "grad_norm": 0.8807787783292197, "learning_rate": 1.8370998116760826e-07, "logits/chosen": -2.682421922683716, "logits/rejected": -2.838085889816284, "logps/chosen": -416.6875, "logps/rejected": -448.29998779296875, "loss": 0.0066, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6832032203674316, "rewards/margins": 8.486719131469727, "rewards/rejected": -12.160937309265137, "step": 8670 }, { "epoch": 3.269227148639744, "grad_norm": 12.237494752385535, "learning_rate": 1.8276836158192087e-07, "logits/chosen": -2.708984375, "logits/rejected": -2.718945264816284, "logps/chosen": -406.125, "logps/rejected": -464.75, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.388281345367432, "rewards/margins": 8.864843368530273, "rewards/rejected": -13.254687309265137, "step": 8680 }, { "epoch": 3.272992563306034, "grad_norm": 2.587895803609115, "learning_rate": 1.8182674199623353e-07, "logits/chosen": -2.6900391578674316, "logits/rejected": -2.7876954078674316, "logps/chosen": -376.45001220703125, "logps/rejected": -457.6499938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.394921779632568, "rewards/margins": 8.776562690734863, "rewards/rejected": -13.165624618530273, "step": 8690 }, { "epoch": 3.2767579779723244, "grad_norm": 1.0242562076107558, "learning_rate": 1.8088512241054614e-07, "logits/chosen": -2.750195264816284, "logits/rejected": -2.6908202171325684, "logps/chosen": -391.9750061035156, "logps/rejected": -434.17498779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -3.911328077316284, "rewards/margins": 8.46484375, "rewards/rejected": -12.378125190734863, "step": 8700 }, { "epoch": 3.280523392638614, "grad_norm": 0.8604693969552694, "learning_rate": 1.7994350282485877e-07, "logits/chosen": -2.8203125, "logits/rejected": -2.8531250953674316, "logps/chosen": -374.125, "logps/rejected": -448.8999938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.103125095367432, "rewards/margins": 8.934374809265137, "rewards/rejected": -13.038281440734863, "step": 8710 }, { "epoch": 3.2842888073049044, "grad_norm": 0.9201844769027728, "learning_rate": 1.7900188323917138e-07, "logits/chosen": -2.7515625953674316, "logits/rejected": -2.8910155296325684, "logps/chosen": -402.67498779296875, "logps/rejected": -440.92498779296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.661328315734863, "rewards/margins": 8.802343368530273, "rewards/rejected": -13.464062690734863, "step": 8720 }, { "epoch": 3.2880542219711946, "grad_norm": 0.39280919945829385, "learning_rate": 1.7806026365348398e-07, "logits/chosen": -2.7671875953674316, "logits/rejected": -2.896289110183716, "logps/chosen": -398.57501220703125, "logps/rejected": -433.04998779296875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -4.741503715515137, "rewards/margins": 8.81640625, "rewards/rejected": -13.556249618530273, "step": 8730 }, { "epoch": 3.291819636637485, "grad_norm": 9.53539283130743, "learning_rate": 1.7711864406779661e-07, "logits/chosen": -2.8140625953674316, "logits/rejected": -2.9222655296325684, "logps/chosen": -388.5, "logps/rejected": -456.0249938964844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.807421684265137, "rewards/margins": 9.571874618530273, "rewards/rejected": -14.381250381469727, "step": 8740 }, { "epoch": 3.295585051303775, "grad_norm": 1.701854347266704, "learning_rate": 1.7617702448210922e-07, "logits/chosen": -2.744921922683716, "logits/rejected": -2.889453172683716, "logps/chosen": -408.6499938964844, "logps/rejected": -453.625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -5.192773342132568, "rewards/margins": 8.87109375, "rewards/rejected": -14.067187309265137, "step": 8750 }, { "epoch": 3.2993504659700648, "grad_norm": 6.349174706509713, "learning_rate": 1.7523540489642183e-07, "logits/chosen": -2.8080077171325684, "logits/rejected": -2.9623045921325684, "logps/chosen": -412.20001220703125, "logps/rejected": -436.8999938964844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -5.20361328125, "rewards/margins": 8.734375, "rewards/rejected": -13.9375, "step": 8760 }, { "epoch": 3.303115880636355, "grad_norm": 3.8426963435213404, "learning_rate": 1.7429378531073446e-07, "logits/chosen": -2.7662110328674316, "logits/rejected": -2.9263672828674316, "logps/chosen": -370.29998779296875, "logps/rejected": -437.8500061035156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.62890625, "rewards/margins": 9.585156440734863, "rewards/rejected": -14.223437309265137, "step": 8770 }, { "epoch": 3.306881295302645, "grad_norm": 1.295390380565723, "learning_rate": 1.7335216572504707e-07, "logits/chosen": -2.8505859375, "logits/rejected": -2.893749952316284, "logps/chosen": -380.07501220703125, "logps/rejected": -423.1499938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.463623046875, "rewards/margins": 8.58984375, "rewards/rejected": -13.059374809265137, "step": 8780 }, { "epoch": 3.3106467099689354, "grad_norm": 1.6201789889171732, "learning_rate": 1.7241054613935967e-07, "logits/chosen": -2.8046875, "logits/rejected": -2.93359375, "logps/chosen": -336.9624938964844, "logps/rejected": -401.20001220703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.096386909484863, "rewards/margins": 8.854687690734863, "rewards/rejected": -12.957812309265137, "step": 8790 }, { "epoch": 3.3144121246352256, "grad_norm": 1.2332613049500456, "learning_rate": 1.714689265536723e-07, "logits/chosen": -2.6712889671325684, "logits/rejected": -2.66796875, "logps/chosen": -377.3999938964844, "logps/rejected": -449.0, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -3.958203077316284, "rewards/margins": 8.91015625, "rewards/rejected": -12.869531631469727, "step": 8800 }, { "epoch": 3.318177539301516, "grad_norm": 13.188100239088351, "learning_rate": 1.705273069679849e-07, "logits/chosen": -2.811718702316284, "logits/rejected": -2.8501954078674316, "logps/chosen": -372.6625061035156, "logps/rejected": -443.2250061035156, "loss": 0.0164, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.336523532867432, "rewards/margins": 8.899218559265137, "rewards/rejected": -13.2421875, "step": 8810 }, { "epoch": 3.3219429539678056, "grad_norm": 0.22647677122968532, "learning_rate": 1.6958568738229754e-07, "logits/chosen": -2.7601561546325684, "logits/rejected": -2.9429688453674316, "logps/chosen": -399.2250061035156, "logps/rejected": -416.20001220703125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.482226371765137, "rewards/margins": 8.469531059265137, "rewards/rejected": -12.946874618530273, "step": 8820 }, { "epoch": 3.325708368634096, "grad_norm": 3.3930061385069465, "learning_rate": 1.6864406779661018e-07, "logits/chosen": -2.827343702316284, "logits/rejected": -2.741992235183716, "logps/chosen": -381.79998779296875, "logps/rejected": -437.92498779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.899023532867432, "rewards/margins": 8.251562118530273, "rewards/rejected": -13.149218559265137, "step": 8830 }, { "epoch": 3.329473783300386, "grad_norm": 1.0512267067322738, "learning_rate": 1.6770244821092278e-07, "logits/chosen": -2.871875047683716, "logits/rejected": -3.0087890625, "logps/chosen": -349.2749938964844, "logps/rejected": -424.9750061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.5615234375, "rewards/margins": 8.921093940734863, "rewards/rejected": -13.482812881469727, "step": 8840 }, { "epoch": 3.3332391979666762, "grad_norm": 0.9291401719435558, "learning_rate": 1.667608286252354e-07, "logits/chosen": -2.93359375, "logits/rejected": -2.9134764671325684, "logps/chosen": -343.04998779296875, "logps/rejected": -431.57501220703125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -5.005175590515137, "rewards/margins": 8.409375190734863, "rewards/rejected": -13.417187690734863, "step": 8850 }, { "epoch": 3.337004612632966, "grad_norm": 1.3672969717456946, "learning_rate": 1.6581920903954802e-07, "logits/chosen": -2.745898485183716, "logits/rejected": -2.875195264816284, "logps/chosen": -380.125, "logps/rejected": -418.25, "loss": 0.0208, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.869335889816284, "rewards/margins": 8.866406440734863, "rewards/rejected": -12.736719131469727, "step": 8860 }, { "epoch": 3.340770027299256, "grad_norm": 0.35632388516876123, "learning_rate": 1.6487758945386063e-07, "logits/chosen": -2.8115234375, "logits/rejected": -2.8023438453674316, "logps/chosen": -344.20001220703125, "logps/rejected": -449.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.112256050109863, "rewards/margins": 8.831250190734863, "rewards/rejected": -12.942187309265137, "step": 8870 }, { "epoch": 3.3445354419655464, "grad_norm": 3.0799762564650988, "learning_rate": 1.6393596986817326e-07, "logits/chosen": -2.73046875, "logits/rejected": -2.779492139816284, "logps/chosen": -402.0, "logps/rejected": -441.79998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.012304782867432, "rewards/margins": 8.524999618530273, "rewards/rejected": -12.535937309265137, "step": 8880 }, { "epoch": 3.3483008566318366, "grad_norm": 1.716593448214403, "learning_rate": 1.6299435028248587e-07, "logits/chosen": -2.6986327171325684, "logits/rejected": -2.7728514671325684, "logps/chosen": -369.45001220703125, "logps/rejected": -402.125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -3.231689453125, "rewards/margins": 8.711718559265137, "rewards/rejected": -11.949999809265137, "step": 8890 }, { "epoch": 3.352066271298127, "grad_norm": 2.609055027555382, "learning_rate": 1.6205273069679847e-07, "logits/chosen": -2.6880860328674316, "logits/rejected": -2.7425780296325684, "logps/chosen": -403.3500061035156, "logps/rejected": -469.3500061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -3.975292921066284, "rewards/margins": 8.616406440734863, "rewards/rejected": -12.587499618530273, "step": 8900 }, { "epoch": 3.355831685964417, "grad_norm": 6.0629274469712575, "learning_rate": 1.611111111111111e-07, "logits/chosen": -2.7457032203674316, "logits/rejected": -2.8779296875, "logps/chosen": -368.04998779296875, "logps/rejected": -419.2749938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -3.984814405441284, "rewards/margins": 8.931249618530273, "rewards/rejected": -12.920312881469727, "step": 8910 }, { "epoch": 3.359597100630707, "grad_norm": 0.7671680368297407, "learning_rate": 1.601694915254237e-07, "logits/chosen": -2.712890625, "logits/rejected": -2.853320360183716, "logps/chosen": -398.5249938964844, "logps/rejected": -427.25, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.8702149391174316, "rewards/margins": 9.172656059265137, "rewards/rejected": -13.051562309265137, "step": 8920 }, { "epoch": 3.363362515296997, "grad_norm": 3.011836976923051, "learning_rate": 1.5922787193973632e-07, "logits/chosen": -2.770703077316284, "logits/rejected": -2.7886719703674316, "logps/chosen": -403.0249938964844, "logps/rejected": -490.29998779296875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.6376953125, "rewards/margins": 9.075780868530273, "rewards/rejected": -13.711718559265137, "step": 8930 }, { "epoch": 3.3671279299632872, "grad_norm": 0.1560684088398386, "learning_rate": 1.5828625235404898e-07, "logits/chosen": -2.7685546875, "logits/rejected": -2.8462891578674316, "logps/chosen": -370.6499938964844, "logps/rejected": -399.17498779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -3.826367139816284, "rewards/margins": 8.928125381469727, "rewards/rejected": -12.750781059265137, "step": 8940 }, { "epoch": 3.3708933446295775, "grad_norm": 1.1344017461503728, "learning_rate": 1.5734463276836158e-07, "logits/chosen": -2.7646484375, "logits/rejected": -2.889843702316284, "logps/chosen": -393.0249938964844, "logps/rejected": -429.79998779296875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.582226753234863, "rewards/margins": 8.827343940734863, "rewards/rejected": -13.404687881469727, "step": 8950 }, { "epoch": 3.3746587592958672, "grad_norm": 1.4360264390093909, "learning_rate": 1.564030131826742e-07, "logits/chosen": -2.9261717796325684, "logits/rejected": -2.8160157203674316, "logps/chosen": -375.76251220703125, "logps/rejected": -445.92498779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.643164157867432, "rewards/margins": 8.560155868530273, "rewards/rejected": -13.203125, "step": 8960 }, { "epoch": 3.3784241739621574, "grad_norm": 1.1992862621201192, "learning_rate": 1.5546139359698682e-07, "logits/chosen": -2.8001952171325684, "logits/rejected": -2.8564453125, "logps/chosen": -379.75, "logps/rejected": -437.92498779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.7001953125, "rewards/margins": 8.474218368530273, "rewards/rejected": -13.176562309265137, "step": 8970 }, { "epoch": 3.3821895886284477, "grad_norm": 1.36641604929091, "learning_rate": 1.5451977401129943e-07, "logits/chosen": -2.939257860183716, "logits/rejected": -3.062304735183716, "logps/chosen": -356.61248779296875, "logps/rejected": -406.625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.733984470367432, "rewards/margins": 8.785937309265137, "rewards/rejected": -13.520312309265137, "step": 8980 }, { "epoch": 3.385955003294738, "grad_norm": 7.095334040957014, "learning_rate": 1.5357815442561204e-07, "logits/chosen": -2.966796875, "logits/rejected": -3.001953125, "logps/chosen": -342.625, "logps/rejected": -416.0249938964844, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -5.0751953125, "rewards/margins": 8.764843940734863, "rewards/rejected": -13.839062690734863, "step": 8990 }, { "epoch": 3.389720417961028, "grad_norm": 3.5721794161648166, "learning_rate": 1.5263653483992467e-07, "logits/chosen": -2.8050780296325684, "logits/rejected": -2.758984327316284, "logps/chosen": -389.79998779296875, "logps/rejected": -464.70001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.421484470367432, "rewards/margins": 8.965624809265137, "rewards/rejected": -13.385937690734863, "step": 9000 }, { "epoch": 3.3934858326273183, "grad_norm": 0.43458276346484936, "learning_rate": 1.5169491525423728e-07, "logits/chosen": -2.641796827316284, "logits/rejected": -2.836718797683716, "logps/chosen": -382.6000061035156, "logps/rejected": -431.2749938964844, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.116991996765137, "rewards/margins": 8.625, "rewards/rejected": -13.743749618530273, "step": 9010 }, { "epoch": 3.397251247293608, "grad_norm": 0.5780189025051037, "learning_rate": 1.507532956685499e-07, "logits/chosen": -2.592578172683716, "logits/rejected": -2.742968797683716, "logps/chosen": -414.5, "logps/rejected": -477.0, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.6611328125, "rewards/margins": 8.684374809265137, "rewards/rejected": -13.350000381469727, "step": 9020 }, { "epoch": 3.4010166619598983, "grad_norm": 2.9485673621439776, "learning_rate": 1.4981167608286251e-07, "logits/chosen": -2.82421875, "logits/rejected": -2.9935545921325684, "logps/chosen": -395.5249938964844, "logps/rejected": -424.8500061035156, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -4.685107231140137, "rewards/margins": 8.844531059265137, "rewards/rejected": -13.528124809265137, "step": 9030 }, { "epoch": 3.4047820766261885, "grad_norm": 0.5804874471717184, "learning_rate": 1.4887005649717512e-07, "logits/chosen": -2.641406297683716, "logits/rejected": -2.8257813453674316, "logps/chosen": -392.2250061035156, "logps/rejected": -438.82501220703125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -4.2060546875, "rewards/margins": 8.564062118530273, "rewards/rejected": -12.767187118530273, "step": 9040 }, { "epoch": 3.4085474912924787, "grad_norm": 0.8109146806168563, "learning_rate": 1.4792843691148775e-07, "logits/chosen": -2.817578077316284, "logits/rejected": -2.874218702316284, "logps/chosen": -383.38751220703125, "logps/rejected": -439.3500061035156, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.621777534484863, "rewards/margins": 8.564062118530273, "rewards/rejected": -13.181249618530273, "step": 9050 }, { "epoch": 3.4123129059587685, "grad_norm": 1.3633430967791682, "learning_rate": 1.4698681732580036e-07, "logits/chosen": -2.7017579078674316, "logits/rejected": -2.861523389816284, "logps/chosen": -320.79998779296875, "logps/rejected": -395.95001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.195508003234863, "rewards/margins": 8.864843368530273, "rewards/rejected": -13.060937881469727, "step": 9060 }, { "epoch": 3.4160783206250587, "grad_norm": 0.5369131903900564, "learning_rate": 1.46045197740113e-07, "logits/chosen": -2.7632813453674316, "logits/rejected": -2.861523389816284, "logps/chosen": -385.1499938964844, "logps/rejected": -460.1000061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.615234375, "rewards/margins": 8.540624618530273, "rewards/rejected": -13.160937309265137, "step": 9070 }, { "epoch": 3.419843735291349, "grad_norm": 4.210040926127824, "learning_rate": 1.4510357815442563e-07, "logits/chosen": -2.7080078125, "logits/rejected": -2.8003907203674316, "logps/chosen": -390.9750061035156, "logps/rejected": -434.20001220703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -3.8998045921325684, "rewards/margins": 9.118749618530273, "rewards/rejected": -13.024999618530273, "step": 9080 }, { "epoch": 3.423609149957639, "grad_norm": 2.451990218668562, "learning_rate": 1.4416195856873823e-07, "logits/chosen": -2.8042969703674316, "logits/rejected": -2.7769532203674316, "logps/chosen": -355.6000061035156, "logps/rejected": -423.75, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.229296684265137, "rewards/margins": 8.602343559265137, "rewards/rejected": -12.828125, "step": 9090 }, { "epoch": 3.4273745646239293, "grad_norm": 0.27718242938023124, "learning_rate": 1.4322033898305084e-07, "logits/chosen": -2.6810545921325684, "logits/rejected": -2.882031202316284, "logps/chosen": -374.1625061035156, "logps/rejected": -450.92498779296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -4.157055854797363, "rewards/margins": 9.135937690734863, "rewards/rejected": -13.290624618530273, "step": 9100 }, { "epoch": 3.4311399792902195, "grad_norm": 0.9642371299775202, "learning_rate": 1.4227871939736347e-07, "logits/chosen": -2.789843797683716, "logits/rejected": -2.812304735183716, "logps/chosen": -353.375, "logps/rejected": -431.9750061035156, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -4.211816310882568, "rewards/margins": 8.727343559265137, "rewards/rejected": -12.942187309265137, "step": 9110 }, { "epoch": 3.4349053939565093, "grad_norm": 63.077118398820446, "learning_rate": 1.4133709981167608e-07, "logits/chosen": -2.7421875, "logits/rejected": -2.819531202316284, "logps/chosen": -399.3999938964844, "logps/rejected": -474.8500061035156, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.497656345367432, "rewards/margins": 9.50390625, "rewards/rejected": -13.995312690734863, "step": 9120 }, { "epoch": 3.4386708086227995, "grad_norm": 0.8678935633848547, "learning_rate": 1.4039548022598868e-07, "logits/chosen": -2.758593797683716, "logits/rejected": -2.8169922828674316, "logps/chosen": -386.6000061035156, "logps/rejected": -458.54998779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.778515815734863, "rewards/margins": 8.92578125, "rewards/rejected": -13.704687118530273, "step": 9130 }, { "epoch": 3.4424362232890897, "grad_norm": 17.952187992794208, "learning_rate": 1.3945386064030132e-07, "logits/chosen": -2.7685546875, "logits/rejected": -3.020703077316284, "logps/chosen": -381.67498779296875, "logps/rejected": -421.625, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -4.602734565734863, "rewards/margins": 8.77734375, "rewards/rejected": -13.3828125, "step": 9140 }, { "epoch": 3.44620163795538, "grad_norm": 1.989990612423368, "learning_rate": 1.3851224105461392e-07, "logits/chosen": -2.687304735183716, "logits/rejected": -2.787109375, "logps/chosen": -386.48748779296875, "logps/rejected": -454.1000061035156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.568554878234863, "rewards/margins": 9.192187309265137, "rewards/rejected": -13.765625, "step": 9150 }, { "epoch": 3.44996705262167, "grad_norm": 0.9575697247529744, "learning_rate": 1.3757062146892653e-07, "logits/chosen": -2.7095704078674316, "logits/rejected": -2.88671875, "logps/chosen": -364.9125061035156, "logps/rejected": -430.79998779296875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.115038871765137, "rewards/margins": 9.198437690734863, "rewards/rejected": -13.309374809265137, "step": 9160 }, { "epoch": 3.45373246728796, "grad_norm": 3.8339359040611236, "learning_rate": 1.3662900188323916e-07, "logits/chosen": -2.738476514816284, "logits/rejected": -2.8251953125, "logps/chosen": -361.2124938964844, "logps/rejected": -424.54998779296875, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -4.331250190734863, "rewards/margins": 8.646875381469727, "rewards/rejected": -12.96875, "step": 9170 }, { "epoch": 3.45749788195425, "grad_norm": 2.152450341859522, "learning_rate": 1.3568738229755177e-07, "logits/chosen": -2.804882764816284, "logits/rejected": -2.795703172683716, "logps/chosen": -365.23748779296875, "logps/rejected": -426.82501220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.622851371765137, "rewards/margins": 8.986719131469727, "rewards/rejected": -13.615625381469727, "step": 9180 }, { "epoch": 3.4612632966205403, "grad_norm": 3.827748716906089, "learning_rate": 1.3474576271186443e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -2.888671875, "logps/chosen": -379.54998779296875, "logps/rejected": -428.04998779296875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.575781345367432, "rewards/margins": 8.266406059265137, "rewards/rejected": -12.848437309265137, "step": 9190 }, { "epoch": 3.4650287112868305, "grad_norm": 0.44821146297181064, "learning_rate": 1.3380414312617703e-07, "logits/chosen": -2.6412110328674316, "logits/rejected": -2.789843797683716, "logps/chosen": -384.875, "logps/rejected": -434.70001220703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.489062309265137, "rewards/margins": 8.713281631469727, "rewards/rejected": -13.206250190734863, "step": 9200 }, { "epoch": 3.4687941259531208, "grad_norm": 4.471216344778823, "learning_rate": 1.3286252354048964e-07, "logits/chosen": -2.721484422683716, "logits/rejected": -2.7876954078674316, "logps/chosen": -411.86248779296875, "logps/rejected": -443.82501220703125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.188086032867432, "rewards/margins": 8.58984375, "rewards/rejected": -12.779687881469727, "step": 9210 }, { "epoch": 3.4725595406194105, "grad_norm": 0.6271593385721091, "learning_rate": 1.3192090395480227e-07, "logits/chosen": -2.612109422683716, "logits/rejected": -2.684765577316284, "logps/chosen": -416.79998779296875, "logps/rejected": -483.6499938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.87890625, "rewards/margins": 9.079687118530273, "rewards/rejected": -13.957812309265137, "step": 9220 }, { "epoch": 3.4763249552857007, "grad_norm": 1.2876550168439, "learning_rate": 1.3097928436911488e-07, "logits/chosen": -2.8160157203674316, "logits/rejected": -2.791796922683716, "logps/chosen": -388.0, "logps/rejected": -481.67498779296875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -5.785937309265137, "rewards/margins": 9.026562690734863, "rewards/rejected": -14.8203125, "step": 9230 }, { "epoch": 3.480090369951991, "grad_norm": 21.40086301899737, "learning_rate": 1.3003766478342748e-07, "logits/chosen": -2.6207032203674316, "logits/rejected": -2.6986327171325684, "logps/chosen": -412.6499938964844, "logps/rejected": -461.1499938964844, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.664648532867432, "rewards/margins": 8.94140625, "rewards/rejected": -13.610937118530273, "step": 9240 }, { "epoch": 3.483855784618281, "grad_norm": 1.8047361221017713, "learning_rate": 1.2909604519774012e-07, "logits/chosen": -2.7542967796325684, "logits/rejected": -2.8818359375, "logps/chosen": -399.20001220703125, "logps/rejected": -430.8999938964844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.319921970367432, "rewards/margins": 8.763280868530273, "rewards/rejected": -14.079687118530273, "step": 9250 }, { "epoch": 3.4876211992845714, "grad_norm": 1.5663414715284856, "learning_rate": 1.2815442561205272e-07, "logits/chosen": -2.800585985183716, "logits/rejected": -2.927929639816284, "logps/chosen": -389.9750061035156, "logps/rejected": -446.20001220703125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -5.93359375, "rewards/margins": 8.602343559265137, "rewards/rejected": -14.540624618530273, "step": 9260 }, { "epoch": 3.491386613950861, "grad_norm": 0.3264810825759672, "learning_rate": 1.2721280602636533e-07, "logits/chosen": -2.810546875, "logits/rejected": -2.96484375, "logps/chosen": -368.6000061035156, "logps/rejected": -429.7749938964844, "loss": 0.0085, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.174218654632568, "rewards/margins": 8.715624809265137, "rewards/rejected": -13.887499809265137, "step": 9270 }, { "epoch": 3.4951520286171514, "grad_norm": 2.9987183010052423, "learning_rate": 1.2627118644067796e-07, "logits/chosen": -2.792773485183716, "logits/rejected": -2.948046922683716, "logps/chosen": -376.95001220703125, "logps/rejected": -437.32501220703125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.278124809265137, "rewards/margins": 9.104687690734863, "rewards/rejected": -14.3828125, "step": 9280 }, { "epoch": 3.4989174432834416, "grad_norm": 2.601785521384261, "learning_rate": 1.2532956685499057e-07, "logits/chosen": -2.71875, "logits/rejected": -2.9261717796325684, "logps/chosen": -391.95001220703125, "logps/rejected": -442.45001220703125, "loss": 0.0114, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.5361328125, "rewards/margins": 9.164843559265137, "rewards/rejected": -14.709375381469727, "step": 9290 }, { "epoch": 3.5026828579497318, "grad_norm": 17.40737875562753, "learning_rate": 1.243879472693032e-07, "logits/chosen": -2.679492235183716, "logits/rejected": -2.744140625, "logps/chosen": -366.54998779296875, "logps/rejected": -436.70001220703125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -4.594531059265137, "rewards/margins": 8.768750190734863, "rewards/rejected": -13.362500190734863, "step": 9300 }, { "epoch": 3.506448272616022, "grad_norm": 13.431825406751448, "learning_rate": 1.234463276836158e-07, "logits/chosen": -2.7261719703674316, "logits/rejected": -2.7919921875, "logps/chosen": -402.29998779296875, "logps/rejected": -457.8500061035156, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -4.815625190734863, "rewards/margins": 8.638280868530273, "rewards/rejected": -13.456250190734863, "step": 9310 }, { "epoch": 3.510213687282312, "grad_norm": 0.4137577347903259, "learning_rate": 1.2250470809792844e-07, "logits/chosen": -2.7291016578674316, "logits/rejected": -2.7994141578674316, "logps/chosen": -373.17498779296875, "logps/rejected": -436.5, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.428320407867432, "rewards/margins": 8.580469131469727, "rewards/rejected": -14.009374618530273, "step": 9320 }, { "epoch": 3.513979101948602, "grad_norm": 5.317443733913433, "learning_rate": 1.2156308851224105e-07, "logits/chosen": -2.880078077316284, "logits/rejected": -2.8167967796325684, "logps/chosen": -383.3500061035156, "logps/rejected": -447.875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.404296875, "rewards/margins": 8.914843559265137, "rewards/rejected": -13.321874618530273, "step": 9330 }, { "epoch": 3.517744516614892, "grad_norm": 0.7229705691770656, "learning_rate": 1.2062146892655368e-07, "logits/chosen": -2.6767578125, "logits/rejected": -2.782421827316284, "logps/chosen": -448.2250061035156, "logps/rejected": -467.6499938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.525781154632568, "rewards/margins": 9.098437309265137, "rewards/rejected": -13.623437881469727, "step": 9340 }, { "epoch": 3.5215099312811824, "grad_norm": 1.9601449016721042, "learning_rate": 1.1967984934086629e-07, "logits/chosen": -2.843945264816284, "logits/rejected": -2.9986329078674316, "logps/chosen": -391.2749938964844, "logps/rejected": -416.0, "loss": 0.009, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.5654296875, "rewards/margins": 9.103906631469727, "rewards/rejected": -13.661718368530273, "step": 9350 }, { "epoch": 3.5252753459474726, "grad_norm": 1.402724360153286, "learning_rate": 1.187382297551789e-07, "logits/chosen": -2.8091797828674316, "logits/rejected": -2.733203172683716, "logps/chosen": -392.875, "logps/rejected": -479.75, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.823437690734863, "rewards/margins": 9.149999618530273, "rewards/rejected": -14.987500190734863, "step": 9360 }, { "epoch": 3.5290407606137624, "grad_norm": 6.027513947534834, "learning_rate": 1.1779661016949153e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -2.707226514816284, "logps/chosen": -396.625, "logps/rejected": -470.29998779296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.951464653015137, "rewards/margins": 8.803125381469727, "rewards/rejected": -13.753125190734863, "step": 9370 }, { "epoch": 3.5328061752800526, "grad_norm": 0.4823318682428961, "learning_rate": 1.1685499058380414e-07, "logits/chosen": -2.763476610183716, "logits/rejected": -2.8656249046325684, "logps/chosen": -394.5, "logps/rejected": -427.32501220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -5.706250190734863, "rewards/margins": 8.590624809265137, "rewards/rejected": -14.3046875, "step": 9380 }, { "epoch": 3.536571589946343, "grad_norm": 0.7038804772277362, "learning_rate": 1.1591337099811675e-07, "logits/chosen": -2.7339844703674316, "logits/rejected": -2.787890672683716, "logps/chosen": -391.4750061035156, "logps/rejected": -462.5249938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -5.069140434265137, "rewards/margins": 8.75, "rewards/rejected": -13.807031631469727, "step": 9390 }, { "epoch": 3.540337004612633, "grad_norm": 3.575566093163582, "learning_rate": 1.1497175141242937e-07, "logits/chosen": -3.0160155296325684, "logits/rejected": -3.03515625, "logps/chosen": -361.13751220703125, "logps/rejected": -440.20001220703125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -5.1484375, "rewards/margins": 8.836718559265137, "rewards/rejected": -13.989062309265137, "step": 9400 }, { "epoch": 3.544102419278923, "grad_norm": 6.916860620571467, "learning_rate": 1.1403013182674199e-07, "logits/chosen": -2.8583984375, "logits/rejected": -2.9046874046325684, "logps/chosen": -405.25, "logps/rejected": -474.82501220703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -5.239062309265137, "rewards/margins": 8.873437881469727, "rewards/rejected": -14.112500190734863, "step": 9410 }, { "epoch": 3.5478678339452134, "grad_norm": 1.8327338050105106, "learning_rate": 1.1308851224105461e-07, "logits/chosen": -2.7294921875, "logits/rejected": -2.6988282203674316, "logps/chosen": -391.75, "logps/rejected": -485.25, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -5.285742282867432, "rewards/margins": 9.127344131469727, "rewards/rejected": -14.417187690734863, "step": 9420 }, { "epoch": 3.551633248611503, "grad_norm": 0.9878391471155835, "learning_rate": 1.1214689265536723e-07, "logits/chosen": -2.7798829078674316, "logits/rejected": -2.749218702316284, "logps/chosen": -392.4750061035156, "logps/rejected": -478.4750061035156, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.990234375, "rewards/margins": 8.657812118530273, "rewards/rejected": -13.649999618530273, "step": 9430 }, { "epoch": 3.5553986632777934, "grad_norm": 0.17162813018218967, "learning_rate": 1.1120527306967985e-07, "logits/chosen": -2.7392578125, "logits/rejected": -2.9306640625, "logps/chosen": -381.29998779296875, "logps/rejected": -427.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.573046684265137, "rewards/margins": 9.02734375, "rewards/rejected": -13.600000381469727, "step": 9440 }, { "epoch": 3.5591640779440836, "grad_norm": 2.966601240897449, "learning_rate": 1.1026365348399245e-07, "logits/chosen": -2.773242235183716, "logits/rejected": -2.9248046875, "logps/chosen": -373.8999938964844, "logps/rejected": -424.6000061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.349609375, "rewards/margins": 8.912500381469727, "rewards/rejected": -13.264843940734863, "step": 9450 }, { "epoch": 3.562929492610374, "grad_norm": 0.5587053376247754, "learning_rate": 1.0932203389830507e-07, "logits/chosen": -2.6878905296325684, "logits/rejected": -2.919140577316284, "logps/chosen": -395.48748779296875, "logps/rejected": -428.1000061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.698290824890137, "rewards/margins": 9.28125, "rewards/rejected": -13.982812881469727, "step": 9460 }, { "epoch": 3.5666949072766636, "grad_norm": 1.9892296010846549, "learning_rate": 1.083804143126177e-07, "logits/chosen": -2.908398389816284, "logits/rejected": -2.953320264816284, "logps/chosen": -343.5625, "logps/rejected": -403.17498779296875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.66015625, "rewards/margins": 8.802343368530273, "rewards/rejected": -13.456250190734863, "step": 9470 }, { "epoch": 3.570460321942954, "grad_norm": 1.4193435012110778, "learning_rate": 1.0743879472693033e-07, "logits/chosen": -2.8550782203674316, "logits/rejected": -2.8843750953674316, "logps/chosen": -357.0249938964844, "logps/rejected": -426.75, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.599023342132568, "rewards/margins": 9.036718368530273, "rewards/rejected": -13.634374618530273, "step": 9480 }, { "epoch": 3.574225736609244, "grad_norm": 0.9238542450738857, "learning_rate": 1.0649717514124293e-07, "logits/chosen": -2.661425828933716, "logits/rejected": -2.8931641578674316, "logps/chosen": -432.4125061035156, "logps/rejected": -438.07501220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.283593654632568, "rewards/margins": 9.2734375, "rewards/rejected": -13.551562309265137, "step": 9490 }, { "epoch": 3.5779911512755342, "grad_norm": 0.2397817072813676, "learning_rate": 1.0555555555555555e-07, "logits/chosen": -2.8335938453674316, "logits/rejected": -3.019726514816284, "logps/chosen": -370.2749938964844, "logps/rejected": -417.9750061035156, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.621484279632568, "rewards/margins": 8.671093940734863, "rewards/rejected": -13.287500381469727, "step": 9500 }, { "epoch": 3.5817565659418245, "grad_norm": 2.322290081153374, "learning_rate": 1.0461393596986817e-07, "logits/chosen": -2.693359375, "logits/rejected": -2.780078172683716, "logps/chosen": -409.57501220703125, "logps/rejected": -458.2749938964844, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.398241996765137, "rewards/margins": 9.077343940734863, "rewards/rejected": -13.479687690734863, "step": 9510 }, { "epoch": 3.5855219806081147, "grad_norm": 1.6651313670995171, "learning_rate": 1.0367231638418078e-07, "logits/chosen": -2.826953172683716, "logits/rejected": -2.895703077316284, "logps/chosen": -322.7124938964844, "logps/rejected": -421.6499938964844, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -5.18505859375, "rewards/margins": 9.164843559265137, "rewards/rejected": -14.346875190734863, "step": 9520 }, { "epoch": 3.5892873952744044, "grad_norm": 6.51264733454041, "learning_rate": 1.027306967984934e-07, "logits/chosen": -2.6888670921325684, "logits/rejected": -2.777539014816284, "logps/chosen": -344.9750061035156, "logps/rejected": -435.45001220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.601244926452637, "rewards/margins": 8.89453125, "rewards/rejected": -13.50390625, "step": 9530 }, { "epoch": 3.5930528099406946, "grad_norm": 3.18098348668117, "learning_rate": 1.0178907721280603e-07, "logits/chosen": -2.8003907203674316, "logits/rejected": -2.929492235183716, "logps/chosen": -357.29998779296875, "logps/rejected": -406.75, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -4.72265625, "rewards/margins": 8.9140625, "rewards/rejected": -13.637499809265137, "step": 9540 }, { "epoch": 3.596818224606985, "grad_norm": 2.9790264118321272, "learning_rate": 1.0084745762711865e-07, "logits/chosen": -2.735546827316284, "logits/rejected": -3.053515672683716, "logps/chosen": -352.17498779296875, "logps/rejected": -401.1000061035156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -4.2314453125, "rewards/margins": 8.810155868530273, "rewards/rejected": -13.032812118530273, "step": 9550 }, { "epoch": 3.600583639273275, "grad_norm": 3.865106568179669, "learning_rate": 9.990583804143126e-08, "logits/chosen": -2.7201170921325684, "logits/rejected": -2.923632860183716, "logps/chosen": -391.0, "logps/rejected": -451.04998779296875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.73046875, "rewards/margins": 8.861719131469727, "rewards/rejected": -13.595312118530273, "step": 9560 }, { "epoch": 3.604349053939565, "grad_norm": 1.9195687168323776, "learning_rate": 9.896421845574388e-08, "logits/chosen": -2.6646485328674316, "logits/rejected": -2.9369139671325684, "logps/chosen": -374.2124938964844, "logps/rejected": -418.3999938964844, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -4.175244331359863, "rewards/margins": 8.955469131469727, "rewards/rejected": -13.137499809265137, "step": 9570 }, { "epoch": 3.608114468605855, "grad_norm": 0.8745631382074905, "learning_rate": 9.80225988700565e-08, "logits/chosen": -2.78515625, "logits/rejected": -2.917187452316284, "logps/chosen": -367.20001220703125, "logps/rejected": -418.32501220703125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.392870903015137, "rewards/margins": 9.44921875, "rewards/rejected": -13.846875190734863, "step": 9580 }, { "epoch": 3.6118798832721453, "grad_norm": 0.7777306345707964, "learning_rate": 9.70809792843691e-08, "logits/chosen": -2.7115235328674316, "logits/rejected": -2.7496094703674316, "logps/chosen": -412.5, "logps/rejected": -467.20001220703125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.139892578125, "rewards/margins": 9.135156631469727, "rewards/rejected": -13.2734375, "step": 9590 }, { "epoch": 3.6156452979384355, "grad_norm": 4.767920002013949, "learning_rate": 9.613935969868172e-08, "logits/chosen": -2.829296827316284, "logits/rejected": -2.866015672683716, "logps/chosen": -365.20001220703125, "logps/rejected": -418.82501220703125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.508398532867432, "rewards/margins": 8.604687690734863, "rewards/rejected": -13.110937118530273, "step": 9600 }, { "epoch": 3.6194107126047257, "grad_norm": 0.8613173509682894, "learning_rate": 9.519774011299435e-08, "logits/chosen": -2.640625, "logits/rejected": -2.724902391433716, "logps/chosen": -411.92498779296875, "logps/rejected": -453.625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.424023628234863, "rewards/margins": 8.771093368530273, "rewards/rejected": -13.198437690734863, "step": 9610 }, { "epoch": 3.623176127271016, "grad_norm": 1.0088006741062088, "learning_rate": 9.425612052730697e-08, "logits/chosen": -2.646679639816284, "logits/rejected": -2.711132764816284, "logps/chosen": -416.6625061035156, "logps/rejected": -479.5, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -4.682324409484863, "rewards/margins": 9.22265625, "rewards/rejected": -13.899999618530273, "step": 9620 }, { "epoch": 3.626941541937306, "grad_norm": 0.41108433873434924, "learning_rate": 9.331450094161958e-08, "logits/chosen": -2.629687547683716, "logits/rejected": -2.752734422683716, "logps/chosen": -400.70001220703125, "logps/rejected": -460.67498779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -3.902148485183716, "rewards/margins": 8.869531631469727, "rewards/rejected": -12.760156631469727, "step": 9630 }, { "epoch": 3.630706956603596, "grad_norm": 0.4725523507986853, "learning_rate": 9.23728813559322e-08, "logits/chosen": -2.646679639816284, "logits/rejected": -2.7748045921325684, "logps/chosen": -379.51251220703125, "logps/rejected": -438.95001220703125, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -4.704882621765137, "rewards/margins": 9.103906631469727, "rewards/rejected": -13.8046875, "step": 9640 }, { "epoch": 3.634472371269886, "grad_norm": 1.0974995970091987, "learning_rate": 9.143126177024482e-08, "logits/chosen": -2.71875, "logits/rejected": -2.7455077171325684, "logps/chosen": -391.04998779296875, "logps/rejected": -495.5, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -5.234765529632568, "rewards/margins": 9.106249809265137, "rewards/rejected": -14.337499618530273, "step": 9650 }, { "epoch": 3.6382377859361763, "grad_norm": 0.8394282485346493, "learning_rate": 9.048964218455742e-08, "logits/chosen": -2.5130858421325684, "logits/rejected": -2.693359375, "logps/chosen": -426.20001220703125, "logps/rejected": -458.29998779296875, "loss": 0.0088, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.715429782867432, "rewards/margins": 8.678125381469727, "rewards/rejected": -13.395312309265137, "step": 9660 }, { "epoch": 3.642003200602466, "grad_norm": 6.1376472069966015, "learning_rate": 8.954802259887006e-08, "logits/chosen": -2.88671875, "logits/rejected": -2.9244141578674316, "logps/chosen": -364.61248779296875, "logps/rejected": -412.8500061035156, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -5.130102634429932, "rewards/margins": 9.208593368530273, "rewards/rejected": -14.345312118530273, "step": 9670 }, { "epoch": 3.6457686152687563, "grad_norm": 0.8387683180226446, "learning_rate": 8.860640301318268e-08, "logits/chosen": -2.7845702171325684, "logits/rejected": -2.8267579078674316, "logps/chosen": -367.2250061035156, "logps/rejected": -440.29998779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.838476657867432, "rewards/margins": 9.073437690734863, "rewards/rejected": -13.9140625, "step": 9680 }, { "epoch": 3.6495340299350465, "grad_norm": 1.5687798417559868, "learning_rate": 8.766478342749528e-08, "logits/chosen": -2.7630858421325684, "logits/rejected": -2.8265624046325684, "logps/chosen": -387.2250061035156, "logps/rejected": -454.20001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -4.7734375, "rewards/margins": 9.456250190734863, "rewards/rejected": -14.223437309265137, "step": 9690 }, { "epoch": 3.6532994446013367, "grad_norm": 15.848412592194382, "learning_rate": 8.67231638418079e-08, "logits/chosen": -2.6966795921325684, "logits/rejected": -2.844921827316284, "logps/chosen": -362.54998779296875, "logps/rejected": -446.375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.567968845367432, "rewards/margins": 9.26171875, "rewards/rejected": -13.831250190734863, "step": 9700 }, { "epoch": 3.657064859267627, "grad_norm": 2.6622877040949837, "learning_rate": 8.578154425612052e-08, "logits/chosen": -2.624218702316284, "logits/rejected": -2.911914110183716, "logps/chosen": -379.13751220703125, "logps/rejected": -431.3500061035156, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -4.536718845367432, "rewards/margins": 9.1171875, "rewards/rejected": -13.651562690734863, "step": 9710 }, { "epoch": 3.660830273933917, "grad_norm": 0.2895454288415041, "learning_rate": 8.483992467043314e-08, "logits/chosen": -2.7105469703674316, "logits/rejected": -2.8375000953674316, "logps/chosen": -375.8999938964844, "logps/rejected": -450.20001220703125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.6015625, "rewards/margins": 9.057031631469727, "rewards/rejected": -13.660937309265137, "step": 9720 }, { "epoch": 3.6645956886002073, "grad_norm": 0.2312869278917707, "learning_rate": 8.389830508474575e-08, "logits/chosen": -2.811718702316284, "logits/rejected": -2.9232420921325684, "logps/chosen": -405.6625061035156, "logps/rejected": -448.7749938964844, "loss": 0.0139, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.710156440734863, "rewards/margins": 8.975781440734863, "rewards/rejected": -13.6875, "step": 9730 }, { "epoch": 3.668361103266497, "grad_norm": 0.3226790737644192, "learning_rate": 8.295668549905838e-08, "logits/chosen": -2.7718749046325684, "logits/rejected": -2.8597655296325684, "logps/chosen": -384.86248779296875, "logps/rejected": -433.5249938964844, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -4.253320217132568, "rewards/margins": 9.158594131469727, "rewards/rejected": -13.4140625, "step": 9740 }, { "epoch": 3.6721265179327873, "grad_norm": 4.118666770385664, "learning_rate": 8.2015065913371e-08, "logits/chosen": -2.7845702171325684, "logits/rejected": -2.8558592796325684, "logps/chosen": -367.42498779296875, "logps/rejected": -451.6499938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -5.116406440734863, "rewards/margins": 9.292187690734863, "rewards/rejected": -14.407812118530273, "step": 9750 }, { "epoch": 3.6758919325990775, "grad_norm": 5.066299697378162, "learning_rate": 8.107344632768361e-08, "logits/chosen": -2.7347655296325684, "logits/rejected": -2.83984375, "logps/chosen": -392.79998779296875, "logps/rejected": -455.57501220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.8427734375, "rewards/margins": 8.841405868530273, "rewards/rejected": -13.689062118530273, "step": 9760 }, { "epoch": 3.6796573472653678, "grad_norm": 5.873436600633285, "learning_rate": 8.013182674199623e-08, "logits/chosen": -2.8050780296325684, "logits/rejected": -2.9912109375, "logps/chosen": -366.45001220703125, "logps/rejected": -414.1499938964844, "loss": 0.0169, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.013476371765137, "rewards/margins": 8.475781440734863, "rewards/rejected": -13.487500190734863, "step": 9770 }, { "epoch": 3.6834227619316575, "grad_norm": 1.1644952873466454, "learning_rate": 7.919020715630885e-08, "logits/chosen": -2.7607421875, "logits/rejected": -2.7066407203674316, "logps/chosen": -380.1000061035156, "logps/rejected": -460.1499938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.689453125, "rewards/margins": 9.142187118530273, "rewards/rejected": -13.829687118530273, "step": 9780 }, { "epoch": 3.6871881765979477, "grad_norm": 1.1575114866175549, "learning_rate": 7.824858757062147e-08, "logits/chosen": -2.77734375, "logits/rejected": -2.950000047683716, "logps/chosen": -351.82501220703125, "logps/rejected": -395.92498779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -4.158398628234863, "rewards/margins": 8.517187118530273, "rewards/rejected": -12.675000190734863, "step": 9790 }, { "epoch": 3.690953591264238, "grad_norm": 0.27641555218292946, "learning_rate": 7.730696798493408e-08, "logits/chosen": -2.5376954078674316, "logits/rejected": -2.700390577316284, "logps/chosen": -434.7250061035156, "logps/rejected": -464.57501220703125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.036328315734863, "rewards/margins": 8.996874809265137, "rewards/rejected": -13.03125, "step": 9800 }, { "epoch": 3.694719005930528, "grad_norm": 0.4800453474953728, "learning_rate": 7.63653483992467e-08, "logits/chosen": -2.8179688453674316, "logits/rejected": -2.742968797683716, "logps/chosen": -341.8500061035156, "logps/rejected": -427.8999938964844, "loss": 0.0108, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.082617282867432, "rewards/margins": 9.857030868530273, "rewards/rejected": -14.939062118530273, "step": 9810 }, { "epoch": 3.6984844205968184, "grad_norm": 2.905040146088029, "learning_rate": 7.542372881355932e-08, "logits/chosen": -2.744140625, "logits/rejected": -2.7748045921325684, "logps/chosen": -387.2749938964844, "logps/rejected": -433.1000061035156, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -4.808789253234863, "rewards/margins": 9.360937118530273, "rewards/rejected": -14.173437118530273, "step": 9820 }, { "epoch": 3.7022498352631086, "grad_norm": 1.3214296458607337, "learning_rate": 7.448210922787193e-08, "logits/chosen": -2.7236328125, "logits/rejected": -2.9091796875, "logps/chosen": -368.57501220703125, "logps/rejected": -449.1499938964844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.716796875, "rewards/margins": 9.335156440734863, "rewards/rejected": -14.043749809265137, "step": 9830 }, { "epoch": 3.7060152499293983, "grad_norm": 3.703627938773156, "learning_rate": 7.354048964218455e-08, "logits/chosen": -2.83203125, "logits/rejected": -2.825390577316284, "logps/chosen": -370.01251220703125, "logps/rejected": -440.67498779296875, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.124804496765137, "rewards/margins": 9.131250381469727, "rewards/rejected": -13.253125190734863, "step": 9840 }, { "epoch": 3.7097806645956886, "grad_norm": 2.0191965823188553, "learning_rate": 7.259887005649717e-08, "logits/chosen": -2.6753907203674316, "logits/rejected": -2.889453172683716, "logps/chosen": -373.7749938964844, "logps/rejected": -399.5, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -4.588086128234863, "rewards/margins": 8.373437881469727, "rewards/rejected": -12.959375381469727, "step": 9850 }, { "epoch": 3.7135460792619788, "grad_norm": 1.2923428947367195, "learning_rate": 7.16572504708098e-08, "logits/chosen": -2.7867188453674316, "logits/rejected": -3.077343702316284, "logps/chosen": -382.4750061035156, "logps/rejected": -428.9750061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.212304592132568, "rewards/margins": 9.050000190734863, "rewards/rejected": -13.262499809265137, "step": 9860 }, { "epoch": 3.717311493928269, "grad_norm": 3.752799634783009, "learning_rate": 7.071563088512241e-08, "logits/chosen": -2.7652344703674316, "logits/rejected": -2.7691407203674316, "logps/chosen": -391.75, "logps/rejected": -453.04998779296875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -4.613671779632568, "rewards/margins": 8.752344131469727, "rewards/rejected": -13.373437881469727, "step": 9870 }, { "epoch": 3.7210769085945588, "grad_norm": 0.6543777726784893, "learning_rate": 6.977401129943503e-08, "logits/chosen": -2.730664014816284, "logits/rejected": -2.894335985183716, "logps/chosen": -406.8999938964844, "logps/rejected": -421.1000061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.597363471984863, "rewards/margins": 8.729687690734863, "rewards/rejected": -13.321874618530273, "step": 9880 }, { "epoch": 3.724842323260849, "grad_norm": 1.7754300925458888, "learning_rate": 6.883239171374765e-08, "logits/chosen": -2.873046875, "logits/rejected": -2.9361329078674316, "logps/chosen": -339.17498779296875, "logps/rejected": -403.79998779296875, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.127343654632568, "rewards/margins": 8.985156059265137, "rewards/rejected": -13.114062309265137, "step": 9890 }, { "epoch": 3.728607737927139, "grad_norm": 12.575760931979577, "learning_rate": 6.789077212806025e-08, "logits/chosen": -2.700390577316284, "logits/rejected": -2.8837890625, "logps/chosen": -372.32501220703125, "logps/rejected": -452.92498779296875, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.513964653015137, "rewards/margins": 8.9375, "rewards/rejected": -13.456250190734863, "step": 9900 }, { "epoch": 3.7323731525934294, "grad_norm": 2.0838031039043723, "learning_rate": 6.694915254237287e-08, "logits/chosen": -2.630859375, "logits/rejected": -2.8753905296325684, "logps/chosen": -385.625, "logps/rejected": -450.5, "loss": 0.0105, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.830273628234863, "rewards/margins": 8.901562690734863, "rewards/rejected": -13.740625381469727, "step": 9910 }, { "epoch": 3.7361385672597196, "grad_norm": 1.5670678893431638, "learning_rate": 6.60075329566855e-08, "logits/chosen": -2.6753907203674316, "logits/rejected": -2.8199219703674316, "logps/chosen": -388.625, "logps/rejected": -446.04998779296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.546093940734863, "rewards/margins": 9.024218559265137, "rewards/rejected": -13.574999809265137, "step": 9920 }, { "epoch": 3.73990398192601, "grad_norm": 1.4549763328187943, "learning_rate": 6.506591337099811e-08, "logits/chosen": -2.7455077171325684, "logits/rejected": -2.8031249046325684, "logps/chosen": -385.5, "logps/rejected": -458.92498779296875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.658203125, "rewards/margins": 9.108593940734863, "rewards/rejected": -13.765625, "step": 9930 }, { "epoch": 3.7436693965922996, "grad_norm": 2.885871103419611, "learning_rate": 6.412429378531073e-08, "logits/chosen": -2.816601514816284, "logits/rejected": -2.8443360328674316, "logps/chosen": -393.7749938964844, "logps/rejected": -469.04998779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -4.888867378234863, "rewards/margins": 9.200780868530273, "rewards/rejected": -14.084375381469727, "step": 9940 }, { "epoch": 3.74743481125859, "grad_norm": 0.96040591215521, "learning_rate": 6.318267419962335e-08, "logits/chosen": -2.8189454078674316, "logits/rejected": -2.9380860328674316, "logps/chosen": -354.45001220703125, "logps/rejected": -420.95001220703125, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.421338081359863, "rewards/margins": 8.435937881469727, "rewards/rejected": -12.853124618530273, "step": 9950 }, { "epoch": 3.75120022592488, "grad_norm": 4.6267464509981195, "learning_rate": 6.224105461393597e-08, "logits/chosen": -2.8021483421325684, "logits/rejected": -2.9263672828674316, "logps/chosen": -362.04998779296875, "logps/rejected": -437.57501220703125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.312792778015137, "rewards/margins": 9.283594131469727, "rewards/rejected": -13.589062690734863, "step": 9960 }, { "epoch": 3.75496564059117, "grad_norm": 0.7301455768317319, "learning_rate": 6.129943502824859e-08, "logits/chosen": -2.7822265625, "logits/rejected": -2.7796874046325684, "logps/chosen": -389.11248779296875, "logps/rejected": -480.1000061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.739062309265137, "rewards/margins": 8.818750381469727, "rewards/rejected": -13.556249618530273, "step": 9970 }, { "epoch": 3.75873105525746, "grad_norm": 0.740450238587212, "learning_rate": 6.03578154425612e-08, "logits/chosen": -2.917187452316284, "logits/rejected": -3.0355467796325684, "logps/chosen": -372.07501220703125, "logps/rejected": -420.625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.875195503234863, "rewards/margins": 8.380468368530273, "rewards/rejected": -13.254687309265137, "step": 9980 }, { "epoch": 3.76249646992375, "grad_norm": 0.6040358180817501, "learning_rate": 5.9416195856873816e-08, "logits/chosen": -2.6917967796325684, "logits/rejected": -2.8304686546325684, "logps/chosen": -360.0874938964844, "logps/rejected": -420.3500061035156, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -5.037109375, "rewards/margins": 9.055468559265137, "rewards/rejected": -14.09375, "step": 9990 }, { "epoch": 3.7662618845900404, "grad_norm": 0.4225846525025981, "learning_rate": 5.847457627118644e-08, "logits/chosen": -2.747851610183716, "logits/rejected": -2.9443359375, "logps/chosen": -397.82501220703125, "logps/rejected": -440.9750061035156, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -4.260156154632568, "rewards/margins": 9.1640625, "rewards/rejected": -13.428125381469727, "step": 10000 }, { "epoch": 3.7700272992563306, "grad_norm": 18.24761692979666, "learning_rate": 5.7532956685499055e-08, "logits/chosen": -2.649218797683716, "logits/rejected": -2.828906297683716, "logps/chosen": -400.63751220703125, "logps/rejected": -431.3999938964844, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -4.238476753234863, "rewards/margins": 8.776562690734863, "rewards/rejected": -13.004687309265137, "step": 10010 }, { "epoch": 3.773792713922621, "grad_norm": 1.718139661595286, "learning_rate": 5.6591337099811674e-08, "logits/chosen": -2.7646484375, "logits/rejected": -2.907031297683716, "logps/chosen": -361.04998779296875, "logps/rejected": -409.2250061035156, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.3701171875, "rewards/margins": 9.107812881469727, "rewards/rejected": -13.475000381469727, "step": 10020 }, { "epoch": 3.777558128588911, "grad_norm": 1.3583695120484756, "learning_rate": 5.564971751412429e-08, "logits/chosen": -2.8472657203674316, "logits/rejected": -2.9828124046325684, "logps/chosen": -324.3999938964844, "logps/rejected": -405.0, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.531054496765137, "rewards/margins": 8.49609375, "rewards/rejected": -13.021093368530273, "step": 10030 }, { "epoch": 3.781323543255201, "grad_norm": 0.3235305941407165, "learning_rate": 5.4708097928436913e-08, "logits/chosen": -2.780468702316284, "logits/rejected": -2.8414063453674316, "logps/chosen": -395.5249938964844, "logps/rejected": -451.4750061035156, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.780859470367432, "rewards/margins": 8.721875190734863, "rewards/rejected": -13.504687309265137, "step": 10040 }, { "epoch": 3.785088957921491, "grad_norm": 0.6005069314029126, "learning_rate": 5.3766478342749526e-08, "logits/chosen": -2.9134764671325684, "logits/rejected": -2.9095702171325684, "logps/chosen": -383.17498779296875, "logps/rejected": -451.92498779296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.2421875, "rewards/margins": 8.866406440734863, "rewards/rejected": -13.109375, "step": 10050 }, { "epoch": 3.7888543725877812, "grad_norm": 0.7824944757018754, "learning_rate": 5.282485875706214e-08, "logits/chosen": -2.8271484375, "logits/rejected": -2.8707032203674316, "logps/chosen": -369.375, "logps/rejected": -450.79998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.994140625, "rewards/margins": 9.073437690734863, "rewards/rejected": -14.071874618530273, "step": 10060 }, { "epoch": 3.7926197872540715, "grad_norm": 3.0323887554113935, "learning_rate": 5.1883239171374765e-08, "logits/chosen": -2.8130860328674316, "logits/rejected": -2.762890577316284, "logps/chosen": -378.42498779296875, "logps/rejected": -443.3999938964844, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -4.341699123382568, "rewards/margins": 8.922656059265137, "rewards/rejected": -13.262499809265137, "step": 10070 }, { "epoch": 3.796385201920361, "grad_norm": 0.5191624748224248, "learning_rate": 5.094161958568738e-08, "logits/chosen": -2.736328125, "logits/rejected": -2.8880858421325684, "logps/chosen": -371.70001220703125, "logps/rejected": -432.125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.417187690734863, "rewards/margins": 8.93359375, "rewards/rejected": -13.345312118530273, "step": 10080 }, { "epoch": 3.8001506165866514, "grad_norm": 2.6490524037991596, "learning_rate": 5e-08, "logits/chosen": -2.7720704078674316, "logits/rejected": -3.0220704078674316, "logps/chosen": -356.9750061035156, "logps/rejected": -409.45001220703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.181250095367432, "rewards/margins": 9.620312690734863, "rewards/rejected": -13.793749809265137, "step": 10090 }, { "epoch": 3.8039160312529416, "grad_norm": 2.2578746092149276, "learning_rate": 4.905838041431262e-08, "logits/chosen": -2.6693358421325684, "logits/rejected": -2.8128905296325684, "logps/chosen": -393.29998779296875, "logps/rejected": -450.2749938964844, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -4.554296970367432, "rewards/margins": 9.1015625, "rewards/rejected": -13.660937309265137, "step": 10100 }, { "epoch": 3.807681445919232, "grad_norm": 0.583698161301066, "learning_rate": 4.811676082862523e-08, "logits/chosen": -2.8365235328674316, "logits/rejected": -2.876757860183716, "logps/chosen": -369.48748779296875, "logps/rejected": -393.3999938964844, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -4.014843940734863, "rewards/margins": 9.157031059265137, "rewards/rejected": -13.170312881469727, "step": 10110 }, { "epoch": 3.811446860585522, "grad_norm": 1.3001262425275475, "learning_rate": 4.717514124293785e-08, "logits/chosen": -2.8544921875, "logits/rejected": -2.755859375, "logps/chosen": -387.1499938964844, "logps/rejected": -447.75, "loss": 0.0135, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.731347560882568, "rewards/margins": 8.713281631469727, "rewards/rejected": -13.438281059265137, "step": 10120 }, { "epoch": 3.8152122752518123, "grad_norm": 3.6746326502759055, "learning_rate": 4.623352165725047e-08, "logits/chosen": -2.701953172683716, "logits/rejected": -2.7685546875, "logps/chosen": -406.4750061035156, "logps/rejected": -420.3500061035156, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.601171970367432, "rewards/margins": 8.336718559265137, "rewards/rejected": -12.932812690734863, "step": 10130 }, { "epoch": 3.8189776899181025, "grad_norm": 4.242100172669463, "learning_rate": 4.529190207156309e-08, "logits/chosen": -2.6402344703674316, "logits/rejected": -2.792187452316284, "logps/chosen": -420.2250061035156, "logps/rejected": -437.7749938964844, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -5.1591796875, "rewards/margins": 8.866406440734863, "rewards/rejected": -14.0234375, "step": 10140 }, { "epoch": 3.8227431045843923, "grad_norm": 0.7850242802785122, "learning_rate": 4.43502824858757e-08, "logits/chosen": -2.7484374046325684, "logits/rejected": -2.9136719703674316, "logps/chosen": -394.54998779296875, "logps/rejected": -446.54998779296875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -4.907422065734863, "rewards/margins": 8.732030868530273, "rewards/rejected": -13.639062881469727, "step": 10150 }, { "epoch": 3.8265085192506825, "grad_norm": 14.22857292095752, "learning_rate": 4.340866290018833e-08, "logits/chosen": -2.7255859375, "logits/rejected": -2.8873047828674316, "logps/chosen": -379.9750061035156, "logps/rejected": -462.6499938964844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -5.230078220367432, "rewards/margins": 10.085156440734863, "rewards/rejected": -15.3125, "step": 10160 }, { "epoch": 3.8302739339169727, "grad_norm": 0.7343208285528007, "learning_rate": 4.246704331450094e-08, "logits/chosen": -2.79296875, "logits/rejected": -2.959765672683716, "logps/chosen": -380.2124938964844, "logps/rejected": -429.95001220703125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.7900390625, "rewards/margins": 8.948437690734863, "rewards/rejected": -13.745312690734863, "step": 10170 }, { "epoch": 3.834039348583263, "grad_norm": 0.7212575095075667, "learning_rate": 4.1525423728813554e-08, "logits/chosen": -2.700390577316284, "logits/rejected": -2.908203125, "logps/chosen": -388.2875061035156, "logps/rejected": -431.70001220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.6025390625, "rewards/margins": 9.716405868530273, "rewards/rejected": -14.321874618530273, "step": 10180 }, { "epoch": 3.8378047632495527, "grad_norm": 0.8563774822640661, "learning_rate": 4.058380414312618e-08, "logits/chosen": -2.552734375, "logits/rejected": -2.7607421875, "logps/chosen": -414.875, "logps/rejected": -488.29998779296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.693554878234863, "rewards/margins": 9.10546875, "rewards/rejected": -13.796875, "step": 10190 }, { "epoch": 3.841570177915843, "grad_norm": 5.588981164753724, "learning_rate": 3.964218455743879e-08, "logits/chosen": -2.7591795921325684, "logits/rejected": -2.857617139816284, "logps/chosen": -383.95001220703125, "logps/rejected": -443.07501220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -5.398633003234863, "rewards/margins": 8.90625, "rewards/rejected": -14.301562309265137, "step": 10200 }, { "epoch": 3.845335592582133, "grad_norm": 2.8106994841631714, "learning_rate": 3.870056497175141e-08, "logits/chosen": -2.8636717796325684, "logits/rejected": -2.82421875, "logps/chosen": -357.5625, "logps/rejected": -439.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.608007907867432, "rewards/margins": 8.557031631469727, "rewards/rejected": -13.17578125, "step": 10210 }, { "epoch": 3.8491010072484233, "grad_norm": 0.2540016495113583, "learning_rate": 3.775894538606403e-08, "logits/chosen": -2.8208985328674316, "logits/rejected": -2.868945360183716, "logps/chosen": -362.8125, "logps/rejected": -424.8999938964844, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -5.024609565734863, "rewards/margins": 8.55859375, "rewards/rejected": -13.581250190734863, "step": 10220 }, { "epoch": 3.8528664219147135, "grad_norm": 3.4055953395717626, "learning_rate": 3.6817325800376644e-08, "logits/chosen": -2.8294920921325684, "logits/rejected": -2.8753905296325684, "logps/chosen": -339.4750061035156, "logps/rejected": -431.32501220703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.540429592132568, "rewards/margins": 9.212499618530273, "rewards/rejected": -13.754687309265137, "step": 10230 }, { "epoch": 3.8566318365810037, "grad_norm": 4.059093779901179, "learning_rate": 3.5875706214689264e-08, "logits/chosen": -2.7554688453674316, "logits/rejected": -2.8013672828674316, "logps/chosen": -380.07501220703125, "logps/rejected": -443.04998779296875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -4.864843845367432, "rewards/margins": 9.232030868530273, "rewards/rejected": -14.095312118530273, "step": 10240 }, { "epoch": 3.8603972512472935, "grad_norm": 1.1382663937296535, "learning_rate": 3.493408662900188e-08, "logits/chosen": -2.7984375953674316, "logits/rejected": -2.873046875, "logps/chosen": -389.9750061035156, "logps/rejected": -460.1000061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -5.147070407867432, "rewards/margins": 8.827343940734863, "rewards/rejected": -13.975000381469727, "step": 10250 }, { "epoch": 3.8641626659135837, "grad_norm": 3.928011930098091, "learning_rate": 3.39924670433145e-08, "logits/chosen": -2.6402344703674316, "logits/rejected": -2.8636717796325684, "logps/chosen": -445.25, "logps/rejected": -473.2749938964844, "loss": 0.0081, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.805078029632568, "rewards/margins": 8.829687118530273, "rewards/rejected": -13.6328125, "step": 10260 }, { "epoch": 3.867928080579874, "grad_norm": 0.6381335896362382, "learning_rate": 3.3050847457627116e-08, "logits/chosen": -2.7874999046325684, "logits/rejected": -2.836132764816284, "logps/chosen": -352.42498779296875, "logps/rejected": -420.6499938964844, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.561913967132568, "rewards/margins": 8.978906631469727, "rewards/rejected": -13.5390625, "step": 10270 }, { "epoch": 3.871693495246164, "grad_norm": 9.668446395421945, "learning_rate": 3.2109227871939735e-08, "logits/chosen": -2.7867188453674316, "logits/rejected": -2.8587889671325684, "logps/chosen": -366.29998779296875, "logps/rejected": -430.6000061035156, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -4.79296875, "rewards/margins": 8.9609375, "rewards/rejected": -13.759374618530273, "step": 10280 }, { "epoch": 3.875458909912454, "grad_norm": 0.5266719556465279, "learning_rate": 3.1167608286252355e-08, "logits/chosen": -2.804882764816284, "logits/rejected": -2.8775391578674316, "logps/chosen": -382.2250061035156, "logps/rejected": -438.3500061035156, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -4.597851753234863, "rewards/margins": 8.586718559265137, "rewards/rejected": -13.1875, "step": 10290 }, { "epoch": 3.879224324578744, "grad_norm": 0.8890320505200817, "learning_rate": 3.0225988700564974e-08, "logits/chosen": -2.651171922683716, "logits/rejected": -2.802539110183716, "logps/chosen": -408.1625061035156, "logps/rejected": -462.17498779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.345898628234863, "rewards/margins": 8.978124618530273, "rewards/rejected": -13.324999809265137, "step": 10300 }, { "epoch": 3.8829897392450343, "grad_norm": 1.0767228354650977, "learning_rate": 2.928436911487759e-08, "logits/chosen": -2.943554639816284, "logits/rejected": -2.967578172683716, "logps/chosen": -356.625, "logps/rejected": -431.125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.636328220367432, "rewards/margins": 8.961718559265137, "rewards/rejected": -13.595312118530273, "step": 10310 }, { "epoch": 3.8867551539113245, "grad_norm": 2.116740630266629, "learning_rate": 2.8342749529190204e-08, "logits/chosen": -2.730664014816284, "logits/rejected": -2.730664014816284, "logps/chosen": -409.70001220703125, "logps/rejected": -454.5, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -4.952343940734863, "rewards/margins": 8.866406440734863, "rewards/rejected": -13.8203125, "step": 10320 }, { "epoch": 3.8905205685776147, "grad_norm": 1.3777227290206497, "learning_rate": 2.7401129943502823e-08, "logits/chosen": -2.833789110183716, "logits/rejected": -2.927734375, "logps/chosen": -359.0249938964844, "logps/rejected": -429.8999938964844, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -4.790625095367432, "rewards/margins": 9.56640625, "rewards/rejected": -14.362500190734863, "step": 10330 }, { "epoch": 3.894285983243905, "grad_norm": 4.008563121681011, "learning_rate": 2.6459510357815443e-08, "logits/chosen": -2.903125047683716, "logits/rejected": -3.018749952316284, "logps/chosen": -380.3999938964844, "logps/rejected": -416.2250061035156, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -5.065625190734863, "rewards/margins": 8.419530868530273, "rewards/rejected": -13.487500190734863, "step": 10340 }, { "epoch": 3.8980513979101947, "grad_norm": 0.5222535599302425, "learning_rate": 2.551789077212806e-08, "logits/chosen": -2.800976514816284, "logits/rejected": -2.840625047683716, "logps/chosen": -369.29998779296875, "logps/rejected": -445.75, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.461718559265137, "rewards/margins": 9.014062881469727, "rewards/rejected": -13.479687690734863, "step": 10350 }, { "epoch": 3.901816812576485, "grad_norm": 0.7076561959259092, "learning_rate": 2.4576271186440678e-08, "logits/chosen": -2.6439452171325684, "logits/rejected": -2.934765577316284, "logps/chosen": -404.875, "logps/rejected": -406.25, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.840234279632568, "rewards/margins": 8.891406059265137, "rewards/rejected": -13.729687690734863, "step": 10360 }, { "epoch": 3.905582227242775, "grad_norm": 0.3752809321393161, "learning_rate": 2.3634651600753298e-08, "logits/chosen": -2.7925782203674316, "logits/rejected": -3.003124952316284, "logps/chosen": -358.26251220703125, "logps/rejected": -426.45001220703125, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -5.2237548828125, "rewards/margins": 9.110937118530273, "rewards/rejected": -14.3359375, "step": 10370 }, { "epoch": 3.9093476419090654, "grad_norm": 1.210815847918568, "learning_rate": 2.269303201506591e-08, "logits/chosen": -2.7337889671325684, "logits/rejected": -2.7923827171325684, "logps/chosen": -362.6499938964844, "logps/rejected": -452.17498779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -4.930078029632568, "rewards/margins": 9.337499618530273, "rewards/rejected": -14.268750190734863, "step": 10380 }, { "epoch": 3.913113056575355, "grad_norm": 0.6249282975242252, "learning_rate": 2.175141242937853e-08, "logits/chosen": -2.772265672683716, "logits/rejected": -2.938281297683716, "logps/chosen": -378.2749938964844, "logps/rejected": -439.29998779296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -4.690527439117432, "rewards/margins": 8.888280868530273, "rewards/rejected": -13.579687118530273, "step": 10390 }, { "epoch": 3.9168784712416453, "grad_norm": 1.6559053383316784, "learning_rate": 2.0809792843691146e-08, "logits/chosen": -2.7939453125, "logits/rejected": -2.913281202316284, "logps/chosen": -353.1000061035156, "logps/rejected": -424.8999938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -4.650195121765137, "rewards/margins": 9.252344131469727, "rewards/rejected": -13.904687881469727, "step": 10400 }, { "epoch": 3.9206438859079356, "grad_norm": 0.45024229349839656, "learning_rate": 1.9868173258003766e-08, "logits/chosen": -2.7484374046325684, "logits/rejected": -2.921093702316284, "logps/chosen": -409.57501220703125, "logps/rejected": -441.1000061035156, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -4.691821098327637, "rewards/margins": 8.928906440734863, "rewards/rejected": -13.629687309265137, "step": 10410 }, { "epoch": 3.9244093005742258, "grad_norm": 2.570611317088752, "learning_rate": 1.8926553672316385e-08, "logits/chosen": -2.7621092796325684, "logits/rejected": -2.8998045921325684, "logps/chosen": -396.0249938964844, "logps/rejected": -442.8500061035156, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -4.748437404632568, "rewards/margins": 9.409375190734863, "rewards/rejected": -14.154687881469727, "step": 10420 }, { "epoch": 3.928174715240516, "grad_norm": 0.3072492434595627, "learning_rate": 1.7984934086629002e-08, "logits/chosen": -2.7494139671325684, "logits/rejected": -2.8603515625, "logps/chosen": -394.1000061035156, "logps/rejected": -456.70001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -4.51171875, "rewards/margins": 9.987500190734863, "rewards/rejected": -14.504687309265137, "step": 10430 }, { "epoch": 3.931940129906806, "grad_norm": 0.336194361853482, "learning_rate": 1.7043314500941618e-08, "logits/chosen": -2.7734375, "logits/rejected": -2.919726610183716, "logps/chosen": -450.5, "logps/rejected": -445.04998779296875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.841210842132568, "rewards/margins": 8.642187118530273, "rewards/rejected": -13.4921875, "step": 10440 }, { "epoch": 3.935705544573096, "grad_norm": 0.39844715589755425, "learning_rate": 1.6101694915254237e-08, "logits/chosen": -2.6683592796325684, "logits/rejected": -2.88671875, "logps/chosen": -358.3374938964844, "logps/rejected": -430.7250061035156, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -4.332421779632568, "rewards/margins": 9.103906631469727, "rewards/rejected": -13.4296875, "step": 10450 }, { "epoch": 3.939470959239386, "grad_norm": 0.7295720748047662, "learning_rate": 1.5160075329566854e-08, "logits/chosen": -2.73046875, "logits/rejected": -2.8619141578674316, "logps/chosen": -371.8999938964844, "logps/rejected": -451.0249938964844, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -4.494238376617432, "rewards/margins": 9.231249809265137, "rewards/rejected": -13.727343559265137, "step": 10460 }, { "epoch": 3.9432363739056764, "grad_norm": 0.9455581660756751, "learning_rate": 1.4218455743879473e-08, "logits/chosen": -2.753124952316284, "logits/rejected": -2.815624952316284, "logps/chosen": -372.125, "logps/rejected": -439.42498779296875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.909814357757568, "rewards/margins": 9.020312309265137, "rewards/rejected": -13.931249618530273, "step": 10470 }, { "epoch": 3.9470017885719666, "grad_norm": 0.529226994459462, "learning_rate": 1.327683615819209e-08, "logits/chosen": -2.8021483421325684, "logits/rejected": -2.8863282203674316, "logps/chosen": -396.92498779296875, "logps/rejected": -433.3500061035156, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -4.990527153015137, "rewards/margins": 8.864062309265137, "rewards/rejected": -13.860156059265137, "step": 10480 }, { "epoch": 3.9507672032382564, "grad_norm": 5.080143540181525, "learning_rate": 1.2335216572504707e-08, "logits/chosen": -2.6734375953674316, "logits/rejected": -2.8902344703674316, "logps/chosen": -374.1499938964844, "logps/rejected": -429.125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -4.181445121765137, "rewards/margins": 9.328906059265137, "rewards/rejected": -13.5078125, "step": 10490 }, { "epoch": 3.9545326179045466, "grad_norm": 0.7427967347287169, "learning_rate": 1.1393596986817327e-08, "logits/chosen": -2.786328077316284, "logits/rejected": -2.951171875, "logps/chosen": -369.8500061035156, "logps/rejected": -425.79998779296875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -4.546875, "rewards/margins": 8.6953125, "rewards/rejected": -13.25, "step": 10500 }, { "epoch": 3.958298032570837, "grad_norm": 0.27647712815255693, "learning_rate": 1.0451977401129943e-08, "logits/chosen": -2.7798829078674316, "logits/rejected": -2.9375, "logps/chosen": -381.2749938964844, "logps/rejected": -430.4750061035156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.43798828125, "rewards/margins": 8.737500190734863, "rewards/rejected": -13.178125381469727, "step": 10510 }, { "epoch": 3.962063447237127, "grad_norm": 1.4223217977612517, "learning_rate": 9.51035781544256e-09, "logits/chosen": -2.73046875, "logits/rejected": -2.918750047683716, "logps/chosen": -388.7875061035156, "logps/rejected": -440.3999938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.357812404632568, "rewards/margins": 9.358593940734863, "rewards/rejected": -13.71875, "step": 10520 }, { "epoch": 3.965828861903417, "grad_norm": 0.6585380200646702, "learning_rate": 8.568738229755179e-09, "logits/chosen": -2.8128905296325684, "logits/rejected": -2.973437547683716, "logps/chosen": -374.6499938964844, "logps/rejected": -462.29998779296875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -4.677441596984863, "rewards/margins": 9.421875, "rewards/rejected": -14.103124618530273, "step": 10530 }, { "epoch": 3.9695942765697074, "grad_norm": 0.335756621704226, "learning_rate": 7.627118644067796e-09, "logits/chosen": -2.7544922828674316, "logits/rejected": -2.752148389816284, "logps/chosen": -398.8999938964844, "logps/rejected": -449.375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -4.601660251617432, "rewards/margins": 9.287500381469727, "rewards/rejected": -13.887499809265137, "step": 10540 }, { "epoch": 3.9733596912359976, "grad_norm": 3.3406902733690638, "learning_rate": 6.685499058380414e-09, "logits/chosen": -2.702343702316284, "logits/rejected": -2.847460985183716, "logps/chosen": -381.29998779296875, "logps/rejected": -430.7250061035156, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -4.344922065734863, "rewards/margins": 9.009374618530273, "rewards/rejected": -13.3515625, "step": 10550 }, { "epoch": 3.9771251059022874, "grad_norm": 0.6387865429220101, "learning_rate": 5.743879472693031e-09, "logits/chosen": -2.6302733421325684, "logits/rejected": -2.91015625, "logps/chosen": -403.9750061035156, "logps/rejected": -447.2749938964844, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -5.001562595367432, "rewards/margins": 9.049219131469727, "rewards/rejected": -14.050000190734863, "step": 10560 }, { "epoch": 3.9808905205685776, "grad_norm": 0.6419034538543796, "learning_rate": 4.80225988700565e-09, "logits/chosen": -2.753710985183716, "logits/rejected": -2.9203124046325684, "logps/chosen": -383.5375061035156, "logps/rejected": -437.70001220703125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -4.584374904632568, "rewards/margins": 9.380468368530273, "rewards/rejected": -13.964062690734863, "step": 10570 }, { "epoch": 3.984655935234868, "grad_norm": 4.312997607532136, "learning_rate": 3.860640301318267e-09, "logits/chosen": -2.7884764671325684, "logits/rejected": -2.886523485183716, "logps/chosen": -371.1000061035156, "logps/rejected": -453.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -5.285546779632568, "rewards/margins": 9.284375190734863, "rewards/rejected": -14.565625190734863, "step": 10580 }, { "epoch": 3.9884213499011576, "grad_norm": 0.7509798943253133, "learning_rate": 2.9190207156308854e-09, "logits/chosen": -2.7509765625, "logits/rejected": -2.697070360183716, "logps/chosen": -383.3374938964844, "logps/rejected": -491.8999938964844, "loss": 0.0104, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.685156345367432, "rewards/margins": 9.274999618530273, "rewards/rejected": -13.953125, "step": 10590 }, { "epoch": 3.992186764567448, "grad_norm": 39.14373109118196, "learning_rate": 1.977401129943503e-09, "logits/chosen": -2.8734374046325684, "logits/rejected": -3.0162110328674316, "logps/chosen": -339.375, "logps/rejected": -391.9750061035156, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -5.074999809265137, "rewards/margins": 8.693750381469727, "rewards/rejected": -13.771875381469727, "step": 10600 }, { "epoch": 3.995952179233738, "grad_norm": 1.74856701995762, "learning_rate": 1.0357815442561205e-09, "logits/chosen": -2.8525390625, "logits/rejected": -2.923046827316284, "logps/chosen": -361.6499938964844, "logps/rejected": -435.70001220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -4.7802734375, "rewards/margins": 9.228906631469727, "rewards/rejected": -14.003125190734863, "step": 10610 }, { "epoch": 3.9997175939000282, "grad_norm": 1.4273871607756417, "learning_rate": 9.416195856873822e-11, "logits/chosen": -2.947460889816284, "logits/rejected": -2.799023389816284, "logps/chosen": -337.8125, "logps/rejected": -408.0, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -4.723046779632568, "rewards/margins": 8.947656631469727, "rewards/rejected": -13.675000190734863, "step": 10620 } ], "logging_steps": 10, "max_steps": 10620, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }