{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 4556, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00877963125548727, "grad_norm": 109.13970328247358, "learning_rate": 9.980245829675154e-07, "logits/chosen": -2.030468702316284, "logits/rejected": -1.919531226158142, "logps/chosen": -433.3999938964844, "logps/rejected": -303.70001220703125, "loss": 0.6555, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0002197265566792339, "rewards/margins": 0.09664078056812286, "rewards/rejected": -0.09693603217601776, "step": 10 }, { "epoch": 0.01755926251097454, "grad_norm": 94.34540367670509, "learning_rate": 9.958296751536435e-07, "logits/chosen": -2.0238280296325684, "logits/rejected": -1.886328101158142, "logps/chosen": -482.5, "logps/rejected": -309.8500061035156, "loss": 0.5293, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.28374022245407104, "rewards/margins": 0.501757800579071, "rewards/rejected": -0.21784362196922302, "step": 20 }, { "epoch": 0.02633889376646181, "grad_norm": 84.05289271642286, "learning_rate": 9.936347673397717e-07, "logits/chosen": -2.0140624046325684, "logits/rejected": -1.7859375476837158, "logps/chosen": -377.1000061035156, "logps/rejected": -266.32501220703125, "loss": 0.4752, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.7591797113418579, "rewards/margins": 0.7768310308456421, "rewards/rejected": -0.01757202111184597, "step": 30 }, { "epoch": 0.03511852502194908, "grad_norm": 70.26702505015753, "learning_rate": 9.914398595259e-07, "logits/chosen": -1.951171875, "logits/rejected": -1.865234375, "logps/chosen": -364.8999938964844, "logps/rejected": -247.0, "loss": 0.4345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.990429699420929, "rewards/margins": 1.031835913658142, "rewards/rejected": -0.04274902492761612, "step": 40 }, { "epoch": 0.043898156277436345, "grad_norm": 62.654357226046166, "learning_rate": 9.89244951712028e-07, "logits/chosen": -2.075390577316284, "logits/rejected": -1.98046875, "logps/chosen": -431.1499938964844, "logps/rejected": -273.75, "loss": 0.3164, "rewards/accuracies": 0.875, "rewards/chosen": 1.0437500476837158, "rewards/margins": 1.5490233898162842, "rewards/rejected": -0.5049804449081421, "step": 50 }, { "epoch": 0.05267778753292362, "grad_norm": 72.32495506595563, "learning_rate": 9.870500438981562e-07, "logits/chosen": -1.935156226158142, "logits/rejected": -1.8683593273162842, "logps/chosen": -501.0, "logps/rejected": -326.8500061035156, "loss": 0.275, "rewards/accuracies": 0.875, "rewards/chosen": 1.001074194908142, "rewards/margins": 1.902929663658142, "rewards/rejected": -0.9029541015625, "step": 60 }, { "epoch": 0.061457418788410885, "grad_norm": 90.82689626044017, "learning_rate": 9.848551360842844e-07, "logits/chosen": -2.0765624046325684, "logits/rejected": -1.990234375, "logps/chosen": -455.8500061035156, "logps/rejected": -292.8500061035156, "loss": 0.3594, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.942919909954071, "rewards/margins": 1.808203101158142, "rewards/rejected": -0.8660644292831421, "step": 70 }, { "epoch": 0.07023705004389816, "grad_norm": 48.71442518982463, "learning_rate": 9.826602282704126e-07, "logits/chosen": -1.946874976158142, "logits/rejected": -1.884374976158142, "logps/chosen": -441.29998779296875, "logps/rejected": -303.3999938964844, "loss": 0.3085, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.9786132574081421, "rewards/margins": 1.95703125, "rewards/rejected": -0.9785400629043579, "step": 80 }, { "epoch": 0.07901668129938542, "grad_norm": 110.03171960576965, "learning_rate": 9.804653204565408e-07, "logits/chosen": -2.08203125, "logits/rejected": -2.0140624046325684, "logps/chosen": -477.29998779296875, "logps/rejected": -298.6000061035156, "loss": 0.2688, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.789599597454071, "rewards/margins": 2.289843797683716, "rewards/rejected": -1.500390648841858, "step": 90 }, { "epoch": 0.08779631255487269, "grad_norm": 90.10158868954922, "learning_rate": 9.78270412642669e-07, "logits/chosen": -2.047656297683716, "logits/rejected": -2.063281297683716, "logps/chosen": -441.04998779296875, "logps/rejected": -264.3500061035156, "loss": 0.2648, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.027929663658142, "rewards/margins": 2.478710889816284, "rewards/rejected": -1.4507324695587158, "step": 100 }, { "epoch": 0.09657594381035997, "grad_norm": 63.29128281876568, "learning_rate": 9.760755048287971e-07, "logits/chosen": -2.0367188453674316, "logits/rejected": -1.9226562976837158, "logps/chosen": -435.79998779296875, "logps/rejected": -316.75, "loss": 0.3114, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.017480492591858, "rewards/margins": 2.360156297683716, "rewards/rejected": -1.344335913658142, "step": 110 }, { "epoch": 0.10535557506584724, "grad_norm": 82.60117425160554, "learning_rate": 9.738805970149253e-07, "logits/chosen": -2.029296875, "logits/rejected": -2.008984327316284, "logps/chosen": -460.75, "logps/rejected": -306.20001220703125, "loss": 0.24, "rewards/accuracies": 0.90625, "rewards/chosen": 1.074609398841858, "rewards/margins": 2.743359327316284, "rewards/rejected": -1.669921875, "step": 120 }, { "epoch": 0.1141352063213345, "grad_norm": 75.6782650414838, "learning_rate": 9.716856892010535e-07, "logits/chosen": -2.0703125, "logits/rejected": -2.0328125953674316, "logps/chosen": -462.25, "logps/rejected": -339.0, "loss": 0.3127, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.9510253667831421, "rewards/margins": 2.646679639816284, "rewards/rejected": -1.694433569908142, "step": 130 }, { "epoch": 0.12291483757682177, "grad_norm": 85.8365746604714, "learning_rate": 9.694907813871816e-07, "logits/chosen": -1.9753906726837158, "logits/rejected": -1.9874999523162842, "logps/chosen": -428.70001220703125, "logps/rejected": -305.3999938964844, "loss": 0.3302, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.566821277141571, "rewards/margins": 2.488476514816284, "rewards/rejected": -1.921875, "step": 140 }, { "epoch": 0.13169446883230904, "grad_norm": 111.78470030134876, "learning_rate": 9.672958735733098e-07, "logits/chosen": -2.1148438453674316, "logits/rejected": -2.053906202316284, "logps/chosen": -423.3999938964844, "logps/rejected": -276.25, "loss": 0.2385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.37567138671875, "rewards/margins": 2.6773438453674316, "rewards/rejected": -2.303906202316284, "step": 150 }, { "epoch": 0.14047410008779632, "grad_norm": 109.6378007960495, "learning_rate": 9.651009657594382e-07, "logits/chosen": -2.0562500953674316, "logits/rejected": -2.002734422683716, "logps/chosen": -395.6000061035156, "logps/rejected": -296.67498779296875, "loss": 0.2739, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.6416991949081421, "rewards/margins": 2.7640624046325684, "rewards/rejected": -2.121875047683716, "step": 160 }, { "epoch": 0.14925373134328357, "grad_norm": 67.17728743004999, "learning_rate": 9.629060579455661e-07, "logits/chosen": -2.1363282203674316, "logits/rejected": -2.057812452316284, "logps/chosen": -425.5, "logps/rejected": -270.3500061035156, "loss": 0.2559, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.841381847858429, "rewards/margins": 2.6683592796325684, "rewards/rejected": -1.825585961341858, "step": 170 }, { "epoch": 0.15803336259877085, "grad_norm": 94.40359155524563, "learning_rate": 9.607111501316945e-07, "logits/chosen": -2.1382813453674316, "logits/rejected": -2.098437547683716, "logps/chosen": -471.1000061035156, "logps/rejected": -287.95001220703125, "loss": 0.2294, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.947033703327179, "rewards/margins": 2.9976563453674316, "rewards/rejected": -2.0517578125, "step": 180 }, { "epoch": 0.16681299385425813, "grad_norm": 72.03506390577603, "learning_rate": 9.585162423178225e-07, "logits/chosen": -2.045703172683716, "logits/rejected": -2.084765672683716, "logps/chosen": -501.29998779296875, "logps/rejected": -317.1000061035156, "loss": 0.2127, "rewards/accuracies": 0.90625, "rewards/chosen": 0.67205810546875, "rewards/margins": 3.350781202316284, "rewards/rejected": -2.676953077316284, "step": 190 }, { "epoch": 0.17559262510974538, "grad_norm": 93.27955751185753, "learning_rate": 9.563213345039509e-07, "logits/chosen": -2.211718797683716, "logits/rejected": -2.0687499046325684, "logps/chosen": -463.8500061035156, "logps/rejected": -325.3500061035156, "loss": 0.2497, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.2869018614292145, "rewards/margins": 3.149609327316284, "rewards/rejected": -3.4410157203674316, "step": 200 }, { "epoch": 0.18437225636523266, "grad_norm": 77.82334585299272, "learning_rate": 9.541264266900788e-07, "logits/chosen": -2.124218702316284, "logits/rejected": -2.1273436546325684, "logps/chosen": -436.95001220703125, "logps/rejected": -288.70001220703125, "loss": 0.2713, "rewards/accuracies": 0.875, "rewards/chosen": -0.07237549126148224, "rewards/margins": 3.153515577316284, "rewards/rejected": -3.227734327316284, "step": 210 }, { "epoch": 0.19315188762071994, "grad_norm": 21.694976416377187, "learning_rate": 9.519315188762071e-07, "logits/chosen": -2.134765625, "logits/rejected": -2.01171875, "logps/chosen": -459.70001220703125, "logps/rejected": -327.8999938964844, "loss": 0.2954, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.06580200046300888, "rewards/margins": 3.3824219703674316, "rewards/rejected": -3.31640625, "step": 220 }, { "epoch": 0.2019315188762072, "grad_norm": 30.56832606851288, "learning_rate": 9.497366110623354e-07, "logits/chosen": -2.142578125, "logits/rejected": -2.0542969703674316, "logps/chosen": -458.1000061035156, "logps/rejected": -332.29998779296875, "loss": 0.2406, "rewards/accuracies": 0.90625, "rewards/chosen": 0.21712036430835724, "rewards/margins": 3.201171875, "rewards/rejected": -2.986328125, "step": 230 }, { "epoch": 0.21071115013169447, "grad_norm": 30.77078164453209, "learning_rate": 9.475417032484635e-07, "logits/chosen": -2.12109375, "logits/rejected": -1.9968750476837158, "logps/chosen": -446.70001220703125, "logps/rejected": -306.20001220703125, "loss": 0.238, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6296142339706421, "rewards/margins": 3.125781297683716, "rewards/rejected": -2.493359327316284, "step": 240 }, { "epoch": 0.21949078138718173, "grad_norm": 33.56293554427966, "learning_rate": 9.453467954345917e-07, "logits/chosen": -2.2210936546325684, "logits/rejected": -2.1546874046325684, "logps/chosen": -431.70001220703125, "logps/rejected": -295.29998779296875, "loss": 0.2395, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.33845216035842896, "rewards/margins": 3.3363280296325684, "rewards/rejected": -2.9957032203674316, "step": 250 }, { "epoch": 0.228270412642669, "grad_norm": 55.524382065908405, "learning_rate": 9.431518876207198e-07, "logits/chosen": -2.1773438453674316, "logits/rejected": -2.0042967796325684, "logps/chosen": -427.20001220703125, "logps/rejected": -320.70001220703125, "loss": 0.255, "rewards/accuracies": 0.90625, "rewards/chosen": -0.26520997285842896, "rewards/margins": 3.2484374046325684, "rewards/rejected": -3.512500047683716, "step": 260 }, { "epoch": 0.2370500438981563, "grad_norm": 54.01974298101663, "learning_rate": 9.409569798068481e-07, "logits/chosen": -2.1273436546325684, "logits/rejected": -2.111328125, "logps/chosen": -374.75, "logps/rejected": -274.70001220703125, "loss": 0.2185, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.4251342713832855, "rewards/margins": 3.160937547683716, "rewards/rejected": -2.7337889671325684, "step": 270 }, { "epoch": 0.24582967515364354, "grad_norm": 64.18071883073675, "learning_rate": 9.387620719929763e-07, "logits/chosen": -1.9445312023162842, "logits/rejected": -2.003124952316284, "logps/chosen": -430.5, "logps/rejected": -334.75, "loss": 0.2238, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.799877941608429, "rewards/margins": 3.1640625, "rewards/rejected": -2.3648438453674316, "step": 280 }, { "epoch": 0.2546093064091308, "grad_norm": 41.89305555971381, "learning_rate": 9.365671641791044e-07, "logits/chosen": -2.055859327316284, "logits/rejected": -2.1539063453674316, "logps/chosen": -415.29998779296875, "logps/rejected": -249.14999389648438, "loss": 0.2134, "rewards/accuracies": 0.90625, "rewards/chosen": 0.30503541231155396, "rewards/margins": 3.417773485183716, "rewards/rejected": -3.1148438453674316, "step": 290 }, { "epoch": 0.2633889376646181, "grad_norm": 35.25365172838076, "learning_rate": 9.343722563652326e-07, "logits/chosen": -2.0835938453674316, "logits/rejected": -2.063671827316284, "logps/chosen": -478.1000061035156, "logps/rejected": -307.6499938964844, "loss": 0.1894, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.4594482481479645, "rewards/margins": 3.591015577316284, "rewards/rejected": -3.1304688453674316, "step": 300 }, { "epoch": 0.2721685689201054, "grad_norm": 45.43698598182238, "learning_rate": 9.321773485513608e-07, "logits/chosen": -2.1117186546325684, "logits/rejected": -2.143359422683716, "logps/chosen": -392.5, "logps/rejected": -290.6000061035156, "loss": 0.2509, "rewards/accuracies": 0.90625, "rewards/chosen": 0.20820312201976776, "rewards/margins": 3.137890577316284, "rewards/rejected": -2.92578125, "step": 310 }, { "epoch": 0.28094820017559263, "grad_norm": 24.08317849479442, "learning_rate": 9.29982440737489e-07, "logits/chosen": -2.1488280296325684, "logits/rejected": -2.112109422683716, "logps/chosen": -464.6000061035156, "logps/rejected": -310.8500061035156, "loss": 0.1627, "rewards/accuracies": 0.9375, "rewards/chosen": 0.8848632574081421, "rewards/margins": 3.76171875, "rewards/rejected": -2.8753905296325684, "step": 320 }, { "epoch": 0.2897278314310799, "grad_norm": 67.04668935007606, "learning_rate": 9.277875329236171e-07, "logits/chosen": -2.0374999046325684, "logits/rejected": -2.1058592796325684, "logps/chosen": -438.5, "logps/rejected": -311.6000061035156, "loss": 0.1902, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5768798589706421, "rewards/margins": 3.610156297683716, "rewards/rejected": -3.032421827316284, "step": 330 }, { "epoch": 0.29850746268656714, "grad_norm": 105.4845968633654, "learning_rate": 9.255926251097453e-07, "logits/chosen": -2.0992188453674316, "logits/rejected": -2.1871094703674316, "logps/chosen": -420.79998779296875, "logps/rejected": -290.29998779296875, "loss": 0.2165, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.36976319551467896, "rewards/margins": 3.784374952316284, "rewards/rejected": -3.4117188453674316, "step": 340 }, { "epoch": 0.30728709394205445, "grad_norm": 72.50177354695649, "learning_rate": 9.233977172958736e-07, "logits/chosen": -2.2191405296325684, "logits/rejected": -2.1753907203674316, "logps/chosen": -434.8999938964844, "logps/rejected": -325.79998779296875, "loss": 0.1292, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.178955078125, "rewards/margins": 4.399218559265137, "rewards/rejected": -4.219531059265137, "step": 350 }, { "epoch": 0.3160667251975417, "grad_norm": 75.1023432074706, "learning_rate": 9.212028094820017e-07, "logits/chosen": -2.11328125, "logits/rejected": -2.141406297683716, "logps/chosen": -442.3999938964844, "logps/rejected": -273.45001220703125, "loss": 0.172, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5267333984375, "rewards/margins": 4.272656440734863, "rewards/rejected": -3.748046875, "step": 360 }, { "epoch": 0.32484635645302895, "grad_norm": 50.469284265081235, "learning_rate": 9.190079016681299e-07, "logits/chosen": -2.0835938453674316, "logits/rejected": -2.088671922683716, "logps/chosen": -452.3999938964844, "logps/rejected": -303.3500061035156, "loss": 0.2088, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.18303832411766052, "rewards/margins": 4.36328125, "rewards/rejected": -4.1796875, "step": 370 }, { "epoch": 0.33362598770851626, "grad_norm": 16.754396284484763, "learning_rate": 9.16812993854258e-07, "logits/chosen": -2.130859375, "logits/rejected": -2.141406297683716, "logps/chosen": -466.1000061035156, "logps/rejected": -335.54998779296875, "loss": 0.2428, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.03295288234949112, "rewards/margins": 3.80078125, "rewards/rejected": -3.7671875953674316, "step": 380 }, { "epoch": 0.3424056189640035, "grad_norm": 78.8803531723786, "learning_rate": 9.146180860403863e-07, "logits/chosen": -2.076953172683716, "logits/rejected": -2.0972657203674316, "logps/chosen": -440.04998779296875, "logps/rejected": -332.75, "loss": 0.2099, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.579296886920929, "rewards/margins": 4.063281059265137, "rewards/rejected": -3.4820313453674316, "step": 390 }, { "epoch": 0.35118525021949076, "grad_norm": 14.433067656955, "learning_rate": 9.124231782265145e-07, "logits/chosen": -2.079296827316284, "logits/rejected": -2.067578077316284, "logps/chosen": -472.5, "logps/rejected": -318.95001220703125, "loss": 0.1494, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.1320312023162842, "rewards/margins": 4.486718654632568, "rewards/rejected": -3.356250047683716, "step": 400 }, { "epoch": 0.35996488147497807, "grad_norm": 66.59493048581147, "learning_rate": 9.102282704126426e-07, "logits/chosen": -2.1039061546325684, "logits/rejected": -2.1117186546325684, "logps/chosen": -432.3500061035156, "logps/rejected": -323.3999938964844, "loss": 0.282, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.4548583924770355, "rewards/margins": 3.774218797683716, "rewards/rejected": -3.3207030296325684, "step": 410 }, { "epoch": 0.3687445127304653, "grad_norm": 134.96750879284693, "learning_rate": 9.080333625987708e-07, "logits/chosen": -2.1214842796325684, "logits/rejected": -2.075390577316284, "logps/chosen": -430.75, "logps/rejected": -299.54998779296875, "loss": 0.2632, "rewards/accuracies": 0.90625, "rewards/chosen": 0.25281983613967896, "rewards/margins": 4.217187404632568, "rewards/rejected": -3.969531297683716, "step": 420 }, { "epoch": 0.3775241439859526, "grad_norm": 53.570790263433366, "learning_rate": 9.05838454784899e-07, "logits/chosen": -2.1441407203674316, "logits/rejected": -2.1468749046325684, "logps/chosen": -462.6499938964844, "logps/rejected": -325.54998779296875, "loss": 0.1967, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22116699814796448, "rewards/margins": 3.889843702316284, "rewards/rejected": -4.114062309265137, "step": 430 }, { "epoch": 0.3863037752414399, "grad_norm": 57.08040279451106, "learning_rate": 9.036435469710271e-07, "logits/chosen": -2.112109422683716, "logits/rejected": -2.118359327316284, "logps/chosen": -416.5, "logps/rejected": -311.70001220703125, "loss": 0.2331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5199950933456421, "rewards/margins": 4.294531345367432, "rewards/rejected": -4.817968845367432, "step": 440 }, { "epoch": 0.39508340649692714, "grad_norm": 95.43206409640776, "learning_rate": 9.014486391571554e-07, "logits/chosen": -2.216796875, "logits/rejected": -2.139453172683716, "logps/chosen": -439.70001220703125, "logps/rejected": -336.5, "loss": 0.2249, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.24812011420726776, "rewards/margins": 4.46875, "rewards/rejected": -4.710156440734863, "step": 450 }, { "epoch": 0.4038630377524144, "grad_norm": 57.93147205581145, "learning_rate": 8.992537313432835e-07, "logits/chosen": -2.2828125953674316, "logits/rejected": -2.182812452316284, "logps/chosen": -411.29998779296875, "logps/rejected": -306.70001220703125, "loss": 0.2234, "rewards/accuracies": 0.90625, "rewards/chosen": 0.8202148675918579, "rewards/margins": 3.5902342796325684, "rewards/rejected": -2.7685546875, "step": 460 }, { "epoch": 0.41264266900790164, "grad_norm": 92.79683259836257, "learning_rate": 8.970588235294118e-07, "logits/chosen": -2.178906202316284, "logits/rejected": -2.2320313453674316, "logps/chosen": -433.04998779296875, "logps/rejected": -263.95001220703125, "loss": 0.2153, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.9303222894668579, "rewards/margins": 3.6566405296325684, "rewards/rejected": -2.723559617996216, "step": 470 }, { "epoch": 0.42142230026338895, "grad_norm": 32.576034948593, "learning_rate": 8.948639157155398e-07, "logits/chosen": -2.012890577316284, "logits/rejected": -2.1917967796325684, "logps/chosen": -472.29998779296875, "logps/rejected": -313.8500061035156, "loss": 0.1522, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.0063232420943677425, "rewards/margins": 4.173047065734863, "rewards/rejected": -4.177343845367432, "step": 480 }, { "epoch": 0.4302019315188762, "grad_norm": 65.15745347067617, "learning_rate": 8.926690079016681e-07, "logits/chosen": -2.140625, "logits/rejected": -2.1429686546325684, "logps/chosen": -391.29998779296875, "logps/rejected": -326.20001220703125, "loss": 0.1283, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.711108386516571, "rewards/margins": 4.59375, "rewards/rejected": -5.305468559265137, "step": 490 }, { "epoch": 0.43898156277436345, "grad_norm": 97.60751286979364, "learning_rate": 8.904741000877962e-07, "logits/chosen": -2.1156249046325684, "logits/rejected": -2.221484422683716, "logps/chosen": -481.45001220703125, "logps/rejected": -335.20001220703125, "loss": 0.2104, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.555676281452179, "rewards/margins": 4.49609375, "rewards/rejected": -5.053124904632568, "step": 500 }, { "epoch": 0.44776119402985076, "grad_norm": 54.498651704384, "learning_rate": 8.882791922739245e-07, "logits/chosen": -2.060546875, "logits/rejected": -2.046875, "logps/chosen": -517.0999755859375, "logps/rejected": -383.6000061035156, "loss": 0.2556, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.3011230528354645, "rewards/margins": 4.125781059265137, "rewards/rejected": -4.425000190734863, "step": 510 }, { "epoch": 0.456540825285338, "grad_norm": 74.72413097614373, "learning_rate": 8.860842844600526e-07, "logits/chosen": -2.03125, "logits/rejected": -2.072265625, "logps/chosen": -433.6499938964844, "logps/rejected": -318.8500061035156, "loss": 0.2302, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05378418043255806, "rewards/margins": 4.114062309265137, "rewards/rejected": -4.168359279632568, "step": 520 }, { "epoch": 0.46532045654082527, "grad_norm": 38.340095947553, "learning_rate": 8.838893766461808e-07, "logits/chosen": -2.131640672683716, "logits/rejected": -2.0914063453674316, "logps/chosen": -405.3999938964844, "logps/rejected": -278.95001220703125, "loss": 0.1763, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.0306396484375, "rewards/margins": 4.125781059265137, "rewards/rejected": -4.15234375, "step": 530 }, { "epoch": 0.4741000877963126, "grad_norm": 46.83712424011487, "learning_rate": 8.81694468832309e-07, "logits/chosen": -2.0882811546325684, "logits/rejected": -2.0777344703674316, "logps/chosen": -491.70001220703125, "logps/rejected": -329.79998779296875, "loss": 0.1081, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.144866943359375, "rewards/margins": 4.575781345367432, "rewards/rejected": -4.434374809265137, "step": 540 }, { "epoch": 0.4828797190517998, "grad_norm": 83.49552287784984, "learning_rate": 8.794995610184372e-07, "logits/chosen": -2.149218797683716, "logits/rejected": -2.098437547683716, "logps/chosen": -389.95001220703125, "logps/rejected": -314.29998779296875, "loss": 0.1915, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.42718505859375, "rewards/margins": 4.41796875, "rewards/rejected": -4.846875190734863, "step": 550 }, { "epoch": 0.4916593503072871, "grad_norm": 44.480640119351804, "learning_rate": 8.773046532045653e-07, "logits/chosen": -2.104687452316284, "logits/rejected": -2.112109422683716, "logps/chosen": -414.79998779296875, "logps/rejected": -322.3999938964844, "loss": 0.2297, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.07548828423023224, "rewards/margins": 4.655468940734863, "rewards/rejected": -4.583593845367432, "step": 560 }, { "epoch": 0.5004389815627743, "grad_norm": 68.92253975153191, "learning_rate": 8.751097453906936e-07, "logits/chosen": -2.1968750953674316, "logits/rejected": -2.147656202316284, "logps/chosen": -432.25, "logps/rejected": -394.5, "loss": 0.2615, "rewards/accuracies": 0.875, "rewards/chosen": -0.9510253667831421, "rewards/margins": 4.291406154632568, "rewards/rejected": -5.2421875, "step": 570 }, { "epoch": 0.5092186128182616, "grad_norm": 108.15215870599476, "learning_rate": 8.729148375768217e-07, "logits/chosen": -2.1171875, "logits/rejected": -2.160937547683716, "logps/chosen": -450.8999938964844, "logps/rejected": -343.70001220703125, "loss": 0.1849, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.55517578125, "rewards/margins": 4.383593559265137, "rewards/rejected": -5.936718940734863, "step": 580 }, { "epoch": 0.517998244073749, "grad_norm": 15.306293998296644, "learning_rate": 8.7071992976295e-07, "logits/chosen": -2.1265625953674316, "logits/rejected": -2.1500000953674316, "logps/chosen": -471.95001220703125, "logps/rejected": -336.95001220703125, "loss": 0.2001, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.23862305283546448, "rewards/margins": 4.780468940734863, "rewards/rejected": -5.028124809265137, "step": 590 }, { "epoch": 0.5267778753292361, "grad_norm": 66.51668565473275, "learning_rate": 8.68525021949078e-07, "logits/chosen": -2.112109422683716, "logits/rejected": -2.1917967796325684, "logps/chosen": -449.95001220703125, "logps/rejected": -316.8500061035156, "loss": 0.2073, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.45073240995407104, "rewards/margins": 4.410937309265137, "rewards/rejected": -4.859375, "step": 600 }, { "epoch": 0.5355575065847235, "grad_norm": 64.80150769604948, "learning_rate": 8.663301141352063e-07, "logits/chosen": -2.072265625, "logits/rejected": -2.1429686546325684, "logps/chosen": -456.5, "logps/rejected": -331.5, "loss": 0.1895, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0469970703125, "rewards/margins": 4.651562690734863, "rewards/rejected": -4.599999904632568, "step": 610 }, { "epoch": 0.5443371378402108, "grad_norm": 101.53106401670574, "learning_rate": 8.641352063213345e-07, "logits/chosen": -2.223437547683716, "logits/rejected": -2.255859375, "logps/chosen": -395.3999938964844, "logps/rejected": -304.95001220703125, "loss": 0.1795, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.731152355670929, "rewards/margins": 4.413671970367432, "rewards/rejected": -5.14453125, "step": 620 }, { "epoch": 0.553116769095698, "grad_norm": 73.80132473437388, "learning_rate": 8.619402985074626e-07, "logits/chosen": -2.1773438453674316, "logits/rejected": -2.2093749046325684, "logps/chosen": -471.8999938964844, "logps/rejected": -320.0, "loss": 0.1802, "rewards/accuracies": 0.9375, "rewards/chosen": -0.28009033203125, "rewards/margins": 4.625781059265137, "rewards/rejected": -4.90625, "step": 630 }, { "epoch": 0.5618964003511853, "grad_norm": 104.32149056296203, "learning_rate": 8.597453906935908e-07, "logits/chosen": -2.166796922683716, "logits/rejected": -2.1624999046325684, "logps/chosen": -458.79998779296875, "logps/rejected": -333.29998779296875, "loss": 0.1938, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.10056152194738388, "rewards/margins": 4.796093940734863, "rewards/rejected": -4.893750190734863, "step": 640 }, { "epoch": 0.5706760316066726, "grad_norm": 50.95752771225326, "learning_rate": 8.57550482879719e-07, "logits/chosen": -2.178906202316284, "logits/rejected": -2.205078125, "logps/chosen": -467.6000061035156, "logps/rejected": -313.45001220703125, "loss": 0.2282, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.3135742247104645, "rewards/margins": 5.034375190734863, "rewards/rejected": -5.346875190734863, "step": 650 }, { "epoch": 0.5794556628621598, "grad_norm": 71.82484643634139, "learning_rate": 8.553555750658472e-07, "logits/chosen": -2.168750047683716, "logits/rejected": -2.262890577316284, "logps/chosen": -438.70001220703125, "logps/rejected": -336.1000061035156, "loss": 0.1772, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.5922485589981079, "rewards/margins": 4.578906059265137, "rewards/rejected": -5.165625095367432, "step": 660 }, { "epoch": 0.5882352941176471, "grad_norm": 85.27753406534178, "learning_rate": 8.531606672519753e-07, "logits/chosen": -2.216015577316284, "logits/rejected": -2.1796875, "logps/chosen": -466.95001220703125, "logps/rejected": -359.45001220703125, "loss": 0.1464, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.520068347454071, "rewards/margins": 5.157031059265137, "rewards/rejected": -5.673437595367432, "step": 670 }, { "epoch": 0.5970149253731343, "grad_norm": 106.36339279941318, "learning_rate": 8.509657594381035e-07, "logits/chosen": -2.1597657203674316, "logits/rejected": -2.207812547683716, "logps/chosen": -448.0, "logps/rejected": -354.0, "loss": 0.1332, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.391717553138733, "rewards/margins": 4.916406154632568, "rewards/rejected": -6.3046875, "step": 680 }, { "epoch": 0.6057945566286216, "grad_norm": 109.28671964199744, "learning_rate": 8.487708516242318e-07, "logits/chosen": -2.3355469703674316, "logits/rejected": -2.342968702316284, "logps/chosen": -459.8999938964844, "logps/rejected": -347.1499938964844, "loss": 0.2749, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.57513427734375, "rewards/margins": 4.717968940734863, "rewards/rejected": -5.296093940734863, "step": 690 }, { "epoch": 0.6145741878841089, "grad_norm": 95.56702401243868, "learning_rate": 8.465759438103599e-07, "logits/chosen": -2.2109375, "logits/rejected": -2.282421827316284, "logps/chosen": -426.75, "logps/rejected": -296.6000061035156, "loss": 0.246, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.03012695349752903, "rewards/margins": 4.321484565734863, "rewards/rejected": -4.290625095367432, "step": 700 }, { "epoch": 0.6233538191395961, "grad_norm": 113.23103347450649, "learning_rate": 8.443810359964881e-07, "logits/chosen": -2.3062500953674316, "logits/rejected": -2.252734422683716, "logps/chosen": -433.1499938964844, "logps/rejected": -321.75, "loss": 0.2098, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.11279296875, "rewards/margins": 4.678124904632568, "rewards/rejected": -4.792187690734863, "step": 710 }, { "epoch": 0.6321334503950834, "grad_norm": 36.876855762131605, "learning_rate": 8.421861281826162e-07, "logits/chosen": -2.1468749046325684, "logits/rejected": -2.17578125, "logps/chosen": -421.6000061035156, "logps/rejected": -307.6499938964844, "loss": 0.1765, "rewards/accuracies": 0.9375, "rewards/chosen": -0.70428466796875, "rewards/margins": 4.767968654632568, "rewards/rejected": -5.47265625, "step": 720 }, { "epoch": 0.6409130816505707, "grad_norm": 97.63938648530551, "learning_rate": 8.399912203687445e-07, "logits/chosen": -2.181640625, "logits/rejected": -2.1820311546325684, "logps/chosen": -480.1000061035156, "logps/rejected": -339.25, "loss": 0.155, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7850586175918579, "rewards/margins": 5.161718845367432, "rewards/rejected": -5.943749904632568, "step": 730 }, { "epoch": 0.6496927129060579, "grad_norm": 75.71291991172833, "learning_rate": 8.377963125548727e-07, "logits/chosen": -2.098828077316284, "logits/rejected": -2.1371092796325684, "logps/chosen": -439.0, "logps/rejected": -320.79998779296875, "loss": 0.3184, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.5986328125, "rewards/margins": 4.768750190734863, "rewards/rejected": -5.3671875, "step": 740 }, { "epoch": 0.6584723441615452, "grad_norm": 111.53159387834462, "learning_rate": 8.356014047410008e-07, "logits/chosen": -2.115234375, "logits/rejected": -2.1976561546325684, "logps/chosen": -435.29998779296875, "logps/rejected": -317.54998779296875, "loss": 0.3034, "rewards/accuracies": 0.875, "rewards/chosen": -0.20980224013328552, "rewards/margins": 4.228125095367432, "rewards/rejected": -4.436718940734863, "step": 750 }, { "epoch": 0.6672519754170325, "grad_norm": 61.33023200058874, "learning_rate": 8.33406496927129e-07, "logits/chosen": -2.161328077316284, "logits/rejected": -2.1996092796325684, "logps/chosen": -439.3999938964844, "logps/rejected": -315.6499938964844, "loss": 0.1935, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4091750979423523, "rewards/margins": 3.9925780296325684, "rewards/rejected": -4.400781154632568, "step": 760 }, { "epoch": 0.6760316066725197, "grad_norm": 37.83314141749024, "learning_rate": 8.312115891132572e-07, "logits/chosen": -2.27734375, "logits/rejected": -2.2066407203674316, "logps/chosen": -433.45001220703125, "logps/rejected": -318.8999938964844, "loss": 0.1838, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5521606206893921, "rewards/margins": 4.80859375, "rewards/rejected": -5.366406440734863, "step": 770 }, { "epoch": 0.684811237928007, "grad_norm": 69.75077577447426, "learning_rate": 8.290166812993854e-07, "logits/chosen": -2.1675782203674316, "logits/rejected": -2.2613282203674316, "logps/chosen": -456.95001220703125, "logps/rejected": -315.75, "loss": 0.1864, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.14486083388328552, "rewards/margins": 5.21484375, "rewards/rejected": -5.359375, "step": 780 }, { "epoch": 0.6935908691834943, "grad_norm": 90.50776580259476, "learning_rate": 8.268217734855135e-07, "logits/chosen": -2.2406249046325684, "logits/rejected": -2.294921875, "logps/chosen": -472.8999938964844, "logps/rejected": -342.6499938964844, "loss": 0.1657, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.4073547422885895, "rewards/margins": 4.453906059265137, "rewards/rejected": -4.85546875, "step": 790 }, { "epoch": 0.7023705004389815, "grad_norm": 45.30350511046526, "learning_rate": 8.246268656716417e-07, "logits/chosen": -2.313281297683716, "logits/rejected": -2.301562547683716, "logps/chosen": -475.8999938964844, "logps/rejected": -351.1000061035156, "loss": 0.1385, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6451660394668579, "rewards/margins": 4.946875095367432, "rewards/rejected": -5.586718559265137, "step": 800 }, { "epoch": 0.7111501316944688, "grad_norm": 62.7359014369885, "learning_rate": 8.2243195785777e-07, "logits/chosen": -2.2718749046325684, "logits/rejected": -2.317578077316284, "logps/chosen": -471.5, "logps/rejected": -359.25, "loss": 0.1708, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.73858642578125, "rewards/margins": 4.953125, "rewards/rejected": -5.6953125, "step": 810 }, { "epoch": 0.7199297629499561, "grad_norm": 64.39391927677985, "learning_rate": 8.20237050043898e-07, "logits/chosen": -2.265625, "logits/rejected": -2.338671922683716, "logps/chosen": -472.8999938964844, "logps/rejected": -308.8500061035156, "loss": 0.1875, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.84967041015625, "rewards/margins": 4.270312309265137, "rewards/rejected": -5.124218940734863, "step": 820 }, { "epoch": 0.7287093942054433, "grad_norm": 46.31488820683875, "learning_rate": 8.180421422300263e-07, "logits/chosen": -2.2105469703674316, "logits/rejected": -2.2093749046325684, "logps/chosen": -505.6499938964844, "logps/rejected": -330.3500061035156, "loss": 0.2332, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0020751953125, "rewards/margins": 4.325390815734863, "rewards/rejected": -4.324999809265137, "step": 830 }, { "epoch": 0.7374890254609306, "grad_norm": 68.37218163162461, "learning_rate": 8.158472344161544e-07, "logits/chosen": -2.1753907203674316, "logits/rejected": -2.266406297683716, "logps/chosen": -474.20001220703125, "logps/rejected": -354.1499938964844, "loss": 0.1858, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.12888184189796448, "rewards/margins": 4.827343940734863, "rewards/rejected": -4.699999809265137, "step": 840 }, { "epoch": 0.746268656716418, "grad_norm": 70.37051115875163, "learning_rate": 8.136523266022827e-07, "logits/chosen": -2.2289061546325684, "logits/rejected": -2.350781202316284, "logps/chosen": -455.5, "logps/rejected": -324.6499938964844, "loss": 0.1995, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.42094725370407104, "rewards/margins": 4.266406059265137, "rewards/rejected": -4.69140625, "step": 850 }, { "epoch": 0.7550482879719052, "grad_norm": 23.265883596850873, "learning_rate": 8.114574187884108e-07, "logits/chosen": -2.145703077316284, "logits/rejected": -2.197265625, "logps/chosen": -440.3999938964844, "logps/rejected": -330.79998779296875, "loss": 0.1503, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5169433355331421, "rewards/margins": 4.733593940734863, "rewards/rejected": -4.217968940734863, "step": 860 }, { "epoch": 0.7638279192273925, "grad_norm": 39.06071564098125, "learning_rate": 8.09262510974539e-07, "logits/chosen": -2.132031202316284, "logits/rejected": -2.100390672683716, "logps/chosen": -433.1000061035156, "logps/rejected": -332.70001220703125, "loss": 0.182, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.3395141661167145, "rewards/margins": 4.525781154632568, "rewards/rejected": -4.186718940734863, "step": 870 }, { "epoch": 0.7726075504828798, "grad_norm": 58.16494119873086, "learning_rate": 8.070676031606672e-07, "logits/chosen": -2.0980467796325684, "logits/rejected": -2.188281297683716, "logps/chosen": -477.0, "logps/rejected": -340.3999938964844, "loss": 0.1971, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.788586437702179, "rewards/margins": 4.595312595367432, "rewards/rejected": -3.807421922683716, "step": 880 }, { "epoch": 0.781387181738367, "grad_norm": 54.01268795395281, "learning_rate": 8.048726953467954e-07, "logits/chosen": -2.2171874046325684, "logits/rejected": -2.188281297683716, "logps/chosen": -417.3500061035156, "logps/rejected": -315.1000061035156, "loss": 0.186, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.21174316108226776, "rewards/margins": 4.379296779632568, "rewards/rejected": -4.167578220367432, "step": 890 }, { "epoch": 0.7901668129938543, "grad_norm": 27.117656990648012, "learning_rate": 8.026777875329235e-07, "logits/chosen": -2.2152342796325684, "logits/rejected": -2.3082032203674316, "logps/chosen": -422.3500061035156, "logps/rejected": -318.95001220703125, "loss": 0.2889, "rewards/accuracies": 0.90625, "rewards/chosen": -0.11767578125, "rewards/margins": 4.21875, "rewards/rejected": -4.3359375, "step": 900 }, { "epoch": 0.7989464442493416, "grad_norm": 52.013178205321424, "learning_rate": 8.004828797190518e-07, "logits/chosen": -2.166015625, "logits/rejected": -2.127734422683716, "logps/chosen": -420.6000061035156, "logps/rejected": -326.75, "loss": 0.229, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.07258300483226776, "rewards/margins": 4.283593654632568, "rewards/rejected": -4.352343559265137, "step": 910 }, { "epoch": 0.8077260755048288, "grad_norm": 42.8864089493482, "learning_rate": 7.982879719051799e-07, "logits/chosen": -2.176562547683716, "logits/rejected": -2.2035155296325684, "logps/chosen": -516.2999877929688, "logps/rejected": -329.29998779296875, "loss": 0.1272, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08149413764476776, "rewards/margins": 4.810937404632568, "rewards/rejected": -4.896093845367432, "step": 920 }, { "epoch": 0.8165057067603161, "grad_norm": 81.29850288294224, "learning_rate": 7.960930640913082e-07, "logits/chosen": -2.1382813453674316, "logits/rejected": -2.200000047683716, "logps/chosen": -464.8500061035156, "logps/rejected": -386.79998779296875, "loss": 0.1435, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.658398449420929, "rewards/margins": 5.216406345367432, "rewards/rejected": -5.87109375, "step": 930 }, { "epoch": 0.8252853380158033, "grad_norm": 23.07930640174056, "learning_rate": 7.938981562774362e-07, "logits/chosen": -2.2132811546325684, "logits/rejected": -2.2265625, "logps/chosen": -451.3500061035156, "logps/rejected": -341.95001220703125, "loss": 0.172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.641845703125, "rewards/margins": 5.371874809265137, "rewards/rejected": -7.012499809265137, "step": 940 }, { "epoch": 0.8340649692712906, "grad_norm": 115.8755700408629, "learning_rate": 7.917032484635645e-07, "logits/chosen": -2.2152342796325684, "logits/rejected": -2.204296827316284, "logps/chosen": -462.70001220703125, "logps/rejected": -384.8999938964844, "loss": 0.1867, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.431445360183716, "rewards/margins": 5.3515625, "rewards/rejected": -7.782812595367432, "step": 950 }, { "epoch": 0.8428446005267779, "grad_norm": 25.653630387821707, "learning_rate": 7.895083406496926e-07, "logits/chosen": -2.328906297683716, "logits/rejected": -2.303515672683716, "logps/chosen": -490.75, "logps/rejected": -339.6499938964844, "loss": 0.2299, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.1357421875, "rewards/margins": 5.127343654632568, "rewards/rejected": -7.268750190734863, "step": 960 }, { "epoch": 0.8516242317822651, "grad_norm": 95.14751058027227, "learning_rate": 7.873134328358209e-07, "logits/chosen": -2.1656250953674316, "logits/rejected": -2.266406297683716, "logps/chosen": -484.3999938964844, "logps/rejected": -329.70001220703125, "loss": 0.1852, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.1531493663787842, "rewards/margins": 5.088281154632568, "rewards/rejected": -6.245312690734863, "step": 970 }, { "epoch": 0.8604038630377524, "grad_norm": 28.295653106475484, "learning_rate": 7.85118525021949e-07, "logits/chosen": -2.206249952316284, "logits/rejected": -2.283203125, "logps/chosen": -517.25, "logps/rejected": -332.8999938964844, "loss": 0.2181, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5602051019668579, "rewards/margins": 4.998437404632568, "rewards/rejected": -5.55859375, "step": 980 }, { "epoch": 0.8691834942932397, "grad_norm": 115.92989249439722, "learning_rate": 7.829236172080772e-07, "logits/chosen": -2.171875, "logits/rejected": -2.2203125953674316, "logps/chosen": -470.6000061035156, "logps/rejected": -338.5, "loss": 0.1899, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.2669921815395355, "rewards/margins": 4.7734375, "rewards/rejected": -5.040625095367432, "step": 990 }, { "epoch": 0.8779631255487269, "grad_norm": 130.84667844083398, "learning_rate": 7.807287093942054e-07, "logits/chosen": -2.073437452316284, "logits/rejected": -2.165234327316284, "logps/chosen": -475.1000061035156, "logps/rejected": -363.6000061035156, "loss": 0.1733, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.2905029356479645, "rewards/margins": 5.186718940734863, "rewards/rejected": -5.477343559265137, "step": 1000 }, { "epoch": 0.8867427568042142, "grad_norm": 37.77710702099301, "learning_rate": 7.785338015803336e-07, "logits/chosen": -2.264843702316284, "logits/rejected": -2.3125, "logps/chosen": -441.20001220703125, "logps/rejected": -335.3999938964844, "loss": 0.2014, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.19730225205421448, "rewards/margins": 4.876562595367432, "rewards/rejected": -5.076562404632568, "step": 1010 }, { "epoch": 0.8955223880597015, "grad_norm": 30.576564253058173, "learning_rate": 7.763388937664617e-07, "logits/chosen": -2.2066407203674316, "logits/rejected": -2.313281297683716, "logps/chosen": -487.29998779296875, "logps/rejected": -327.1000061035156, "loss": 0.1331, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.11490478366613388, "rewards/margins": 4.994531154632568, "rewards/rejected": -5.107031345367432, "step": 1020 }, { "epoch": 0.9043020193151887, "grad_norm": 57.88214656404599, "learning_rate": 7.7414398595259e-07, "logits/chosen": -2.331249952316284, "logits/rejected": -2.314453125, "logps/chosen": -412.75, "logps/rejected": -349.20001220703125, "loss": 0.1739, "rewards/accuracies": 0.90625, "rewards/chosen": -0.825244128704071, "rewards/margins": 5.032812595367432, "rewards/rejected": -5.860156059265137, "step": 1030 }, { "epoch": 0.913081650570676, "grad_norm": 23.925090656985137, "learning_rate": 7.719490781387181e-07, "logits/chosen": -2.193359375, "logits/rejected": -2.276171922683716, "logps/chosen": -449.0, "logps/rejected": -335.70001220703125, "loss": 0.17, "rewards/accuracies": 0.9375, "rewards/chosen": -0.765881359577179, "rewards/margins": 5.408593654632568, "rewards/rejected": -6.17578125, "step": 1040 }, { "epoch": 0.9218612818261633, "grad_norm": 83.0727283656847, "learning_rate": 7.697541703248464e-07, "logits/chosen": -2.247265577316284, "logits/rejected": -2.362499952316284, "logps/chosen": -452.04998779296875, "logps/rejected": -311.1000061035156, "loss": 0.2202, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.20871582627296448, "rewards/margins": 4.963281154632568, "rewards/rejected": -5.170312404632568, "step": 1050 }, { "epoch": 0.9306409130816505, "grad_norm": 96.27268091601239, "learning_rate": 7.675592625109744e-07, "logits/chosen": -2.1890625953674316, "logits/rejected": -2.270312547683716, "logps/chosen": -452.70001220703125, "logps/rejected": -333.8500061035156, "loss": 0.1036, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.754589855670929, "rewards/margins": 5.392968654632568, "rewards/rejected": -6.146093845367432, "step": 1060 }, { "epoch": 0.9394205443371378, "grad_norm": 91.99247970516416, "learning_rate": 7.653643546971027e-07, "logits/chosen": -2.2914061546325684, "logits/rejected": -2.3394532203674316, "logps/chosen": -438.25, "logps/rejected": -351.75, "loss": 0.2778, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.72076416015625, "rewards/margins": 4.728125095367432, "rewards/rejected": -6.448437690734863, "step": 1070 }, { "epoch": 0.9482001755926251, "grad_norm": 9.422589898632934, "learning_rate": 7.631694468832308e-07, "logits/chosen": -2.285937547683716, "logits/rejected": -2.401562452316284, "logps/chosen": -471.3999938964844, "logps/rejected": -344.95001220703125, "loss": 0.1715, "rewards/accuracies": 0.9375, "rewards/chosen": -1.692724585533142, "rewards/margins": 5.327343940734863, "rewards/rejected": -7.0234375, "step": 1080 }, { "epoch": 0.9569798068481123, "grad_norm": 19.40091159798641, "learning_rate": 7.60974539069359e-07, "logits/chosen": -2.3257813453674316, "logits/rejected": -2.374218702316284, "logps/chosen": -494.70001220703125, "logps/rejected": -354.3999938964844, "loss": 0.1768, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.658764660358429, "rewards/margins": 5.016406059265137, "rewards/rejected": -5.670312404632568, "step": 1090 }, { "epoch": 0.9657594381035997, "grad_norm": 24.834680629028092, "learning_rate": 7.587796312554873e-07, "logits/chosen": -2.194531202316284, "logits/rejected": -2.2320313453674316, "logps/chosen": -450.1000061035156, "logps/rejected": -335.54998779296875, "loss": 0.1382, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.794995129108429, "rewards/margins": 5.7734375, "rewards/rejected": -6.565625190734863, "step": 1100 }, { "epoch": 0.974539069359087, "grad_norm": 109.26386816223865, "learning_rate": 7.565847234416154e-07, "logits/chosen": -2.2054686546325684, "logits/rejected": -2.198046922683716, "logps/chosen": -441.20001220703125, "logps/rejected": -355.6499938964844, "loss": 0.2044, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.019433617591858, "rewards/margins": 5.417187690734863, "rewards/rejected": -6.432812690734863, "step": 1110 }, { "epoch": 0.9833187006145742, "grad_norm": 92.40061486472263, "learning_rate": 7.543898156277437e-07, "logits/chosen": -2.3023438453674316, "logits/rejected": -2.2542967796325684, "logps/chosen": -435.79998779296875, "logps/rejected": -353.29998779296875, "loss": 0.2875, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.7038085460662842, "rewards/margins": 4.735156059265137, "rewards/rejected": -6.439062595367432, "step": 1120 }, { "epoch": 0.9920983318700615, "grad_norm": 24.081621299054948, "learning_rate": 7.521949078138717e-07, "logits/chosen": -2.21484375, "logits/rejected": -2.35546875, "logps/chosen": -475.20001220703125, "logps/rejected": -367.0, "loss": 0.133, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2808716297149658, "rewards/margins": 5.571093559265137, "rewards/rejected": -6.848437309265137, "step": 1130 }, { "epoch": 1.0008779631255487, "grad_norm": 6.542707754283824, "learning_rate": 7.5e-07, "logits/chosen": -2.3101563453674316, "logits/rejected": -2.3359375, "logps/chosen": -491.29998779296875, "logps/rejected": -352.25, "loss": 0.2204, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9757324457168579, "rewards/margins": 5.400000095367432, "rewards/rejected": -6.376562595367432, "step": 1140 }, { "epoch": 1.009657594381036, "grad_norm": 8.22464752681467, "learning_rate": 7.478050921861282e-07, "logits/chosen": -2.3226561546325684, "logits/rejected": -2.3960938453674316, "logps/chosen": -445.0, "logps/rejected": -348.29998779296875, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": -0.8250488042831421, "rewards/margins": 6.173437595367432, "rewards/rejected": -6.996874809265137, "step": 1150 }, { "epoch": 1.0184372256365233, "grad_norm": 41.28518142183959, "learning_rate": 7.456101843722563e-07, "logits/chosen": -2.25390625, "logits/rejected": -2.2808594703674316, "logps/chosen": -423.8500061035156, "logps/rejected": -352.0, "loss": 0.0459, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4247680604457855, "rewards/margins": 6.464062690734863, "rewards/rejected": -6.889062404632568, "step": 1160 }, { "epoch": 1.0272168568920106, "grad_norm": 6.788851461800516, "learning_rate": 7.434152765583845e-07, "logits/chosen": -2.253124952316284, "logits/rejected": -2.298046827316284, "logps/chosen": -467.3999938964844, "logps/rejected": -344.20001220703125, "loss": 0.0384, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.13165283203125, "rewards/margins": 6.7421875, "rewards/rejected": -6.875, "step": 1170 }, { "epoch": 1.035996488147498, "grad_norm": 21.063475879709234, "learning_rate": 7.412203687445126e-07, "logits/chosen": -2.20703125, "logits/rejected": -2.329296827316284, "logps/chosen": -453.29998779296875, "logps/rejected": -335.70001220703125, "loss": 0.0376, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.20769043266773224, "rewards/margins": 7.021874904632568, "rewards/rejected": -6.810937404632568, "step": 1180 }, { "epoch": 1.044776119402985, "grad_norm": 79.63007232998406, "learning_rate": 7.390254609306409e-07, "logits/chosen": -2.280468702316284, "logits/rejected": -2.380859375, "logps/chosen": -430.70001220703125, "logps/rejected": -341.6000061035156, "loss": 0.0512, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20472411811351776, "rewards/margins": 6.78125, "rewards/rejected": -6.576562404632568, "step": 1190 }, { "epoch": 1.0535557506584723, "grad_norm": 25.593289570345775, "learning_rate": 7.368305531167692e-07, "logits/chosen": -2.227734327316284, "logits/rejected": -2.4281249046325684, "logps/chosen": -452.29998779296875, "logps/rejected": -384.0, "loss": 0.0357, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.604321300983429, "rewards/margins": 8.114062309265137, "rewards/rejected": -8.712499618530273, "step": 1200 }, { "epoch": 1.0623353819139596, "grad_norm": 2.353433977029733, "learning_rate": 7.346356453028972e-07, "logits/chosen": -2.303515672683716, "logits/rejected": -2.484375, "logps/chosen": -466.1000061035156, "logps/rejected": -357.45001220703125, "loss": 0.0377, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4786376953125, "rewards/margins": 7.521874904632568, "rewards/rejected": -7.9921875, "step": 1210 }, { "epoch": 1.071115013169447, "grad_norm": 3.5237004339105806, "learning_rate": 7.324407374890255e-07, "logits/chosen": -2.309765577316284, "logits/rejected": -2.4203124046325684, "logps/chosen": -458.70001220703125, "logps/rejected": -345.25, "loss": 0.0287, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2781005799770355, "rewards/margins": 6.956250190734863, "rewards/rejected": -7.228125095367432, "step": 1220 }, { "epoch": 1.0798946444249342, "grad_norm": 14.636595390488218, "learning_rate": 7.302458296751536e-07, "logits/chosen": -2.471874952316284, "logits/rejected": -2.535937547683716, "logps/chosen": -451.0, "logps/rejected": -338.95001220703125, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": -0.637646496295929, "rewards/margins": 7.251562595367432, "rewards/rejected": -7.889062404632568, "step": 1230 }, { "epoch": 1.0886742756804215, "grad_norm": 24.305981968774155, "learning_rate": 7.280509218612819e-07, "logits/chosen": -2.372265577316284, "logits/rejected": -2.434765577316284, "logps/chosen": -438.8999938964844, "logps/rejected": -368.79998779296875, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -1.565893530845642, "rewards/margins": 7.984375, "rewards/rejected": -9.551562309265137, "step": 1240 }, { "epoch": 1.0974539069359086, "grad_norm": 6.908149573647215, "learning_rate": 7.258560140474099e-07, "logits/chosen": -2.4320311546325684, "logits/rejected": -2.5992188453674316, "logps/chosen": -484.79998779296875, "logps/rejected": -365.95001220703125, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1583251953125, "rewards/margins": 8.490625381469727, "rewards/rejected": -9.653124809265137, "step": 1250 }, { "epoch": 1.106233538191396, "grad_norm": 7.488206058151286, "learning_rate": 7.236611062335382e-07, "logits/chosen": -2.481640577316284, "logits/rejected": -2.547656297683716, "logps/chosen": -420.3999938964844, "logps/rejected": -364.8999938964844, "loss": 0.0359, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.4010253846645355, "rewards/margins": 8.009374618530273, "rewards/rejected": -8.410937309265137, "step": 1260 }, { "epoch": 1.1150131694468832, "grad_norm": 62.40763934297754, "learning_rate": 7.214661984196664e-07, "logits/chosen": -2.417187452316284, "logits/rejected": -2.485546827316284, "logps/chosen": -422.8999938964844, "logps/rejected": -380.8999938964844, "loss": 0.0562, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.884228527545929, "rewards/margins": 6.974999904632568, "rewards/rejected": -7.856249809265137, "step": 1270 }, { "epoch": 1.1237928007023705, "grad_norm": 3.1149240594299874, "learning_rate": 7.192712906057946e-07, "logits/chosen": -2.407031297683716, "logits/rejected": -2.5687499046325684, "logps/chosen": -468.3500061035156, "logps/rejected": -362.1499938964844, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.22341307997703552, "rewards/margins": 7.40625, "rewards/rejected": -7.629687309265137, "step": 1280 }, { "epoch": 1.1325724319578578, "grad_norm": 9.019174379006817, "learning_rate": 7.170763827919227e-07, "logits/chosen": -2.330859422683716, "logits/rejected": -2.530468702316284, "logps/chosen": -480.1000061035156, "logps/rejected": -358.1000061035156, "loss": 0.0456, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.103515625, "rewards/margins": 7.824999809265137, "rewards/rejected": -9.931249618530273, "step": 1290 }, { "epoch": 1.1413520632133451, "grad_norm": 51.7296547225416, "learning_rate": 7.148814749780509e-07, "logits/chosen": -2.400390625, "logits/rejected": -2.5234375, "logps/chosen": -423.5, "logps/rejected": -331.8999938964844, "loss": 0.0611, "rewards/accuracies": 0.96875, "rewards/chosen": 0.03999023512005806, "rewards/margins": 7.240624904632568, "rewards/rejected": -7.203125, "step": 1300 }, { "epoch": 1.1501316944688322, "grad_norm": 58.973834938303774, "learning_rate": 7.126865671641791e-07, "logits/chosen": -2.443359375, "logits/rejected": -2.5875000953674316, "logps/chosen": -449.29998779296875, "logps/rejected": -337.20001220703125, "loss": 0.1015, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.162109375, "rewards/margins": 6.690625190734863, "rewards/rejected": -6.845312595367432, "step": 1310 }, { "epoch": 1.1589113257243195, "grad_norm": 10.928685659671538, "learning_rate": 7.104916593503074e-07, "logits/chosen": -2.4921875, "logits/rejected": -2.639843702316284, "logps/chosen": -477.79998779296875, "logps/rejected": -373.45001220703125, "loss": 0.0348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.7756103277206421, "rewards/margins": 7.276562690734863, "rewards/rejected": -8.048437118530273, "step": 1320 }, { "epoch": 1.1676909569798068, "grad_norm": 17.470305548338914, "learning_rate": 7.082967515364354e-07, "logits/chosen": -2.4144530296325684, "logits/rejected": -2.4917969703674316, "logps/chosen": -461.79998779296875, "logps/rejected": -378.0, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6912902593612671, "rewards/margins": 7.7421875, "rewards/rejected": -8.421875, "step": 1330 }, { "epoch": 1.1764705882352942, "grad_norm": 10.471402034890781, "learning_rate": 7.061018437225637e-07, "logits/chosen": -2.36328125, "logits/rejected": -2.473437547683716, "logps/chosen": -430.3999938964844, "logps/rejected": -379.6000061035156, "loss": 0.022, "rewards/accuracies": 1.0, "rewards/chosen": -1.176000952720642, "rewards/margins": 7.246874809265137, "rewards/rejected": -8.426562309265137, "step": 1340 }, { "epoch": 1.1852502194907815, "grad_norm": 7.716569511212797, "learning_rate": 7.039069359086918e-07, "logits/chosen": -2.414843797683716, "logits/rejected": -2.467968702316284, "logps/chosen": -484.04998779296875, "logps/rejected": -368.8999938964844, "loss": 0.0496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.749951183795929, "rewards/margins": 7.645312309265137, "rewards/rejected": -8.393750190734863, "step": 1350 }, { "epoch": 1.1940298507462686, "grad_norm": 7.332905932679851, "learning_rate": 7.0171202809482e-07, "logits/chosen": -2.4296875, "logits/rejected": -2.659374952316284, "logps/chosen": -506.79998779296875, "logps/rejected": -396.70001220703125, "loss": 0.0295, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0792968273162842, "rewards/margins": 7.714062690734863, "rewards/rejected": -8.800000190734863, "step": 1360 }, { "epoch": 1.2028094820017559, "grad_norm": 36.022927358889355, "learning_rate": 6.995171202809481e-07, "logits/chosen": -2.48046875, "logits/rejected": -2.6460938453674316, "logps/chosen": -403.20001220703125, "logps/rejected": -373.75, "loss": 0.0339, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.180517554283142, "rewards/margins": 7.921875, "rewards/rejected": -9.095312118530273, "step": 1370 }, { "epoch": 1.2115891132572432, "grad_norm": 10.415996727273347, "learning_rate": 6.973222124670764e-07, "logits/chosen": -2.440624952316284, "logits/rejected": -2.660937547683716, "logps/chosen": -423.70001220703125, "logps/rejected": -351.95001220703125, "loss": 0.0193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9304443597793579, "rewards/margins": 8.326562881469727, "rewards/rejected": -9.254687309265137, "step": 1380 }, { "epoch": 1.2203687445127305, "grad_norm": 2.6800723062854965, "learning_rate": 6.951273046532046e-07, "logits/chosen": -2.440624952316284, "logits/rejected": -2.5093750953674316, "logps/chosen": -463.1000061035156, "logps/rejected": -376.20001220703125, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8814361691474915, "rewards/margins": 7.842187404632568, "rewards/rejected": -8.7265625, "step": 1390 }, { "epoch": 1.2291483757682178, "grad_norm": 3.333937633914336, "learning_rate": 6.929323968393327e-07, "logits/chosen": -2.477343797683716, "logits/rejected": -2.6484375, "logps/chosen": -434.6000061035156, "logps/rejected": -327.95001220703125, "loss": 0.0239, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.211328148841858, "rewards/margins": 7.989062309265137, "rewards/rejected": -9.1953125, "step": 1400 }, { "epoch": 1.237928007023705, "grad_norm": 7.236800984080168, "learning_rate": 6.907374890254609e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.651171922683716, "logps/chosen": -431.79998779296875, "logps/rejected": -377.20001220703125, "loss": 0.057, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7494139671325684, "rewards/margins": 8.004687309265137, "rewards/rejected": -10.75, "step": 1410 }, { "epoch": 1.2467076382791924, "grad_norm": 89.68578759479496, "learning_rate": 6.885425812115891e-07, "logits/chosen": -2.45703125, "logits/rejected": -2.631640672683716, "logps/chosen": -431.8999938964844, "logps/rejected": -333.5, "loss": 0.0424, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.768823266029358, "rewards/margins": 8.817187309265137, "rewards/rejected": -10.579687118530273, "step": 1420 }, { "epoch": 1.2554872695346795, "grad_norm": 53.9181708740632, "learning_rate": 6.863476733977173e-07, "logits/chosen": -2.473437547683716, "logits/rejected": -2.6742186546325684, "logps/chosen": -445.95001220703125, "logps/rejected": -347.8999938964844, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": 0.28544920682907104, "rewards/margins": 7.209374904632568, "rewards/rejected": -6.915625095367432, "step": 1430 }, { "epoch": 1.2642669007901668, "grad_norm": 36.434854944054194, "learning_rate": 6.841527655838455e-07, "logits/chosen": -2.4273438453674316, "logits/rejected": -2.5718750953674316, "logps/chosen": -393.6499938964844, "logps/rejected": -352.54998779296875, "loss": 0.0599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8592987060546875, "rewards/margins": 7.4375, "rewards/rejected": -8.293749809265137, "step": 1440 }, { "epoch": 1.273046532045654, "grad_norm": 20.67431604894428, "learning_rate": 6.819578577699736e-07, "logits/chosen": -2.5101561546325684, "logits/rejected": -2.667187452316284, "logps/chosen": -483.3999938964844, "logps/rejected": -385.54998779296875, "loss": 0.1111, "rewards/accuracies": 0.96875, "rewards/chosen": -1.441137671470642, "rewards/margins": 8.453125, "rewards/rejected": -9.901562690734863, "step": 1450 }, { "epoch": 1.2818261633011414, "grad_norm": 5.114805909142126, "learning_rate": 6.797629499561019e-07, "logits/chosen": -2.419140577316284, "logits/rejected": -2.6820311546325684, "logps/chosen": -501.1000061035156, "logps/rejected": -376.1000061035156, "loss": 0.0536, "rewards/accuracies": 0.96875, "rewards/chosen": -2.1053709983825684, "rewards/margins": 7.839062690734863, "rewards/rejected": -9.946874618530273, "step": 1460 }, { "epoch": 1.2906057945566287, "grad_norm": 67.90946806886183, "learning_rate": 6.7756804214223e-07, "logits/chosen": -2.391796827316284, "logits/rejected": -2.6578125953674316, "logps/chosen": -495.20001220703125, "logps/rejected": -385.04998779296875, "loss": 0.0534, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4423828125, "rewards/margins": 8.024999618530273, "rewards/rejected": -9.467187881469727, "step": 1470 }, { "epoch": 1.2993854258121158, "grad_norm": 5.842886581660366, "learning_rate": 6.753731343283582e-07, "logits/chosen": -2.4515624046325684, "logits/rejected": -2.633593797683716, "logps/chosen": -466.25, "logps/rejected": -351.45001220703125, "loss": 0.0498, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.8983398675918579, "rewards/margins": 7.671875, "rewards/rejected": -8.564062118530273, "step": 1480 }, { "epoch": 1.308165057067603, "grad_norm": 51.42331842998411, "learning_rate": 6.731782265144864e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.587890625, "logps/chosen": -478.04998779296875, "logps/rejected": -361.8500061035156, "loss": 0.0868, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9744628667831421, "rewards/margins": 7.604687690734863, "rewards/rejected": -8.589062690734863, "step": 1490 }, { "epoch": 1.3169446883230904, "grad_norm": 29.35151098932549, "learning_rate": 6.709833187006146e-07, "logits/chosen": -2.450390577316284, "logits/rejected": -2.62890625, "logps/chosen": -452.3999938964844, "logps/rejected": -376.5, "loss": 0.0501, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2184326648712158, "rewards/margins": 7.607812404632568, "rewards/rejected": -8.829687118530273, "step": 1500 }, { "epoch": 1.3257243195785777, "grad_norm": 70.94386203474886, "learning_rate": 6.687884108867427e-07, "logits/chosen": -2.520312547683716, "logits/rejected": -2.586718797683716, "logps/chosen": -424.70001220703125, "logps/rejected": -373.3999938964844, "loss": 0.0401, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.43011474609375, "rewards/margins": 8.6015625, "rewards/rejected": -10.028124809265137, "step": 1510 }, { "epoch": 1.334503950834065, "grad_norm": 7.153019837799803, "learning_rate": 6.665935030728709e-07, "logits/chosen": -2.5101561546325684, "logits/rejected": -2.599609375, "logps/chosen": -476.45001220703125, "logps/rejected": -377.29998779296875, "loss": 0.0388, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.012719750404358, "rewards/margins": 8.139062881469727, "rewards/rejected": -9.149999618530273, "step": 1520 }, { "epoch": 1.3432835820895521, "grad_norm": 12.472620043787545, "learning_rate": 6.643985952589991e-07, "logits/chosen": -2.526562452316284, "logits/rejected": -2.586718797683716, "logps/chosen": -453.3999938964844, "logps/rejected": -389.04998779296875, "loss": 0.0622, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.970898449420929, "rewards/margins": 7.5078125, "rewards/rejected": -8.485937118530273, "step": 1530 }, { "epoch": 1.3520632133450394, "grad_norm": 34.93917929216004, "learning_rate": 6.622036874451273e-07, "logits/chosen": -2.4429688453674316, "logits/rejected": -2.542187452316284, "logps/chosen": -462.1000061035156, "logps/rejected": -384.1499938964844, "loss": 0.0327, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6491333246231079, "rewards/margins": 7.932812690734863, "rewards/rejected": -8.582812309265137, "step": 1540 }, { "epoch": 1.3608428446005267, "grad_norm": 31.69065386752026, "learning_rate": 6.600087796312554e-07, "logits/chosen": -2.530468702316284, "logits/rejected": -2.643749952316284, "logps/chosen": -428.0, "logps/rejected": -347.04998779296875, "loss": 0.0812, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.160363793373108, "rewards/margins": 7.675000190734863, "rewards/rejected": -8.839062690734863, "step": 1550 }, { "epoch": 1.369622475856014, "grad_norm": 37.12700415966545, "learning_rate": 6.578138718173837e-07, "logits/chosen": -2.553906202316284, "logits/rejected": -2.7085938453674316, "logps/chosen": -477.1499938964844, "logps/rejected": -364.3999938964844, "loss": 0.0502, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.14990234375, "rewards/margins": 7.875, "rewards/rejected": -10.029687881469727, "step": 1560 }, { "epoch": 1.3784021071115014, "grad_norm": 2.414722031121448, "learning_rate": 6.556189640035118e-07, "logits/chosen": -2.5269532203674316, "logits/rejected": -2.6585936546325684, "logps/chosen": -457.79998779296875, "logps/rejected": -397.1000061035156, "loss": 0.0694, "rewards/accuracies": 0.96875, "rewards/chosen": -2.302929639816284, "rewards/margins": 7.3125, "rewards/rejected": -9.612500190734863, "step": 1570 }, { "epoch": 1.3871817383669887, "grad_norm": 19.525703531957156, "learning_rate": 6.534240561896401e-07, "logits/chosen": -2.616406202316284, "logits/rejected": -2.7835936546325684, "logps/chosen": -461.0, "logps/rejected": -383.79998779296875, "loss": 0.0539, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.888281226158142, "rewards/margins": 7.506249904632568, "rewards/rejected": -9.395312309265137, "step": 1580 }, { "epoch": 1.395961369622476, "grad_norm": 2.9624857660743156, "learning_rate": 6.512291483757681e-07, "logits/chosen": -2.538281202316284, "logits/rejected": -2.813281297683716, "logps/chosen": -439.1000061035156, "logps/rejected": -342.29998779296875, "loss": 0.0346, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.653417944908142, "rewards/margins": 7.9453125, "rewards/rejected": -9.598437309265137, "step": 1590 }, { "epoch": 1.404741000877963, "grad_norm": 8.60748646976303, "learning_rate": 6.490342405618964e-07, "logits/chosen": -2.616406202316284, "logits/rejected": -2.72265625, "logps/chosen": -466.25, "logps/rejected": -348.79998779296875, "loss": 0.0614, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.9064209461212158, "rewards/margins": 8.167187690734863, "rewards/rejected": -10.078125, "step": 1600 }, { "epoch": 1.4135206321334504, "grad_norm": 6.861518501910524, "learning_rate": 6.468393327480246e-07, "logits/chosen": -2.447265625, "logits/rejected": -2.660937547683716, "logps/chosen": -444.8999938964844, "logps/rejected": -371.70001220703125, "loss": 0.0209, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.6099609136581421, "rewards/margins": 8.2265625, "rewards/rejected": -8.84375, "step": 1610 }, { "epoch": 1.4223002633889377, "grad_norm": 9.835804113502677, "learning_rate": 6.446444249341528e-07, "logits/chosen": -2.4820313453674316, "logits/rejected": -2.59765625, "logps/chosen": -476.29998779296875, "logps/rejected": -406.79998779296875, "loss": 0.0651, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7592102289199829, "rewards/margins": 8.393750190734863, "rewards/rejected": -9.149999618530273, "step": 1620 }, { "epoch": 1.431079894644425, "grad_norm": 55.43335867914973, "learning_rate": 6.424495171202809e-07, "logits/chosen": -2.457812547683716, "logits/rejected": -2.585156202316284, "logps/chosen": -458.6000061035156, "logps/rejected": -384.1000061035156, "loss": 0.0505, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7320556640625, "rewards/margins": 7.793749809265137, "rewards/rejected": -8.53125, "step": 1630 }, { "epoch": 1.4398595258999123, "grad_norm": 30.97088906696782, "learning_rate": 6.402546093064091e-07, "logits/chosen": -2.5726561546325684, "logits/rejected": -2.6273436546325684, "logps/chosen": -437.8500061035156, "logps/rejected": -370.29998779296875, "loss": 0.0581, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1980469226837158, "rewards/margins": 7.556250095367432, "rewards/rejected": -8.754687309265137, "step": 1640 }, { "epoch": 1.4486391571553994, "grad_norm": 47.70033602734044, "learning_rate": 6.380597014925373e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.6890625953674316, "logps/chosen": -445.20001220703125, "logps/rejected": -371.6000061035156, "loss": 0.0534, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.353124976158142, "rewards/margins": 8.296875, "rewards/rejected": -9.642187118530273, "step": 1650 }, { "epoch": 1.4574187884108867, "grad_norm": 0.3432780004349103, "learning_rate": 6.358647936786655e-07, "logits/chosen": -2.5367188453674316, "logits/rejected": -2.7222657203674316, "logps/chosen": -451.20001220703125, "logps/rejected": -385.1499938964844, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0514647960662842, "rewards/margins": 8.571874618530273, "rewards/rejected": -9.6171875, "step": 1660 }, { "epoch": 1.466198419666374, "grad_norm": 0.6019082419077392, "learning_rate": 6.336698858647936e-07, "logits/chosen": -2.498046875, "logits/rejected": -2.707812547683716, "logps/chosen": -478.75, "logps/rejected": -374.95001220703125, "loss": 0.0508, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.147314429283142, "rewards/margins": 8.162500381469727, "rewards/rejected": -9.3046875, "step": 1670 }, { "epoch": 1.4749780509218613, "grad_norm": 24.064089504214195, "learning_rate": 6.314749780509219e-07, "logits/chosen": -2.473828077316284, "logits/rejected": -2.5875000953674316, "logps/chosen": -436.70001220703125, "logps/rejected": -392.3999938964844, "loss": 0.0591, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1285400390625, "rewards/margins": 8.354687690734863, "rewards/rejected": -10.489062309265137, "step": 1680 }, { "epoch": 1.4837576821773486, "grad_norm": 16.2290064209153, "learning_rate": 6.2928007023705e-07, "logits/chosen": -2.51171875, "logits/rejected": -2.7249999046325684, "logps/chosen": -420.75, "logps/rejected": -354.20001220703125, "loss": 0.0631, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.022113084793091, "rewards/margins": 7.995312690734863, "rewards/rejected": -10.017187118530273, "step": 1690 }, { "epoch": 1.4925373134328357, "grad_norm": 3.229196553799357, "learning_rate": 6.270851624231783e-07, "logits/chosen": -2.6390624046325684, "logits/rejected": -2.8882813453674316, "logps/chosen": -467.3999938964844, "logps/rejected": -371.1499938964844, "loss": 0.0201, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3235716819763184, "rewards/margins": 7.943749904632568, "rewards/rejected": -10.271875381469727, "step": 1700 }, { "epoch": 1.5013169446883232, "grad_norm": 9.188257643019353, "learning_rate": 6.248902546093063e-07, "logits/chosen": -2.526562452316284, "logits/rejected": -2.805468797683716, "logps/chosen": -434.70001220703125, "logps/rejected": -361.75, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5238280296325684, "rewards/margins": 7.942187309265137, "rewards/rejected": -10.471875190734863, "step": 1710 }, { "epoch": 1.5100965759438103, "grad_norm": 29.26452195738192, "learning_rate": 6.226953467954346e-07, "logits/chosen": -2.541015625, "logits/rejected": -2.875, "logps/chosen": -426.1000061035156, "logps/rejected": -355.0, "loss": 0.027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9853515625, "rewards/margins": 8.178125381469727, "rewards/rejected": -10.165624618530273, "step": 1720 }, { "epoch": 1.5188762071992976, "grad_norm": 19.205836789188172, "learning_rate": 6.205004389815628e-07, "logits/chosen": -2.590625047683716, "logits/rejected": -2.907031297683716, "logps/chosen": -460.95001220703125, "logps/rejected": -368.0, "loss": 0.0297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.270410180091858, "rewards/margins": 9.171875, "rewards/rejected": -10.434374809265137, "step": 1730 }, { "epoch": 1.527655838454785, "grad_norm": 34.625243306663386, "learning_rate": 6.18305531167691e-07, "logits/chosen": -2.589062452316284, "logits/rejected": -2.7984375953674316, "logps/chosen": -440.70001220703125, "logps/rejected": -359.20001220703125, "loss": 0.0908, "rewards/accuracies": 0.96875, "rewards/chosen": -1.383886694908142, "rewards/margins": 8.3046875, "rewards/rejected": -9.681249618530273, "step": 1740 }, { "epoch": 1.536435469710272, "grad_norm": 2.9213182931390484, "learning_rate": 6.161106233538191e-07, "logits/chosen": -2.582812547683716, "logits/rejected": -2.809375047683716, "logps/chosen": -479.5, "logps/rejected": -404.04998779296875, "loss": 0.0387, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9427734613418579, "rewards/margins": 8.5859375, "rewards/rejected": -9.524999618530273, "step": 1750 }, { "epoch": 1.5452151009657595, "grad_norm": 68.38428797919153, "learning_rate": 6.139157155399473e-07, "logits/chosen": -2.579296827316284, "logits/rejected": -2.831249952316284, "logps/chosen": -397.1000061035156, "logps/rejected": -356.8500061035156, "loss": 0.0485, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.036279320716858, "rewards/margins": 7.775000095367432, "rewards/rejected": -8.817187309265137, "step": 1760 }, { "epoch": 1.5539947322212466, "grad_norm": 1.0331151816332567, "learning_rate": 6.117208077260755e-07, "logits/chosen": -2.587109327316284, "logits/rejected": -2.7484374046325684, "logps/chosen": -428.3999938964844, "logps/rejected": -368.29998779296875, "loss": 0.0768, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.949902355670929, "rewards/margins": 7.314062595367432, "rewards/rejected": -8.260937690734863, "step": 1770 }, { "epoch": 1.562774363476734, "grad_norm": 82.7667900525252, "learning_rate": 6.095258999122037e-07, "logits/chosen": -2.481250047683716, "logits/rejected": -2.7109375, "logps/chosen": -439.79998779296875, "logps/rejected": -377.04998779296875, "loss": 0.0503, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.32332152128219604, "rewards/margins": 8.734375, "rewards/rejected": -9.060937881469727, "step": 1780 }, { "epoch": 1.5715539947322212, "grad_norm": 28.160790924621615, "learning_rate": 6.073309920983318e-07, "logits/chosen": -2.498046875, "logits/rejected": -2.690624952316284, "logps/chosen": -462.20001220703125, "logps/rejected": -377.8999938964844, "loss": 0.0719, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.471777319908142, "rewards/margins": 8.529687881469727, "rewards/rejected": -9.998437881469727, "step": 1790 }, { "epoch": 1.5803336259877085, "grad_norm": 13.087605043884519, "learning_rate": 6.051360842844601e-07, "logits/chosen": -2.686718702316284, "logits/rejected": -2.9164061546325684, "logps/chosen": -460.29998779296875, "logps/rejected": -366.75, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2251954078674316, "rewards/margins": 8.2109375, "rewards/rejected": -10.446874618530273, "step": 1800 }, { "epoch": 1.5891132572431959, "grad_norm": 82.74475452839468, "learning_rate": 6.029411764705882e-07, "logits/chosen": -2.6402344703674316, "logits/rejected": -2.848437547683716, "logps/chosen": -443.3500061035156, "logps/rejected": -392.1000061035156, "loss": 0.0368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8682129383087158, "rewards/margins": 8.699999809265137, "rewards/rejected": -10.578125, "step": 1810 }, { "epoch": 1.597892888498683, "grad_norm": 51.98127865086831, "learning_rate": 6.007462686567164e-07, "logits/chosen": -2.5582032203674316, "logits/rejected": -2.913281202316284, "logps/chosen": -467.75, "logps/rejected": -381.0, "loss": 0.0429, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9124755859375, "rewards/margins": 8.715624809265137, "rewards/rejected": -10.629687309265137, "step": 1820 }, { "epoch": 1.6066725197541705, "grad_norm": 13.687291910833912, "learning_rate": 5.985513608428445e-07, "logits/chosen": -2.706249952316284, "logits/rejected": -2.821093797683716, "logps/chosen": -465.54998779296875, "logps/rejected": -404.3999938964844, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.164697289466858, "rewards/margins": 8.839062690734863, "rewards/rejected": -10.0078125, "step": 1830 }, { "epoch": 1.6154521510096576, "grad_norm": 3.141632413942404, "learning_rate": 5.963564530289728e-07, "logits/chosen": -2.7421875, "logits/rejected": -2.9671874046325684, "logps/chosen": -451.95001220703125, "logps/rejected": -354.6000061035156, "loss": 0.0487, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1751708984375, "rewards/margins": 8.1796875, "rewards/rejected": -9.3515625, "step": 1840 }, { "epoch": 1.6242317822651449, "grad_norm": 40.215753609476586, "learning_rate": 5.94161545215101e-07, "logits/chosen": -2.473437547683716, "logits/rejected": -2.79296875, "logps/chosen": -446.70001220703125, "logps/rejected": -378.04998779296875, "loss": 0.0354, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3036377429962158, "rewards/margins": 8.565625190734863, "rewards/rejected": -9.868749618530273, "step": 1850 }, { "epoch": 1.6330114135206322, "grad_norm": 121.84868653993065, "learning_rate": 5.919666374012291e-07, "logits/chosen": -2.530468702316284, "logits/rejected": -2.73046875, "logps/chosen": -490.20001220703125, "logps/rejected": -394.25, "loss": 0.0511, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5744994878768921, "rewards/margins": 9.151562690734863, "rewards/rejected": -9.725000381469727, "step": 1860 }, { "epoch": 1.6417910447761193, "grad_norm": 1.4363450893315823, "learning_rate": 5.897717295873573e-07, "logits/chosen": -2.5914063453674316, "logits/rejected": -2.7925782203674316, "logps/chosen": -502.6499938964844, "logps/rejected": -402.29998779296875, "loss": 0.0501, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.456884741783142, "rewards/margins": 8.784375190734863, "rewards/rejected": -10.245312690734863, "step": 1870 }, { "epoch": 1.6505706760316068, "grad_norm": 6.331791348918632, "learning_rate": 5.875768217734855e-07, "logits/chosen": -2.6546874046325684, "logits/rejected": -2.8804688453674316, "logps/chosen": -502.8999938964844, "logps/rejected": -385.04998779296875, "loss": 0.0838, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.12109375, "rewards/margins": 8.404687881469727, "rewards/rejected": -10.53125, "step": 1880 }, { "epoch": 1.6593503072870939, "grad_norm": 30.203770737498132, "learning_rate": 5.853819139596137e-07, "logits/chosen": -2.573437452316284, "logits/rejected": -2.7835936546325684, "logps/chosen": -459.1499938964844, "logps/rejected": -374.3999938964844, "loss": 0.0291, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6189453601837158, "rewards/margins": 8.415624618530273, "rewards/rejected": -10.03125, "step": 1890 }, { "epoch": 1.6681299385425812, "grad_norm": 17.78176630586189, "learning_rate": 5.831870061457419e-07, "logits/chosen": -2.495312452316284, "logits/rejected": -2.623046875, "logps/chosen": -482.29998779296875, "logps/rejected": -361.1000061035156, "loss": 0.0488, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.176123023033142, "rewards/margins": 8.346875190734863, "rewards/rejected": -9.520312309265137, "step": 1900 }, { "epoch": 1.6769095697980685, "grad_norm": 5.365005743269785, "learning_rate": 5.8099209833187e-07, "logits/chosen": -2.4820313453674316, "logits/rejected": -2.6820311546325684, "logps/chosen": -498.1000061035156, "logps/rejected": -376.1000061035156, "loss": 0.0314, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.484277367591858, "rewards/margins": 9.003125190734863, "rewards/rejected": -10.487500190734863, "step": 1910 }, { "epoch": 1.6856892010535558, "grad_norm": 4.911837616919007, "learning_rate": 5.787971905179983e-07, "logits/chosen": -2.551953077316284, "logits/rejected": -2.7515625953674316, "logps/chosen": -507.5, "logps/rejected": -418.3999938964844, "loss": 0.0599, "rewards/accuracies": 0.96875, "rewards/chosen": -1.892431616783142, "rewards/margins": 9.432812690734863, "rewards/rejected": -11.321874618530273, "step": 1920 }, { "epoch": 1.694468832309043, "grad_norm": 48.41935696742479, "learning_rate": 5.766022827041263e-07, "logits/chosen": -2.6695313453674316, "logits/rejected": -2.874218702316284, "logps/chosen": -478.5, "logps/rejected": -369.3999938964844, "loss": 0.0661, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.2271971702575684, "rewards/margins": 8.535937309265137, "rewards/rejected": -10.762499809265137, "step": 1930 }, { "epoch": 1.7032484635645302, "grad_norm": 3.1904332195658536, "learning_rate": 5.744073748902546e-07, "logits/chosen": -2.678906202316284, "logits/rejected": -2.8359375, "logps/chosen": -474.70001220703125, "logps/rejected": -378.45001220703125, "loss": 0.0354, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5466797351837158, "rewards/margins": 8.540624618530273, "rewards/rejected": -10.090624809265137, "step": 1940 }, { "epoch": 1.7120280948200177, "grad_norm": 36.581197901006014, "learning_rate": 5.722124670763828e-07, "logits/chosen": -2.6664061546325684, "logits/rejected": -2.796875, "logps/chosen": -479.3999938964844, "logps/rejected": -392.1000061035156, "loss": 0.0301, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.533593773841858, "rewards/margins": 8.985937118530273, "rewards/rejected": -10.5234375, "step": 1950 }, { "epoch": 1.7208077260755048, "grad_norm": 3.0076297690669938, "learning_rate": 5.70017559262511e-07, "logits/chosen": -2.46484375, "logits/rejected": -2.8765625953674316, "logps/chosen": -519.5999755859375, "logps/rejected": -390.6000061035156, "loss": 0.0343, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.000195264816284, "rewards/margins": 8.259374618530273, "rewards/rejected": -10.254687309265137, "step": 1960 }, { "epoch": 1.7295873573309921, "grad_norm": 75.69219904133588, "learning_rate": 5.678226514486391e-07, "logits/chosen": -2.6109375953674316, "logits/rejected": -2.823437452316284, "logps/chosen": -477.5, "logps/rejected": -376.29998779296875, "loss": 0.1164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.6555664539337158, "rewards/margins": 8.425000190734863, "rewards/rejected": -10.078125, "step": 1970 }, { "epoch": 1.7383669885864794, "grad_norm": 4.499107693623519, "learning_rate": 5.656277436347673e-07, "logits/chosen": -2.5121092796325684, "logits/rejected": -2.8316407203674316, "logps/chosen": -501.1000061035156, "logps/rejected": -397.29998779296875, "loss": 0.0534, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2420897483825684, "rewards/margins": 8.145312309265137, "rewards/rejected": -10.384374618530273, "step": 1980 }, { "epoch": 1.7471466198419665, "grad_norm": 2.123299764443924, "learning_rate": 5.634328358208955e-07, "logits/chosen": -2.59765625, "logits/rejected": -2.8656249046325684, "logps/chosen": -508.0, "logps/rejected": -381.0, "loss": 0.0178, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7275390625, "rewards/margins": 8.609375, "rewards/rejected": -11.34375, "step": 1990 }, { "epoch": 1.755926251097454, "grad_norm": 0.238167592399743, "learning_rate": 5.612379280070237e-07, "logits/chosen": -2.5492186546325684, "logits/rejected": -2.825000047683716, "logps/chosen": -455.3500061035156, "logps/rejected": -415.70001220703125, "loss": 0.0175, "rewards/accuracies": 1.0, "rewards/chosen": -1.5980956554412842, "rewards/margins": 9.026562690734863, "rewards/rejected": -10.631250381469727, "step": 2000 }, { "epoch": 1.7647058823529411, "grad_norm": 57.651481695029794, "learning_rate": 5.590430201931518e-07, "logits/chosen": -2.609375, "logits/rejected": -2.796093702316284, "logps/chosen": -462.1000061035156, "logps/rejected": -383.6000061035156, "loss": 0.0649, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4416015148162842, "rewards/margins": 8.684374809265137, "rewards/rejected": -10.123437881469727, "step": 2010 }, { "epoch": 1.7734855136084284, "grad_norm": 8.324861608533597, "learning_rate": 5.568481123792801e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.78125, "logps/chosen": -503.20001220703125, "logps/rejected": -389.0, "loss": 0.0448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0042724609375, "rewards/margins": 8.893750190734863, "rewards/rejected": -9.904687881469727, "step": 2020 }, { "epoch": 1.7822651448639157, "grad_norm": 22.411601827446088, "learning_rate": 5.546532045654082e-07, "logits/chosen": -2.596484422683716, "logits/rejected": -2.8179688453674316, "logps/chosen": -490.5, "logps/rejected": -391.75, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.495214819908142, "rewards/margins": 8.793749809265137, "rewards/rejected": -10.284375190734863, "step": 2030 }, { "epoch": 1.7910447761194028, "grad_norm": 4.2346820725814895, "learning_rate": 5.524582967515365e-07, "logits/chosen": -2.577343702316284, "logits/rejected": -2.766406297683716, "logps/chosen": -474.3500061035156, "logps/rejected": -425.25, "loss": 0.0597, "rewards/accuracies": 0.96875, "rewards/chosen": -1.630590796470642, "rewards/margins": 8.478124618530273, "rewards/rejected": -10.107812881469727, "step": 2040 }, { "epoch": 1.7998244073748904, "grad_norm": 12.385046369282417, "learning_rate": 5.502633889376645e-07, "logits/chosen": -2.6156249046325684, "logits/rejected": -2.82421875, "logps/chosen": -449.8999938964844, "logps/rejected": -390.3500061035156, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": -0.770031750202179, "rewards/margins": 8.639062881469727, "rewards/rejected": -9.403124809265137, "step": 2050 }, { "epoch": 1.8086040386303774, "grad_norm": 1.8844697228375282, "learning_rate": 5.480684811237928e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.852343797683716, "logps/chosen": -513.5999755859375, "logps/rejected": -415.79998779296875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -2.4247069358825684, "rewards/margins": 9.189062118530273, "rewards/rejected": -11.615625381469727, "step": 2060 }, { "epoch": 1.8173836698858647, "grad_norm": 2.637432316749487, "learning_rate": 5.45873573309921e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.8031249046325684, "logps/chosen": -480.0, "logps/rejected": -414.1499938964844, "loss": 0.024, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6226563453674316, "rewards/margins": 9.015625, "rewards/rejected": -11.640625, "step": 2070 }, { "epoch": 1.826163301141352, "grad_norm": 4.752570108114763, "learning_rate": 5.436786654960492e-07, "logits/chosen": -2.745312452316284, "logits/rejected": -3.008593797683716, "logps/chosen": -468.04998779296875, "logps/rejected": -398.8999938964844, "loss": 0.0567, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5981934070587158, "rewards/margins": 8.998437881469727, "rewards/rejected": -10.603124618530273, "step": 2080 }, { "epoch": 1.8349429323968394, "grad_norm": 1.4557584230319258, "learning_rate": 5.414837576821773e-07, "logits/chosen": -2.598437547683716, "logits/rejected": -2.8257813453674316, "logps/chosen": -522.2999877929688, "logps/rejected": -383.0, "loss": 0.0337, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.663232445716858, "rewards/margins": 8.885937690734863, "rewards/rejected": -10.542187690734863, "step": 2090 }, { "epoch": 1.8437225636523267, "grad_norm": 5.671437996698917, "learning_rate": 5.392888498683055e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.9242186546325684, "logps/chosen": -479.1000061035156, "logps/rejected": -371.6000061035156, "loss": 0.0371, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3997681140899658, "rewards/margins": 9.600000381469727, "rewards/rejected": -10.996874809265137, "step": 2100 }, { "epoch": 1.8525021949078138, "grad_norm": 25.852137847817737, "learning_rate": 5.370939420544337e-07, "logits/chosen": -2.69921875, "logits/rejected": -2.957812547683716, "logps/chosen": -494.70001220703125, "logps/rejected": -387.70001220703125, "loss": 0.0307, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2505860328674316, "rewards/margins": 9.481249809265137, "rewards/rejected": -11.734375, "step": 2110 }, { "epoch": 1.8612818261633013, "grad_norm": 10.007930071076435, "learning_rate": 5.348990342405619e-07, "logits/chosen": -2.621875047683716, "logits/rejected": -2.8687500953674316, "logps/chosen": -485.5, "logps/rejected": -425.6000061035156, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -2.8060545921325684, "rewards/margins": 9.565625190734863, "rewards/rejected": -12.375, "step": 2120 }, { "epoch": 1.8700614574187884, "grad_norm": 29.969263411854946, "learning_rate": 5.3270412642669e-07, "logits/chosen": -2.702343702316284, "logits/rejected": -2.9140625, "logps/chosen": -483.1000061035156, "logps/rejected": -437.20001220703125, "loss": 0.0255, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2657227516174316, "rewards/margins": 9.53125, "rewards/rejected": -11.793749809265137, "step": 2130 }, { "epoch": 1.8788410886742757, "grad_norm": 74.83505274593155, "learning_rate": 5.305092186128183e-07, "logits/chosen": -2.729296922683716, "logits/rejected": -3.0093750953674316, "logps/chosen": -441.0, "logps/rejected": -382.45001220703125, "loss": 0.1658, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8695311546325684, "rewards/margins": 8.7734375, "rewards/rejected": -11.637499809265137, "step": 2140 }, { "epoch": 1.887620719929763, "grad_norm": 54.97304214370293, "learning_rate": 5.283143107989464e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -2.9820313453674316, "logps/chosen": -455.8500061035156, "logps/rejected": -388.20001220703125, "loss": 0.0439, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9685547351837158, "rewards/margins": 8.732812881469727, "rewards/rejected": -10.704687118530273, "step": 2150 }, { "epoch": 1.89640035118525, "grad_norm": 9.651110238221477, "learning_rate": 5.261194029850747e-07, "logits/chosen": -2.710156202316284, "logits/rejected": -2.95703125, "logps/chosen": -459.70001220703125, "logps/rejected": -403.8500061035156, "loss": 0.0299, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7095215320587158, "rewards/margins": 8.949999809265137, "rewards/rejected": -10.65625, "step": 2160 }, { "epoch": 1.9051799824407376, "grad_norm": 2.1942154568623855, "learning_rate": 5.239244951712027e-07, "logits/chosen": -2.678906202316284, "logits/rejected": -3.046093702316284, "logps/chosen": -429.8999938964844, "logps/rejected": -370.5, "loss": 0.0716, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.79638671875, "rewards/margins": 8.962499618530273, "rewards/rejected": -11.759374618530273, "step": 2170 }, { "epoch": 1.9139596136962247, "grad_norm": 11.58708748732068, "learning_rate": 5.21729587357331e-07, "logits/chosen": -2.587890625, "logits/rejected": -2.9703125953674316, "logps/chosen": -500.20001220703125, "logps/rejected": -372.04998779296875, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": -2.0755858421325684, "rewards/margins": 9.807812690734863, "rewards/rejected": -11.881250381469727, "step": 2180 }, { "epoch": 1.922739244951712, "grad_norm": 11.266654651590073, "learning_rate": 5.195346795434592e-07, "logits/chosen": -2.7164063453674316, "logits/rejected": -2.87890625, "logps/chosen": -435.1000061035156, "logps/rejected": -369.20001220703125, "loss": 0.0449, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.085302710533142, "rewards/margins": 9.196874618530273, "rewards/rejected": -10.271875381469727, "step": 2190 }, { "epoch": 1.9315188762071993, "grad_norm": 6.332808464658899, "learning_rate": 5.173397717295873e-07, "logits/chosen": -2.698046922683716, "logits/rejected": -2.944531202316284, "logps/chosen": -426.20001220703125, "logps/rejected": -388.3999938964844, "loss": 0.0275, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.277307152748108, "rewards/margins": 9.339062690734863, "rewards/rejected": -10.609375, "step": 2200 }, { "epoch": 1.9402985074626866, "grad_norm": 2.899822450260841, "learning_rate": 5.151448639157155e-07, "logits/chosen": -2.604687452316284, "logits/rejected": -2.8359375, "logps/chosen": -485.29998779296875, "logps/rejected": -378.3999938964844, "loss": 0.0142, "rewards/accuracies": 1.0, "rewards/chosen": -0.796435534954071, "rewards/margins": 9.4296875, "rewards/rejected": -10.225000381469727, "step": 2210 }, { "epoch": 1.949078138718174, "grad_norm": 90.00031641616339, "learning_rate": 5.129499561018437e-07, "logits/chosen": -2.688281297683716, "logits/rejected": -2.9507813453674316, "logps/chosen": -497.20001220703125, "logps/rejected": -399.8999938964844, "loss": 0.0402, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4178709983825684, "rewards/margins": 9.600000381469727, "rewards/rejected": -12.018750190734863, "step": 2220 }, { "epoch": 1.957857769973661, "grad_norm": 5.759216959258573, "learning_rate": 5.107550482879719e-07, "logits/chosen": -2.7828125953674316, "logits/rejected": -3.0269532203674316, "logps/chosen": -476.79998779296875, "logps/rejected": -377.6000061035156, "loss": 0.0213, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.258007764816284, "rewards/margins": 9.470312118530273, "rewards/rejected": -12.728124618530273, "step": 2230 }, { "epoch": 1.9666374012291485, "grad_norm": 76.46826217334423, "learning_rate": 5.085601404741001e-07, "logits/chosen": -2.688281297683716, "logits/rejected": -3.0257811546325684, "logps/chosen": -433.95001220703125, "logps/rejected": -363.3500061035156, "loss": 0.0891, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.761865258216858, "rewards/margins": 8.635937690734863, "rewards/rejected": -10.393750190734863, "step": 2240 }, { "epoch": 1.9754170324846356, "grad_norm": 7.416795070241535, "learning_rate": 5.063652326602282e-07, "logits/chosen": -2.62109375, "logits/rejected": -2.850781202316284, "logps/chosen": -495.6000061035156, "logps/rejected": -415.5, "loss": 0.0447, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.2946288585662842, "rewards/margins": 8.646875381469727, "rewards/rejected": -9.932812690734863, "step": 2250 }, { "epoch": 1.984196663740123, "grad_norm": 80.48079644613428, "learning_rate": 5.041703248463565e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.8335938453674316, "logps/chosen": -536.5, "logps/rejected": -458.1000061035156, "loss": 0.0504, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.196874976158142, "rewards/margins": 9.204687118530273, "rewards/rejected": -10.399999618530273, "step": 2260 }, { "epoch": 1.9929762949956102, "grad_norm": 60.45942940519937, "learning_rate": 5.019754170324846e-07, "logits/chosen": -2.6734375953674316, "logits/rejected": -2.932812452316284, "logps/chosen": -393.25, "logps/rejected": -378.3999938964844, "loss": 0.0345, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.770361304283142, "rewards/margins": 8.771875381469727, "rewards/rejected": -10.548437118530273, "step": 2270 }, { "epoch": 2.0017559262510973, "grad_norm": 5.829231629804558, "learning_rate": 4.997805092186128e-07, "logits/chosen": -2.75390625, "logits/rejected": -3.000781297683716, "logps/chosen": -391.45001220703125, "logps/rejected": -397.0, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8310546875, "rewards/margins": 8.748437881469727, "rewards/rejected": -10.582812309265137, "step": 2280 }, { "epoch": 2.010535557506585, "grad_norm": 8.129958503101284, "learning_rate": 4.97585601404741e-07, "logits/chosen": -2.6031250953674316, "logits/rejected": -2.879687547683716, "logps/chosen": -450.3500061035156, "logps/rejected": -412.20001220703125, "loss": 0.0347, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5906250476837158, "rewards/margins": 9.707812309265137, "rewards/rejected": -11.295312881469727, "step": 2290 }, { "epoch": 2.019315188762072, "grad_norm": 1.256296094856506, "learning_rate": 4.953906935908692e-07, "logits/chosen": -2.6742186546325684, "logits/rejected": -2.9398436546325684, "logps/chosen": -486.1000061035156, "logps/rejected": -398.3999938964844, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -2.4610352516174316, "rewards/margins": 9.824999809265137, "rewards/rejected": -12.287500381469727, "step": 2300 }, { "epoch": 2.0280948200175595, "grad_norm": 2.4979103671971536, "learning_rate": 4.931957857769974e-07, "logits/chosen": -2.625, "logits/rejected": -2.9765625, "logps/chosen": -543.5, "logps/rejected": -409.20001220703125, "loss": 0.0122, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.585778832435608, "rewards/margins": 10.546875, "rewards/rejected": -12.131250381469727, "step": 2310 }, { "epoch": 2.0368744512730466, "grad_norm": 1.6401242067323407, "learning_rate": 4.910008779631255e-07, "logits/chosen": -2.764843702316284, "logits/rejected": -2.98828125, "logps/chosen": -469.3999938964844, "logps/rejected": -410.20001220703125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.1011719703674316, "rewards/margins": 9.9609375, "rewards/rejected": -12.059374809265137, "step": 2320 }, { "epoch": 2.0456540825285336, "grad_norm": 5.31502201409783, "learning_rate": 4.888059701492537e-07, "logits/chosen": -2.859375, "logits/rejected": -3.11328125, "logps/chosen": -462.1000061035156, "logps/rejected": -402.20001220703125, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.7080078125, "rewards/margins": 10.503125190734863, "rewards/rejected": -13.206250190734863, "step": 2330 }, { "epoch": 2.054433713784021, "grad_norm": 3.0086319393185024, "learning_rate": 4.866110623353819e-07, "logits/chosen": -2.7691407203674316, "logits/rejected": -3.0718750953674316, "logps/chosen": -404.75, "logps/rejected": -369.8500061035156, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -2.324414014816284, "rewards/margins": 10.246874809265137, "rewards/rejected": -12.587499618530273, "step": 2340 }, { "epoch": 2.0632133450395083, "grad_norm": 0.7091409954115243, "learning_rate": 4.8441615452151e-07, "logits/chosen": -2.7242188453674316, "logits/rejected": -3.082812547683716, "logps/chosen": -466.0, "logps/rejected": -405.3999938964844, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -2.139453172683716, "rewards/margins": 10.290624618530273, "rewards/rejected": -12.425000190734863, "step": 2350 }, { "epoch": 2.071992976294996, "grad_norm": 0.9127856913215332, "learning_rate": 4.822212467076382e-07, "logits/chosen": -2.7789063453674316, "logits/rejected": -3.0687499046325684, "logps/chosen": -468.70001220703125, "logps/rejected": -414.0, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.1996092796325684, "rewards/margins": 10.240625381469727, "rewards/rejected": -12.449999809265137, "step": 2360 }, { "epoch": 2.080772607550483, "grad_norm": 0.4108646918329786, "learning_rate": 4.800263388937664e-07, "logits/chosen": -2.79296875, "logits/rejected": -3.06640625, "logps/chosen": -404.8999938964844, "logps/rejected": -381.70001220703125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.2166380882263184, "rewards/margins": 10.6875, "rewards/rejected": -12.903124809265137, "step": 2370 }, { "epoch": 2.08955223880597, "grad_norm": 25.929695524989565, "learning_rate": 4.778314310798946e-07, "logits/chosen": -2.796875, "logits/rejected": -3.08203125, "logps/chosen": -489.29998779296875, "logps/rejected": -403.1499938964844, "loss": 0.044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.372265577316284, "rewards/margins": 11.120312690734863, "rewards/rejected": -14.490625381469727, "step": 2380 }, { "epoch": 2.0983318700614575, "grad_norm": 3.2995867770152203, "learning_rate": 4.7563652326602285e-07, "logits/chosen": -2.645312547683716, "logits/rejected": -3.0921874046325684, "logps/chosen": -487.3999938964844, "logps/rejected": -392.8999938964844, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.2496094703674316, "rewards/margins": 10.725000381469727, "rewards/rejected": -12.984375, "step": 2390 }, { "epoch": 2.1071115013169446, "grad_norm": 0.7473818449492524, "learning_rate": 4.73441615452151e-07, "logits/chosen": -2.5804686546325684, "logits/rejected": -2.932812452316284, "logps/chosen": -485.8500061035156, "logps/rejected": -416.20001220703125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.7834961414337158, "rewards/margins": 11.267187118530273, "rewards/rejected": -13.068750381469727, "step": 2400 }, { "epoch": 2.115891132572432, "grad_norm": 3.7988199924865587, "learning_rate": 4.712467076382792e-07, "logits/chosen": -2.7671875953674316, "logits/rejected": -3.0835938453674316, "logps/chosen": -476.8999938964844, "logps/rejected": -423.79998779296875, "loss": 0.0143, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.292919874191284, "rewards/margins": 10.571874618530273, "rewards/rejected": -12.865625381469727, "step": 2410 }, { "epoch": 2.124670763827919, "grad_norm": 16.27799332682975, "learning_rate": 4.6905179982440737e-07, "logits/chosen": -2.671875, "logits/rejected": -3.042187452316284, "logps/chosen": -421.8999938964844, "logps/rejected": -403.8500061035156, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.303417921066284, "rewards/margins": 10.759374618530273, "rewards/rejected": -13.046875, "step": 2420 }, { "epoch": 2.1334503950834067, "grad_norm": 7.129915553152379, "learning_rate": 4.6685689201053554e-07, "logits/chosen": -2.733593702316284, "logits/rejected": -3.1640625, "logps/chosen": -477.0, "logps/rejected": -409.0, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -1.944433569908142, "rewards/margins": 10.596875190734863, "rewards/rejected": -12.546875, "step": 2430 }, { "epoch": 2.142230026338894, "grad_norm": 0.48165144700266316, "learning_rate": 4.646619841966637e-07, "logits/chosen": -2.680468797683716, "logits/rejected": -3.049999952316284, "logps/chosen": -499.6000061035156, "logps/rejected": -414.04998779296875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -1.22021484375, "rewards/margins": 10.978124618530273, "rewards/rejected": -12.1875, "step": 2440 }, { "epoch": 2.151009657594381, "grad_norm": 51.656135188406964, "learning_rate": 4.6246707638279194e-07, "logits/chosen": -2.6664061546325684, "logits/rejected": -3.0484375953674316, "logps/chosen": -527.0999755859375, "logps/rejected": -414.0, "loss": 0.0085, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.46142578125, "rewards/margins": 10.625, "rewards/rejected": -13.087499618530273, "step": 2450 }, { "epoch": 2.1597892888498684, "grad_norm": 0.6427210650270349, "learning_rate": 4.602721685689201e-07, "logits/chosen": -2.8109374046325684, "logits/rejected": -3.11328125, "logps/chosen": -435.1000061035156, "logps/rejected": -410.8999938964844, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -2.098876953125, "rewards/margins": 10.529687881469727, "rewards/rejected": -12.625, "step": 2460 }, { "epoch": 2.1685689201053555, "grad_norm": 0.47656816634790133, "learning_rate": 4.580772607550483e-07, "logits/chosen": -2.747265577316284, "logits/rejected": -3.2109375, "logps/chosen": -433.29998779296875, "logps/rejected": -375.5, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5643553733825684, "rewards/margins": 10.326562881469727, "rewards/rejected": -12.890625, "step": 2470 }, { "epoch": 2.177348551360843, "grad_norm": 0.34832012632633447, "learning_rate": 4.5588235294117646e-07, "logits/chosen": -2.87890625, "logits/rejected": -3.200000047683716, "logps/chosen": -481.79998779296875, "logps/rejected": -385.8999938964844, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -2.3958497047424316, "rewards/margins": 10.46875, "rewards/rejected": -12.862500190734863, "step": 2480 }, { "epoch": 2.18612818261633, "grad_norm": 6.7496892019060555, "learning_rate": 4.5368744512730464e-07, "logits/chosen": -2.6851563453674316, "logits/rejected": -3.10546875, "logps/chosen": -507.3999938964844, "logps/rejected": -424.1000061035156, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.153027296066284, "rewards/margins": 11.878125190734863, "rewards/rejected": -14.024999618530273, "step": 2490 }, { "epoch": 2.194907813871817, "grad_norm": 0.6121482329361724, "learning_rate": 4.5149253731343286e-07, "logits/chosen": -2.6484375, "logits/rejected": -2.8570313453674316, "logps/chosen": -488.5, "logps/rejected": -436.5, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -2.054492235183716, "rewards/margins": 10.856249809265137, "rewards/rejected": -12.903124809265137, "step": 2500 }, { "epoch": 2.2036874451273047, "grad_norm": 6.349396051477548, "learning_rate": 4.4929762949956104e-07, "logits/chosen": -2.692187547683716, "logits/rejected": -3.0640625953674316, "logps/chosen": -496.20001220703125, "logps/rejected": -457.1000061035156, "loss": 0.024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9005615711212158, "rewards/margins": 11.1796875, "rewards/rejected": -13.081250190734863, "step": 2510 }, { "epoch": 2.212467076382792, "grad_norm": 3.644255831457209, "learning_rate": 4.471027216856892e-07, "logits/chosen": -2.703125, "logits/rejected": -3.024218797683716, "logps/chosen": -487.3500061035156, "logps/rejected": -425.6000061035156, "loss": 0.0085, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2077393531799316, "rewards/margins": 11.196874618530273, "rewards/rejected": -13.412500381469727, "step": 2520 }, { "epoch": 2.2212467076382794, "grad_norm": 16.285576198010308, "learning_rate": 4.449078138718174e-07, "logits/chosen": -2.735156297683716, "logits/rejected": -3.051562547683716, "logps/chosen": -448.70001220703125, "logps/rejected": -380.8999938964844, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.8092772960662842, "rewards/margins": 11.115625381469727, "rewards/rejected": -12.918749809265137, "step": 2530 }, { "epoch": 2.2300263388937664, "grad_norm": 2.1018638746025315, "learning_rate": 4.4271290605794556e-07, "logits/chosen": -2.746875047683716, "logits/rejected": -3.100781202316284, "logps/chosen": -465.5, "logps/rejected": -397.20001220703125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.140600562095642, "rewards/margins": 10.571874618530273, "rewards/rejected": -11.71875, "step": 2540 }, { "epoch": 2.2388059701492535, "grad_norm": 0.25354455787351377, "learning_rate": 4.4051799824407373e-07, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.9632811546325684, "logps/chosen": -440.0, "logps/rejected": -415.1499938964844, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.2507812976837158, "rewards/margins": 10.503125190734863, "rewards/rejected": -11.756250381469727, "step": 2550 }, { "epoch": 2.247585601404741, "grad_norm": 4.465127542028957, "learning_rate": 4.3832309043020195e-07, "logits/chosen": -2.7445311546325684, "logits/rejected": -3.1460938453674316, "logps/chosen": -437.8999938964844, "logps/rejected": -379.29998779296875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.8772461414337158, "rewards/margins": 11.028124809265137, "rewards/rejected": -12.899999618530273, "step": 2560 }, { "epoch": 2.256365232660228, "grad_norm": 0.6486833863520731, "learning_rate": 4.3612818261633013e-07, "logits/chosen": -2.6429686546325684, "logits/rejected": -3.1187500953674316, "logps/chosen": -466.1000061035156, "logps/rejected": -410.3999938964844, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.922656297683716, "rewards/margins": 10.970312118530273, "rewards/rejected": -13.890625, "step": 2570 }, { "epoch": 2.2651448639157157, "grad_norm": 0.7044609478091358, "learning_rate": 4.339332748024583e-07, "logits/chosen": -2.774218797683716, "logits/rejected": -3.1351561546325684, "logps/chosen": -504.20001220703125, "logps/rejected": -411.3999938964844, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.546679735183716, "rewards/margins": 10.440625190734863, "rewards/rejected": -12.981249809265137, "step": 2580 }, { "epoch": 2.2739244951712028, "grad_norm": 114.96753115572733, "learning_rate": 4.317383669885865e-07, "logits/chosen": -2.8578124046325684, "logits/rejected": -3.203906297683716, "logps/chosen": -423.8500061035156, "logps/rejected": -413.20001220703125, "loss": 0.0242, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.119335889816284, "rewards/margins": 10.962499618530273, "rewards/rejected": -14.065625190734863, "step": 2590 }, { "epoch": 2.2827041264266903, "grad_norm": 0.16763494251044442, "learning_rate": 4.2954345917471465e-07, "logits/chosen": -2.909374952316284, "logits/rejected": -3.171093702316284, "logps/chosen": -424.54998779296875, "logps/rejected": -419.79998779296875, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -2.5365233421325684, "rewards/margins": 11.149999618530273, "rewards/rejected": -13.690625190734863, "step": 2600 }, { "epoch": 2.2914837576821774, "grad_norm": 21.805969916190726, "learning_rate": 4.273485513608428e-07, "logits/chosen": -2.8460936546325684, "logits/rejected": -3.143749952316284, "logps/chosen": -454.20001220703125, "logps/rejected": -421.20001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.786914110183716, "rewards/margins": 11.381250381469727, "rewards/rejected": -14.171875, "step": 2610 }, { "epoch": 2.3002633889376645, "grad_norm": 0.8865558015025492, "learning_rate": 4.2515364354697105e-07, "logits/chosen": -2.8375000953674316, "logits/rejected": -3.27734375, "logps/chosen": -507.20001220703125, "logps/rejected": -414.79998779296875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.6654295921325684, "rewards/margins": 10.876562118530273, "rewards/rejected": -13.543749809265137, "step": 2620 }, { "epoch": 2.309043020193152, "grad_norm": 0.43481201424948623, "learning_rate": 4.229587357330992e-07, "logits/chosen": -2.903125047683716, "logits/rejected": -3.1382813453674316, "logps/chosen": -442.79998779296875, "logps/rejected": -444.20001220703125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.9332032203674316, "rewards/margins": 11.135937690734863, "rewards/rejected": -14.068750381469727, "step": 2630 }, { "epoch": 2.317822651448639, "grad_norm": 3.0035868757732205, "learning_rate": 4.207638279192274e-07, "logits/chosen": -2.7164063453674316, "logits/rejected": -3.1546874046325684, "logps/chosen": -477.5, "logps/rejected": -438.20001220703125, "loss": 0.008, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.118945360183716, "rewards/margins": 11.784375190734863, "rewards/rejected": -13.90625, "step": 2640 }, { "epoch": 2.3266022827041266, "grad_norm": 0.1748044867695441, "learning_rate": 4.1856892010535557e-07, "logits/chosen": -2.784374952316284, "logits/rejected": -3.0390625, "logps/chosen": -446.1000061035156, "logps/rejected": -414.70001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -3.2357420921325684, "rewards/margins": 11.015625, "rewards/rejected": -14.246874809265137, "step": 2650 }, { "epoch": 2.3353819139596137, "grad_norm": 0.7395235440829846, "learning_rate": 4.1637401229148374e-07, "logits/chosen": -2.6539063453674316, "logits/rejected": -3.0687499046325684, "logps/chosen": -503.29998779296875, "logps/rejected": -451.5, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.322460889816284, "rewards/margins": 11.384374618530273, "rewards/rejected": -14.699999809265137, "step": 2660 }, { "epoch": 2.344161545215101, "grad_norm": 1.370237299000094, "learning_rate": 4.141791044776119e-07, "logits/chosen": -2.7249999046325684, "logits/rejected": -2.989062547683716, "logps/chosen": -484.79998779296875, "logps/rejected": -436.20001220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.90771484375, "rewards/margins": 11.446874618530273, "rewards/rejected": -13.362500190734863, "step": 2670 }, { "epoch": 2.3529411764705883, "grad_norm": 3.562965363362462, "learning_rate": 4.1198419666374014e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -3.2132811546325684, "logps/chosen": -522.5999755859375, "logps/rejected": -448.5, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.107067823410034, "rewards/margins": 11.3125, "rewards/rejected": -13.421875, "step": 2680 }, { "epoch": 2.3617208077260754, "grad_norm": 0.39954500286815386, "learning_rate": 4.097892888498683e-07, "logits/chosen": -2.9007811546325684, "logits/rejected": -3.207812547683716, "logps/chosen": -437.95001220703125, "logps/rejected": -403.79998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.5396728515625, "rewards/margins": 10.618749618530273, "rewards/rejected": -13.165624618530273, "step": 2690 }, { "epoch": 2.370500438981563, "grad_norm": 7.2235979515104995, "learning_rate": 4.075943810359965e-07, "logits/chosen": -2.797656297683716, "logits/rejected": -3.2367186546325684, "logps/chosen": -428.0, "logps/rejected": -392.29998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.944140672683716, "rewards/margins": 11.212499618530273, "rewards/rejected": -14.149999618530273, "step": 2700 }, { "epoch": 2.37928007023705, "grad_norm": 0.09996865617916992, "learning_rate": 4.0539947322212466e-07, "logits/chosen": -2.68359375, "logits/rejected": -3.0218749046325684, "logps/chosen": -549.0999755859375, "logps/rejected": -460.5, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.9423828125, "rewards/margins": 11.543749809265137, "rewards/rejected": -13.490625381469727, "step": 2710 }, { "epoch": 2.388059701492537, "grad_norm": 0.9165577405271063, "learning_rate": 4.0320456540825283e-07, "logits/chosen": -2.735156297683716, "logits/rejected": -3.075000047683716, "logps/chosen": -451.5, "logps/rejected": -430.8999938964844, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.525952100753784, "rewards/margins": 10.753125190734863, "rewards/rejected": -13.284375190734863, "step": 2720 }, { "epoch": 2.3968393327480246, "grad_norm": 2.5092185290788347, "learning_rate": 4.0100965759438106e-07, "logits/chosen": -2.7953124046325684, "logits/rejected": -3.149218797683716, "logps/chosen": -451.6000061035156, "logps/rejected": -390.8500061035156, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.133593797683716, "rewards/margins": 10.518750190734863, "rewards/rejected": -12.646875381469727, "step": 2730 }, { "epoch": 2.4056189640035117, "grad_norm": 4.217551683564792, "learning_rate": 3.9881474978050923e-07, "logits/chosen": -2.750781297683716, "logits/rejected": -3.1624999046325684, "logps/chosen": -510.0, "logps/rejected": -433.6000061035156, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.56591796875, "rewards/margins": 11.643750190734863, "rewards/rejected": -14.209375381469727, "step": 2740 }, { "epoch": 2.4143985952589992, "grad_norm": 0.5296646290506889, "learning_rate": 3.966198419666374e-07, "logits/chosen": -2.77734375, "logits/rejected": -3.1773438453674316, "logps/chosen": -439.20001220703125, "logps/rejected": -409.95001220703125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.0345091819763184, "rewards/margins": 11.918749809265137, "rewards/rejected": -13.956250190734863, "step": 2750 }, { "epoch": 2.4231782265144863, "grad_norm": 3.787800599099169, "learning_rate": 3.944249341527656e-07, "logits/chosen": -2.7476563453674316, "logits/rejected": -3.090625047683716, "logps/chosen": -429.0, "logps/rejected": -414.29998779296875, "loss": 0.0095, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.340087890625, "rewards/margins": 10.671875, "rewards/rejected": -12.015625, "step": 2760 }, { "epoch": 2.431957857769974, "grad_norm": 0.12784068177739766, "learning_rate": 3.9223002633889375e-07, "logits/chosen": -2.7945313453674316, "logits/rejected": -3.2171874046325684, "logps/chosen": -502.6000061035156, "logps/rejected": -423.3999938964844, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.700781226158142, "rewards/margins": 11.206250190734863, "rewards/rejected": -12.899999618530273, "step": 2770 }, { "epoch": 2.440737489025461, "grad_norm": 7.938389431158632, "learning_rate": 3.9003511852502193e-07, "logits/chosen": -2.85546875, "logits/rejected": -3.266406297683716, "logps/chosen": -430.95001220703125, "logps/rejected": -364.6499938964844, "loss": 0.0125, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.839135766029358, "rewards/margins": 11.03125, "rewards/rejected": -12.875, "step": 2780 }, { "epoch": 2.449517120280948, "grad_norm": 14.998778482568488, "learning_rate": 3.8784021071115015e-07, "logits/chosen": -2.8882813453674316, "logits/rejected": -3.35546875, "logps/chosen": -489.29998779296875, "logps/rejected": -411.8999938964844, "loss": 0.0229, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9105162620544434, "rewards/margins": 10.8984375, "rewards/rejected": -13.8125, "step": 2790 }, { "epoch": 2.4582967515364356, "grad_norm": 1.7565187000826334, "learning_rate": 3.8564530289727833e-07, "logits/chosen": -2.78515625, "logits/rejected": -3.2281250953674316, "logps/chosen": -435.5, "logps/rejected": -379.79998779296875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.641796827316284, "rewards/margins": 11.184374809265137, "rewards/rejected": -13.837499618530273, "step": 2800 }, { "epoch": 2.4670763827919227, "grad_norm": 0.341595067850225, "learning_rate": 3.834503950834065e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -3.137500047683716, "logps/chosen": -445.5, "logps/rejected": -389.1499938964844, "loss": 0.0121, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1552734375, "rewards/margins": 10.665624618530273, "rewards/rejected": -12.806249618530273, "step": 2810 }, { "epoch": 2.47585601404741, "grad_norm": 0.3086115567290912, "learning_rate": 3.8125548726953467e-07, "logits/chosen": -2.753124952316284, "logits/rejected": -3.176562547683716, "logps/chosen": -475.70001220703125, "logps/rejected": -385.70001220703125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -2.162426710128784, "rewards/margins": 10.217187881469727, "rewards/rejected": -12.381250381469727, "step": 2820 }, { "epoch": 2.4846356453028973, "grad_norm": 3.008743939193885, "learning_rate": 3.7906057945566285e-07, "logits/chosen": -2.813281297683716, "logits/rejected": -3.2015624046325684, "logps/chosen": -493.3999938964844, "logps/rejected": -422.0, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.0386719703674316, "rewards/margins": 11.518750190734863, "rewards/rejected": -13.553125381469727, "step": 2830 }, { "epoch": 2.493415276558385, "grad_norm": 0.28240199788216075, "learning_rate": 3.76865671641791e-07, "logits/chosen": -2.7445311546325684, "logits/rejected": -3.05078125, "logps/chosen": -471.8999938964844, "logps/rejected": -413.8999938964844, "loss": 0.0141, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1889891624450684, "rewards/margins": 10.887499809265137, "rewards/rejected": -13.081250190734863, "step": 2840 }, { "epoch": 2.502194907813872, "grad_norm": 1.569333218820518, "learning_rate": 3.7467076382791925e-07, "logits/chosen": -2.850781202316284, "logits/rejected": -3.149218797683716, "logps/chosen": -434.3999938964844, "logps/rejected": -441.20001220703125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -3.505859375, "rewards/margins": 11.535937309265137, "rewards/rejected": -15.037500381469727, "step": 2850 }, { "epoch": 2.510974539069359, "grad_norm": 0.2789923961660158, "learning_rate": 3.724758560140474e-07, "logits/chosen": -2.8296875953674316, "logits/rejected": -3.1500000953674316, "logps/chosen": -447.8999938964844, "logps/rejected": -423.29998779296875, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -3.0689454078674316, "rewards/margins": 11.237500190734863, "rewards/rejected": -14.300000190734863, "step": 2860 }, { "epoch": 2.5197541703248465, "grad_norm": 0.4753340620513279, "learning_rate": 3.702809482001756e-07, "logits/chosen": -2.858593702316284, "logits/rejected": -3.129687547683716, "logps/chosen": -484.1000061035156, "logps/rejected": -421.20001220703125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.6117186546325684, "rewards/margins": 11.662500381469727, "rewards/rejected": -14.262499809265137, "step": 2870 }, { "epoch": 2.5285338015803336, "grad_norm": 1.0295897199859099, "learning_rate": 3.6808604038630377e-07, "logits/chosen": -2.856250047683716, "logits/rejected": -3.2750000953674316, "logps/chosen": -487.1000061035156, "logps/rejected": -439.20001220703125, "loss": 0.0124, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9287109375, "rewards/margins": 11.756250381469727, "rewards/rejected": -14.684374809265137, "step": 2880 }, { "epoch": 2.5373134328358207, "grad_norm": 0.12552949327545365, "learning_rate": 3.6589113257243194e-07, "logits/chosen": -2.8968749046325684, "logits/rejected": -3.2046875953674316, "logps/chosen": -480.6000061035156, "logps/rejected": -426.5, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.5074219703674316, "rewards/margins": 12.118749618530273, "rewards/rejected": -15.612500190734863, "step": 2890 }, { "epoch": 2.546093064091308, "grad_norm": 11.535142224971468, "learning_rate": 3.636962247585601e-07, "logits/chosen": -2.7578125, "logits/rejected": -3.184375047683716, "logps/chosen": -523.7000122070312, "logps/rejected": -426.3999938964844, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.697070360183716, "rewards/margins": 11.251562118530273, "rewards/rejected": -13.953125, "step": 2900 }, { "epoch": 2.5548726953467953, "grad_norm": 1.3369389375281253, "learning_rate": 3.6150131694468834e-07, "logits/chosen": -2.858593702316284, "logits/rejected": -3.0882811546325684, "logps/chosen": -428.25, "logps/rejected": -428.6000061035156, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.26708984375, "rewards/margins": 11.596875190734863, "rewards/rejected": -13.862500190734863, "step": 2910 }, { "epoch": 2.563652326602283, "grad_norm": 0.12741373891156255, "learning_rate": 3.593064091308165e-07, "logits/chosen": -2.840625047683716, "logits/rejected": -3.215625047683716, "logps/chosen": -508.8999938964844, "logps/rejected": -448.25, "loss": 0.0113, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0849609375, "rewards/margins": 11.515625, "rewards/rejected": -13.59375, "step": 2920 }, { "epoch": 2.57243195785777, "grad_norm": 1.7791509170089177, "learning_rate": 3.571115013169447e-07, "logits/chosen": -2.850781202316284, "logits/rejected": -3.1734375953674316, "logps/chosen": -472.6499938964844, "logps/rejected": -426.6000061035156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -2.8910155296325684, "rewards/margins": 11.371874809265137, "rewards/rejected": -14.259374618530273, "step": 2930 }, { "epoch": 2.5812115891132574, "grad_norm": 0.2907557433975636, "learning_rate": 3.5491659350307286e-07, "logits/chosen": -2.8359375, "logits/rejected": -3.0765624046325684, "logps/chosen": -467.79998779296875, "logps/rejected": -441.79998779296875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -2.855175733566284, "rewards/margins": 11.368749618530273, "rewards/rejected": -14.228124618530273, "step": 2940 }, { "epoch": 2.5899912203687445, "grad_norm": 1.0476924646665897, "learning_rate": 3.5272168568920103e-07, "logits/chosen": -2.727343797683716, "logits/rejected": -3.180468797683716, "logps/chosen": -464.5, "logps/rejected": -406.8999938964844, "loss": 0.0084, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3169798851013184, "rewards/margins": 11.553125381469727, "rewards/rejected": -13.865625381469727, "step": 2950 }, { "epoch": 2.5987708516242316, "grad_norm": 9.104389180024516, "learning_rate": 3.505267778753292e-07, "logits/chosen": -2.9476561546325684, "logits/rejected": -3.44140625, "logps/chosen": -449.75, "logps/rejected": -373.79998779296875, "loss": 0.0281, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9820313453674316, "rewards/margins": 10.881250381469727, "rewards/rejected": -13.856249809265137, "step": 2960 }, { "epoch": 2.607550482879719, "grad_norm": 25.49941033653934, "learning_rate": 3.4833187006145743e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -3.385937452316284, "logps/chosen": -506.6000061035156, "logps/rejected": -430.70001220703125, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.6386475563049316, "rewards/margins": 11.34375, "rewards/rejected": -14.981249809265137, "step": 2970 }, { "epoch": 2.616330114135206, "grad_norm": 0.6357750064402232, "learning_rate": 3.461369622475856e-07, "logits/chosen": -2.815624952316284, "logits/rejected": -3.2109375, "logps/chosen": -494.70001220703125, "logps/rejected": -411.0, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4117188453674316, "rewards/margins": 11.503125190734863, "rewards/rejected": -13.909375190734863, "step": 2980 }, { "epoch": 2.6251097453906937, "grad_norm": 12.462486367828244, "learning_rate": 3.439420544337138e-07, "logits/chosen": -2.7679686546325684, "logits/rejected": -3.109375, "logps/chosen": -472.8999938964844, "logps/rejected": -466.3999938964844, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.8158690929412842, "rewards/margins": 11.871874809265137, "rewards/rejected": -13.6875, "step": 2990 }, { "epoch": 2.633889376646181, "grad_norm": 0.6622488809590633, "learning_rate": 3.4174714661984195e-07, "logits/chosen": -2.90625, "logits/rejected": -3.272656202316284, "logps/chosen": -507.1000061035156, "logps/rejected": -424.20001220703125, "loss": 0.0322, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3148436546325684, "rewards/margins": 11.407812118530273, "rewards/rejected": -14.71875, "step": 3000 }, { "epoch": 2.6426690079016684, "grad_norm": 17.74994267414667, "learning_rate": 3.395522388059701e-07, "logits/chosen": -2.96875, "logits/rejected": -3.3101563453674316, "logps/chosen": -444.0, "logps/rejected": -430.29998779296875, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -3.209606885910034, "rewards/margins": 10.965624809265137, "rewards/rejected": -14.162500381469727, "step": 3010 }, { "epoch": 2.6514486391571555, "grad_norm": 1.6508557021991002, "learning_rate": 3.3735733099209835e-07, "logits/chosen": -2.7171874046325684, "logits/rejected": -3.1539063453674316, "logps/chosen": -521.7999877929688, "logps/rejected": -429.3999938964844, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.3973631858825684, "rewards/margins": 11.565625190734863, "rewards/rejected": -13.962499618530273, "step": 3020 }, { "epoch": 2.6602282704126425, "grad_norm": 2.4941847948107765, "learning_rate": 3.351624231782265e-07, "logits/chosen": -2.8359375, "logits/rejected": -3.3125, "logps/chosen": -461.45001220703125, "logps/rejected": -391.3999938964844, "loss": 0.0106, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6383941173553467, "rewards/margins": 10.6796875, "rewards/rejected": -13.324999809265137, "step": 3030 }, { "epoch": 2.66900790166813, "grad_norm": 0.46773944577867005, "learning_rate": 3.329675153643547e-07, "logits/chosen": -2.7796874046325684, "logits/rejected": -3.0296874046325684, "logps/chosen": -473.70001220703125, "logps/rejected": -428.5, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.0389404296875, "rewards/margins": 11.693750381469727, "rewards/rejected": -13.728124618530273, "step": 3040 }, { "epoch": 2.677787532923617, "grad_norm": 7.796743106497015, "learning_rate": 3.3077260755048287e-07, "logits/chosen": -2.70703125, "logits/rejected": -3.0531249046325684, "logps/chosen": -510.1000061035156, "logps/rejected": -426.8999938964844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -2.334277391433716, "rewards/margins": 11.012499809265137, "rewards/rejected": -13.340624809265137, "step": 3050 }, { "epoch": 2.6865671641791042, "grad_norm": 149.5400140191504, "learning_rate": 3.2857769973661104e-07, "logits/chosen": -2.819531202316284, "logits/rejected": -3.1507811546325684, "logps/chosen": -508.20001220703125, "logps/rejected": -429.1000061035156, "loss": 0.0251, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.698779344558716, "rewards/margins": 11.685937881469727, "rewards/rejected": -14.387499809265137, "step": 3060 }, { "epoch": 2.6953467954345918, "grad_norm": 2.294379394738642, "learning_rate": 3.263827919227392e-07, "logits/chosen": -2.754687547683716, "logits/rejected": -3.042187452316284, "logps/chosen": -467.1000061035156, "logps/rejected": -426.8999938964844, "loss": 0.0181, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1273436546325684, "rewards/margins": 11.746874809265137, "rewards/rejected": -14.878125190734863, "step": 3070 }, { "epoch": 2.704126426690079, "grad_norm": 0.5390688743726256, "learning_rate": 3.2418788410886744e-07, "logits/chosen": -2.8515625, "logits/rejected": -3.12109375, "logps/chosen": -478.8500061035156, "logps/rejected": -417.70001220703125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -3.1832032203674316, "rewards/margins": 11.684374809265137, "rewards/rejected": -14.887499809265137, "step": 3080 }, { "epoch": 2.7129060579455664, "grad_norm": 0.2930461290342309, "learning_rate": 3.219929762949956e-07, "logits/chosen": -2.7109375, "logits/rejected": -2.9144530296325684, "logps/chosen": -461.45001220703125, "logps/rejected": -432.20001220703125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -2.153515577316284, "rewards/margins": 11.628125190734863, "rewards/rejected": -13.787500381469727, "step": 3090 }, { "epoch": 2.7216856892010535, "grad_norm": 0.8512322206144427, "learning_rate": 3.197980684811238e-07, "logits/chosen": -2.813281297683716, "logits/rejected": -3.2750000953674316, "logps/chosen": -471.6000061035156, "logps/rejected": -413.5, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.836328148841858, "rewards/margins": 11.196874618530273, "rewards/rejected": -13.043749809265137, "step": 3100 }, { "epoch": 2.730465320456541, "grad_norm": 0.5905195511439048, "learning_rate": 3.1760316066725196e-07, "logits/chosen": -2.6429686546325684, "logits/rejected": -3.124218702316284, "logps/chosen": -437.0, "logps/rejected": -411.79998779296875, "loss": 0.0074, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.979760766029358, "rewards/margins": 11.25, "rewards/rejected": -13.21875, "step": 3110 }, { "epoch": 2.739244951712028, "grad_norm": 1.3860649479622287, "learning_rate": 3.1540825285338014e-07, "logits/chosen": -2.9164061546325684, "logits/rejected": -3.3031249046325684, "logps/chosen": -489.20001220703125, "logps/rejected": -410.29998779296875, "loss": 0.0349, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.767773389816284, "rewards/margins": 11.106249809265137, "rewards/rejected": -13.881250381469727, "step": 3120 }, { "epoch": 2.748024582967515, "grad_norm": 0.5854640407417739, "learning_rate": 3.132133450395083e-07, "logits/chosen": -2.59375, "logits/rejected": -3.090625047683716, "logps/chosen": -472.25, "logps/rejected": -410.29998779296875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -2.4012694358825684, "rewards/margins": 11.078125, "rewards/rejected": -13.481249809265137, "step": 3130 }, { "epoch": 2.7568042142230027, "grad_norm": 0.271113356300562, "learning_rate": 3.1101843722563654e-07, "logits/chosen": -2.82421875, "logits/rejected": -3.1851563453674316, "logps/chosen": -513.5999755859375, "logps/rejected": -429.70001220703125, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.95703125, "rewards/margins": 11.831250190734863, "rewards/rejected": -14.806249618530273, "step": 3140 }, { "epoch": 2.76558384547849, "grad_norm": 34.269855016463474, "learning_rate": 3.088235294117647e-07, "logits/chosen": -3.0484375953674316, "logits/rejected": -3.395312547683716, "logps/chosen": -468.75, "logps/rejected": -441.70001220703125, "loss": 0.0196, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.3515625, "rewards/margins": 11.784375190734863, "rewards/rejected": -16.137500762939453, "step": 3150 }, { "epoch": 2.7743634767339773, "grad_norm": 0.18088218776182413, "learning_rate": 3.066286215978929e-07, "logits/chosen": -2.702343702316284, "logits/rejected": -3.0570311546325684, "logps/chosen": -509.5, "logps/rejected": -448.5, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -2.2892088890075684, "rewards/margins": 11.959375381469727, "rewards/rejected": -14.243749618530273, "step": 3160 }, { "epoch": 2.7831431079894644, "grad_norm": 3.5616361764442743, "learning_rate": 3.0443371378402106e-07, "logits/chosen": -2.729687452316284, "logits/rejected": -3.254687547683716, "logps/chosen": -493.0, "logps/rejected": -406.20001220703125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -2.224902391433716, "rewards/margins": 11.518750190734863, "rewards/rejected": -13.743749618530273, "step": 3170 }, { "epoch": 2.791922739244952, "grad_norm": 0.7910837882655565, "learning_rate": 3.0223880597014923e-07, "logits/chosen": -2.774609327316284, "logits/rejected": -3.2164063453674316, "logps/chosen": -524.7999877929688, "logps/rejected": -430.29998779296875, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -2.0347657203674316, "rewards/margins": 10.639062881469727, "rewards/rejected": -12.671875, "step": 3180 }, { "epoch": 2.800702370500439, "grad_norm": 1.2630786654471164, "learning_rate": 3.000438981562774e-07, "logits/chosen": -2.74609375, "logits/rejected": -3.1742186546325684, "logps/chosen": -462.54998779296875, "logps/rejected": -444.8999938964844, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.286328077316284, "rewards/margins": 11.484375, "rewards/rejected": -13.762499809265137, "step": 3190 }, { "epoch": 2.809482001755926, "grad_norm": 0.3941320072196412, "learning_rate": 2.9784899034240563e-07, "logits/chosen": -2.8304686546325684, "logits/rejected": -3.268749952316284, "logps/chosen": -471.1000061035156, "logps/rejected": -432.79998779296875, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -3.32177734375, "rewards/margins": 11.768750190734863, "rewards/rejected": -15.090624809265137, "step": 3200 }, { "epoch": 2.8182616330114136, "grad_norm": 0.3071148935777903, "learning_rate": 2.956540825285338e-07, "logits/chosen": -2.749218702316284, "logits/rejected": -3.198437452316284, "logps/chosen": -468.79998779296875, "logps/rejected": -430.79998779296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -3.77734375, "rewards/margins": 11.734375, "rewards/rejected": -15.512499809265137, "step": 3210 }, { "epoch": 2.8270412642669007, "grad_norm": 0.5084614550875632, "learning_rate": 2.93459174714662e-07, "logits/chosen": -2.87109375, "logits/rejected": -3.3648438453674316, "logps/chosen": -489.29998779296875, "logps/rejected": -413.20001220703125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -3.2464842796325684, "rewards/margins": 11.649999618530273, "rewards/rejected": -14.896875381469727, "step": 3220 }, { "epoch": 2.835820895522388, "grad_norm": 24.3273411326105, "learning_rate": 2.9126426690079015e-07, "logits/chosen": -2.961718797683716, "logits/rejected": -3.401562452316284, "logps/chosen": -399.6499938964844, "logps/rejected": -395.6000061035156, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -3.007617235183716, "rewards/margins": 11.540624618530273, "rewards/rejected": -14.553125381469727, "step": 3230 }, { "epoch": 2.8446005267778753, "grad_norm": 3.314565321429036, "learning_rate": 2.890693590869183e-07, "logits/chosen": -2.6656250953674316, "logits/rejected": -3.0718750953674316, "logps/chosen": -487.5, "logps/rejected": -427.5, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.892187476158142, "rewards/margins": 11.556249618530273, "rewards/rejected": -13.453125, "step": 3240 }, { "epoch": 2.853380158033363, "grad_norm": 28.686295534814914, "learning_rate": 2.868744512730465e-07, "logits/chosen": -2.7359375953674316, "logits/rejected": -3.233593702316284, "logps/chosen": -465.79998779296875, "logps/rejected": -410.79998779296875, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.409374952316284, "rewards/margins": 11.899999618530273, "rewards/rejected": -14.303125381469727, "step": 3250 }, { "epoch": 2.86215978928885, "grad_norm": 7.789187486173813, "learning_rate": 2.846795434591747e-07, "logits/chosen": -2.7984375953674316, "logits/rejected": -3.1656250953674316, "logps/chosen": -487.8999938964844, "logps/rejected": -417.6000061035156, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -2.659960985183716, "rewards/margins": 11.609375, "rewards/rejected": -14.278124809265137, "step": 3260 }, { "epoch": 2.870939420544337, "grad_norm": 29.108873641287463, "learning_rate": 2.824846356453029e-07, "logits/chosen": -2.765625, "logits/rejected": -3.1890625953674316, "logps/chosen": -452.29998779296875, "logps/rejected": -428.20001220703125, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.977343797683716, "rewards/margins": 11.050000190734863, "rewards/rejected": -14.024999618530273, "step": 3270 }, { "epoch": 2.8797190517998246, "grad_norm": 0.6865625237809583, "learning_rate": 2.8028972783143107e-07, "logits/chosen": -2.668750047683716, "logits/rejected": -3.0523438453674316, "logps/chosen": -534.9000244140625, "logps/rejected": -444.5, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.4755859375, "rewards/margins": 12.193750381469727, "rewards/rejected": -14.668749809265137, "step": 3280 }, { "epoch": 2.8884986830553117, "grad_norm": 11.89063702085244, "learning_rate": 2.7809482001755924e-07, "logits/chosen": -2.8031249046325684, "logits/rejected": -3.246875047683716, "logps/chosen": -533.4000244140625, "logps/rejected": -439.79998779296875, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4878907203674316, "rewards/margins": 11.518750190734863, "rewards/rejected": -14.003125190734863, "step": 3290 }, { "epoch": 2.8972783143107987, "grad_norm": 0.37840473401428426, "learning_rate": 2.758999122036874e-07, "logits/chosen": -2.8296875953674316, "logits/rejected": -3.260937452316284, "logps/chosen": -453.1000061035156, "logps/rejected": -409.1000061035156, "loss": 0.0304, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.133007764816284, "rewards/margins": 11.653124809265137, "rewards/rejected": -13.78125, "step": 3300 }, { "epoch": 2.9060579455662863, "grad_norm": 0.16595628402576, "learning_rate": 2.7370500438981564e-07, "logits/chosen": -2.8531250953674316, "logits/rejected": -3.231250047683716, "logps/chosen": -423.8500061035156, "logps/rejected": -414.1000061035156, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.3470702171325684, "rewards/margins": 11.168749809265137, "rewards/rejected": -13.512499809265137, "step": 3310 }, { "epoch": 2.9148375768217734, "grad_norm": 0.45341250829508684, "learning_rate": 2.715100965759438e-07, "logits/chosen": -2.8515625, "logits/rejected": -3.274218797683716, "logps/chosen": -516.5, "logps/rejected": -402.1000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.4305663108825684, "rewards/margins": 11.706250190734863, "rewards/rejected": -14.149999618530273, "step": 3320 }, { "epoch": 2.923617208077261, "grad_norm": 18.752800779260905, "learning_rate": 2.69315188762072e-07, "logits/chosen": -2.821093797683716, "logits/rejected": -3.246875047683716, "logps/chosen": -474.0, "logps/rejected": -414.0, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3584961891174316, "rewards/margins": 12.050000190734863, "rewards/rejected": -14.409375190734863, "step": 3330 }, { "epoch": 2.932396839332748, "grad_norm": 0.2712213729846258, "learning_rate": 2.6712028094820016e-07, "logits/chosen": -2.8539061546325684, "logits/rejected": -3.1859374046325684, "logps/chosen": -513.5999755859375, "logps/rejected": -430.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.98095703125, "rewards/margins": 12.356249809265137, "rewards/rejected": -15.328125, "step": 3340 }, { "epoch": 2.9411764705882355, "grad_norm": 1.4837851904643853, "learning_rate": 2.6492537313432834e-07, "logits/chosen": -2.98046875, "logits/rejected": -3.4320311546325684, "logps/chosen": -419.1000061035156, "logps/rejected": -374.8999938964844, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -2.9925780296325684, "rewards/margins": 11.024999618530273, "rewards/rejected": -14.015625, "step": 3350 }, { "epoch": 2.9499561018437226, "grad_norm": 0.5966690636532355, "learning_rate": 2.627304653204565e-07, "logits/chosen": -2.9429688453674316, "logits/rejected": -3.34765625, "logps/chosen": -479.8999938964844, "logps/rejected": -446.6000061035156, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -3.4771971702575684, "rewards/margins": 12.046875, "rewards/rejected": -15.537500381469727, "step": 3360 }, { "epoch": 2.9587357330992097, "grad_norm": 0.7037761701425428, "learning_rate": 2.6053555750658474e-07, "logits/chosen": -2.8265624046325684, "logits/rejected": -3.30859375, "logps/chosen": -489.3500061035156, "logps/rejected": -447.0, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.331591844558716, "rewards/margins": 12.596875190734863, "rewards/rejected": -15.9375, "step": 3370 }, { "epoch": 2.967515364354697, "grad_norm": 4.5995831378514564, "learning_rate": 2.583406496927129e-07, "logits/chosen": -2.8671875, "logits/rejected": -3.1796875, "logps/chosen": -469.1000061035156, "logps/rejected": -450.20001220703125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.9496092796325684, "rewards/margins": 12.296875, "rewards/rejected": -15.259374618530273, "step": 3380 }, { "epoch": 2.9762949956101843, "grad_norm": 1.9942442507224407, "learning_rate": 2.561457418788411e-07, "logits/chosen": -2.729687452316284, "logits/rejected": -3.401562452316284, "logps/chosen": -499.6000061035156, "logps/rejected": -394.70001220703125, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -2.23828125, "rewards/margins": 11.303125381469727, "rewards/rejected": -13.546875, "step": 3390 }, { "epoch": 2.9850746268656714, "grad_norm": 0.826774420947246, "learning_rate": 2.5395083406496926e-07, "logits/chosen": -2.7992186546325684, "logits/rejected": -3.25, "logps/chosen": -455.0, "logps/rejected": -406.3999938964844, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.381542921066284, "rewards/margins": 11.965624809265137, "rewards/rejected": -14.34375, "step": 3400 }, { "epoch": 2.993854258121159, "grad_norm": 0.21107534256101781, "learning_rate": 2.5175592625109743e-07, "logits/chosen": -2.8421874046325684, "logits/rejected": -3.336718797683716, "logps/chosen": -425.54998779296875, "logps/rejected": -428.1000061035156, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.1407227516174316, "rewards/margins": 12.328125, "rewards/rejected": -15.462499618530273, "step": 3410 }, { "epoch": 3.002633889376646, "grad_norm": 0.19105478413577648, "learning_rate": 2.4956101843722566e-07, "logits/chosen": -2.676953077316284, "logits/rejected": -3.2578125, "logps/chosen": -477.0, "logps/rejected": -429.1499938964844, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -2.495800733566284, "rewards/margins": 11.5625, "rewards/rejected": -14.056249618530273, "step": 3420 }, { "epoch": 3.0114135206321335, "grad_norm": 0.2897671146845593, "learning_rate": 2.4736611062335383e-07, "logits/chosen": -2.76171875, "logits/rejected": -3.1703124046325684, "logps/chosen": -527.0999755859375, "logps/rejected": -449.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.3095703125, "rewards/margins": 12.865625381469727, "rewards/rejected": -15.175000190734863, "step": 3430 }, { "epoch": 3.0201931518876206, "grad_norm": 0.13142725375963107, "learning_rate": 2.45171202809482e-07, "logits/chosen": -2.952343702316284, "logits/rejected": -3.26953125, "logps/chosen": -443.70001220703125, "logps/rejected": -444.1000061035156, "loss": 0.0259, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.306835889816284, "rewards/margins": 12.168749809265137, "rewards/rejected": -15.481249809265137, "step": 3440 }, { "epoch": 3.028972783143108, "grad_norm": 0.1804390208209594, "learning_rate": 2.429762949956102e-07, "logits/chosen": -2.70703125, "logits/rejected": -3.2992186546325684, "logps/chosen": -509.1000061035156, "logps/rejected": -423.79998779296875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.696484327316284, "rewards/margins": 12.456250190734863, "rewards/rejected": -15.146875381469727, "step": 3450 }, { "epoch": 3.0377524143985952, "grad_norm": 1.5459458361152458, "learning_rate": 2.4078138718173835e-07, "logits/chosen": -2.91015625, "logits/rejected": -3.4976563453674316, "logps/chosen": -398.1499938964844, "logps/rejected": -395.70001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.157177686691284, "rewards/margins": 12.5625, "rewards/rejected": -14.71875, "step": 3460 }, { "epoch": 3.0465320456540823, "grad_norm": 0.29451979000339223, "learning_rate": 2.385864793678666e-07, "logits/chosen": -2.78515625, "logits/rejected": -3.194531202316284, "logps/chosen": -451.3500061035156, "logps/rejected": -419.8999938964844, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -2.0389161109924316, "rewards/margins": 12.375, "rewards/rejected": -14.396875381469727, "step": 3470 }, { "epoch": 3.05531167690957, "grad_norm": 0.22625894057676185, "learning_rate": 2.3639157155399472e-07, "logits/chosen": -2.859375, "logits/rejected": -3.3023438453674316, "logps/chosen": -486.29998779296875, "logps/rejected": -440.5, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.752148389816284, "rewards/margins": 12.478124618530273, "rewards/rejected": -15.221875190734863, "step": 3480 }, { "epoch": 3.064091308165057, "grad_norm": 0.433111943744663, "learning_rate": 2.341966637401229e-07, "logits/chosen": -2.9000000953674316, "logits/rejected": -3.335156202316284, "logps/chosen": -552.7999877929688, "logps/rejected": -435.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.7406249046325684, "rewards/margins": 13.028124809265137, "rewards/rejected": -15.774999618530273, "step": 3490 }, { "epoch": 3.0728709394205445, "grad_norm": 0.17475867501196074, "learning_rate": 2.320017559262511e-07, "logits/chosen": -2.8726563453674316, "logits/rejected": -3.2320313453674316, "logps/chosen": -474.75, "logps/rejected": -458.5, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.07421875, "rewards/margins": 12.300000190734863, "rewards/rejected": -15.375, "step": 3500 }, { "epoch": 3.0816505706760315, "grad_norm": 1.1673770666973489, "learning_rate": 2.2980684811237927e-07, "logits/chosen": -2.9242186546325684, "logits/rejected": -3.30078125, "logps/chosen": -495.8999938964844, "logps/rejected": -478.20001220703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9310545921325684, "rewards/margins": 12.453125, "rewards/rejected": -15.387499809265137, "step": 3510 }, { "epoch": 3.090430201931519, "grad_norm": 1.8334127424859914, "learning_rate": 2.2761194029850744e-07, "logits/chosen": -2.8765625953674316, "logits/rejected": -3.499218702316284, "logps/chosen": -460.70001220703125, "logps/rejected": -415.29998779296875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.2349610328674316, "rewards/margins": 12.709375381469727, "rewards/rejected": -14.931249618530273, "step": 3520 }, { "epoch": 3.099209833187006, "grad_norm": 0.0662337109745872, "learning_rate": 2.2541703248463564e-07, "logits/chosen": -2.848437547683716, "logits/rejected": -3.24609375, "logps/chosen": -494.5, "logps/rejected": -438.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.72412109375, "rewards/margins": 12.690625190734863, "rewards/rejected": -15.421875, "step": 3530 }, { "epoch": 3.1079894644424932, "grad_norm": 0.17196436735775314, "learning_rate": 2.2322212467076381e-07, "logits/chosen": -2.953125, "logits/rejected": -3.3843750953674316, "logps/chosen": -461.3500061035156, "logps/rejected": -419.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.92626953125, "rewards/margins": 12.571874618530273, "rewards/rejected": -15.5, "step": 3540 }, { "epoch": 3.1167690956979808, "grad_norm": 11.34050266346943, "learning_rate": 2.21027216856892e-07, "logits/chosen": -2.932812452316284, "logits/rejected": -3.453125, "logps/chosen": -472.04998779296875, "logps/rejected": -416.8999938964844, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.991406202316284, "rewards/margins": 13.496874809265137, "rewards/rejected": -16.484375, "step": 3550 }, { "epoch": 3.125548726953468, "grad_norm": 2.835787372901712, "learning_rate": 2.188323090430202e-07, "logits/chosen": -2.8671875, "logits/rejected": -3.4242186546325684, "logps/chosen": -440.45001220703125, "logps/rejected": -394.1000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6015625, "rewards/margins": 12.134374618530273, "rewards/rejected": -14.737500190734863, "step": 3560 }, { "epoch": 3.1343283582089554, "grad_norm": 0.2898615380402753, "learning_rate": 2.1663740122914836e-07, "logits/chosen": -2.809375047683716, "logits/rejected": -3.27734375, "logps/chosen": -472.1000061035156, "logps/rejected": -454.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.3189697265625, "rewards/margins": 12.465624809265137, "rewards/rejected": -14.787500381469727, "step": 3570 }, { "epoch": 3.1431079894644425, "grad_norm": 0.22346892383750283, "learning_rate": 2.1444249341527653e-07, "logits/chosen": -2.819531202316284, "logits/rejected": -3.2406249046325684, "logps/chosen": -458.70001220703125, "logps/rejected": -412.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.282910108566284, "rewards/margins": 13.006250381469727, "rewards/rejected": -15.300000190734863, "step": 3580 }, { "epoch": 3.1518876207199296, "grad_norm": 0.06165003089752172, "learning_rate": 2.1224758560140473e-07, "logits/chosen": -2.8843750953674316, "logits/rejected": -3.4000000953674316, "logps/chosen": -436.25, "logps/rejected": -404.70001220703125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -2.8697266578674316, "rewards/margins": 12.5625, "rewards/rejected": -15.434374809265137, "step": 3590 }, { "epoch": 3.160667251975417, "grad_norm": 0.35852430505146193, "learning_rate": 2.100526777875329e-07, "logits/chosen": -2.905468702316284, "logits/rejected": -3.313281297683716, "logps/chosen": -464.1000061035156, "logps/rejected": -479.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8792967796325684, "rewards/margins": 13.137499809265137, "rewards/rejected": -16.021875381469727, "step": 3600 }, { "epoch": 3.169446883230904, "grad_norm": 0.6426315052360575, "learning_rate": 2.0785776997366108e-07, "logits/chosen": -2.995312452316284, "logits/rejected": -3.46875, "logps/chosen": -429.1000061035156, "logps/rejected": -421.5, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -2.863818407058716, "rewards/margins": 12.074999809265137, "rewards/rejected": -14.9375, "step": 3610 }, { "epoch": 3.1782265144863917, "grad_norm": 0.2104731802633413, "learning_rate": 2.0566286215978928e-07, "logits/chosen": -2.723437547683716, "logits/rejected": -3.2964844703674316, "logps/chosen": -528.0999755859375, "logps/rejected": -425.8999938964844, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.5757813453674316, "rewards/margins": 12.834375381469727, "rewards/rejected": -15.415624618530273, "step": 3620 }, { "epoch": 3.187006145741879, "grad_norm": 0.21403310051401384, "learning_rate": 2.0346795434591745e-07, "logits/chosen": -2.922656297683716, "logits/rejected": -3.47265625, "logps/chosen": -483.3500061035156, "logps/rejected": -429.3999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.660595655441284, "rewards/margins": 12.615625381469727, "rewards/rejected": -15.274999618530273, "step": 3630 }, { "epoch": 3.195785776997366, "grad_norm": 0.11778338364967891, "learning_rate": 2.0127304653204563e-07, "logits/chosen": -2.9906249046325684, "logits/rejected": -3.38671875, "logps/chosen": -503.8999938964844, "logps/rejected": -439.6000061035156, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.6205077171325684, "rewards/margins": 12.443750381469727, "rewards/rejected": -15.068750381469727, "step": 3640 }, { "epoch": 3.2045654082528534, "grad_norm": 1.2168006025364513, "learning_rate": 1.9907813871817383e-07, "logits/chosen": -2.82421875, "logits/rejected": -3.30078125, "logps/chosen": -473.45001220703125, "logps/rejected": -408.79998779296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.6533203125, "rewards/margins": 11.981249809265137, "rewards/rejected": -14.640625, "step": 3650 }, { "epoch": 3.2133450395083405, "grad_norm": 0.31806520029939034, "learning_rate": 1.96883230904302e-07, "logits/chosen": -2.9124999046325684, "logits/rejected": -3.325000047683716, "logps/chosen": -460.3999938964844, "logps/rejected": -434.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.6673827171325684, "rewards/margins": 12.850000381469727, "rewards/rejected": -15.515625, "step": 3660 }, { "epoch": 3.222124670763828, "grad_norm": 0.0645329869839452, "learning_rate": 1.946883230904302e-07, "logits/chosen": -2.768749952316284, "logits/rejected": -3.24609375, "logps/chosen": -485.0, "logps/rejected": -485.5, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.003857374191284, "rewards/margins": 13.121874809265137, "rewards/rejected": -16.125, "step": 3670 }, { "epoch": 3.230904302019315, "grad_norm": 0.24265794854362419, "learning_rate": 1.9249341527655837e-07, "logits/chosen": -2.921093702316284, "logits/rejected": -3.3343749046325684, "logps/chosen": -481.1000061035156, "logps/rejected": -446.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.3812012672424316, "rewards/margins": 12.559374809265137, "rewards/rejected": -14.943750381469727, "step": 3680 }, { "epoch": 3.2396839332748026, "grad_norm": 0.17333913417238933, "learning_rate": 1.9029850746268655e-07, "logits/chosen": -2.835156202316284, "logits/rejected": -3.473437547683716, "logps/chosen": -474.6499938964844, "logps/rejected": -402.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.8431639671325684, "rewards/margins": 12.606249809265137, "rewards/rejected": -15.446874618530273, "step": 3690 }, { "epoch": 3.2484635645302897, "grad_norm": 0.1394520929874817, "learning_rate": 1.8810359964881475e-07, "logits/chosen": -2.7242188453674316, "logits/rejected": -3.1968750953674316, "logps/chosen": -476.6000061035156, "logps/rejected": -433.3999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.120898485183716, "rewards/margins": 12.612500190734863, "rewards/rejected": -14.731249809265137, "step": 3700 }, { "epoch": 3.257243195785777, "grad_norm": 124.02747783437341, "learning_rate": 1.8590869183494292e-07, "logits/chosen": -2.684765577316284, "logits/rejected": -3.2796874046325684, "logps/chosen": -494.25, "logps/rejected": -471.1000061035156, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1326050758361816, "rewards/margins": 12.896875381469727, "rewards/rejected": -16.03125, "step": 3710 }, { "epoch": 3.2660228270412643, "grad_norm": 0.4924574534539922, "learning_rate": 1.837137840210711e-07, "logits/chosen": -2.9351563453674316, "logits/rejected": -3.358593702316284, "logps/chosen": -492.75, "logps/rejected": -443.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.913769483566284, "rewards/margins": 12.78125, "rewards/rejected": -15.693750381469727, "step": 3720 }, { "epoch": 3.2748024582967514, "grad_norm": 0.654673041651116, "learning_rate": 1.815188762071993e-07, "logits/chosen": -2.8746094703674316, "logits/rejected": -3.2484374046325684, "logps/chosen": -500.20001220703125, "logps/rejected": -467.70001220703125, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.586865186691284, "rewards/margins": 13.371874809265137, "rewards/rejected": -15.959375381469727, "step": 3730 }, { "epoch": 3.283582089552239, "grad_norm": 0.07458317569216595, "learning_rate": 1.7932396839332747e-07, "logits/chosen": -2.9453125, "logits/rejected": -3.311718702316284, "logps/chosen": -451.45001220703125, "logps/rejected": -462.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.230175733566284, "rewards/margins": 13.628125190734863, "rewards/rejected": -16.850000381469727, "step": 3740 }, { "epoch": 3.292361720807726, "grad_norm": 0.10553653536181591, "learning_rate": 1.7712906057945564e-07, "logits/chosen": -2.953125, "logits/rejected": -3.4906249046325684, "logps/chosen": -508.29998779296875, "logps/rejected": -426.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.3525390625, "rewards/margins": 12.456250190734863, "rewards/rejected": -15.803125381469727, "step": 3750 }, { "epoch": 3.3011413520632136, "grad_norm": 0.05722732486084981, "learning_rate": 1.7493415276558384e-07, "logits/chosen": -2.8828125, "logits/rejected": -3.340625047683716, "logps/chosen": -443.1000061035156, "logps/rejected": -435.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.0640625953674316, "rewards/margins": 13.106249809265137, "rewards/rejected": -16.165624618530273, "step": 3760 }, { "epoch": 3.3099209833187007, "grad_norm": 0.0754017491576997, "learning_rate": 1.72739244951712e-07, "logits/chosen": -2.901562452316284, "logits/rejected": -3.44140625, "logps/chosen": -500.79998779296875, "logps/rejected": -424.5, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.3499999046325684, "rewards/margins": 12.615625381469727, "rewards/rejected": -15.971875190734863, "step": 3770 }, { "epoch": 3.3187006145741877, "grad_norm": 0.2651092306336141, "learning_rate": 1.7054433713784019e-07, "logits/chosen": -2.936718702316284, "logits/rejected": -3.3203125, "logps/chosen": -445.54998779296875, "logps/rejected": -410.29998779296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.1976561546325684, "rewards/margins": 12.009374618530273, "rewards/rejected": -15.212499618530273, "step": 3780 }, { "epoch": 3.3274802458296753, "grad_norm": 0.12226431439251058, "learning_rate": 1.6834942932396839e-07, "logits/chosen": -2.9281249046325684, "logits/rejected": -3.426562547683716, "logps/chosen": -476.8500061035156, "logps/rejected": -428.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4535155296325684, "rewards/margins": 13.168749809265137, "rewards/rejected": -16.625, "step": 3790 }, { "epoch": 3.3362598770851624, "grad_norm": 1.0456090219350036, "learning_rate": 1.6615452151009656e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -3.362499952316284, "logps/chosen": -509.29998779296875, "logps/rejected": -448.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.881640672683716, "rewards/margins": 12.943750381469727, "rewards/rejected": -15.809374809265137, "step": 3800 }, { "epoch": 3.3450395083406494, "grad_norm": 0.19182329344783963, "learning_rate": 1.6395961369622473e-07, "logits/chosen": -2.77734375, "logits/rejected": -3.137500047683716, "logps/chosen": -507.8999938964844, "logps/rejected": -511.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.092578172683716, "rewards/margins": 13.212499618530273, "rewards/rejected": -16.309375762939453, "step": 3810 }, { "epoch": 3.353819139596137, "grad_norm": 4.527202299510661, "learning_rate": 1.6176470588235293e-07, "logits/chosen": -2.94140625, "logits/rejected": -3.397656202316284, "logps/chosen": -473.3999938964844, "logps/rejected": -428.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.9330077171325684, "rewards/margins": 12.568750381469727, "rewards/rejected": -15.5, "step": 3820 }, { "epoch": 3.362598770851624, "grad_norm": 0.8945758518684351, "learning_rate": 1.595697980684811e-07, "logits/chosen": -2.896484375, "logits/rejected": -3.499218702316284, "logps/chosen": -465.1000061035156, "logps/rejected": -421.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.2679443359375, "rewards/margins": 12.378125190734863, "rewards/rejected": -14.640625, "step": 3830 }, { "epoch": 3.3713784021071116, "grad_norm": 0.05290599153908576, "learning_rate": 1.5737489025460928e-07, "logits/chosen": -2.921875, "logits/rejected": -3.282031297683716, "logps/chosen": -449.75, "logps/rejected": -464.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8628907203674316, "rewards/margins": 12.287500381469727, "rewards/rejected": -15.146875381469727, "step": 3840 }, { "epoch": 3.3801580333625987, "grad_norm": 0.8799763925896169, "learning_rate": 1.5517998244073748e-07, "logits/chosen": -3.0648436546325684, "logits/rejected": -3.616406202316284, "logps/chosen": -424.1000061035156, "logps/rejected": -387.1000061035156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.1626954078674316, "rewards/margins": 12.384374618530273, "rewards/rejected": -15.546875, "step": 3850 }, { "epoch": 3.388937664618086, "grad_norm": 2.747670141209021, "learning_rate": 1.5298507462686565e-07, "logits/chosen": -2.9296875, "logits/rejected": -3.4390625953674316, "logps/chosen": -464.79998779296875, "logps/rejected": -426.3999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.5855469703674316, "rewards/margins": 12.675000190734863, "rewards/rejected": -15.25, "step": 3860 }, { "epoch": 3.3977172958735733, "grad_norm": 0.6049742078697031, "learning_rate": 1.5079016681299383e-07, "logits/chosen": -2.885937452316284, "logits/rejected": -3.149218797683716, "logps/chosen": -456.8999938964844, "logps/rejected": -453.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.618115186691284, "rewards/margins": 12.5625, "rewards/rejected": -15.184374809265137, "step": 3870 }, { "epoch": 3.4064969271290604, "grad_norm": 0.1470989199812354, "learning_rate": 1.4859525899912203e-07, "logits/chosen": -3.04296875, "logits/rejected": -3.522656202316284, "logps/chosen": -493.1000061035156, "logps/rejected": -409.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.778125047683716, "rewards/margins": 12.265625, "rewards/rejected": -15.043749809265137, "step": 3880 }, { "epoch": 3.415276558384548, "grad_norm": 0.13161265388801072, "learning_rate": 1.464003511852502e-07, "logits/chosen": -2.9781250953674316, "logits/rejected": -3.413281202316284, "logps/chosen": -437.8500061035156, "logps/rejected": -437.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.8871092796325684, "rewards/margins": 11.887499809265137, "rewards/rejected": -14.778124809265137, "step": 3890 }, { "epoch": 3.424056189640035, "grad_norm": 1.196199409101085, "learning_rate": 1.4420544337137837e-07, "logits/chosen": -2.8851561546325684, "logits/rejected": -3.518749952316284, "logps/chosen": -499.5, "logps/rejected": -418.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.387890577316284, "rewards/margins": 12.559374809265137, "rewards/rejected": -15.943750381469727, "step": 3900 }, { "epoch": 3.4328358208955225, "grad_norm": 0.7215686703181198, "learning_rate": 1.4201053555750657e-07, "logits/chosen": -3.0687499046325684, "logits/rejected": -3.596874952316284, "logps/chosen": -484.0, "logps/rejected": -444.5, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -3.642578125, "rewards/margins": 12.634374618530273, "rewards/rejected": -16.271875381469727, "step": 3910 }, { "epoch": 3.4416154521510096, "grad_norm": 0.4406738843864271, "learning_rate": 1.3981562774363474e-07, "logits/chosen": -2.7437500953674316, "logits/rejected": -3.1546874046325684, "logps/chosen": -532.2999877929688, "logps/rejected": -455.70001220703125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.461230516433716, "rewards/margins": 13.050000190734863, "rewards/rejected": -15.524999618530273, "step": 3920 }, { "epoch": 3.450395083406497, "grad_norm": 0.4340808016288137, "learning_rate": 1.3762071992976294e-07, "logits/chosen": -2.964062452316284, "logits/rejected": -3.3984375, "logps/chosen": -490.0, "logps/rejected": -420.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.340625047683716, "rewards/margins": 13.475000381469727, "rewards/rejected": -16.809375762939453, "step": 3930 }, { "epoch": 3.4591747146619842, "grad_norm": 0.05533862491991608, "learning_rate": 1.3542581211589112e-07, "logits/chosen": -2.9375, "logits/rejected": -3.2437500953674316, "logps/chosen": -458.8999938964844, "logps/rejected": -433.79998779296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.80859375, "rewards/margins": 12.756250381469727, "rewards/rejected": -15.565625190734863, "step": 3940 }, { "epoch": 3.4679543459174713, "grad_norm": 0.18822112601303712, "learning_rate": 1.332309043020193e-07, "logits/chosen": -2.940624952316284, "logits/rejected": -3.43359375, "logps/chosen": -486.20001220703125, "logps/rejected": -410.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.6832032203674316, "rewards/margins": 12.743749618530273, "rewards/rejected": -16.412500381469727, "step": 3950 }, { "epoch": 3.476733977172959, "grad_norm": 0.18996959944850028, "learning_rate": 1.310359964881475e-07, "logits/chosen": -2.9937500953674316, "logits/rejected": -3.518749952316284, "logps/chosen": -435.8999938964844, "logps/rejected": -429.79998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.096874952316284, "rewards/margins": 13.153124809265137, "rewards/rejected": -16.231250762939453, "step": 3960 }, { "epoch": 3.485513608428446, "grad_norm": 0.01910957476568621, "learning_rate": 1.2884108867427566e-07, "logits/chosen": -2.807812452316284, "logits/rejected": -3.321093797683716, "logps/chosen": -529.7999877929688, "logps/rejected": -440.70001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3363280296325684, "rewards/margins": 13.03125, "rewards/rejected": -16.368749618530273, "step": 3970 }, { "epoch": 3.4942932396839335, "grad_norm": 2.102750056933607, "learning_rate": 1.2664618086040384e-07, "logits/chosen": -2.7914061546325684, "logits/rejected": -3.3765625953674316, "logps/chosen": -512.9500122070312, "logps/rejected": -459.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6820311546325684, "rewards/margins": 13.162500381469727, "rewards/rejected": -15.850000381469727, "step": 3980 }, { "epoch": 3.5030728709394205, "grad_norm": 0.07518352660108933, "learning_rate": 1.2445127304653204e-07, "logits/chosen": -2.8296875953674316, "logits/rejected": -3.547656297683716, "logps/chosen": -479.29998779296875, "logps/rejected": -404.79998779296875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.124755859375, "rewards/margins": 12.378125190734863, "rewards/rejected": -15.496874809265137, "step": 3990 }, { "epoch": 3.511852502194908, "grad_norm": 3.3562862303540464, "learning_rate": 1.2225636523266024e-07, "logits/chosen": -2.762500047683716, "logits/rejected": -3.207812547683716, "logps/chosen": -510.6499938964844, "logps/rejected": -437.20001220703125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.0020508766174316, "rewards/margins": 12.731249809265137, "rewards/rejected": -14.737500190734863, "step": 4000 }, { "epoch": 3.520632133450395, "grad_norm": 0.44469118787885575, "learning_rate": 1.200614574187884e-07, "logits/chosen": -2.9765625, "logits/rejected": -3.510937452316284, "logps/chosen": -426.20001220703125, "logps/rejected": -392.3999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.1279296875, "rewards/margins": 12.796875, "rewards/rejected": -15.918749809265137, "step": 4010 }, { "epoch": 3.5294117647058822, "grad_norm": 0.13597357049994596, "learning_rate": 1.1786654960491658e-07, "logits/chosen": -3.0078125, "logits/rejected": -3.5, "logps/chosen": -432.8999938964844, "logps/rejected": -436.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.1011719703674316, "rewards/margins": 12.603124618530273, "rewards/rejected": -15.696874618530273, "step": 4020 }, { "epoch": 3.53819139596137, "grad_norm": 0.15523896636363424, "learning_rate": 1.1567164179104477e-07, "logits/chosen": -2.8578124046325684, "logits/rejected": -3.108593702316284, "logps/chosen": -462.20001220703125, "logps/rejected": -485.20001220703125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6551756858825684, "rewards/margins": 12.740625381469727, "rewards/rejected": -15.403124809265137, "step": 4030 }, { "epoch": 3.546971027216857, "grad_norm": 1.652462109336473, "learning_rate": 1.1347673397717296e-07, "logits/chosen": -2.854687452316284, "logits/rejected": -3.3343749046325684, "logps/chosen": -548.9000244140625, "logps/rejected": -451.6000061035156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.1659178733825684, "rewards/margins": 12.746874809265137, "rewards/rejected": -14.921875, "step": 4040 }, { "epoch": 3.555750658472344, "grad_norm": 0.19301163849638503, "learning_rate": 1.1128182616330113e-07, "logits/chosen": -2.844531297683716, "logits/rejected": -3.2750000953674316, "logps/chosen": -447.04998779296875, "logps/rejected": -453.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0361328125, "rewards/margins": 12.875, "rewards/rejected": -15.921875, "step": 4050 }, { "epoch": 3.5645302897278315, "grad_norm": 0.9313979779632242, "learning_rate": 1.0908691834942932e-07, "logits/chosen": -2.90234375, "logits/rejected": -3.3148436546325684, "logps/chosen": -487.1000061035156, "logps/rejected": -476.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.663769483566284, "rewards/margins": 13.459375381469727, "rewards/rejected": -16.131250381469727, "step": 4060 }, { "epoch": 3.5733099209833186, "grad_norm": 0.07980997449800085, "learning_rate": 1.068920105355575e-07, "logits/chosen": -2.9242186546325684, "logits/rejected": -3.2398438453674316, "logps/chosen": -483.20001220703125, "logps/rejected": -446.1000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.184765577316284, "rewards/margins": 12.918749809265137, "rewards/rejected": -16.100000381469727, "step": 4070 }, { "epoch": 3.582089552238806, "grad_norm": 0.05193347424244529, "learning_rate": 1.0469710272168568e-07, "logits/chosen": -2.9625000953674316, "logits/rejected": -3.453125, "logps/chosen": -482.20001220703125, "logps/rejected": -425.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.33544921875, "rewards/margins": 12.084375381469727, "rewards/rejected": -15.40625, "step": 4080 }, { "epoch": 3.590869183494293, "grad_norm": 0.23734802974722582, "learning_rate": 1.0250219490781386e-07, "logits/chosen": -2.9375, "logits/rejected": -3.4000000953674316, "logps/chosen": -401.25, "logps/rejected": -408.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.783496141433716, "rewards/margins": 12.915624618530273, "rewards/rejected": -15.675000190734863, "step": 4090 }, { "epoch": 3.5996488147497807, "grad_norm": 0.1646840228622721, "learning_rate": 1.0030728709394205e-07, "logits/chosen": -2.9429688453674316, "logits/rejected": -3.484375, "logps/chosen": -508.1000061035156, "logps/rejected": -455.20001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.922070264816284, "rewards/margins": 13.053125381469727, "rewards/rejected": -15.975000381469727, "step": 4100 }, { "epoch": 3.608428446005268, "grad_norm": 0.5424737470781746, "learning_rate": 9.811237928007022e-08, "logits/chosen": -2.8812499046325684, "logits/rejected": -3.5445313453674316, "logps/chosen": -420.6000061035156, "logps/rejected": -414.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.350781202316284, "rewards/margins": 13.543749809265137, "rewards/rejected": -16.893749237060547, "step": 4110 }, { "epoch": 3.617208077260755, "grad_norm": 0.319465348304568, "learning_rate": 9.591747146619841e-08, "logits/chosen": -3.020312547683716, "logits/rejected": -3.4609375, "logps/chosen": -461.20001220703125, "logps/rejected": -412.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.85498046875, "rewards/margins": 12.981249809265137, "rewards/rejected": -15.834375381469727, "step": 4120 }, { "epoch": 3.6259877085162424, "grad_norm": 21.26942592839781, "learning_rate": 9.37225636523266e-08, "logits/chosen": -2.8734374046325684, "logits/rejected": -3.3578124046325684, "logps/chosen": -510.6000061035156, "logps/rejected": -421.1000061035156, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.2734375, "rewards/margins": 12.959375381469727, "rewards/rejected": -15.234375, "step": 4130 }, { "epoch": 3.6347673397717295, "grad_norm": 0.05669587340693758, "learning_rate": 9.152765583845478e-08, "logits/chosen": -3.003124952316284, "logits/rejected": -3.500781297683716, "logps/chosen": -498.20001220703125, "logps/rejected": -400.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.163403272628784, "rewards/margins": 13.096875190734863, "rewards/rejected": -16.256250381469727, "step": 4140 }, { "epoch": 3.6435469710272166, "grad_norm": 0.19390133786865785, "learning_rate": 8.933274802458296e-08, "logits/chosen": -2.840625047683716, "logits/rejected": -3.250781297683716, "logps/chosen": -510.3999938964844, "logps/rejected": -480.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2867188453674316, "rewards/margins": 12.990625381469727, "rewards/rejected": -16.278125762939453, "step": 4150 }, { "epoch": 3.652326602282704, "grad_norm": 0.022028018683772615, "learning_rate": 8.713784021071114e-08, "logits/chosen": -2.9195313453674316, "logits/rejected": -3.37109375, "logps/chosen": -470.45001220703125, "logps/rejected": -413.8999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.0035157203674316, "rewards/margins": 12.774999618530273, "rewards/rejected": -15.771875381469727, "step": 4160 }, { "epoch": 3.6611062335381916, "grad_norm": 0.7821369463512009, "learning_rate": 8.494293239683933e-08, "logits/chosen": -2.860156297683716, "logits/rejected": -3.299999952316284, "logps/chosen": -528.5, "logps/rejected": -498.1000061035156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1226563453674316, "rewards/margins": 12.753125190734863, "rewards/rejected": -15.868749618530273, "step": 4170 }, { "epoch": 3.6698858647936787, "grad_norm": 0.06689871902320767, "learning_rate": 8.27480245829675e-08, "logits/chosen": -2.8968749046325684, "logits/rejected": -3.452343702316284, "logps/chosen": -490.0, "logps/rejected": -421.1000061035156, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -2.5009765625, "rewards/margins": 12.368749618530273, "rewards/rejected": -14.881250381469727, "step": 4180 }, { "epoch": 3.678665496049166, "grad_norm": 0.18775309242892332, "learning_rate": 8.055311676909569e-08, "logits/chosen": -2.973437547683716, "logits/rejected": -3.4476561546325684, "logps/chosen": -469.3999938964844, "logps/rejected": -434.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.11767578125, "rewards/margins": 12.856249809265137, "rewards/rejected": -15.975000381469727, "step": 4190 }, { "epoch": 3.6874451273046533, "grad_norm": 0.06338303708779713, "learning_rate": 7.835820895522388e-08, "logits/chosen": -2.8414063453674316, "logits/rejected": -3.2328124046325684, "logps/chosen": -445.6000061035156, "logps/rejected": -461.20001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.0074219703674316, "rewards/margins": 12.118749618530273, "rewards/rejected": -15.131250381469727, "step": 4200 }, { "epoch": 3.6962247585601404, "grad_norm": 0.373425027629608, "learning_rate": 7.616330114135205e-08, "logits/chosen": -2.965625047683716, "logits/rejected": -3.452343702316284, "logps/chosen": -432.6000061035156, "logps/rejected": -431.6000061035156, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -3.4754395484924316, "rewards/margins": 12.737500190734863, "rewards/rejected": -16.225000381469727, "step": 4210 }, { "epoch": 3.7050043898156275, "grad_norm": 7.129266016403341, "learning_rate": 7.396839332748024e-08, "logits/chosen": -2.8765625953674316, "logits/rejected": -3.4007811546325684, "logps/chosen": -476.3500061035156, "logps/rejected": -473.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.4273438453674316, "rewards/margins": 13.015625, "rewards/rejected": -16.440624237060547, "step": 4220 }, { "epoch": 3.713784021071115, "grad_norm": 0.09490785771518713, "learning_rate": 7.177348551360842e-08, "logits/chosen": -2.934375047683716, "logits/rejected": -3.37109375, "logps/chosen": -502.04998779296875, "logps/rejected": -446.8999938964844, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.301562547683716, "rewards/margins": 12.709375381469727, "rewards/rejected": -16.015625, "step": 4230 }, { "epoch": 3.722563652326602, "grad_norm": 0.08737448223316606, "learning_rate": 6.95785776997366e-08, "logits/chosen": -2.8617186546325684, "logits/rejected": -3.359375, "logps/chosen": -503.20001220703125, "logps/rejected": -444.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.017773389816284, "rewards/margins": 12.971875190734863, "rewards/rejected": -15.990625381469727, "step": 4240 }, { "epoch": 3.7313432835820897, "grad_norm": 0.19127685117675428, "learning_rate": 6.738366988586478e-08, "logits/chosen": -2.8296875953674316, "logits/rejected": -3.3851561546325684, "logps/chosen": -479.95001220703125, "logps/rejected": -426.6000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.4801268577575684, "rewards/margins": 13.0625, "rewards/rejected": -15.540624618530273, "step": 4250 }, { "epoch": 3.7401229148375768, "grad_norm": 0.27325222538850813, "learning_rate": 6.518876207199297e-08, "logits/chosen": -2.86328125, "logits/rejected": -3.424999952316284, "logps/chosen": -486.8999938964844, "logps/rejected": -438.5, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.8695311546325684, "rewards/margins": 12.915624618530273, "rewards/rejected": -15.793749809265137, "step": 4260 }, { "epoch": 3.7489025460930643, "grad_norm": 0.24953992223000473, "learning_rate": 6.299385425812117e-08, "logits/chosen": -2.9359374046325684, "logits/rejected": -3.3921875953674316, "logps/chosen": -455.5, "logps/rejected": -437.8999938964844, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.7079100608825684, "rewards/margins": 12.928125381469727, "rewards/rejected": -15.637499809265137, "step": 4270 }, { "epoch": 3.7576821773485514, "grad_norm": 0.16534712472113114, "learning_rate": 6.079894644424934e-08, "logits/chosen": -2.7734375, "logits/rejected": -3.375781297683716, "logps/chosen": -504.45001220703125, "logps/rejected": -443.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.8353271484375, "rewards/margins": 12.565625190734863, "rewards/rejected": -15.403124809265137, "step": 4280 }, { "epoch": 3.7664618086040385, "grad_norm": 0.05811605261647762, "learning_rate": 5.860403863037752e-08, "logits/chosen": -2.9749999046325684, "logits/rejected": -3.44921875, "logps/chosen": -461.70001220703125, "logps/rejected": -453.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.014843702316284, "rewards/margins": 12.471875190734863, "rewards/rejected": -15.490625381469727, "step": 4290 }, { "epoch": 3.775241439859526, "grad_norm": 0.48854372959318854, "learning_rate": 5.64091308165057e-08, "logits/chosen": -2.74609375, "logits/rejected": -3.2984375953674316, "logps/chosen": -545.4000244140625, "logps/rejected": -437.29998779296875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.6949219703674316, "rewards/margins": 12.440625190734863, "rewards/rejected": -15.128125190734863, "step": 4300 }, { "epoch": 3.784021071115013, "grad_norm": 0.5697026376379704, "learning_rate": 5.421422300263389e-08, "logits/chosen": -3.024218797683716, "logits/rejected": -3.446093797683716, "logps/chosen": -413.75, "logps/rejected": -412.70001220703125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -3.653515577316284, "rewards/margins": 13.046875, "rewards/rejected": -16.700000762939453, "step": 4310 }, { "epoch": 3.7928007023705006, "grad_norm": 0.1639430491168361, "learning_rate": 5.201931518876207e-08, "logits/chosen": -2.9820313453674316, "logits/rejected": -3.42578125, "logps/chosen": -431.6499938964844, "logps/rejected": -433.5, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.415234327316284, "rewards/margins": 13.534375190734863, "rewards/rejected": -16.946874618530273, "step": 4320 }, { "epoch": 3.8015803336259877, "grad_norm": 0.09088752958966294, "learning_rate": 4.982440737489025e-08, "logits/chosen": -2.981250047683716, "logits/rejected": -3.617968797683716, "logps/chosen": -454.6000061035156, "logps/rejected": -408.25, "loss": 0.0078, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1500000953674316, "rewards/margins": 11.981249809265137, "rewards/rejected": -15.131250381469727, "step": 4330 }, { "epoch": 3.810359964881475, "grad_norm": 0.0965181394541503, "learning_rate": 4.7629499561018435e-08, "logits/chosen": -2.8359375, "logits/rejected": -3.3187499046325684, "logps/chosen": -478.20001220703125, "logps/rejected": -421.6000061035156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.8701171875, "rewards/margins": 13.481249809265137, "rewards/rejected": -16.34375, "step": 4340 }, { "epoch": 3.8191395961369623, "grad_norm": 0.3430323438027083, "learning_rate": 4.5434591747146615e-08, "logits/chosen": -2.88671875, "logits/rejected": -3.2578125, "logps/chosen": -489.0, "logps/rejected": -440.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.102246046066284, "rewards/margins": 12.925000190734863, "rewards/rejected": -16.037500381469727, "step": 4350 }, { "epoch": 3.8279192273924494, "grad_norm": 0.13442608564589695, "learning_rate": 4.32396839332748e-08, "logits/chosen": -2.7007813453674316, "logits/rejected": -3.272656202316284, "logps/chosen": -453.1000061035156, "logps/rejected": -455.5, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.218493700027466, "rewards/margins": 12.668749809265137, "rewards/rejected": -15.875, "step": 4360 }, { "epoch": 3.836698858647937, "grad_norm": 0.27484934916126125, "learning_rate": 4.104477611940298e-08, "logits/chosen": -2.9585938453674316, "logits/rejected": -3.4828124046325684, "logps/chosen": -482.29998779296875, "logps/rejected": -414.8999938964844, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.061328172683716, "rewards/margins": 12.524999618530273, "rewards/rejected": -15.574999809265137, "step": 4370 }, { "epoch": 3.845478489903424, "grad_norm": 0.8196527442730976, "learning_rate": 3.884986830553116e-08, "logits/chosen": -2.9703125953674316, "logits/rejected": -3.4437499046325684, "logps/chosen": -499.0, "logps/rejected": -430.20001220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -3.223437547683716, "rewards/margins": 12.559374809265137, "rewards/rejected": -15.774999618530273, "step": 4380 }, { "epoch": 3.854258121158911, "grad_norm": 0.46506898711620936, "learning_rate": 3.665496049165935e-08, "logits/chosen": -2.871875047683716, "logits/rejected": -3.2562499046325684, "logps/chosen": -466.0, "logps/rejected": -442.0, "loss": 0.0187, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1322264671325684, "rewards/margins": 12.928125381469727, "rewards/rejected": -16.065624237060547, "step": 4390 }, { "epoch": 3.8630377524143986, "grad_norm": 0.03219121616056209, "learning_rate": 3.446005267778753e-08, "logits/chosen": -2.9515624046325684, "logits/rejected": -3.196093797683716, "logps/chosen": -485.0, "logps/rejected": -483.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.470654249191284, "rewards/margins": 12.703125, "rewards/rejected": -15.178125381469727, "step": 4400 }, { "epoch": 3.8718173836698857, "grad_norm": 0.2560706183120807, "learning_rate": 3.226514486391571e-08, "logits/chosen": -2.7203125953674316, "logits/rejected": -3.237499952316284, "logps/chosen": -490.8999938964844, "logps/rejected": -467.79998779296875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.8060059547424316, "rewards/margins": 12.925000190734863, "rewards/rejected": -15.734375, "step": 4410 }, { "epoch": 3.8805970149253732, "grad_norm": 0.05563468285862625, "learning_rate": 3.00702370500439e-08, "logits/chosen": -2.987499952316284, "logits/rejected": -3.4281249046325684, "logps/chosen": -487.6499938964844, "logps/rejected": -436.0, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.027148485183716, "rewards/margins": 13.168749809265137, "rewards/rejected": -16.200000762939453, "step": 4420 }, { "epoch": 3.8893766461808603, "grad_norm": 0.21685540754942034, "learning_rate": 2.7875329236172078e-08, "logits/chosen": -2.9359374046325684, "logits/rejected": -3.35546875, "logps/chosen": -516.5999755859375, "logps/rejected": -444.29998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.76318359375, "rewards/margins": 12.846875190734863, "rewards/rejected": -15.609375, "step": 4430 }, { "epoch": 3.898156277436348, "grad_norm": 0.17466063974336163, "learning_rate": 2.568042142230026e-08, "logits/chosen": -2.9828124046325684, "logits/rejected": -3.33203125, "logps/chosen": -455.70001220703125, "logps/rejected": -438.79998779296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.195117235183716, "rewards/margins": 12.921875, "rewards/rejected": -16.109375, "step": 4440 }, { "epoch": 3.906935908691835, "grad_norm": 0.0476552217836039, "learning_rate": 2.3485513608428444e-08, "logits/chosen": -2.973437547683716, "logits/rejected": -3.2750000953674316, "logps/chosen": -488.54998779296875, "logps/rejected": -493.3999938964844, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.513476610183716, "rewards/margins": 13.78125, "rewards/rejected": -16.296875, "step": 4450 }, { "epoch": 3.915715539947322, "grad_norm": 0.08905525732564612, "learning_rate": 2.1290605794556627e-08, "logits/chosen": -2.874218702316284, "logits/rejected": -3.3203125, "logps/chosen": -505.29998779296875, "logps/rejected": -462.70001220703125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.136914014816284, "rewards/margins": 12.728124618530273, "rewards/rejected": -15.865625381469727, "step": 4460 }, { "epoch": 3.9244951712028096, "grad_norm": 0.1668928334643476, "learning_rate": 1.909569798068481e-08, "logits/chosen": -2.823437452316284, "logits/rejected": -3.495312452316284, "logps/chosen": -453.20001220703125, "logps/rejected": -405.6000061035156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.267773389816284, "rewards/margins": 13.134374618530273, "rewards/rejected": -16.409374237060547, "step": 4470 }, { "epoch": 3.9332748024582966, "grad_norm": 0.09234843085234354, "learning_rate": 1.690079016681299e-08, "logits/chosen": -2.960156202316284, "logits/rejected": -3.48828125, "logps/chosen": -499.29998779296875, "logps/rejected": -430.79998779296875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.131640672683716, "rewards/margins": 12.934374809265137, "rewards/rejected": -16.071874618530273, "step": 4480 }, { "epoch": 3.942054433713784, "grad_norm": 0.1280532747188893, "learning_rate": 1.4705882352941176e-08, "logits/chosen": -2.805468797683716, "logits/rejected": -3.268749952316284, "logps/chosen": -482.25, "logps/rejected": -416.8999938964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.1329102516174316, "rewards/margins": 12.287500381469727, "rewards/rejected": -15.421875, "step": 4490 }, { "epoch": 3.9508340649692713, "grad_norm": 0.09592585371257221, "learning_rate": 1.2510974539069359e-08, "logits/chosen": -3.022265672683716, "logits/rejected": -3.438281297683716, "logps/chosen": -442.3999938964844, "logps/rejected": -428.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.1246094703674316, "rewards/margins": 12.428125381469727, "rewards/rejected": -15.5625, "step": 4500 }, { "epoch": 3.959613696224759, "grad_norm": 0.3078398776322869, "learning_rate": 1.031606672519754e-08, "logits/chosen": -2.8140625953674316, "logits/rejected": -3.491406202316284, "logps/chosen": -532.5, "logps/rejected": -414.8999938964844, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.138476610183716, "rewards/margins": 12.71875, "rewards/rejected": -15.846875190734863, "step": 4510 }, { "epoch": 3.968393327480246, "grad_norm": 0.3337277488774481, "learning_rate": 8.121158911325724e-09, "logits/chosen": -2.910937547683716, "logits/rejected": -3.335156202316284, "logps/chosen": -491.20001220703125, "logps/rejected": -435.70001220703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.9751954078674316, "rewards/margins": 12.503125190734863, "rewards/rejected": -15.481249809265137, "step": 4520 }, { "epoch": 3.977172958735733, "grad_norm": 0.7026092340484815, "learning_rate": 5.926251097453907e-09, "logits/chosen": -2.8515625, "logits/rejected": -3.3929686546325684, "logps/chosen": -485.8999938964844, "logps/rejected": -450.1000061035156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.555346727371216, "rewards/margins": 13.0, "rewards/rejected": -15.553125381469727, "step": 4530 }, { "epoch": 3.9859525899912205, "grad_norm": 6.171708025221414, "learning_rate": 3.731343283582089e-09, "logits/chosen": -2.8125, "logits/rejected": -3.36328125, "logps/chosen": -459.20001220703125, "logps/rejected": -406.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.0492186546325684, "rewards/margins": 12.556249618530273, "rewards/rejected": -15.618749618530273, "step": 4540 }, { "epoch": 3.9947322212467076, "grad_norm": 1.3860823114375178, "learning_rate": 1.5364354697102721e-09, "logits/chosen": -2.910937547683716, "logits/rejected": -3.3671875, "logps/chosen": -423.8500061035156, "logps/rejected": -408.29998779296875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.051953077316284, "rewards/margins": 12.409375190734863, "rewards/rejected": -15.449999809265137, "step": 4550 } ], "logging_steps": 10, "max_steps": 4556, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }