| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0615711252653928, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 746.1138916015625, | |
| "epoch": 0.0021231422505307855, | |
| "grad_norm": 0.21636255085468292, | |
| "kl": 0.0, | |
| "learning_rate": 3.3333333333333335e-07, | |
| "loss": 0.0, | |
| "reward": 0.11085444036871195, | |
| "reward_std": 0.15387122705578804, | |
| "rewards/code_reward": 0.11063122469931841, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 741.6986846923828, | |
| "epoch": 0.004246284501061571, | |
| "grad_norm": 0.21392129361629486, | |
| "kl": 0.0, | |
| "learning_rate": 6.666666666666667e-07, | |
| "loss": 0.0, | |
| "reward": 0.10614843107759953, | |
| "reward_std": 0.15658440068364143, | |
| "rewards/code_reward": 0.10592522472143173, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 756.122802734375, | |
| "epoch": 0.006369426751592357, | |
| "grad_norm": 0.21344353258609772, | |
| "kl": 6.717443466186523e-05, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.0, | |
| "reward": 0.12699278071522713, | |
| "reward_std": 0.15882046334445477, | |
| "rewards/code_reward": 0.12676957063376904, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 760.8817291259766, | |
| "epoch": 0.008492569002123142, | |
| "grad_norm": 0.2038053572177887, | |
| "kl": 7.62939453125e-05, | |
| "learning_rate": 1.3333333333333334e-06, | |
| "loss": 0.0, | |
| "reward": 0.06782207638025284, | |
| "reward_std": 0.1164214089512825, | |
| "rewards/code_reward": 0.06782207870855927, | |
| "rewards/format_reward": 0.0, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 764.8817291259766, | |
| "epoch": 0.010615711252653927, | |
| "grad_norm": 0.20332112908363342, | |
| "kl": 7.510185241699219e-05, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.0, | |
| "reward": 0.07368321809917688, | |
| "reward_std": 0.11412223428487778, | |
| "rewards/code_reward": 0.07368322089314461, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 733.0937805175781, | |
| "epoch": 0.012738853503184714, | |
| "grad_norm": 0.2384093701839447, | |
| "kl": 8.344650268554688e-05, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0, | |
| "reward": 0.11057165823876858, | |
| "reward_std": 0.13630107790231705, | |
| "rewards/code_reward": 0.11057165637612343, | |
| "rewards/format_reward": 0.0, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 747.8013763427734, | |
| "epoch": 0.014861995753715499, | |
| "grad_norm": 0.21190612018108368, | |
| "kl": 9.250640869140625e-05, | |
| "learning_rate": 2.3333333333333336e-06, | |
| "loss": 0.0, | |
| "reward": 0.13999284896999598, | |
| "reward_std": 0.14958541933447123, | |
| "rewards/code_reward": 0.13999284896999598, | |
| "rewards/format_reward": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 747.6540679931641, | |
| "epoch": 0.016985138004246284, | |
| "grad_norm": 0.1906791776418686, | |
| "kl": 0.00013947486877441406, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 0.0, | |
| "reward": 0.0754421940073371, | |
| "reward_std": 0.10426154918968678, | |
| "rewards/code_reward": 0.07521897740662098, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 710.5401916503906, | |
| "epoch": 0.01910828025477707, | |
| "grad_norm": 0.20240359008312225, | |
| "kl": 0.0002300739288330078, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0, | |
| "reward": 0.12234624661505222, | |
| "reward_std": 0.10050993971526623, | |
| "rewards/code_reward": 0.12234624475240707, | |
| "rewards/format_reward": 0.0, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 760.1808319091797, | |
| "epoch": 0.021231422505307854, | |
| "grad_norm": 0.34670934081077576, | |
| "kl": 0.0004100799560546875, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.0, | |
| "reward": 0.055801121750846505, | |
| "reward_std": 0.06395915220491588, | |
| "rewards/code_reward": 0.05580112128518522, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 750.3906555175781, | |
| "epoch": 0.02335456475583864, | |
| "grad_norm": 0.21112516522407532, | |
| "kl": 0.0007238388061523438, | |
| "learning_rate": 3.6666666666666666e-06, | |
| "loss": 0.0, | |
| "reward": 0.06025231350213289, | |
| "reward_std": 0.09869139082729816, | |
| "rewards/code_reward": 0.06025231350213289, | |
| "rewards/format_reward": 0.0, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 700.1384124755859, | |
| "epoch": 0.025477707006369428, | |
| "grad_norm": 0.22157742083072662, | |
| "kl": 0.00104522705078125, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0, | |
| "reward": 0.1535286195576191, | |
| "reward_std": 0.16596542671322823, | |
| "rewards/code_reward": 0.1530821956694126, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 702.9152069091797, | |
| "epoch": 0.027600849256900213, | |
| "grad_norm": 0.24524690210819244, | |
| "kl": 0.0017261505126953125, | |
| "learning_rate": 4.333333333333334e-06, | |
| "loss": 0.0, | |
| "reward": 0.19020407181233168, | |
| "reward_std": 0.16583579406142235, | |
| "rewards/code_reward": 0.1902040634304285, | |
| "rewards/format_reward": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 714.9486999511719, | |
| "epoch": 0.029723991507430998, | |
| "grad_norm": 0.18179796636104584, | |
| "kl": 0.00283050537109375, | |
| "learning_rate": 4.666666666666667e-06, | |
| "loss": 0.0, | |
| "reward": 0.06596253952011466, | |
| "reward_std": 0.08220406854525208, | |
| "rewards/code_reward": 0.06596253253519535, | |
| "rewards/format_reward": 0.0, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 683.7835083007812, | |
| "epoch": 0.03184713375796178, | |
| "grad_norm": 0.18836897611618042, | |
| "kl": 0.004302978515625, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0, | |
| "reward": 0.10805188585072756, | |
| "reward_std": 0.0908731259405613, | |
| "rewards/code_reward": 0.10782866925001144, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 641.216552734375, | |
| "epoch": 0.03397027600849257, | |
| "grad_norm": 0.22146466374397278, | |
| "kl": 0.00562286376953125, | |
| "learning_rate": 4.999952797253148e-06, | |
| "loss": 0.0001, | |
| "reward": 0.20005132257938385, | |
| "reward_std": 0.16782562248408794, | |
| "rewards/code_reward": 0.19938167929649353, | |
| "rewards/format_reward": 0.006696428870782256, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 644.9598541259766, | |
| "epoch": 0.036093418259023353, | |
| "grad_norm": 0.20937775075435638, | |
| "kl": 0.00730133056640625, | |
| "learning_rate": 4.9998111909931225e-06, | |
| "loss": 0.0001, | |
| "reward": 0.1508529670536518, | |
| "reward_std": 0.1663584616035223, | |
| "rewards/code_reward": 0.15040653757750988, | |
| "rewards/format_reward": 0.004464285913854837, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 608.2477874755859, | |
| "epoch": 0.03821656050955414, | |
| "grad_norm": 0.2286761999130249, | |
| "kl": 0.0104217529296875, | |
| "learning_rate": 4.999575187161439e-06, | |
| "loss": 0.0001, | |
| "reward": 0.14321784488856792, | |
| "reward_std": 0.1725912243127823, | |
| "rewards/code_reward": 0.14321784675121307, | |
| "rewards/format_reward": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 650.0647583007812, | |
| "epoch": 0.040339702760084924, | |
| "grad_norm": 0.21597912907600403, | |
| "kl": 0.0113677978515625, | |
| "learning_rate": 4.9992447956603455e-06, | |
| "loss": 0.0001, | |
| "reward": 0.12854056991636753, | |
| "reward_std": 0.15781505592167377, | |
| "rewards/code_reward": 0.12854057550430298, | |
| "rewards/format_reward": 0.0, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 624.8192138671875, | |
| "epoch": 0.04246284501061571, | |
| "grad_norm": 42.34319305419922, | |
| "kl": 7.167266845703125, | |
| "learning_rate": 4.998820030352409e-06, | |
| "loss": 0.0716, | |
| "reward": 0.12942847050726414, | |
| "reward_std": 0.11835422366857529, | |
| "rewards/code_reward": 0.1292052511125803, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 632.3504638671875, | |
| "epoch": 0.044585987261146494, | |
| "grad_norm": 0.23411324620246887, | |
| "kl": 0.0178985595703125, | |
| "learning_rate": 4.998300909059929e-06, | |
| "loss": 0.0002, | |
| "reward": 0.12214689701795578, | |
| "reward_std": 0.1782115437090397, | |
| "rewards/code_reward": 0.12214690260589123, | |
| "rewards/format_reward": 0.0, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 610.6652069091797, | |
| "epoch": 0.04670912951167728, | |
| "grad_norm": 0.24601581692695618, | |
| "kl": 0.020477294921875, | |
| "learning_rate": 4.997687453564198e-06, | |
| "loss": 0.0002, | |
| "reward": 0.18596480786800385, | |
| "reward_std": 0.184912770986557, | |
| "rewards/code_reward": 0.18529516831040382, | |
| "rewards/format_reward": 0.006696428870782256, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 612.2344055175781, | |
| "epoch": 0.04883227176220807, | |
| "grad_norm": 0.25667688250541687, | |
| "kl": 0.02349853515625, | |
| "learning_rate": 4.9969796896045775e-06, | |
| "loss": 0.0002, | |
| "reward": 0.1700380276888609, | |
| "reward_std": 0.15014583989977837, | |
| "rewards/code_reward": 0.1689219493418932, | |
| "rewards/format_reward": 0.011160714784637094, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 582.0379791259766, | |
| "epoch": 0.050955414012738856, | |
| "grad_norm": 1.4910736083984375, | |
| "kl": 0.054718017578125, | |
| "learning_rate": 4.996177646877426e-06, | |
| "loss": 0.0005, | |
| "reward": 0.15169917233288288, | |
| "reward_std": 0.16788329929113388, | |
| "rewards/code_reward": 0.151029534637928, | |
| "rewards/format_reward": 0.006696428870782256, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 605.2277069091797, | |
| "epoch": 0.05307855626326964, | |
| "grad_norm": 0.23100271821022034, | |
| "kl": 0.030059814453125, | |
| "learning_rate": 4.995281359034851e-06, | |
| "loss": 0.0003, | |
| "reward": 0.10647542215883732, | |
| "reward_std": 0.13741069473326206, | |
| "rewards/code_reward": 0.105805778875947, | |
| "rewards/format_reward": 0.006696428870782256, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 595.9777069091797, | |
| "epoch": 0.055201698513800426, | |
| "grad_norm": 0.22825182974338531, | |
| "kl": 0.03179931640625, | |
| "learning_rate": 4.994290863683296e-06, | |
| "loss": 0.0003, | |
| "reward": 0.11801626486703753, | |
| "reward_std": 0.12430650275200605, | |
| "rewards/code_reward": 0.11779305664822459, | |
| "rewards/format_reward": 0.0022321429569274187, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 594.2031402587891, | |
| "epoch": 0.05732484076433121, | |
| "grad_norm": 0.2523776590824127, | |
| "kl": 0.0357666015625, | |
| "learning_rate": 4.99320620238196e-06, | |
| "loss": 0.0004, | |
| "reward": 0.1666601337492466, | |
| "reward_std": 0.20200489647686481, | |
| "rewards/code_reward": 0.1655440628528595, | |
| "rewards/format_reward": 0.011160714784637094, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 606.7299346923828, | |
| "epoch": 0.059447983014861996, | |
| "grad_norm": 0.24759377539157867, | |
| "kl": 0.03466796875, | |
| "learning_rate": 4.99202742064106e-06, | |
| "loss": 0.0003, | |
| "reward": 0.12888818327337503, | |
| "reward_std": 0.14617390558123589, | |
| "rewards/code_reward": 0.12732568103820086, | |
| "rewards/format_reward": 0.01562500116415322, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 582.7879791259766, | |
| "epoch": 0.06157112526539278, | |
| "grad_norm": 0.22141791880130768, | |
| "kl": 0.0360107421875, | |
| "learning_rate": 4.990754567919917e-06, | |
| "loss": 0.0004, | |
| "reward": 0.1982099711894989, | |
| "reward_std": 0.15798946656286716, | |
| "rewards/code_reward": 0.1970939077436924, | |
| "rewards/format_reward": 0.011160714784637094, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 582.3035888671875, | |
| "epoch": 0.06369426751592357, | |
| "grad_norm": 0.4308633804321289, | |
| "kl": 0.04461669921875, | |
| "learning_rate": 4.989387697624881e-06, | |
| "loss": 0.0004, | |
| "reward": 0.15222312323749065, | |
| "reward_std": 0.13287453912198544, | |
| "rewards/code_reward": 0.14999098517000675, | |
| "rewards/format_reward": 0.022321429336443543, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 560.9888763427734, | |
| "epoch": 0.06581740976645435, | |
| "grad_norm": 0.44854736328125, | |
| "kl": 0.05029296875, | |
| "learning_rate": 4.987926867107095e-06, | |
| "loss": 0.0005, | |
| "reward": 0.17351704463362694, | |
| "reward_std": 0.1594883631914854, | |
| "rewards/code_reward": 0.17039205506443977, | |
| "rewards/format_reward": 0.031250000931322575, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 520.7835006713867, | |
| "epoch": 0.06794055201698514, | |
| "grad_norm": 0.3021621108055115, | |
| "kl": 0.0545654296875, | |
| "learning_rate": 4.986372137660078e-06, | |
| "loss": 0.0005, | |
| "reward": 0.19399502873420715, | |
| "reward_std": 0.18005169555544853, | |
| "rewards/code_reward": 0.1872985940426588, | |
| "rewards/format_reward": 0.0669642873108387, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 551.3750228881836, | |
| "epoch": 0.07006369426751592, | |
| "grad_norm": 0.38339507579803467, | |
| "kl": 0.0712890625, | |
| "learning_rate": 4.984723574517165e-06, | |
| "loss": 0.0007, | |
| "reward": 0.15828289464116096, | |
| "reward_std": 0.18912290409207344, | |
| "rewards/code_reward": 0.1453364696353674, | |
| "rewards/format_reward": 0.12946429289877415, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 490.0558166503906, | |
| "epoch": 0.07218683651804671, | |
| "grad_norm": 0.5539775490760803, | |
| "kl": 0.0887451171875, | |
| "learning_rate": 4.9829812468487655e-06, | |
| "loss": 0.0009, | |
| "reward": 0.18788279965519905, | |
| "reward_std": 0.19856177270412445, | |
| "rewards/code_reward": 0.16578458063304424, | |
| "rewards/format_reward": 0.2209821566939354, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 441.99778747558594, | |
| "epoch": 0.07430997876857749, | |
| "grad_norm": 0.35391107201576233, | |
| "kl": 0.12060546875, | |
| "learning_rate": 4.981145227759457e-06, | |
| "loss": 0.0012, | |
| "reward": 0.20366163551807404, | |
| "reward_std": 0.1490145679563284, | |
| "rewards/code_reward": 0.16392949409782887, | |
| "rewards/format_reward": 0.3973214477300644, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 454.8236846923828, | |
| "epoch": 0.07643312101910828, | |
| "grad_norm": 0.34607765078544617, | |
| "kl": 0.18994140625, | |
| "learning_rate": 4.979215594284924e-06, | |
| "loss": 0.0019, | |
| "reward": 0.16812831349670887, | |
| "reward_std": 0.16833286173641682, | |
| "rewards/code_reward": 0.10094079561531544, | |
| "rewards/format_reward": 0.6718750298023224, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 413.80358123779297, | |
| "epoch": 0.07855626326963906, | |
| "grad_norm": 0.30116426944732666, | |
| "kl": 0.1982421875, | |
| "learning_rate": 4.977192427388722e-06, | |
| "loss": 0.002, | |
| "reward": 0.24648623168468475, | |
| "reward_std": 0.1688873954117298, | |
| "rewards/code_reward": 0.16099514812231064, | |
| "rewards/format_reward": 0.854910746216774, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 412.4464416503906, | |
| "epoch": 0.08067940552016985, | |
| "grad_norm": 0.3934517204761505, | |
| "kl": 0.248046875, | |
| "learning_rate": 4.9750758119588824e-06, | |
| "loss": 0.0025, | |
| "reward": 0.24308543279767036, | |
| "reward_std": 0.14966130815446377, | |
| "rewards/code_reward": 0.1495586484670639, | |
| "rewards/format_reward": 0.9352678954601288, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 424.8928756713867, | |
| "epoch": 0.08280254777070063, | |
| "grad_norm": 0.3077991008758545, | |
| "kl": 0.256103515625, | |
| "learning_rate": 4.972865836804349e-06, | |
| "loss": 0.0026, | |
| "reward": 0.2948240712285042, | |
| "reward_std": 0.17200535349547863, | |
| "rewards/code_reward": 0.19995798915624619, | |
| "rewards/format_reward": 0.9486607611179352, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 445.8326110839844, | |
| "epoch": 0.08492569002123142, | |
| "grad_norm": 0.3074510097503662, | |
| "kl": 0.259765625, | |
| "learning_rate": 4.970562594651254e-06, | |
| "loss": 0.0026, | |
| "reward": 0.2571263238787651, | |
| "reward_std": 0.1593556720763445, | |
| "rewards/code_reward": 0.16226024366915226, | |
| "rewards/format_reward": 0.9486607611179352, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 474.68082427978516, | |
| "epoch": 0.0870488322717622, | |
| "grad_norm": 0.28787654638290405, | |
| "kl": 0.2421875, | |
| "learning_rate": 4.968166182139026e-06, | |
| "loss": 0.0024, | |
| "reward": 0.27686072885990143, | |
| "reward_std": 0.16917606256902218, | |
| "rewards/code_reward": 0.18378033302724361, | |
| "rewards/format_reward": 0.9308036118745804, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 520.4397583007812, | |
| "epoch": 0.08917197452229299, | |
| "grad_norm": 0.2756073772907257, | |
| "kl": 0.22216796875, | |
| "learning_rate": 4.9656766998163306e-06, | |
| "loss": 0.0023, | |
| "reward": 0.29509423673152924, | |
| "reward_std": 0.13862515799701214, | |
| "rewards/code_reward": 0.20402280241250992, | |
| "rewards/format_reward": 0.910714328289032, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 520.6406555175781, | |
| "epoch": 0.09129511677282377, | |
| "grad_norm": 0.2641509473323822, | |
| "kl": 0.1728515625, | |
| "learning_rate": 4.963094252136865e-06, | |
| "loss": 0.0017, | |
| "reward": 0.3755844831466675, | |
| "reward_std": 0.19650832191109657, | |
| "rewards/code_reward": 0.280941616743803, | |
| "rewards/format_reward": 0.9464286118745804, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 520.0803909301758, | |
| "epoch": 0.09341825902335456, | |
| "grad_norm": 0.28843629360198975, | |
| "kl": 0.206298828125, | |
| "learning_rate": 4.960418947454958e-06, | |
| "loss": 0.0021, | |
| "reward": 0.21916456520557404, | |
| "reward_std": 0.12000982835888863, | |
| "rewards/code_reward": 0.12407528422772884, | |
| "rewards/format_reward": 0.95089291036129, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 516.7455749511719, | |
| "epoch": 0.09554140127388536, | |
| "grad_norm": 0.9614177942276001, | |
| "kl": 0.203125, | |
| "learning_rate": 4.957650898021038e-06, | |
| "loss": 0.002, | |
| "reward": 0.26794980466365814, | |
| "reward_std": 0.14450966753065586, | |
| "rewards/code_reward": 0.17397657968103886, | |
| "rewards/format_reward": 0.9397321939468384, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 517.7343978881836, | |
| "epoch": 0.09766454352441614, | |
| "grad_norm": 0.2904169261455536, | |
| "kl": 0.17041015625, | |
| "learning_rate": 4.954790219976915e-06, | |
| "loss": 0.0017, | |
| "reward": 0.3067335784435272, | |
| "reward_std": 0.15805694833397865, | |
| "rewards/code_reward": 0.21186750568449497, | |
| "rewards/format_reward": 0.948660746216774, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 533.3594055175781, | |
| "epoch": 0.09978768577494693, | |
| "grad_norm": 0.25753694772720337, | |
| "kl": 0.126953125, | |
| "learning_rate": 4.95183703335091e-06, | |
| "loss": 0.0013, | |
| "reward": 0.22189904749393463, | |
| "reward_std": 0.13265508972108364, | |
| "rewards/code_reward": 0.12390796467661858, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 548.950927734375, | |
| "epoch": 0.10191082802547771, | |
| "grad_norm": 0.26259344816207886, | |
| "kl": 0.1424560546875, | |
| "learning_rate": 4.948791462052819e-06, | |
| "loss": 0.0014, | |
| "reward": 0.22812815010547638, | |
| "reward_std": 0.1622354220598936, | |
| "rewards/code_reward": 0.12991385161876678, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 571.6741485595703, | |
| "epoch": 0.1040339702760085, | |
| "grad_norm": 0.4155128300189972, | |
| "kl": 0.20263671875, | |
| "learning_rate": 4.945653633868716e-06, | |
| "loss": 0.0021, | |
| "reward": 0.24147583171725273, | |
| "reward_std": 0.1386658363044262, | |
| "rewards/code_reward": 0.1450472492724657, | |
| "rewards/format_reward": 0.964285746216774, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 534.6093978881836, | |
| "epoch": 0.10615711252653928, | |
| "grad_norm": 0.24680602550506592, | |
| "kl": 0.159912109375, | |
| "learning_rate": 4.942423680455584e-06, | |
| "loss": 0.0016, | |
| "reward": 0.2133147530257702, | |
| "reward_std": 0.14480553567409515, | |
| "rewards/code_reward": 0.11643974296748638, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 521.4888610839844, | |
| "epoch": 0.10828025477707007, | |
| "grad_norm": 0.27140846848487854, | |
| "kl": 0.172119140625, | |
| "learning_rate": 4.939101737335802e-06, | |
| "loss": 0.0017, | |
| "reward": 0.3708176761865616, | |
| "reward_std": 0.1698193922638893, | |
| "rewards/code_reward": 0.2730497941374779, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 550.3236846923828, | |
| "epoch": 0.11040339702760085, | |
| "grad_norm": 0.24256259202957153, | |
| "kl": 0.145751953125, | |
| "learning_rate": 4.935687943891447e-06, | |
| "loss": 0.0015, | |
| "reward": 0.30257678776979446, | |
| "reward_std": 0.1430999655276537, | |
| "rewards/code_reward": 0.2057017907500267, | |
| "rewards/format_reward": 0.9687500447034836, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 551.3772583007812, | |
| "epoch": 0.11252653927813164, | |
| "grad_norm": 0.2562994062900543, | |
| "kl": 0.16259765625, | |
| "learning_rate": 4.932182443358458e-06, | |
| "loss": 0.0016, | |
| "reward": 0.314239501953125, | |
| "reward_std": 0.21334025636315346, | |
| "rewards/code_reward": 0.21624841168522835, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 553.4955596923828, | |
| "epoch": 0.11464968152866242, | |
| "grad_norm": 0.23835241794586182, | |
| "kl": 0.160888671875, | |
| "learning_rate": 4.928585382820616e-06, | |
| "loss": 0.0016, | |
| "reward": 0.25176869705319405, | |
| "reward_std": 0.11105065606534481, | |
| "rewards/code_reward": 0.1535544078797102, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 552.2366333007812, | |
| "epoch": 0.11677282377919321, | |
| "grad_norm": 0.2630121409893036, | |
| "kl": 0.1552734375, | |
| "learning_rate": 4.924896913203376e-06, | |
| "loss": 0.0016, | |
| "reward": 0.24022378027439117, | |
| "reward_std": 0.15625984594225883, | |
| "rewards/code_reward": 0.14133985061198473, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 574.1295013427734, | |
| "epoch": 0.11889596602972399, | |
| "grad_norm": 0.3262800872325897, | |
| "kl": 0.1572265625, | |
| "learning_rate": 4.921117189267535e-06, | |
| "loss": 0.0016, | |
| "reward": 0.32679247856140137, | |
| "reward_std": 0.19292927533388138, | |
| "rewards/code_reward": 0.22991745918989182, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 541.9576110839844, | |
| "epoch": 0.12101910828025478, | |
| "grad_norm": 0.2467201203107834, | |
| "kl": 0.17578125, | |
| "learning_rate": 4.917246369602742e-06, | |
| "loss": 0.0018, | |
| "reward": 0.25976729951798916, | |
| "reward_std": 0.1260015396401286, | |
| "rewards/code_reward": 0.16110657062381506, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 553.419677734375, | |
| "epoch": 0.12314225053078556, | |
| "grad_norm": 0.2763681709766388, | |
| "kl": 0.15478515625, | |
| "learning_rate": 4.9132846166208355e-06, | |
| "loss": 0.0016, | |
| "reward": 0.2834607996046543, | |
| "reward_std": 0.1603868044912815, | |
| "rewards/code_reward": 0.1852465160191059, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 542.2343826293945, | |
| "epoch": 0.12526539278131635, | |
| "grad_norm": 1.2009683847427368, | |
| "kl": 0.203125, | |
| "learning_rate": 4.9092320965490365e-06, | |
| "loss": 0.002, | |
| "reward": 0.36397186666727066, | |
| "reward_std": 0.20367462560534477, | |
| "rewards/code_reward": 0.26531114615499973, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 502.7076110839844, | |
| "epoch": 0.12738853503184713, | |
| "grad_norm": 0.291824609041214, | |
| "kl": 0.1533203125, | |
| "learning_rate": 4.905088979422971e-06, | |
| "loss": 0.0015, | |
| "reward": 0.33501066267490387, | |
| "reward_std": 0.17072956077754498, | |
| "rewards/code_reward": 0.23701957240700722, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 512.5134201049805, | |
| "epoch": 0.12951167728237792, | |
| "grad_norm": 0.2763117849826813, | |
| "kl": 0.1837158203125, | |
| "learning_rate": 4.900855439079536e-06, | |
| "loss": 0.0019, | |
| "reward": 0.3404688164591789, | |
| "reward_std": 0.19662801921367645, | |
| "rewards/code_reward": 0.2453795075416565, | |
| "rewards/format_reward": 0.9508928954601288, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 526.8995819091797, | |
| "epoch": 0.1316348195329087, | |
| "grad_norm": 0.2876502275466919, | |
| "kl": 0.19580078125, | |
| "learning_rate": 4.8965316531496055e-06, | |
| "loss": 0.002, | |
| "reward": 0.2866082601249218, | |
| "reward_std": 0.16614584252238274, | |
| "rewards/code_reward": 0.19129573553800583, | |
| "rewards/format_reward": 0.9531250447034836, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 593.2924499511719, | |
| "epoch": 0.1337579617834395, | |
| "grad_norm": 2.4047532081604004, | |
| "kl": 0.41357421875, | |
| "learning_rate": 4.892117803050578e-06, | |
| "loss": 0.0041, | |
| "reward": 0.2631051279604435, | |
| "reward_std": 0.2128530964255333, | |
| "rewards/code_reward": 0.17359617352485657, | |
| "rewards/format_reward": 0.895089328289032, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 563.3437805175781, | |
| "epoch": 0.13588110403397027, | |
| "grad_norm": 0.2846791446208954, | |
| "kl": 0.197509765625, | |
| "learning_rate": 4.887614073978761e-06, | |
| "loss": 0.002, | |
| "reward": 0.2669316381216049, | |
| "reward_std": 0.14747418276965618, | |
| "rewards/code_reward": 0.17630662396550179, | |
| "rewards/format_reward": 0.9062500447034836, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 535.6183319091797, | |
| "epoch": 0.13800424628450106, | |
| "grad_norm": 0.2759235203266144, | |
| "kl": 0.186767578125, | |
| "learning_rate": 4.883020654901609e-06, | |
| "loss": 0.0019, | |
| "reward": 0.28016526997089386, | |
| "reward_std": 0.17947101965546608, | |
| "rewards/code_reward": 0.18730811774730682, | |
| "rewards/format_reward": 0.928571492433548, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 616.9732360839844, | |
| "epoch": 0.14012738853503184, | |
| "grad_norm": 0.26271936297416687, | |
| "kl": 0.23974609375, | |
| "learning_rate": 4.878337738549785e-06, | |
| "loss": 0.0024, | |
| "reward": 0.23576084896922112, | |
| "reward_std": 0.18645637948065996, | |
| "rewards/code_reward": 0.1466983389109373, | |
| "rewards/format_reward": 0.8906250447034836, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 591.013427734375, | |
| "epoch": 0.14225053078556263, | |
| "grad_norm": 0.2741730511188507, | |
| "kl": 0.220947265625, | |
| "learning_rate": 4.873565521409082e-06, | |
| "loss": 0.0023, | |
| "reward": 0.2887257859110832, | |
| "reward_std": 0.15980570390820503, | |
| "rewards/code_reward": 0.20055612176656723, | |
| "rewards/format_reward": 0.8816964775323868, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 567.3995819091797, | |
| "epoch": 0.14437367303609341, | |
| "grad_norm": 0.30026066303253174, | |
| "kl": 0.196533203125, | |
| "learning_rate": 4.868704203712173e-06, | |
| "loss": 0.002, | |
| "reward": 0.2695513255894184, | |
| "reward_std": 0.13891723938286304, | |
| "rewards/code_reward": 0.18361380137503147, | |
| "rewards/format_reward": 0.8593750447034836, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 557.5424346923828, | |
| "epoch": 0.1464968152866242, | |
| "grad_norm": 0.26508811116218567, | |
| "kl": 0.2275390625, | |
| "learning_rate": 4.86375398943021e-06, | |
| "loss": 0.0023, | |
| "reward": 0.2681450620293617, | |
| "reward_std": 0.15566366165876389, | |
| "rewards/code_reward": 0.17618075758218765, | |
| "rewards/format_reward": 0.91964291036129, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 567.8192291259766, | |
| "epoch": 0.14861995753715498, | |
| "grad_norm": 0.2842702567577362, | |
| "kl": 0.206787109375, | |
| "learning_rate": 4.858715086264274e-06, | |
| "loss": 0.0021, | |
| "reward": 0.3179836943745613, | |
| "reward_std": 0.17460669204592705, | |
| "rewards/code_reward": 0.2246801033616066, | |
| "rewards/format_reward": 0.9330357611179352, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 537.6094055175781, | |
| "epoch": 0.15074309978768577, | |
| "grad_norm": 0.2676061689853668, | |
| "kl": 0.208740234375, | |
| "learning_rate": 4.853587705636646e-06, | |
| "loss": 0.0021, | |
| "reward": 0.29776863381266594, | |
| "reward_std": 0.16130083054304123, | |
| "rewards/code_reward": 0.2037954218685627, | |
| "rewards/format_reward": 0.9397321790456772, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 575.2143096923828, | |
| "epoch": 0.15286624203821655, | |
| "grad_norm": 0.2367551475763321, | |
| "kl": 0.189208984375, | |
| "learning_rate": 4.84837206268195e-06, | |
| "loss": 0.0019, | |
| "reward": 0.23160668835043907, | |
| "reward_std": 0.13609608635306358, | |
| "rewards/code_reward": 0.13785668183118105, | |
| "rewards/format_reward": 0.9375000447034836, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 615.6518096923828, | |
| "epoch": 0.15498938428874734, | |
| "grad_norm": 0.2558582127094269, | |
| "kl": 0.199462890625, | |
| "learning_rate": 4.8430683762381195e-06, | |
| "loss": 0.002, | |
| "reward": 0.3226686045527458, | |
| "reward_std": 0.18408508598804474, | |
| "rewards/code_reward": 0.22847215831279755, | |
| "rewards/format_reward": 0.9419643133878708, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 581.3549499511719, | |
| "epoch": 0.15711252653927812, | |
| "grad_norm": 0.25004705786705017, | |
| "kl": 0.2255859375, | |
| "learning_rate": 4.837676868837213e-06, | |
| "loss": 0.0023, | |
| "reward": 0.3521072790026665, | |
| "reward_std": 0.18179307878017426, | |
| "rewards/code_reward": 0.2556787021458149, | |
| "rewards/format_reward": 0.9642857611179352, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 615.6384124755859, | |
| "epoch": 0.1592356687898089, | |
| "grad_norm": 0.2326764315366745, | |
| "kl": 0.184326171875, | |
| "learning_rate": 4.832197766696085e-06, | |
| "loss": 0.002, | |
| "reward": 0.3262624219059944, | |
| "reward_std": 0.13872519508004189, | |
| "rewards/code_reward": 0.22827134653925896, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 626.1919708251953, | |
| "epoch": 0.1613588110403397, | |
| "grad_norm": 0.22483916580677032, | |
| "kl": 0.2158203125, | |
| "learning_rate": 4.826631299706887e-06, | |
| "loss": 0.0022, | |
| "reward": 0.24266962707042694, | |
| "reward_std": 0.15623858594335616, | |
| "rewards/code_reward": 0.14579462260007858, | |
| "rewards/format_reward": 0.9687500447034836, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 638.6696624755859, | |
| "epoch": 0.16348195329087048, | |
| "grad_norm": 0.2549837827682495, | |
| "kl": 0.2041015625, | |
| "learning_rate": 4.820977701427424e-06, | |
| "loss": 0.002, | |
| "reward": 0.3548019379377365, | |
| "reward_std": 0.19029108062386513, | |
| "rewards/code_reward": 0.2577037066221237, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 645.8727874755859, | |
| "epoch": 0.16560509554140126, | |
| "grad_norm": 0.2154364138841629, | |
| "kl": 0.21875, | |
| "learning_rate": 4.81523720907136e-06, | |
| "loss": 0.0022, | |
| "reward": 0.23285862803459167, | |
| "reward_std": 0.129691231995821, | |
| "rewards/code_reward": 0.13531397026963532, | |
| "rewards/format_reward": 0.975446492433548, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 631.9732513427734, | |
| "epoch": 0.16772823779193205, | |
| "grad_norm": 0.22862014174461365, | |
| "kl": 0.21435546875, | |
| "learning_rate": 4.809410063498254e-06, | |
| "loss": 0.0022, | |
| "reward": 0.33913441002368927, | |
| "reward_std": 0.1570077408105135, | |
| "rewards/code_reward": 0.2411433346569538, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 650.060302734375, | |
| "epoch": 0.16985138004246284, | |
| "grad_norm": 0.2403189241886139, | |
| "kl": 0.218994140625, | |
| "learning_rate": 4.8034965092034656e-06, | |
| "loss": 0.0022, | |
| "reward": 0.2641909271478653, | |
| "reward_std": 0.15404854156076908, | |
| "rewards/code_reward": 0.16664628125727177, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 665.2299346923828, | |
| "epoch": 0.17197452229299362, | |
| "grad_norm": 0.22183021903038025, | |
| "kl": 0.17236328125, | |
| "learning_rate": 4.797496794307889e-06, | |
| "loss": 0.0017, | |
| "reward": 0.26636216044425964, | |
| "reward_std": 0.159404331818223, | |
| "rewards/code_reward": 0.16814786568284035, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 646.7277069091797, | |
| "epoch": 0.1740976645435244, | |
| "grad_norm": 0.22360101342201233, | |
| "kl": 0.1865234375, | |
| "learning_rate": 4.791411170547545e-06, | |
| "loss": 0.0019, | |
| "reward": 0.2806714288890362, | |
| "reward_std": 0.1345765646547079, | |
| "rewards/code_reward": 0.1829035673290491, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 649.9285888671875, | |
| "epoch": 0.1762208067940552, | |
| "grad_norm": 0.2544306218624115, | |
| "kl": 0.173583984375, | |
| "learning_rate": 4.785239893263017e-06, | |
| "loss": 0.0017, | |
| "reward": 0.26558100432157516, | |
| "reward_std": 0.13677635975182056, | |
| "rewards/code_reward": 0.16825956851243973, | |
| "rewards/format_reward": 0.9732143431901932, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 686.3661041259766, | |
| "epoch": 0.17834394904458598, | |
| "grad_norm": 0.21967419981956482, | |
| "kl": 0.16162109375, | |
| "learning_rate": 4.778983221388742e-06, | |
| "loss": 0.0016, | |
| "reward": 0.24129238724708557, | |
| "reward_std": 0.1258857063949108, | |
| "rewards/code_reward": 0.14330130256712437, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 634.2634124755859, | |
| "epoch": 0.18046709129511676, | |
| "grad_norm": 0.254304975271225, | |
| "kl": 0.17724609375, | |
| "learning_rate": 4.77264141744214e-06, | |
| "loss": 0.0018, | |
| "reward": 0.3212145194411278, | |
| "reward_std": 0.1864020749926567, | |
| "rewards/code_reward": 0.2236698605120182, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 641.8393096923828, | |
| "epoch": 0.18259023354564755, | |
| "grad_norm": 0.24272438883781433, | |
| "kl": 0.19873046875, | |
| "learning_rate": 4.766214747512603e-06, | |
| "loss": 0.002, | |
| "reward": 0.31076986342668533, | |
| "reward_std": 0.18200884014368057, | |
| "rewards/code_reward": 0.2134484425187111, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 643.4263763427734, | |
| "epoch": 0.18471337579617833, | |
| "grad_norm": 0.2264644354581833, | |
| "kl": 0.185791015625, | |
| "learning_rate": 4.759703481250331e-06, | |
| "loss": 0.0019, | |
| "reward": 0.3214620351791382, | |
| "reward_std": 0.14903312921524048, | |
| "rewards/code_reward": 0.22347095608711243, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 649.5580749511719, | |
| "epoch": 0.18683651804670912, | |
| "grad_norm": 0.22847051918506622, | |
| "kl": 0.169677734375, | |
| "learning_rate": 4.753107891855015e-06, | |
| "loss": 0.0018, | |
| "reward": 0.25390685349702835, | |
| "reward_std": 0.12118050269782543, | |
| "rewards/code_reward": 0.15680862963199615, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 652.5513610839844, | |
| "epoch": 0.18895966029723993, | |
| "grad_norm": 0.22527199983596802, | |
| "kl": 0.19580078125, | |
| "learning_rate": 4.746428256064375e-06, | |
| "loss": 0.002, | |
| "reward": 0.303693201392889, | |
| "reward_std": 0.1710791066288948, | |
| "rewards/code_reward": 0.20525570400059223, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 693.4174499511719, | |
| "epoch": 0.1910828025477707, | |
| "grad_norm": 0.2056378573179245, | |
| "kl": 0.17041015625, | |
| "learning_rate": 4.7396648541425534e-06, | |
| "loss": 0.0017, | |
| "reward": 0.2523197568953037, | |
| "reward_std": 0.1238141655921936, | |
| "rewards/code_reward": 0.15432866849005222, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 679.5580596923828, | |
| "epoch": 0.1932059447983015, | |
| "grad_norm": 0.225660502910614, | |
| "kl": 0.175537109375, | |
| "learning_rate": 4.732817969868348e-06, | |
| "loss": 0.0018, | |
| "reward": 0.25567496195435524, | |
| "reward_std": 0.16754142567515373, | |
| "rewards/code_reward": 0.15813031047582626, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 627.4464569091797, | |
| "epoch": 0.19532908704883228, | |
| "grad_norm": 0.24276329576969147, | |
| "kl": 0.1419677734375, | |
| "learning_rate": 4.7258878905233095e-06, | |
| "loss": 0.0014, | |
| "reward": 0.3579171895980835, | |
| "reward_std": 0.21506508812308311, | |
| "rewards/code_reward": 0.25992610678076744, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 664.091552734375, | |
| "epoch": 0.19745222929936307, | |
| "grad_norm": 0.2459368258714676, | |
| "kl": 0.17431640625, | |
| "learning_rate": 4.718874906879688e-06, | |
| "loss": 0.0017, | |
| "reward": 0.25773513317108154, | |
| "reward_std": 0.16034462675452232, | |
| "rewards/code_reward": 0.16130654886364937, | |
| "rewards/format_reward": 0.964285746216774, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 641.9576110839844, | |
| "epoch": 0.19957537154989385, | |
| "grad_norm": 0.20433549582958221, | |
| "kl": 0.135986328125, | |
| "learning_rate": 4.711779313188231e-06, | |
| "loss": 0.0014, | |
| "reward": 0.31772880256175995, | |
| "reward_std": 0.12460769526660442, | |
| "rewards/code_reward": 0.218844847753644, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 668.7522430419922, | |
| "epoch": 0.20169851380042464, | |
| "grad_norm": 0.228180393576622, | |
| "kl": 0.1337890625, | |
| "learning_rate": 4.70460140716584e-06, | |
| "loss": 0.0014, | |
| "reward": 0.23017888888716698, | |
| "reward_std": 0.16307671833783388, | |
| "rewards/code_reward": 0.13196459133177996, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 666.0960083007812, | |
| "epoch": 0.20382165605095542, | |
| "grad_norm": 0.23849396407604218, | |
| "kl": 0.132568359375, | |
| "learning_rate": 4.697341489983076e-06, | |
| "loss": 0.0013, | |
| "reward": 0.38449443876743317, | |
| "reward_std": 0.205027487128973, | |
| "rewards/code_reward": 0.2869497686624527, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 638.5513763427734, | |
| "epoch": 0.2059447983014862, | |
| "grad_norm": 0.23833003640174866, | |
| "kl": 0.1219482421875, | |
| "learning_rate": 4.6899998662515215e-06, | |
| "loss": 0.0012, | |
| "reward": 0.30101777240633965, | |
| "reward_std": 0.18449735268950462, | |
| "rewards/code_reward": 0.20235705375671387, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 638.1161041259766, | |
| "epoch": 0.208067940552017, | |
| "grad_norm": 0.21731068193912506, | |
| "kl": 0.146484375, | |
| "learning_rate": 4.682576844011007e-06, | |
| "loss": 0.0015, | |
| "reward": 0.2744937762618065, | |
| "reward_std": 0.16123195737600327, | |
| "rewards/code_reward": 0.17650271020829678, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 616.8861846923828, | |
| "epoch": 0.21019108280254778, | |
| "grad_norm": 0.25706782937049866, | |
| "kl": 0.134033203125, | |
| "learning_rate": 4.675072734716678e-06, | |
| "loss": 0.0013, | |
| "reward": 0.27044272795319557, | |
| "reward_std": 0.17698625475168228, | |
| "rewards/code_reward": 0.17245164141058922, | |
| "rewards/format_reward": 0.9799107313156128, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 617.6875305175781, | |
| "epoch": 0.21231422505307856, | |
| "grad_norm": 0.23481673002243042, | |
| "kl": 0.123291015625, | |
| "learning_rate": 4.667487853225931e-06, | |
| "loss": 0.0013, | |
| "reward": 0.27581261470913887, | |
| "reward_std": 0.1309206485748291, | |
| "rewards/code_reward": 0.17692868784070015, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 649.1205749511719, | |
| "epoch": 0.21443736730360935, | |
| "grad_norm": 0.22034895420074463, | |
| "kl": 0.1197509765625, | |
| "learning_rate": 4.659822517785203e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3144006244838238, | |
| "reward_std": 0.1468491405248642, | |
| "rewards/code_reward": 0.21529346704483032, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 637.7120819091797, | |
| "epoch": 0.21656050955414013, | |
| "grad_norm": 0.23196153342723846, | |
| "kl": 0.1163330078125, | |
| "learning_rate": 4.6520770500166165e-06, | |
| "loss": 0.0012, | |
| "reward": 0.2747727185487747, | |
| "reward_std": 0.15404804423451424, | |
| "rewards/code_reward": 0.17678163386881351, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 640.4375305175781, | |
| "epoch": 0.21868365180467092, | |
| "grad_norm": 0.21843470633029938, | |
| "kl": 0.111083984375, | |
| "learning_rate": 4.644251774904487e-06, | |
| "loss": 0.0012, | |
| "reward": 0.2366674654185772, | |
| "reward_std": 0.12331773899495602, | |
| "rewards/code_reward": 0.13889960199594498, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 635.5982513427734, | |
| "epoch": 0.2208067940552017, | |
| "grad_norm": 0.2491585612297058, | |
| "kl": 0.1253662109375, | |
| "learning_rate": 4.636347020781684e-06, | |
| "loss": 0.0013, | |
| "reward": 0.26541591063141823, | |
| "reward_std": 0.20751060917973518, | |
| "rewards/code_reward": 0.16876413114368916, | |
| "rewards/format_reward": 0.9665178954601288, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 636.6919860839844, | |
| "epoch": 0.2229299363057325, | |
| "grad_norm": 0.22667376697063446, | |
| "kl": 0.1239013671875, | |
| "learning_rate": 4.6283631193158605e-06, | |
| "loss": 0.0013, | |
| "reward": 0.28487036004662514, | |
| "reward_std": 0.16408125311136246, | |
| "rewards/code_reward": 0.18732571229338646, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 645.0491333007812, | |
| "epoch": 0.22505307855626328, | |
| "grad_norm": 0.2283681333065033, | |
| "kl": 0.124267578125, | |
| "learning_rate": 4.620300405495532e-06, | |
| "loss": 0.0013, | |
| "reward": 0.2775597535073757, | |
| "reward_std": 0.15426970086991787, | |
| "rewards/code_reward": 0.17867580242455006, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 633.8794860839844, | |
| "epoch": 0.22717622080679406, | |
| "grad_norm": 0.24237921833992004, | |
| "kl": 0.1158447265625, | |
| "learning_rate": 4.612159217616022e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3130115121603012, | |
| "reward_std": 0.20111830905079842, | |
| "rewards/code_reward": 0.21502043306827545, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 607.8995666503906, | |
| "epoch": 0.22929936305732485, | |
| "grad_norm": 0.22627882659435272, | |
| "kl": 0.1114501953125, | |
| "learning_rate": 4.603939897265268e-06, | |
| "loss": 0.0011, | |
| "reward": 0.2647922486066818, | |
| "reward_std": 0.13070931658148766, | |
| "rewards/code_reward": 0.16546186804771423, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 611.9308166503906, | |
| "epoch": 0.23142250530785563, | |
| "grad_norm": 0.24682241678237915, | |
| "kl": 0.11474609375, | |
| "learning_rate": 4.595642789309492e-06, | |
| "loss": 0.0012, | |
| "reward": 0.24479227885603905, | |
| "reward_std": 0.14851071499288082, | |
| "rewards/code_reward": 0.14657797291874886, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 601.9018096923828, | |
| "epoch": 0.23354564755838642, | |
| "grad_norm": 0.22991596162319183, | |
| "kl": 0.1337890625, | |
| "learning_rate": 4.587268241878724e-06, | |
| "loss": 0.0014, | |
| "reward": 0.3472997844219208, | |
| "reward_std": 0.20232820883393288, | |
| "rewards/code_reward": 0.24953191354870796, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 619.3036041259766, | |
| "epoch": 0.2356687898089172, | |
| "grad_norm": 0.23583151400089264, | |
| "kl": 0.142822265625, | |
| "learning_rate": 4.578816606352205e-06, | |
| "loss": 0.0014, | |
| "reward": 0.29563019424676895, | |
| "reward_std": 0.17909668013453484, | |
| "rewards/code_reward": 0.1987551935017109, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 576.4419860839844, | |
| "epoch": 0.23779193205944799, | |
| "grad_norm": 0.2594500780105591, | |
| "kl": 0.11865234375, | |
| "learning_rate": 4.570288237343632e-06, | |
| "loss": 0.0012, | |
| "reward": 0.37235086783766747, | |
| "reward_std": 0.21180756203830242, | |
| "rewards/code_reward": 0.27346691489219666, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 600.5937805175781, | |
| "epoch": 0.23991507430997877, | |
| "grad_norm": 0.24643369019031525, | |
| "kl": 0.1270751953125, | |
| "learning_rate": 4.561683492686289e-06, | |
| "loss": 0.0013, | |
| "reward": 0.31715739518404007, | |
| "reward_std": 0.18861495703458786, | |
| "rewards/code_reward": 0.21871986612677574, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 587.732177734375, | |
| "epoch": 0.24203821656050956, | |
| "grad_norm": 0.23611418902873993, | |
| "kl": 0.1268310546875, | |
| "learning_rate": 4.5530027334180285e-06, | |
| "loss": 0.0013, | |
| "reward": 0.26467062532901764, | |
| "reward_std": 0.17901071533560753, | |
| "rewards/code_reward": 0.16712596639990807, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 599.2254791259766, | |
| "epoch": 0.24416135881104034, | |
| "grad_norm": 0.24544627964496613, | |
| "kl": 0.1339111328125, | |
| "learning_rate": 4.544246323766122e-06, | |
| "loss": 0.0014, | |
| "reward": 0.27841826155781746, | |
| "reward_std": 0.16098117642104626, | |
| "rewards/code_reward": 0.18132001720368862, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 570.107177734375, | |
| "epoch": 0.24628450106157113, | |
| "grad_norm": 0.25169771909713745, | |
| "kl": 0.130859375, | |
| "learning_rate": 4.535414631131983e-06, | |
| "loss": 0.0013, | |
| "reward": 0.34019989520311356, | |
| "reward_std": 0.235354982316494, | |
| "rewards/code_reward": 0.2422088049352169, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 586.2834930419922, | |
| "epoch": 0.2484076433121019, | |
| "grad_norm": 0.2503570318222046, | |
| "kl": 0.1285400390625, | |
| "learning_rate": 4.526508026075746e-06, | |
| "loss": 0.0013, | |
| "reward": 0.33243585377931595, | |
| "reward_std": 0.15851835533976555, | |
| "rewards/code_reward": 0.23310547694563866, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 619.6964569091797, | |
| "epoch": 0.2505307855626327, | |
| "grad_norm": 0.2021104097366333, | |
| "kl": 0.1275634765625, | |
| "learning_rate": 4.517526882300721e-06, | |
| "loss": 0.0013, | |
| "reward": 0.1987566240131855, | |
| "reward_std": 0.12675911094993353, | |
| "rewards/code_reward": 0.10143518354743719, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 577.7500305175781, | |
| "epoch": 0.2526539278131635, | |
| "grad_norm": 0.23152245581150055, | |
| "kl": 0.139404296875, | |
| "learning_rate": 4.508471576637713e-06, | |
| "loss": 0.0014, | |
| "reward": 0.24329132214188576, | |
| "reward_std": 0.16570740193128586, | |
| "rewards/code_reward": 0.14485381357371807, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 599.3102874755859, | |
| "epoch": 0.25477707006369427, | |
| "grad_norm": 0.23706002533435822, | |
| "kl": 0.1292724609375, | |
| "learning_rate": 4.499342489029211e-06, | |
| "loss": 0.0013, | |
| "reward": 0.24242350459098816, | |
| "reward_std": 0.14784781634807587, | |
| "rewards/code_reward": 0.14398599043488503, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 572.8839569091797, | |
| "epoch": 0.25690021231422505, | |
| "grad_norm": 0.2494058609008789, | |
| "kl": 0.1270751953125, | |
| "learning_rate": 4.490140002513449e-06, | |
| "loss": 0.0013, | |
| "reward": 0.26072419434785843, | |
| "reward_std": 0.12450610846281052, | |
| "rewards/code_reward": 0.16117061115801334, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 601.8080749511719, | |
| "epoch": 0.25902335456475584, | |
| "grad_norm": 0.23028254508972168, | |
| "kl": 0.1180419921875, | |
| "learning_rate": 4.48086450320833e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3514738455414772, | |
| "reward_std": 0.16258227452635765, | |
| "rewards/code_reward": 0.2525899298489094, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 590.4040374755859, | |
| "epoch": 0.2611464968152866, | |
| "grad_norm": 0.24208419024944305, | |
| "kl": 0.1234130859375, | |
| "learning_rate": 4.4715163802952266e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3460327610373497, | |
| "reward_std": 0.1636445987969637, | |
| "rewards/code_reward": 0.24647919461131096, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 609.7098388671875, | |
| "epoch": 0.2632696390658174, | |
| "grad_norm": 0.253397136926651, | |
| "kl": 0.135009765625, | |
| "learning_rate": 4.462096026002655e-06, | |
| "loss": 0.0014, | |
| "reward": 0.25145725160837173, | |
| "reward_std": 0.16506105288863182, | |
| "rewards/code_reward": 0.1530197374522686, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 603.8973388671875, | |
| "epoch": 0.2653927813163482, | |
| "grad_norm": 0.2500688135623932, | |
| "kl": 0.1434326171875, | |
| "learning_rate": 4.4526038355898144e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3970717117190361, | |
| "reward_std": 0.2130543477833271, | |
| "rewards/code_reward": 0.29908062517642975, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 615.4620819091797, | |
| "epoch": 0.267515923566879, | |
| "grad_norm": 0.20834827423095703, | |
| "kl": 0.1336669921875, | |
| "learning_rate": 4.4430402073300035e-06, | |
| "loss": 0.0014, | |
| "reward": 0.26642825454473495, | |
| "reward_std": 0.1255171401426196, | |
| "rewards/code_reward": 0.16799074038863182, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 617.9687805175781, | |
| "epoch": 0.26963906581740976, | |
| "grad_norm": 0.23262540996074677, | |
| "kl": 0.1351318359375, | |
| "learning_rate": 4.433405542493909e-06, | |
| "loss": 0.0014, | |
| "reward": 0.2870429456233978, | |
| "reward_std": 0.19062896817922592, | |
| "rewards/code_reward": 0.18838223442435265, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 662.2053833007812, | |
| "epoch": 0.27176220806794055, | |
| "grad_norm": 0.22548751533031464, | |
| "kl": 0.1197509765625, | |
| "learning_rate": 4.4237002453327734e-06, | |
| "loss": 0.0013, | |
| "reward": 0.30225350335240364, | |
| "reward_std": 0.1395848747342825, | |
| "rewards/code_reward": 0.203146331012249, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 632.0022430419922, | |
| "epoch": 0.27388535031847133, | |
| "grad_norm": 0.24719858169555664, | |
| "kl": 0.131103515625, | |
| "learning_rate": 4.4139247230614245e-06, | |
| "loss": 0.0013, | |
| "reward": 0.32878731191158295, | |
| "reward_std": 0.16303380951285362, | |
| "rewards/code_reward": 0.22968016006052494, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 636.7299346923828, | |
| "epoch": 0.2760084925690021, | |
| "grad_norm": 0.22243919968605042, | |
| "kl": 0.1234130859375, | |
| "learning_rate": 4.404079385841201e-06, | |
| "loss": 0.0013, | |
| "reward": 0.30703118816018105, | |
| "reward_std": 0.12942655384540558, | |
| "rewards/code_reward": 0.20792402233928442, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 644.5468902587891, | |
| "epoch": 0.2781316348195329, | |
| "grad_norm": 0.220564067363739, | |
| "kl": 0.123291015625, | |
| "learning_rate": 4.394164646762734e-06, | |
| "loss": 0.0013, | |
| "reward": 0.296065516769886, | |
| "reward_std": 0.18630750849843025, | |
| "rewards/code_reward": 0.19673514552414417, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 666.997802734375, | |
| "epoch": 0.2802547770700637, | |
| "grad_norm": 0.22151753306388855, | |
| "kl": 0.1304931640625, | |
| "learning_rate": 4.384180921828618e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3110230341553688, | |
| "reward_std": 0.1834610104560852, | |
| "rewards/code_reward": 0.21370159462094307, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 665.1205749511719, | |
| "epoch": 0.2823779193205945, | |
| "grad_norm": 0.21862035989761353, | |
| "kl": 0.1168212890625, | |
| "learning_rate": 4.374128629935955e-06, | |
| "loss": 0.0012, | |
| "reward": 0.2876487486064434, | |
| "reward_std": 0.21351643651723862, | |
| "rewards/code_reward": 0.18965766951441765, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 703.575927734375, | |
| "epoch": 0.28450106157112526, | |
| "grad_norm": 0.23025038838386536, | |
| "kl": 0.1204833984375, | |
| "learning_rate": 4.364008192858781e-06, | |
| "loss": 0.0013, | |
| "reward": 0.37238020449876785, | |
| "reward_std": 0.17016195878386497, | |
| "rewards/code_reward": 0.2737194746732712, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 726.8817138671875, | |
| "epoch": 0.28662420382165604, | |
| "grad_norm": 0.21229737997055054, | |
| "kl": 0.121337890625, | |
| "learning_rate": 4.353820035230366e-06, | |
| "loss": 0.0012, | |
| "reward": 0.20391739904880524, | |
| "reward_std": 0.13868718035519123, | |
| "rewards/code_reward": 0.10525666922330856, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 712.2098541259766, | |
| "epoch": 0.28874734607218683, | |
| "grad_norm": 0.2108394205570221, | |
| "kl": 0.1138916015625, | |
| "learning_rate": 4.3435645845254e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3125154785811901, | |
| "reward_std": 0.1781605463474989, | |
| "rewards/code_reward": 0.2134083015844226, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 718.5893249511719, | |
| "epoch": 0.2908704883227176, | |
| "grad_norm": 0.2107391357421875, | |
| "kl": 0.1195068359375, | |
| "learning_rate": 4.333242271042054e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3177960254251957, | |
| "reward_std": 0.1659994050860405, | |
| "rewards/code_reward": 0.2186888586729765, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 739.7545013427734, | |
| "epoch": 0.2929936305732484, | |
| "grad_norm": 0.2182048112154007, | |
| "kl": 0.124755859375, | |
| "learning_rate": 4.32285352788393e-06, | |
| "loss": 0.0013, | |
| "reward": 0.30886589735746384, | |
| "reward_std": 0.1802590098232031, | |
| "rewards/code_reward": 0.21020517125725746, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 765.857177734375, | |
| "epoch": 0.2951167728237792, | |
| "grad_norm": 0.19854100048542023, | |
| "kl": 0.115234375, | |
| "learning_rate": 4.312398790941882e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3000107705593109, | |
| "reward_std": 0.15882322564721107, | |
| "rewards/code_reward": 0.201126828789711, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 721.9687805175781, | |
| "epoch": 0.29723991507430997, | |
| "grad_norm": 0.23453111946582794, | |
| "kl": 0.116455078125, | |
| "learning_rate": 4.301878498875735e-06, | |
| "loss": 0.0012, | |
| "reward": 0.33861320093274117, | |
| "reward_std": 0.1569173000752926, | |
| "rewards/code_reward": 0.2401756690815091, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 740.2031707763672, | |
| "epoch": 0.29936305732484075, | |
| "grad_norm": 0.21566148102283478, | |
| "kl": 0.1102294921875, | |
| "learning_rate": 4.291293093095873e-06, | |
| "loss": 0.0011, | |
| "reward": 0.3095410466194153, | |
| "reward_std": 0.1992884911596775, | |
| "rewards/code_reward": 0.2108803205192089, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 710.2969207763672, | |
| "epoch": 0.30148619957537154, | |
| "grad_norm": 0.22105751931667328, | |
| "kl": 0.12060546875, | |
| "learning_rate": 4.280643017744723e-06, | |
| "loss": 0.0013, | |
| "reward": 0.36906543001532555, | |
| "reward_std": 0.21653805300593376, | |
| "rewards/code_reward": 0.2704046741127968, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 751.5937805175781, | |
| "epoch": 0.3036093418259023, | |
| "grad_norm": 0.23819085955619812, | |
| "kl": 0.1221923828125, | |
| "learning_rate": 4.269928719678117e-06, | |
| "loss": 0.0012, | |
| "reward": 0.25049133971333504, | |
| "reward_std": 0.17505915835499763, | |
| "rewards/code_reward": 0.15160739235579967, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 735.9107513427734, | |
| "epoch": 0.3057324840764331, | |
| "grad_norm": 0.2204020470380783, | |
| "kl": 0.1229248046875, | |
| "learning_rate": 4.2591506484465426e-06, | |
| "loss": 0.0012, | |
| "reward": 0.26853859797120094, | |
| "reward_std": 0.16319206822663546, | |
| "rewards/code_reward": 0.17032429203391075, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 730.8906707763672, | |
| "epoch": 0.3078556263269639, | |
| "grad_norm": 0.23124827444553375, | |
| "kl": 0.119140625, | |
| "learning_rate": 4.248309256276283e-06, | |
| "loss": 0.0012, | |
| "reward": 0.34641416370868683, | |
| "reward_std": 0.15815678425133228, | |
| "rewards/code_reward": 0.2479766495525837, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 772.8192291259766, | |
| "epoch": 0.3099787685774947, | |
| "grad_norm": 0.21077241003513336, | |
| "kl": 0.1168212890625, | |
| "learning_rate": 4.23740499805044e-06, | |
| "loss": 0.0012, | |
| "reward": 0.2558128647506237, | |
| "reward_std": 0.12367029674351215, | |
| "rewards/code_reward": 0.15804500319063663, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 740.8214721679688, | |
| "epoch": 0.31210191082802546, | |
| "grad_norm": 0.21619708836078644, | |
| "kl": 0.125732421875, | |
| "learning_rate": 4.22643833128985e-06, | |
| "loss": 0.0013, | |
| "reward": 0.33286692947149277, | |
| "reward_std": 0.211603332310915, | |
| "rewards/code_reward": 0.2342061996459961, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 807.6071624755859, | |
| "epoch": 0.31422505307855625, | |
| "grad_norm": 0.2130916714668274, | |
| "kl": 0.1212158203125, | |
| "learning_rate": 4.215409716133885e-06, | |
| "loss": 0.0012, | |
| "reward": 0.3038931153714657, | |
| "reward_std": 0.19640244916081429, | |
| "rewards/code_reward": 0.2065716814249754, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 757.3281555175781, | |
| "epoch": 0.31634819532908703, | |
| "grad_norm": 0.21959362924098969, | |
| "kl": 0.12744140625, | |
| "learning_rate": 4.204319615321151e-06, | |
| "loss": 0.0013, | |
| "reward": 0.35224368050694466, | |
| "reward_std": 0.1676900666207075, | |
| "rewards/code_reward": 0.2542525976896286, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 748.4687805175781, | |
| "epoch": 0.3184713375796178, | |
| "grad_norm": 0.22287501394748688, | |
| "kl": 0.120361328125, | |
| "learning_rate": 4.193168494170065e-06, | |
| "loss": 0.0012, | |
| "reward": 0.34373533725738525, | |
| "reward_std": 0.18135884031653404, | |
| "rewards/code_reward": 0.2457442507147789, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 755.8906707763672, | |
| "epoch": 0.3205944798301486, | |
| "grad_norm": 1.9565397500991821, | |
| "kl": 0.2320556640625, | |
| "learning_rate": 4.181956820559339e-06, | |
| "loss": 0.0023, | |
| "reward": 0.3970649391412735, | |
| "reward_std": 0.2402110919356346, | |
| "rewards/code_reward": 0.2992970943450928, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 779.8549499511719, | |
| "epoch": 0.3227176220806794, | |
| "grad_norm": 0.2783929109573364, | |
| "kl": 0.15283203125, | |
| "learning_rate": 4.170685064908342e-06, | |
| "loss": 0.0016, | |
| "reward": 0.19563322141766548, | |
| "reward_std": 0.12617591954767704, | |
| "rewards/code_reward": 0.11817785818129778, | |
| "rewards/format_reward": 0.7745535969734192, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 726.2210083007812, | |
| "epoch": 0.3248407643312102, | |
| "grad_norm": 0.3019232451915741, | |
| "kl": 0.157470703125, | |
| "learning_rate": 4.159353700157365e-06, | |
| "loss": 0.0016, | |
| "reward": 0.17416437342762947, | |
| "reward_std": 0.18693338334560394, | |
| "rewards/code_reward": 0.14961079927161336, | |
| "rewards/format_reward": 0.2455357201397419, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 698.8303833007812, | |
| "epoch": 0.32696390658174096, | |
| "grad_norm": 0.2774558365345001, | |
| "kl": 0.1429443359375, | |
| "learning_rate": 4.14796320174778e-06, | |
| "loss": 0.0014, | |
| "reward": 0.16863043326884508, | |
| "reward_std": 0.1559329554438591, | |
| "rewards/code_reward": 0.15412150975316763, | |
| "rewards/format_reward": 0.145089291036129, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 675.5960083007812, | |
| "epoch": 0.32908704883227174, | |
| "grad_norm": 0.2807973027229309, | |
| "kl": 0.1292724609375, | |
| "learning_rate": 4.136514047602087e-06, | |
| "loss": 0.0013, | |
| "reward": 0.18624619487673044, | |
| "reward_std": 0.18390434235334396, | |
| "rewards/code_reward": 0.15968369878828526, | |
| "rewards/format_reward": 0.2656250074505806, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 639.5625305175781, | |
| "epoch": 0.33121019108280253, | |
| "grad_norm": 0.2780408263206482, | |
| "kl": 0.1229248046875, | |
| "learning_rate": 4.1250067181038635e-06, | |
| "loss": 0.0012, | |
| "reward": 0.2191852517426014, | |
| "reward_std": 0.13604657351970673, | |
| "rewards/code_reward": 0.16851558908820152, | |
| "rewards/format_reward": 0.5066964477300644, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 639.6652069091797, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.26467737555503845, | |
| "kl": 0.144775390625, | |
| "learning_rate": 4.113441696077608e-06, | |
| "loss": 0.0014, | |
| "reward": 0.31026700511574745, | |
| "reward_std": 0.202113538980484, | |
| "rewards/code_reward": 0.2345973663032055, | |
| "rewards/format_reward": 0.7566964626312256, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 659.5558319091797, | |
| "epoch": 0.3354564755838641, | |
| "grad_norm": 0.24479494988918304, | |
| "kl": 0.1280517578125, | |
| "learning_rate": 4.101819466768484e-06, | |
| "loss": 0.0013, | |
| "reward": 0.2640949599444866, | |
| "reward_std": 0.1684006005525589, | |
| "rewards/code_reward": 0.1763717383146286, | |
| "rewards/format_reward": 0.8772321939468384, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 615.0558319091797, | |
| "epoch": 0.3375796178343949, | |
| "grad_norm": 0.24368631839752197, | |
| "kl": 0.15087890625, | |
| "learning_rate": 4.0901405178219535e-06, | |
| "loss": 0.0015, | |
| "reward": 0.345178809016943, | |
| "reward_std": 0.19809392467141151, | |
| "rewards/code_reward": 0.24941986054182053, | |
| "rewards/format_reward": 0.957589328289032, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 624.1830444335938, | |
| "epoch": 0.33970276008492567, | |
| "grad_norm": 0.23896408081054688, | |
| "kl": 0.154052734375, | |
| "learning_rate": 4.078405339263326e-06, | |
| "loss": 0.0015, | |
| "reward": 0.37723641097545624, | |
| "reward_std": 0.21996535174548626, | |
| "rewards/code_reward": 0.28080783039331436, | |
| "rewards/format_reward": 0.964285746216774, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 627.3236846923828, | |
| "epoch": 0.34182590233545646, | |
| "grad_norm": 0.2518380582332611, | |
| "kl": 0.171142578125, | |
| "learning_rate": 4.06661442347719e-06, | |
| "loss": 0.0017, | |
| "reward": 0.3184036388993263, | |
| "reward_std": 0.19395017623901367, | |
| "rewards/code_reward": 0.2217518538236618, | |
| "rewards/format_reward": 0.9665178805589676, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 583.1674499511719, | |
| "epoch": 0.34394904458598724, | |
| "grad_norm": 0.2569345235824585, | |
| "kl": 0.191162109375, | |
| "learning_rate": 4.054768265186758e-06, | |
| "loss": 0.0019, | |
| "reward": 0.31612952798604965, | |
| "reward_std": 0.20712972059845924, | |
| "rewards/code_reward": 0.21836165338754654, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 571.7857208251953, | |
| "epoch": 0.346072186836518, | |
| "grad_norm": 0.25089165568351746, | |
| "kl": 0.207763671875, | |
| "learning_rate": 4.0428673614331036e-06, | |
| "loss": 0.0021, | |
| "reward": 0.365755058825016, | |
| "reward_std": 0.19836053252220154, | |
| "rewards/code_reward": 0.2673175595700741, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 596.4553833007812, | |
| "epoch": 0.3481953290870488, | |
| "grad_norm": 0.23911510407924652, | |
| "kl": 0.22265625, | |
| "learning_rate": 4.030912211554316e-06, | |
| "loss": 0.0023, | |
| "reward": 0.38677794113755226, | |
| "reward_std": 0.18023086339235306, | |
| "rewards/code_reward": 0.28744759038090706, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 567.4486999511719, | |
| "epoch": 0.3503184713375796, | |
| "grad_norm": 0.24430882930755615, | |
| "kl": 0.2099609375, | |
| "learning_rate": 4.018903317164539e-06, | |
| "loss": 0.0021, | |
| "reward": 0.2250930406153202, | |
| "reward_std": 0.19390171952545643, | |
| "rewards/code_reward": 0.1277716178447008, | |
| "rewards/format_reward": 0.9732143431901932, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 573.8594055175781, | |
| "epoch": 0.3524416135881104, | |
| "grad_norm": 0.2257012128829956, | |
| "kl": 0.232666015625, | |
| "learning_rate": 4.006841182132932e-06, | |
| "loss": 0.0023, | |
| "reward": 0.3599228076636791, | |
| "reward_std": 0.20258177444338799, | |
| "rewards/code_reward": 0.2603691965341568, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 606.4911041259766, | |
| "epoch": 0.35456475583864117, | |
| "grad_norm": 0.238439679145813, | |
| "kl": 0.252197265625, | |
| "learning_rate": 3.9947263125625195e-06, | |
| "loss": 0.0025, | |
| "reward": 0.3261881247162819, | |
| "reward_std": 0.1736624352633953, | |
| "rewards/code_reward": 0.22775060683488846, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 590.1027069091797, | |
| "epoch": 0.35668789808917195, | |
| "grad_norm": 0.22503866255283356, | |
| "kl": 0.255615234375, | |
| "learning_rate": 3.982559216768967e-06, | |
| "loss": 0.0026, | |
| "reward": 0.2961311787366867, | |
| "reward_std": 0.1850012019276619, | |
| "rewards/code_reward": 0.1968008242547512, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 594.5491333007812, | |
| "epoch": 0.35881104033970274, | |
| "grad_norm": 0.22681432962417603, | |
| "kl": 0.32421875, | |
| "learning_rate": 3.970340405259245e-06, | |
| "loss": 0.0033, | |
| "reward": 0.4186030365526676, | |
| "reward_std": 0.18541271798312664, | |
| "rewards/code_reward": 0.31927267275750637, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 594.2790298461914, | |
| "epoch": 0.3609341825902335, | |
| "grad_norm": 0.2294747531414032, | |
| "kl": 0.32666015625, | |
| "learning_rate": 3.958070390710214e-06, | |
| "loss": 0.0033, | |
| "reward": 0.36109255626797676, | |
| "reward_std": 0.18586167134344578, | |
| "rewards/code_reward": 0.26243184227496386, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 592.4553680419922, | |
| "epoch": 0.3630573248407643, | |
| "grad_norm": 0.21662850677967072, | |
| "kl": 0.247802734375, | |
| "learning_rate": 3.945749687947109e-06, | |
| "loss": 0.0025, | |
| "reward": 0.24923527240753174, | |
| "reward_std": 0.13961385935544968, | |
| "rewards/code_reward": 0.15057454677298665, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 570.9286041259766, | |
| "epoch": 0.3651804670912951, | |
| "grad_norm": 0.23902097344398499, | |
| "kl": 0.22607421875, | |
| "learning_rate": 3.933378813921942e-06, | |
| "loss": 0.0023, | |
| "reward": 0.3740244060754776, | |
| "reward_std": 0.22742953523993492, | |
| "rewards/code_reward": 0.27536366507411003, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 598.2477874755859, | |
| "epoch": 0.3673036093418259, | |
| "grad_norm": 0.21673643589019775, | |
| "kl": 0.213134765625, | |
| "learning_rate": 3.920958287691811e-06, | |
| "loss": 0.0021, | |
| "reward": 0.2918965369462967, | |
| "reward_std": 0.19641954079270363, | |
| "rewards/code_reward": 0.19301261007785797, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 558.6718978881836, | |
| "epoch": 0.36942675159235666, | |
| "grad_norm": 0.2543295919895172, | |
| "kl": 0.1962890625, | |
| "learning_rate": 3.908488630397121e-06, | |
| "loss": 0.002, | |
| "reward": 0.41571951657533646, | |
| "reward_std": 0.24686651676893234, | |
| "rewards/code_reward": 0.31683557108044624, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 557.0201110839844, | |
| "epoch": 0.37154989384288745, | |
| "grad_norm": 0.23438729345798492, | |
| "kl": 0.206298828125, | |
| "learning_rate": 3.8959703652397175e-06, | |
| "loss": 0.0021, | |
| "reward": 0.38086430728435516, | |
| "reward_std": 0.22366305626928806, | |
| "rewards/code_reward": 0.28198035806417465, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 602.0536041259766, | |
| "epoch": 0.37367303609341823, | |
| "grad_norm": 0.24083319306373596, | |
| "kl": 0.18701171875, | |
| "learning_rate": 3.883404017460935e-06, | |
| "loss": 0.0019, | |
| "reward": 0.36414580047130585, | |
| "reward_std": 0.22220248356461525, | |
| "rewards/code_reward": 0.2657082974910736, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 591.0893096923828, | |
| "epoch": 0.37579617834394907, | |
| "grad_norm": 0.25874680280685425, | |
| "kl": 0.19873046875, | |
| "learning_rate": 3.870790114319559e-06, | |
| "loss": 0.002, | |
| "reward": 0.3555009290575981, | |
| "reward_std": 0.18250016495585442, | |
| "rewards/code_reward": 0.25684019550681114, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 587.404052734375, | |
| "epoch": 0.37791932059447986, | |
| "grad_norm": 0.22890929877758026, | |
| "kl": 0.176513671875, | |
| "learning_rate": 3.858129185069701e-06, | |
| "loss": 0.0018, | |
| "reward": 0.4567238390445709, | |
| "reward_std": 0.2463996484875679, | |
| "rewards/code_reward": 0.35806312412023544, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 602.4486846923828, | |
| "epoch": 0.38004246284501064, | |
| "grad_norm": 0.22736036777496338, | |
| "kl": 0.162353515625, | |
| "learning_rate": 3.845421760938597e-06, | |
| "loss": 0.0016, | |
| "reward": 0.3570307157933712, | |
| "reward_std": 0.16325377486646175, | |
| "rewards/code_reward": 0.2583700120449066, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 610.8638763427734, | |
| "epoch": 0.3821656050955414, | |
| "grad_norm": 0.2262299656867981, | |
| "kl": 0.153076171875, | |
| "learning_rate": 3.832668375104312e-06, | |
| "loss": 0.0016, | |
| "reward": 0.349903404712677, | |
| "reward_std": 0.15386051312088966, | |
| "rewards/code_reward": 0.2503498010337353, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 639.8326110839844, | |
| "epoch": 0.3842887473460722, | |
| "grad_norm": 0.22941501438617706, | |
| "kl": 0.17724609375, | |
| "learning_rate": 3.8198695626733725e-06, | |
| "loss": 0.0018, | |
| "reward": 0.40823063999414444, | |
| "reward_std": 0.2221880704164505, | |
| "rewards/code_reward": 0.3093467131257057, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 638.1049346923828, | |
| "epoch": 0.386411889596603, | |
| "grad_norm": 0.23558823764324188, | |
| "kl": 0.15283203125, | |
| "learning_rate": 3.8070258606583156e-06, | |
| "loss": 0.0016, | |
| "reward": 0.36934422701597214, | |
| "reward_std": 0.21686138212680817, | |
| "rewards/code_reward": 0.27001385763287544, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 625.5759124755859, | |
| "epoch": 0.3885350318471338, | |
| "grad_norm": 0.31238648295402527, | |
| "kl": 0.166259765625, | |
| "learning_rate": 3.7941378079551544e-06, | |
| "loss": 0.0017, | |
| "reward": 0.3830692619085312, | |
| "reward_std": 0.24278680607676506, | |
| "rewards/code_reward": 0.2835156861692667, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 665.8192291259766, | |
| "epoch": 0.39065817409766457, | |
| "grad_norm": 0.3192687928676605, | |
| "kl": 0.1513671875, | |
| "learning_rate": 3.7812059453207677e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3427841551601887, | |
| "reward_std": 0.20133822225034237, | |
| "rewards/code_reward": 0.2441234067082405, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 655.4486846923828, | |
| "epoch": 0.39278131634819535, | |
| "grad_norm": 0.243864506483078, | |
| "kl": 0.141845703125, | |
| "learning_rate": 3.768230815350213e-06, | |
| "loss": 0.0014, | |
| "reward": 0.32591256499290466, | |
| "reward_std": 0.1841282658278942, | |
| "rewards/code_reward": 0.22680539265275002, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 680.6942291259766, | |
| "epoch": 0.39490445859872614, | |
| "grad_norm": 2.7162351608276367, | |
| "kl": 0.2255859375, | |
| "learning_rate": 3.7552129624539557e-06, | |
| "loss": 0.0023, | |
| "reward": 0.38928014785051346, | |
| "reward_std": 0.22797510400414467, | |
| "rewards/code_reward": 0.2917355000972748, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 670.8727874755859, | |
| "epoch": 0.3970276008492569, | |
| "grad_norm": 28.86046600341797, | |
| "kl": 3.12841796875, | |
| "learning_rate": 3.7421529328350316e-06, | |
| "loss": 0.0313, | |
| "reward": 0.33664827793836594, | |
| "reward_std": 0.2122020348906517, | |
| "rewards/code_reward": 0.2404429018497467, | |
| "rewards/format_reward": 0.9620536118745804, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 683.8326110839844, | |
| "epoch": 0.3991507430997877, | |
| "grad_norm": 0.4426620602607727, | |
| "kl": 0.14501953125, | |
| "learning_rate": 3.7290512744661274e-06, | |
| "loss": 0.0015, | |
| "reward": 0.38399138301610947, | |
| "reward_std": 0.1990874893963337, | |
| "rewards/code_reward": 0.2860002890229225, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 647.325927734375, | |
| "epoch": 0.4012738853503185, | |
| "grad_norm": 0.2341061532497406, | |
| "kl": 0.1455078125, | |
| "learning_rate": 3.715908537066589e-06, | |
| "loss": 0.0015, | |
| "reward": 0.42976176738739014, | |
| "reward_std": 0.21941150352358818, | |
| "rewards/code_reward": 0.33199387788772583, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 698.1763610839844, | |
| "epoch": 0.4033970276008493, | |
| "grad_norm": 1.953539252281189, | |
| "kl": 0.5579833984375, | |
| "learning_rate": 3.7027252720793538e-06, | |
| "loss": 0.0056, | |
| "reward": 0.33469754457473755, | |
| "reward_std": 0.19711985811591148, | |
| "rewards/code_reward": 0.23692966997623444, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 710.0982360839844, | |
| "epoch": 0.40552016985138006, | |
| "grad_norm": 0.24030916392803192, | |
| "kl": 0.161865234375, | |
| "learning_rate": 3.689502032647817e-06, | |
| "loss": 0.0016, | |
| "reward": 0.35261962562799454, | |
| "reward_std": 0.2262839339673519, | |
| "rewards/code_reward": 0.25552139058709145, | |
| "rewards/format_reward": 0.970982164144516, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 672.5401916503906, | |
| "epoch": 0.40764331210191085, | |
| "grad_norm": 0.9592034816741943, | |
| "kl": 0.154541015625, | |
| "learning_rate": 3.6762393735926245e-06, | |
| "loss": 0.0016, | |
| "reward": 0.3493685219436884, | |
| "reward_std": 0.1753272709902376, | |
| "rewards/code_reward": 0.2524934969842434, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 710.7299499511719, | |
| "epoch": 0.40976645435244163, | |
| "grad_norm": 0.3044726550579071, | |
| "kl": 0.15185546875, | |
| "learning_rate": 3.6629378513883852e-06, | |
| "loss": 0.0015, | |
| "reward": 0.4329136684536934, | |
| "reward_std": 0.257048511877656, | |
| "rewards/code_reward": 0.3346993774175644, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 718.3683166503906, | |
| "epoch": 0.4118895966029724, | |
| "grad_norm": 0.2441069632768631, | |
| "kl": 0.1630859375, | |
| "learning_rate": 3.6495980241403307e-06, | |
| "loss": 0.0016, | |
| "reward": 0.32557281479239464, | |
| "reward_std": 0.19367647171020508, | |
| "rewards/code_reward": 0.2271352931857109, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 701.9486846923828, | |
| "epoch": 0.4140127388535032, | |
| "grad_norm": 0.22456014156341553, | |
| "kl": 0.16064453125, | |
| "learning_rate": 3.636220451560896e-06, | |
| "loss": 0.0016, | |
| "reward": 0.42680248618125916, | |
| "reward_std": 0.2046816684305668, | |
| "rewards/code_reward": 0.32903461158275604, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 713.747802734375, | |
| "epoch": 0.416135881104034, | |
| "grad_norm": 0.45598000288009644, | |
| "kl": 0.149169921875, | |
| "learning_rate": 3.622805694946235e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3776397071778774, | |
| "reward_std": 0.18774981424212456, | |
| "rewards/code_reward": 0.2803182378411293, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 717.1406555175781, | |
| "epoch": 0.4182590233545648, | |
| "grad_norm": 0.21405339241027832, | |
| "kl": 0.1429443359375, | |
| "learning_rate": 3.609354317152667e-06, | |
| "loss": 0.0015, | |
| "reward": 0.38271288573741913, | |
| "reward_std": 0.19382936879992485, | |
| "rewards/code_reward": 0.28539142571389675, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 678.2477874755859, | |
| "epoch": 0.42038216560509556, | |
| "grad_norm": 0.49006760120391846, | |
| "kl": 0.2021484375, | |
| "learning_rate": 3.595866882573063e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4323223605751991, | |
| "reward_std": 0.2277931533753872, | |
| "rewards/code_reward": 0.3345545120537281, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 728.9620971679688, | |
| "epoch": 0.42250530785562634, | |
| "grad_norm": 0.39922112226486206, | |
| "kl": 0.184814453125, | |
| "learning_rate": 3.5823439571131675e-06, | |
| "loss": 0.0019, | |
| "reward": 0.40869200229644775, | |
| "reward_std": 0.2020891159772873, | |
| "rewards/code_reward": 0.31159375607967377, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 684.9576416015625, | |
| "epoch": 0.42462845010615713, | |
| "grad_norm": 0.23013651371002197, | |
| "kl": 0.149658203125, | |
| "learning_rate": 3.5687861081678477e-06, | |
| "loss": 0.0015, | |
| "reward": 0.4545319005846977, | |
| "reward_std": 0.24276942387223244, | |
| "rewards/code_reward": 0.3572104535996914, | |
| "rewards/format_reward": 0.9732143431901932, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 704.5223541259766, | |
| "epoch": 0.4267515923566879, | |
| "grad_norm": 0.46601447463035583, | |
| "kl": 0.145263671875, | |
| "learning_rate": 3.555193904597291e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3521813452243805, | |
| "reward_std": 0.1790554393082857, | |
| "rewards/code_reward": 0.2555295582860708, | |
| "rewards/format_reward": 0.96651791036129, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 676.4754791259766, | |
| "epoch": 0.4288747346072187, | |
| "grad_norm": 0.24227948486804962, | |
| "kl": 0.145751953125, | |
| "learning_rate": 3.541567916703138e-06, | |
| "loss": 0.0015, | |
| "reward": 0.4256810247898102, | |
| "reward_std": 0.2298164926469326, | |
| "rewards/code_reward": 0.327689953148365, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 697.6027069091797, | |
| "epoch": 0.4309978768577495, | |
| "grad_norm": 0.32301369309425354, | |
| "kl": 0.141845703125, | |
| "learning_rate": 3.5279087162045517e-06, | |
| "loss": 0.0014, | |
| "reward": 0.27234210819005966, | |
| "reward_std": 0.17865055054426193, | |
| "rewards/code_reward": 0.17479745857417583, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 696.3727874755859, | |
| "epoch": 0.43312101910828027, | |
| "grad_norm": 0.6582425236701965, | |
| "kl": 0.14111328125, | |
| "learning_rate": 3.5142168762142265e-06, | |
| "loss": 0.0014, | |
| "reward": 0.3229696787893772, | |
| "reward_std": 0.1942148432135582, | |
| "rewards/code_reward": 0.22542503476142883, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 718.2232513427734, | |
| "epoch": 0.43524416135881105, | |
| "grad_norm": 0.30619704723358154, | |
| "kl": 0.149169921875, | |
| "learning_rate": 3.500492971214347e-06, | |
| "loss": 0.0015, | |
| "reward": 0.4395933449268341, | |
| "reward_std": 0.265441432595253, | |
| "rewards/code_reward": 0.3402629792690277, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 699.310302734375, | |
| "epoch": 0.43736730360934184, | |
| "grad_norm": 0.3680998980998993, | |
| "kl": 0.151611328125, | |
| "learning_rate": 3.48673757703248e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3385552614927292, | |
| "reward_std": 0.24193225800991058, | |
| "rewards/code_reward": 0.24101059883832932, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 712.4799499511719, | |
| "epoch": 0.4394904458598726, | |
| "grad_norm": 0.22541551291942596, | |
| "kl": 0.315673828125, | |
| "learning_rate": 3.472951270817418e-06, | |
| "loss": 0.0032, | |
| "reward": 0.317364189773798, | |
| "reward_std": 0.2289394848048687, | |
| "rewards/code_reward": 0.2191498950123787, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 725.1719207763672, | |
| "epoch": 0.4416135881104034, | |
| "grad_norm": 0.7986815571784973, | |
| "kl": 0.8701171875, | |
| "learning_rate": 3.4591346310149578e-06, | |
| "loss": 0.0087, | |
| "reward": 0.30210861191153526, | |
| "reward_std": 0.1758405715227127, | |
| "rewards/code_reward": 0.20545680448412895, | |
| "rewards/format_reward": 0.9665178954601288, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 708.9174346923828, | |
| "epoch": 0.4437367303609342, | |
| "grad_norm": 0.6832094788551331, | |
| "kl": 0.571533203125, | |
| "learning_rate": 3.445288237343632e-06, | |
| "loss": 0.0057, | |
| "reward": 0.34425482153892517, | |
| "reward_std": 0.17729798145592213, | |
| "rewards/code_reward": 0.24559411033988, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 664.4152069091797, | |
| "epoch": 0.445859872611465, | |
| "grad_norm": 0.5344778299331665, | |
| "kl": 0.344970703125, | |
| "learning_rate": 3.4314126707703895e-06, | |
| "loss": 0.0035, | |
| "reward": 0.3406968005001545, | |
| "reward_std": 0.21405612863600254, | |
| "rewards/code_reward": 0.2424825206398964, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 687.4955596923828, | |
| "epoch": 0.44798301486199577, | |
| "grad_norm": 0.2852449119091034, | |
| "kl": 0.314208984375, | |
| "learning_rate": 3.4175085134862128e-06, | |
| "loss": 0.0031, | |
| "reward": 0.37548423558473587, | |
| "reward_std": 0.19767768681049347, | |
| "rewards/code_reward": 0.2783860079944134, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 694.372802734375, | |
| "epoch": 0.45010615711252655, | |
| "grad_norm": 0.8310821056365967, | |
| "kl": 0.214111328125, | |
| "learning_rate": 3.4035763488816953e-06, | |
| "loss": 0.0021, | |
| "reward": 0.5172732323408127, | |
| "reward_std": 0.24472371861338615, | |
| "rewards/code_reward": 0.41883569955825806, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 678.0424499511719, | |
| "epoch": 0.45222929936305734, | |
| "grad_norm": 0.28591927886009216, | |
| "kl": 0.14306640625, | |
| "learning_rate": 3.3896167615225594e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3543313890695572, | |
| "reward_std": 0.21659231930971146, | |
| "rewards/code_reward": 0.2567867375910282, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 692.7522583007812, | |
| "epoch": 0.4543524416135881, | |
| "grad_norm": 0.5759381055831909, | |
| "kl": 0.1455078125, | |
| "learning_rate": 3.375630337125133e-06, | |
| "loss": 0.0015, | |
| "reward": 0.39294832199811935, | |
| "reward_std": 0.26400984078645706, | |
| "rewards/code_reward": 0.2960733026266098, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 731.1272735595703, | |
| "epoch": 0.4564755838641189, | |
| "grad_norm": 0.23394830524921417, | |
| "kl": 0.143798828125, | |
| "learning_rate": 3.361617662531772e-06, | |
| "loss": 0.0014, | |
| "reward": 0.3601933494210243, | |
| "reward_std": 0.25189225003123283, | |
| "rewards/code_reward": 0.2619790583848953, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 692.1428833007812, | |
| "epoch": 0.4585987261146497, | |
| "grad_norm": 0.24470415711402893, | |
| "kl": 0.1317138671875, | |
| "learning_rate": 3.347579325686237e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3433048315346241, | |
| "reward_std": 0.22325459122657776, | |
| "rewards/code_reward": 0.245536956936121, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 684.3705749511719, | |
| "epoch": 0.4607218683651805, | |
| "grad_norm": 0.364793062210083, | |
| "kl": 0.122802734375, | |
| "learning_rate": 3.333515915609027e-06, | |
| "loss": 0.0012, | |
| "reward": 0.4696499854326248, | |
| "reward_std": 0.2734139449894428, | |
| "rewards/code_reward": 0.3707660511136055, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 723.4129791259766, | |
| "epoch": 0.46284501061571126, | |
| "grad_norm": 0.36840999126434326, | |
| "kl": 0.128173828125, | |
| "learning_rate": 3.3194280223726616e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3476767987012863, | |
| "reward_std": 0.19485369697213173, | |
| "rewards/code_reward": 0.24968570843338966, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 660.7812805175781, | |
| "epoch": 0.46496815286624205, | |
| "grad_norm": 0.2917425036430359, | |
| "kl": 0.142578125, | |
| "learning_rate": 3.305316237076927e-06, | |
| "loss": 0.0014, | |
| "reward": 0.39485304057598114, | |
| "reward_std": 0.23686816543340683, | |
| "rewards/code_reward": 0.29708515852689743, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 674.2410888671875, | |
| "epoch": 0.46709129511677283, | |
| "grad_norm": 0.27625608444213867, | |
| "kl": 0.13134765625, | |
| "learning_rate": 3.291181151824071e-06, | |
| "loss": 0.0014, | |
| "reward": 0.5001323744654655, | |
| "reward_std": 0.2807440906763077, | |
| "rewards/code_reward": 0.40124842897057533, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 717.279052734375, | |
| "epoch": 0.4692144373673036, | |
| "grad_norm": 0.26342645287513733, | |
| "kl": 0.132568359375, | |
| "learning_rate": 3.27702335969396e-06, | |
| "loss": 0.0014, | |
| "reward": 0.438594788312912, | |
| "reward_std": 0.2874513529241085, | |
| "rewards/code_reward": 0.34060370177030563, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 740.8192291259766, | |
| "epoch": 0.4713375796178344, | |
| "grad_norm": 0.3312680423259735, | |
| "kl": 0.144287109375, | |
| "learning_rate": 3.2628434547191985e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4112970530986786, | |
| "reward_std": 0.2245728299021721, | |
| "rewards/code_reward": 0.3137524016201496, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 709.9174499511719, | |
| "epoch": 0.4734607218683652, | |
| "grad_norm": 1.5567724704742432, | |
| "kl": 0.1339111328125, | |
| "learning_rate": 3.2486420318601973e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4251294732093811, | |
| "reward_std": 0.18372783437371254, | |
| "rewards/code_reward": 0.3291473314166069, | |
| "rewards/format_reward": 0.9598214775323868, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 719.7656707763672, | |
| "epoch": 0.47558386411889597, | |
| "grad_norm": 0.2122294157743454, | |
| "kl": 0.1273193359375, | |
| "learning_rate": 3.2344196869802187e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3450777679681778, | |
| "reward_std": 0.24194234982132912, | |
| "rewards/code_reward": 0.24730990827083588, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 704.0045013427734, | |
| "epoch": 0.47770700636942676, | |
| "grad_norm": 0.9711757898330688, | |
| "kl": 0.20751953125, | |
| "learning_rate": 3.2201770168203694e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4306853115558624, | |
| "reward_std": 0.2568584829568863, | |
| "rewards/code_reward": 0.33492637425661087, | |
| "rewards/format_reward": 0.9575893133878708, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 727.6027069091797, | |
| "epoch": 0.47983014861995754, | |
| "grad_norm": 0.268039733171463, | |
| "kl": 0.13818359375, | |
| "learning_rate": 3.205914618974563e-06, | |
| "loss": 0.0014, | |
| "reward": 0.43213512748479843, | |
| "reward_std": 0.2562938630580902, | |
| "rewards/code_reward": 0.334367249161005, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 732.9643249511719, | |
| "epoch": 0.4819532908704883, | |
| "grad_norm": 0.46155139803886414, | |
| "kl": 0.198486328125, | |
| "learning_rate": 3.1916330918644496e-06, | |
| "loss": 0.002, | |
| "reward": 0.31592320650815964, | |
| "reward_std": 0.19539642706513405, | |
| "rewards/code_reward": 0.2174856998026371, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 770.1295013427734, | |
| "epoch": 0.4840764331210191, | |
| "grad_norm": 0.7360585331916809, | |
| "kl": 0.3978271484375, | |
| "learning_rate": 3.177333034714303e-06, | |
| "loss": 0.004, | |
| "reward": 0.35912561416625977, | |
| "reward_std": 0.21681112423539162, | |
| "rewards/code_reward": 0.26135774329304695, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 706.9777069091797, | |
| "epoch": 0.4861995753715499, | |
| "grad_norm": 1.2824815511703491, | |
| "kl": 0.615478515625, | |
| "learning_rate": 3.1630150475258813e-06, | |
| "loss": 0.0062, | |
| "reward": 0.3668329790234566, | |
| "reward_std": 0.2176014445722103, | |
| "rewards/code_reward": 0.2699579633772373, | |
| "rewards/format_reward": 0.9687500596046448, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 709.825927734375, | |
| "epoch": 0.4883227176220807, | |
| "grad_norm": 0.4730873107910156, | |
| "kl": 0.4136962890625, | |
| "learning_rate": 3.148679731053252e-06, | |
| "loss": 0.0041, | |
| "reward": 0.4401291459798813, | |
| "reward_std": 0.2792894318699837, | |
| "rewards/code_reward": 0.34213805943727493, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 716.0312805175781, | |
| "epoch": 0.49044585987261147, | |
| "grad_norm": 0.226039856672287, | |
| "kl": 0.1241455078125, | |
| "learning_rate": 3.1343276867775805e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3396586962044239, | |
| "reward_std": 0.19299479201436043, | |
| "rewards/code_reward": 0.24211404286324978, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 699.6830749511719, | |
| "epoch": 0.49256900212314225, | |
| "grad_norm": 0.31895455718040466, | |
| "kl": 0.50146484375, | |
| "learning_rate": 3.1199595168819043e-06, | |
| "loss": 0.0051, | |
| "reward": 0.34284605644643307, | |
| "reward_std": 0.14287223480641842, | |
| "rewards/code_reward": 0.24463177705183625, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 781.5178985595703, | |
| "epoch": 0.49469214437367304, | |
| "grad_norm": 0.4143598973751068, | |
| "kl": 0.249755859375, | |
| "learning_rate": 3.105575824225852e-06, | |
| "loss": 0.0025, | |
| "reward": 0.38098950684070587, | |
| "reward_std": 0.21905666589736938, | |
| "rewards/code_reward": 0.28590018674731255, | |
| "rewards/format_reward": 0.9508928954601288, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 725.3705749511719, | |
| "epoch": 0.4968152866242038, | |
| "grad_norm": 0.9609025120735168, | |
| "kl": 0.401123046875, | |
| "learning_rate": 3.091177212320363e-06, | |
| "loss": 0.004, | |
| "reward": 0.4063151776790619, | |
| "reward_std": 0.25925979763269424, | |
| "rewards/code_reward": 0.3076544553041458, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 730.1406555175781, | |
| "epoch": 0.4989384288747346, | |
| "grad_norm": 0.2471870481967926, | |
| "kl": 0.233154296875, | |
| "learning_rate": 3.0767642853023538e-06, | |
| "loss": 0.0024, | |
| "reward": 0.3827313929796219, | |
| "reward_std": 0.21219320595264435, | |
| "rewards/code_reward": 0.2858563922345638, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 696.4219207763672, | |
| "epoch": 0.5010615711252654, | |
| "grad_norm": 0.6714680194854736, | |
| "kl": 0.1856689453125, | |
| "learning_rate": 3.062337647909376e-06, | |
| "loss": 0.0019, | |
| "reward": 0.4210161566734314, | |
| "reward_std": 0.18587047047913074, | |
| "rewards/code_reward": 0.3232482895255089, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 744.200927734375, | |
| "epoch": 0.5031847133757962, | |
| "grad_norm": 0.5082603096961975, | |
| "kl": 0.2071533203125, | |
| "learning_rate": 3.04789790545424e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4485570266842842, | |
| "reward_std": 0.1916775107383728, | |
| "rewards/code_reward": 0.3519052043557167, | |
| "rewards/format_reward": 0.9665178954601288, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 758.0178985595703, | |
| "epoch": 0.505307855626327, | |
| "grad_norm": 0.69068843126297, | |
| "kl": 0.19482421875, | |
| "learning_rate": 3.033445663799621e-06, | |
| "loss": 0.002, | |
| "reward": 0.3711010664701462, | |
| "reward_std": 0.1955837495625019, | |
| "rewards/code_reward": 0.2742260619997978, | |
| "rewards/format_reward": 0.9687500596046448, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 717.8147735595703, | |
| "epoch": 0.5074309978768577, | |
| "grad_norm": 0.40367022156715393, | |
| "kl": 0.161865234375, | |
| "learning_rate": 3.018981529332633e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5175677761435509, | |
| "reward_std": 0.2608077637851238, | |
| "rewards/code_reward": 0.4193534851074219, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 729.2545013427734, | |
| "epoch": 0.5095541401273885, | |
| "grad_norm": 0.5104432702064514, | |
| "kl": 0.19384765625, | |
| "learning_rate": 3.00450610893939e-06, | |
| "loss": 0.002, | |
| "reward": 0.40982675552368164, | |
| "reward_std": 0.20613017305731773, | |
| "rewards/code_reward": 0.31161245331168175, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 705.7120819091797, | |
| "epoch": 0.5116772823779193, | |
| "grad_norm": 0.2226688116788864, | |
| "kl": 0.167724609375, | |
| "learning_rate": 2.9900200099795396e-06, | |
| "loss": 0.0017, | |
| "reward": 0.40758588910102844, | |
| "reward_std": 0.22469941899180412, | |
| "rewards/code_reward": 0.31048765778541565, | |
| "rewards/format_reward": 0.970982164144516, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 716.6004791259766, | |
| "epoch": 0.5138004246284501, | |
| "grad_norm": 0.823330283164978, | |
| "kl": 0.224853515625, | |
| "learning_rate": 2.9755238402607826e-06, | |
| "loss": 0.0023, | |
| "reward": 0.381127692759037, | |
| "reward_std": 0.1726750060915947, | |
| "rewards/code_reward": 0.2817973233759403, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 714.8460235595703, | |
| "epoch": 0.5159235668789809, | |
| "grad_norm": 0.5035973191261292, | |
| "kl": 0.198486328125, | |
| "learning_rate": 2.961018208013367e-06, | |
| "loss": 0.002, | |
| "reward": 0.3932320065796375, | |
| "reward_std": 0.14925590343773365, | |
| "rewards/code_reward": 0.295910551212728, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 714.6205749511719, | |
| "epoch": 0.5180467091295117, | |
| "grad_norm": 0.6857829689979553, | |
| "kl": 0.16259765625, | |
| "learning_rate": 2.9465037218645694e-06, | |
| "loss": 0.0016, | |
| "reward": 0.3965849094092846, | |
| "reward_std": 0.20821771398186684, | |
| "rewards/code_reward": 0.3001563027501106, | |
| "rewards/format_reward": 0.9642857611179352, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 710.7254791259766, | |
| "epoch": 0.5201698513800425, | |
| "grad_norm": 1.60740327835083, | |
| "kl": 0.131591796875, | |
| "learning_rate": 2.9319809908131604e-06, | |
| "loss": 0.0013, | |
| "reward": 0.43405191600322723, | |
| "reward_std": 0.25733353197574615, | |
| "rewards/code_reward": 0.33539119362831116, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 690.6027221679688, | |
| "epoch": 0.5222929936305732, | |
| "grad_norm": 0.2978648841381073, | |
| "kl": 0.1689453125, | |
| "learning_rate": 2.917450624203847e-06, | |
| "loss": 0.0017, | |
| "reward": 0.45192842930555344, | |
| "reward_std": 0.24438033252954483, | |
| "rewards/code_reward": 0.3539373278617859, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 737.6094207763672, | |
| "epoch": 0.524416135881104, | |
| "grad_norm": 0.3084428310394287, | |
| "kl": 0.1378173828125, | |
| "learning_rate": 2.9029132317017118e-06, | |
| "loss": 0.0014, | |
| "reward": 0.46284686774015427, | |
| "reward_std": 0.2403612770140171, | |
| "rewards/code_reward": 0.36619507521390915, | |
| "rewards/format_reward": 0.96651791036129, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 698.7411041259766, | |
| "epoch": 0.5265392781316348, | |
| "grad_norm": 1.3392090797424316, | |
| "kl": 0.151123046875, | |
| "learning_rate": 2.888369423266629e-06, | |
| "loss": 0.0015, | |
| "reward": 0.4595029503107071, | |
| "reward_std": 0.19635827839374542, | |
| "rewards/code_reward": 0.36218152195215225, | |
| "rewards/format_reward": 0.9732143431901932, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 719.0134429931641, | |
| "epoch": 0.5286624203821656, | |
| "grad_norm": 0.21979840099811554, | |
| "kl": 0.14111328125, | |
| "learning_rate": 2.8738198091276712e-06, | |
| "loss": 0.0014, | |
| "reward": 0.36629121005535126, | |
| "reward_std": 0.21069011464715004, | |
| "rewards/code_reward": 0.2694162093102932, | |
| "rewards/format_reward": 0.9687500298023224, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 745.263427734375, | |
| "epoch": 0.5307855626326964, | |
| "grad_norm": 0.8504329323768616, | |
| "kl": 0.15234375, | |
| "learning_rate": 2.859264999757509e-06, | |
| "loss": 0.0016, | |
| "reward": 0.37740468978881836, | |
| "reward_std": 0.20382403209805489, | |
| "rewards/code_reward": 0.2811993137001991, | |
| "rewards/format_reward": 0.9620536267757416, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 723.1562957763672, | |
| "epoch": 0.5329087048832272, | |
| "grad_norm": 0.27034398913383484, | |
| "kl": 0.1591796875, | |
| "learning_rate": 2.8447056058467928e-06, | |
| "loss": 0.0016, | |
| "reward": 0.48566606640815735, | |
| "reward_std": 0.21075040474534035, | |
| "rewards/code_reward": 0.38789819926023483, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 734.6986846923828, | |
| "epoch": 0.535031847133758, | |
| "grad_norm": 0.4998323917388916, | |
| "kl": 0.145751953125, | |
| "learning_rate": 2.830142238278531e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3668500781059265, | |
| "reward_std": 0.1973743811249733, | |
| "rewards/code_reward": 0.2690822184085846, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 722.1719207763672, | |
| "epoch": 0.5371549893842887, | |
| "grad_norm": 0.7386496663093567, | |
| "kl": 0.16943359375, | |
| "learning_rate": 2.81557550810246e-06, | |
| "loss": 0.0017, | |
| "reward": 0.5175603851675987, | |
| "reward_std": 0.23075248673558235, | |
| "rewards/code_reward": 0.41889964044094086, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 731.4286041259766, | |
| "epoch": 0.5392781316348195, | |
| "grad_norm": 2.323516368865967, | |
| "kl": 0.185791015625, | |
| "learning_rate": 2.8010060265094026e-06, | |
| "loss": 0.0019, | |
| "reward": 0.4158123657107353, | |
| "reward_std": 0.2362896017730236, | |
| "rewards/code_reward": 0.3180444836616516, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 714.1004791259766, | |
| "epoch": 0.5414012738853503, | |
| "grad_norm": 0.22996105253696442, | |
| "kl": 0.193115234375, | |
| "learning_rate": 2.786434404805629e-06, | |
| "loss": 0.002, | |
| "reward": 0.43036870658397675, | |
| "reward_std": 0.17873099818825722, | |
| "rewards/code_reward": 0.3323776051402092, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 755.794677734375, | |
| "epoch": 0.5435244161358811, | |
| "grad_norm": 0.5349477529525757, | |
| "kl": 0.21728515625, | |
| "learning_rate": 2.771861254387199e-06, | |
| "loss": 0.0022, | |
| "reward": 0.3905658796429634, | |
| "reward_std": 0.24914883077144623, | |
| "rewards/code_reward": 0.2939140759408474, | |
| "rewards/format_reward": 0.96651791036129, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 736.7969207763672, | |
| "epoch": 0.5456475583864119, | |
| "grad_norm": 0.5384594202041626, | |
| "kl": 0.44921875, | |
| "learning_rate": 2.7572871867143204e-06, | |
| "loss": 0.0045, | |
| "reward": 0.4809773936867714, | |
| "reward_std": 0.24490142613649368, | |
| "rewards/code_reward": 0.3832095377147198, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 779.904052734375, | |
| "epoch": 0.5477707006369427, | |
| "grad_norm": 0.3539630174636841, | |
| "kl": 0.46142578125, | |
| "learning_rate": 2.742712813285681e-06, | |
| "loss": 0.0046, | |
| "reward": 0.4002307578921318, | |
| "reward_std": 0.26231593638658524, | |
| "rewards/code_reward": 0.30447180569171906, | |
| "rewards/format_reward": 0.9575893431901932, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 688.8013610839844, | |
| "epoch": 0.5498938428874734, | |
| "grad_norm": 0.3760126531124115, | |
| "kl": 0.270263671875, | |
| "learning_rate": 2.7281387456128017e-06, | |
| "loss": 0.0027, | |
| "reward": 0.5040838867425919, | |
| "reward_std": 0.23536711558699608, | |
| "rewards/code_reward": 0.40519992262125015, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 763.7924499511719, | |
| "epoch": 0.5520169851380042, | |
| "grad_norm": 0.28506824374198914, | |
| "kl": 0.37060546875, | |
| "learning_rate": 2.7135655951943716e-06, | |
| "loss": 0.0037, | |
| "reward": 0.4464469403028488, | |
| "reward_std": 0.23727866262197495, | |
| "rewards/code_reward": 0.3491254858672619, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 735.7143096923828, | |
| "epoch": 0.554140127388535, | |
| "grad_norm": 0.5788131952285767, | |
| "kl": 0.35546875, | |
| "learning_rate": 2.698993973490598e-06, | |
| "loss": 0.0036, | |
| "reward": 0.5397853627800941, | |
| "reward_std": 0.2762787565588951, | |
| "rewards/code_reward": 0.4424639120697975, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 770.4129791259766, | |
| "epoch": 0.5562632696390658, | |
| "grad_norm": 0.5763887166976929, | |
| "kl": 0.4140625, | |
| "learning_rate": 2.6844244918975416e-06, | |
| "loss": 0.0041, | |
| "reward": 0.4332207143306732, | |
| "reward_std": 0.21580959856510162, | |
| "rewards/code_reward": 0.3361224830150604, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 756.357177734375, | |
| "epoch": 0.5583864118895966, | |
| "grad_norm": 0.23940207064151764, | |
| "kl": 0.3953857421875, | |
| "learning_rate": 2.66985776172147e-06, | |
| "loss": 0.004, | |
| "reward": 0.4067609831690788, | |
| "reward_std": 0.15922481939196587, | |
| "rewards/code_reward": 0.3078770413994789, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 797.3772735595703, | |
| "epoch": 0.5605095541401274, | |
| "grad_norm": 1.1956448554992676, | |
| "kl": 0.394775390625, | |
| "learning_rate": 2.6552943941532088e-06, | |
| "loss": 0.004, | |
| "reward": 0.35336220264434814, | |
| "reward_std": 0.21252319402992725, | |
| "rewards/code_reward": 0.25447824597358704, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 816.9330749511719, | |
| "epoch": 0.5626326963906582, | |
| "grad_norm": 0.3272117078304291, | |
| "kl": 0.33447265625, | |
| "learning_rate": 2.6407350002424927e-06, | |
| "loss": 0.0034, | |
| "reward": 0.3648254945874214, | |
| "reward_std": 0.19711337611079216, | |
| "rewards/code_reward": 0.2675040401518345, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 790.0937805175781, | |
| "epoch": 0.564755838641189, | |
| "grad_norm": 0.3350137174129486, | |
| "kl": 0.217529296875, | |
| "learning_rate": 2.626180190872329e-06, | |
| "loss": 0.0022, | |
| "reward": 0.4639175459742546, | |
| "reward_std": 0.19284814596176147, | |
| "rewards/code_reward": 0.36592647433280945, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 785.247802734375, | |
| "epoch": 0.5668789808917197, | |
| "grad_norm": 0.2253342717885971, | |
| "kl": 0.1259765625, | |
| "learning_rate": 2.611630576733372e-06, | |
| "loss": 0.0013, | |
| "reward": 0.42001737654209137, | |
| "reward_std": 0.24298213049769402, | |
| "rewards/code_reward": 0.32180308550596237, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 874.1495971679688, | |
| "epoch": 0.5690021231422505, | |
| "grad_norm": 1.105658769607544, | |
| "kl": 0.2879638671875, | |
| "learning_rate": 2.5970867682982885e-06, | |
| "loss": 0.0029, | |
| "reward": 0.4009394347667694, | |
| "reward_std": 0.2002662494778633, | |
| "rewards/code_reward": 0.3031715527176857, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 821.7812805175781, | |
| "epoch": 0.5711252653927813, | |
| "grad_norm": 0.39032891392707825, | |
| "kl": 0.2081298828125, | |
| "learning_rate": 2.582549375796154e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4019026607275009, | |
| "reward_std": 0.21826408058404922, | |
| "rewards/code_reward": 0.3036883734166622, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 804.6183471679688, | |
| "epoch": 0.5732484076433121, | |
| "grad_norm": 0.25958776473999023, | |
| "kl": 0.179931640625, | |
| "learning_rate": 2.568019009186841e-06, | |
| "loss": 0.0019, | |
| "reward": 0.4916309267282486, | |
| "reward_std": 0.19469109177589417, | |
| "rewards/code_reward": 0.3934166729450226, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 825.3370971679688, | |
| "epoch": 0.5753715498938429, | |
| "grad_norm": 0.22188299894332886, | |
| "kl": 0.1358642578125, | |
| "learning_rate": 2.5534962781354317e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4202270358800888, | |
| "reward_std": 0.24190283194184303, | |
| "rewards/code_reward": 0.32223593071103096, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 780.4866485595703, | |
| "epoch": 0.5774946921443737, | |
| "grad_norm": 0.2580115497112274, | |
| "kl": 0.1597900390625, | |
| "learning_rate": 2.538981791986634e-06, | |
| "loss": 0.0016, | |
| "reward": 0.38698963820934296, | |
| "reward_std": 0.22697532176971436, | |
| "rewards/code_reward": 0.28877533972263336, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 823.0402069091797, | |
| "epoch": 0.5796178343949044, | |
| "grad_norm": 0.2385285347700119, | |
| "kl": 0.141357421875, | |
| "learning_rate": 2.524476159739218e-06, | |
| "loss": 0.0015, | |
| "reward": 0.43764442950487137, | |
| "reward_std": 0.22844265773892403, | |
| "rewards/code_reward": 0.34032295644283295, | |
| "rewards/format_reward": 0.9732143431901932, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 782.2277221679688, | |
| "epoch": 0.5817409766454352, | |
| "grad_norm": 0.7892447710037231, | |
| "kl": 0.1402587890625, | |
| "learning_rate": 2.5099799900204607e-06, | |
| "loss": 0.0014, | |
| "reward": 0.47495051473379135, | |
| "reward_std": 0.24570094048976898, | |
| "rewards/code_reward": 0.37606657296419144, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 790.029052734375, | |
| "epoch": 0.583864118895966, | |
| "grad_norm": 1.390781044960022, | |
| "kl": 0.1494140625, | |
| "learning_rate": 2.4954938910606108e-06, | |
| "loss": 0.0015, | |
| "reward": 0.41709961369633675, | |
| "reward_std": 0.22452056966722012, | |
| "rewards/code_reward": 0.31910853274166584, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 766.544677734375, | |
| "epoch": 0.5859872611464968, | |
| "grad_norm": 0.3231986463069916, | |
| "kl": 0.125732421875, | |
| "learning_rate": 2.481018470667368e-06, | |
| "loss": 0.0013, | |
| "reward": 0.5159066766500473, | |
| "reward_std": 0.2495804950594902, | |
| "rewards/code_reward": 0.4188084527850151, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 816.5491485595703, | |
| "epoch": 0.5881104033970276, | |
| "grad_norm": 0.3522323966026306, | |
| "kl": 0.1474609375, | |
| "learning_rate": 2.4665543362003802e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5210660025477409, | |
| "reward_std": 0.19517110101878643, | |
| "rewards/code_reward": 0.42352132126688957, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 815.3638610839844, | |
| "epoch": 0.5902335456475584, | |
| "grad_norm": 0.36454498767852783, | |
| "kl": 0.156005859375, | |
| "learning_rate": 2.4521020945457615e-06, | |
| "loss": 0.0016, | |
| "reward": 0.41319186985492706, | |
| "reward_std": 0.21446501463651657, | |
| "rewards/code_reward": 0.3152007535099983, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 830.2031707763672, | |
| "epoch": 0.5923566878980892, | |
| "grad_norm": 0.24598261713981628, | |
| "kl": 0.182373046875, | |
| "learning_rate": 2.4376623520906255e-06, | |
| "loss": 0.0019, | |
| "reward": 0.48784376308321953, | |
| "reward_std": 0.25769151002168655, | |
| "rewards/code_reward": 0.39141515642404556, | |
| "rewards/format_reward": 0.9642857313156128, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 796.6272583007812, | |
| "epoch": 0.5944798301486199, | |
| "grad_norm": 0.24986568093299866, | |
| "kl": 0.154541015625, | |
| "learning_rate": 2.4232357146976478e-06, | |
| "loss": 0.0016, | |
| "reward": 0.3782888986170292, | |
| "reward_std": 0.18982039019465446, | |
| "rewards/code_reward": 0.28029780834913254, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 789.513427734375, | |
| "epoch": 0.5966029723991507, | |
| "grad_norm": 0.28405284881591797, | |
| "kl": 0.147705078125, | |
| "learning_rate": 2.408822787679637e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5121422186493874, | |
| "reward_std": 0.2287510558962822, | |
| "rewards/code_reward": 0.413704726845026, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 808.3995819091797, | |
| "epoch": 0.5987261146496815, | |
| "grad_norm": 0.5303727984428406, | |
| "kl": 0.144775390625, | |
| "learning_rate": 2.3944241757741475e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5508048385381699, | |
| "reward_std": 0.18633075430989265, | |
| "rewards/code_reward": 0.4516976475715637, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 827.4531707763672, | |
| "epoch": 0.6008492569002123, | |
| "grad_norm": 0.2308008074760437, | |
| "kl": 0.13037109375, | |
| "learning_rate": 2.380040483118097e-06, | |
| "loss": 0.0013, | |
| "reward": 0.3481413722038269, | |
| "reward_std": 0.20516538247466087, | |
| "rewards/code_reward": 0.250150291249156, | |
| "rewards/format_reward": 0.9799107313156128, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 786.8147583007812, | |
| "epoch": 0.6029723991507431, | |
| "grad_norm": 0.21248966455459595, | |
| "kl": 0.136962890625, | |
| "learning_rate": 2.365672313222419e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4797332286834717, | |
| "reward_std": 0.2214011587202549, | |
| "rewards/code_reward": 0.3815189450979233, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 797.7723693847656, | |
| "epoch": 0.6050955414012739, | |
| "grad_norm": 0.2716215252876282, | |
| "kl": 0.1376953125, | |
| "learning_rate": 2.351320268946749e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4847887381911278, | |
| "reward_std": 0.26064618304371834, | |
| "rewards/code_reward": 0.3861280009150505, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 790.9308319091797, | |
| "epoch": 0.6072186836518046, | |
| "grad_norm": 0.22615957260131836, | |
| "kl": 0.1334228515625, | |
| "learning_rate": 2.336984952474119e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4451970234513283, | |
| "reward_std": 0.21199724823236465, | |
| "rewards/code_reward": 0.34564343094825745, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 774.716552734375, | |
| "epoch": 0.6093418259023354, | |
| "grad_norm": 0.24061597883701324, | |
| "kl": 0.17236328125, | |
| "learning_rate": 2.322666965285697e-06, | |
| "loss": 0.0018, | |
| "reward": 0.4680435359477997, | |
| "reward_std": 0.2062854841351509, | |
| "rewards/code_reward": 0.3700524792075157, | |
| "rewards/format_reward": 0.9799107313156128, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 785.3437957763672, | |
| "epoch": 0.6114649681528662, | |
| "grad_norm": 0.2794930636882782, | |
| "kl": 0.143798828125, | |
| "learning_rate": 2.3083669081355507e-06, | |
| "loss": 0.0015, | |
| "reward": 0.41017772257328033, | |
| "reward_std": 0.1858556531369686, | |
| "rewards/code_reward": 0.31263307854533195, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 768.2902221679688, | |
| "epoch": 0.613588110403397, | |
| "grad_norm": 0.2621839940547943, | |
| "kl": 0.138427734375, | |
| "learning_rate": 2.2940853810254377e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4905528202652931, | |
| "reward_std": 0.25080636143684387, | |
| "rewards/code_reward": 0.39144565910100937, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 788.9687805175781, | |
| "epoch": 0.6157112526539278, | |
| "grad_norm": 0.25945180654525757, | |
| "kl": 0.1494140625, | |
| "learning_rate": 2.2798229831796313e-06, | |
| "loss": 0.0015, | |
| "reward": 0.43350084125995636, | |
| "reward_std": 0.1987269874662161, | |
| "rewards/code_reward": 0.3370722308754921, | |
| "rewards/format_reward": 0.9642857611179352, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 762.8616485595703, | |
| "epoch": 0.6178343949044586, | |
| "grad_norm": 0.28753146529197693, | |
| "kl": 0.146484375, | |
| "learning_rate": 2.2655803130197816e-06, | |
| "loss": 0.0015, | |
| "reward": 0.45754577219486237, | |
| "reward_std": 0.20388228073716164, | |
| "rewards/code_reward": 0.35977791622281075, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 755.8594207763672, | |
| "epoch": 0.6199575371549894, | |
| "grad_norm": 0.2792350947856903, | |
| "kl": 0.14794921875, | |
| "learning_rate": 2.2513579681398034e-06, | |
| "loss": 0.0016, | |
| "reward": 0.4282514527440071, | |
| "reward_std": 0.16725242137908936, | |
| "rewards/code_reward": 0.32959069684147835, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 744.6138763427734, | |
| "epoch": 0.6220806794055201, | |
| "grad_norm": 0.2520155608654022, | |
| "kl": 0.13720703125, | |
| "learning_rate": 2.237156545280803e-06, | |
| "loss": 0.0014, | |
| "reward": 0.44700442999601364, | |
| "reward_std": 0.21429810300469398, | |
| "rewards/code_reward": 0.34812046587467194, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 771.6897888183594, | |
| "epoch": 0.6242038216560509, | |
| "grad_norm": 0.41944995522499084, | |
| "kl": 0.22412109375, | |
| "learning_rate": 2.2229766403060403e-06, | |
| "loss": 0.0023, | |
| "reward": 0.4441903755068779, | |
| "reward_std": 0.19182176142930984, | |
| "rewards/code_reward": 0.3459760546684265, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 778.763427734375, | |
| "epoch": 0.6263269639065817, | |
| "grad_norm": 0.2801876962184906, | |
| "kl": 0.137939453125, | |
| "learning_rate": 2.2088188481759305e-06, | |
| "loss": 0.0014, | |
| "reward": 0.46992357820272446, | |
| "reward_std": 0.19111047685146332, | |
| "rewards/code_reward": 0.37103963643312454, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 778.0893096923828, | |
| "epoch": 0.6284501061571125, | |
| "grad_norm": 0.21918566524982452, | |
| "kl": 0.131103515625, | |
| "learning_rate": 2.194683762923073e-06, | |
| "loss": 0.0013, | |
| "reward": 0.4984453171491623, | |
| "reward_std": 0.22232287377119064, | |
| "rewards/code_reward": 0.40045420452952385, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 740.3951110839844, | |
| "epoch": 0.6305732484076433, | |
| "grad_norm": 0.31050121784210205, | |
| "kl": 0.1572265625, | |
| "learning_rate": 2.1805719776273387e-06, | |
| "loss": 0.0016, | |
| "reward": 0.4212986081838608, | |
| "reward_std": 0.1724853478372097, | |
| "rewards/code_reward": 0.321968249976635, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 682.138427734375, | |
| "epoch": 0.6326963906581741, | |
| "grad_norm": 0.24185748398303986, | |
| "kl": 0.17529296875, | |
| "learning_rate": 2.166484084390974e-06, | |
| "loss": 0.0019, | |
| "reward": 0.5747622847557068, | |
| "reward_std": 0.18613022193312645, | |
| "rewards/code_reward": 0.475878331810236, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 716.6518096923828, | |
| "epoch": 0.6348195329087049, | |
| "grad_norm": 0.6314132213592529, | |
| "kl": 0.166015625, | |
| "learning_rate": 2.1524206743137636e-06, | |
| "loss": 0.0017, | |
| "reward": 0.36886315792798996, | |
| "reward_std": 0.17360183410346508, | |
| "rewards/code_reward": 0.2708720788359642, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 737.8303833007812, | |
| "epoch": 0.6369426751592356, | |
| "grad_norm": 0.2968922555446625, | |
| "kl": 0.19287109375, | |
| "learning_rate": 2.1383823374682287e-06, | |
| "loss": 0.0019, | |
| "reward": 0.39945459365844727, | |
| "reward_std": 0.20941082388162613, | |
| "rewards/code_reward": 0.3014635145664215, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 718.5714721679688, | |
| "epoch": 0.6390658174097664, | |
| "grad_norm": 19.51397132873535, | |
| "kl": 0.275146484375, | |
| "learning_rate": 2.124369662874868e-06, | |
| "loss": 0.0029, | |
| "reward": 0.503417618572712, | |
| "reward_std": 0.15935716964304447, | |
| "rewards/code_reward": 0.40631940215826035, | |
| "rewards/format_reward": 0.9709821790456772, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 704.3393096923828, | |
| "epoch": 0.6411889596602972, | |
| "grad_norm": 0.35022857785224915, | |
| "kl": 0.14697265625, | |
| "learning_rate": 2.110383238477441e-06, | |
| "loss": 0.0015, | |
| "reward": 0.5569600984454155, | |
| "reward_std": 0.20704489946365356, | |
| "rewards/code_reward": 0.45785292237997055, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 702.6986846923828, | |
| "epoch": 0.643312101910828, | |
| "grad_norm": 0.17607638239860535, | |
| "kl": 0.13916015625, | |
| "learning_rate": 2.096423651118305e-06, | |
| "loss": 0.0014, | |
| "reward": 0.2535444311797619, | |
| "reward_std": 0.11278286523884162, | |
| "rewards/code_reward": 0.15466050058603287, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 701.2857360839844, | |
| "epoch": 0.6454352441613588, | |
| "grad_norm": 0.6241003274917603, | |
| "kl": 0.1826171875, | |
| "learning_rate": 2.082491486513788e-06, | |
| "loss": 0.0019, | |
| "reward": 0.5550656765699387, | |
| "reward_std": 0.21512125991284847, | |
| "rewards/code_reward": 0.45618174970149994, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 709.0379638671875, | |
| "epoch": 0.6475583864118896, | |
| "grad_norm": 0.696461021900177, | |
| "kl": 0.1435546875, | |
| "learning_rate": 2.0685873292296116e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3796486109495163, | |
| "reward_std": 0.15390164637938142, | |
| "rewards/code_reward": 0.28121111169457436, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 682.716552734375, | |
| "epoch": 0.6496815286624203, | |
| "grad_norm": 0.26720672845840454, | |
| "kl": 0.162109375, | |
| "learning_rate": 2.054711762656369e-06, | |
| "loss": 0.0016, | |
| "reward": 0.37838516384363174, | |
| "reward_std": 0.16313385590910912, | |
| "rewards/code_reward": 0.28061728924512863, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 666.8080596923828, | |
| "epoch": 0.6518046709129511, | |
| "grad_norm": 0.8882589936256409, | |
| "kl": 0.16259765625, | |
| "learning_rate": 2.040865368985044e-06, | |
| "loss": 0.0017, | |
| "reward": 0.4301592782139778, | |
| "reward_std": 0.20042868331074715, | |
| "rewards/code_reward": 0.33105212450027466, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 681.9129791259766, | |
| "epoch": 0.6539278131634819, | |
| "grad_norm": 0.23706179857254028, | |
| "kl": 0.18310546875, | |
| "learning_rate": 2.027048729182583e-06, | |
| "loss": 0.0019, | |
| "reward": 0.4861885607242584, | |
| "reward_std": 0.16966554708778858, | |
| "rewards/code_reward": 0.3881974592804909, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 693.8147583007812, | |
| "epoch": 0.6560509554140127, | |
| "grad_norm": 0.5197703242301941, | |
| "kl": 0.228271484375, | |
| "learning_rate": 2.0132624229675205e-06, | |
| "loss": 0.0024, | |
| "reward": 0.511215090751648, | |
| "reward_std": 0.18619069457054138, | |
| "rewards/code_reward": 0.4127775654196739, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 714.9777221679688, | |
| "epoch": 0.6581740976645435, | |
| "grad_norm": 0.24638721346855164, | |
| "kl": 0.189453125, | |
| "learning_rate": 1.9995070287856546e-06, | |
| "loss": 0.002, | |
| "reward": 0.5180679038167, | |
| "reward_std": 0.21345077827572823, | |
| "rewards/code_reward": 0.41963040083646774, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 708.2723388671875, | |
| "epoch": 0.6602972399150743, | |
| "grad_norm": 0.422715961933136, | |
| "kl": 0.18701171875, | |
| "learning_rate": 1.985783123785774e-06, | |
| "loss": 0.0019, | |
| "reward": 0.5620292499661446, | |
| "reward_std": 0.20659737288951874, | |
| "rewards/code_reward": 0.46314531564712524, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 668.2589569091797, | |
| "epoch": 0.6624203821656051, | |
| "grad_norm": 0.6652376055717468, | |
| "kl": 0.240478515625, | |
| "learning_rate": 1.9720912837954486e-06, | |
| "loss": 0.0025, | |
| "reward": 0.4389989897608757, | |
| "reward_std": 0.20384247601032257, | |
| "rewards/code_reward": 0.33989182114601135, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 671.5424346923828, | |
| "epoch": 0.6645435244161358, | |
| "grad_norm": 0.898223876953125, | |
| "kl": 0.25927734375, | |
| "learning_rate": 1.958432083296862e-06, | |
| "loss": 0.0026, | |
| "reward": 0.36031387001276016, | |
| "reward_std": 0.2003210037946701, | |
| "rewards/code_reward": 0.26254600286483765, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 676.5602874755859, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.7889689207077026, | |
| "kl": 0.2135009765625, | |
| "learning_rate": 1.9448060954027093e-06, | |
| "loss": 0.0022, | |
| "reward": 0.5204020366072655, | |
| "reward_std": 0.16625045239925385, | |
| "rewards/code_reward": 0.4212948679924011, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 684.9866333007812, | |
| "epoch": 0.6687898089171974, | |
| "grad_norm": 1.3564072847366333, | |
| "kl": 0.40185546875, | |
| "learning_rate": 1.931213891832153e-06, | |
| "loss": 0.0041, | |
| "reward": 0.526521772146225, | |
| "reward_std": 0.2212766855955124, | |
| "rewards/code_reward": 0.4278610572218895, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 652.8102874755859, | |
| "epoch": 0.6709129511677282, | |
| "grad_norm": 0.24422591924667358, | |
| "kl": 0.147216796875, | |
| "learning_rate": 1.9176560428868336e-06, | |
| "loss": 0.0015, | |
| "reward": 0.3931754156947136, | |
| "reward_std": 0.1695394441485405, | |
| "rewards/code_reward": 0.29473789036273956, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 687.8214569091797, | |
| "epoch": 0.673036093418259, | |
| "grad_norm": 0.4171687960624695, | |
| "kl": 0.236328125, | |
| "learning_rate": 1.9041331174269373e-06, | |
| "loss": 0.0024, | |
| "reward": 0.47731664031744003, | |
| "reward_std": 0.20429091900587082, | |
| "rewards/code_reward": 0.378879152238369, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 682.6741485595703, | |
| "epoch": 0.6751592356687898, | |
| "grad_norm": 0.9241800308227539, | |
| "kl": 0.36083984375, | |
| "learning_rate": 1.8906456828473341e-06, | |
| "loss": 0.0036, | |
| "reward": 0.5124014094471931, | |
| "reward_std": 0.21064380928874016, | |
| "rewards/code_reward": 0.4132942706346512, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 684.0759124755859, | |
| "epoch": 0.6772823779193206, | |
| "grad_norm": 0.24995659291744232, | |
| "kl": 0.14794921875, | |
| "learning_rate": 1.8771943050537656e-06, | |
| "loss": 0.0016, | |
| "reward": 0.592289388179779, | |
| "reward_std": 0.2126442939043045, | |
| "rewards/code_reward": 0.4942983016371727, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 719.6786041259766, | |
| "epoch": 0.6794055201698513, | |
| "grad_norm": 0.24401088058948517, | |
| "kl": 0.1395263671875, | |
| "learning_rate": 1.8637795484391046e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4689144790172577, | |
| "reward_std": 0.25591350346803665, | |
| "rewards/code_reward": 0.3711466044187546, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 655.4844055175781, | |
| "epoch": 0.6815286624203821, | |
| "grad_norm": 0.3457026779651642, | |
| "kl": 0.50341796875, | |
| "learning_rate": 1.8504019758596698e-06, | |
| "loss": 0.0051, | |
| "reward": 0.5521439760923386, | |
| "reward_std": 0.2452612817287445, | |
| "rewards/code_reward": 0.45326002687215805, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 714.7143096923828, | |
| "epoch": 0.6836518046709129, | |
| "grad_norm": 0.3326283395290375, | |
| "kl": 0.1953125, | |
| "learning_rate": 1.8370621486116163e-06, | |
| "loss": 0.0021, | |
| "reward": 0.5532227605581284, | |
| "reward_std": 0.18401411548256874, | |
| "rewards/code_reward": 0.4552316591143608, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 677.7634124755859, | |
| "epoch": 0.6857749469214437, | |
| "grad_norm": 0.3285404145717621, | |
| "kl": 0.23876953125, | |
| "learning_rate": 1.823760626407377e-06, | |
| "loss": 0.0025, | |
| "reward": 0.4828302264213562, | |
| "reward_std": 0.1928608939051628, | |
| "rewards/code_reward": 0.384615920484066, | |
| "rewards/format_reward": 0.9821429252624512, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 699.5982666015625, | |
| "epoch": 0.6878980891719745, | |
| "grad_norm": 0.34025460481643677, | |
| "kl": 0.224365234375, | |
| "learning_rate": 1.8104979673521838e-06, | |
| "loss": 0.0023, | |
| "reward": 0.42327145487070084, | |
| "reward_std": 0.15393321216106415, | |
| "rewards/code_reward": 0.32505714148283005, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 650.997802734375, | |
| "epoch": 0.6900212314225053, | |
| "grad_norm": 0.3025732636451721, | |
| "kl": 0.24853515625, | |
| "learning_rate": 1.7972747279206482e-06, | |
| "loss": 0.0025, | |
| "reward": 0.37195510417222977, | |
| "reward_std": 0.19180476292967796, | |
| "rewards/code_reward": 0.27418723329901695, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 692.4464569091797, | |
| "epoch": 0.692144373673036, | |
| "grad_norm": 0.2389409989118576, | |
| "kl": 0.148681640625, | |
| "learning_rate": 1.7840914629334122e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5394042208790779, | |
| "reward_std": 0.22496159374713898, | |
| "rewards/code_reward": 0.44185957312583923, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 709.0669860839844, | |
| "epoch": 0.6942675159235668, | |
| "grad_norm": 0.28394991159439087, | |
| "kl": 0.194091796875, | |
| "learning_rate": 1.7709487255338731e-06, | |
| "loss": 0.0021, | |
| "reward": 0.4636544920504093, | |
| "reward_std": 0.15700273029506207, | |
| "rewards/code_reward": 0.36633305437862873, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 702.7768249511719, | |
| "epoch": 0.6963906581740976, | |
| "grad_norm": 0.22292962670326233, | |
| "kl": 0.17431640625, | |
| "learning_rate": 1.7578470671649684e-06, | |
| "loss": 0.0019, | |
| "reward": 0.4268321394920349, | |
| "reward_std": 0.1670057326555252, | |
| "rewards/code_reward": 0.32928748056292534, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 694.5937805175781, | |
| "epoch": 0.6985138004246284, | |
| "grad_norm": 0.782927393913269, | |
| "kl": 0.3328857421875, | |
| "learning_rate": 1.744787037546045e-06, | |
| "loss": 0.0034, | |
| "reward": 0.46113383024930954, | |
| "reward_std": 0.18688062392175198, | |
| "rewards/code_reward": 0.3626963049173355, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 706.1986999511719, | |
| "epoch": 0.7006369426751592, | |
| "grad_norm": 0.41430673003196716, | |
| "kl": 0.1827392578125, | |
| "learning_rate": 1.731769184649788e-06, | |
| "loss": 0.0019, | |
| "reward": 0.5658792853355408, | |
| "reward_std": 0.23742860183119774, | |
| "rewards/code_reward": 0.4683346152305603, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 694.9576416015625, | |
| "epoch": 0.70276008492569, | |
| "grad_norm": 0.6622937917709351, | |
| "kl": 0.214111328125, | |
| "learning_rate": 1.7187940546792325e-06, | |
| "loss": 0.0022, | |
| "reward": 0.4137548431754112, | |
| "reward_std": 0.1334713213145733, | |
| "rewards/code_reward": 0.3155405670404434, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 716.8393249511719, | |
| "epoch": 0.7048832271762208, | |
| "grad_norm": 0.22396574914455414, | |
| "kl": 0.2607421875, | |
| "learning_rate": 1.7058621920448465e-06, | |
| "loss": 0.0027, | |
| "reward": 0.4444565996527672, | |
| "reward_std": 0.18423740193247795, | |
| "rewards/code_reward": 0.34646550565958023, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 703.0937805175781, | |
| "epoch": 0.7070063694267515, | |
| "grad_norm": 0.2483583688735962, | |
| "kl": 0.160888671875, | |
| "learning_rate": 1.6929741393416855e-06, | |
| "loss": 0.0016, | |
| "reward": 0.47170016914606094, | |
| "reward_std": 0.18039512634277344, | |
| "rewards/code_reward": 0.37393229454755783, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 755.0312805175781, | |
| "epoch": 0.7091295116772823, | |
| "grad_norm": 0.4338986873626709, | |
| "kl": 0.357177734375, | |
| "learning_rate": 1.6801304373266286e-06, | |
| "loss": 0.0036, | |
| "reward": 0.4291260167956352, | |
| "reward_std": 0.15267430432140827, | |
| "rewards/code_reward": 0.3318046070635319, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 767.3214569091797, | |
| "epoch": 0.7112526539278131, | |
| "grad_norm": 0.21925950050354004, | |
| "kl": 0.137451171875, | |
| "learning_rate": 1.667331624895689e-06, | |
| "loss": 0.0014, | |
| "reward": 0.4992447942495346, | |
| "reward_std": 0.21635426208376884, | |
| "rewards/code_reward": 0.4014769196510315, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 750.1696929931641, | |
| "epoch": 0.7133757961783439, | |
| "grad_norm": 0.30118319392204285, | |
| "kl": 0.359619140625, | |
| "learning_rate": 1.6545782390614037e-06, | |
| "loss": 0.0037, | |
| "reward": 0.4922778084874153, | |
| "reward_std": 0.1726557295769453, | |
| "rewards/code_reward": 0.39317065104842186, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 718.6339721679688, | |
| "epoch": 0.7154989384288747, | |
| "grad_norm": 0.41911348700523376, | |
| "kl": 0.317626953125, | |
| "learning_rate": 1.6418708149302992e-06, | |
| "loss": 0.0033, | |
| "reward": 0.44379642605781555, | |
| "reward_std": 0.19296832010149956, | |
| "rewards/code_reward": 0.3451356738805771, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 694.1138610839844, | |
| "epoch": 0.7176220806794055, | |
| "grad_norm": 0.7091541886329651, | |
| "kl": 0.27783203125, | |
| "learning_rate": 1.6292098856804423e-06, | |
| "loss": 0.0028, | |
| "reward": 0.4443873465061188, | |
| "reward_std": 0.19508511200547218, | |
| "rewards/code_reward": 0.3468426913022995, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 720.607177734375, | |
| "epoch": 0.7197452229299363, | |
| "grad_norm": 0.6043697595596313, | |
| "kl": 0.3173828125, | |
| "learning_rate": 1.6165959825390661e-06, | |
| "loss": 0.0033, | |
| "reward": 0.43542125821113586, | |
| "reward_std": 0.16308805532753468, | |
| "rewards/code_reward": 0.33720696344971657, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 706.1317291259766, | |
| "epoch": 0.721868365180467, | |
| "grad_norm": 0.2581160068511963, | |
| "kl": 0.2353515625, | |
| "learning_rate": 1.604029634760284e-06, | |
| "loss": 0.0025, | |
| "reward": 0.5382986813783646, | |
| "reward_std": 0.14037772081792355, | |
| "rewards/code_reward": 0.4403075948357582, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 737.5469055175781, | |
| "epoch": 0.7239915074309978, | |
| "grad_norm": 0.4556562006473541, | |
| "kl": 0.368408203125, | |
| "learning_rate": 1.59151136960288e-06, | |
| "loss": 0.0037, | |
| "reward": 0.538652278482914, | |
| "reward_std": 0.20831965655088425, | |
| "rewards/code_reward": 0.44133080542087555, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 723.9486999511719, | |
| "epoch": 0.7261146496815286, | |
| "grad_norm": 0.2620218098163605, | |
| "kl": 0.159912109375, | |
| "learning_rate": 1.5790417123081903e-06, | |
| "loss": 0.0017, | |
| "reward": 0.45855508744716644, | |
| "reward_std": 0.1777043156325817, | |
| "rewards/code_reward": 0.3605640158057213, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 686.8236999511719, | |
| "epoch": 0.7282377919320594, | |
| "grad_norm": 0.2753090560436249, | |
| "kl": 0.16455078125, | |
| "learning_rate": 1.5666211860780583e-06, | |
| "loss": 0.0018, | |
| "reward": 0.5850269198417664, | |
| "reward_std": 0.19610749557614326, | |
| "rewards/code_reward": 0.4870358556509018, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 684.1205749511719, | |
| "epoch": 0.7303609341825902, | |
| "grad_norm": 0.23944684863090515, | |
| "kl": 0.16455078125, | |
| "learning_rate": 1.5542503120528918e-06, | |
| "loss": 0.0017, | |
| "reward": 0.5332599207758904, | |
| "reward_std": 0.2457549162209034, | |
| "rewards/code_reward": 0.43437594920396805, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 720.3839569091797, | |
| "epoch": 0.732484076433121, | |
| "grad_norm": 0.31666672229766846, | |
| "kl": 0.213134765625, | |
| "learning_rate": 1.5419296092897866e-06, | |
| "loss": 0.0022, | |
| "reward": 0.5879708528518677, | |
| "reward_std": 0.24002529680728912, | |
| "rewards/code_reward": 0.4899797812104225, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 693.6964569091797, | |
| "epoch": 0.7346072186836518, | |
| "grad_norm": 0.24176108837127686, | |
| "kl": 0.15869140625, | |
| "learning_rate": 1.529659594740755e-06, | |
| "loss": 0.0016, | |
| "reward": 0.4276282340288162, | |
| "reward_std": 0.20496541634202003, | |
| "rewards/code_reward": 0.32896753773093224, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 704.2902221679688, | |
| "epoch": 0.7367303609341825, | |
| "grad_norm": 0.2568061351776123, | |
| "kl": 0.15771484375, | |
| "learning_rate": 1.5174407832310338e-06, | |
| "loss": 0.0016, | |
| "reward": 0.39445348642766476, | |
| "reward_std": 0.13825338683091104, | |
| "rewards/code_reward": 0.2962391600012779, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 722.2545013427734, | |
| "epoch": 0.7388535031847133, | |
| "grad_norm": 0.49012815952301025, | |
| "kl": 0.17578125, | |
| "learning_rate": 1.5052736874374815e-06, | |
| "loss": 0.0018, | |
| "reward": 0.488083653151989, | |
| "reward_std": 0.1927042007446289, | |
| "rewards/code_reward": 0.39009255915880203, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 713.3147583007812, | |
| "epoch": 0.7409766454352441, | |
| "grad_norm": 0.6304606795310974, | |
| "kl": 0.29345703125, | |
| "learning_rate": 1.4931588178670695e-06, | |
| "loss": 0.003, | |
| "reward": 0.4815641790628433, | |
| "reward_std": 0.16962832398712635, | |
| "rewards/code_reward": 0.38357311114668846, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 700.3214569091797, | |
| "epoch": 0.7430997876857749, | |
| "grad_norm": 0.43463101983070374, | |
| "kl": 0.289306640625, | |
| "learning_rate": 1.4810966828354605e-06, | |
| "loss": 0.0029, | |
| "reward": 0.45994506776332855, | |
| "reward_std": 0.1931474320590496, | |
| "rewards/code_reward": 0.36173076555132866, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 685.1384124755859, | |
| "epoch": 0.7452229299363057, | |
| "grad_norm": 0.34815892577171326, | |
| "kl": 0.44140625, | |
| "learning_rate": 1.469087788445684e-06, | |
| "loss": 0.0045, | |
| "reward": 0.5396069064736366, | |
| "reward_std": 0.20336921885609627, | |
| "rewards/code_reward": 0.44250866025686264, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 698.6071929931641, | |
| "epoch": 0.7473460721868365, | |
| "grad_norm": 0.3489153981208801, | |
| "kl": 0.533447265625, | |
| "learning_rate": 1.4571326385668965e-06, | |
| "loss": 0.0055, | |
| "reward": 0.6215780973434448, | |
| "reward_std": 0.202628992497921, | |
| "rewards/code_reward": 0.5229173377156258, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 713.6897583007812, | |
| "epoch": 0.7494692144373672, | |
| "grad_norm": 0.2902304232120514, | |
| "kl": 0.160400390625, | |
| "learning_rate": 1.4452317348132434e-06, | |
| "loss": 0.0018, | |
| "reward": 0.43891899287700653, | |
| "reward_std": 0.1397520825266838, | |
| "rewards/code_reward": 0.3393654003739357, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 706.091552734375, | |
| "epoch": 0.7515923566878981, | |
| "grad_norm": 0.7335183024406433, | |
| "kl": 0.34814453125, | |
| "learning_rate": 1.4333855765228104e-06, | |
| "loss": 0.0037, | |
| "reward": 0.6451611816883087, | |
| "reward_std": 0.20771214738488197, | |
| "rewards/code_reward": 0.5465004742145538, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 712.1607513427734, | |
| "epoch": 0.7537154989384289, | |
| "grad_norm": 0.7572880387306213, | |
| "kl": 0.3447265625, | |
| "learning_rate": 1.421594660736675e-06, | |
| "loss": 0.0035, | |
| "reward": 0.41940218955278397, | |
| "reward_std": 0.1921430230140686, | |
| "rewards/code_reward": 0.3202950209379196, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 680.1986999511719, | |
| "epoch": 0.7558386411889597, | |
| "grad_norm": 0.3940925896167755, | |
| "kl": 0.549560546875, | |
| "learning_rate": 1.4098594821780476e-06, | |
| "loss": 0.0056, | |
| "reward": 0.6083894520998001, | |
| "reward_std": 0.1597061362117529, | |
| "rewards/code_reward": 0.5108448341488838, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 665.372802734375, | |
| "epoch": 0.7579617834394905, | |
| "grad_norm": 0.2566499710083008, | |
| "kl": 0.192138671875, | |
| "learning_rate": 1.3981805332315174e-06, | |
| "loss": 0.002, | |
| "reward": 0.4351358078420162, | |
| "reward_std": 0.1653740406036377, | |
| "rewards/code_reward": 0.3360286522656679, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 732.7589569091797, | |
| "epoch": 0.7600849256900213, | |
| "grad_norm": 0.35650861263275146, | |
| "kl": 0.250732421875, | |
| "learning_rate": 1.3865583039223929e-06, | |
| "loss": 0.0026, | |
| "reward": 0.5535444989800453, | |
| "reward_std": 0.17830567993223667, | |
| "rewards/code_reward": 0.4555533789098263, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 708.0424346923828, | |
| "epoch": 0.7622080679405521, | |
| "grad_norm": 0.24273599684238434, | |
| "kl": 0.1611328125, | |
| "learning_rate": 1.374993281896137e-06, | |
| "loss": 0.0017, | |
| "reward": 0.44518817216157913, | |
| "reward_std": 0.19435212016105652, | |
| "rewards/code_reward": 0.34697388112545013, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 765.1786041259766, | |
| "epoch": 0.7643312101910829, | |
| "grad_norm": 0.3510468304157257, | |
| "kl": 0.197021484375, | |
| "learning_rate": 1.3634859523979134e-06, | |
| "loss": 0.002, | |
| "reward": 0.47114741802215576, | |
| "reward_std": 0.1812426745891571, | |
| "rewards/code_reward": 0.3724866919219494, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 724.216552734375, | |
| "epoch": 0.7664543524416136, | |
| "grad_norm": 1.1458288431167603, | |
| "kl": 0.52978515625, | |
| "learning_rate": 1.3520367982522208e-06, | |
| "loss": 0.0053, | |
| "reward": 0.45792729407548904, | |
| "reward_std": 0.16464052349328995, | |
| "rewards/code_reward": 0.35926656424999237, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 705.4464721679688, | |
| "epoch": 0.7685774946921444, | |
| "grad_norm": 0.4750509560108185, | |
| "kl": 0.23779296875, | |
| "learning_rate": 1.3406462998426358e-06, | |
| "loss": 0.0024, | |
| "reward": 0.5133348107337952, | |
| "reward_std": 0.24053634703159332, | |
| "rewards/code_reward": 0.41445086151361465, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 743.1853179931641, | |
| "epoch": 0.7707006369426752, | |
| "grad_norm": 0.2608552575111389, | |
| "kl": 0.325927734375, | |
| "learning_rate": 1.3293149350916595e-06, | |
| "loss": 0.0033, | |
| "reward": 0.5553034171462059, | |
| "reward_std": 0.19487734138965607, | |
| "rewards/code_reward": 0.45731230080127716, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 678.2209930419922, | |
| "epoch": 0.772823779193206, | |
| "grad_norm": 0.22239775955677032, | |
| "kl": 0.13037109375, | |
| "learning_rate": 1.3180431794406623e-06, | |
| "loss": 0.0015, | |
| "reward": 0.6062557250261307, | |
| "reward_std": 0.2048381306231022, | |
| "rewards/code_reward": 0.5069253593683243, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 707.4620971679688, | |
| "epoch": 0.7749469214437368, | |
| "grad_norm": 0.4696608781814575, | |
| "kl": 0.270751953125, | |
| "learning_rate": 1.3068315058299358e-06, | |
| "loss": 0.0029, | |
| "reward": 0.5663170740008354, | |
| "reward_std": 0.15939603559672832, | |
| "rewards/code_reward": 0.4678795412182808, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 653.2879791259766, | |
| "epoch": 0.7770700636942676, | |
| "grad_norm": 1.1559607982635498, | |
| "kl": 0.3037109375, | |
| "learning_rate": 1.2956803846788503e-06, | |
| "loss": 0.0032, | |
| "reward": 0.618221327662468, | |
| "reward_std": 0.22959138825535774, | |
| "rewards/code_reward": 0.5193373411893845, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 731.2076263427734, | |
| "epoch": 0.7791932059447984, | |
| "grad_norm": 0.48825645446777344, | |
| "kl": 0.210693359375, | |
| "learning_rate": 1.284590283866116e-06, | |
| "loss": 0.0021, | |
| "reward": 0.33228749781847, | |
| "reward_std": 0.15970432199537754, | |
| "rewards/code_reward": 0.2345196194946766, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 695.4576110839844, | |
| "epoch": 0.7813163481953291, | |
| "grad_norm": 1.4041056632995605, | |
| "kl": 0.1883544921875, | |
| "learning_rate": 1.2735616687101518e-06, | |
| "loss": 0.002, | |
| "reward": 0.40882231295108795, | |
| "reward_std": 0.16854364797472954, | |
| "rewards/code_reward": 0.3103848248720169, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 696.4687805175781, | |
| "epoch": 0.7834394904458599, | |
| "grad_norm": 1.9169604778289795, | |
| "kl": 0.201171875, | |
| "learning_rate": 1.2625950019495614e-06, | |
| "loss": 0.0021, | |
| "reward": 0.5380031913518906, | |
| "reward_std": 0.1728157363831997, | |
| "rewards/code_reward": 0.4400121048092842, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 709.4598693847656, | |
| "epoch": 0.7855626326963907, | |
| "grad_norm": 0.3797023594379425, | |
| "kl": 0.1640625, | |
| "learning_rate": 1.251690743723718e-06, | |
| "loss": 0.0017, | |
| "reward": 0.5747079327702522, | |
| "reward_std": 0.24513645470142365, | |
| "rewards/code_reward": 0.4767168238759041, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 647.7209930419922, | |
| "epoch": 0.7876857749469215, | |
| "grad_norm": 0.24551738798618317, | |
| "kl": 0.150390625, | |
| "learning_rate": 1.2408493515534581e-06, | |
| "loss": 0.0016, | |
| "reward": 0.6943890303373337, | |
| "reward_std": 0.22319162264466286, | |
| "rewards/code_reward": 0.5959515273571014, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 692.2098693847656, | |
| "epoch": 0.7898089171974523, | |
| "grad_norm": 0.4829825460910797, | |
| "kl": 0.406005859375, | |
| "learning_rate": 1.2300712803218834e-06, | |
| "loss": 0.0042, | |
| "reward": 0.5234424099326134, | |
| "reward_std": 0.1910531185567379, | |
| "rewards/code_reward": 0.42388884723186493, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 697.3036041259766, | |
| "epoch": 0.7919320594479831, | |
| "grad_norm": 114.25981140136719, | |
| "kl": 16.0146484375, | |
| "learning_rate": 1.2193569822552772e-06, | |
| "loss": 0.1608, | |
| "reward": 0.559485673904419, | |
| "reward_std": 0.20258497074246407, | |
| "rewards/code_reward": 0.4606017544865608, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 677.7567291259766, | |
| "epoch": 0.7940552016985138, | |
| "grad_norm": 0.3005722761154175, | |
| "kl": 0.171875, | |
| "learning_rate": 1.2087069069041268e-06, | |
| "loss": 0.0018, | |
| "reward": 0.5883411467075348, | |
| "reward_std": 0.21694539301097393, | |
| "rewards/code_reward": 0.4901268184185028, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 671.1495819091797, | |
| "epoch": 0.7961783439490446, | |
| "grad_norm": 0.6558151841163635, | |
| "kl": 0.162841796875, | |
| "learning_rate": 1.1981215011242654e-06, | |
| "loss": 0.0017, | |
| "reward": 0.5491671711206436, | |
| "reward_std": 0.2353355698287487, | |
| "rewards/code_reward": 0.45050643384456635, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 663.4486999511719, | |
| "epoch": 0.7983014861995754, | |
| "grad_norm": 1.0574246644973755, | |
| "kl": 0.168701171875, | |
| "learning_rate": 1.1876012090581184e-06, | |
| "loss": 0.0018, | |
| "reward": 0.523729532957077, | |
| "reward_std": 0.19741250574588776, | |
| "rewards/code_reward": 0.42573845386505127, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 678.5826110839844, | |
| "epoch": 0.8004246284501062, | |
| "grad_norm": 0.28517383337020874, | |
| "kl": 0.168212890625, | |
| "learning_rate": 1.177146472116071e-06, | |
| "loss": 0.0018, | |
| "reward": 0.4997348487377167, | |
| "reward_std": 0.16867511346936226, | |
| "rewards/code_reward": 0.40196699649095535, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 725.0000457763672, | |
| "epoch": 0.802547770700637, | |
| "grad_norm": 0.38322436809539795, | |
| "kl": 0.176025390625, | |
| "learning_rate": 1.1667577289579462e-06, | |
| "loss": 0.0018, | |
| "reward": 0.43969085440039635, | |
| "reward_std": 0.16067362390458584, | |
| "rewards/code_reward": 0.3425925988703966, | |
| "rewards/format_reward": 0.9709821790456772, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 671.4710083007812, | |
| "epoch": 0.8046709129511678, | |
| "grad_norm": 0.24044837057590485, | |
| "kl": 0.1435546875, | |
| "learning_rate": 1.1564354154746007e-06, | |
| "loss": 0.0015, | |
| "reward": 0.5779925882816315, | |
| "reward_std": 0.22314922511577606, | |
| "rewards/code_reward": 0.479331873357296, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 701.1875457763672, | |
| "epoch": 0.8067940552016986, | |
| "grad_norm": 0.2769814729690552, | |
| "kl": 0.187255859375, | |
| "learning_rate": 1.146179964769635e-06, | |
| "loss": 0.002, | |
| "reward": 0.5813698992133141, | |
| "reward_std": 0.21280257403850555, | |
| "rewards/code_reward": 0.48315558582544327, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 703.2701263427734, | |
| "epoch": 0.8089171974522293, | |
| "grad_norm": 0.43315884470939636, | |
| "kl": 0.28125, | |
| "learning_rate": 1.1359918071412195e-06, | |
| "loss": 0.003, | |
| "reward": 0.5584300383925438, | |
| "reward_std": 0.17897445522248745, | |
| "rewards/code_reward": 0.4595461040735245, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 680.9174499511719, | |
| "epoch": 0.8110403397027601, | |
| "grad_norm": 0.3025217652320862, | |
| "kl": 0.208251953125, | |
| "learning_rate": 1.1258713700640456e-06, | |
| "loss": 0.0022, | |
| "reward": 0.47092022001743317, | |
| "reward_std": 0.1665214579552412, | |
| "rewards/code_reward": 0.3727059066295624, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 672.3705596923828, | |
| "epoch": 0.8131634819532909, | |
| "grad_norm": 0.23662854731082916, | |
| "kl": 0.1478271484375, | |
| "learning_rate": 1.115819078171383e-06, | |
| "loss": 0.0016, | |
| "reward": 0.5290590599179268, | |
| "reward_std": 0.21020712330937386, | |
| "rewards/code_reward": 0.4312911853194237, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 659.2678833007812, | |
| "epoch": 0.8152866242038217, | |
| "grad_norm": 0.2239212840795517, | |
| "kl": 0.1688232421875, | |
| "learning_rate": 1.1058353532372667e-06, | |
| "loss": 0.0018, | |
| "reward": 0.5600069090723991, | |
| "reward_std": 0.20570005849003792, | |
| "rewards/code_reward": 0.46067656576633453, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 688.6205596923828, | |
| "epoch": 0.8174097664543525, | |
| "grad_norm": 0.25939956307411194, | |
| "kl": 0.156982421875, | |
| "learning_rate": 1.0959206141587998e-06, | |
| "loss": 0.0016, | |
| "reward": 0.461281917989254, | |
| "reward_std": 0.2138805352151394, | |
| "rewards/code_reward": 0.36329086124897003, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 689.8527069091797, | |
| "epoch": 0.8195329087048833, | |
| "grad_norm": 0.564179003238678, | |
| "kl": 0.34716796875, | |
| "learning_rate": 1.0860752769385766e-06, | |
| "loss": 0.0035, | |
| "reward": 0.5820841789245605, | |
| "reward_std": 0.23867543786764145, | |
| "rewards/code_reward": 0.48320019245147705, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 716.8995971679688, | |
| "epoch": 0.821656050955414, | |
| "grad_norm": 0.31268319487571716, | |
| "kl": 0.2451171875, | |
| "learning_rate": 1.0762997546672279e-06, | |
| "loss": 0.0026, | |
| "reward": 0.24600705318152905, | |
| "reward_std": 0.06653665285557508, | |
| "rewards/code_reward": 0.14823918044567108, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 661.8973388671875, | |
| "epoch": 0.8237791932059448, | |
| "grad_norm": 0.23703983426094055, | |
| "kl": 0.139892578125, | |
| "learning_rate": 1.0665944575060914e-06, | |
| "loss": 0.0015, | |
| "reward": 0.5530121028423309, | |
| "reward_std": 0.2014228142797947, | |
| "rewards/code_reward": 0.45368169248104095, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 671.0468902587891, | |
| "epoch": 0.8259023354564756, | |
| "grad_norm": 0.21562151610851288, | |
| "kl": 0.14697265625, | |
| "learning_rate": 1.056959792669997e-06, | |
| "loss": 0.0016, | |
| "reward": 0.6221778392791748, | |
| "reward_std": 0.17795583605766296, | |
| "rewards/code_reward": 0.5246331766247749, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 707.1830749511719, | |
| "epoch": 0.8280254777070064, | |
| "grad_norm": 0.25027066469192505, | |
| "kl": 0.15234375, | |
| "learning_rate": 1.0473961644101856e-06, | |
| "loss": 0.0016, | |
| "reward": 0.49339308589696884, | |
| "reward_std": 0.1599120758473873, | |
| "rewards/code_reward": 0.39450912177562714, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 724.7522583007812, | |
| "epoch": 0.8301486199575372, | |
| "grad_norm": 0.2350330352783203, | |
| "kl": 0.193603515625, | |
| "learning_rate": 1.037903973997345e-06, | |
| "loss": 0.0021, | |
| "reward": 0.478931725025177, | |
| "reward_std": 0.12047621235251427, | |
| "rewards/code_reward": 0.3804941847920418, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 702.982177734375, | |
| "epoch": 0.832271762208068, | |
| "grad_norm": 0.3609310984611511, | |
| "kl": 0.179931640625, | |
| "learning_rate": 1.0284836197047737e-06, | |
| "loss": 0.0019, | |
| "reward": 0.44246046990156174, | |
| "reward_std": 0.1557149738073349, | |
| "rewards/code_reward": 0.3444693833589554, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 674.607177734375, | |
| "epoch": 0.8343949044585988, | |
| "grad_norm": 0.5464503765106201, | |
| "kl": 0.248046875, | |
| "learning_rate": 1.0191354967916712e-06, | |
| "loss": 0.0026, | |
| "reward": 0.5180330500006676, | |
| "reward_std": 0.1834750883281231, | |
| "rewards/code_reward": 0.4193723499774933, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 684.0937805175781, | |
| "epoch": 0.8365180467091295, | |
| "grad_norm": 0.23662471771240234, | |
| "kl": 0.1290283203125, | |
| "learning_rate": 1.0098599974865515e-06, | |
| "loss": 0.0014, | |
| "reward": 0.5139395222067833, | |
| "reward_std": 0.1551931146532297, | |
| "rewards/code_reward": 0.41594842076301575, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 692.5580596923828, | |
| "epoch": 0.8386411889596603, | |
| "grad_norm": 0.34932953119277954, | |
| "kl": 0.154296875, | |
| "learning_rate": 1.0006575109707898e-06, | |
| "loss": 0.0017, | |
| "reward": 0.5320730581879616, | |
| "reward_std": 0.205118702724576, | |
| "rewards/code_reward": 0.43318910896778107, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 678.8571624755859, | |
| "epoch": 0.8407643312101911, | |
| "grad_norm": 0.5195670127868652, | |
| "kl": 0.1474609375, | |
| "learning_rate": 9.915284233622877e-07, | |
| "loss": 0.0016, | |
| "reward": 0.4320642352104187, | |
| "reward_std": 0.18216058425605297, | |
| "rewards/code_reward": 0.33362672477960587, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 706.8861999511719, | |
| "epoch": 0.8428874734607219, | |
| "grad_norm": 0.24882346391677856, | |
| "kl": 0.148681640625, | |
| "learning_rate": 9.824731176992796e-07, | |
| "loss": 0.0016, | |
| "reward": 0.5600469708442688, | |
| "reward_std": 0.16885506361722946, | |
| "rewards/code_reward": 0.4616094380617142, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 669.0491333007812, | |
| "epoch": 0.8450106157112527, | |
| "grad_norm": 1.0406914949417114, | |
| "kl": 0.364013671875, | |
| "learning_rate": 9.734919739242543e-07, | |
| "loss": 0.0037, | |
| "reward": 0.5749830156564713, | |
| "reward_std": 0.21774039044976234, | |
| "rewards/code_reward": 0.47676874697208405, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 723.325927734375, | |
| "epoch": 0.8471337579617835, | |
| "grad_norm": 0.5013810396194458, | |
| "kl": 0.1451416015625, | |
| "learning_rate": 9.645853688680177e-07, | |
| "loss": 0.0016, | |
| "reward": 0.5728159248828888, | |
| "reward_std": 0.1670310366898775, | |
| "rewards/code_reward": 0.4746016263961792, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 700.310302734375, | |
| "epoch": 0.8492569002123143, | |
| "grad_norm": 0.8073310852050781, | |
| "kl": 0.2965087890625, | |
| "learning_rate": 9.557536762338786e-07, | |
| "loss": 0.003, | |
| "reward": 0.492939718067646, | |
| "reward_std": 0.2011387124657631, | |
| "rewards/code_reward": 0.39494864642620087, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 693.0536041259766, | |
| "epoch": 0.851380042462845, | |
| "grad_norm": 0.3889514207839966, | |
| "kl": 0.164306640625, | |
| "learning_rate": 9.46997266581973e-07, | |
| "loss": 0.0018, | |
| "reward": 0.5752345323562622, | |
| "reward_std": 0.19828381016850471, | |
| "rewards/code_reward": 0.475680947303772, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 706.841552734375, | |
| "epoch": 0.8535031847133758, | |
| "grad_norm": 4.799881458282471, | |
| "kl": 0.4912109375, | |
| "learning_rate": 9.383165073137115e-07, | |
| "loss": 0.0051, | |
| "reward": 0.5113906338810921, | |
| "reward_std": 0.14735013246536255, | |
| "rewards/code_reward": 0.41295309364795685, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 691.2210235595703, | |
| "epoch": 0.8556263269639066, | |
| "grad_norm": 3.541896104812622, | |
| "kl": 0.14697265625, | |
| "learning_rate": 9.297117626563687e-07, | |
| "loss": 0.0016, | |
| "reward": 0.6038797795772552, | |
| "reward_std": 0.18652482330799103, | |
| "rewards/code_reward": 0.5065583363175392, | |
| "rewards/format_reward": 0.973214328289032, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 725.4933471679688, | |
| "epoch": 0.8577494692144374, | |
| "grad_norm": 158.1067352294922, | |
| "kl": 18.90283203125, | |
| "learning_rate": 9.211833936477957e-07, | |
| "loss": 0.1896, | |
| "reward": 0.5942443758249283, | |
| "reward_std": 0.12594054080545902, | |
| "rewards/code_reward": 0.4960300847887993, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 717.1585235595703, | |
| "epoch": 0.8598726114649682, | |
| "grad_norm": 2265.1884765625, | |
| "kl": 230.10986328125, | |
| "learning_rate": 9.127317581212753e-07, | |
| "loss": 2.3015, | |
| "reward": 0.53834218531847, | |
| "reward_std": 0.1464555226266384, | |
| "rewards/code_reward": 0.4394582211971283, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 727.5826110839844, | |
| "epoch": 0.861995753715499, | |
| "grad_norm": 0.2873145341873169, | |
| "kl": 0.1866455078125, | |
| "learning_rate": 9.043572106905084e-07, | |
| "loss": 0.0019, | |
| "reward": 0.5367319211363792, | |
| "reward_std": 0.17168255895376205, | |
| "rewards/code_reward": 0.43851763010025024, | |
| "rewards/format_reward": 0.98214291036129, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 726.1674499511719, | |
| "epoch": 0.8641188959660298, | |
| "grad_norm": 0.2757129371166229, | |
| "kl": 0.1365966796875, | |
| "learning_rate": 8.960601027347321e-07, | |
| "loss": 0.0014, | |
| "reward": 0.5360690876841545, | |
| "reward_std": 0.2111339271068573, | |
| "rewards/code_reward": 0.4367387220263481, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 708.4241333007812, | |
| "epoch": 0.8662420382165605, | |
| "grad_norm": 1.7461967468261719, | |
| "kl": 0.15234375, | |
| "learning_rate": 8.878407823839788e-07, | |
| "loss": 0.0016, | |
| "reward": 0.4714769721031189, | |
| "reward_std": 0.17892321571707726, | |
| "rewards/code_reward": 0.3723698630928993, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 722.7344207763672, | |
| "epoch": 0.8683651804670913, | |
| "grad_norm": 1.359683632850647, | |
| "kl": 0.1497802734375, | |
| "learning_rate": 8.796995945044689e-07, | |
| "loss": 0.0017, | |
| "reward": 0.5647559985518456, | |
| "reward_std": 0.16498099640011787, | |
| "rewards/code_reward": 0.4656488224864006, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 758.9174499511719, | |
| "epoch": 0.8704883227176221, | |
| "grad_norm": 0.34433820843696594, | |
| "kl": 0.12939453125, | |
| "learning_rate": 8.716368806841405e-07, | |
| "loss": 0.0013, | |
| "reward": 0.40852154791355133, | |
| "reward_std": 0.19776060804724693, | |
| "rewards/code_reward": 0.30919117480516434, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 730.169677734375, | |
| "epoch": 0.8726114649681529, | |
| "grad_norm": 0.43445339798927307, | |
| "kl": 0.132080078125, | |
| "learning_rate": 8.636529792183171e-07, | |
| "loss": 0.0014, | |
| "reward": 0.5396310985088348, | |
| "reward_std": 0.19617567211389542, | |
| "rewards/code_reward": 0.44097036868333817, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 717.8705596923828, | |
| "epoch": 0.8747346072186837, | |
| "grad_norm": 0.5580800771713257, | |
| "kl": 0.192138671875, | |
| "learning_rate": 8.557482250955144e-07, | |
| "loss": 0.002, | |
| "reward": 0.4667212590575218, | |
| "reward_std": 0.20506682246923447, | |
| "rewards/code_reward": 0.36850695312023163, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 701.482177734375, | |
| "epoch": 0.8768577494692145, | |
| "grad_norm": 0.33230528235435486, | |
| "kl": 0.150146484375, | |
| "learning_rate": 8.479229499833844e-07, | |
| "loss": 0.0015, | |
| "reward": 0.5547576695680618, | |
| "reward_std": 0.21152211725711823, | |
| "rewards/code_reward": 0.4558737352490425, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 704.0469055175781, | |
| "epoch": 0.8789808917197452, | |
| "grad_norm": 0.3372839093208313, | |
| "kl": 0.1534423828125, | |
| "learning_rate": 8.401774822147976e-07, | |
| "loss": 0.0016, | |
| "reward": 0.5494333058595657, | |
| "reward_std": 0.24079465121030807, | |
| "rewards/code_reward": 0.4505493566393852, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 723.966552734375, | |
| "epoch": 0.881104033970276, | |
| "grad_norm": 0.4163219630718231, | |
| "kl": 0.26123046875, | |
| "learning_rate": 8.325121467740695e-07, | |
| "loss": 0.0026, | |
| "reward": 0.3951665982604027, | |
| "reward_std": 0.18642807379364967, | |
| "rewards/code_reward": 0.29628264531493187, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 736.9129943847656, | |
| "epoch": 0.8832271762208068, | |
| "grad_norm": 0.6581453084945679, | |
| "kl": 0.18310546875, | |
| "learning_rate": 8.249272652833226e-07, | |
| "loss": 0.0018, | |
| "reward": 0.4613909646868706, | |
| "reward_std": 0.14806298539042473, | |
| "rewards/code_reward": 0.3633998855948448, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 712.9754791259766, | |
| "epoch": 0.8853503184713376, | |
| "grad_norm": 1.2168219089508057, | |
| "kl": 0.2080078125, | |
| "learning_rate": 8.174231559889931e-07, | |
| "loss": 0.0021, | |
| "reward": 0.44138607382774353, | |
| "reward_std": 0.22260471060872078, | |
| "rewards/code_reward": 0.34317177161574364, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 711.1049346923828, | |
| "epoch": 0.8874734607218684, | |
| "grad_norm": 1.5622974634170532, | |
| "kl": 0.21630859375, | |
| "learning_rate": 8.100001337484787e-07, | |
| "loss": 0.0022, | |
| "reward": 0.5736604407429695, | |
| "reward_std": 0.20455688051879406, | |
| "rewards/code_reward": 0.4747764840722084, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 729.3861999511719, | |
| "epoch": 0.8895966029723992, | |
| "grad_norm": 0.5059235095977783, | |
| "kl": 0.16796875, | |
| "learning_rate": 8.026585100169251e-07, | |
| "loss": 0.0017, | |
| "reward": 0.4245912581682205, | |
| "reward_std": 0.151387682184577, | |
| "rewards/code_reward": 0.32637695223093033, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 690.7187652587891, | |
| "epoch": 0.89171974522293, | |
| "grad_norm": 6.739729881286621, | |
| "kl": 2.8837890625, | |
| "learning_rate": 7.953985928341601e-07, | |
| "loss": 0.0289, | |
| "reward": 0.5304828435182571, | |
| "reward_std": 0.157493332400918, | |
| "rewards/code_reward": 0.4313756823539734, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 694.5335083007812, | |
| "epoch": 0.8938428874734607, | |
| "grad_norm": 0.45631542801856995, | |
| "kl": 0.1708984375, | |
| "learning_rate": 7.882206868117693e-07, | |
| "loss": 0.0018, | |
| "reward": 0.4608374051749706, | |
| "reward_std": 0.1782052293419838, | |
| "rewards/code_reward": 0.36106058582663536, | |
| "rewards/format_reward": 0.9977678656578064, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 733.1339721679688, | |
| "epoch": 0.8959660297239915, | |
| "grad_norm": 1.1783860921859741, | |
| "kl": 0.181640625, | |
| "learning_rate": 7.81125093120313e-07, | |
| "loss": 0.0019, | |
| "reward": 0.4884042590856552, | |
| "reward_std": 0.164920412003994, | |
| "rewards/code_reward": 0.3899667263031006, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 694.8013610839844, | |
| "epoch": 0.8980891719745223, | |
| "grad_norm": 0.7121770977973938, | |
| "kl": 0.24853515625, | |
| "learning_rate": 7.741121094766916e-07, | |
| "loss": 0.0026, | |
| "reward": 0.5257243886590004, | |
| "reward_std": 0.15851808711886406, | |
| "rewards/code_reward": 0.426840465515852, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 681.4174346923828, | |
| "epoch": 0.9002123142250531, | |
| "grad_norm": 0.739496648311615, | |
| "kl": 0.25439453125, | |
| "learning_rate": 7.671820301316532e-07, | |
| "loss": 0.0026, | |
| "reward": 0.4978240504860878, | |
| "reward_std": 0.17392848432064056, | |
| "rewards/code_reward": 0.39983299374580383, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 727.6897583007812, | |
| "epoch": 0.9023354564755839, | |
| "grad_norm": 0.6177138090133667, | |
| "kl": 0.183349609375, | |
| "learning_rate": 7.603351458574474e-07, | |
| "loss": 0.0019, | |
| "reward": 0.44435514509677887, | |
| "reward_std": 0.13320972956717014, | |
| "rewards/code_reward": 0.34703367203474045, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 721.8772735595703, | |
| "epoch": 0.9044585987261147, | |
| "grad_norm": 1.0612621307373047, | |
| "kl": 0.2496337890625, | |
| "learning_rate": 7.535717439356255e-07, | |
| "loss": 0.0026, | |
| "reward": 0.4390544593334198, | |
| "reward_std": 0.15821044147014618, | |
| "rewards/code_reward": 0.3408401757478714, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 708.1295013427734, | |
| "epoch": 0.9065817409766455, | |
| "grad_norm": 0.3175060451030731, | |
| "kl": 0.15869140625, | |
| "learning_rate": 7.46892108144986e-07, | |
| "loss": 0.0017, | |
| "reward": 0.45070114731788635, | |
| "reward_std": 0.1746504958719015, | |
| "rewards/code_reward": 0.35181717574596405, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 752.310302734375, | |
| "epoch": 0.9087048832271762, | |
| "grad_norm": 18.601308822631836, | |
| "kl": 3.44775390625, | |
| "learning_rate": 7.402965187496697e-07, | |
| "loss": 0.0348, | |
| "reward": 0.46990416944026947, | |
| "reward_std": 0.1597570963203907, | |
| "rewards/code_reward": 0.3723594844341278, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 730.3460235595703, | |
| "epoch": 0.910828025477707, | |
| "grad_norm": 8.600378036499023, | |
| "kl": 1.468994140625, | |
| "learning_rate": 7.337852524873974e-07, | |
| "loss": 0.0148, | |
| "reward": 0.6117217838764191, | |
| "reward_std": 0.21035557612776756, | |
| "rewards/code_reward": 0.5126146152615547, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 710.6138610839844, | |
| "epoch": 0.9129511677282378, | |
| "grad_norm": 0.4506695568561554, | |
| "kl": 0.20361328125, | |
| "learning_rate": 7.273585825578608e-07, | |
| "loss": 0.0022, | |
| "reward": 0.4428362399339676, | |
| "reward_std": 0.12803563103079796, | |
| "rewards/code_reward": 0.34372907504439354, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 658.6228179931641, | |
| "epoch": 0.9150743099787686, | |
| "grad_norm": 5.093682765960693, | |
| "kl": 0.5947265625, | |
| "learning_rate": 7.21016778611259e-07, | |
| "loss": 0.0061, | |
| "reward": 0.5427140817046165, | |
| "reward_std": 0.19685931131243706, | |
| "rewards/code_reward": 0.4447230063378811, | |
| "rewards/format_reward": 0.9799107760190964, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 677.9040222167969, | |
| "epoch": 0.9171974522292994, | |
| "grad_norm": 38.870262145996094, | |
| "kl": 5.4326171875, | |
| "learning_rate": 7.147601067369835e-07, | |
| "loss": 0.0545, | |
| "reward": 0.5093298330903053, | |
| "reward_std": 0.19096140936017036, | |
| "rewards/code_reward": 0.41111550480127335, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 694.513427734375, | |
| "epoch": 0.9193205944798302, | |
| "grad_norm": 0.5155165195465088, | |
| "kl": 0.155029296875, | |
| "learning_rate": 7.085888294524561e-07, | |
| "loss": 0.0016, | |
| "reward": 0.5259926542639732, | |
| "reward_std": 0.18491110764443874, | |
| "rewards/code_reward": 0.42733194679021835, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 704.5580596923828, | |
| "epoch": 0.921443736730361, | |
| "grad_norm": 0.6282893419265747, | |
| "kl": 0.3359375, | |
| "learning_rate": 7.025032056921117e-07, | |
| "loss": 0.0034, | |
| "reward": 0.5899785161018372, | |
| "reward_std": 0.19566836208105087, | |
| "rewards/code_reward": 0.4913177192211151, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 722.1562805175781, | |
| "epoch": 0.9235668789808917, | |
| "grad_norm": 1.048966884613037, | |
| "kl": 0.4742431640625, | |
| "learning_rate": 6.965034907965349e-07, | |
| "loss": 0.0049, | |
| "reward": 0.5559424459934235, | |
| "reward_std": 0.2080874666571617, | |
| "rewards/code_reward": 0.4588441997766495, | |
| "rewards/format_reward": 0.9709821939468384, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 679.9933319091797, | |
| "epoch": 0.9256900212314225, | |
| "grad_norm": 0.6251688599586487, | |
| "kl": 0.171142578125, | |
| "learning_rate": 6.905899365017462e-07, | |
| "loss": 0.0018, | |
| "reward": 0.5245073512196541, | |
| "reward_std": 0.17461021803319454, | |
| "rewards/code_reward": 0.42606981843709946, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 711.6942443847656, | |
| "epoch": 0.9278131634819533, | |
| "grad_norm": 1.1648685932159424, | |
| "kl": 0.299560546875, | |
| "learning_rate": 6.847627909286409e-07, | |
| "loss": 0.003, | |
| "reward": 0.41069934517145157, | |
| "reward_std": 0.17594012804329395, | |
| "rewards/code_reward": 0.31226181238889694, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 702.3482513427734, | |
| "epoch": 0.9299363057324841, | |
| "grad_norm": 1.5311229228973389, | |
| "kl": 0.31640625, | |
| "learning_rate": 6.790222985725761e-07, | |
| "loss": 0.0033, | |
| "reward": 0.5770048946142197, | |
| "reward_std": 0.1962369978427887, | |
| "rewards/code_reward": 0.4790138080716133, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 683.4151916503906, | |
| "epoch": 0.9320594479830149, | |
| "grad_norm": 8.243356704711914, | |
| "kl": 3.05126953125, | |
| "learning_rate": 6.733687002931141e-07, | |
| "loss": 0.0306, | |
| "reward": 0.5087217092514038, | |
| "reward_std": 0.1651569865643978, | |
| "rewards/code_reward": 0.4109538644552231, | |
| "rewards/format_reward": 0.9776786267757416, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 713.6049346923828, | |
| "epoch": 0.9341825902335457, | |
| "grad_norm": 1.4741530418395996, | |
| "kl": 0.967529296875, | |
| "learning_rate": 6.678022333039158e-07, | |
| "loss": 0.0098, | |
| "reward": 0.587900809943676, | |
| "reward_std": 0.16147084161639214, | |
| "rewards/code_reward": 0.4903561547398567, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 677.8303833007812, | |
| "epoch": 0.9363057324840764, | |
| "grad_norm": 0.3179962933063507, | |
| "kl": 0.230224609375, | |
| "learning_rate": 6.623231311627876e-07, | |
| "loss": 0.0025, | |
| "reward": 0.561469204723835, | |
| "reward_std": 0.16684554889798164, | |
| "rewards/code_reward": 0.4625852555036545, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 725.1004791259766, | |
| "epoch": 0.9384288747346072, | |
| "grad_norm": 2.448838233947754, | |
| "kl": 1.276123046875, | |
| "learning_rate": 6.569316237618811e-07, | |
| "loss": 0.0127, | |
| "reward": 0.3736302964389324, | |
| "reward_std": 0.18804692663252354, | |
| "rewards/code_reward": 0.2751928083598614, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 710.9286041259766, | |
| "epoch": 0.940552016985138, | |
| "grad_norm": 0.38171106576919556, | |
| "kl": 0.2259521484375, | |
| "learning_rate": 6.516279373180499e-07, | |
| "loss": 0.0024, | |
| "reward": 0.45342515781521797, | |
| "reward_std": 0.16657396219670773, | |
| "rewards/code_reward": 0.3540947772562504, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 665.5625305175781, | |
| "epoch": 0.9426751592356688, | |
| "grad_norm": 0.5256981253623962, | |
| "kl": 0.63818359375, | |
| "learning_rate": 6.464122943633543e-07, | |
| "loss": 0.0066, | |
| "reward": 0.5117220133543015, | |
| "reward_std": 0.17998000979423523, | |
| "rewards/code_reward": 0.4126148596405983, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 669.888427734375, | |
| "epoch": 0.9447983014861996, | |
| "grad_norm": 10.391777038574219, | |
| "kl": 1.935546875, | |
| "learning_rate": 6.412849137357271e-07, | |
| "loss": 0.0195, | |
| "reward": 0.577217735350132, | |
| "reward_std": 0.18068324774503708, | |
| "rewards/code_reward": 0.47878019511699677, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 706.747802734375, | |
| "epoch": 0.9469214437367304, | |
| "grad_norm": 0.7888285517692566, | |
| "kl": 0.395263671875, | |
| "learning_rate": 6.3624601056979e-07, | |
| "loss": 0.0041, | |
| "reward": 0.5674577727913857, | |
| "reward_std": 0.14589250087738037, | |
| "rewards/code_reward": 0.469020277261734, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 698.3861999511719, | |
| "epoch": 0.9490445859872612, | |
| "grad_norm": 0.5909515619277954, | |
| "kl": 0.4178466796875, | |
| "learning_rate": 6.312957962878278e-07, | |
| "loss": 0.0042, | |
| "reward": 0.44434136897325516, | |
| "reward_std": 0.1476050168275833, | |
| "rewards/code_reward": 0.3447878174483776, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 697.1138763427734, | |
| "epoch": 0.9511677282377919, | |
| "grad_norm": 0.3049964904785156, | |
| "kl": 0.36083984375, | |
| "learning_rate": 6.264344785909181e-07, | |
| "loss": 0.0036, | |
| "reward": 0.5054452195763588, | |
| "reward_std": 0.16559578850865364, | |
| "rewards/code_reward": 0.40633804351091385, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 699.5000457763672, | |
| "epoch": 0.9532908704883227, | |
| "grad_norm": 2.360261917114258, | |
| "kl": 1.0093994140625, | |
| "learning_rate": 6.216622614502149e-07, | |
| "loss": 0.0102, | |
| "reward": 0.43248920887708664, | |
| "reward_std": 0.20951998233795166, | |
| "rewards/code_reward": 0.3344981260597706, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 711.5647735595703, | |
| "epoch": 0.9554140127388535, | |
| "grad_norm": 0.45019006729125977, | |
| "kl": 0.396728515625, | |
| "learning_rate": 6.169793450983916e-07, | |
| "loss": 0.0041, | |
| "reward": 0.4090769328176975, | |
| "reward_std": 0.1387995146214962, | |
| "rewards/code_reward": 0.30996978655457497, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 687.4710083007812, | |
| "epoch": 0.9575371549893843, | |
| "grad_norm": 1.139167070388794, | |
| "kl": 0.70947265625, | |
| "learning_rate": 6.123859260212393e-07, | |
| "loss": 0.0073, | |
| "reward": 0.6231836080551147, | |
| "reward_std": 0.18272383697330952, | |
| "rewards/code_reward": 0.5249693095684052, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 664.4620666503906, | |
| "epoch": 0.9596602972399151, | |
| "grad_norm": 11.963510513305664, | |
| "kl": 2.9296875, | |
| "learning_rate": 6.07882196949423e-07, | |
| "loss": 0.0292, | |
| "reward": 0.5648458003997803, | |
| "reward_std": 0.21996057033538818, | |
| "rewards/code_reward": 0.4666314870119095, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 672.8326110839844, | |
| "epoch": 0.9617834394904459, | |
| "grad_norm": 0.23568426072597504, | |
| "kl": 0.138427734375, | |
| "learning_rate": 6.034683468503948e-07, | |
| "loss": 0.0015, | |
| "reward": 0.5011638775467873, | |
| "reward_std": 0.1840323582291603, | |
| "rewards/code_reward": 0.4020567089319229, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 692.2835083007812, | |
| "epoch": 0.9639065817409767, | |
| "grad_norm": 1.3131980895996094, | |
| "kl": 0.73291015625, | |
| "learning_rate": 5.991445609204641e-07, | |
| "loss": 0.0073, | |
| "reward": 0.49861256778240204, | |
| "reward_std": 0.19007166847586632, | |
| "rewards/code_reward": 0.4006215110421181, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 680.5536041259766, | |
| "epoch": 0.9660297239915074, | |
| "grad_norm": 1.0419756174087524, | |
| "kl": 0.8876953125, | |
| "learning_rate": 5.949110205770292e-07, | |
| "loss": 0.009, | |
| "reward": 0.5448554530739784, | |
| "reward_std": 0.19055398926138878, | |
| "rewards/code_reward": 0.44686436653137207, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 693.2187805175781, | |
| "epoch": 0.9681528662420382, | |
| "grad_norm": 0.7175102233886719, | |
| "kl": 0.481201171875, | |
| "learning_rate": 5.90767903450964e-07, | |
| "loss": 0.0049, | |
| "reward": 0.4721348285675049, | |
| "reward_std": 0.14595188200473785, | |
| "rewards/code_reward": 0.37302765995264053, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 695.2723541259766, | |
| "epoch": 0.970276008492569, | |
| "grad_norm": 0.4296894371509552, | |
| "kl": 0.26416015625, | |
| "learning_rate": 5.867153833791652e-07, | |
| "loss": 0.0027, | |
| "reward": 0.6006196290254593, | |
| "reward_std": 0.17042616941034794, | |
| "rewards/code_reward": 0.5019589066505432, | |
| "rewards/format_reward": 0.9866071790456772, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 692.6652221679688, | |
| "epoch": 0.9723991507430998, | |
| "grad_norm": 0.4421376585960388, | |
| "kl": 0.31982421875, | |
| "learning_rate": 5.827536303972587e-07, | |
| "loss": 0.0033, | |
| "reward": 0.5808815285563469, | |
| "reward_std": 0.2226531021296978, | |
| "rewards/code_reward": 0.4815511405467987, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 679.247802734375, | |
| "epoch": 0.9745222929936306, | |
| "grad_norm": 0.40139421820640564, | |
| "kl": 0.47216796875, | |
| "learning_rate": 5.78882810732465e-07, | |
| "loss": 0.0048, | |
| "reward": 0.5371674299240112, | |
| "reward_std": 0.22804895788431168, | |
| "rewards/code_reward": 0.43962281197309494, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 706.5379638671875, | |
| "epoch": 0.9766454352441614, | |
| "grad_norm": 0.8857892751693726, | |
| "kl": 0.51220703125, | |
| "learning_rate": 5.75103086796625e-07, | |
| "loss": 0.0052, | |
| "reward": 0.4905061312019825, | |
| "reward_std": 0.1841362752020359, | |
| "rewards/code_reward": 0.39206862077116966, | |
| "rewards/format_reward": 0.9843750447034836, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 690.3415374755859, | |
| "epoch": 0.9787685774946921, | |
| "grad_norm": 0.6516154408454895, | |
| "kl": 0.439697265625, | |
| "learning_rate": 5.714146171793846e-07, | |
| "loss": 0.0045, | |
| "reward": 0.5876915380358696, | |
| "reward_std": 0.15721704810857773, | |
| "rewards/code_reward": 0.4892539754509926, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 681.5156402587891, | |
| "epoch": 0.9808917197452229, | |
| "grad_norm": 0.618622362613678, | |
| "kl": 0.48046875, | |
| "learning_rate": 5.678175566415422e-07, | |
| "loss": 0.0048, | |
| "reward": 0.49158109724521637, | |
| "reward_std": 0.1944441720843315, | |
| "rewards/code_reward": 0.39381323754787445, | |
| "rewards/format_reward": 0.9776786118745804, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 722.8750305175781, | |
| "epoch": 0.9830148619957537, | |
| "grad_norm": 0.7218803763389587, | |
| "kl": 0.565185546875, | |
| "learning_rate": 5.643120561085528e-07, | |
| "loss": 0.0057, | |
| "reward": 0.4738345965743065, | |
| "reward_std": 0.24430794268846512, | |
| "rewards/code_reward": 0.37651316076517105, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 682.4219055175781, | |
| "epoch": 0.9851380042462845, | |
| "grad_norm": 0.7259976863861084, | |
| "kl": 0.706298828125, | |
| "learning_rate": 5.608982626641991e-07, | |
| "loss": 0.0071, | |
| "reward": 0.47413645684719086, | |
| "reward_std": 0.21057153865695, | |
| "rewards/code_reward": 0.3761453852057457, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 716.5736999511719, | |
| "epoch": 0.9872611464968153, | |
| "grad_norm": 0.2483934909105301, | |
| "kl": 0.260009765625, | |
| "learning_rate": 5.575763195444166e-07, | |
| "loss": 0.0027, | |
| "reward": 0.5671171024441719, | |
| "reward_std": 0.19927529990673065, | |
| "rewards/code_reward": 0.46912601590156555, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 680.6339416503906, | |
| "epoch": 0.9893842887473461, | |
| "grad_norm": 1.9431463479995728, | |
| "kl": 1.3564453125, | |
| "learning_rate": 5.543463661312847e-07, | |
| "loss": 0.0136, | |
| "reward": 0.417750583961606, | |
| "reward_std": 0.13368695229291916, | |
| "rewards/code_reward": 0.3197594955563545, | |
| "rewards/format_reward": 0.9799107611179352, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 684.9911041259766, | |
| "epoch": 0.9915074309978769, | |
| "grad_norm": 0.8100730776786804, | |
| "kl": 0.4619140625, | |
| "learning_rate": 5.512085379471808e-07, | |
| "loss": 0.0048, | |
| "reward": 0.6499997675418854, | |
| "reward_std": 0.20544839650392532, | |
| "rewards/code_reward": 0.5511157959699631, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 681.5670013427734, | |
| "epoch": 0.9936305732484076, | |
| "grad_norm": 3.3746109008789062, | |
| "kl": 1.414306640625, | |
| "learning_rate": 5.481629666490903e-07, | |
| "loss": 0.0142, | |
| "reward": 0.5468520447611809, | |
| "reward_std": 0.21051420643925667, | |
| "rewards/code_reward": 0.44774486869573593, | |
| "rewards/format_reward": 0.9910714477300644, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 688.2388610839844, | |
| "epoch": 0.9957537154989384, | |
| "grad_norm": 1.0109045505523682, | |
| "kl": 1.15380859375, | |
| "learning_rate": 5.452097800230853e-07, | |
| "loss": 0.0116, | |
| "reward": 0.6098516285419464, | |
| "reward_std": 0.211056686937809, | |
| "rewards/code_reward": 0.5116373002529144, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 681.9821624755859, | |
| "epoch": 0.9978768577494692, | |
| "grad_norm": 0.7048155665397644, | |
| "kl": 0.809814453125, | |
| "learning_rate": 5.423491019789623e-07, | |
| "loss": 0.0082, | |
| "reward": 0.45874594151973724, | |
| "reward_std": 0.14819572865962982, | |
| "rewards/code_reward": 0.3596387729048729, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 707.5000457763672, | |
| "epoch": 1.0, | |
| "grad_norm": 2.86737060546875, | |
| "kl": 1.1783447265625, | |
| "learning_rate": 5.395810525450425e-07, | |
| "loss": 0.0118, | |
| "reward": 0.5169450491666794, | |
| "reward_std": 0.18883745186030865, | |
| "rewards/code_reward": 0.41850756853818893, | |
| "rewards/format_reward": 0.9843750298023224, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 677.5982360839844, | |
| "epoch": 1.0021231422505308, | |
| "grad_norm": 1.9091771841049194, | |
| "kl": 1.307861328125, | |
| "learning_rate": 5.369057478631359e-07, | |
| "loss": 0.0132, | |
| "reward": 0.5092417150735855, | |
| "reward_std": 0.18099428340792656, | |
| "rewards/code_reward": 0.4110274314880371, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 711.9843902587891, | |
| "epoch": 1.0042462845010616, | |
| "grad_norm": 1.6537326574325562, | |
| "kl": 1.21533203125, | |
| "learning_rate": 5.343233001836694e-07, | |
| "loss": 0.0122, | |
| "reward": 0.48672058433294296, | |
| "reward_std": 0.19152027182281017, | |
| "rewards/code_reward": 0.38939911872148514, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 707.3102874755859, | |
| "epoch": 1.0063694267515924, | |
| "grad_norm": 0.8889822959899902, | |
| "kl": 0.529541015625, | |
| "learning_rate": 5.318338178609754e-07, | |
| "loss": 0.0054, | |
| "reward": 0.5736411809921265, | |
| "reward_std": 0.17692103423178196, | |
| "rewards/code_reward": 0.4749804362654686, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 736.5714721679688, | |
| "epoch": 1.0084925690021231, | |
| "grad_norm": 1.3358269929885864, | |
| "kl": 0.98828125, | |
| "learning_rate": 5.294374053487459e-07, | |
| "loss": 0.0099, | |
| "reward": 0.44529393315315247, | |
| "reward_std": 0.17480986192822456, | |
| "rewards/code_reward": 0.3468564301729202, | |
| "rewards/format_reward": 0.9843750596046448, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 711.4241485595703, | |
| "epoch": 1.010615711252654, | |
| "grad_norm": 1.4289793968200684, | |
| "kl": 1.22998046875, | |
| "learning_rate": 5.271341631956511e-07, | |
| "loss": 0.0123, | |
| "reward": 0.5166614726185799, | |
| "reward_std": 0.1912681832909584, | |
| "rewards/code_reward": 0.42000964283943176, | |
| "rewards/format_reward": 0.96651791036129, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 695.966552734375, | |
| "epoch": 1.0127388535031847, | |
| "grad_norm": 1.04447340965271, | |
| "kl": 0.756103515625, | |
| "learning_rate": 5.249241880411181e-07, | |
| "loss": 0.0076, | |
| "reward": 0.5925345048308372, | |
| "reward_std": 0.20060284808278084, | |
| "rewards/code_reward": 0.4952130541205406, | |
| "rewards/format_reward": 0.9732143133878708, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 694.6986846923828, | |
| "epoch": 1.0148619957537155, | |
| "grad_norm": 0.8483067750930786, | |
| "kl": 0.3671875, | |
| "learning_rate": 5.228075726112785e-07, | |
| "loss": 0.0039, | |
| "reward": 0.5394521579146385, | |
| "reward_std": 0.12158003821969032, | |
| "rewards/code_reward": 0.44079139083623886, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 708.294677734375, | |
| "epoch": 1.0169851380042463, | |
| "grad_norm": 2.820655584335327, | |
| "kl": 2.081787109375, | |
| "learning_rate": 5.207844057150768e-07, | |
| "loss": 0.0209, | |
| "reward": 0.530554287135601, | |
| "reward_std": 0.18540234863758087, | |
| "rewards/code_reward": 0.4339024946093559, | |
| "rewards/format_reward": 0.9665178954601288, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 717.2143249511719, | |
| "epoch": 1.019108280254777, | |
| "grad_norm": 0.23074620962142944, | |
| "kl": 0.481689453125, | |
| "learning_rate": 5.188547722405437e-07, | |
| "loss": 0.005, | |
| "reward": 0.6097277328372002, | |
| "reward_std": 0.2097402885556221, | |
| "rewards/code_reward": 0.5108437687158585, | |
| "rewards/format_reward": 0.988839328289032, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 673.4553833007812, | |
| "epoch": 1.0212314225053079, | |
| "grad_norm": 18.151901245117188, | |
| "kl": 6.5419921875, | |
| "learning_rate": 5.170187531512351e-07, | |
| "loss": 0.0654, | |
| "reward": 0.4982636645436287, | |
| "reward_std": 0.18235952779650688, | |
| "rewards/code_reward": 0.4000493362545967, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 657.0223541259766, | |
| "epoch": 1.0233545647558386, | |
| "grad_norm": 1.2006980180740356, | |
| "kl": 0.98583984375, | |
| "learning_rate": 5.152764254828348e-07, | |
| "loss": 0.0101, | |
| "reward": 0.6024035438895226, | |
| "reward_std": 0.18741042539477348, | |
| "rewards/code_reward": 0.5044124275445938, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 679.5446624755859, | |
| "epoch": 1.0254777070063694, | |
| "grad_norm": 6.640290260314941, | |
| "kl": 2.26318359375, | |
| "learning_rate": 5.136278623399225e-07, | |
| "loss": 0.0229, | |
| "reward": 0.6333309859037399, | |
| "reward_std": 0.16317120380699635, | |
| "rewards/code_reward": 0.5337774083018303, | |
| "rewards/format_reward": 0.9955357313156128, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 690.9464721679688, | |
| "epoch": 1.0276008492569002, | |
| "grad_norm": 1.2075289487838745, | |
| "kl": 0.635498046875, | |
| "learning_rate": 5.120731328929058e-07, | |
| "loss": 0.0065, | |
| "reward": 0.6160075142979622, | |
| "reward_std": 0.18631838634610176, | |
| "rewards/code_reward": 0.516677126288414, | |
| "rewards/format_reward": 0.9933035969734192, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 711.2254791259766, | |
| "epoch": 1.029723991507431, | |
| "grad_norm": 0.6530643105506897, | |
| "kl": 0.88671875, | |
| "learning_rate": 5.106123023751187e-07, | |
| "loss": 0.009, | |
| "reward": 0.5319265574216843, | |
| "reward_std": 0.16448520869016647, | |
| "rewards/code_reward": 0.43304260820150375, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 697.0067291259766, | |
| "epoch": 1.0318471337579618, | |
| "grad_norm": 0.7889028787612915, | |
| "kl": 0.486083984375, | |
| "learning_rate": 5.092454320800833e-07, | |
| "loss": 0.0049, | |
| "reward": 0.5322659835219383, | |
| "reward_std": 0.2203526459634304, | |
| "rewards/code_reward": 0.4340517073869705, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 707.5915374755859, | |
| "epoch": 1.0339702760084926, | |
| "grad_norm": 1.0391658544540405, | |
| "kl": 1.347412109375, | |
| "learning_rate": 5.079725793589405e-07, | |
| "loss": 0.0136, | |
| "reward": 0.5818885043263435, | |
| "reward_std": 0.18653497844934464, | |
| "rewards/code_reward": 0.4838974103331566, | |
| "rewards/format_reward": 0.979910746216774, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 681.9509124755859, | |
| "epoch": 1.0360934182590233, | |
| "grad_norm": 1.4511501789093018, | |
| "kl": 0.91943359375, | |
| "learning_rate": 5.067937976180407e-07, | |
| "loss": 0.0092, | |
| "reward": 0.20120449364185333, | |
| "reward_std": 0.06054047856014222, | |
| "rewards/code_reward": 0.10365983843803406, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 696.1696624755859, | |
| "epoch": 1.0382165605095541, | |
| "grad_norm": 1.0735193490982056, | |
| "kl": 0.93994140625, | |
| "learning_rate": 5.057091363167046e-07, | |
| "loss": 0.0095, | |
| "reward": 0.41191001795232296, | |
| "reward_std": 0.11514822754543275, | |
| "rewards/code_reward": 0.31324928998947144, | |
| "rewards/format_reward": 0.9866071939468384, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 721.6607513427734, | |
| "epoch": 1.040339702760085, | |
| "grad_norm": 1.8616191148757935, | |
| "kl": 1.632080078125, | |
| "learning_rate": 5.047186409651489e-07, | |
| "loss": 0.0165, | |
| "reward": 0.5570781454443932, | |
| "reward_std": 0.17984510958194733, | |
| "rewards/code_reward": 0.45886383950710297, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 674.4308319091797, | |
| "epoch": 1.0424628450106157, | |
| "grad_norm": 2.0347139835357666, | |
| "kl": 1.854736328125, | |
| "learning_rate": 5.038223531225742e-07, | |
| "loss": 0.0186, | |
| "reward": 0.4472319483757019, | |
| "reward_std": 0.20929547771811485, | |
| "rewards/code_reward": 0.3496873155236244, | |
| "rewards/format_reward": 0.9754464626312256, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 684.2701263427734, | |
| "epoch": 1.0445859872611465, | |
| "grad_norm": 0.47032228112220764, | |
| "kl": 0.452392578125, | |
| "learning_rate": 5.030203103954232e-07, | |
| "loss": 0.0046, | |
| "reward": 0.6024687513709068, | |
| "reward_std": 0.20001190528273582, | |
| "rewards/code_reward": 0.5038080215454102, | |
| "rewards/format_reward": 0.986607164144516, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 749.888427734375, | |
| "epoch": 1.0467091295116773, | |
| "grad_norm": 1.3009785413742065, | |
| "kl": 0.7442626953125, | |
| "learning_rate": 5.023125464358026e-07, | |
| "loss": 0.0075, | |
| "reward": 0.4289785400032997, | |
| "reward_std": 0.19978297501802444, | |
| "rewards/code_reward": 0.3307642340660095, | |
| "rewards/format_reward": 0.9821428805589676, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 707.247802734375, | |
| "epoch": 1.048832271762208, | |
| "grad_norm": 2.7150216102600098, | |
| "kl": 1.939453125, | |
| "learning_rate": 5.016990909400709e-07, | |
| "loss": 0.0195, | |
| "reward": 0.48099584877491, | |
| "reward_std": 0.17564579099416733, | |
| "rewards/code_reward": 0.3834511674940586, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 712.8861999511719, | |
| "epoch": 1.0509554140127388, | |
| "grad_norm": 1.259710431098938, | |
| "kl": 1.7119140625, | |
| "learning_rate": 5.011799696475915e-07, | |
| "loss": 0.0172, | |
| "reward": 0.5863819345831871, | |
| "reward_std": 0.17422104254364967, | |
| "rewards/code_reward": 0.48883724212646484, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 670.3125305175781, | |
| "epoch": 1.0530785562632696, | |
| "grad_norm": 1.799985408782959, | |
| "kl": 1.3544921875, | |
| "learning_rate": 5.007552043396547e-07, | |
| "loss": 0.0137, | |
| "reward": 0.6773558109998703, | |
| "reward_std": 0.21710924059152603, | |
| "rewards/code_reward": 0.578471876680851, | |
| "rewards/format_reward": 0.9888393133878708, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 647.8727874755859, | |
| "epoch": 1.0552016985138004, | |
| "grad_norm": 1.1735022068023682, | |
| "kl": 0.455810546875, | |
| "learning_rate": 5.004248128385618e-07, | |
| "loss": 0.0047, | |
| "reward": 0.6235345751047134, | |
| "reward_std": 0.20276143215596676, | |
| "rewards/code_reward": 0.5259898751974106, | |
| "rewards/format_reward": 0.9754464775323868, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 720.7388610839844, | |
| "epoch": 1.0573248407643312, | |
| "grad_norm": 1.039088487625122, | |
| "kl": 1.011474609375, | |
| "learning_rate": 5.001888090068784e-07, | |
| "loss": 0.0102, | |
| "reward": 0.5388440862298012, | |
| "reward_std": 0.1800019945949316, | |
| "rewards/code_reward": 0.4397369250655174, | |
| "rewards/format_reward": 0.9910714626312256, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 737.9375305175781, | |
| "epoch": 1.059447983014862, | |
| "grad_norm": 2.2365331649780273, | |
| "kl": 0.947998046875, | |
| "learning_rate": 5.000472027468528e-07, | |
| "loss": 0.0095, | |
| "reward": 0.5870940536260605, | |
| "reward_std": 0.17025620490312576, | |
| "rewards/code_reward": 0.4893261566758156, | |
| "rewards/format_reward": 0.9776785969734192, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 686.7678833007812, | |
| "epoch": 1.0615711252653928, | |
| "grad_norm": 11.270224571228027, | |
| "kl": 3.646484375, | |
| "learning_rate": 5.000000000000001e-07, | |
| "loss": 0.0367, | |
| "reward": 0.2899981178343296, | |
| "reward_std": 0.10342313535511494, | |
| "rewards/code_reward": 0.1917838342487812, | |
| "rewards/format_reward": 0.9821428954601288, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.0615711252653928, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.008756054809940243, | |
| "train_runtime": 191583.7312, | |
| "train_samples_per_second": 1.169, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |