{ "best_metric": 0.46374601, "best_model_checkpoint": "/home/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-1040_data-ATTR_OPEN_EMO_500k_CAP_78k_lr-4e-5/v0-20250512-052808/checkpoint-1050", "epoch": 2.913557779799818, "eval_steps": 50, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018198362147406734, "grad_norm": 25.030844046592083, "learning_rate": 3.9999963615834764e-05, "loss": 2.025822877883911, "memory(GiB)": 43.02, "step": 1, "token_acc": 0.609375, "train_speed(iter/s)": 0.005908 }, { "epoch": 0.009099181073703366, "grad_norm": 10.802963796008704, "learning_rate": 3.9999090402488034e-05, "loss": 2.101142644882202, "memory(GiB)": 71.21, "step": 5, "token_acc": 0.5037481259370314, "train_speed(iter/s)": 0.013918 }, { "epoch": 0.018198362147406732, "grad_norm": 4.512540795817656, "learning_rate": 3.99963616926889e-05, "loss": 2.7770095825195313, "memory(GiB)": 71.21, "step": 10, "token_acc": 0.4725118483412322, "train_speed(iter/s)": 0.016736 }, { "epoch": 0.0272975432211101, "grad_norm": 3.2273598665998664, "learning_rate": 3.999181411880536e-05, "loss": 1.1679546356201171, "memory(GiB)": 71.21, "step": 15, "token_acc": 0.6352619233776388, "train_speed(iter/s)": 0.018014 }, { "epoch": 0.036396724294813464, "grad_norm": 2.7344684028279493, "learning_rate": 3.99854480944836e-05, "loss": 1.0935646057128907, "memory(GiB)": 76.02, "step": 20, "token_acc": 0.6871584699453552, "train_speed(iter/s)": 0.018758 }, { "epoch": 0.04549590536851683, "grad_norm": 2.6368838955923546, "learning_rate": 3.9977264198775616e-05, "loss": 1.0634303092956543, "memory(GiB)": 76.02, "step": 25, "token_acc": 0.6443461781427668, "train_speed(iter/s)": 0.019279 }, { "epoch": 0.0545950864422202, "grad_norm": 2.0635877115380987, "learning_rate": 3.996726317608652e-05, "loss": 1.0315238952636718, "memory(GiB)": 76.02, "step": 30, "token_acc": 0.6216628527841342, "train_speed(iter/s)": 0.019641 }, { "epoch": 0.06369426751592357, "grad_norm": 1.769746764765505, "learning_rate": 3.995544593610685e-05, "loss": 1.0012907981872559, "memory(GiB)": 76.02, "step": 35, "token_acc": 0.6820960698689956, "train_speed(iter/s)": 0.019879 }, { "epoch": 0.07279344858962693, "grad_norm": 1.7921148583961297, "learning_rate": 3.994181355372981e-05, "loss": 1.0219003677368164, "memory(GiB)": 76.02, "step": 40, "token_acc": 0.6666666666666666, "train_speed(iter/s)": 0.020113 }, { "epoch": 0.0818926296633303, "grad_norm": 2.3622358395090868, "learning_rate": 3.9926367268953514e-05, "loss": 0.9893597602844239, "memory(GiB)": 76.02, "step": 45, "token_acc": 0.6443586443586443, "train_speed(iter/s)": 0.02023 }, { "epoch": 0.09099181073703366, "grad_norm": 1.8847444781295586, "learning_rate": 3.990910848676819e-05, "loss": 1.0064857482910157, "memory(GiB)": 76.02, "step": 50, "token_acc": 0.6833550065019506, "train_speed(iter/s)": 0.020396 }, { "epoch": 0.09099181073703366, "eval_loss": 0.6069548726081848, "eval_runtime": 124.0182, "eval_samples_per_second": 46.542, "eval_steps_per_second": 0.452, "eval_token_acc": 0.6658791259916742, "step": 50 }, { "epoch": 0.10009099181073704, "grad_norm": 2.2611595119155297, "learning_rate": 3.989003877702835e-05, "loss": 1.0090344429016114, "memory(GiB)": 76.02, "step": 55, "token_acc": 0.6743224621038126, "train_speed(iter/s)": 0.019334 }, { "epoch": 0.1091901728844404, "grad_norm": 2.155765617865288, "learning_rate": 3.986915987431006e-05, "loss": 0.9812187194824219, "memory(GiB)": 76.02, "step": 60, "token_acc": 0.6862615587846763, "train_speed(iter/s)": 0.019541 }, { "epoch": 0.11828935395814377, "grad_norm": 1.9675392436886496, "learning_rate": 3.984647367775312e-05, "loss": 0.967503547668457, "memory(GiB)": 76.02, "step": 65, "token_acc": 0.6425840978593272, "train_speed(iter/s)": 0.019703 }, { "epoch": 0.12738853503184713, "grad_norm": 1.6136749581314442, "learning_rate": 3.9821982250888316e-05, "loss": 0.9946205139160156, "memory(GiB)": 76.02, "step": 70, "token_acc": 0.6822200392927309, "train_speed(iter/s)": 0.01985 }, { "epoch": 0.1364877161055505, "grad_norm": 2.1416143299162544, "learning_rate": 3.9795687821449754e-05, "loss": 0.9689006805419922, "memory(GiB)": 76.02, "step": 75, "token_acc": 0.6582365003417635, "train_speed(iter/s)": 0.019982 }, { "epoch": 0.14558689717925385, "grad_norm": 2.2094541193048074, "learning_rate": 3.9767592781172185e-05, "loss": 0.9927925109863281, "memory(GiB)": 76.02, "step": 80, "token_acc": 0.6676557863501483, "train_speed(iter/s)": 0.020086 }, { "epoch": 0.15468607825295724, "grad_norm": 1.6788879867996525, "learning_rate": 3.973769968557348e-05, "loss": 0.9653422355651855, "memory(GiB)": 76.02, "step": 85, "token_acc": 0.6833890746934225, "train_speed(iter/s)": 0.020194 }, { "epoch": 0.1637852593266606, "grad_norm": 1.6608567558622684, "learning_rate": 3.970601125372218e-05, "loss": 0.9711417198181153, "memory(GiB)": 76.02, "step": 90, "token_acc": 0.6648721399730821, "train_speed(iter/s)": 0.020273 }, { "epoch": 0.17288444040036396, "grad_norm": 1.8971338914920044, "learning_rate": 3.967253036799017e-05, "loss": 0.9714339256286622, "memory(GiB)": 76.02, "step": 95, "token_acc": 0.6907407407407408, "train_speed(iter/s)": 0.020364 }, { "epoch": 0.18198362147406733, "grad_norm": 2.2276291949458913, "learning_rate": 3.963726007379047e-05, "loss": 0.9623370170593262, "memory(GiB)": 76.02, "step": 100, "token_acc": 0.6705935659265972, "train_speed(iter/s)": 0.02043 }, { "epoch": 0.18198362147406733, "eval_loss": 0.5809512138366699, "eval_runtime": 123.9915, "eval_samples_per_second": 46.552, "eval_steps_per_second": 0.452, "eval_token_acc": 0.6707621478823382, "step": 100 }, { "epoch": 0.1910828025477707, "grad_norm": 1.7032248137533175, "learning_rate": 3.960020357930028e-05, "loss": 0.9466117858886719, "memory(GiB)": 76.02, "step": 105, "token_acc": 0.6678478620363808, "train_speed(iter/s)": 0.019882 }, { "epoch": 0.20018198362147407, "grad_norm": 1.855445035035624, "learning_rate": 3.9561364255169114e-05, "loss": 0.9585418701171875, "memory(GiB)": 76.02, "step": 110, "token_acc": 0.666546633057256, "train_speed(iter/s)": 0.019966 }, { "epoch": 0.20928116469517744, "grad_norm": 2.271456601509792, "learning_rate": 3.9520745634212225e-05, "loss": 0.9546641349792481, "memory(GiB)": 76.02, "step": 115, "token_acc": 0.6983430799220273, "train_speed(iter/s)": 0.02004 }, { "epoch": 0.2183803457688808, "grad_norm": 1.727865009111447, "learning_rate": 3.947835141108928e-05, "loss": 0.9411544799804688, "memory(GiB)": 76.02, "step": 120, "token_acc": 0.6998714652956298, "train_speed(iter/s)": 0.020118 }, { "epoch": 0.22747952684258416, "grad_norm": 1.5407295558813352, "learning_rate": 3.943418544196826e-05, "loss": 0.9641068458557129, "memory(GiB)": 76.02, "step": 125, "token_acc": 0.6722915963550455, "train_speed(iter/s)": 0.020179 }, { "epoch": 0.23657870791628755, "grad_norm": 1.6770942231997907, "learning_rate": 3.938825174417473e-05, "loss": 0.956147575378418, "memory(GiB)": 76.02, "step": 130, "token_acc": 0.7067484662576687, "train_speed(iter/s)": 0.020251 }, { "epoch": 0.2456778889899909, "grad_norm": 1.799020682507979, "learning_rate": 3.934055449582641e-05, "loss": 0.9465121269226074, "memory(GiB)": 76.02, "step": 135, "token_acc": 0.6822670674109059, "train_speed(iter/s)": 0.020307 }, { "epoch": 0.25477707006369427, "grad_norm": 1.6975378766800486, "learning_rate": 3.929109803545315e-05, "loss": 0.9593283653259277, "memory(GiB)": 76.02, "step": 140, "token_acc": 0.6935749588138386, "train_speed(iter/s)": 0.020367 }, { "epoch": 0.26387625113739765, "grad_norm": 1.6873696015578077, "learning_rate": 3.9239886861602265e-05, "loss": 0.9509831428527832, "memory(GiB)": 76.02, "step": 145, "token_acc": 0.6785370548604427, "train_speed(iter/s)": 0.020417 }, { "epoch": 0.272975432211101, "grad_norm": 1.605433238469568, "learning_rate": 3.9186925632429396e-05, "loss": 0.9489663124084473, "memory(GiB)": 76.02, "step": 150, "token_acc": 0.6493083807973963, "train_speed(iter/s)": 0.020465 }, { "epoch": 0.272975432211101, "eval_loss": 0.556602954864502, "eval_runtime": 119.5036, "eval_samples_per_second": 48.3, "eval_steps_per_second": 0.469, "eval_token_acc": 0.6771622643952052, "step": 150 }, { "epoch": 0.2820746132848044, "grad_norm": 1.665760265285853, "learning_rate": 3.9132219165274786e-05, "loss": 0.9691334724426269, "memory(GiB)": 76.02, "step": 155, "token_acc": 0.6817427385892116, "train_speed(iter/s)": 0.020097 }, { "epoch": 0.2911737943585077, "grad_norm": 1.4504880716204094, "learning_rate": 3.907577243622505e-05, "loss": 0.9517691612243653, "memory(GiB)": 76.02, "step": 160, "token_acc": 0.6508152173913043, "train_speed(iter/s)": 0.020145 }, { "epoch": 0.3002729754322111, "grad_norm": 1.4909379207696947, "learning_rate": 3.901759057966064e-05, "loss": 0.9396313667297364, "memory(GiB)": 76.02, "step": 165, "token_acc": 0.6924564796905223, "train_speed(iter/s)": 0.0202 }, { "epoch": 0.3093721565059145, "grad_norm": 1.6755025509294692, "learning_rate": 3.895767888778874e-05, "loss": 0.958685302734375, "memory(GiB)": 76.02, "step": 170, "token_acc": 0.6812801402893468, "train_speed(iter/s)": 0.020241 }, { "epoch": 0.3184713375796178, "grad_norm": 1.39424961728271, "learning_rate": 3.889604281016194e-05, "loss": 0.9179913520812988, "memory(GiB)": 76.02, "step": 175, "token_acc": 0.6434395848776872, "train_speed(iter/s)": 0.020291 }, { "epoch": 0.3275705186533212, "grad_norm": 1.810023496149751, "learning_rate": 3.883268795318252e-05, "loss": 0.95927734375, "memory(GiB)": 76.02, "step": 180, "token_acc": 0.6510866329264662, "train_speed(iter/s)": 0.020334 }, { "epoch": 0.33666969972702454, "grad_norm": 2.080560793664787, "learning_rate": 3.876762007959253e-05, "loss": 0.9460148811340332, "memory(GiB)": 76.02, "step": 185, "token_acc": 0.6614173228346457, "train_speed(iter/s)": 0.020378 }, { "epoch": 0.34576888080072793, "grad_norm": 1.6314724313426552, "learning_rate": 3.870084510794953e-05, "loss": 0.9372352600097656, "memory(GiB)": 76.02, "step": 190, "token_acc": 0.7167736021998167, "train_speed(iter/s)": 0.020418 }, { "epoch": 0.3548680618744313, "grad_norm": 1.5214499000610326, "learning_rate": 3.863236911208835e-05, "loss": 0.9120028495788575, "memory(GiB)": 76.02, "step": 195, "token_acc": 0.6961974110032363, "train_speed(iter/s)": 0.020453 }, { "epoch": 0.36396724294813465, "grad_norm": 1.403385243202059, "learning_rate": 3.856219832056853e-05, "loss": 0.9274997711181641, "memory(GiB)": 76.02, "step": 200, "token_acc": 0.6597971867844292, "train_speed(iter/s)": 0.020494 }, { "epoch": 0.36396724294813465, "eval_loss": 0.5442519783973694, "eval_runtime": 121.7991, "eval_samples_per_second": 47.389, "eval_steps_per_second": 0.46, "eval_token_acc": 0.6795491599341379, "step": 200 }, { "epoch": 0.37306642402183804, "grad_norm": 1.7288195921368568, "learning_rate": 3.8490339116107814e-05, "loss": 0.9254457473754882, "memory(GiB)": 76.02, "step": 205, "token_acc": 0.6976498547663058, "train_speed(iter/s)": 0.020208 }, { "epoch": 0.3821656050955414, "grad_norm": 1.7934469116880778, "learning_rate": 3.8416798035001545e-05, "loss": 0.9426854133605957, "memory(GiB)": 76.02, "step": 210, "token_acc": 0.6734362307067425, "train_speed(iter/s)": 0.020248 }, { "epoch": 0.39126478616924476, "grad_norm": 1.3762724847783987, "learning_rate": 3.8341581766528185e-05, "loss": 0.949736499786377, "memory(GiB)": 76.02, "step": 215, "token_acc": 0.6799800299550673, "train_speed(iter/s)": 0.020279 }, { "epoch": 0.40036396724294815, "grad_norm": 1.8318501236469258, "learning_rate": 3.826469715234078e-05, "loss": 0.9189864158630371, "memory(GiB)": 76.02, "step": 220, "token_acc": 0.6768424298489053, "train_speed(iter/s)": 0.020316 }, { "epoch": 0.4094631483166515, "grad_norm": 1.734985910099827, "learning_rate": 3.818615118584472e-05, "loss": 0.9207481384277344, "memory(GiB)": 76.02, "step": 225, "token_acc": 0.6853369763205829, "train_speed(iter/s)": 0.020349 }, { "epoch": 0.41856232939035487, "grad_norm": 1.46610540352475, "learning_rate": 3.810595101156157e-05, "loss": 0.949979305267334, "memory(GiB)": 76.02, "step": 230, "token_acc": 0.7674418604651163, "train_speed(iter/s)": 0.020378 }, { "epoch": 0.42766151046405826, "grad_norm": 1.4640218284405278, "learning_rate": 3.8024103924479225e-05, "loss": 0.9503008842468261, "memory(GiB)": 76.02, "step": 235, "token_acc": 0.6691435275713727, "train_speed(iter/s)": 0.020412 }, { "epoch": 0.4367606915377616, "grad_norm": 1.3582380492653447, "learning_rate": 3.794061736938837e-05, "loss": 0.9213446617126465, "memory(GiB)": 76.02, "step": 240, "token_acc": 0.6814469078179697, "train_speed(iter/s)": 0.020441 }, { "epoch": 0.445859872611465, "grad_norm": 1.24168837408377, "learning_rate": 3.785549894020529e-05, "loss": 0.927124309539795, "memory(GiB)": 76.02, "step": 245, "token_acc": 0.7300613496932515, "train_speed(iter/s)": 0.020473 }, { "epoch": 0.4549590536851683, "grad_norm": 1.4540581012012834, "learning_rate": 3.77687563792811e-05, "loss": 0.9168607711791992, "memory(GiB)": 76.02, "step": 250, "token_acc": 0.6800291545189504, "train_speed(iter/s)": 0.020497 }, { "epoch": 0.4549590536851683, "eval_loss": 0.5409244894981384, "eval_runtime": 120.7512, "eval_samples_per_second": 47.801, "eval_steps_per_second": 0.464, "eval_token_acc": 0.6797271657031431, "step": 250 }, { "epoch": 0.4640582347588717, "grad_norm": 1.7178666143628036, "learning_rate": 3.768039757669759e-05, "loss": 0.9190607070922852, "memory(GiB)": 76.02, "step": 255, "token_acc": 0.6971046770601337, "train_speed(iter/s)": 0.020269 }, { "epoch": 0.4731574158325751, "grad_norm": 1.4533539479949111, "learning_rate": 3.759043056954943e-05, "loss": 0.9371905326843262, "memory(GiB)": 76.02, "step": 260, "token_acc": 0.6667847025495751, "train_speed(iter/s)": 0.020296 }, { "epoch": 0.4822565969062784, "grad_norm": 1.8242714144160546, "learning_rate": 3.749886354121324e-05, "loss": 0.9172127723693848, "memory(GiB)": 76.02, "step": 265, "token_acc": 0.7086137281292059, "train_speed(iter/s)": 0.020325 }, { "epoch": 0.4913557779799818, "grad_norm": 1.3386774946853799, "learning_rate": 3.740570482060311e-05, "loss": 0.9408517837524414, "memory(GiB)": 76.02, "step": 270, "token_acc": 0.7290575916230366, "train_speed(iter/s)": 0.020353 }, { "epoch": 0.5004549590536852, "grad_norm": 1.6524604438416564, "learning_rate": 3.731096288141309e-05, "loss": 0.9067551612854003, "memory(GiB)": 76.02, "step": 275, "token_acc": 0.678743961352657, "train_speed(iter/s)": 0.020379 }, { "epoch": 0.5095541401273885, "grad_norm": 1.7068717522460979, "learning_rate": 3.721464634134641e-05, "loss": 0.9261470794677734, "memory(GiB)": 76.02, "step": 280, "token_acc": 0.7159965782720273, "train_speed(iter/s)": 0.020408 }, { "epoch": 0.5186533212010919, "grad_norm": 1.5886442512862196, "learning_rate": 3.711676396133158e-05, "loss": 0.9242866516113282, "memory(GiB)": 76.02, "step": 285, "token_acc": 0.6532932129722501, "train_speed(iter/s)": 0.020431 }, { "epoch": 0.5277525022747953, "grad_norm": 1.3930674320536802, "learning_rate": 3.701732464472553e-05, "loss": 0.9128170967102051, "memory(GiB)": 76.02, "step": 290, "token_acc": 0.6779987171263631, "train_speed(iter/s)": 0.020457 }, { "epoch": 0.5368516833484986, "grad_norm": 1.4564537325119185, "learning_rate": 3.691633743650377e-05, "loss": 0.9042372703552246, "memory(GiB)": 76.02, "step": 295, "token_acc": 0.6832191780821918, "train_speed(iter/s)": 0.020478 }, { "epoch": 0.545950864422202, "grad_norm": 1.4788538883263567, "learning_rate": 3.681381152243763e-05, "loss": 0.9223553657531738, "memory(GiB)": 76.02, "step": 300, "token_acc": 0.6808054841473865, "train_speed(iter/s)": 0.020502 }, { "epoch": 0.545950864422202, "eval_loss": 0.5335711240768433, "eval_runtime": 119.2512, "eval_samples_per_second": 48.402, "eval_steps_per_second": 0.47, "eval_token_acc": 0.682199018540919, "step": 300 }, { "epoch": 0.5550500454959054, "grad_norm": 1.6525258776892648, "learning_rate": 3.6709756228258735e-05, "loss": 0.9161547660827637, "memory(GiB)": 76.02, "step": 305, "token_acc": 0.6724870221802737, "train_speed(iter/s)": 0.02031 }, { "epoch": 0.5641492265696088, "grad_norm": 1.298480729022936, "learning_rate": 3.6604181018810764e-05, "loss": 0.8824697494506836, "memory(GiB)": 76.02, "step": 310, "token_acc": 0.6935075885328836, "train_speed(iter/s)": 0.020334 }, { "epoch": 0.5732484076433121, "grad_norm": 1.3254867339008374, "learning_rate": 3.649709549718849e-05, "loss": 0.8925297737121582, "memory(GiB)": 76.02, "step": 315, "token_acc": 0.6668953687821613, "train_speed(iter/s)": 0.020357 }, { "epoch": 0.5823475887170154, "grad_norm": 1.4003301586141983, "learning_rate": 3.638850940386433e-05, "loss": 0.9219451904296875, "memory(GiB)": 76.02, "step": 320, "token_acc": 0.6934164394234515, "train_speed(iter/s)": 0.020381 }, { "epoch": 0.5914467697907189, "grad_norm": 1.2198877131221963, "learning_rate": 3.627843261580231e-05, "loss": 0.9142662048339844, "memory(GiB)": 76.02, "step": 325, "token_acc": 0.6796973518284993, "train_speed(iter/s)": 0.020407 }, { "epoch": 0.6005459508644222, "grad_norm": 1.2491149251440654, "learning_rate": 3.6166875145559684e-05, "loss": 0.9013506889343261, "memory(GiB)": 76.02, "step": 330, "token_acc": 0.7270875763747454, "train_speed(iter/s)": 0.020426 }, { "epoch": 0.6096451319381255, "grad_norm": 1.3464860154655747, "learning_rate": 3.6053847140376194e-05, "loss": 0.9187211990356445, "memory(GiB)": 76.02, "step": 335, "token_acc": 0.6677791262135923, "train_speed(iter/s)": 0.020449 }, { "epoch": 0.618744313011829, "grad_norm": 1.3081495464213557, "learning_rate": 3.593935888125107e-05, "loss": 0.9130012512207031, "memory(GiB)": 76.02, "step": 340, "token_acc": 0.6820603907637656, "train_speed(iter/s)": 0.020469 }, { "epoch": 0.6278434940855323, "grad_norm": 1.3056329501412263, "learning_rate": 3.582342078200786e-05, "loss": 0.903553581237793, "memory(GiB)": 76.02, "step": 345, "token_acc": 0.7179723502304147, "train_speed(iter/s)": 0.020488 }, { "epoch": 0.6369426751592356, "grad_norm": 1.2033803940833903, "learning_rate": 3.570604338834725e-05, "loss": 0.9074154853820801, "memory(GiB)": 76.02, "step": 350, "token_acc": 0.7170805116629044, "train_speed(iter/s)": 0.020509 }, { "epoch": 0.6369426751592356, "eval_loss": 0.5156524777412415, "eval_runtime": 121.7142, "eval_samples_per_second": 47.423, "eval_steps_per_second": 0.46, "eval_token_acc": 0.6832346884696763, "step": 350 }, { "epoch": 0.6460418562329391, "grad_norm": 1.3706427880274294, "learning_rate": 3.558723737688775e-05, "loss": 0.9084077835083008, "memory(GiB)": 76.02, "step": 355, "token_acc": 0.7012306886619534, "train_speed(iter/s)": 0.020344 }, { "epoch": 0.6551410373066424, "grad_norm": 1.4525504674274499, "learning_rate": 3.54670135541946e-05, "loss": 0.9108301162719726, "memory(GiB)": 76.02, "step": 360, "token_acc": 0.6819548872180451, "train_speed(iter/s)": 0.020365 }, { "epoch": 0.6642402183803457, "grad_norm": 1.371067326824918, "learning_rate": 3.534538285579681e-05, "loss": 0.9166597366333008, "memory(GiB)": 76.02, "step": 365, "token_acc": 0.68828125, "train_speed(iter/s)": 0.020383 }, { "epoch": 0.6733393994540491, "grad_norm": 1.404728462002113, "learning_rate": 3.522235634519244e-05, "loss": 0.8995059967041016, "memory(GiB)": 76.02, "step": 370, "token_acc": 0.6734115742614326, "train_speed(iter/s)": 0.020405 }, { "epoch": 0.6824385805277525, "grad_norm": 1.4153346949849819, "learning_rate": 3.509794521284228e-05, "loss": 0.8986475944519043, "memory(GiB)": 76.02, "step": 375, "token_acc": 0.6696600384862091, "train_speed(iter/s)": 0.020423 }, { "epoch": 0.6915377616014559, "grad_norm": 1.357991579405462, "learning_rate": 3.497216077515198e-05, "loss": 0.914306354522705, "memory(GiB)": 76.02, "step": 380, "token_acc": 0.668999300209937, "train_speed(iter/s)": 0.020442 }, { "epoch": 0.7006369426751592, "grad_norm": 1.421643318524058, "learning_rate": 3.48450144734427e-05, "loss": 0.9151236534118652, "memory(GiB)": 76.02, "step": 385, "token_acc": 0.6687898089171974, "train_speed(iter/s)": 0.02046 }, { "epoch": 0.7097361237488626, "grad_norm": 1.1089727601654944, "learning_rate": 3.4716517872910405e-05, "loss": 0.8921234130859375, "memory(GiB)": 76.02, "step": 390, "token_acc": 0.6953678474114442, "train_speed(iter/s)": 0.020478 }, { "epoch": 0.718835304822566, "grad_norm": 1.3211880131463927, "learning_rate": 3.45866826615739e-05, "loss": 0.9150146484375, "memory(GiB)": 76.02, "step": 395, "token_acc": 0.6571167327034441, "train_speed(iter/s)": 0.020496 }, { "epoch": 0.7279344858962693, "grad_norm": 1.4350745291439944, "learning_rate": 3.445552064921172e-05, "loss": 0.9022627830505371, "memory(GiB)": 76.02, "step": 400, "token_acc": 0.6755852842809364, "train_speed(iter/s)": 0.020512 }, { "epoch": 0.7279344858962693, "eval_loss": 0.5100554823875427, "eval_runtime": 119.6911, "eval_samples_per_second": 48.224, "eval_steps_per_second": 0.468, "eval_token_acc": 0.6859209573473904, "step": 400 }, { "epoch": 0.7370336669699727, "grad_norm": 1.1612524813118632, "learning_rate": 3.432304376628787e-05, "loss": 0.9135440826416016, "memory(GiB)": 76.02, "step": 405, "token_acc": 0.7024793388429752, "train_speed(iter/s)": 0.020366 }, { "epoch": 0.7461328480436761, "grad_norm": 1.3506987538568946, "learning_rate": 3.418926406286666e-05, "loss": 0.9180900573730468, "memory(GiB)": 76.02, "step": 410, "token_acc": 0.715203426124197, "train_speed(iter/s)": 0.020382 }, { "epoch": 0.7552320291173794, "grad_norm": 1.3682849356535443, "learning_rate": 3.405419370751663e-05, "loss": 0.9025050163269043, "memory(GiB)": 76.02, "step": 415, "token_acc": 0.7220916568742656, "train_speed(iter/s)": 0.020402 }, { "epoch": 0.7643312101910829, "grad_norm": 1.4354924987431779, "learning_rate": 3.391784498620369e-05, "loss": 0.9032191276550293, "memory(GiB)": 76.02, "step": 420, "token_acc": 0.6772521062864549, "train_speed(iter/s)": 0.020419 }, { "epoch": 0.7734303912647862, "grad_norm": 1.3319624335350189, "learning_rate": 3.378023030117361e-05, "loss": 0.9076663970947265, "memory(GiB)": 76.02, "step": 425, "token_acc": 0.6790314270994333, "train_speed(iter/s)": 0.020436 }, { "epoch": 0.7825295723384895, "grad_norm": 1.2560401393743486, "learning_rate": 3.364136216982391e-05, "loss": 0.9036032676696777, "memory(GiB)": 76.02, "step": 430, "token_acc": 0.6832980972515856, "train_speed(iter/s)": 0.020453 }, { "epoch": 0.7916287534121929, "grad_norm": 1.331582467821213, "learning_rate": 3.350125322356525e-05, "loss": 0.9180031776428222, "memory(GiB)": 76.02, "step": 435, "token_acc": 0.6918290043290043, "train_speed(iter/s)": 0.020468 }, { "epoch": 0.8007279344858963, "grad_norm": 1.3101601945182637, "learning_rate": 3.335991620667254e-05, "loss": 0.9090401649475097, "memory(GiB)": 76.02, "step": 440, "token_acc": 0.6886586695747001, "train_speed(iter/s)": 0.020484 }, { "epoch": 0.8098271155595996, "grad_norm": 1.490959565832233, "learning_rate": 3.321736397512566e-05, "loss": 0.8914430618286133, "memory(GiB)": 76.02, "step": 445, "token_acc": 0.7289220917822838, "train_speed(iter/s)": 0.020498 }, { "epoch": 0.818926296633303, "grad_norm": 1.6826531523568926, "learning_rate": 3.307360949544012e-05, "loss": 0.8871423721313476, "memory(GiB)": 76.02, "step": 450, "token_acc": 0.6811023622047244, "train_speed(iter/s)": 0.020515 }, { "epoch": 0.818926296633303, "eval_loss": 0.5105797648429871, "eval_runtime": 119.2169, "eval_samples_per_second": 48.416, "eval_steps_per_second": 0.47, "eval_token_acc": 0.6859007294190943, "step": 450 }, { "epoch": 0.8280254777070064, "grad_norm": 1.5351362870698657, "learning_rate": 3.2928665843487646e-05, "loss": 0.9084842681884766, "memory(GiB)": 76.02, "step": 455, "token_acc": 0.6964930376482723, "train_speed(iter/s)": 0.020387 }, { "epoch": 0.8371246587807097, "grad_norm": 1.76414300067586, "learning_rate": 3.278254620330673e-05, "loss": 0.8832217216491699, "memory(GiB)": 76.02, "step": 460, "token_acc": 0.6910656620021528, "train_speed(iter/s)": 0.020403 }, { "epoch": 0.8462238398544131, "grad_norm": 1.26108516359597, "learning_rate": 3.263526386590351e-05, "loss": 0.9098955154418945, "memory(GiB)": 76.02, "step": 465, "token_acc": 0.6647430612805716, "train_speed(iter/s)": 0.020418 }, { "epoch": 0.8553230209281165, "grad_norm": 1.4539630443562455, "learning_rate": 3.248683222804274e-05, "loss": 0.8848261833190918, "memory(GiB)": 76.02, "step": 470, "token_acc": 0.7338235294117647, "train_speed(iter/s)": 0.020432 }, { "epoch": 0.8644222020018199, "grad_norm": 1.6326834981575191, "learning_rate": 3.233726479102927e-05, "loss": 0.9008934020996093, "memory(GiB)": 76.02, "step": 475, "token_acc": 0.7064676616915423, "train_speed(iter/s)": 0.020448 }, { "epoch": 0.8735213830755232, "grad_norm": 1.2054817005259488, "learning_rate": 3.2186575159479966e-05, "loss": 0.8803308486938477, "memory(GiB)": 76.02, "step": 480, "token_acc": 0.7033673855467272, "train_speed(iter/s)": 0.020462 }, { "epoch": 0.8826205641492265, "grad_norm": 1.1783711102902867, "learning_rate": 3.203477704008622e-05, "loss": 0.9082450866699219, "memory(GiB)": 76.02, "step": 485, "token_acc": 0.7070333157059757, "train_speed(iter/s)": 0.020477 }, { "epoch": 0.89171974522293, "grad_norm": 1.241716165408502, "learning_rate": 3.188188424036719e-05, "loss": 0.9072214126586914, "memory(GiB)": 76.02, "step": 490, "token_acc": 0.6927956502038967, "train_speed(iter/s)": 0.02049 }, { "epoch": 0.9008189262966333, "grad_norm": 1.1673048249036013, "learning_rate": 3.172791066741392e-05, "loss": 0.886620044708252, "memory(GiB)": 76.02, "step": 495, "token_acc": 0.7046548956661316, "train_speed(iter/s)": 0.020505 }, { "epoch": 0.9099181073703366, "grad_norm": 1.5005936662764863, "learning_rate": 3.157287032662428e-05, "loss": 0.8825222015380859, "memory(GiB)": 76.02, "step": 500, "token_acc": 0.6940532081377152, "train_speed(iter/s)": 0.020518 }, { "epoch": 0.9099181073703366, "eval_loss": 0.49878114461898804, "eval_runtime": 121.4101, "eval_samples_per_second": 47.541, "eval_steps_per_second": 0.461, "eval_token_acc": 0.6875917842246433, "step": 500 }, { "epoch": 0.9190172884440401, "grad_norm": 1.2321769009736276, "learning_rate": 3.14167773204291e-05, "loss": 0.8877192497253418, "memory(GiB)": 76.02, "step": 505, "token_acc": 0.7100805331852263, "train_speed(iter/s)": 0.020401 }, { "epoch": 0.9281164695177434, "grad_norm": 1.2301460920284364, "learning_rate": 3.1259645847009384e-05, "loss": 0.9063457489013672, "memory(GiB)": 76.02, "step": 510, "token_acc": 0.6885245901639344, "train_speed(iter/s)": 0.020414 }, { "epoch": 0.9372156505914467, "grad_norm": 1.4857123341096659, "learning_rate": 3.110149019900486e-05, "loss": 0.8702260971069335, "memory(GiB)": 76.02, "step": 515, "token_acc": 0.6863874345549739, "train_speed(iter/s)": 0.020427 }, { "epoch": 0.9463148316651502, "grad_norm": 1.189993476966276, "learning_rate": 3.094232476221392e-05, "loss": 0.9034518241882324, "memory(GiB)": 76.02, "step": 520, "token_acc": 0.7082294264339152, "train_speed(iter/s)": 0.020441 }, { "epoch": 0.9554140127388535, "grad_norm": 1.3161268491995117, "learning_rate": 3.07821640142851e-05, "loss": 0.87875394821167, "memory(GiB)": 76.02, "step": 525, "token_acc": 0.683948569058482, "train_speed(iter/s)": 0.020453 }, { "epoch": 0.9645131938125568, "grad_norm": 1.1112974134834392, "learning_rate": 3.062102252340019e-05, "loss": 0.8922388076782226, "memory(GiB)": 76.02, "step": 530, "token_acc": 0.6777905638665133, "train_speed(iter/s)": 0.020468 }, { "epoch": 0.9736123748862603, "grad_norm": 1.292894697629211, "learning_rate": 3.045891494694908e-05, "loss": 0.908051872253418, "memory(GiB)": 76.02, "step": 535, "token_acc": 0.6983343615052436, "train_speed(iter/s)": 0.020479 }, { "epoch": 0.9827115559599636, "grad_norm": 1.166045668885059, "learning_rate": 3.0295856030196618e-05, "loss": 0.9091971397399903, "memory(GiB)": 76.02, "step": 540, "token_acc": 0.7089144936325046, "train_speed(iter/s)": 0.020492 }, { "epoch": 0.991810737033667, "grad_norm": 1.3674012690083148, "learning_rate": 3.0131860604941287e-05, "loss": 0.8997166633605957, "memory(GiB)": 76.02, "step": 545, "token_acc": 0.6767097082735534, "train_speed(iter/s)": 0.020504 }, { "epoch": 1.0, "grad_norm": 1.4019349528909308, "learning_rate": 2.996694358816618e-05, "loss": 0.8638315200805664, "memory(GiB)": 76.02, "step": 550, "token_acc": 0.7002042900919305, "train_speed(iter/s)": 0.020533 }, { "epoch": 1.0, "eval_loss": 0.4928109347820282, "eval_runtime": 119.0212, "eval_samples_per_second": 48.496, "eval_steps_per_second": 0.471, "eval_token_acc": 0.6892747478588738, "step": 550 }, { "epoch": 1.0090991810737033, "grad_norm": 1.4220386258897009, "learning_rate": 2.9801119980682095e-05, "loss": 0.8142873764038085, "memory(GiB)": 76.02, "step": 555, "token_acc": 0.7055921052631579, "train_speed(iter/s)": 0.020412 }, { "epoch": 1.0181983621474067, "grad_norm": 1.187181373009717, "learning_rate": 2.9634404865763122e-05, "loss": 0.7935843467712402, "memory(GiB)": 76.02, "step": 560, "token_acc": 0.7032755298651252, "train_speed(iter/s)": 0.02042 }, { "epoch": 1.02729754322111, "grad_norm": 1.0185191433966165, "learning_rate": 2.9466813407774627e-05, "loss": 0.7965437889099121, "memory(GiB)": 76.02, "step": 565, "token_acc": 0.6973250274825944, "train_speed(iter/s)": 0.020432 }, { "epoch": 1.0363967242948136, "grad_norm": 1.2024810924675036, "learning_rate": 2.9298360850793944e-05, "loss": 0.7800662517547607, "memory(GiB)": 76.02, "step": 570, "token_acc": 0.7089552238805971, "train_speed(iter/s)": 0.020443 }, { "epoch": 1.0454959053685169, "grad_norm": 0.9855874534546613, "learning_rate": 2.912906251722373e-05, "loss": 0.8090152740478516, "memory(GiB)": 76.02, "step": 575, "token_acc": 0.7137375287797391, "train_speed(iter/s)": 0.020455 }, { "epoch": 1.0545950864422202, "grad_norm": 1.183729346768703, "learning_rate": 2.895893380639829e-05, "loss": 0.8083430290222168, "memory(GiB)": 76.02, "step": 580, "token_acc": 0.7071651090342679, "train_speed(iter/s)": 0.020466 }, { "epoch": 1.0636942675159236, "grad_norm": 1.527448245905063, "learning_rate": 2.878799019318283e-05, "loss": 0.787087345123291, "memory(GiB)": 76.02, "step": 585, "token_acc": 0.7470379146919431, "train_speed(iter/s)": 0.020477 }, { "epoch": 1.0727934485896269, "grad_norm": 1.2570337520295112, "learning_rate": 2.8616247226565888e-05, "loss": 0.8103050231933594, "memory(GiB)": 76.02, "step": 590, "token_acc": 0.7105431309904153, "train_speed(iter/s)": 0.020489 }, { "epoch": 1.0818926296633302, "grad_norm": 1.1805179088694353, "learning_rate": 2.8443720528244964e-05, "loss": 0.8091272354125977, "memory(GiB)": 76.02, "step": 595, "token_acc": 0.7236403995560489, "train_speed(iter/s)": 0.0205 }, { "epoch": 1.0909918107370338, "grad_norm": 1.3005835459012032, "learning_rate": 2.827042579120562e-05, "loss": 0.7841366767883301, "memory(GiB)": 76.02, "step": 600, "token_acc": 0.7160133444537115, "train_speed(iter/s)": 0.020511 }, { "epoch": 1.0909918107370338, "eval_loss": 0.4980168640613556, "eval_runtime": 122.0994, "eval_samples_per_second": 47.273, "eval_steps_per_second": 0.459, "eval_token_acc": 0.68817030297391, "step": 600 }, { "epoch": 1.100090991810737, "grad_norm": 1.0825655683949489, "learning_rate": 2.809637877829401e-05, "loss": 0.8102677345275879, "memory(GiB)": 76.02, "step": 605, "token_acc": 0.7054728756601056, "train_speed(iter/s)": 0.020407 }, { "epoch": 1.1091901728844404, "grad_norm": 1.269997926727983, "learning_rate": 2.792159532078314e-05, "loss": 0.8190704345703125, "memory(GiB)": 76.02, "step": 610, "token_acc": 0.7151929653150952, "train_speed(iter/s)": 0.020418 }, { "epoch": 1.1182893539581438, "grad_norm": 1.3197280690186768, "learning_rate": 2.7746091316932807e-05, "loss": 0.7909206867218017, "memory(GiB)": 76.02, "step": 615, "token_acc": 0.8111888111888111, "train_speed(iter/s)": 0.020428 }, { "epoch": 1.127388535031847, "grad_norm": 1.3074486932691716, "learning_rate": 2.756988273054354e-05, "loss": 0.7989336967468261, "memory(GiB)": 76.02, "step": 620, "token_acc": 0.6923334449280214, "train_speed(iter/s)": 0.020439 }, { "epoch": 1.1364877161055504, "grad_norm": 1.09154376619437, "learning_rate": 2.7392985589504512e-05, "loss": 0.7985887050628662, "memory(GiB)": 76.02, "step": 625, "token_acc": 0.6959603118355776, "train_speed(iter/s)": 0.02045 }, { "epoch": 1.1455868971792538, "grad_norm": 1.105083946015695, "learning_rate": 2.721541598433567e-05, "loss": 0.7879680156707763, "memory(GiB)": 76.02, "step": 630, "token_acc": 0.7151389249545572, "train_speed(iter/s)": 0.020461 }, { "epoch": 1.1546860782529573, "grad_norm": 1.1369632866951163, "learning_rate": 2.7037190066724108e-05, "loss": 0.8013208389282227, "memory(GiB)": 76.02, "step": 635, "token_acc": 0.6987542468856173, "train_speed(iter/s)": 0.020471 }, { "epoch": 1.1637852593266607, "grad_norm": 1.084161120288602, "learning_rate": 2.6858324048054956e-05, "loss": 0.8041671752929688, "memory(GiB)": 76.02, "step": 640, "token_acc": 0.6834153197470133, "train_speed(iter/s)": 0.020482 }, { "epoch": 1.172884440400364, "grad_norm": 1.154991176116474, "learning_rate": 2.667883419793676e-05, "loss": 0.8061488151550293, "memory(GiB)": 76.02, "step": 645, "token_acc": 0.7004991680532446, "train_speed(iter/s)": 0.020492 }, { "epoch": 1.1819836214740673, "grad_norm": 1.1196634253017694, "learning_rate": 2.649873684272164e-05, "loss": 0.8086748123168945, "memory(GiB)": 76.02, "step": 650, "token_acc": 0.6978937441056272, "train_speed(iter/s)": 0.020502 }, { "epoch": 1.1819836214740673, "eval_loss": 0.5025342702865601, "eval_runtime": 120.6757, "eval_samples_per_second": 47.831, "eval_steps_per_second": 0.464, "eval_token_acc": 0.6888256878507018, "step": 650 }, { "epoch": 1.1910828025477707, "grad_norm": 1.1155649313448126, "learning_rate": 2.6318048364020214e-05, "loss": 0.7836286544799804, "memory(GiB)": 76.02, "step": 655, "token_acc": 0.7220535467844328, "train_speed(iter/s)": 0.020409 }, { "epoch": 1.200181983621474, "grad_norm": 1.1072757367187032, "learning_rate": 2.613678519721155e-05, "loss": 0.7940217018127441, "memory(GiB)": 76.02, "step": 660, "token_acc": 0.7217682020802377, "train_speed(iter/s)": 0.02042 }, { "epoch": 1.2092811646951773, "grad_norm": 1.0457391204034119, "learning_rate": 2.5954963829948195e-05, "loss": 0.7881236553192139, "memory(GiB)": 76.02, "step": 665, "token_acc": 0.7111846946284033, "train_speed(iter/s)": 0.020429 }, { "epoch": 1.2183803457688809, "grad_norm": 1.2226481761675059, "learning_rate": 2.577260080065649e-05, "loss": 0.8019227981567383, "memory(GiB)": 76.02, "step": 670, "token_acc": 0.7422535211267606, "train_speed(iter/s)": 0.020438 }, { "epoch": 1.2274795268425842, "grad_norm": 1.27401099270194, "learning_rate": 2.558971269703219e-05, "loss": 0.7942542552947998, "memory(GiB)": 76.02, "step": 675, "token_acc": 0.7235213204951857, "train_speed(iter/s)": 0.020449 }, { "epoch": 1.2365787079162875, "grad_norm": 1.3601936101076058, "learning_rate": 2.5406316154531717e-05, "loss": 0.8046051025390625, "memory(GiB)": 76.02, "step": 680, "token_acc": 0.7112280701754385, "train_speed(iter/s)": 0.020459 }, { "epoch": 1.2456778889899909, "grad_norm": 1.1617605645583995, "learning_rate": 2.522242785485893e-05, "loss": 0.8000314712524415, "memory(GiB)": 76.02, "step": 685, "token_acc": 0.6886890349360083, "train_speed(iter/s)": 0.020469 }, { "epoch": 1.2547770700636942, "grad_norm": 1.3512273187713244, "learning_rate": 2.5038064524447827e-05, "loss": 0.8067909240722656, "memory(GiB)": 76.02, "step": 690, "token_acc": 0.7467532467532467, "train_speed(iter/s)": 0.020479 }, { "epoch": 1.2638762511373978, "grad_norm": 1.3157719287072271, "learning_rate": 2.4853242932941064e-05, "loss": 0.7853587150573731, "memory(GiB)": 76.02, "step": 695, "token_acc": 0.7197480881691408, "train_speed(iter/s)": 0.020488 }, { "epoch": 1.2729754322111009, "grad_norm": 1.1947998857326674, "learning_rate": 2.4667979891664625e-05, "loss": 0.7679170131683349, "memory(GiB)": 76.02, "step": 700, "token_acc": 0.7413360120542442, "train_speed(iter/s)": 0.020498 }, { "epoch": 1.2729754322111009, "eval_loss": 0.4833757281303406, "eval_runtime": 119.9805, "eval_samples_per_second": 48.108, "eval_steps_per_second": 0.467, "eval_token_acc": 0.6897318990383643, "step": 700 }, { "epoch": 1.2820746132848044, "grad_norm": 1.3268470864740665, "learning_rate": 2.448229225209865e-05, "loss": 0.788662052154541, "memory(GiB)": 76.02, "step": 705, "token_acc": 0.716280170373876, "train_speed(iter/s)": 0.020416 }, { "epoch": 1.2911737943585078, "grad_norm": 1.2466125304642335, "learning_rate": 2.429619690434464e-05, "loss": 0.7932944297790527, "memory(GiB)": 76.02, "step": 710, "token_acc": 0.7371388301620859, "train_speed(iter/s)": 0.020426 }, { "epoch": 1.300272975432211, "grad_norm": 1.3582445751553864, "learning_rate": 2.4109710775589104e-05, "loss": 0.8029943466186523, "memory(GiB)": 76.02, "step": 715, "token_acc": 0.7082366589327146, "train_speed(iter/s)": 0.020435 }, { "epoch": 1.3093721565059144, "grad_norm": 1.098320752598586, "learning_rate": 2.392285082856394e-05, "loss": 0.8051022529602051, "memory(GiB)": 76.02, "step": 720, "token_acc": 0.6993071593533488, "train_speed(iter/s)": 0.020444 }, { "epoch": 1.3184713375796178, "grad_norm": 1.1993515162762007, "learning_rate": 2.3735634060003428e-05, "loss": 0.7886831760406494, "memory(GiB)": 76.02, "step": 725, "token_acc": 0.7265460664703408, "train_speed(iter/s)": 0.020453 }, { "epoch": 1.3275705186533213, "grad_norm": 1.4913459363975115, "learning_rate": 2.3548077499098256e-05, "loss": 0.7917290687561035, "memory(GiB)": 76.02, "step": 730, "token_acc": 0.7044052863436123, "train_speed(iter/s)": 0.020462 }, { "epoch": 1.3366696997270244, "grad_norm": 1.3995123406507142, "learning_rate": 2.3360198205946542e-05, "loss": 0.788825798034668, "memory(GiB)": 76.02, "step": 735, "token_acc": 0.7135922330097088, "train_speed(iter/s)": 0.020471 }, { "epoch": 1.345768880800728, "grad_norm": 1.3354117848213083, "learning_rate": 2.3172013270002038e-05, "loss": 0.7835997581481934, "memory(GiB)": 76.02, "step": 740, "token_acc": 0.7201051248357424, "train_speed(iter/s)": 0.02048 }, { "epoch": 1.3548680618744313, "grad_norm": 1.0749964264738503, "learning_rate": 2.2983539808519702e-05, "loss": 0.7911547660827637, "memory(GiB)": 76.02, "step": 745, "token_acc": 0.7271609995903319, "train_speed(iter/s)": 0.020488 }, { "epoch": 1.3639672429481347, "grad_norm": 0.9437159555687519, "learning_rate": 2.2794794964998705e-05, "loss": 0.7891970634460449, "memory(GiB)": 76.02, "step": 750, "token_acc": 0.7132644956314536, "train_speed(iter/s)": 0.020497 }, { "epoch": 1.3639672429481347, "eval_loss": 0.48184001445770264, "eval_runtime": 120.3801, "eval_samples_per_second": 47.948, "eval_steps_per_second": 0.465, "eval_token_acc": 0.6908322983376689, "step": 750 }, { "epoch": 1.373066424021838, "grad_norm": 1.3416671636490984, "learning_rate": 2.260579590762304e-05, "loss": 0.8072065353393555, "memory(GiB)": 76.02, "step": 755, "token_acc": 0.7023445463812437, "train_speed(iter/s)": 0.020418 }, { "epoch": 1.3821656050955413, "grad_norm": 1.1639847848783198, "learning_rate": 2.2416559827699945e-05, "loss": 0.8082324028015136, "memory(GiB)": 76.02, "step": 760, "token_acc": 0.7145284621920136, "train_speed(iter/s)": 0.020427 }, { "epoch": 1.3912647861692449, "grad_norm": 1.132127107571287, "learning_rate": 2.2227103938096176e-05, "loss": 0.7869006156921386, "memory(GiB)": 76.02, "step": 765, "token_acc": 0.7099471830985915, "train_speed(iter/s)": 0.020436 }, { "epoch": 1.4003639672429482, "grad_norm": 1.0194297655037412, "learning_rate": 2.2037445471672312e-05, "loss": 0.8034600257873535, "memory(GiB)": 76.02, "step": 770, "token_acc": 0.7037037037037037, "train_speed(iter/s)": 0.020445 }, { "epoch": 1.4094631483166515, "grad_norm": 1.3328252272724603, "learning_rate": 2.1847601679715263e-05, "loss": 0.8002717971801758, "memory(GiB)": 76.02, "step": 775, "token_acc": 0.7140373750543242, "train_speed(iter/s)": 0.020454 }, { "epoch": 1.4185623293903549, "grad_norm": 1.265718534410907, "learning_rate": 2.1657589830369113e-05, "loss": 0.8017659187316895, "memory(GiB)": 76.02, "step": 780, "token_acc": 0.7063737623762376, "train_speed(iter/s)": 0.020462 }, { "epoch": 1.4276615104640582, "grad_norm": 0.9977051429918016, "learning_rate": 2.146742720706441e-05, "loss": 0.7789717674255371, "memory(GiB)": 76.02, "step": 785, "token_acc": 0.710708782742681, "train_speed(iter/s)": 0.02047 }, { "epoch": 1.4367606915377615, "grad_norm": 1.0283878536421338, "learning_rate": 2.127713110694606e-05, "loss": 0.8202502250671386, "memory(GiB)": 76.02, "step": 790, "token_acc": 0.707347972972973, "train_speed(iter/s)": 0.020478 }, { "epoch": 1.4458598726114649, "grad_norm": 1.0457464903588745, "learning_rate": 2.1086718839299972e-05, "loss": 0.7791718482971192, "memory(GiB)": 76.02, "step": 795, "token_acc": 0.7183828610919143, "train_speed(iter/s)": 0.020486 }, { "epoch": 1.4549590536851684, "grad_norm": 1.1827863278388744, "learning_rate": 2.0896207723978637e-05, "loss": 0.8088536262512207, "memory(GiB)": 76.02, "step": 800, "token_acc": 0.7157598499061913, "train_speed(iter/s)": 0.020494 }, { "epoch": 1.4549590536851684, "eval_loss": 0.4799867272377014, "eval_runtime": 120.658, "eval_samples_per_second": 47.838, "eval_steps_per_second": 0.464, "eval_token_acc": 0.6916009596129183, "step": 800 }, { "epoch": 1.4640582347588718, "grad_norm": 1.1034251914058373, "learning_rate": 2.070561508982571e-05, "loss": 0.7959201335906982, "memory(GiB)": 76.02, "step": 805, "token_acc": 0.7082542694497154, "train_speed(iter/s)": 0.020414 }, { "epoch": 1.473157415832575, "grad_norm": 1.1403649470949677, "learning_rate": 2.0514958273099778e-05, "loss": 0.8099080085754394, "memory(GiB)": 76.02, "step": 810, "token_acc": 0.6938775510204082, "train_speed(iter/s)": 0.020423 }, { "epoch": 1.4822565969062784, "grad_norm": 1.242956861788932, "learning_rate": 2.0324254615897438e-05, "loss": 0.7870995044708252, "memory(GiB)": 76.02, "step": 815, "token_acc": 0.6989182692307693, "train_speed(iter/s)": 0.020431 }, { "epoch": 1.4913557779799818, "grad_norm": 1.2480879646871645, "learning_rate": 2.0133521464575915e-05, "loss": 0.8157112121582031, "memory(GiB)": 76.02, "step": 820, "token_acc": 0.6917945296864576, "train_speed(iter/s)": 0.020438 }, { "epoch": 1.5004549590536853, "grad_norm": 1.4455782166201527, "learning_rate": 1.99427761681752e-05, "loss": 0.7882473945617676, "memory(GiB)": 76.02, "step": 825, "token_acc": 0.7195308516063234, "train_speed(iter/s)": 0.020446 }, { "epoch": 1.5095541401273884, "grad_norm": 1.129414363377021, "learning_rate": 1.9752036076839988e-05, "loss": 0.7893435955047607, "memory(GiB)": 76.02, "step": 830, "token_acc": 0.7249863313285949, "train_speed(iter/s)": 0.020454 }, { "epoch": 1.518653321201092, "grad_norm": 1.1611426190154455, "learning_rate": 1.9561318540241528e-05, "loss": 0.7893610000610352, "memory(GiB)": 76.02, "step": 835, "token_acc": 0.7279521674140508, "train_speed(iter/s)": 0.020463 }, { "epoch": 1.5277525022747953, "grad_norm": 1.387275557971045, "learning_rate": 1.93706409059995e-05, "loss": 0.7986185073852539, "memory(GiB)": 76.02, "step": 840, "token_acc": 0.7054386661373561, "train_speed(iter/s)": 0.02047 }, { "epoch": 1.5368516833484986, "grad_norm": 1.1029714828712447, "learning_rate": 1.9180020518104088e-05, "loss": 0.7868841171264649, "memory(GiB)": 76.02, "step": 845, "token_acc": 0.7180851063829787, "train_speed(iter/s)": 0.020478 }, { "epoch": 1.545950864422202, "grad_norm": 1.055709561997052, "learning_rate": 1.898947471533833e-05, "loss": 0.7913725852966309, "memory(GiB)": 76.02, "step": 850, "token_acc": 0.6924932167621345, "train_speed(iter/s)": 0.020486 }, { "epoch": 1.545950864422202, "eval_loss": 0.4763409495353699, "eval_runtime": 119.4883, "eval_samples_per_second": 48.306, "eval_steps_per_second": 0.469, "eval_token_acc": 0.6927134956692006, "step": 850 }, { "epoch": 1.5550500454959053, "grad_norm": 1.1281157034877283, "learning_rate": 1.8799020829701036e-05, "loss": 0.8020171165466309, "memory(GiB)": 76.02, "step": 855, "token_acc": 0.7118734923612973, "train_speed(iter/s)": 0.020415 }, { "epoch": 1.5641492265696089, "grad_norm": 1.0786368581164274, "learning_rate": 1.860867618483027e-05, "loss": 0.7822349071502686, "memory(GiB)": 76.02, "step": 860, "token_acc": 0.6926726410121244, "train_speed(iter/s)": 0.020423 }, { "epoch": 1.573248407643312, "grad_norm": 1.2124940318046376, "learning_rate": 1.8418458094427567e-05, "loss": 0.7907929420471191, "memory(GiB)": 76.02, "step": 865, "token_acc": 0.7004744958481613, "train_speed(iter/s)": 0.02043 }, { "epoch": 1.5823475887170155, "grad_norm": 1.087815247895776, "learning_rate": 1.82283838606831e-05, "loss": 0.78410964012146, "memory(GiB)": 76.02, "step": 870, "token_acc": 0.7159194876486734, "train_speed(iter/s)": 0.020438 }, { "epoch": 1.5914467697907189, "grad_norm": 1.033926015572944, "learning_rate": 1.803847077270188e-05, "loss": 0.786978006362915, "memory(GiB)": 76.02, "step": 875, "token_acc": 0.7101845522898155, "train_speed(iter/s)": 0.020445 }, { "epoch": 1.6005459508644222, "grad_norm": 1.162364059290432, "learning_rate": 1.7848736104931142e-05, "loss": 0.7876530647277832, "memory(GiB)": 76.02, "step": 880, "token_acc": 0.7407407407407407, "train_speed(iter/s)": 0.020452 }, { "epoch": 1.6096451319381255, "grad_norm": 1.0965939407284515, "learning_rate": 1.765919711558906e-05, "loss": 0.7792027473449707, "memory(GiB)": 76.02, "step": 885, "token_acc": 0.7125279642058165, "train_speed(iter/s)": 0.020459 }, { "epoch": 1.6187443130118289, "grad_norm": 1.1822482702836845, "learning_rate": 1.746987104509494e-05, "loss": 0.7893452644348145, "memory(GiB)": 76.02, "step": 890, "token_acc": 0.6998714652956298, "train_speed(iter/s)": 0.020466 }, { "epoch": 1.6278434940855324, "grad_norm": 1.0733217293598245, "learning_rate": 1.7280775114501057e-05, "loss": 0.7864848613739014, "memory(GiB)": 76.02, "step": 895, "token_acc": 0.7469492614001284, "train_speed(iter/s)": 0.020474 }, { "epoch": 1.6369426751592355, "grad_norm": 0.9761043125519061, "learning_rate": 1.7091926523926205e-05, "loss": 0.7935813426971435, "memory(GiB)": 76.02, "step": 900, "token_acc": 0.7378048780487805, "train_speed(iter/s)": 0.020481 }, { "epoch": 1.6369426751592355, "eval_loss": 0.4734553098678589, "eval_runtime": 120.2875, "eval_samples_per_second": 47.985, "eval_steps_per_second": 0.466, "eval_token_acc": 0.6929117293665017, "step": 900 }, { "epoch": 1.646041856232939, "grad_norm": 0.9775890422129749, "learning_rate": 1.6903342450991203e-05, "loss": 0.7867559909820556, "memory(GiB)": 76.02, "step": 905, "token_acc": 0.7061556329849012, "train_speed(iter/s)": 0.020416 }, { "epoch": 1.6551410373066424, "grad_norm": 1.032340730518062, "learning_rate": 1.6715040049256393e-05, "loss": 0.7743623733520508, "memory(GiB)": 76.02, "step": 910, "token_acc": 0.7131681877444589, "train_speed(iter/s)": 0.020423 }, { "epoch": 1.6642402183803457, "grad_norm": 1.0919952776609756, "learning_rate": 1.6527036446661396e-05, "loss": 0.7813485145568848, "memory(GiB)": 76.02, "step": 915, "token_acc": 0.7281947261663286, "train_speed(iter/s)": 0.02043 }, { "epoch": 1.673339399454049, "grad_norm": 1.2303788872377346, "learning_rate": 1.6339348743967126e-05, "loss": 0.7993118762969971, "memory(GiB)": 76.02, "step": 920, "token_acc": 0.7152953054013125, "train_speed(iter/s)": 0.020438 }, { "epoch": 1.6824385805277524, "grad_norm": 1.118393217178591, "learning_rate": 1.6151994013200325e-05, "loss": 0.7818034648895263, "memory(GiB)": 76.02, "step": 925, "token_acc": 0.7246165084002922, "train_speed(iter/s)": 0.020445 }, { "epoch": 1.691537761601456, "grad_norm": 1.2781086578084908, "learning_rate": 1.5964989296100682e-05, "loss": 0.7822434902191162, "memory(GiB)": 76.02, "step": 930, "token_acc": 0.7342391304347826, "train_speed(iter/s)": 0.020452 }, { "epoch": 1.700636942675159, "grad_norm": 1.0706561030394075, "learning_rate": 1.5778351602570742e-05, "loss": 0.7954679965972901, "memory(GiB)": 76.02, "step": 935, "token_acc": 0.7032355915065723, "train_speed(iter/s)": 0.020459 }, { "epoch": 1.7097361237488626, "grad_norm": 1.2217572797748102, "learning_rate": 1.5592097909128673e-05, "loss": 0.7845365524291992, "memory(GiB)": 76.02, "step": 940, "token_acc": 0.7320365224295355, "train_speed(iter/s)": 0.020466 }, { "epoch": 1.718835304822566, "grad_norm": 1.2477451151406387, "learning_rate": 1.5406245157364093e-05, "loss": 0.7835155010223389, "memory(GiB)": 76.02, "step": 945, "token_acc": 0.7151702786377709, "train_speed(iter/s)": 0.020473 }, { "epoch": 1.7279344858962693, "grad_norm": 1.1968781249693217, "learning_rate": 1.5220810252397054e-05, "loss": 0.7988658905029297, "memory(GiB)": 76.02, "step": 950, "token_acc": 0.7049180327868853, "train_speed(iter/s)": 0.020479 }, { "epoch": 1.7279344858962693, "eval_loss": 0.4713653028011322, "eval_runtime": 120.1658, "eval_samples_per_second": 48.034, "eval_steps_per_second": 0.466, "eval_token_acc": 0.6942872284906324, "step": 950 }, { "epoch": 1.7370336669699729, "grad_norm": 0.9590399831837186, "learning_rate": 1.5035810061340376e-05, "loss": 0.7818658828735352, "memory(GiB)": 76.02, "step": 955, "token_acc": 0.7186684073107049, "train_speed(iter/s)": 0.020417 }, { "epoch": 1.746132848043676, "grad_norm": 1.311812274039409, "learning_rate": 1.4851261411765414e-05, "loss": 0.7812034130096436, "memory(GiB)": 76.02, "step": 960, "token_acc": 0.7130058696323757, "train_speed(iter/s)": 0.020424 }, { "epoch": 1.7552320291173795, "grad_norm": 1.2066428640501157, "learning_rate": 1.4667181090171418e-05, "loss": 0.7740418910980225, "memory(GiB)": 76.02, "step": 965, "token_acc": 0.7142857142857143, "train_speed(iter/s)": 0.02043 }, { "epoch": 1.7643312101910829, "grad_norm": 1.1309046997472656, "learning_rate": 1.4483585840458632e-05, "loss": 0.7716457843780518, "memory(GiB)": 76.02, "step": 970, "token_acc": 0.7535986452159187, "train_speed(iter/s)": 0.020437 }, { "epoch": 1.7734303912647862, "grad_norm": 1.0597243121965947, "learning_rate": 1.4300492362405296e-05, "loss": 0.7900642871856689, "memory(GiB)": 76.02, "step": 975, "token_acc": 0.7184942716857611, "train_speed(iter/s)": 0.020444 }, { "epoch": 1.7825295723384895, "grad_norm": 0.9136761859628779, "learning_rate": 1.4117917310148624e-05, "loss": 0.7912971019744873, "memory(GiB)": 76.02, "step": 980, "token_acc": 0.7580794090489381, "train_speed(iter/s)": 0.02045 }, { "epoch": 1.7916287534121929, "grad_norm": 1.123085792919359, "learning_rate": 1.3935877290669932e-05, "loss": 0.7823569774627686, "memory(GiB)": 76.02, "step": 985, "token_acc": 0.7234323432343235, "train_speed(iter/s)": 0.020457 }, { "epoch": 1.8007279344858964, "grad_norm": 1.1608781306244833, "learning_rate": 1.375438886228411e-05, "loss": 0.7732644081115723, "memory(GiB)": 76.02, "step": 990, "token_acc": 0.6950644451430368, "train_speed(iter/s)": 0.020464 }, { "epoch": 1.8098271155595995, "grad_norm": 1.1283275236864316, "learning_rate": 1.3573468533133442e-05, "loss": 0.7756358623504639, "memory(GiB)": 76.02, "step": 995, "token_acc": 0.7115031238515251, "train_speed(iter/s)": 0.02047 }, { "epoch": 1.818926296633303, "grad_norm": 1.0540865657542784, "learning_rate": 1.3393132759686064e-05, "loss": 0.7759748935699463, "memory(GiB)": 76.02, "step": 1000, "token_acc": 0.6963375057950858, "train_speed(iter/s)": 0.020477 }, { "epoch": 1.818926296633303, "eval_loss": 0.4693294167518616, "eval_runtime": 119.7422, "eval_samples_per_second": 48.204, "eval_steps_per_second": 0.468, "eval_token_acc": 0.6942063167774483, "step": 1000 }, { "epoch": 1.8280254777070064, "grad_norm": 1.2097721619516764, "learning_rate": 1.3213397945239053e-05, "loss": 0.7718574047088623, "memory(GiB)": 76.02, "step": 1005, "token_acc": 0.7104117843990626, "train_speed(iter/s)": 0.020419 }, { "epoch": 1.8371246587807097, "grad_norm": 1.3429375958388912, "learning_rate": 1.303428043842641e-05, "loss": 0.7779555320739746, "memory(GiB)": 76.02, "step": 1010, "token_acc": 0.7344594594594595, "train_speed(iter/s)": 0.020425 }, { "epoch": 1.846223839854413, "grad_norm": 1.1502202864135298, "learning_rate": 1.2855796531731994e-05, "loss": 0.784113597869873, "memory(GiB)": 76.02, "step": 1015, "token_acc": 0.7116066903193107, "train_speed(iter/s)": 0.020432 }, { "epoch": 1.8553230209281164, "grad_norm": 0.9764736580354538, "learning_rate": 1.2677962460007555e-05, "loss": 0.769007682800293, "memory(GiB)": 76.02, "step": 1020, "token_acc": 0.7275985663082437, "train_speed(iter/s)": 0.020439 }, { "epoch": 1.86442220200182, "grad_norm": 1.0395064733034296, "learning_rate": 1.2500794398996004e-05, "loss": 0.7842848300933838, "memory(GiB)": 76.02, "step": 1025, "token_acc": 0.7331868131868132, "train_speed(iter/s)": 0.020445 }, { "epoch": 1.873521383075523, "grad_norm": 1.1556386067848643, "learning_rate": 1.2324308463860089e-05, "loss": 0.7766573905944825, "memory(GiB)": 76.02, "step": 1030, "token_acc": 0.729426433915212, "train_speed(iter/s)": 0.020451 }, { "epoch": 1.8826205641492266, "grad_norm": 1.261343214410371, "learning_rate": 1.2148520707716567e-05, "loss": 0.7785522937774658, "memory(GiB)": 76.02, "step": 1035, "token_acc": 0.7095070422535211, "train_speed(iter/s)": 0.020458 }, { "epoch": 1.89171974522293, "grad_norm": 1.3077190411896333, "learning_rate": 1.1973447120175998e-05, "loss": 0.7712287425994873, "memory(GiB)": 76.02, "step": 1040, "token_acc": 0.6994839221913458, "train_speed(iter/s)": 0.020464 }, { "epoch": 1.9008189262966333, "grad_norm": 1.0009654605437637, "learning_rate": 1.1799103625888342e-05, "loss": 0.7672115802764893, "memory(GiB)": 76.02, "step": 1045, "token_acc": 0.7111845210004719, "train_speed(iter/s)": 0.020471 }, { "epoch": 1.9099181073703366, "grad_norm": 1.1500066718260178, "learning_rate": 1.162550608309446e-05, "loss": 0.7593209743499756, "memory(GiB)": 76.02, "step": 1050, "token_acc": 0.7720478325859492, "train_speed(iter/s)": 0.020477 }, { "epoch": 1.9099181073703366, "eval_loss": 0.46374601125717163, "eval_runtime": 119.6783, "eval_samples_per_second": 48.229, "eval_steps_per_second": 0.468, "eval_token_acc": 0.6953795366186186, "step": 1050 }, { "epoch": 1.91901728844404, "grad_norm": 1.0354960902542707, "learning_rate": 1.1452670282183664e-05, "loss": 0.7757611274719238, "memory(GiB)": 76.02, "step": 1055, "token_acc": 0.7227655986509275, "train_speed(iter/s)": 0.02042 }, { "epoch": 1.9281164695177435, "grad_norm": 1.1181099943024946, "learning_rate": 1.12806119442574e-05, "loss": 0.7624452590942383, "memory(GiB)": 76.02, "step": 1060, "token_acc": 0.7370562130177515, "train_speed(iter/s)": 0.020426 }, { "epoch": 1.9372156505914466, "grad_norm": 1.020900947874345, "learning_rate": 1.1109346719699263e-05, "loss": 0.7685122489929199, "memory(GiB)": 76.02, "step": 1065, "token_acc": 0.7123585726718886, "train_speed(iter/s)": 0.020432 }, { "epoch": 1.9463148316651502, "grad_norm": 1.0619107995533037, "learning_rate": 1.0938890186751487e-05, "loss": 0.7687143325805664, "memory(GiB)": 76.02, "step": 1070, "token_acc": 0.7249620637329287, "train_speed(iter/s)": 0.020439 }, { "epoch": 1.9554140127388535, "grad_norm": 1.0950602334931028, "learning_rate": 1.0769257850097881e-05, "loss": 0.7737876415252686, "memory(GiB)": 76.02, "step": 1075, "token_acc": 0.6985485671752885, "train_speed(iter/s)": 0.020445 }, { "epoch": 1.9645131938125568, "grad_norm": 1.307250719010874, "learning_rate": 1.060046513945361e-05, "loss": 0.7766946792602539, "memory(GiB)": 76.02, "step": 1080, "token_acc": 0.7377892030848329, "train_speed(iter/s)": 0.020451 }, { "epoch": 1.9736123748862604, "grad_norm": 1.1430361120086814, "learning_rate": 1.0432527408161597e-05, "loss": 0.7805325031280518, "memory(GiB)": 76.02, "step": 1085, "token_acc": 0.7078861409239384, "train_speed(iter/s)": 0.020457 }, { "epoch": 1.9827115559599635, "grad_norm": 1.002916433279442, "learning_rate": 1.026545993179612e-05, "loss": 0.7858685493469239, "memory(GiB)": 76.02, "step": 1090, "token_acc": 0.7466666666666667, "train_speed(iter/s)": 0.020463 }, { "epoch": 1.991810737033667, "grad_norm": 1.0871219922265896, "learning_rate": 1.009927790677327e-05, "loss": 0.7784292697906494, "memory(GiB)": 76.02, "step": 1095, "token_acc": 0.7174170616113744, "train_speed(iter/s)": 0.020469 }, { "epoch": 2.0, "grad_norm": 1.7655187909978691, "learning_rate": 9.933996448968688e-06, "loss": 0.7408246994018555, "memory(GiB)": 76.02, "step": 1100, "token_acc": 0.7477064220183486, "train_speed(iter/s)": 0.020483 }, { "epoch": 2.0, "eval_loss": 0.4639655649662018, "eval_runtime": 118.882, "eval_samples_per_second": 48.552, "eval_steps_per_second": 0.471, "eval_token_acc": 0.6956344085151487, "step": 1100 }, { "epoch": 2.0090991810737036, "grad_norm": 1.1083572508394148, "learning_rate": 9.769630592342643e-06, "loss": 0.6631475925445557, "memory(GiB)": 76.02, "step": 1105, "token_acc": 0.732795337368303, "train_speed(iter/s)": 0.020423 }, { "epoch": 2.0181983621474067, "grad_norm": 1.1068844322629663, "learning_rate": 9.606195287572577e-06, "loss": 0.6467893600463868, "memory(GiB)": 76.02, "step": 1110, "token_acc": 0.7836676217765043, "train_speed(iter/s)": 0.020427 }, { "epoch": 2.02729754322111, "grad_norm": 1.1238716711584054, "learning_rate": 9.443705400693133e-06, "loss": 0.6334795475006103, "memory(GiB)": 76.02, "step": 1115, "token_acc": 0.746772864597638, "train_speed(iter/s)": 0.020432 }, { "epoch": 2.0363967242948133, "grad_norm": 0.9545754331665411, "learning_rate": 9.282175711744012e-06, "loss": 0.643845796585083, "memory(GiB)": 76.02, "step": 1120, "token_acc": 0.783322390019698, "train_speed(iter/s)": 0.020438 }, { "epoch": 2.045495905368517, "grad_norm": 1.112189160795635, "learning_rate": 9.121620913425508e-06, "loss": 0.6376824378967285, "memory(GiB)": 76.02, "step": 1125, "token_acc": 0.7677035076108537, "train_speed(iter/s)": 0.020444 }, { "epoch": 2.05459508644222, "grad_norm": 1.069654016986732, "learning_rate": 8.962055609762143e-06, "loss": 0.6328807353973389, "memory(GiB)": 76.02, "step": 1130, "token_acc": 0.7605409705648369, "train_speed(iter/s)": 0.020449 }, { "epoch": 2.0636942675159236, "grad_norm": 1.134992866714782, "learning_rate": 8.803494314774241e-06, "loss": 0.6297794342041015, "memory(GiB)": 76.02, "step": 1135, "token_acc": 0.7869767441860465, "train_speed(iter/s)": 0.020456 }, { "epoch": 2.072793448589627, "grad_norm": 1.1668054237375585, "learning_rate": 8.645951451157741e-06, "loss": 0.6355114459991456, "memory(GiB)": 76.02, "step": 1140, "token_acc": 0.7761146496815287, "train_speed(iter/s)": 0.020462 }, { "epoch": 2.08189262966333, "grad_norm": 1.1864938776830725, "learning_rate": 8.489441348972312e-06, "loss": 0.6331965923309326, "memory(GiB)": 76.02, "step": 1145, "token_acc": 0.7740963855421686, "train_speed(iter/s)": 0.020468 }, { "epoch": 2.0909918107370338, "grad_norm": 1.0454450783179292, "learning_rate": 8.333978244337921e-06, "loss": 0.6294968605041504, "memory(GiB)": 76.02, "step": 1150, "token_acc": 0.77819937909624, "train_speed(iter/s)": 0.020473 }, { "epoch": 2.0909918107370338, "eval_loss": 0.47781530022621155, "eval_runtime": 120.0715, "eval_samples_per_second": 48.071, "eval_steps_per_second": 0.466, "eval_token_acc": 0.6904358309430665, "step": 1150 }, { "epoch": 2.100090991810737, "grad_norm": 0.9951435698165627, "learning_rate": 8.179576278139872e-06, "loss": 0.6304058074951172, "memory(GiB)": 76.02, "step": 1155, "token_acc": 0.7404277792447848, "train_speed(iter/s)": 0.020422 }, { "epoch": 2.1091901728844404, "grad_norm": 1.067908969484696, "learning_rate": 8.026249494742617e-06, "loss": 0.6222400665283203, "memory(GiB)": 76.02, "step": 1160, "token_acc": 0.7715277777777778, "train_speed(iter/s)": 0.020428 }, { "epoch": 2.1182893539581436, "grad_norm": 1.057238882123902, "learning_rate": 7.874011840712197e-06, "loss": 0.6318105697631836, "memory(GiB)": 76.02, "step": 1165, "token_acc": 0.7550738007380073, "train_speed(iter/s)": 0.020433 }, { "epoch": 2.127388535031847, "grad_norm": 1.0798825041809057, "learning_rate": 7.72287716354776e-06, "loss": 0.6285967350006103, "memory(GiB)": 76.02, "step": 1170, "token_acc": 0.7547770700636943, "train_speed(iter/s)": 0.020439 }, { "epoch": 2.1364877161055507, "grad_norm": 1.0478822425834018, "learning_rate": 7.572859210421945e-06, "loss": 0.6234595775604248, "memory(GiB)": 76.02, "step": 1175, "token_acc": 0.7690631808278867, "train_speed(iter/s)": 0.020444 }, { "epoch": 2.1455868971792538, "grad_norm": 0.9867274025718497, "learning_rate": 7.423971626930435e-06, "loss": 0.6359669685363769, "memory(GiB)": 76.02, "step": 1180, "token_acc": 0.7695961995249406, "train_speed(iter/s)": 0.02045 }, { "epoch": 2.1546860782529573, "grad_norm": 1.0045378569587455, "learning_rate": 7.276227955850774e-06, "loss": 0.6464476585388184, "memory(GiB)": 76.02, "step": 1185, "token_acc": 0.7841451766953199, "train_speed(iter/s)": 0.020455 }, { "epoch": 2.1637852593266604, "grad_norm": 1.022012980465645, "learning_rate": 7.12964163591054e-06, "loss": 0.6201572895050049, "memory(GiB)": 76.02, "step": 1190, "token_acc": 0.74373795761079, "train_speed(iter/s)": 0.020461 }, { "epoch": 2.172884440400364, "grad_norm": 1.2093399237034956, "learning_rate": 6.984226000564907e-06, "loss": 0.6306787490844726, "memory(GiB)": 76.02, "step": 1195, "token_acc": 0.7755102040816326, "train_speed(iter/s)": 0.020467 }, { "epoch": 2.1819836214740675, "grad_norm": 0.966059090473921, "learning_rate": 6.8399942767839075e-06, "loss": 0.6421105861663818, "memory(GiB)": 76.02, "step": 1200, "token_acc": 0.7779262426509888, "train_speed(iter/s)": 0.020473 }, { "epoch": 2.1819836214740675, "eval_loss": 0.47876349091529846, "eval_runtime": 119.7281, "eval_samples_per_second": 48.209, "eval_steps_per_second": 0.468, "eval_token_acc": 0.6895983947116104, "step": 1200 }, { "epoch": 2.1910828025477707, "grad_norm": 1.069591399453908, "learning_rate": 6.696959583849228e-06, "loss": 0.6228060245513916, "memory(GiB)": 76.02, "step": 1205, "token_acc": 0.725686591276252, "train_speed(iter/s)": 0.020424 }, { "epoch": 2.200181983621474, "grad_norm": 1.0705675997492539, "learning_rate": 6.5551349321609585e-06, "loss": 0.6346144676208496, "memory(GiB)": 76.02, "step": 1210, "token_acc": 0.7361563517915309, "train_speed(iter/s)": 0.020429 }, { "epoch": 2.2092811646951773, "grad_norm": 0.99473395335189, "learning_rate": 6.414533222054138e-06, "loss": 0.6288974761962891, "memory(GiB)": 76.02, "step": 1215, "token_acc": 0.7661224489795918, "train_speed(iter/s)": 0.020435 }, { "epoch": 2.218380345768881, "grad_norm": 1.0273110672808459, "learning_rate": 6.275167242625331e-06, "loss": 0.6033660411834717, "memory(GiB)": 76.02, "step": 1220, "token_acc": 0.7424931756141947, "train_speed(iter/s)": 0.02044 }, { "epoch": 2.227479526842584, "grad_norm": 1.1134175189046431, "learning_rate": 6.137049670569344e-06, "loss": 0.6237975120544433, "memory(GiB)": 76.02, "step": 1225, "token_acc": 0.7610619469026548, "train_speed(iter/s)": 0.020445 }, { "epoch": 2.2365787079162875, "grad_norm": 1.0391880977302441, "learning_rate": 6.000193069026181e-06, "loss": 0.633206558227539, "memory(GiB)": 76.02, "step": 1230, "token_acc": 0.7656550134460238, "train_speed(iter/s)": 0.020451 }, { "epoch": 2.245677888989991, "grad_norm": 1.1575554243921846, "learning_rate": 5.8646098864382525e-06, "loss": 0.6448534488677978, "memory(GiB)": 76.02, "step": 1235, "token_acc": 0.7768777614138439, "train_speed(iter/s)": 0.020456 }, { "epoch": 2.254777070063694, "grad_norm": 1.0130550727371117, "learning_rate": 5.730312455418134e-06, "loss": 0.6195736408233643, "memory(GiB)": 76.02, "step": 1240, "token_acc": 0.7690447400241838, "train_speed(iter/s)": 0.020461 }, { "epoch": 2.2638762511373978, "grad_norm": 1.0895008794001835, "learning_rate": 5.597312991626713e-06, "loss": 0.6155508041381836, "memory(GiB)": 76.02, "step": 1245, "token_acc": 0.7842149454240135, "train_speed(iter/s)": 0.020466 }, { "epoch": 2.272975432211101, "grad_norm": 1.0868616738166854, "learning_rate": 5.465623592662137e-06, "loss": 0.6290598392486573, "memory(GiB)": 76.02, "step": 1250, "token_acc": 0.7843260188087774, "train_speed(iter/s)": 0.020471 }, { "epoch": 2.272975432211101, "eval_loss": 0.47770801186561584, "eval_runtime": 119.4212, "eval_samples_per_second": 48.333, "eval_steps_per_second": 0.469, "eval_token_acc": 0.6896631240821578, "step": 1250 }, { "epoch": 2.2820746132848044, "grad_norm": 1.0252310733499297, "learning_rate": 5.335256236959379e-06, "loss": 0.6228739261627197, "memory(GiB)": 76.02, "step": 1255, "token_acc": 0.7295555555555555, "train_speed(iter/s)": 0.020423 }, { "epoch": 2.2911737943585075, "grad_norm": 1.1274971851401754, "learning_rate": 5.206222782700667e-06, "loss": 0.6328925609588623, "memory(GiB)": 76.02, "step": 1260, "token_acc": 0.772992700729927, "train_speed(iter/s)": 0.020428 }, { "epoch": 2.300272975432211, "grad_norm": 0.9968940954527525, "learning_rate": 5.078534966736895e-06, "loss": 0.6318979740142823, "memory(GiB)": 76.02, "step": 1265, "token_acc": 0.766875691626706, "train_speed(iter/s)": 0.020433 }, { "epoch": 2.3093721565059147, "grad_norm": 1.0466074457299364, "learning_rate": 4.952204403520042e-06, "loss": 0.6296024799346924, "memory(GiB)": 76.02, "step": 1270, "token_acc": 0.7647696476964769, "train_speed(iter/s)": 0.020438 }, { "epoch": 2.3184713375796178, "grad_norm": 1.059039551077919, "learning_rate": 4.827242584046698e-06, "loss": 0.6291126251220703, "memory(GiB)": 76.02, "step": 1275, "token_acc": 0.7655979202772963, "train_speed(iter/s)": 0.020443 }, { "epoch": 2.3275705186533213, "grad_norm": 1.1223580815679548, "learning_rate": 4.70366087481289e-06, "loss": 0.620822811126709, "memory(GiB)": 76.02, "step": 1280, "token_acc": 0.7782307378719935, "train_speed(iter/s)": 0.020448 }, { "epoch": 2.3366696997270244, "grad_norm": 1.0233004088935174, "learning_rate": 4.581470516780115e-06, "loss": 0.6297062873840332, "memory(GiB)": 76.02, "step": 1285, "token_acc": 0.7572519083969466, "train_speed(iter/s)": 0.020453 }, { "epoch": 2.345768880800728, "grad_norm": 1.0470029791397224, "learning_rate": 4.460682624352952e-06, "loss": 0.625699806213379, "memory(GiB)": 76.02, "step": 1290, "token_acc": 0.7591605596269154, "train_speed(iter/s)": 0.020458 }, { "epoch": 2.3548680618744315, "grad_norm": 0.915808456859335, "learning_rate": 4.34130818436805e-06, "loss": 0.6242890357971191, "memory(GiB)": 76.02, "step": 1295, "token_acc": 0.7637987012987013, "train_speed(iter/s)": 0.020462 }, { "epoch": 2.3639672429481347, "grad_norm": 0.9679022008759249, "learning_rate": 4.223358055094762e-06, "loss": 0.6215915203094482, "memory(GiB)": 76.02, "step": 1300, "token_acc": 0.7939560439560439, "train_speed(iter/s)": 0.020467 }, { "epoch": 2.3639672429481347, "eval_loss": 0.4746646285057068, "eval_runtime": 120.7999, "eval_samples_per_second": 47.782, "eval_steps_per_second": 0.464, "eval_token_acc": 0.6904034662577928, "step": 1300 }, { "epoch": 2.3767060964513194, "grad_norm": 1.0864039116899251, "learning_rate": 4.106842965247497e-06, "loss": 0.607478666305542, "memory(GiB)": 53.99, "step": 1305, "token_acc": 0.777601899485556, "train_speed(iter/s)": 4.038665 }, { "epoch": 2.385805277525023, "grad_norm": 0.955554735442322, "learning_rate": 3.991773513009849e-06, "loss": 0.6158496856689453, "memory(GiB)": 53.99, "step": 1310, "token_acc": 0.7964731814842028, "train_speed(iter/s)": 2.330085 }, { "epoch": 2.394904458598726, "grad_norm": 1.0615963891170637, "learning_rate": 3.87816016507055e-06, "loss": 0.6333821296691895, "memory(GiB)": 53.99, "step": 1315, "token_acc": 0.7811782708492732, "train_speed(iter/s)": 1.665234 }, { "epoch": 2.4040036396724296, "grad_norm": 1.148829953509744, "learning_rate": 3.766013255671479e-06, "loss": 0.6272965908050537, "memory(GiB)": 53.99, "step": 1320, "token_acc": 0.7688679245283019, "train_speed(iter/s)": 1.297177 }, { "epoch": 2.4131028207461327, "grad_norm": 1.0891236462035252, "learning_rate": 3.6553429856675915e-06, "loss": 0.6266043663024903, "memory(GiB)": 77.52, "step": 1325, "token_acc": 0.7914959016393442, "train_speed(iter/s)": 1.06612 }, { "epoch": 2.4222020018198362, "grad_norm": 1.1117445945203506, "learning_rate": 3.5461594215991247e-06, "loss": 0.6159255981445313, "memory(GiB)": 77.52, "step": 1330, "token_acc": 0.7893491124260354, "train_speed(iter/s)": 0.90399 }, { "epoch": 2.43130118289354, "grad_norm": 0.9824968556280764, "learning_rate": 3.438472494775902e-06, "loss": 0.6225139141082764, "memory(GiB)": 77.52, "step": 1335, "token_acc": 0.7502756339581036, "train_speed(iter/s)": 0.785373 }, { "epoch": 2.440400363967243, "grad_norm": 0.9912665739642537, "learning_rate": 3.3322920003739913e-06, "loss": 0.6153748989105224, "memory(GiB)": 77.52, "step": 1340, "token_acc": 0.790268456375839, "train_speed(iter/s)": 0.696672 }, { "epoch": 2.4494995450409465, "grad_norm": 1.064566119713343, "learning_rate": 3.227627596544738e-06, "loss": 0.6232125759124756, "memory(GiB)": 77.52, "step": 1345, "token_acc": 0.7880870561282932, "train_speed(iter/s)": 0.625452 }, { "epoch": 2.4585987261146496, "grad_norm": 1.0308754966071667, "learning_rate": 3.1244888035362875e-06, "loss": 0.6144218444824219, "memory(GiB)": 77.52, "step": 1350, "token_acc": 0.7680478428022213, "train_speed(iter/s)": 0.569157 }, { "epoch": 2.4585987261146496, "eval_loss": 0.4741266369819641, "eval_runtime": 123.1251, "eval_samples_per_second": 46.879, "eval_steps_per_second": 0.455, "eval_token_acc": 0.6901121840903298, "step": 1350 }, { "epoch": 2.467697907188353, "grad_norm": 1.1124385874562812, "learning_rate": 3.0228850028275803e-06, "loss": 0.6197083950042724, "memory(GiB)": 77.52, "step": 1355, "token_acc": 0.7441558441558441, "train_speed(iter/s)": 0.491445 }, { "epoch": 2.4767970882620562, "grad_norm": 1.0177391538655736, "learning_rate": 2.922825436275061e-06, "loss": 0.6326658248901367, "memory(GiB)": 77.52, "step": 1360, "token_acc": 0.774859287054409, "train_speed(iter/s)": 0.456689 }, { "epoch": 2.48589626933576, "grad_norm": 0.9939709571788379, "learning_rate": 2.8243192052719902e-06, "loss": 0.6353094577789307, "memory(GiB)": 77.52, "step": 1365, "token_acc": 0.7515923566878981, "train_speed(iter/s)": 0.426316 }, { "epoch": 2.494995450409463, "grad_norm": 1.0864856971626622, "learning_rate": 2.72737526992064e-06, "loss": 0.6143672466278076, "memory(GiB)": 77.52, "step": 1370, "token_acc": 0.800497203231821, "train_speed(iter/s)": 0.399977 }, { "epoch": 2.5040946314831665, "grad_norm": 0.9778765243753255, "learning_rate": 2.6320024482172592e-06, "loss": 0.6241840362548828, "memory(GiB)": 77.52, "step": 1375, "token_acc": 0.7901711761457758, "train_speed(iter/s)": 0.376966 }, { "epoch": 2.51319381255687, "grad_norm": 0.963647645236081, "learning_rate": 2.5382094152499705e-06, "loss": 0.635280704498291, "memory(GiB)": 77.52, "step": 1380, "token_acc": 0.7607636068237206, "train_speed(iter/s)": 0.356417 }, { "epoch": 2.522292993630573, "grad_norm": 0.9666636858906085, "learning_rate": 2.4460047024097144e-06, "loss": 0.6261641502380371, "memory(GiB)": 77.52, "step": 1385, "token_acc": 0.7655134541460736, "train_speed(iter/s)": 0.338341 }, { "epoch": 2.5313921747042767, "grad_norm": 0.9689736671771748, "learning_rate": 2.3553966966142384e-06, "loss": 0.6166990280151368, "memory(GiB)": 77.52, "step": 1390, "token_acc": 0.7619183556951185, "train_speed(iter/s)": 0.321781 }, { "epoch": 2.54049135577798, "grad_norm": 1.0530841209630801, "learning_rate": 2.266393639545197e-06, "loss": 0.6244637966156006, "memory(GiB)": 77.52, "step": 1395, "token_acc": 0.7679372197309418, "train_speed(iter/s)": 0.307132 }, { "epoch": 2.5495905368516834, "grad_norm": 0.9878733985818398, "learning_rate": 2.1790036268985284e-06, "loss": 0.6239931106567382, "memory(GiB)": 77.52, "step": 1400, "token_acc": 0.7469059405940595, "train_speed(iter/s)": 0.293674 }, { "epoch": 2.5495905368516834, "eval_loss": 0.47406768798828125, "eval_runtime": 121.0349, "eval_samples_per_second": 47.689, "eval_steps_per_second": 0.463, "eval_token_acc": 0.6900029532775313, "step": 1400 }, { "epoch": 2.558689717925387, "grad_norm": 1.0090114818567588, "learning_rate": 2.0932346076480314e-06, "loss": 0.6187572956085206, "memory(GiB)": 77.52, "step": 1405, "token_acc": 0.7450779851700332, "train_speed(iter/s)": 0.272986 }, { "epoch": 2.56778889899909, "grad_norm": 0.9588816991739316, "learning_rate": 2.009094383322356e-06, "loss": 0.6277956485748291, "memory(GiB)": 77.52, "step": 1410, "token_acc": 0.7810402684563759, "train_speed(iter/s)": 0.262478 }, { "epoch": 2.5768880800727936, "grad_norm": 0.9909418694472445, "learning_rate": 1.9265906072953822e-06, "loss": 0.6175178050994873, "memory(GiB)": 77.52, "step": 1415, "token_acc": 0.7652439024390244, "train_speed(iter/s)": 0.252862 }, { "epoch": 2.5859872611464967, "grad_norm": 1.1182023779440498, "learning_rate": 1.8457307840900428e-06, "loss": 0.6154948711395264, "memory(GiB)": 77.52, "step": 1420, "token_acc": 0.7852161785216178, "train_speed(iter/s)": 0.244119 }, { "epoch": 2.5950864422202002, "grad_norm": 1.0404157493617592, "learning_rate": 1.7665222686957362e-06, "loss": 0.6219567775726318, "memory(GiB)": 77.52, "step": 1425, "token_acc": 0.7628019323671498, "train_speed(iter/s)": 0.235826 }, { "epoch": 2.604185623293904, "grad_norm": 1.0786639447035942, "learning_rate": 1.6889722658993223e-06, "loss": 0.6350451946258545, "memory(GiB)": 77.52, "step": 1430, "token_acc": 0.7704379562043796, "train_speed(iter/s)": 0.228331 }, { "epoch": 2.613284804367607, "grad_norm": 1.0095118897080797, "learning_rate": 1.6130878296297536e-06, "loss": 0.6284623622894288, "memory(GiB)": 77.52, "step": 1435, "token_acc": 0.7636180228648285, "train_speed(iter/s)": 0.221176 }, { "epoch": 2.62238398544131, "grad_norm": 0.94070647379727, "learning_rate": 1.5388758623164802e-06, "loss": 0.6281323432922363, "memory(GiB)": 77.52, "step": 1440, "token_acc": 0.7643463497453311, "train_speed(iter/s)": 0.214634 }, { "epoch": 2.6314831665150136, "grad_norm": 1.0651613672971816, "learning_rate": 1.4663431142615792e-06, "loss": 0.6090371608734131, "memory(GiB)": 77.52, "step": 1445, "token_acc": 0.8246628131021194, "train_speed(iter/s)": 0.208466 }, { "epoch": 2.640582347588717, "grad_norm": 1.0004848001888615, "learning_rate": 1.3954961830257685e-06, "loss": 0.624143123626709, "memory(GiB)": 77.52, "step": 1450, "token_acc": 0.7779850746268657, "train_speed(iter/s)": 0.202625 }, { "epoch": 2.640582347588717, "eval_loss": 0.47285741567611694, "eval_runtime": 117.6959, "eval_samples_per_second": 49.042, "eval_steps_per_second": 0.476, "eval_token_acc": 0.6904803323853178, "step": 1450 }, { "epoch": 2.6496815286624202, "grad_norm": 1.0494169722074018, "learning_rate": 1.3263415128282908e-06, "loss": 0.6255748271942139, "memory(GiB)": 77.52, "step": 1455, "token_acc": 0.732059542323928, "train_speed(iter/s)": 0.193115 }, { "epoch": 2.658780709736124, "grad_norm": 1.0337513224693766, "learning_rate": 1.2588853939607338e-06, "loss": 0.6212813377380371, "memory(GiB)": 77.52, "step": 1460, "token_acc": 0.7488385598141696, "train_speed(iter/s)": 0.188151 }, { "epoch": 2.667879890809827, "grad_norm": 0.9339498438090048, "learning_rate": 1.1931339622148897e-06, "loss": 0.6209768295288086, "memory(GiB)": 77.52, "step": 1465, "token_acc": 0.7604208822339134, "train_speed(iter/s)": 0.183569 }, { "epoch": 2.6769790718835305, "grad_norm": 1.009290828686064, "learning_rate": 1.1290931983246334e-06, "loss": 0.619508934020996, "memory(GiB)": 77.52, "step": 1470, "token_acc": 0.7703793381759484, "train_speed(iter/s)": 0.179159 }, { "epoch": 2.686078252957234, "grad_norm": 0.9092366819727269, "learning_rate": 1.0667689274219128e-06, "loss": 0.6159298419952393, "memory(GiB)": 77.52, "step": 1475, "token_acc": 0.7770177838577291, "train_speed(iter/s)": 0.175056 }, { "epoch": 2.695177434030937, "grad_norm": 0.9840242855378942, "learning_rate": 1.0061668185068996e-06, "loss": 0.6134575843811035, "memory(GiB)": 77.52, "step": 1480, "token_acc": 0.7733843537414966, "train_speed(iter/s)": 0.171104 }, { "epoch": 2.7042766151046407, "grad_norm": 1.0092116973578455, "learning_rate": 9.4729238393235e-07, "loss": 0.6143134593963623, "memory(GiB)": 77.52, "step": 1485, "token_acc": 0.7900072411296162, "train_speed(iter/s)": 0.167358 }, { "epoch": 2.713375796178344, "grad_norm": 1.0868815484832741, "learning_rate": 8.901509789021779e-07, "loss": 0.600148344039917, "memory(GiB)": 77.52, "step": 1490, "token_acc": 0.7679245283018868, "train_speed(iter/s)": 0.163825 }, { "epoch": 2.7224749772520473, "grad_norm": 1.0410901018430865, "learning_rate": 8.347478009843746e-07, "loss": 0.6201463222503663, "memory(GiB)": 77.52, "step": 1495, "token_acc": 0.738926899531869, "train_speed(iter/s)": 0.160424 }, { "epoch": 2.731574158325751, "grad_norm": 0.9884777012197261, "learning_rate": 7.810878896382101e-07, "loss": 0.6072117805480957, "memory(GiB)": 77.52, "step": 1500, "token_acc": 0.7709691438504997, "train_speed(iter/s)": 0.157229 }, { "epoch": 2.731574158325751, "eval_loss": 0.4724496603012085, "eval_runtime": 119.0959, "eval_samples_per_second": 48.465, "eval_steps_per_second": 0.47, "eval_token_acc": 0.6903711015725191, "step": 1500 }, { "epoch": 2.740673339399454, "grad_norm": 0.9523503494388392, "learning_rate": 7.291761257558749e-07, "loss": 0.6324088096618652, "memory(GiB)": 77.52, "step": 1505, "token_acc": 0.7417567924030599, "train_speed(iter/s)": 0.151702 }, { "epoch": 2.7497725204731576, "grad_norm": 0.9815948952007479, "learning_rate": 6.790172312184972e-07, "loss": 0.6338190078735352, "memory(GiB)": 77.52, "step": 1510, "token_acc": 0.7562122229684352, "train_speed(iter/s)": 0.148878 }, { "epoch": 2.7588717015468607, "grad_norm": 1.0475192921698937, "learning_rate": 6.306157684666425e-07, "loss": 0.6202810764312744, "memory(GiB)": 77.52, "step": 1515, "token_acc": 0.7550281576830249, "train_speed(iter/s)": 0.146148 }, { "epoch": 2.7679708826205642, "grad_norm": 1.0206535695296246, "learning_rate": 5.839761400853183e-07, "loss": 0.6317409992218017, "memory(GiB)": 77.52, "step": 1520, "token_acc": 0.7529880478087649, "train_speed(iter/s)": 0.143534 }, { "epoch": 2.777070063694268, "grad_norm": 0.9666971373448247, "learning_rate": 5.391025884035239e-07, "loss": 0.6138282775878906, "memory(GiB)": 77.52, "step": 1525, "token_acc": 0.767303609341826, "train_speed(iter/s)": 0.141033 }, { "epoch": 2.786169244767971, "grad_norm": 1.002591354360291, "learning_rate": 4.959991951083498e-07, "loss": 0.617135763168335, "memory(GiB)": 77.52, "step": 1530, "token_acc": 0.8161559888579387, "train_speed(iter/s)": 0.13864 }, { "epoch": 2.795268425841674, "grad_norm": 1.006202469505235, "learning_rate": 4.5466988087373044e-07, "loss": 0.6056863784790039, "memory(GiB)": 77.52, "step": 1535, "token_acc": 0.760498687664042, "train_speed(iter/s)": 0.136344 }, { "epoch": 2.8043676069153776, "grad_norm": 0.9830438526460707, "learning_rate": 4.151184050038004e-07, "loss": 0.6215356349945068, "memory(GiB)": 77.52, "step": 1540, "token_acc": 0.7701478302336672, "train_speed(iter/s)": 0.134118 }, { "epoch": 2.813466787989081, "grad_norm": 1.0560207375711046, "learning_rate": 3.7734836509096596e-07, "loss": 0.6116134643554687, "memory(GiB)": 77.52, "step": 1545, "token_acc": 0.7759115116755428, "train_speed(iter/s)": 0.132005 }, { "epoch": 2.8225659690627842, "grad_norm": 1.0225913174286714, "learning_rate": 3.4136319668866434e-07, "loss": 0.625472116470337, "memory(GiB)": 77.52, "step": 1550, "token_acc": 0.7980769230769231, "train_speed(iter/s)": 0.129952 }, { "epoch": 2.8225659690627842, "eval_loss": 0.4721684753894806, "eval_runtime": 118.0382, "eval_samples_per_second": 48.899, "eval_steps_per_second": 0.474, "eval_token_acc": 0.6906583381543229, "step": 1550 }, { "epoch": 2.831665150136488, "grad_norm": 1.0572860374958588, "learning_rate": 3.071661729988584e-07, "loss": 0.6085397720336914, "memory(GiB)": 77.52, "step": 1555, "token_acc": 0.7432788613600422, "train_speed(iter/s)": 0.126385 }, { "epoch": 2.840764331210191, "grad_norm": 0.977034680594488, "learning_rate": 2.747604045743102e-07, "loss": 0.6142263889312745, "memory(GiB)": 77.52, "step": 1560, "token_acc": 0.7400581959262852, "train_speed(iter/s)": 0.124515 }, { "epoch": 2.8498635122838945, "grad_norm": 0.9902282597829868, "learning_rate": 2.4414883903565834e-07, "loss": 0.6152991771697998, "memory(GiB)": 77.52, "step": 1565, "token_acc": 0.8156277436347673, "train_speed(iter/s)": 0.122739 }, { "epoch": 2.858962693357598, "grad_norm": 1.112609715887069, "learning_rate": 2.15334260803286e-07, "loss": 0.6211013793945312, "memory(GiB)": 77.52, "step": 1570, "token_acc": 0.7968069666182874, "train_speed(iter/s)": 0.121013 }, { "epoch": 2.868061874431301, "grad_norm": 1.007653504358626, "learning_rate": 1.8831929084406119e-07, "loss": 0.6160074234008789, "memory(GiB)": 77.52, "step": 1575, "token_acc": 0.7956026058631922, "train_speed(iter/s)": 0.119343 }, { "epoch": 2.8771610555050047, "grad_norm": 1.0328729828726175, "learning_rate": 1.631063864329274e-07, "loss": 0.6106714725494384, "memory(GiB)": 77.52, "step": 1580, "token_acc": 0.8102600140548137, "train_speed(iter/s)": 0.11774 }, { "epoch": 2.886260236578708, "grad_norm": 0.9727986501836436, "learning_rate": 1.3969784092939588e-07, "loss": 0.6038858890533447, "memory(GiB)": 77.52, "step": 1585, "token_acc": 0.7294275491949911, "train_speed(iter/s)": 0.116161 }, { "epoch": 2.8953594176524113, "grad_norm": 1.0580993770834335, "learning_rate": 1.180957835689478e-07, "loss": 0.6102193832397461, "memory(GiB)": 77.52, "step": 1590, "token_acc": 0.7574578469520103, "train_speed(iter/s)": 0.114662 }, { "epoch": 2.904458598726115, "grad_norm": 0.9841890221890635, "learning_rate": 9.83021792693406e-08, "loss": 0.6162684917449951, "memory(GiB)": 77.52, "step": 1595, "token_acc": 0.7871674491392802, "train_speed(iter/s)": 0.113191 }, { "epoch": 2.913557779799818, "grad_norm": 1.0209356603903383, "learning_rate": 8.031882845189743e-08, "loss": 0.6077028751373291, "memory(GiB)": 77.52, "step": 1600, "token_acc": 0.7544715447154472, "train_speed(iter/s)": 0.111782 }, { "epoch": 2.913557779799818, "eval_loss": 0.4720407724380493, "eval_runtime": 116.4832, "eval_samples_per_second": 49.552, "eval_steps_per_second": 0.481, "eval_token_acc": 0.6904317853574072, "step": 1600 } ], "logging_steps": 5, "max_steps": 1647, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5861721952354304e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }