Spaces:
Runtime error
Runtime error
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.8837229214830038, | |
| "eval_steps": 500, | |
| "global_step": 5500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0, | |
| "grad_norm": 0.5656424760818481, | |
| "learning_rate": 0.00019972583961617548, | |
| "loss": 2.6115, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.9568225145339966, | |
| "learning_rate": 0.0001993831391363948, | |
| "loss": 2.3674, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.8508038520812988, | |
| "learning_rate": 0.00019904043865661412, | |
| "loss": 2.1487, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.0186710357666016, | |
| "learning_rate": 0.00019869773817683345, | |
| "loss": 2.1421, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.7234398126602173, | |
| "learning_rate": 0.0001983550376970528, | |
| "loss": 2.0021, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.197520136833191, | |
| "learning_rate": 0.0001980123372172721, | |
| "loss": 2.0381, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.0739991664886475, | |
| "learning_rate": 0.00019766963673749144, | |
| "loss": 1.8883, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.0150132179260254, | |
| "learning_rate": 0.00019732693625771076, | |
| "loss": 1.8494, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.236234426498413, | |
| "learning_rate": 0.0001969842357779301, | |
| "loss": 1.9171, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.0886958837509155, | |
| "learning_rate": 0.00019664153529814942, | |
| "loss": 1.9387, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.1191097497940063, | |
| "learning_rate": 0.00019629883481836875, | |
| "loss": 1.976, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.0738675594329834, | |
| "learning_rate": 0.00019595613433858808, | |
| "loss": 1.8378, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.648668646812439, | |
| "learning_rate": 0.0001956134338588074, | |
| "loss": 1.768, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.9386289119720459, | |
| "learning_rate": 0.00019527073337902674, | |
| "loss": 2.0067, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.1613832712173462, | |
| "learning_rate": 0.00019492803289924607, | |
| "loss": 1.9355, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.7319044470787048, | |
| "learning_rate": 0.0001945853324194654, | |
| "loss": 1.9042, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.9041644930839539, | |
| "learning_rate": 0.00019424263193968473, | |
| "loss": 1.7163, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.9293299317359924, | |
| "learning_rate": 0.00019389993145990406, | |
| "loss": 1.807, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.9214122295379639, | |
| "learning_rate": 0.00019355723098012336, | |
| "loss": 1.9056, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.7177646160125732, | |
| "learning_rate": 0.0001932145305003427, | |
| "loss": 1.9574, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.813965916633606, | |
| "learning_rate": 0.00019287183002056205, | |
| "loss": 1.7995, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.0333760976791382, | |
| "learning_rate": 0.00019252912954078138, | |
| "loss": 1.814, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.6691217422485352, | |
| "learning_rate": 0.0001921864290610007, | |
| "loss": 1.8261, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.1737751960754395, | |
| "learning_rate": 0.00019184372858122, | |
| "loss": 1.9473, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.1508344411849976, | |
| "learning_rate": 0.00019150102810143934, | |
| "loss": 1.9176, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.6660133600234985, | |
| "learning_rate": 0.00019115832762165867, | |
| "loss": 1.835, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.6423531174659729, | |
| "learning_rate": 0.00019081562714187803, | |
| "loss": 1.7194, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.8241636157035828, | |
| "learning_rate": 0.00019047292666209733, | |
| "loss": 1.8679, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.7184795141220093, | |
| "learning_rate": 0.00019013022618231666, | |
| "loss": 1.8129, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.8253782391548157, | |
| "learning_rate": 0.000189787525702536, | |
| "loss": 1.8567, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.417243242263794, | |
| "learning_rate": 0.00018944482522275532, | |
| "loss": 1.7741, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.9040454626083374, | |
| "learning_rate": 0.00018910212474297465, | |
| "loss": 1.8458, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.6580069065093994, | |
| "learning_rate": 0.00018875942426319398, | |
| "loss": 1.7982, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.8849833011627197, | |
| "learning_rate": 0.0001884167237834133, | |
| "loss": 1.8622, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.0523239374160767, | |
| "learning_rate": 0.00018807402330363264, | |
| "loss": 1.8608, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.0496423244476318, | |
| "learning_rate": 0.00018773132282385194, | |
| "loss": 1.8245, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.9488272070884705, | |
| "learning_rate": 0.00018738862234407127, | |
| "loss": 1.8933, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.9461072087287903, | |
| "learning_rate": 0.00018704592186429063, | |
| "loss": 1.7277, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.6415026187896729, | |
| "learning_rate": 0.00018670322138450996, | |
| "loss": 1.7843, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.0457078218460083, | |
| "learning_rate": 0.0001863605209047293, | |
| "loss": 1.8874, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.0890721082687378, | |
| "learning_rate": 0.0001860178204249486, | |
| "loss": 1.8536, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.8896569013595581, | |
| "learning_rate": 0.00018567511994516792, | |
| "loss": 1.8297, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.9457584023475647, | |
| "learning_rate": 0.00018533241946538728, | |
| "loss": 1.8061, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.8208130598068237, | |
| "learning_rate": 0.0001849897189856066, | |
| "loss": 1.8238, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.7884149551391602, | |
| "learning_rate": 0.0001846470185058259, | |
| "loss": 1.7419, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.5733205080032349, | |
| "learning_rate": 0.00018430431802604524, | |
| "loss": 1.8829, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.963455319404602, | |
| "learning_rate": 0.00018396161754626457, | |
| "loss": 1.822, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.616909384727478, | |
| "learning_rate": 0.0001836189170664839, | |
| "loss": 1.7923, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.5382218360900879, | |
| "learning_rate": 0.00018327621658670323, | |
| "loss": 1.719, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.171004056930542, | |
| "learning_rate": 0.00018293351610692256, | |
| "loss": 1.8522, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "eval_loss": 1.9394277334213257, | |
| "eval_runtime": 33.4276, | |
| "eval_samples_per_second": 29.915, | |
| "eval_steps_per_second": 3.739, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.7731293439865112, | |
| "learning_rate": 0.0001825908156271419, | |
| "loss": 1.9151, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.8664043545722961, | |
| "learning_rate": 0.00018224811514736122, | |
| "loss": 1.6679, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.3886076211929321, | |
| "learning_rate": 0.00018190541466758055, | |
| "loss": 1.8509, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.7000617384910583, | |
| "learning_rate": 0.00018156271418779988, | |
| "loss": 1.8046, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.8490706086158752, | |
| "learning_rate": 0.0001812200137080192, | |
| "loss": 1.748, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.4293190240859985, | |
| "learning_rate": 0.00018087731322823854, | |
| "loss": 1.9725, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.7126957178115845, | |
| "learning_rate": 0.00018053461274845787, | |
| "loss": 1.6888, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.9974524974822998, | |
| "learning_rate": 0.00018019191226867717, | |
| "loss": 1.8405, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.9911081790924072, | |
| "learning_rate": 0.0001798492117888965, | |
| "loss": 1.7753, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 1.3659840822219849, | |
| "learning_rate": 0.00017950651130911585, | |
| "loss": 1.7435, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.4976978302001953, | |
| "learning_rate": 0.00017916381082933518, | |
| "loss": 1.759, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.7868736982345581, | |
| "learning_rate": 0.0001788211103495545, | |
| "loss": 1.7654, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 1.006628155708313, | |
| "learning_rate": 0.00017847840986977382, | |
| "loss": 1.7862, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.8664697408676147, | |
| "learning_rate": 0.00017813570938999315, | |
| "loss": 1.8815, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.44789645075798035, | |
| "learning_rate": 0.00017779300891021248, | |
| "loss": 1.779, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.9740760326385498, | |
| "learning_rate": 0.00017745030843043183, | |
| "loss": 1.7026, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.9802984595298767, | |
| "learning_rate": 0.00017710760795065114, | |
| "loss": 1.8359, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.0521053075790405, | |
| "learning_rate": 0.00017676490747087047, | |
| "loss": 1.7777, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.6399825215339661, | |
| "learning_rate": 0.0001764222069910898, | |
| "loss": 1.8129, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.1847810745239258, | |
| "learning_rate": 0.00017607950651130912, | |
| "loss": 1.8775, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.7050787806510925, | |
| "learning_rate": 0.00017573680603152845, | |
| "loss": 1.8454, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.8241177797317505, | |
| "learning_rate": 0.00017539410555174778, | |
| "loss": 1.7047, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.743680477142334, | |
| "learning_rate": 0.00017505140507196711, | |
| "loss": 1.8251, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.776196300983429, | |
| "learning_rate": 0.00017470870459218644, | |
| "loss": 1.8341, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.6896054744720459, | |
| "learning_rate": 0.00017436600411240575, | |
| "loss": 1.7569, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.703697919845581, | |
| "learning_rate": 0.0001740233036326251, | |
| "loss": 1.7696, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.6734452247619629, | |
| "learning_rate": 0.00017368060315284443, | |
| "loss": 1.6639, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.6856238842010498, | |
| "learning_rate": 0.00017333790267306376, | |
| "loss": 1.8419, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.1194758415222168, | |
| "learning_rate": 0.00017299520219328306, | |
| "loss": 1.7916, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.455841064453125, | |
| "learning_rate": 0.0001726525017135024, | |
| "loss": 1.7368, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.5988683700561523, | |
| "learning_rate": 0.00017230980123372172, | |
| "loss": 1.8434, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.9031710028648376, | |
| "learning_rate": 0.00017196710075394108, | |
| "loss": 1.7447, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.2125264406204224, | |
| "learning_rate": 0.0001716244002741604, | |
| "loss": 1.9449, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.9563066959381104, | |
| "learning_rate": 0.0001712816997943797, | |
| "loss": 1.7063, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.8778769969940186, | |
| "learning_rate": 0.00017093899931459904, | |
| "loss": 1.802, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.0570799112319946, | |
| "learning_rate": 0.00017059629883481837, | |
| "loss": 1.7331, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.8234407305717468, | |
| "learning_rate": 0.0001702535983550377, | |
| "loss": 1.7943, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.968658983707428, | |
| "learning_rate": 0.00016991089787525703, | |
| "loss": 1.8527, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.6607180237770081, | |
| "learning_rate": 0.00016956819739547636, | |
| "loss": 1.8521, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.8055354952812195, | |
| "learning_rate": 0.0001692254969156957, | |
| "loss": 1.6901, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.8606925010681152, | |
| "learning_rate": 0.00016888279643591502, | |
| "loss": 1.7248, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.9894892573356628, | |
| "learning_rate": 0.00016854009595613432, | |
| "loss": 1.7541, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.8559629321098328, | |
| "learning_rate": 0.00016819739547635368, | |
| "loss": 1.7803, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.8917673826217651, | |
| "learning_rate": 0.000167854694996573, | |
| "loss": 1.8224, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.2621186971664429, | |
| "learning_rate": 0.00016751199451679234, | |
| "loss": 1.8253, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.1135177612304688, | |
| "learning_rate": 0.00016716929403701167, | |
| "loss": 1.6519, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.7034028172492981, | |
| "learning_rate": 0.00016682659355723097, | |
| "loss": 1.7079, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.7942814826965332, | |
| "learning_rate": 0.0001664838930774503, | |
| "loss": 1.828, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.9687950611114502, | |
| "learning_rate": 0.00016614119259766966, | |
| "loss": 1.7203, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.1074302196502686, | |
| "learning_rate": 0.000165798492117889, | |
| "loss": 1.7146, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_loss": 1.9078810214996338, | |
| "eval_runtime": 33.2486, | |
| "eval_samples_per_second": 30.076, | |
| "eval_steps_per_second": 3.76, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.9533829689025879, | |
| "learning_rate": 0.0001654557916381083, | |
| "loss": 1.7596, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.0547090768814087, | |
| "learning_rate": 0.00016511309115832762, | |
| "loss": 1.9113, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 1.0186220407485962, | |
| "learning_rate": 0.00016477039067854695, | |
| "loss": 1.7845, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.9044001698493958, | |
| "learning_rate": 0.00016442769019876628, | |
| "loss": 1.8174, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.6433171033859253, | |
| "learning_rate": 0.0001640849897189856, | |
| "loss": 1.7702, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 1.2511520385742188, | |
| "learning_rate": 0.00016374228923920494, | |
| "loss": 1.9304, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7901211977005005, | |
| "learning_rate": 0.00016339958875942427, | |
| "loss": 1.8432, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.515535831451416, | |
| "learning_rate": 0.0001630568882796436, | |
| "loss": 1.8818, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.9449120759963989, | |
| "learning_rate": 0.00016271418779986293, | |
| "loss": 1.8594, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7776308059692383, | |
| "learning_rate": 0.00016237148732008226, | |
| "loss": 1.8896, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 1.3541969060897827, | |
| "learning_rate": 0.0001620287868403016, | |
| "loss": 1.8208, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.7614444494247437, | |
| "learning_rate": 0.00016168608636052092, | |
| "loss": 1.759, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.170345425605774, | |
| "learning_rate": 0.00016134338588074025, | |
| "loss": 1.6713, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.8094021081924438, | |
| "learning_rate": 0.00016100068540095955, | |
| "loss": 1.7394, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.169124722480774, | |
| "learning_rate": 0.0001606579849211789, | |
| "loss": 1.7609, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.6766496300697327, | |
| "learning_rate": 0.00016031528444139824, | |
| "loss": 1.7812, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.0808138847351074, | |
| "learning_rate": 0.00015997258396161757, | |
| "loss": 1.7777, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.6450923681259155, | |
| "learning_rate": 0.00015962988348183687, | |
| "loss": 1.8539, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.0518946647644043, | |
| "learning_rate": 0.0001592871830020562, | |
| "loss": 1.7799, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.7807414531707764, | |
| "learning_rate": 0.00015894448252227553, | |
| "loss": 1.774, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.4259986877441406, | |
| "learning_rate": 0.00015860178204249488, | |
| "loss": 1.8153, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.9342586994171143, | |
| "learning_rate": 0.0001582590815627142, | |
| "loss": 1.7495, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.7621099948883057, | |
| "learning_rate": 0.00015791638108293352, | |
| "loss": 1.7964, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.8253260254859924, | |
| "learning_rate": 0.00015757368060315285, | |
| "loss": 1.7669, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.6914420127868652, | |
| "learning_rate": 0.00015723098012337218, | |
| "loss": 1.803, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.7147281765937805, | |
| "learning_rate": 0.0001568882796435915, | |
| "loss": 1.8226, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 2.0851213932037354, | |
| "learning_rate": 0.00015654557916381084, | |
| "loss": 1.6957, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.6254770159721375, | |
| "learning_rate": 0.00015620287868403017, | |
| "loss": 1.75, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 1.0984652042388916, | |
| "learning_rate": 0.0001558601782042495, | |
| "loss": 1.8425, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.0353467464447021, | |
| "learning_rate": 0.00015551747772446882, | |
| "loss": 1.7995, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.6647160053253174, | |
| "learning_rate": 0.00015517477724468813, | |
| "loss": 1.866, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.6671775579452515, | |
| "learning_rate": 0.00015483207676490748, | |
| "loss": 1.6871, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.0024131536483765, | |
| "learning_rate": 0.00015448937628512681, | |
| "loss": 1.7424, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.0090551376342773, | |
| "learning_rate": 0.00015414667580534614, | |
| "loss": 1.7001, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.9725455045700073, | |
| "learning_rate": 0.00015380397532556545, | |
| "loss": 1.7114, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.6556392312049866, | |
| "learning_rate": 0.00015346127484578478, | |
| "loss": 1.5969, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.156596302986145, | |
| "learning_rate": 0.00015311857436600413, | |
| "loss": 1.7334, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.9172496199607849, | |
| "learning_rate": 0.00015277587388622346, | |
| "loss": 1.7373, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9010474681854248, | |
| "learning_rate": 0.0001524331734064428, | |
| "loss": 1.8032, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.9486579298973083, | |
| "learning_rate": 0.0001520904729266621, | |
| "loss": 1.6388, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.8411978483200073, | |
| "learning_rate": 0.00015174777244688142, | |
| "loss": 1.7671, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.9575003385543823, | |
| "learning_rate": 0.00015140507196710075, | |
| "loss": 1.6523, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.7651090025901794, | |
| "learning_rate": 0.0001510623714873201, | |
| "loss": 1.812, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.8477165699005127, | |
| "learning_rate": 0.0001507196710075394, | |
| "loss": 1.7125, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.9737070202827454, | |
| "learning_rate": 0.00015037697052775874, | |
| "loss": 1.7506, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.0645496845245361, | |
| "learning_rate": 0.00015003427004797807, | |
| "loss": 1.7335, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.9303259253501892, | |
| "learning_rate": 0.0001496915695681974, | |
| "loss": 1.8838, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.6571500897407532, | |
| "learning_rate": 0.00014934886908841673, | |
| "loss": 1.8093, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.7994106411933899, | |
| "learning_rate": 0.00014900616860863606, | |
| "loss": 1.6691, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.8453437685966492, | |
| "learning_rate": 0.0001486634681288554, | |
| "loss": 1.6731, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "eval_loss": 1.8940061330795288, | |
| "eval_runtime": 33.2126, | |
| "eval_samples_per_second": 30.109, | |
| "eval_steps_per_second": 3.764, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0370814800262451, | |
| "learning_rate": 0.00014832076764907472, | |
| "loss": 1.7869, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.0886887311935425, | |
| "learning_rate": 0.00014797806716929405, | |
| "loss": 1.7887, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.9058669209480286, | |
| "learning_rate": 0.00014763536668951335, | |
| "loss": 1.6781, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.46401920914649963, | |
| "learning_rate": 0.0001472926662097327, | |
| "loss": 1.6465, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.6265978813171387, | |
| "learning_rate": 0.00014694996572995204, | |
| "loss": 1.8399, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.7882290482521057, | |
| "learning_rate": 0.00014660726525017137, | |
| "loss": 1.7707, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7576068043708801, | |
| "learning_rate": 0.00014626456477039067, | |
| "loss": 1.8781, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.8988894820213318, | |
| "learning_rate": 0.00014592186429061, | |
| "loss": 1.7109, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.7934654951095581, | |
| "learning_rate": 0.00014557916381082933, | |
| "loss": 1.8261, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.9526162147521973, | |
| "learning_rate": 0.0001452364633310487, | |
| "loss": 1.7286, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.8650903701782227, | |
| "learning_rate": 0.000144893762851268, | |
| "loss": 1.8075, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.8737215399742126, | |
| "learning_rate": 0.00014455106237148732, | |
| "loss": 1.7683, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.0927869081497192, | |
| "learning_rate": 0.00014420836189170665, | |
| "loss": 1.8238, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.7490981817245483, | |
| "learning_rate": 0.00014386566141192598, | |
| "loss": 1.7528, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.6721557974815369, | |
| "learning_rate": 0.0001435229609321453, | |
| "loss": 1.7212, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.8125373125076294, | |
| "learning_rate": 0.00014318026045236464, | |
| "loss": 1.8369, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.598507821559906, | |
| "learning_rate": 0.00014283755997258397, | |
| "loss": 1.8455, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.2567535638809204, | |
| "learning_rate": 0.0001424948594928033, | |
| "loss": 1.7656, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.5279853343963623, | |
| "learning_rate": 0.00014215215901302263, | |
| "loss": 1.8297, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.1410638093948364, | |
| "learning_rate": 0.00014180945853324196, | |
| "loss": 1.7489, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.9007987976074219, | |
| "learning_rate": 0.0001414667580534613, | |
| "loss": 1.7473, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.5736974477767944, | |
| "learning_rate": 0.00014112405757368062, | |
| "loss": 1.8022, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.6310347318649292, | |
| "learning_rate": 0.00014078135709389995, | |
| "loss": 1.7676, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.9788106679916382, | |
| "learning_rate": 0.00014043865661411925, | |
| "loss": 1.7303, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6612042784690857, | |
| "learning_rate": 0.00014009595613433858, | |
| "loss": 1.675, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.8740193247795105, | |
| "learning_rate": 0.00013975325565455794, | |
| "loss": 1.7945, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.9548364877700806, | |
| "learning_rate": 0.00013941055517477727, | |
| "loss": 1.7485, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6676565408706665, | |
| "learning_rate": 0.00013906785469499657, | |
| "loss": 1.7479, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.6287640333175659, | |
| "learning_rate": 0.0001387251542152159, | |
| "loss": 1.7007, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.5443295240402222, | |
| "learning_rate": 0.00013838245373543523, | |
| "loss": 1.8916, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.9970656037330627, | |
| "learning_rate": 0.00013803975325565456, | |
| "loss": 1.6733, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.9320075511932373, | |
| "learning_rate": 0.00013769705277587391, | |
| "loss": 1.8622, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.8384440541267395, | |
| "learning_rate": 0.00013735435229609322, | |
| "loss": 1.6825, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 1.1807342767715454, | |
| "learning_rate": 0.00013701165181631255, | |
| "loss": 1.6548, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.7640541195869446, | |
| "learning_rate": 0.00013666895133653188, | |
| "loss": 1.8134, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.9137887358665466, | |
| "learning_rate": 0.0001363262508567512, | |
| "loss": 1.7685, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.8986667394638062, | |
| "learning_rate": 0.00013598355037697054, | |
| "loss": 1.7455, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.96836918592453, | |
| "learning_rate": 0.00013564084989718987, | |
| "loss": 1.8705, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.381028175354004, | |
| "learning_rate": 0.0001352981494174092, | |
| "loss": 1.7644, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.617438018321991, | |
| "learning_rate": 0.00013495544893762853, | |
| "loss": 1.6194, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.8686628937721252, | |
| "learning_rate": 0.00013461274845784783, | |
| "loss": 1.7171, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.7735409140586853, | |
| "learning_rate": 0.00013427004797806716, | |
| "loss": 1.725, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 1.0692516565322876, | |
| "learning_rate": 0.00013392734749828651, | |
| "loss": 1.762, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.763136625289917, | |
| "learning_rate": 0.00013358464701850584, | |
| "loss": 1.6546, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.9908429980278015, | |
| "learning_rate": 0.00013324194653872517, | |
| "loss": 1.6499, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.9493003487586975, | |
| "learning_rate": 0.00013289924605894448, | |
| "loss": 1.5616, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.8336248993873596, | |
| "learning_rate": 0.0001325565455791638, | |
| "loss": 1.7914, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.8938840627670288, | |
| "learning_rate": 0.00013221384509938314, | |
| "loss": 1.7274, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.0243479013442993, | |
| "learning_rate": 0.0001318711446196025, | |
| "loss": 1.6643, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 1.0226181745529175, | |
| "learning_rate": 0.0001315284441398218, | |
| "loss": 1.7626, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_loss": 1.8913378715515137, | |
| "eval_runtime": 33.1473, | |
| "eval_samples_per_second": 30.168, | |
| "eval_steps_per_second": 3.771, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.1059471368789673, | |
| "learning_rate": 0.00013118574366004112, | |
| "loss": 1.6362, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.3754314184188843, | |
| "learning_rate": 0.00013084304318026045, | |
| "loss": 1.8308, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.3899627923965454, | |
| "learning_rate": 0.00013050034270047978, | |
| "loss": 1.6982, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.8804599046707153, | |
| "learning_rate": 0.00013015764222069911, | |
| "loss": 1.8138, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.6578095555305481, | |
| "learning_rate": 0.00012981494174091844, | |
| "loss": 1.7211, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.5725558996200562, | |
| "learning_rate": 0.00012947224126113777, | |
| "loss": 1.8684, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.097717523574829, | |
| "learning_rate": 0.0001291295407813571, | |
| "loss": 1.7705, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.7564202547073364, | |
| "learning_rate": 0.00012878684030157643, | |
| "loss": 1.5935, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.732243537902832, | |
| "learning_rate": 0.00012844413982179576, | |
| "loss": 1.7694, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.6464608907699585, | |
| "learning_rate": 0.0001281014393420151, | |
| "loss": 1.8418, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.7090341448783875, | |
| "learning_rate": 0.00012775873886223442, | |
| "loss": 1.8122, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 1.1480237245559692, | |
| "learning_rate": 0.00012741603838245375, | |
| "loss": 1.7766, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.6737000346183777, | |
| "learning_rate": 0.00012707333790267305, | |
| "loss": 1.7876, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.7794924378395081, | |
| "learning_rate": 0.00012673063742289238, | |
| "loss": 1.8529, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.3136320114135742, | |
| "learning_rate": 0.00012638793694311174, | |
| "loss": 1.6699, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.884027361869812, | |
| "learning_rate": 0.00012604523646333107, | |
| "loss": 1.7689, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.103605031967163, | |
| "learning_rate": 0.00012570253598355037, | |
| "loss": 1.8594, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.3322539329528809, | |
| "learning_rate": 0.0001253598355037697, | |
| "loss": 1.6765, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.7840645909309387, | |
| "learning_rate": 0.00012501713502398903, | |
| "loss": 1.65, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.9259356260299683, | |
| "learning_rate": 0.00012467443454420836, | |
| "loss": 1.7805, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.3709288835525513, | |
| "learning_rate": 0.0001243317340644277, | |
| "loss": 1.7086, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.6325123310089111, | |
| "learning_rate": 0.00012398903358464702, | |
| "loss": 1.7124, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.854541003704071, | |
| "learning_rate": 0.00012364633310486635, | |
| "loss": 1.7089, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.8861531019210815, | |
| "learning_rate": 0.00012330363262508568, | |
| "loss": 1.8369, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.269750714302063, | |
| "learning_rate": 0.000122960932145305, | |
| "loss": 1.7598, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.999598503112793, | |
| "learning_rate": 0.00012261823166552434, | |
| "loss": 1.8376, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7654330134391785, | |
| "learning_rate": 0.00012227553118574367, | |
| "loss": 1.7236, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 1.11728835105896, | |
| "learning_rate": 0.000121932830705963, | |
| "loss": 1.7375, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7219797968864441, | |
| "learning_rate": 0.00012159013022618233, | |
| "loss": 1.7786, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0127757787704468, | |
| "learning_rate": 0.00012124742974640165, | |
| "loss": 1.7003, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.0450137853622437, | |
| "learning_rate": 0.00012090472926662097, | |
| "loss": 1.7425, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.9303760528564453, | |
| "learning_rate": 0.0001205620287868403, | |
| "loss": 1.632, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.7303478717803955, | |
| "learning_rate": 0.00012021932830705965, | |
| "loss": 1.6918, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.6323578953742981, | |
| "learning_rate": 0.00011987662782727895, | |
| "loss": 1.672, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.715811014175415, | |
| "learning_rate": 0.00011953392734749828, | |
| "loss": 1.7613, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.7297527194023132, | |
| "learning_rate": 0.00011919122686771762, | |
| "loss": 1.7277, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.0844471454620361, | |
| "learning_rate": 0.00011884852638793695, | |
| "loss": 1.8143, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.9260643720626831, | |
| "learning_rate": 0.00011850582590815628, | |
| "loss": 1.7228, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.9541537761688232, | |
| "learning_rate": 0.0001181631254283756, | |
| "loss": 1.7143, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.0506033897399902, | |
| "learning_rate": 0.00011782042494859493, | |
| "loss": 1.7659, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.7201717495918274, | |
| "learning_rate": 0.00011747772446881427, | |
| "loss": 1.7257, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.8612362742424011, | |
| "learning_rate": 0.0001171350239890336, | |
| "loss": 1.7009, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.8745547533035278, | |
| "learning_rate": 0.0001167923235092529, | |
| "loss": 1.733, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.5927043557167053, | |
| "learning_rate": 0.00011644962302947225, | |
| "loss": 1.7724, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.6471837162971497, | |
| "learning_rate": 0.00011610692254969158, | |
| "loss": 1.7103, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 1.1340347528457642, | |
| "learning_rate": 0.0001157642220699109, | |
| "loss": 1.7053, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.8819349408149719, | |
| "learning_rate": 0.00011542152159013022, | |
| "loss": 1.7552, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.6587919592857361, | |
| "learning_rate": 0.00011507882111034955, | |
| "loss": 1.6482, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.0057884454727173, | |
| "learning_rate": 0.00011473612063056888, | |
| "loss": 1.7711, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.6465263962745667, | |
| "learning_rate": 0.00011439342015078823, | |
| "loss": 1.7565, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "eval_loss": 1.8792312145233154, | |
| "eval_runtime": 33.1087, | |
| "eval_samples_per_second": 30.204, | |
| "eval_steps_per_second": 3.775, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.5970360040664673, | |
| "learning_rate": 0.00011405071967100756, | |
| "loss": 1.7179, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 1.3015583753585815, | |
| "learning_rate": 0.00011370801919122687, | |
| "loss": 1.7225, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.9235218167304993, | |
| "learning_rate": 0.0001133653187114462, | |
| "loss": 1.7657, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.025038480758667, | |
| "learning_rate": 0.00011302261823166553, | |
| "loss": 1.7755, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.8988834619522095, | |
| "learning_rate": 0.00011267991775188486, | |
| "loss": 1.8187, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.7810622453689575, | |
| "learning_rate": 0.00011233721727210418, | |
| "loss": 1.6565, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 1.6817054748535156, | |
| "learning_rate": 0.0001119945167923235, | |
| "loss": 1.7764, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.9688411355018616, | |
| "learning_rate": 0.00011165181631254285, | |
| "loss": 1.6599, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.742932915687561, | |
| "learning_rate": 0.00011130911583276218, | |
| "loss": 1.7552, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.5261206030845642, | |
| "learning_rate": 0.0001109664153529815, | |
| "loss": 1.6432, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.8997339606285095, | |
| "learning_rate": 0.00011062371487320082, | |
| "loss": 1.8438, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.8077126741409302, | |
| "learning_rate": 0.00011028101439342015, | |
| "loss": 1.8144, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.9872453212738037, | |
| "learning_rate": 0.00010993831391363948, | |
| "loss": 1.7427, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.1201390027999878, | |
| "learning_rate": 0.00010959561343385883, | |
| "loss": 1.7696, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 1.1584488153457642, | |
| "learning_rate": 0.00010925291295407813, | |
| "loss": 1.6236, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.8254250884056091, | |
| "learning_rate": 0.00010891021247429747, | |
| "loss": 1.6214, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.9825947284698486, | |
| "learning_rate": 0.0001085675119945168, | |
| "loss": 1.7889, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.0265246629714966, | |
| "learning_rate": 0.00010822481151473613, | |
| "loss": 1.7283, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.891777515411377, | |
| "learning_rate": 0.00010788211103495545, | |
| "loss": 1.8176, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.8920706510543823, | |
| "learning_rate": 0.00010753941055517478, | |
| "loss": 1.7676, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.072204828262329, | |
| "learning_rate": 0.00010719671007539411, | |
| "loss": 1.5836, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.9175311923027039, | |
| "learning_rate": 0.00010685400959561345, | |
| "loss": 1.8073, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.6199253797531128, | |
| "learning_rate": 0.00010651130911583275, | |
| "loss": 1.828, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.653229296207428, | |
| "learning_rate": 0.0001061686086360521, | |
| "loss": 1.7308, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.790413498878479, | |
| "learning_rate": 0.00010582590815627143, | |
| "loss": 1.8169, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.8657679557800293, | |
| "learning_rate": 0.00010548320767649076, | |
| "loss": 1.7453, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.6758552193641663, | |
| "learning_rate": 0.00010514050719671007, | |
| "loss": 1.7171, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 1.0935484170913696, | |
| "learning_rate": 0.0001047978067169294, | |
| "loss": 1.6754, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8095535635948181, | |
| "learning_rate": 0.00010445510623714873, | |
| "loss": 1.8387, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.8804395198822021, | |
| "learning_rate": 0.00010411240575736808, | |
| "loss": 1.7839, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.945090115070343, | |
| "learning_rate": 0.0001037697052775874, | |
| "loss": 1.7196, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.6158414483070374, | |
| "learning_rate": 0.00010342700479780672, | |
| "loss": 1.8011, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.7917384505271912, | |
| "learning_rate": 0.00010308430431802605, | |
| "loss": 1.744, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.6415919065475464, | |
| "learning_rate": 0.00010274160383824538, | |
| "loss": 1.6379, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.6077090501785278, | |
| "learning_rate": 0.00010239890335846471, | |
| "loss": 1.657, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.036901593208313, | |
| "learning_rate": 0.00010205620287868403, | |
| "loss": 1.7059, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.7633301019668579, | |
| "learning_rate": 0.00010171350239890336, | |
| "loss": 1.8085, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 1.04219651222229, | |
| "learning_rate": 0.0001013708019191227, | |
| "loss": 1.6641, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.9899976849555969, | |
| "learning_rate": 0.00010102810143934203, | |
| "loss": 1.6819, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.755636990070343, | |
| "learning_rate": 0.00010068540095956133, | |
| "loss": 1.7573, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.1326630115509033, | |
| "learning_rate": 0.00010034270047978068, | |
| "loss": 1.6942, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7579949498176575, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6903, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.7203909754753113, | |
| "learning_rate": 9.965729952021933e-05, | |
| "loss": 1.5946, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.8731165528297424, | |
| "learning_rate": 9.931459904043866e-05, | |
| "loss": 1.6224, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.6287246942520142, | |
| "learning_rate": 9.8971898560658e-05, | |
| "loss": 1.6279, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 0.8794381618499756, | |
| "learning_rate": 9.862919808087731e-05, | |
| "loss": 1.7103, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.1305402517318726, | |
| "learning_rate": 9.828649760109665e-05, | |
| "loss": 1.5876, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.349693775177002, | |
| "learning_rate": 9.794379712131597e-05, | |
| "loss": 1.549, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.1124284267425537, | |
| "learning_rate": 9.76010966415353e-05, | |
| "loss": 1.5688, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.5864982604980469, | |
| "learning_rate": 9.725839616175463e-05, | |
| "loss": 1.6437, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "eval_loss": 1.8886157274246216, | |
| "eval_runtime": 33.1481, | |
| "eval_samples_per_second": 30.168, | |
| "eval_steps_per_second": 3.771, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.8807237148284912, | |
| "learning_rate": 9.691569568197396e-05, | |
| "loss": 1.5888, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 0.8454139232635498, | |
| "learning_rate": 9.657299520219329e-05, | |
| "loss": 1.5414, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 0.9541159272193909, | |
| "learning_rate": 9.623029472241262e-05, | |
| "loss": 1.7525, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.38509202003479, | |
| "learning_rate": 9.588759424263193e-05, | |
| "loss": 1.5302, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 1.242966651916504, | |
| "learning_rate": 9.554489376285128e-05, | |
| "loss": 1.6085, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.1269468069076538, | |
| "learning_rate": 9.52021932830706e-05, | |
| "loss": 1.603, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.1521382331848145, | |
| "learning_rate": 9.485949280328992e-05, | |
| "loss": 1.6984, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.3359086513519287, | |
| "learning_rate": 9.451679232350927e-05, | |
| "loss": 1.4839, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.057581901550293, | |
| "learning_rate": 9.417409184372858e-05, | |
| "loss": 1.5541, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.090909719467163, | |
| "learning_rate": 9.383139136394791e-05, | |
| "loss": 1.5811, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 1.3244885206222534, | |
| "learning_rate": 9.348869088416724e-05, | |
| "loss": 1.6006, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.8855965733528137, | |
| "learning_rate": 9.314599040438657e-05, | |
| "loss": 1.5577, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 0.9480008482933044, | |
| "learning_rate": 9.28032899246059e-05, | |
| "loss": 1.6064, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.397888422012329, | |
| "learning_rate": 9.246058944482523e-05, | |
| "loss": 1.5708, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 0.8178092241287231, | |
| "learning_rate": 9.211788896504455e-05, | |
| "loss": 1.4722, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.3776417970657349, | |
| "learning_rate": 9.177518848526389e-05, | |
| "loss": 1.6941, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.3224530220031738, | |
| "learning_rate": 9.14324880054832e-05, | |
| "loss": 1.5414, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.3367009162902832, | |
| "learning_rate": 9.108978752570254e-05, | |
| "loss": 1.6275, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 1.0063951015472412, | |
| "learning_rate": 9.074708704592187e-05, | |
| "loss": 1.6761, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.320760726928711, | |
| "learning_rate": 9.04043865661412e-05, | |
| "loss": 1.5896, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.0159672498703003, | |
| "learning_rate": 9.006168608636053e-05, | |
| "loss": 1.5728, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.095314383506775, | |
| "learning_rate": 8.971898560657985e-05, | |
| "loss": 1.5329, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 1.212713360786438, | |
| "learning_rate": 8.937628512679918e-05, | |
| "loss": 1.5746, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.8203460574150085, | |
| "learning_rate": 8.903358464701851e-05, | |
| "loss": 1.6119, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 0.9643343091011047, | |
| "learning_rate": 8.869088416723784e-05, | |
| "loss": 1.5893, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.2415894269943237, | |
| "learning_rate": 8.834818368745716e-05, | |
| "loss": 1.6291, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.826658844947815, | |
| "learning_rate": 8.80054832076765e-05, | |
| "loss": 1.6394, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 1.3455665111541748, | |
| "learning_rate": 8.766278272789582e-05, | |
| "loss": 1.568, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.8909701108932495, | |
| "learning_rate": 8.732008224811515e-05, | |
| "loss": 1.5733, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 1.4277849197387695, | |
| "learning_rate": 8.697738176833448e-05, | |
| "loss": 1.6339, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 0.9563093185424805, | |
| "learning_rate": 8.663468128855381e-05, | |
| "loss": 1.5775, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 0.8461637496948242, | |
| "learning_rate": 8.629198080877314e-05, | |
| "loss": 1.653, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.0858458280563354, | |
| "learning_rate": 8.594928032899247e-05, | |
| "loss": 1.4778, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 1.1627178192138672, | |
| "learning_rate": 8.560657984921178e-05, | |
| "loss": 1.5374, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.196664571762085, | |
| "learning_rate": 8.526387936943113e-05, | |
| "loss": 1.6483, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.1990993022918701, | |
| "learning_rate": 8.492117888965046e-05, | |
| "loss": 1.5993, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 1.0623687505722046, | |
| "learning_rate": 8.457847840986977e-05, | |
| "loss": 1.5743, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.1684637069702148, | |
| "learning_rate": 8.423577793008912e-05, | |
| "loss": 1.5546, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.2448011636734009, | |
| "learning_rate": 8.389307745030843e-05, | |
| "loss": 1.496, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 0.9411953091621399, | |
| "learning_rate": 8.355037697052776e-05, | |
| "loss": 1.5966, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.0667563676834106, | |
| "learning_rate": 8.320767649074709e-05, | |
| "loss": 1.5128, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.50753653049469, | |
| "learning_rate": 8.286497601096642e-05, | |
| "loss": 1.5772, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 0.9346134662628174, | |
| "learning_rate": 8.252227553118574e-05, | |
| "loss": 1.6321, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.304190754890442, | |
| "learning_rate": 8.217957505140508e-05, | |
| "loss": 1.5656, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 1.058018684387207, | |
| "learning_rate": 8.18368745716244e-05, | |
| "loss": 1.5413, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.15809166431427, | |
| "learning_rate": 8.149417409184373e-05, | |
| "loss": 1.5673, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.092393159866333, | |
| "learning_rate": 8.115147361206306e-05, | |
| "loss": 1.5962, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 0.9390305876731873, | |
| "learning_rate": 8.080877313228239e-05, | |
| "loss": 1.5565, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.002120852470398, | |
| "learning_rate": 8.046607265250173e-05, | |
| "loss": 1.5803, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.0857172012329102, | |
| "learning_rate": 8.012337217272105e-05, | |
| "loss": 1.6345, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 1.8998303413391113, | |
| "eval_runtime": 33.1629, | |
| "eval_samples_per_second": 30.154, | |
| "eval_steps_per_second": 3.769, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 0.9931670427322388, | |
| "learning_rate": 7.978067169294038e-05, | |
| "loss": 1.605, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.3759890794754028, | |
| "learning_rate": 7.94379712131597e-05, | |
| "loss": 1.5059, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.2301968336105347, | |
| "learning_rate": 7.909527073337903e-05, | |
| "loss": 1.582, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 1.1518924236297607, | |
| "learning_rate": 7.875257025359835e-05, | |
| "loss": 1.5839, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.9161165952682495, | |
| "learning_rate": 7.84098697738177e-05, | |
| "loss": 1.5494, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.250705599784851, | |
| "learning_rate": 7.806716929403701e-05, | |
| "loss": 1.5178, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 0.7702249884605408, | |
| "learning_rate": 7.772446881425634e-05, | |
| "loss": 1.644, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.4425973892211914, | |
| "learning_rate": 7.738176833447567e-05, | |
| "loss": 1.5009, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.2036337852478027, | |
| "learning_rate": 7.7039067854695e-05, | |
| "loss": 1.5456, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.4006402492523193, | |
| "learning_rate": 7.669636737491433e-05, | |
| "loss": 1.5511, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.1983481645584106, | |
| "learning_rate": 7.635366689513366e-05, | |
| "loss": 1.5645, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.2755049467086792, | |
| "learning_rate": 7.601096641535297e-05, | |
| "loss": 1.6512, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.3783161640167236, | |
| "learning_rate": 7.566826593557232e-05, | |
| "loss": 1.6747, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.1947081089019775, | |
| "learning_rate": 7.532556545579165e-05, | |
| "loss": 1.6605, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.2230151891708374, | |
| "learning_rate": 7.498286497601096e-05, | |
| "loss": 1.6187, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 1.372226595878601, | |
| "learning_rate": 7.464016449623031e-05, | |
| "loss": 1.6354, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.2375085353851318, | |
| "learning_rate": 7.429746401644962e-05, | |
| "loss": 1.656, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 0.9703730940818787, | |
| "learning_rate": 7.395476353666895e-05, | |
| "loss": 1.5571, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 1.3475947380065918, | |
| "learning_rate": 7.361206305688828e-05, | |
| "loss": 1.5487, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.3879302740097046, | |
| "learning_rate": 7.326936257710761e-05, | |
| "loss": 1.6702, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.4043548107147217, | |
| "learning_rate": 7.292666209732694e-05, | |
| "loss": 1.5555, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.2937321662902832, | |
| "learning_rate": 7.258396161754627e-05, | |
| "loss": 1.5959, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.4525338411331177, | |
| "learning_rate": 7.224126113776559e-05, | |
| "loss": 1.6252, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.1089144945144653, | |
| "learning_rate": 7.189856065798493e-05, | |
| "loss": 1.5027, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 1.2625998258590698, | |
| "learning_rate": 7.155586017820425e-05, | |
| "loss": 1.5907, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.2458665370941162, | |
| "learning_rate": 7.121315969842358e-05, | |
| "loss": 1.54, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.2830859422683716, | |
| "learning_rate": 7.087045921864292e-05, | |
| "loss": 1.5867, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.0032719373703003, | |
| "learning_rate": 7.052775873886224e-05, | |
| "loss": 1.5374, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 0.9105421304702759, | |
| "learning_rate": 7.018505825908157e-05, | |
| "loss": 1.528, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.3588030338287354, | |
| "learning_rate": 6.98423577793009e-05, | |
| "loss": 1.6368, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.4903500080108643, | |
| "learning_rate": 6.949965729952023e-05, | |
| "loss": 1.675, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 1.229722261428833, | |
| "learning_rate": 6.915695681973956e-05, | |
| "loss": 1.555, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 0.9523776769638062, | |
| "learning_rate": 6.881425633995888e-05, | |
| "loss": 1.6608, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.986708164215088, | |
| "learning_rate": 6.84715558601782e-05, | |
| "loss": 1.7199, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 0.79183429479599, | |
| "learning_rate": 6.812885538039754e-05, | |
| "loss": 1.5034, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.1760715246200562, | |
| "learning_rate": 6.778615490061686e-05, | |
| "loss": 1.6812, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.7899055480957031, | |
| "learning_rate": 6.744345442083619e-05, | |
| "loss": 1.7389, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.2628593444824219, | |
| "learning_rate": 6.710075394105552e-05, | |
| "loss": 1.5317, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 1.037351131439209, | |
| "learning_rate": 6.675805346127485e-05, | |
| "loss": 1.5858, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.2006704807281494, | |
| "learning_rate": 6.641535298149417e-05, | |
| "loss": 1.4587, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.0877715349197388, | |
| "learning_rate": 6.607265250171351e-05, | |
| "loss": 1.5306, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.4047476053237915, | |
| "learning_rate": 6.572995202193284e-05, | |
| "loss": 1.5603, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.2444441318511963, | |
| "learning_rate": 6.538725154215215e-05, | |
| "loss": 1.5809, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.5738134384155273, | |
| "learning_rate": 6.50445510623715e-05, | |
| "loss": 1.5606, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 1.4850690364837646, | |
| "learning_rate": 6.470185058259081e-05, | |
| "loss": 1.4945, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.3746342658996582, | |
| "learning_rate": 6.435915010281016e-05, | |
| "loss": 1.5152, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.139249324798584, | |
| "learning_rate": 6.401644962302947e-05, | |
| "loss": 1.6004, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.3590480089187622, | |
| "learning_rate": 6.36737491432488e-05, | |
| "loss": 1.4926, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.6366995573043823, | |
| "learning_rate": 6.333104866346813e-05, | |
| "loss": 1.6734, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.1154892444610596, | |
| "learning_rate": 6.298834818368746e-05, | |
| "loss": 1.5628, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "eval_loss": 1.9075069427490234, | |
| "eval_runtime": 33.1019, | |
| "eval_samples_per_second": 30.21, | |
| "eval_steps_per_second": 3.776, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.123923897743225, | |
| "learning_rate": 6.264564770390678e-05, | |
| "loss": 1.6206, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.3015213012695312, | |
| "learning_rate": 6.230294722412612e-05, | |
| "loss": 1.6292, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.8867294788360596, | |
| "learning_rate": 6.196024674434544e-05, | |
| "loss": 1.6625, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.5840169191360474, | |
| "learning_rate": 6.161754626456477e-05, | |
| "loss": 1.6224, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 0.9141889810562134, | |
| "learning_rate": 6.12748457847841e-05, | |
| "loss": 1.5051, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.5261061191558838, | |
| "learning_rate": 6.093214530500343e-05, | |
| "loss": 1.4289, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 1.2253016233444214, | |
| "learning_rate": 6.0589444825222764e-05, | |
| "loss": 1.6065, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.7163646221160889, | |
| "learning_rate": 6.0246744345442087e-05, | |
| "loss": 1.5978, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.0204969644546509, | |
| "learning_rate": 5.9904043865661416e-05, | |
| "loss": 1.6267, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.9314994812011719, | |
| "learning_rate": 5.956134338588074e-05, | |
| "loss": 1.6486, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.1685149669647217, | |
| "learning_rate": 5.9218642906100076e-05, | |
| "loss": 1.6397, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.422166347503662, | |
| "learning_rate": 5.88759424263194e-05, | |
| "loss": 1.6419, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 1.3074285984039307, | |
| "learning_rate": 5.853324194653873e-05, | |
| "loss": 1.565, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.965584933757782, | |
| "learning_rate": 5.819054146675805e-05, | |
| "loss": 1.5841, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 0.9101732969284058, | |
| "learning_rate": 5.784784098697739e-05, | |
| "loss": 1.6144, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.183640718460083, | |
| "learning_rate": 5.750514050719671e-05, | |
| "loss": 1.5998, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.1072790622711182, | |
| "learning_rate": 5.716244002741604e-05, | |
| "loss": 1.4634, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.608017086982727, | |
| "learning_rate": 5.681973954763536e-05, | |
| "loss": 1.5629, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.4969751834869385, | |
| "learning_rate": 5.64770390678547e-05, | |
| "loss": 1.5966, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.727695107460022, | |
| "learning_rate": 5.613433858807403e-05, | |
| "loss": 1.5456, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.4587767124176025, | |
| "learning_rate": 5.579163810829335e-05, | |
| "loss": 1.5238, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.5338579416275024, | |
| "learning_rate": 5.544893762851269e-05, | |
| "loss": 1.5485, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 0.8002244234085083, | |
| "learning_rate": 5.510623714873201e-05, | |
| "loss": 1.634, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.281417727470398, | |
| "learning_rate": 5.476353666895134e-05, | |
| "loss": 1.589, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 0.906808078289032, | |
| "learning_rate": 5.4420836189170664e-05, | |
| "loss": 1.57, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.799028992652893, | |
| "learning_rate": 5.407813570939e-05, | |
| "loss": 1.623, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 1.2560220956802368, | |
| "learning_rate": 5.3735435229609324e-05, | |
| "loss": 1.4231, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.315132737159729, | |
| "learning_rate": 5.339273474982865e-05, | |
| "loss": 1.553, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.1687719821929932, | |
| "learning_rate": 5.3050034270047976e-05, | |
| "loss": 1.691, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 1.182626724243164, | |
| "learning_rate": 5.270733379026731e-05, | |
| "loss": 1.58, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 0.819560170173645, | |
| "learning_rate": 5.2364633310486636e-05, | |
| "loss": 1.574, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.4093881845474243, | |
| "learning_rate": 5.2021932830705965e-05, | |
| "loss": 1.5805, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.079927921295166, | |
| "learning_rate": 5.167923235092529e-05, | |
| "loss": 1.6296, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.1056098937988281, | |
| "learning_rate": 5.1336531871144625e-05, | |
| "loss": 1.4964, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.924827218055725, | |
| "learning_rate": 5.0993831391363954e-05, | |
| "loss": 1.5223, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 1.461719274520874, | |
| "learning_rate": 5.065113091158328e-05, | |
| "loss": 1.5323, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.6647108793258667, | |
| "learning_rate": 5.0308430431802614e-05, | |
| "loss": 1.6025, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.33492910861969, | |
| "learning_rate": 4.996572995202194e-05, | |
| "loss": 1.5554, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.134582757949829, | |
| "learning_rate": 4.962302947224126e-05, | |
| "loss": 1.621, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.315508246421814, | |
| "learning_rate": 4.928032899246059e-05, | |
| "loss": 1.5828, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.3290214538574219, | |
| "learning_rate": 4.8937628512679926e-05, | |
| "loss": 1.578, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.2616337537765503, | |
| "learning_rate": 4.859492803289925e-05, | |
| "loss": 1.6177, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.4099230766296387, | |
| "learning_rate": 4.825222755311858e-05, | |
| "loss": 1.4926, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 0.9720429182052612, | |
| "learning_rate": 4.790952707333791e-05, | |
| "loss": 1.6552, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.1491189002990723, | |
| "learning_rate": 4.756682659355723e-05, | |
| "loss": 1.7001, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.1790263652801514, | |
| "learning_rate": 4.722412611377656e-05, | |
| "loss": 1.544, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.1880890130996704, | |
| "learning_rate": 4.688142563399589e-05, | |
| "loss": 1.6053, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 1.0895016193389893, | |
| "learning_rate": 4.653872515421522e-05, | |
| "loss": 1.455, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.230600118637085, | |
| "learning_rate": 4.619602467443454e-05, | |
| "loss": 1.5752, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.4027129411697388, | |
| "learning_rate": 4.585332419465387e-05, | |
| "loss": 1.5461, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "eval_loss": 1.9048413038253784, | |
| "eval_runtime": 33.129, | |
| "eval_samples_per_second": 30.185, | |
| "eval_steps_per_second": 3.773, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 0.8590341806411743, | |
| "learning_rate": 4.55106237148732e-05, | |
| "loss": 1.6303, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.0827935934066772, | |
| "learning_rate": 4.516792323509253e-05, | |
| "loss": 1.5712, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 0.8795824646949768, | |
| "learning_rate": 4.4825222755311855e-05, | |
| "loss": 1.4882, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 1.509653091430664, | |
| "learning_rate": 4.4482522275531185e-05, | |
| "loss": 1.5534, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.0400638580322266, | |
| "learning_rate": 4.413982179575052e-05, | |
| "loss": 1.5681, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.1006004810333252, | |
| "learning_rate": 4.3797121315969844e-05, | |
| "loss": 1.5715, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.1621884107589722, | |
| "learning_rate": 4.3454420836189174e-05, | |
| "loss": 1.6373, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.0296626091003418, | |
| "learning_rate": 4.3111720356408503e-05, | |
| "loss": 1.6076, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 1.6784312725067139, | |
| "learning_rate": 4.276901987662783e-05, | |
| "loss": 1.6046, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.0730016231536865, | |
| "learning_rate": 4.2426319396847156e-05, | |
| "loss": 1.6317, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.0070710182189941, | |
| "learning_rate": 4.2083618917066486e-05, | |
| "loss": 1.5472, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.143546462059021, | |
| "learning_rate": 4.1740918437285815e-05, | |
| "loss": 1.4993, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.8565304279327393, | |
| "learning_rate": 4.1398217957505145e-05, | |
| "loss": 1.6021, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.1914728879928589, | |
| "learning_rate": 4.105551747772447e-05, | |
| "loss": 1.7231, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.6387224197387695, | |
| "learning_rate": 4.07128169979438e-05, | |
| "loss": 1.5804, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.65473210811615, | |
| "learning_rate": 4.037011651816313e-05, | |
| "loss": 1.6404, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.6097077131271362, | |
| "learning_rate": 4.002741603838245e-05, | |
| "loss": 1.4651, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.4290515184402466, | |
| "learning_rate": 3.968471555860178e-05, | |
| "loss": 1.5668, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.047481894493103, | |
| "learning_rate": 3.934201507882111e-05, | |
| "loss": 1.5275, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.3638914823532104, | |
| "learning_rate": 3.8999314599040446e-05, | |
| "loss": 1.6588, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 1.7712153196334839, | |
| "learning_rate": 3.865661411925977e-05, | |
| "loss": 1.6079, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.0898468494415283, | |
| "learning_rate": 3.83139136394791e-05, | |
| "loss": 1.574, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.4913599491119385, | |
| "learning_rate": 3.797121315969843e-05, | |
| "loss": 1.5376, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.225707769393921, | |
| "learning_rate": 3.762851267991775e-05, | |
| "loss": 1.5925, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.5699125528335571, | |
| "learning_rate": 3.728581220013708e-05, | |
| "loss": 1.593, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.318574070930481, | |
| "learning_rate": 3.694311172035641e-05, | |
| "loss": 1.5525, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 1.4544116258621216, | |
| "learning_rate": 3.660041124057574e-05, | |
| "loss": 1.5678, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.7460687160491943, | |
| "learning_rate": 3.6257710760795063e-05, | |
| "loss": 1.6081, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.4106998443603516, | |
| "learning_rate": 3.591501028101439e-05, | |
| "loss": 1.5687, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.0583499670028687, | |
| "learning_rate": 3.557230980123372e-05, | |
| "loss": 1.5467, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.2292665243148804, | |
| "learning_rate": 3.522960932145305e-05, | |
| "loss": 1.5491, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.3556251525878906, | |
| "learning_rate": 3.4886908841672375e-05, | |
| "loss": 1.5568, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 1.6374377012252808, | |
| "learning_rate": 3.4544208361891705e-05, | |
| "loss": 1.6016, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.0343750715255737, | |
| "learning_rate": 3.420150788211104e-05, | |
| "loss": 1.4693, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.378056526184082, | |
| "learning_rate": 3.3858807402330365e-05, | |
| "loss": 1.6081, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.370970368385315, | |
| "learning_rate": 3.3516106922549694e-05, | |
| "loss": 1.515, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.3780639171600342, | |
| "learning_rate": 3.3173406442769024e-05, | |
| "loss": 1.5644, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.0907922983169556, | |
| "learning_rate": 3.2830705962988354e-05, | |
| "loss": 1.5701, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.4807682037353516, | |
| "learning_rate": 3.2488005483207677e-05, | |
| "loss": 1.5535, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 1.7207825183868408, | |
| "learning_rate": 3.2145305003427006e-05, | |
| "loss": 1.6049, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 0.8784241676330566, | |
| "learning_rate": 3.1802604523646336e-05, | |
| "loss": 1.5213, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.6283917427062988, | |
| "learning_rate": 3.1459904043865666e-05, | |
| "loss": 1.4902, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.0017669200897217, | |
| "learning_rate": 3.111720356408499e-05, | |
| "loss": 1.5147, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.4256327152252197, | |
| "learning_rate": 3.077450308430432e-05, | |
| "loss": 1.518, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.4298090934753418, | |
| "learning_rate": 3.0431802604523645e-05, | |
| "loss": 1.5843, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.0894560813903809, | |
| "learning_rate": 3.0089102124742974e-05, | |
| "loss": 1.5931, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.8101505041122437, | |
| "learning_rate": 2.97464016449623e-05, | |
| "loss": 1.5629, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 0.966204047203064, | |
| "learning_rate": 2.9403701165181634e-05, | |
| "loss": 1.5048, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 1.2718944549560547, | |
| "learning_rate": 2.9061000685400963e-05, | |
| "loss": 1.6685, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 0.9012284874916077, | |
| "learning_rate": 2.871830020562029e-05, | |
| "loss": 1.5769, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "eval_loss": 1.9053254127502441, | |
| "eval_runtime": 33.1389, | |
| "eval_samples_per_second": 30.176, | |
| "eval_steps_per_second": 3.772, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.4876313209533691, | |
| "learning_rate": 2.837559972583962e-05, | |
| "loss": 1.4846, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 0.9953039288520813, | |
| "learning_rate": 2.8032899246058946e-05, | |
| "loss": 1.6145, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.4575115442276, | |
| "learning_rate": 2.7690198766278275e-05, | |
| "loss": 1.5442, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.3410977125167847, | |
| "learning_rate": 2.73474982864976e-05, | |
| "loss": 1.5617, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 1.5489014387130737, | |
| "learning_rate": 2.700479780671693e-05, | |
| "loss": 1.6061, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.2693567276000977, | |
| "learning_rate": 2.6662097326936258e-05, | |
| "loss": 1.576, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.776106595993042, | |
| "learning_rate": 2.6319396847155587e-05, | |
| "loss": 1.57, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.0588148832321167, | |
| "learning_rate": 2.5976696367374914e-05, | |
| "loss": 1.476, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 1.133484125137329, | |
| "learning_rate": 2.5633995887594243e-05, | |
| "loss": 1.5103, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.3961825370788574, | |
| "learning_rate": 2.529129540781357e-05, | |
| "loss": 1.59, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.7427486181259155, | |
| "learning_rate": 2.49485949280329e-05, | |
| "loss": 1.5608, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.961029291152954, | |
| "learning_rate": 2.460589444825223e-05, | |
| "loss": 1.6278, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.2870323657989502, | |
| "learning_rate": 2.4263193968471555e-05, | |
| "loss": 1.5877, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 1.204353928565979, | |
| "learning_rate": 2.3920493488690885e-05, | |
| "loss": 1.5, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 0.9764713644981384, | |
| "learning_rate": 2.357779300891021e-05, | |
| "loss": 1.6451, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.2140144109725952, | |
| "learning_rate": 2.3235092529129544e-05, | |
| "loss": 1.4958, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.9167425632476807, | |
| "learning_rate": 2.289239204934887e-05, | |
| "loss": 1.569, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 1.864986538887024, | |
| "learning_rate": 2.25496915695682e-05, | |
| "loss": 1.5232, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.2421759366989136, | |
| "learning_rate": 2.2206991089787527e-05, | |
| "loss": 1.5894, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.412864089012146, | |
| "learning_rate": 2.1898560657984922e-05, | |
| "loss": 1.5737, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 1.105542778968811, | |
| "learning_rate": 2.1555860178204252e-05, | |
| "loss": 1.5747, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.7511961460113525, | |
| "learning_rate": 2.1213159698423578e-05, | |
| "loss": 1.5455, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.4287422895431519, | |
| "learning_rate": 2.0870459218642908e-05, | |
| "loss": 1.4871, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 0.835995614528656, | |
| "learning_rate": 2.0527758738862234e-05, | |
| "loss": 1.6128, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.8323885202407837, | |
| "learning_rate": 2.0185058259081564e-05, | |
| "loss": 1.6333, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.5953247547149658, | |
| "learning_rate": 1.984235777930089e-05, | |
| "loss": 1.5631, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.4622983932495117, | |
| "learning_rate": 1.9499657299520223e-05, | |
| "loss": 1.5065, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.6321667432785034, | |
| "learning_rate": 1.915695681973955e-05, | |
| "loss": 1.59, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 1.3693170547485352, | |
| "learning_rate": 1.8814256339958876e-05, | |
| "loss": 1.5847, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.5187667608261108, | |
| "learning_rate": 1.8471555860178205e-05, | |
| "loss": 1.541, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.1000255346298218, | |
| "learning_rate": 1.8128855380397532e-05, | |
| "loss": 1.4775, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 1.4071645736694336, | |
| "learning_rate": 1.778615490061686e-05, | |
| "loss": 1.6336, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.5703157186508179, | |
| "learning_rate": 1.7443454420836188e-05, | |
| "loss": 1.6725, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.0555702447891235, | |
| "learning_rate": 1.710075394105552e-05, | |
| "loss": 1.4712, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 1.4873102903366089, | |
| "learning_rate": 1.6758053461274847e-05, | |
| "loss": 1.5741, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.1715468168258667, | |
| "learning_rate": 1.6415352981494177e-05, | |
| "loss": 1.4884, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.82741379737854, | |
| "learning_rate": 1.6072652501713503e-05, | |
| "loss": 1.5778, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.6479945182800293, | |
| "learning_rate": 1.5729952021932833e-05, | |
| "loss": 1.6802, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.0871607065200806, | |
| "learning_rate": 1.538725154215216e-05, | |
| "loss": 1.5509, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.7326961755752563, | |
| "learning_rate": 1.5044551062371487e-05, | |
| "loss": 1.5746, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 1.3573588132858276, | |
| "learning_rate": 1.4701850582590817e-05, | |
| "loss": 1.6147, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.807897925376892, | |
| "learning_rate": 1.4359150102810145e-05, | |
| "loss": 1.6446, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.0243467092514038, | |
| "learning_rate": 1.4016449623029473e-05, | |
| "loss": 1.5844, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 1.709069013595581, | |
| "learning_rate": 1.36737491432488e-05, | |
| "loss": 1.5774, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.717564582824707, | |
| "learning_rate": 1.3331048663468129e-05, | |
| "loss": 1.5898, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.1066781282424927, | |
| "learning_rate": 1.2988348183687457e-05, | |
| "loss": 1.5828, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 1.119360089302063, | |
| "learning_rate": 1.2645647703906785e-05, | |
| "loss": 1.5321, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.0519651174545288, | |
| "learning_rate": 1.2302947224126114e-05, | |
| "loss": 1.5691, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.7377208471298218, | |
| "learning_rate": 1.1960246744345442e-05, | |
| "loss": 1.5568, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.4080170392990112, | |
| "learning_rate": 1.1617546264564772e-05, | |
| "loss": 1.6109, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "eval_loss": 1.9016900062561035, | |
| "eval_runtime": 33.1979, | |
| "eval_samples_per_second": 30.122, | |
| "eval_steps_per_second": 3.765, | |
| "step": 5500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5838, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "total_flos": 2.980420245786624e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |