where-lambo-checkpoints / trainer_state.json
amazingvince's picture
Upload folder using huggingface_hub
40b94e0
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.056865781523698304,
"eval_steps": 100,
"global_step": 1400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.619433198380567e-07,
"loss": 3.5456,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 8.097165991902834e-07,
"loss": 3.5854,
"step": 5
},
{
"epoch": 0.0,
"learning_rate": 1.6194331983805669e-06,
"loss": 2.0027,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 2.42914979757085e-06,
"loss": 1.138,
"step": 15
},
{
"epoch": 0.0,
"learning_rate": 3.2388663967611337e-06,
"loss": 0.989,
"step": 20
},
{
"epoch": 0.0,
"learning_rate": 4.048582995951417e-06,
"loss": 0.9135,
"step": 25
},
{
"epoch": 0.0,
"learning_rate": 4.8582995951417e-06,
"loss": 0.9136,
"step": 30
},
{
"epoch": 0.0,
"learning_rate": 5.668016194331984e-06,
"loss": 0.9079,
"step": 35
},
{
"epoch": 0.0,
"learning_rate": 6.4777327935222675e-06,
"loss": 0.824,
"step": 40
},
{
"epoch": 0.0,
"learning_rate": 7.2874493927125516e-06,
"loss": 0.8211,
"step": 45
},
{
"epoch": 0.0,
"learning_rate": 8.097165991902834e-06,
"loss": 0.8243,
"step": 50
},
{
"epoch": 0.0,
"learning_rate": 8.906882591093118e-06,
"loss": 0.7796,
"step": 55
},
{
"epoch": 0.0,
"learning_rate": 9.7165991902834e-06,
"loss": 0.78,
"step": 60
},
{
"epoch": 0.0,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.7939,
"step": 65
},
{
"epoch": 0.0,
"learning_rate": 1.1336032388663969e-05,
"loss": 0.7776,
"step": 70
},
{
"epoch": 0.0,
"learning_rate": 1.2145748987854251e-05,
"loss": 0.8019,
"step": 75
},
{
"epoch": 0.0,
"learning_rate": 1.2955465587044535e-05,
"loss": 0.8024,
"step": 80
},
{
"epoch": 0.0,
"learning_rate": 1.3765182186234817e-05,
"loss": 0.8603,
"step": 85
},
{
"epoch": 0.0,
"learning_rate": 1.4574898785425103e-05,
"loss": 0.8556,
"step": 90
},
{
"epoch": 0.0,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.8025,
"step": 95
},
{
"epoch": 0.0,
"learning_rate": 1.6194331983805668e-05,
"loss": 0.7998,
"step": 100
},
{
"epoch": 0.0,
"eval_loss": 0.7919395565986633,
"eval_runtime": 147.6152,
"eval_samples_per_second": 16.028,
"eval_steps_per_second": 2.676,
"step": 100
},
{
"epoch": 0.0,
"learning_rate": 1.7004048582995952e-05,
"loss": 0.8449,
"step": 105
},
{
"epoch": 0.0,
"learning_rate": 1.7813765182186236e-05,
"loss": 0.8517,
"step": 110
},
{
"epoch": 0.0,
"learning_rate": 1.862348178137652e-05,
"loss": 0.8707,
"step": 115
},
{
"epoch": 0.0,
"learning_rate": 1.94331983805668e-05,
"loss": 0.8274,
"step": 120
},
{
"epoch": 0.01,
"learning_rate": 2.0242914979757088e-05,
"loss": 0.8742,
"step": 125
},
{
"epoch": 0.01,
"learning_rate": 2.105263157894737e-05,
"loss": 0.865,
"step": 130
},
{
"epoch": 0.01,
"learning_rate": 2.1862348178137656e-05,
"loss": 0.882,
"step": 135
},
{
"epoch": 0.01,
"learning_rate": 2.2672064777327937e-05,
"loss": 0.8877,
"step": 140
},
{
"epoch": 0.01,
"learning_rate": 2.348178137651822e-05,
"loss": 0.9318,
"step": 145
},
{
"epoch": 0.01,
"learning_rate": 2.4291497975708502e-05,
"loss": 0.9297,
"step": 150
},
{
"epoch": 0.01,
"learning_rate": 2.510121457489879e-05,
"loss": 0.9277,
"step": 155
},
{
"epoch": 0.01,
"learning_rate": 2.591093117408907e-05,
"loss": 0.9756,
"step": 160
},
{
"epoch": 0.01,
"learning_rate": 2.6720647773279354e-05,
"loss": 0.935,
"step": 165
},
{
"epoch": 0.01,
"learning_rate": 2.7530364372469635e-05,
"loss": 0.9592,
"step": 170
},
{
"epoch": 0.01,
"learning_rate": 2.8340080971659922e-05,
"loss": 0.9698,
"step": 175
},
{
"epoch": 0.01,
"learning_rate": 2.9149797570850206e-05,
"loss": 0.9645,
"step": 180
},
{
"epoch": 0.01,
"learning_rate": 2.9959514170040487e-05,
"loss": 0.9502,
"step": 185
},
{
"epoch": 0.01,
"learning_rate": 3.0769230769230774e-05,
"loss": 1.0102,
"step": 190
},
{
"epoch": 0.01,
"learning_rate": 3.157894736842106e-05,
"loss": 0.9904,
"step": 195
},
{
"epoch": 0.01,
"learning_rate": 3.2388663967611336e-05,
"loss": 0.9706,
"step": 200
},
{
"epoch": 0.01,
"eval_loss": 0.9783709645271301,
"eval_runtime": 147.5197,
"eval_samples_per_second": 16.039,
"eval_steps_per_second": 2.678,
"step": 200
},
{
"epoch": 0.01,
"learning_rate": 3.319838056680162e-05,
"loss": 1.0047,
"step": 205
},
{
"epoch": 0.01,
"learning_rate": 3.4008097165991904e-05,
"loss": 1.0575,
"step": 210
},
{
"epoch": 0.01,
"learning_rate": 3.481781376518219e-05,
"loss": 1.0273,
"step": 215
},
{
"epoch": 0.01,
"learning_rate": 3.562753036437247e-05,
"loss": 1.0536,
"step": 220
},
{
"epoch": 0.01,
"learning_rate": 3.6437246963562756e-05,
"loss": 1.0294,
"step": 225
},
{
"epoch": 0.01,
"learning_rate": 3.724696356275304e-05,
"loss": 1.0606,
"step": 230
},
{
"epoch": 0.01,
"learning_rate": 3.8056680161943324e-05,
"loss": 1.0694,
"step": 235
},
{
"epoch": 0.01,
"learning_rate": 3.88663967611336e-05,
"loss": 1.0938,
"step": 240
},
{
"epoch": 0.01,
"learning_rate": 3.967611336032389e-05,
"loss": 1.0869,
"step": 245
},
{
"epoch": 0.01,
"learning_rate": 3.999999850459128e-05,
"loss": 1.168,
"step": 250
},
{
"epoch": 0.01,
"learning_rate": 3.999998936598321e-05,
"loss": 1.1085,
"step": 255
},
{
"epoch": 0.01,
"learning_rate": 3.999997191955348e-05,
"loss": 1.1078,
"step": 260
},
{
"epoch": 0.01,
"learning_rate": 3.9999946165309344e-05,
"loss": 1.1171,
"step": 265
},
{
"epoch": 0.01,
"learning_rate": 3.9999912103261505e-05,
"loss": 1.1211,
"step": 270
},
{
"epoch": 0.01,
"learning_rate": 3.99998697334241e-05,
"loss": 1.1611,
"step": 275
},
{
"epoch": 0.01,
"learning_rate": 3.999981905581474e-05,
"loss": 1.1409,
"step": 280
},
{
"epoch": 0.01,
"learning_rate": 3.999976007045447e-05,
"loss": 1.1618,
"step": 285
},
{
"epoch": 0.01,
"learning_rate": 3.9999692777367795e-05,
"loss": 1.1556,
"step": 290
},
{
"epoch": 0.01,
"learning_rate": 3.999961717658266e-05,
"loss": 1.1638,
"step": 295
},
{
"epoch": 0.01,
"learning_rate": 3.999953326813049e-05,
"loss": 1.155,
"step": 300
},
{
"epoch": 0.01,
"eval_loss": 1.1065119504928589,
"eval_runtime": 147.7258,
"eval_samples_per_second": 16.016,
"eval_steps_per_second": 2.674,
"step": 300
},
{
"epoch": 0.01,
"learning_rate": 3.999944105204611e-05,
"loss": 1.1231,
"step": 305
},
{
"epoch": 0.01,
"learning_rate": 3.999934052836784e-05,
"loss": 1.1643,
"step": 310
},
{
"epoch": 0.01,
"learning_rate": 3.999923169713744e-05,
"loss": 1.1288,
"step": 315
},
{
"epoch": 0.01,
"learning_rate": 3.999911455840012e-05,
"loss": 1.2138,
"step": 320
},
{
"epoch": 0.01,
"learning_rate": 3.9998989112204534e-05,
"loss": 1.1732,
"step": 325
},
{
"epoch": 0.01,
"learning_rate": 3.999885535860278e-05,
"loss": 1.2124,
"step": 330
},
{
"epoch": 0.01,
"learning_rate": 3.9998713297650436e-05,
"loss": 1.1609,
"step": 335
},
{
"epoch": 0.01,
"learning_rate": 3.9998562929406505e-05,
"loss": 1.1319,
"step": 340
},
{
"epoch": 0.01,
"learning_rate": 3.999840425393345e-05,
"loss": 1.1741,
"step": 345
},
{
"epoch": 0.01,
"learning_rate": 3.999823727129718e-05,
"loss": 1.1274,
"step": 350
},
{
"epoch": 0.01,
"learning_rate": 3.999806198156706e-05,
"loss": 1.1295,
"step": 355
},
{
"epoch": 0.01,
"learning_rate": 3.999787838481591e-05,
"loss": 1.1273,
"step": 360
},
{
"epoch": 0.01,
"learning_rate": 3.999768648111998e-05,
"loss": 1.1855,
"step": 365
},
{
"epoch": 0.02,
"learning_rate": 3.9997486270559006e-05,
"loss": 1.1445,
"step": 370
},
{
"epoch": 0.02,
"learning_rate": 3.999727775321613e-05,
"loss": 1.1475,
"step": 375
},
{
"epoch": 0.02,
"learning_rate": 3.9997060929177987e-05,
"loss": 1.1315,
"step": 380
},
{
"epoch": 0.02,
"learning_rate": 3.999683579853463e-05,
"loss": 1.174,
"step": 385
},
{
"epoch": 0.02,
"learning_rate": 3.999660236137959e-05,
"loss": 1.1369,
"step": 390
},
{
"epoch": 0.02,
"learning_rate": 3.9996360617809826e-05,
"loss": 1.1386,
"step": 395
},
{
"epoch": 0.02,
"learning_rate": 3.999611056792576e-05,
"loss": 1.1698,
"step": 400
},
{
"epoch": 0.02,
"eval_loss": 1.1100603342056274,
"eval_runtime": 147.5975,
"eval_samples_per_second": 16.03,
"eval_steps_per_second": 2.676,
"step": 400
},
{
"epoch": 0.02,
"learning_rate": 3.9995852211831254e-05,
"loss": 1.1222,
"step": 405
},
{
"epoch": 0.02,
"learning_rate": 3.999558554963364e-05,
"loss": 1.1563,
"step": 410
},
{
"epoch": 0.02,
"learning_rate": 3.999531058144367e-05,
"loss": 1.1521,
"step": 415
},
{
"epoch": 0.02,
"learning_rate": 3.999502730737558e-05,
"loss": 1.1561,
"step": 420
},
{
"epoch": 0.02,
"learning_rate": 3.9994735727547025e-05,
"loss": 1.1979,
"step": 425
},
{
"epoch": 0.02,
"learning_rate": 3.999443584207914e-05,
"loss": 1.1811,
"step": 430
},
{
"epoch": 0.02,
"learning_rate": 3.999412765109648e-05,
"loss": 1.1241,
"step": 435
},
{
"epoch": 0.02,
"learning_rate": 3.999381115472707e-05,
"loss": 1.1933,
"step": 440
},
{
"epoch": 0.02,
"learning_rate": 3.999348635310238e-05,
"loss": 1.1941,
"step": 445
},
{
"epoch": 0.02,
"learning_rate": 3.999315324635733e-05,
"loss": 1.1779,
"step": 450
},
{
"epoch": 0.02,
"learning_rate": 3.9992811834630296e-05,
"loss": 1.1477,
"step": 455
},
{
"epoch": 0.02,
"learning_rate": 3.9992462118063094e-05,
"loss": 1.1848,
"step": 460
},
{
"epoch": 0.02,
"learning_rate": 3.999210409680098e-05,
"loss": 1.1892,
"step": 465
},
{
"epoch": 0.02,
"learning_rate": 3.99917377709927e-05,
"loss": 1.1892,
"step": 470
},
{
"epoch": 0.02,
"learning_rate": 3.999136314079039e-05,
"loss": 1.2381,
"step": 475
},
{
"epoch": 0.02,
"learning_rate": 3.99909802063497e-05,
"loss": 1.1743,
"step": 480
},
{
"epoch": 0.02,
"learning_rate": 3.999058896782967e-05,
"loss": 1.225,
"step": 485
},
{
"epoch": 0.02,
"learning_rate": 3.9990189425392826e-05,
"loss": 1.2099,
"step": 490
},
{
"epoch": 0.02,
"learning_rate": 3.998978157920515e-05,
"loss": 1.1681,
"step": 495
},
{
"epoch": 0.02,
"learning_rate": 3.9989365429436046e-05,
"loss": 1.1842,
"step": 500
},
{
"epoch": 0.02,
"eval_loss": 1.1305327415466309,
"eval_runtime": 147.7068,
"eval_samples_per_second": 16.018,
"eval_steps_per_second": 2.674,
"step": 500
},
{
"epoch": 0.02,
"learning_rate": 3.9988940976258376e-05,
"loss": 1.2052,
"step": 505
},
{
"epoch": 0.02,
"learning_rate": 3.998850821984845e-05,
"loss": 1.1667,
"step": 510
},
{
"epoch": 0.02,
"learning_rate": 3.998806716038604e-05,
"loss": 1.1788,
"step": 515
},
{
"epoch": 0.02,
"learning_rate": 3.998761779805437e-05,
"loss": 1.1593,
"step": 520
},
{
"epoch": 0.02,
"learning_rate": 3.998716013304007e-05,
"loss": 1.1985,
"step": 525
},
{
"epoch": 0.02,
"learning_rate": 3.9986694165533275e-05,
"loss": 1.2016,
"step": 530
},
{
"epoch": 0.02,
"learning_rate": 3.9986219895727535e-05,
"loss": 1.157,
"step": 535
},
{
"epoch": 0.02,
"learning_rate": 3.998573732381986e-05,
"loss": 1.165,
"step": 540
},
{
"epoch": 0.02,
"learning_rate": 3.9985246450010706e-05,
"loss": 1.1454,
"step": 545
},
{
"epoch": 0.02,
"learning_rate": 3.998474727450397e-05,
"loss": 1.1403,
"step": 550
},
{
"epoch": 0.02,
"learning_rate": 3.998423979750702e-05,
"loss": 1.1456,
"step": 555
},
{
"epoch": 0.02,
"learning_rate": 3.998372401923065e-05,
"loss": 1.1927,
"step": 560
},
{
"epoch": 0.02,
"learning_rate": 3.998319993988911e-05,
"loss": 1.2018,
"step": 565
},
{
"epoch": 0.02,
"learning_rate": 3.998266755970009e-05,
"loss": 1.1562,
"step": 570
},
{
"epoch": 0.02,
"learning_rate": 3.998212687888474e-05,
"loss": 1.1965,
"step": 575
},
{
"epoch": 0.02,
"learning_rate": 3.998157789766767e-05,
"loss": 1.2017,
"step": 580
},
{
"epoch": 0.02,
"learning_rate": 3.9981020616276904e-05,
"loss": 1.1575,
"step": 585
},
{
"epoch": 0.02,
"learning_rate": 3.998045503494394e-05,
"loss": 1.1821,
"step": 590
},
{
"epoch": 0.02,
"learning_rate": 3.9979881153903706e-05,
"loss": 1.1702,
"step": 595
},
{
"epoch": 0.02,
"learning_rate": 3.99792989733946e-05,
"loss": 1.1156,
"step": 600
},
{
"epoch": 0.02,
"eval_loss": 1.1337687969207764,
"eval_runtime": 147.7416,
"eval_samples_per_second": 16.014,
"eval_steps_per_second": 2.674,
"step": 600
},
{
"epoch": 0.02,
"learning_rate": 3.997870849365845e-05,
"loss": 1.2001,
"step": 605
},
{
"epoch": 0.02,
"learning_rate": 3.997810971494054e-05,
"loss": 1.1929,
"step": 610
},
{
"epoch": 0.02,
"learning_rate": 3.997750263748958e-05,
"loss": 1.2504,
"step": 615
},
{
"epoch": 0.03,
"learning_rate": 3.997688726155776e-05,
"loss": 1.1815,
"step": 620
},
{
"epoch": 0.03,
"learning_rate": 3.9976263587400704e-05,
"loss": 1.2068,
"step": 625
},
{
"epoch": 0.03,
"learning_rate": 3.997563161527748e-05,
"loss": 1.1523,
"step": 630
},
{
"epoch": 0.03,
"learning_rate": 3.997499134545059e-05,
"loss": 1.1511,
"step": 635
},
{
"epoch": 0.03,
"learning_rate": 3.9974342778186004e-05,
"loss": 1.2351,
"step": 640
},
{
"epoch": 0.03,
"learning_rate": 3.997368591375314e-05,
"loss": 1.1421,
"step": 645
},
{
"epoch": 0.03,
"learning_rate": 3.997302075242485e-05,
"loss": 1.2697,
"step": 650
},
{
"epoch": 0.03,
"learning_rate": 3.9972347294477433e-05,
"loss": 1.23,
"step": 655
},
{
"epoch": 0.03,
"learning_rate": 3.997166554019063e-05,
"loss": 1.1817,
"step": 660
},
{
"epoch": 0.03,
"learning_rate": 3.997097548984765e-05,
"loss": 1.1849,
"step": 665
},
{
"epoch": 0.03,
"learning_rate": 3.9970277143735124e-05,
"loss": 1.2048,
"step": 670
},
{
"epoch": 0.03,
"learning_rate": 3.996957050214314e-05,
"loss": 1.206,
"step": 675
},
{
"epoch": 0.03,
"learning_rate": 3.996885556536524e-05,
"loss": 1.1733,
"step": 680
},
{
"epoch": 0.03,
"learning_rate": 3.9968132333698396e-05,
"loss": 1.2148,
"step": 685
},
{
"epoch": 0.03,
"learning_rate": 3.9967400807443033e-05,
"loss": 1.2268,
"step": 690
},
{
"epoch": 0.03,
"learning_rate": 3.996666098690301e-05,
"loss": 1.1708,
"step": 695
},
{
"epoch": 0.03,
"learning_rate": 3.9965912872385656e-05,
"loss": 1.2447,
"step": 700
},
{
"epoch": 0.03,
"eval_loss": 1.143655776977539,
"eval_runtime": 147.6778,
"eval_samples_per_second": 16.021,
"eval_steps_per_second": 2.675,
"step": 700
},
{
"epoch": 0.03,
"learning_rate": 3.996515646420173e-05,
"loss": 1.201,
"step": 705
},
{
"epoch": 0.03,
"learning_rate": 3.996439176266544e-05,
"loss": 1.2456,
"step": 710
},
{
"epoch": 0.03,
"learning_rate": 3.996361876809442e-05,
"loss": 1.2182,
"step": 715
},
{
"epoch": 0.03,
"learning_rate": 3.996283748080977e-05,
"loss": 1.2523,
"step": 720
},
{
"epoch": 0.03,
"learning_rate": 3.996204790113605e-05,
"loss": 1.2092,
"step": 725
},
{
"epoch": 0.03,
"learning_rate": 3.996125002940122e-05,
"loss": 1.1716,
"step": 730
},
{
"epoch": 0.03,
"learning_rate": 3.9960443865936726e-05,
"loss": 1.2161,
"step": 735
},
{
"epoch": 0.03,
"learning_rate": 3.995962941107744e-05,
"loss": 1.1401,
"step": 740
},
{
"epoch": 0.03,
"learning_rate": 3.995880666516166e-05,
"loss": 1.2052,
"step": 745
},
{
"epoch": 0.03,
"learning_rate": 3.995797562853117e-05,
"loss": 1.1617,
"step": 750
},
{
"epoch": 0.03,
"learning_rate": 3.995713630153117e-05,
"loss": 1.1427,
"step": 755
},
{
"epoch": 0.03,
"learning_rate": 3.99562886845103e-05,
"loss": 1.1332,
"step": 760
},
{
"epoch": 0.03,
"learning_rate": 3.995543277782066e-05,
"loss": 1.195,
"step": 765
},
{
"epoch": 0.03,
"learning_rate": 3.995456858181778e-05,
"loss": 1.2486,
"step": 770
},
{
"epoch": 0.03,
"learning_rate": 3.995369609686065e-05,
"loss": 1.2163,
"step": 775
},
{
"epoch": 0.03,
"learning_rate": 3.995281532331169e-05,
"loss": 1.1719,
"step": 780
},
{
"epoch": 0.03,
"learning_rate": 3.995192626153676e-05,
"loss": 1.124,
"step": 785
},
{
"epoch": 0.03,
"learning_rate": 3.995102891190517e-05,
"loss": 1.1994,
"step": 790
},
{
"epoch": 0.03,
"learning_rate": 3.995012327478968e-05,
"loss": 1.1725,
"step": 795
},
{
"epoch": 0.03,
"learning_rate": 3.9949209350566464e-05,
"loss": 1.1781,
"step": 800
},
{
"epoch": 0.03,
"eval_loss": 1.1505155563354492,
"eval_runtime": 147.4826,
"eval_samples_per_second": 16.043,
"eval_steps_per_second": 2.678,
"step": 800
},
{
"epoch": 0.03,
"learning_rate": 3.9948287139615176e-05,
"loss": 1.2887,
"step": 805
},
{
"epoch": 0.03,
"learning_rate": 3.994735664231889e-05,
"loss": 1.1734,
"step": 810
},
{
"epoch": 0.03,
"learning_rate": 3.994641785906413e-05,
"loss": 1.1603,
"step": 815
},
{
"epoch": 0.03,
"learning_rate": 3.994547079024084e-05,
"loss": 1.1798,
"step": 820
},
{
"epoch": 0.03,
"learning_rate": 3.994451543624245e-05,
"loss": 1.1728,
"step": 825
},
{
"epoch": 0.03,
"learning_rate": 3.994355179746579e-05,
"loss": 1.1855,
"step": 830
},
{
"epoch": 0.03,
"learning_rate": 3.994257987431116e-05,
"loss": 1.2026,
"step": 835
},
{
"epoch": 0.03,
"learning_rate": 3.9941599667182267e-05,
"loss": 1.1534,
"step": 840
},
{
"epoch": 0.03,
"learning_rate": 3.99406111764863e-05,
"loss": 1.182,
"step": 845
},
{
"epoch": 0.03,
"learning_rate": 3.993961440263386e-05,
"loss": 1.1593,
"step": 850
},
{
"epoch": 0.03,
"learning_rate": 3.9938609346038995e-05,
"loss": 1.1862,
"step": 855
},
{
"epoch": 0.03,
"learning_rate": 3.993759600711921e-05,
"loss": 1.1936,
"step": 860
},
{
"epoch": 0.04,
"learning_rate": 3.993657438629543e-05,
"loss": 1.189,
"step": 865
},
{
"epoch": 0.04,
"learning_rate": 3.993554448399202e-05,
"loss": 1.1711,
"step": 870
},
{
"epoch": 0.04,
"learning_rate": 3.99345063006368e-05,
"loss": 1.2105,
"step": 875
},
{
"epoch": 0.04,
"learning_rate": 3.993345983666102e-05,
"loss": 1.2399,
"step": 880
},
{
"epoch": 0.04,
"learning_rate": 3.9932405092499384e-05,
"loss": 1.1836,
"step": 885
},
{
"epoch": 0.04,
"learning_rate": 3.993134206859001e-05,
"loss": 1.2081,
"step": 890
},
{
"epoch": 0.04,
"learning_rate": 3.993027076537447e-05,
"loss": 1.2749,
"step": 895
},
{
"epoch": 0.04,
"learning_rate": 3.992919118329777e-05,
"loss": 1.2129,
"step": 900
},
{
"epoch": 0.04,
"eval_loss": 1.1472464799880981,
"eval_runtime": 147.6777,
"eval_samples_per_second": 16.021,
"eval_steps_per_second": 2.675,
"step": 900
},
{
"epoch": 0.04,
"learning_rate": 3.992810332280837e-05,
"loss": 1.1682,
"step": 905
},
{
"epoch": 0.04,
"learning_rate": 3.9927007184358156e-05,
"loss": 1.1847,
"step": 910
},
{
"epoch": 0.04,
"learning_rate": 3.9925902768402454e-05,
"loss": 1.23,
"step": 915
},
{
"epoch": 0.04,
"learning_rate": 3.992479007540002e-05,
"loss": 1.2384,
"step": 920
},
{
"epoch": 0.04,
"learning_rate": 3.9923669105813064e-05,
"loss": 1.1557,
"step": 925
},
{
"epoch": 0.04,
"learning_rate": 3.992253986010723e-05,
"loss": 1.1815,
"step": 930
},
{
"epoch": 0.04,
"learning_rate": 3.992140233875159e-05,
"loss": 1.192,
"step": 935
},
{
"epoch": 0.04,
"learning_rate": 3.992025654221865e-05,
"loss": 1.1514,
"step": 940
},
{
"epoch": 0.04,
"learning_rate": 3.99191024709844e-05,
"loss": 1.2003,
"step": 945
},
{
"epoch": 0.04,
"learning_rate": 3.99179401255282e-05,
"loss": 1.1963,
"step": 950
},
{
"epoch": 0.04,
"learning_rate": 3.991676950633288e-05,
"loss": 1.1607,
"step": 955
},
{
"epoch": 0.04,
"learning_rate": 3.9915590613884723e-05,
"loss": 1.1276,
"step": 960
},
{
"epoch": 0.04,
"learning_rate": 3.991440344867341e-05,
"loss": 1.179,
"step": 965
},
{
"epoch": 0.04,
"learning_rate": 3.9913208011192095e-05,
"loss": 1.1568,
"step": 970
},
{
"epoch": 0.04,
"learning_rate": 3.9912004301937346e-05,
"loss": 1.1639,
"step": 975
},
{
"epoch": 0.04,
"learning_rate": 3.991079232140917e-05,
"loss": 1.1477,
"step": 980
},
{
"epoch": 0.04,
"learning_rate": 3.990957207011101e-05,
"loss": 1.1462,
"step": 985
},
{
"epoch": 0.04,
"learning_rate": 3.990834354854976e-05,
"loss": 1.1981,
"step": 990
},
{
"epoch": 0.04,
"learning_rate": 3.990710675723573e-05,
"loss": 1.1501,
"step": 995
},
{
"epoch": 0.04,
"learning_rate": 3.990586169668268e-05,
"loss": 1.1959,
"step": 1000
},
{
"epoch": 0.04,
"eval_loss": 1.1401584148406982,
"eval_runtime": 147.7237,
"eval_samples_per_second": 16.016,
"eval_steps_per_second": 2.674,
"step": 1000
},
{
"epoch": 0.04,
"learning_rate": 3.990460836740779e-05,
"loss": 1.1872,
"step": 1005
},
{
"epoch": 0.04,
"learning_rate": 3.990334676993168e-05,
"loss": 1.1681,
"step": 1010
},
{
"epoch": 0.04,
"learning_rate": 3.990207690477841e-05,
"loss": 1.2579,
"step": 1015
},
{
"epoch": 0.04,
"learning_rate": 3.9900798772475464e-05,
"loss": 1.2083,
"step": 1020
},
{
"epoch": 0.04,
"learning_rate": 3.989951237355379e-05,
"loss": 1.2109,
"step": 1025
},
{
"epoch": 0.04,
"learning_rate": 3.989821770854771e-05,
"loss": 1.2098,
"step": 1030
},
{
"epoch": 0.04,
"learning_rate": 3.989691477799506e-05,
"loss": 1.1644,
"step": 1035
},
{
"epoch": 0.04,
"learning_rate": 3.9895603582437025e-05,
"loss": 1.226,
"step": 1040
},
{
"epoch": 0.04,
"learning_rate": 3.989428412241829e-05,
"loss": 1.2039,
"step": 1045
},
{
"epoch": 0.04,
"learning_rate": 3.989295639848694e-05,
"loss": 1.1914,
"step": 1050
},
{
"epoch": 0.04,
"learning_rate": 3.98916204111945e-05,
"loss": 1.1709,
"step": 1055
},
{
"epoch": 0.04,
"learning_rate": 3.989027616109592e-05,
"loss": 1.1592,
"step": 1060
},
{
"epoch": 0.04,
"learning_rate": 3.988892364874961e-05,
"loss": 1.1795,
"step": 1065
},
{
"epoch": 0.04,
"learning_rate": 3.988756287471736e-05,
"loss": 1.1595,
"step": 1070
},
{
"epoch": 0.04,
"learning_rate": 3.988619383956445e-05,
"loss": 1.1556,
"step": 1075
},
{
"epoch": 0.04,
"learning_rate": 3.988481654385957e-05,
"loss": 1.1523,
"step": 1080
},
{
"epoch": 0.04,
"learning_rate": 3.9883430988174813e-05,
"loss": 1.2019,
"step": 1085
},
{
"epoch": 0.04,
"learning_rate": 3.9882037173085745e-05,
"loss": 1.1442,
"step": 1090
},
{
"epoch": 0.04,
"learning_rate": 3.988063509917133e-05,
"loss": 1.1949,
"step": 1095
},
{
"epoch": 0.04,
"learning_rate": 3.987922476701399e-05,
"loss": 1.249,
"step": 1100
},
{
"epoch": 0.04,
"eval_loss": 1.1385736465454102,
"eval_runtime": 147.6531,
"eval_samples_per_second": 16.024,
"eval_steps_per_second": 2.675,
"step": 1100
},
{
"epoch": 0.04,
"learning_rate": 3.987780617719956e-05,
"loss": 1.1863,
"step": 1105
},
{
"epoch": 0.05,
"learning_rate": 3.987637933031731e-05,
"loss": 1.1402,
"step": 1110
},
{
"epoch": 0.05,
"learning_rate": 3.987494422695994e-05,
"loss": 1.1732,
"step": 1115
},
{
"epoch": 0.05,
"learning_rate": 3.987350086772358e-05,
"loss": 1.2003,
"step": 1120
},
{
"epoch": 0.05,
"learning_rate": 3.987204925320779e-05,
"loss": 1.154,
"step": 1125
},
{
"epoch": 0.05,
"learning_rate": 3.987058938401555e-05,
"loss": 1.1681,
"step": 1130
},
{
"epoch": 0.05,
"learning_rate": 3.9869121260753284e-05,
"loss": 1.1782,
"step": 1135
},
{
"epoch": 0.05,
"learning_rate": 3.9867644884030836e-05,
"loss": 1.1983,
"step": 1140
},
{
"epoch": 0.05,
"learning_rate": 3.986616025446148e-05,
"loss": 1.1692,
"step": 1145
},
{
"epoch": 0.05,
"learning_rate": 3.9864667372661924e-05,
"loss": 1.1413,
"step": 1150
},
{
"epoch": 0.05,
"learning_rate": 3.9863166239252284e-05,
"loss": 1.1805,
"step": 1155
},
{
"epoch": 0.05,
"learning_rate": 3.986165685485614e-05,
"loss": 1.1676,
"step": 1160
},
{
"epoch": 0.05,
"learning_rate": 3.9860139220100456e-05,
"loss": 1.1958,
"step": 1165
},
{
"epoch": 0.05,
"learning_rate": 3.985861333561565e-05,
"loss": 1.1652,
"step": 1170
},
{
"epoch": 0.05,
"learning_rate": 3.985707920203557e-05,
"loss": 1.1271,
"step": 1175
},
{
"epoch": 0.05,
"learning_rate": 3.985553681999747e-05,
"loss": 1.2058,
"step": 1180
},
{
"epoch": 0.05,
"learning_rate": 3.985398619014205e-05,
"loss": 1.1331,
"step": 1185
},
{
"epoch": 0.05,
"learning_rate": 3.985242731311342e-05,
"loss": 1.1927,
"step": 1190
},
{
"epoch": 0.05,
"learning_rate": 3.9850860189559135e-05,
"loss": 1.1241,
"step": 1195
},
{
"epoch": 0.05,
"learning_rate": 3.984928482013016e-05,
"loss": 1.223,
"step": 1200
},
{
"epoch": 0.05,
"eval_loss": 1.1339735984802246,
"eval_runtime": 147.6746,
"eval_samples_per_second": 16.022,
"eval_steps_per_second": 2.675,
"step": 1200
},
{
"epoch": 0.05,
"learning_rate": 3.9847701205480887e-05,
"loss": 1.1781,
"step": 1205
},
{
"epoch": 0.05,
"learning_rate": 3.984610934626913e-05,
"loss": 1.2793,
"step": 1210
},
{
"epoch": 0.05,
"learning_rate": 3.984450924315614e-05,
"loss": 1.1629,
"step": 1215
},
{
"epoch": 0.05,
"learning_rate": 3.98429008968066e-05,
"loss": 1.2031,
"step": 1220
},
{
"epoch": 0.05,
"learning_rate": 3.9841284307888586e-05,
"loss": 1.1911,
"step": 1225
},
{
"epoch": 0.05,
"learning_rate": 3.983965947707361e-05,
"loss": 1.1411,
"step": 1230
},
{
"epoch": 0.05,
"learning_rate": 3.9838026405036625e-05,
"loss": 1.1536,
"step": 1235
},
{
"epoch": 0.05,
"learning_rate": 3.9836385092456e-05,
"loss": 1.2041,
"step": 1240
},
{
"epoch": 0.05,
"learning_rate": 3.98347355400135e-05,
"loss": 1.2097,
"step": 1245
},
{
"epoch": 0.05,
"learning_rate": 3.9833077748394355e-05,
"loss": 1.2365,
"step": 1250
},
{
"epoch": 0.05,
"learning_rate": 3.9831411718287195e-05,
"loss": 1.1123,
"step": 1255
},
{
"epoch": 0.05,
"learning_rate": 3.982973745038406e-05,
"loss": 1.1607,
"step": 1260
},
{
"epoch": 0.05,
"learning_rate": 3.982805494538044e-05,
"loss": 1.2217,
"step": 1265
},
{
"epoch": 0.05,
"learning_rate": 3.982636420397523e-05,
"loss": 1.1717,
"step": 1270
},
{
"epoch": 0.05,
"learning_rate": 3.982466522687075e-05,
"loss": 1.119,
"step": 1275
},
{
"epoch": 0.05,
"learning_rate": 3.982295801477273e-05,
"loss": 1.2046,
"step": 1280
},
{
"epoch": 0.05,
"learning_rate": 3.9821242568390345e-05,
"loss": 1.2211,
"step": 1285
},
{
"epoch": 0.05,
"learning_rate": 3.981951888843617e-05,
"loss": 1.2022,
"step": 1290
},
{
"epoch": 0.05,
"learning_rate": 3.9817786975626215e-05,
"loss": 1.1738,
"step": 1295
},
{
"epoch": 0.05,
"learning_rate": 3.9816046830679884e-05,
"loss": 1.1974,
"step": 1300
},
{
"epoch": 0.05,
"eval_loss": 1.139178991317749,
"eval_runtime": 147.7574,
"eval_samples_per_second": 16.013,
"eval_steps_per_second": 2.673,
"step": 1300
},
{
"epoch": 0.05,
"learning_rate": 3.981429845432003e-05,
"loss": 1.1692,
"step": 1305
},
{
"epoch": 0.05,
"learning_rate": 3.981254184727292e-05,
"loss": 1.2064,
"step": 1310
},
{
"epoch": 0.05,
"learning_rate": 3.981077701026822e-05,
"loss": 1.1531,
"step": 1315
},
{
"epoch": 0.05,
"learning_rate": 3.980900394403903e-05,
"loss": 1.1729,
"step": 1320
},
{
"epoch": 0.05,
"learning_rate": 3.9807222649321865e-05,
"loss": 1.1757,
"step": 1325
},
{
"epoch": 0.05,
"learning_rate": 3.9805433126856676e-05,
"loss": 1.1888,
"step": 1330
},
{
"epoch": 0.05,
"learning_rate": 3.980363537738679e-05,
"loss": 1.1702,
"step": 1335
},
{
"epoch": 0.05,
"learning_rate": 3.9801829401659e-05,
"loss": 1.1866,
"step": 1340
},
{
"epoch": 0.05,
"learning_rate": 3.980001520042348e-05,
"loss": 1.0926,
"step": 1345
},
{
"epoch": 0.05,
"learning_rate": 3.979819277443383e-05,
"loss": 1.2147,
"step": 1350
},
{
"epoch": 0.06,
"learning_rate": 3.979636212444708e-05,
"loss": 1.1995,
"step": 1355
},
{
"epoch": 0.06,
"learning_rate": 3.979452325122365e-05,
"loss": 1.1527,
"step": 1360
},
{
"epoch": 0.06,
"learning_rate": 3.9792676155527416e-05,
"loss": 1.1541,
"step": 1365
},
{
"epoch": 0.06,
"learning_rate": 3.979082083812562e-05,
"loss": 1.1754,
"step": 1370
},
{
"epoch": 0.06,
"learning_rate": 3.9788957299788965e-05,
"loss": 1.1527,
"step": 1375
},
{
"epoch": 0.06,
"learning_rate": 3.978708554129154e-05,
"loss": 1.2118,
"step": 1380
},
{
"epoch": 0.06,
"learning_rate": 3.978520556341086e-05,
"loss": 1.1607,
"step": 1385
},
{
"epoch": 0.06,
"learning_rate": 3.978331736692785e-05,
"loss": 1.2215,
"step": 1390
},
{
"epoch": 0.06,
"learning_rate": 3.978142095262685e-05,
"loss": 1.1173,
"step": 1395
},
{
"epoch": 0.06,
"learning_rate": 3.977951632129561e-05,
"loss": 1.1601,
"step": 1400
},
{
"epoch": 0.06,
"eval_loss": 1.1346731185913086,
"eval_runtime": 147.6488,
"eval_samples_per_second": 16.025,
"eval_steps_per_second": 2.675,
"step": 1400
}
],
"logging_steps": 5,
"max_steps": 24619,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 194475417608192.0,
"trial_name": null,
"trial_params": null
}