{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 282,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.053475935828877004,
"grad_norm": 0.3194684684276581,
"learning_rate": 4.997518240705502e-05,
"loss": 0.2185,
"num_input_tokens_seen": 279408,
"step": 5,
"train_runtime": 91.3615,
"train_tokens_per_second": 3058.267
},
{
"epoch": 0.10695187165775401,
"grad_norm": 0.23469097912311554,
"learning_rate": 4.9874445377212606e-05,
"loss": 0.1329,
"num_input_tokens_seen": 566864,
"step": 10,
"train_runtime": 185.5432,
"train_tokens_per_second": 3055.159
},
{
"epoch": 0.16042780748663102,
"grad_norm": 0.1777949333190918,
"learning_rate": 4.969655004749674e-05,
"loss": 0.1124,
"num_input_tokens_seen": 861088,
"step": 15,
"train_runtime": 283.0649,
"train_tokens_per_second": 3042.016
},
{
"epoch": 0.21390374331550802,
"grad_norm": 0.14744223654270172,
"learning_rate": 4.944204823327408e-05,
"loss": 0.0841,
"num_input_tokens_seen": 1154080,
"step": 20,
"train_runtime": 378.7884,
"train_tokens_per_second": 3046.767
},
{
"epoch": 0.26737967914438504,
"grad_norm": 0.1648288071155548,
"learning_rate": 4.911172937635942e-05,
"loss": 0.067,
"num_input_tokens_seen": 1439584,
"step": 25,
"train_runtime": 471.4705,
"train_tokens_per_second": 3053.391
},
{
"epoch": 0.32085561497326204,
"grad_norm": 0.18084144592285156,
"learning_rate": 4.870661809623788e-05,
"loss": 0.0633,
"num_input_tokens_seen": 1723424,
"step": 30,
"train_runtime": 563.3252,
"train_tokens_per_second": 3059.377
},
{
"epoch": 0.37433155080213903,
"grad_norm": 0.1638505458831787,
"learning_rate": 4.8227971011787196e-05,
"loss": 0.0508,
"num_input_tokens_seen": 2010112,
"step": 35,
"train_runtime": 656.4365,
"train_tokens_per_second": 3062.158
},
{
"epoch": 0.42780748663101603,
"grad_norm": 0.2159387320280075,
"learning_rate": 4.767727284335852e-05,
"loss": 0.0452,
"num_input_tokens_seen": 2293712,
"step": 40,
"train_runtime": 748.4606,
"train_tokens_per_second": 3064.573
},
{
"epoch": 0.48128342245989303,
"grad_norm": 0.14859342575073242,
"learning_rate": 4.705623180730705e-05,
"loss": 0.0401,
"num_input_tokens_seen": 2579088,
"step": 45,
"train_runtime": 841.5866,
"train_tokens_per_second": 3064.555
},
{
"epoch": 0.5347593582887701,
"grad_norm": 0.11989740282297134,
"learning_rate": 4.6366774317257946e-05,
"loss": 0.0418,
"num_input_tokens_seen": 2863376,
"step": 50,
"train_runtime": 933.8669,
"train_tokens_per_second": 3066.15
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.13309597969055176,
"learning_rate": 4.561103900854401e-05,
"loss": 0.0468,
"num_input_tokens_seen": 3157536,
"step": 55,
"train_runtime": 1030.6792,
"train_tokens_per_second": 3063.549
},
{
"epoch": 0.6417112299465241,
"grad_norm": 0.1302647739648819,
"learning_rate": 4.479137010435053e-05,
"loss": 0.0421,
"num_input_tokens_seen": 3438464,
"step": 60,
"train_runtime": 1121.7292,
"train_tokens_per_second": 3065.324
},
{
"epoch": 0.6951871657754011,
"grad_norm": 0.14104266464710236,
"learning_rate": 4.391031014414514e-05,
"loss": 0.04,
"num_input_tokens_seen": 3726720,
"step": 65,
"train_runtime": 1215.6376,
"train_tokens_per_second": 3065.65
},
{
"epoch": 0.7486631016042781,
"grad_norm": 0.09389258176088333,
"learning_rate": 4.2970592096948236e-05,
"loss": 0.0279,
"num_input_tokens_seen": 4011152,
"step": 70,
"train_runtime": 1308.2677,
"train_tokens_per_second": 3066.002
},
{
"epoch": 0.8021390374331551,
"grad_norm": 0.15608885884284973,
"learning_rate": 4.197513088390813e-05,
"loss": 0.0322,
"num_input_tokens_seen": 4294992,
"step": 75,
"train_runtime": 1400.2179,
"train_tokens_per_second": 3067.374
},
{
"epoch": 0.8556149732620321,
"grad_norm": 0.13360099494457245,
"learning_rate": 4.092701433647687e-05,
"loss": 0.034,
"num_input_tokens_seen": 4579744,
"step": 80,
"train_runtime": 1492.6689,
"train_tokens_per_second": 3068.158
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.12436749041080475,
"learning_rate": 3.982949361823388e-05,
"loss": 0.0388,
"num_input_tokens_seen": 4856256,
"step": 85,
"train_runtime": 1581.75,
"train_tokens_per_second": 3070.179
},
{
"epoch": 0.9625668449197861,
"grad_norm": 0.12284192442893982,
"learning_rate": 3.8685973140068e-05,
"loss": 0.0395,
"num_input_tokens_seen": 5130032,
"step": 90,
"train_runtime": 1669.3625,
"train_tokens_per_second": 3073.048
},
{
"epoch": 1.0106951871657754,
"grad_norm": 0.17556537687778473,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.03,
"num_input_tokens_seen": 5392272,
"step": 95,
"train_runtime": 1756.1589,
"train_tokens_per_second": 3070.492
},
{
"epoch": 1.0641711229946524,
"grad_norm": 0.1371014416217804,
"learning_rate": 3.6275252980402544e-05,
"loss": 0.0282,
"num_input_tokens_seen": 5683744,
"step": 100,
"train_runtime": 1851.388,
"train_tokens_per_second": 3069.991
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.18953965604305267,
"learning_rate": 3.501553113674699e-05,
"loss": 0.0452,
"num_input_tokens_seen": 5971952,
"step": 105,
"train_runtime": 1946.7884,
"train_tokens_per_second": 3067.592
},
{
"epoch": 1.1711229946524064,
"grad_norm": 0.1199231818318367,
"learning_rate": 3.3724742013273854e-05,
"loss": 0.0266,
"num_input_tokens_seen": 6263424,
"step": 110,
"train_runtime": 2042.2353,
"train_tokens_per_second": 3066.945
},
{
"epoch": 1.2245989304812834,
"grad_norm": 0.08340111374855042,
"learning_rate": 3.2406889522140856e-05,
"loss": 0.0329,
"num_input_tokens_seen": 6551312,
"step": 115,
"train_runtime": 2135.5267,
"train_tokens_per_second": 3067.773
},
{
"epoch": 1.2780748663101604,
"grad_norm": 0.11855883151292801,
"learning_rate": 3.1066061523646295e-05,
"loss": 0.0232,
"num_input_tokens_seen": 6840256,
"step": 120,
"train_runtime": 2229.9866,
"train_tokens_per_second": 3067.398
},
{
"epoch": 1.3315508021390374,
"grad_norm": 0.13163559138774872,
"learning_rate": 2.9706417146052838e-05,
"loss": 0.0366,
"num_input_tokens_seen": 7128624,
"step": 125,
"train_runtime": 2323.6929,
"train_tokens_per_second": 3067.8
},
{
"epoch": 1.3850267379679144,
"grad_norm": 0.14266876876354218,
"learning_rate": 2.8332173884344477e-05,
"loss": 0.0301,
"num_input_tokens_seen": 7418256,
"step": 130,
"train_runtime": 2418.4715,
"train_tokens_per_second": 3067.332
},
{
"epoch": 1.4385026737967914,
"grad_norm": 0.1019771620631218,
"learning_rate": 2.6947594517935083e-05,
"loss": 0.0253,
"num_input_tokens_seen": 7710672,
"step": 135,
"train_runtime": 2514.2479,
"train_tokens_per_second": 3066.791
},
{
"epoch": 1.4919786096256684,
"grad_norm": 0.07594949007034302,
"learning_rate": 2.555697388790885e-05,
"loss": 0.0291,
"num_input_tokens_seen": 7985712,
"step": 140,
"train_runtime": 2602.9726,
"train_tokens_per_second": 3067.92
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.08458653837442398,
"learning_rate": 2.4164625574808146e-05,
"loss": 0.0236,
"num_input_tokens_seen": 8274128,
"step": 145,
"train_runtime": 2697.4446,
"train_tokens_per_second": 3067.395
},
{
"epoch": 1.5989304812834224,
"grad_norm": 0.1144990548491478,
"learning_rate": 2.277486851829338e-05,
"loss": 0.0288,
"num_input_tokens_seen": 8561696,
"step": 150,
"train_runtime": 2791.3205,
"train_tokens_per_second": 3067.256
},
{
"epoch": 1.6524064171122994,
"grad_norm": 0.14420673251152039,
"learning_rate": 2.1392013620179337e-05,
"loss": 0.0334,
"num_input_tokens_seen": 8846736,
"step": 155,
"train_runtime": 2883.7816,
"train_tokens_per_second": 3067.755
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.14161178469657898,
"learning_rate": 2.0020350372404102e-05,
"loss": 0.0273,
"num_input_tokens_seen": 9120352,
"step": 160,
"train_runtime": 2971.4097,
"train_tokens_per_second": 3069.369
},
{
"epoch": 1.7593582887700534,
"grad_norm": 0.15640532970428467,
"learning_rate": 1.8664133551409612e-05,
"loss": 0.0219,
"num_input_tokens_seen": 9398016,
"step": 165,
"train_runtime": 3061.1211,
"train_tokens_per_second": 3070.122
},
{
"epoch": 1.8128342245989306,
"grad_norm": 0.10912443697452545,
"learning_rate": 1.7327570020206504e-05,
"loss": 0.0246,
"num_input_tokens_seen": 9678592,
"step": 170,
"train_runtime": 3151.8613,
"train_tokens_per_second": 3070.754
},
{
"epoch": 1.8663101604278074,
"grad_norm": 0.1235104501247406,
"learning_rate": 1.6014805679062185e-05,
"loss": 0.0283,
"num_input_tokens_seen": 9965712,
"step": 175,
"train_runtime": 3245.3209,
"train_tokens_per_second": 3070.794
},
{
"epoch": 1.9197860962566846,
"grad_norm": 0.11748490482568741,
"learning_rate": 1.4729912605289767e-05,
"loss": 0.0196,
"num_input_tokens_seen": 10244944,
"step": 180,
"train_runtime": 3335.3629,
"train_tokens_per_second": 3071.613
},
{
"epoch": 1.9732620320855614,
"grad_norm": 0.14662568271160126,
"learning_rate": 1.34768764220293e-05,
"loss": 0.0222,
"num_input_tokens_seen": 10529152,
"step": 185,
"train_runtime": 3427.6542,
"train_tokens_per_second": 3071.824
},
{
"epoch": 2.021390374331551,
"grad_norm": 0.0968947634100914,
"learning_rate": 1.2259583935202062e-05,
"loss": 0.0228,
"num_input_tokens_seen": 10786048,
"step": 190,
"train_runtime": 3511.4724,
"train_tokens_per_second": 3071.66
},
{
"epoch": 2.0748663101604277,
"grad_norm": 0.10974116623401642,
"learning_rate": 1.1081811076986965e-05,
"loss": 0.0294,
"num_input_tokens_seen": 11069968,
"step": 195,
"train_runtime": 3603.8143,
"train_tokens_per_second": 3071.736
},
{
"epoch": 2.128342245989305,
"grad_norm": 0.10370402038097382,
"learning_rate": 9.94721119321739e-06,
"loss": 0.0171,
"num_input_tokens_seen": 11358096,
"step": 200,
"train_runtime": 3697.9457,
"train_tokens_per_second": 3071.461
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.09077204018831253,
"learning_rate": 8.85930371102994e-06,
"loss": 0.0199,
"num_input_tokens_seen": 11639152,
"step": 205,
"train_runtime": 3789.6744,
"train_tokens_per_second": 3071.28
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.11789421737194061,
"learning_rate": 7.8214632219169e-06,
"loss": 0.025,
"num_input_tokens_seen": 11921888,
"step": 210,
"train_runtime": 3881.0879,
"train_tokens_per_second": 3071.79
},
{
"epoch": 2.2887700534759357,
"grad_norm": 0.09948533028364182,
"learning_rate": 6.836909014045925e-06,
"loss": 0.0191,
"num_input_tokens_seen": 12209424,
"step": 215,
"train_runtime": 3975.3631,
"train_tokens_per_second": 3071.273
},
{
"epoch": 2.342245989304813,
"grad_norm": 0.11090569198131561,
"learning_rate": 5.908695086316701e-06,
"loss": 0.0276,
"num_input_tokens_seen": 12500800,
"step": 220,
"train_runtime": 4070.6577,
"train_tokens_per_second": 3070.953
},
{
"epoch": 2.3957219251336896,
"grad_norm": 0.11661939322948456,
"learning_rate": 5.0397006751301435e-06,
"loss": 0.0299,
"num_input_tokens_seen": 12784480,
"step": 225,
"train_runtime": 4162.6024,
"train_tokens_per_second": 3071.271
},
{
"epoch": 2.449197860962567,
"grad_norm": 0.08428950607776642,
"learning_rate": 4.23262132325514e-06,
"loss": 0.023,
"num_input_tokens_seen": 13070832,
"step": 230,
"train_runtime": 4256.1224,
"train_tokens_per_second": 3071.066
},
{
"epoch": 2.502673796791444,
"grad_norm": 0.15925215184688568,
"learning_rate": 3.489960518496521e-06,
"loss": 0.0224,
"num_input_tokens_seen": 13358112,
"step": 235,
"train_runtime": 4349.7506,
"train_tokens_per_second": 3071.006
},
{
"epoch": 2.556149732620321,
"grad_norm": 0.15281225740909576,
"learning_rate": 2.8140219281002718e-06,
"loss": 0.0246,
"num_input_tokens_seen": 13640608,
"step": 240,
"train_runtime": 4440.7769,
"train_tokens_per_second": 3071.672
},
{
"epoch": 2.6096256684491976,
"grad_norm": 0.12512794137001038,
"learning_rate": 2.2069022529842664e-06,
"loss": 0.0249,
"num_input_tokens_seen": 13919056,
"step": 245,
"train_runtime": 4530.1345,
"train_tokens_per_second": 3072.548
},
{
"epoch": 2.663101604278075,
"grad_norm": 0.12052249163389206,
"learning_rate": 1.6704847239599364e-06,
"loss": 0.0226,
"num_input_tokens_seen": 14203200,
"step": 250,
"train_runtime": 4622.3848,
"train_tokens_per_second": 3072.7
},
{
"epoch": 2.716577540106952,
"grad_norm": 0.1099454015493393,
"learning_rate": 1.2064332601191163e-06,
"loss": 0.0188,
"num_input_tokens_seen": 14488480,
"step": 255,
"train_runtime": 4715.2781,
"train_tokens_per_second": 3072.667
},
{
"epoch": 2.770053475935829,
"grad_norm": 0.10424145311117172,
"learning_rate": 8.161873075061499e-07,
"loss": 0.0222,
"num_input_tokens_seen": 14772448,
"step": 260,
"train_runtime": 4807.1927,
"train_tokens_per_second": 3072.989
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.14698970317840576,
"learning_rate": 5.009573740853313e-07,
"loss": 0.023,
"num_input_tokens_seen": 15056768,
"step": 265,
"train_runtime": 4899.3582,
"train_tokens_per_second": 3073.212
},
{
"epoch": 2.877005347593583,
"grad_norm": 0.15283866226673126,
"learning_rate": 2.617212748536491e-07,
"loss": 0.0211,
"num_input_tokens_seen": 15352912,
"step": 270,
"train_runtime": 4996.8488,
"train_tokens_per_second": 3072.519
},
{
"epoch": 2.93048128342246,
"grad_norm": 0.13693705201148987,
"learning_rate": 9.922109874636876e-08,
"loss": 0.0306,
"num_input_tokens_seen": 15644144,
"step": 275,
"train_runtime": 5092.097,
"train_tokens_per_second": 3072.24
},
{
"epoch": 2.983957219251337,
"grad_norm": 0.13499431312084198,
"learning_rate": 1.3960906743634706e-08,
"loss": 0.0195,
"num_input_tokens_seen": 15928032,
"step": 280,
"train_runtime": 5183.7733,
"train_tokens_per_second": 3072.671
},
{
"epoch": 3.0,
"num_input_tokens_seen": 16009072,
"step": 282,
"total_flos": 7.46088811960959e+17,
"train_loss": 0.038041487124794764,
"train_runtime": 5210.8723,
"train_samples_per_second": 0.861,
"train_steps_per_second": 0.054
}
],
"logging_steps": 5,
"max_steps": 282,
"num_input_tokens_seen": 16009072,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.46088811960959e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}