{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 282,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.053475935828877004,
      "grad_norm": 0.3194684684276581,
      "learning_rate": 4.997518240705502e-05,
      "loss": 0.2185,
      "num_input_tokens_seen": 279408,
      "step": 5,
      "train_runtime": 91.3615,
      "train_tokens_per_second": 3058.267
    },
    {
      "epoch": 0.10695187165775401,
      "grad_norm": 0.23469097912311554,
      "learning_rate": 4.9874445377212606e-05,
      "loss": 0.1329,
      "num_input_tokens_seen": 566864,
      "step": 10,
      "train_runtime": 185.5432,
      "train_tokens_per_second": 3055.159
    },
    {
      "epoch": 0.16042780748663102,
      "grad_norm": 0.1777949333190918,
      "learning_rate": 4.969655004749674e-05,
      "loss": 0.1124,
      "num_input_tokens_seen": 861088,
      "step": 15,
      "train_runtime": 283.0649,
      "train_tokens_per_second": 3042.016
    },
    {
      "epoch": 0.21390374331550802,
      "grad_norm": 0.14744223654270172,
      "learning_rate": 4.944204823327408e-05,
      "loss": 0.0841,
      "num_input_tokens_seen": 1154080,
      "step": 20,
      "train_runtime": 378.7884,
      "train_tokens_per_second": 3046.767
    },
    {
      "epoch": 0.26737967914438504,
      "grad_norm": 0.1648288071155548,
      "learning_rate": 4.911172937635942e-05,
      "loss": 0.067,
      "num_input_tokens_seen": 1439584,
      "step": 25,
      "train_runtime": 471.4705,
      "train_tokens_per_second": 3053.391
    },
    {
      "epoch": 0.32085561497326204,
      "grad_norm": 0.18084144592285156,
      "learning_rate": 4.870661809623788e-05,
      "loss": 0.0633,
      "num_input_tokens_seen": 1723424,
      "step": 30,
      "train_runtime": 563.3252,
      "train_tokens_per_second": 3059.377
    },
    {
      "epoch": 0.37433155080213903,
      "grad_norm": 0.1638505458831787,
      "learning_rate": 4.8227971011787196e-05,
      "loss": 0.0508,
      "num_input_tokens_seen": 2010112,
      "step": 35,
      "train_runtime": 656.4365,
      "train_tokens_per_second": 3062.158
    },
    {
      "epoch": 0.42780748663101603,
      "grad_norm": 0.2159387320280075,
      "learning_rate": 4.767727284335852e-05,
      "loss": 0.0452,
      "num_input_tokens_seen": 2293712,
      "step": 40,
      "train_runtime": 748.4606,
      "train_tokens_per_second": 3064.573
    },
    {
      "epoch": 0.48128342245989303,
      "grad_norm": 0.14859342575073242,
      "learning_rate": 4.705623180730705e-05,
      "loss": 0.0401,
      "num_input_tokens_seen": 2579088,
      "step": 45,
      "train_runtime": 841.5866,
      "train_tokens_per_second": 3064.555
    },
    {
      "epoch": 0.5347593582887701,
      "grad_norm": 0.11989740282297134,
      "learning_rate": 4.6366774317257946e-05,
      "loss": 0.0418,
      "num_input_tokens_seen": 2863376,
      "step": 50,
      "train_runtime": 933.8669,
      "train_tokens_per_second": 3066.15
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.13309597969055176,
      "learning_rate": 4.561103900854401e-05,
      "loss": 0.0468,
      "num_input_tokens_seen": 3157536,
      "step": 55,
      "train_runtime": 1030.6792,
      "train_tokens_per_second": 3063.549
    },
    {
      "epoch": 0.6417112299465241,
      "grad_norm": 0.1302647739648819,
      "learning_rate": 4.479137010435053e-05,
      "loss": 0.0421,
      "num_input_tokens_seen": 3438464,
      "step": 60,
      "train_runtime": 1121.7292,
      "train_tokens_per_second": 3065.324
    },
    {
      "epoch": 0.6951871657754011,
      "grad_norm": 0.14104266464710236,
      "learning_rate": 4.391031014414514e-05,
      "loss": 0.04,
      "num_input_tokens_seen": 3726720,
      "step": 65,
      "train_runtime": 1215.6376,
      "train_tokens_per_second": 3065.65
    },
    {
      "epoch": 0.7486631016042781,
      "grad_norm": 0.09389258176088333,
      "learning_rate": 4.2970592096948236e-05,
      "loss": 0.0279,
      "num_input_tokens_seen": 4011152,
      "step": 70,
      "train_runtime": 1308.2677,
      "train_tokens_per_second": 3066.002
    },
    {
      "epoch": 0.8021390374331551,
      "grad_norm": 0.15608885884284973,
      "learning_rate": 4.197513088390813e-05,
      "loss": 0.0322,
      "num_input_tokens_seen": 4294992,
      "step": 75,
      "train_runtime": 1400.2179,
      "train_tokens_per_second": 3067.374
    },
    {
      "epoch": 0.8556149732620321,
      "grad_norm": 0.13360099494457245,
      "learning_rate": 4.092701433647687e-05,
      "loss": 0.034,
      "num_input_tokens_seen": 4579744,
      "step": 80,
      "train_runtime": 1492.6689,
      "train_tokens_per_second": 3068.158
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.12436749041080475,
      "learning_rate": 3.982949361823388e-05,
      "loss": 0.0388,
      "num_input_tokens_seen": 4856256,
      "step": 85,
      "train_runtime": 1581.75,
      "train_tokens_per_second": 3070.179
    },
    {
      "epoch": 0.9625668449197861,
      "grad_norm": 0.12284192442893982,
      "learning_rate": 3.8685973140068e-05,
      "loss": 0.0395,
      "num_input_tokens_seen": 5130032,
      "step": 90,
      "train_runtime": 1669.3625,
      "train_tokens_per_second": 3073.048
    },
    {
      "epoch": 1.0106951871657754,
      "grad_norm": 0.17556537687778473,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 0.03,
      "num_input_tokens_seen": 5392272,
      "step": 95,
      "train_runtime": 1756.1589,
      "train_tokens_per_second": 3070.492
    },
    {
      "epoch": 1.0641711229946524,
      "grad_norm": 0.1371014416217804,
      "learning_rate": 3.6275252980402544e-05,
      "loss": 0.0282,
      "num_input_tokens_seen": 5683744,
      "step": 100,
      "train_runtime": 1851.388,
      "train_tokens_per_second": 3069.991
    },
    {
      "epoch": 1.1176470588235294,
      "grad_norm": 0.18953965604305267,
      "learning_rate": 3.501553113674699e-05,
      "loss": 0.0452,
      "num_input_tokens_seen": 5971952,
      "step": 105,
      "train_runtime": 1946.7884,
      "train_tokens_per_second": 3067.592
    },
    {
      "epoch": 1.1711229946524064,
      "grad_norm": 0.1199231818318367,
      "learning_rate": 3.3724742013273854e-05,
      "loss": 0.0266,
      "num_input_tokens_seen": 6263424,
      "step": 110,
      "train_runtime": 2042.2353,
      "train_tokens_per_second": 3066.945
    },
    {
      "epoch": 1.2245989304812834,
      "grad_norm": 0.08340111374855042,
      "learning_rate": 3.2406889522140856e-05,
      "loss": 0.0329,
      "num_input_tokens_seen": 6551312,
      "step": 115,
      "train_runtime": 2135.5267,
      "train_tokens_per_second": 3067.773
    },
    {
      "epoch": 1.2780748663101604,
      "grad_norm": 0.11855883151292801,
      "learning_rate": 3.1066061523646295e-05,
      "loss": 0.0232,
      "num_input_tokens_seen": 6840256,
      "step": 120,
      "train_runtime": 2229.9866,
      "train_tokens_per_second": 3067.398
    },
    {
      "epoch": 1.3315508021390374,
      "grad_norm": 0.13163559138774872,
      "learning_rate": 2.9706417146052838e-05,
      "loss": 0.0366,
      "num_input_tokens_seen": 7128624,
      "step": 125,
      "train_runtime": 2323.6929,
      "train_tokens_per_second": 3067.8
    },
    {
      "epoch": 1.3850267379679144,
      "grad_norm": 0.14266876876354218,
      "learning_rate": 2.8332173884344477e-05,
      "loss": 0.0301,
      "num_input_tokens_seen": 7418256,
      "step": 130,
      "train_runtime": 2418.4715,
      "train_tokens_per_second": 3067.332
    },
    {
      "epoch": 1.4385026737967914,
      "grad_norm": 0.1019771620631218,
      "learning_rate": 2.6947594517935083e-05,
      "loss": 0.0253,
      "num_input_tokens_seen": 7710672,
      "step": 135,
      "train_runtime": 2514.2479,
      "train_tokens_per_second": 3066.791
    },
    {
      "epoch": 1.4919786096256684,
      "grad_norm": 0.07594949007034302,
      "learning_rate": 2.555697388790885e-05,
      "loss": 0.0291,
      "num_input_tokens_seen": 7985712,
      "step": 140,
      "train_runtime": 2602.9726,
      "train_tokens_per_second": 3067.92
    },
    {
      "epoch": 1.5454545454545454,
      "grad_norm": 0.08458653837442398,
      "learning_rate": 2.4164625574808146e-05,
      "loss": 0.0236,
      "num_input_tokens_seen": 8274128,
      "step": 145,
      "train_runtime": 2697.4446,
      "train_tokens_per_second": 3067.395
    },
    {
      "epoch": 1.5989304812834224,
      "grad_norm": 0.1144990548491478,
      "learning_rate": 2.277486851829338e-05,
      "loss": 0.0288,
      "num_input_tokens_seen": 8561696,
      "step": 150,
      "train_runtime": 2791.3205,
      "train_tokens_per_second": 3067.256
    },
    {
      "epoch": 1.6524064171122994,
      "grad_norm": 0.14420673251152039,
      "learning_rate": 2.1392013620179337e-05,
      "loss": 0.0334,
      "num_input_tokens_seen": 8846736,
      "step": 155,
      "train_runtime": 2883.7816,
      "train_tokens_per_second": 3067.755
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 0.14161178469657898,
      "learning_rate": 2.0020350372404102e-05,
      "loss": 0.0273,
      "num_input_tokens_seen": 9120352,
      "step": 160,
      "train_runtime": 2971.4097,
      "train_tokens_per_second": 3069.369
    },
    {
      "epoch": 1.7593582887700534,
      "grad_norm": 0.15640532970428467,
      "learning_rate": 1.8664133551409612e-05,
      "loss": 0.0219,
      "num_input_tokens_seen": 9398016,
      "step": 165,
      "train_runtime": 3061.1211,
      "train_tokens_per_second": 3070.122
    },
    {
      "epoch": 1.8128342245989306,
      "grad_norm": 0.10912443697452545,
      "learning_rate": 1.7327570020206504e-05,
      "loss": 0.0246,
      "num_input_tokens_seen": 9678592,
      "step": 170,
      "train_runtime": 3151.8613,
      "train_tokens_per_second": 3070.754
    },
    {
      "epoch": 1.8663101604278074,
      "grad_norm": 0.1235104501247406,
      "learning_rate": 1.6014805679062185e-05,
      "loss": 0.0283,
      "num_input_tokens_seen": 9965712,
      "step": 175,
      "train_runtime": 3245.3209,
      "train_tokens_per_second": 3070.794
    },
    {
      "epoch": 1.9197860962566846,
      "grad_norm": 0.11748490482568741,
      "learning_rate": 1.4729912605289767e-05,
      "loss": 0.0196,
      "num_input_tokens_seen": 10244944,
      "step": 180,
      "train_runtime": 3335.3629,
      "train_tokens_per_second": 3071.613
    },
    {
      "epoch": 1.9732620320855614,
      "grad_norm": 0.14662568271160126,
      "learning_rate": 1.34768764220293e-05,
      "loss": 0.0222,
      "num_input_tokens_seen": 10529152,
      "step": 185,
      "train_runtime": 3427.6542,
      "train_tokens_per_second": 3071.824
    },
    {
      "epoch": 2.021390374331551,
      "grad_norm": 0.0968947634100914,
      "learning_rate": 1.2259583935202062e-05,
      "loss": 0.0228,
      "num_input_tokens_seen": 10786048,
      "step": 190,
      "train_runtime": 3511.4724,
      "train_tokens_per_second": 3071.66
    },
    {
      "epoch": 2.0748663101604277,
      "grad_norm": 0.10974116623401642,
      "learning_rate": 1.1081811076986965e-05,
      "loss": 0.0294,
      "num_input_tokens_seen": 11069968,
      "step": 195,
      "train_runtime": 3603.8143,
      "train_tokens_per_second": 3071.736
    },
    {
      "epoch": 2.128342245989305,
      "grad_norm": 0.10370402038097382,
      "learning_rate": 9.94721119321739e-06,
      "loss": 0.0171,
      "num_input_tokens_seen": 11358096,
      "step": 200,
      "train_runtime": 3697.9457,
      "train_tokens_per_second": 3071.461
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 0.09077204018831253,
      "learning_rate": 8.85930371102994e-06,
      "loss": 0.0199,
      "num_input_tokens_seen": 11639152,
      "step": 205,
      "train_runtime": 3789.6744,
      "train_tokens_per_second": 3071.28
    },
    {
      "epoch": 2.235294117647059,
      "grad_norm": 0.11789421737194061,
      "learning_rate": 7.8214632219169e-06,
      "loss": 0.025,
      "num_input_tokens_seen": 11921888,
      "step": 210,
      "train_runtime": 3881.0879,
      "train_tokens_per_second": 3071.79
    },
    {
      "epoch": 2.2887700534759357,
      "grad_norm": 0.09948533028364182,
      "learning_rate": 6.836909014045925e-06,
      "loss": 0.0191,
      "num_input_tokens_seen": 12209424,
      "step": 215,
      "train_runtime": 3975.3631,
      "train_tokens_per_second": 3071.273
    },
    {
      "epoch": 2.342245989304813,
      "grad_norm": 0.11090569198131561,
      "learning_rate": 5.908695086316701e-06,
      "loss": 0.0276,
      "num_input_tokens_seen": 12500800,
      "step": 220,
      "train_runtime": 4070.6577,
      "train_tokens_per_second": 3070.953
    },
    {
      "epoch": 2.3957219251336896,
      "grad_norm": 0.11661939322948456,
      "learning_rate": 5.0397006751301435e-06,
      "loss": 0.0299,
      "num_input_tokens_seen": 12784480,
      "step": 225,
      "train_runtime": 4162.6024,
      "train_tokens_per_second": 3071.271
    },
    {
      "epoch": 2.449197860962567,
      "grad_norm": 0.08428950607776642,
      "learning_rate": 4.23262132325514e-06,
      "loss": 0.023,
      "num_input_tokens_seen": 13070832,
      "step": 230,
      "train_runtime": 4256.1224,
      "train_tokens_per_second": 3071.066
    },
    {
      "epoch": 2.502673796791444,
      "grad_norm": 0.15925215184688568,
      "learning_rate": 3.489960518496521e-06,
      "loss": 0.0224,
      "num_input_tokens_seen": 13358112,
      "step": 235,
      "train_runtime": 4349.7506,
      "train_tokens_per_second": 3071.006
    },
    {
      "epoch": 2.556149732620321,
      "grad_norm": 0.15281225740909576,
      "learning_rate": 2.8140219281002718e-06,
      "loss": 0.0246,
      "num_input_tokens_seen": 13640608,
      "step": 240,
      "train_runtime": 4440.7769,
      "train_tokens_per_second": 3071.672
    },
    {
      "epoch": 2.6096256684491976,
      "grad_norm": 0.12512794137001038,
      "learning_rate": 2.2069022529842664e-06,
      "loss": 0.0249,
      "num_input_tokens_seen": 13919056,
      "step": 245,
      "train_runtime": 4530.1345,
      "train_tokens_per_second": 3072.548
    },
    {
      "epoch": 2.663101604278075,
      "grad_norm": 0.12052249163389206,
      "learning_rate": 1.6704847239599364e-06,
      "loss": 0.0226,
      "num_input_tokens_seen": 14203200,
      "step": 250,
      "train_runtime": 4622.3848,
      "train_tokens_per_second": 3072.7
    },
    {
      "epoch": 2.716577540106952,
      "grad_norm": 0.1099454015493393,
      "learning_rate": 1.2064332601191163e-06,
      "loss": 0.0188,
      "num_input_tokens_seen": 14488480,
      "step": 255,
      "train_runtime": 4715.2781,
      "train_tokens_per_second": 3072.667
    },
    {
      "epoch": 2.770053475935829,
      "grad_norm": 0.10424145311117172,
      "learning_rate": 8.161873075061499e-07,
      "loss": 0.0222,
      "num_input_tokens_seen": 14772448,
      "step": 260,
      "train_runtime": 4807.1927,
      "train_tokens_per_second": 3072.989
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 0.14698970317840576,
      "learning_rate": 5.009573740853313e-07,
      "loss": 0.023,
      "num_input_tokens_seen": 15056768,
      "step": 265,
      "train_runtime": 4899.3582,
      "train_tokens_per_second": 3073.212
    },
    {
      "epoch": 2.877005347593583,
      "grad_norm": 0.15283866226673126,
      "learning_rate": 2.617212748536491e-07,
      "loss": 0.0211,
      "num_input_tokens_seen": 15352912,
      "step": 270,
      "train_runtime": 4996.8488,
      "train_tokens_per_second": 3072.519
    },
    {
      "epoch": 2.93048128342246,
      "grad_norm": 0.13693705201148987,
      "learning_rate": 9.922109874636876e-08,
      "loss": 0.0306,
      "num_input_tokens_seen": 15644144,
      "step": 275,
      "train_runtime": 5092.097,
      "train_tokens_per_second": 3072.24
    },
    {
      "epoch": 2.983957219251337,
      "grad_norm": 0.13499431312084198,
      "learning_rate": 1.3960906743634706e-08,
      "loss": 0.0195,
      "num_input_tokens_seen": 15928032,
      "step": 280,
      "train_runtime": 5183.7733,
      "train_tokens_per_second": 3072.671
    },
    {
      "epoch": 3.0,
      "num_input_tokens_seen": 16009072,
      "step": 282,
      "total_flos": 7.46088811960959e+17,
      "train_loss": 0.038041487124794764,
      "train_runtime": 5210.8723,
      "train_samples_per_second": 0.861,
      "train_steps_per_second": 0.054
    }
  ],
  "logging_steps": 5,
  "max_steps": 282,
  "num_input_tokens_seen": 16009072,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.46088811960959e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}