{
  "best_metric": 0.5393198132514954,
  "best_model_checkpoint": "/media/user/Expansion/flan-t5-small-ner/checkpoint-99955",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 99955,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02501125506477915,
      "grad_norm": 27.66358757019043,
      "learning_rate": 4.974988744935221e-05,
      "loss": 4.6267,
      "num_input_tokens_seen": 1673336,
      "step": 500
    },
    {
      "epoch": 0.0500225101295583,
      "grad_norm": 9.223219871520996,
      "learning_rate": 4.949977489870442e-05,
      "loss": 2.1909,
      "num_input_tokens_seen": 3361736,
      "step": 1000
    },
    {
      "epoch": 0.07503376519433745,
      "grad_norm": 15.481683731079102,
      "learning_rate": 4.924966234805663e-05,
      "loss": 1.8355,
      "num_input_tokens_seen": 5013800,
      "step": 1500
    },
    {
      "epoch": 0.1000450202591166,
      "grad_norm": 14.396512985229492,
      "learning_rate": 4.8999549797408836e-05,
      "loss": 1.5648,
      "num_input_tokens_seen": 6667312,
      "step": 2000
    },
    {
      "epoch": 0.12505627532389577,
      "grad_norm": 7.068989276885986,
      "learning_rate": 4.8749437246761046e-05,
      "loss": 1.4252,
      "num_input_tokens_seen": 8347016,
      "step": 2500
    },
    {
      "epoch": 0.1500675303886749,
      "grad_norm": 11.330971717834473,
      "learning_rate": 4.849932469611325e-05,
      "loss": 1.3972,
      "num_input_tokens_seen": 10008296,
      "step": 3000
    },
    {
      "epoch": 0.17507878545345407,
      "grad_norm": 9.403321266174316,
      "learning_rate": 4.824921214546546e-05,
      "loss": 1.3001,
      "num_input_tokens_seen": 11658808,
      "step": 3500
    },
    {
      "epoch": 0.2000900405182332,
      "grad_norm": 8.147115707397461,
      "learning_rate": 4.799909959481767e-05,
      "loss": 1.2625,
      "num_input_tokens_seen": 13331648,
      "step": 4000
    },
    {
      "epoch": 0.22510129558301237,
      "grad_norm": 13.405184745788574,
      "learning_rate": 4.774898704416988e-05,
      "loss": 1.1839,
      "num_input_tokens_seen": 14982440,
      "step": 4500
    },
    {
      "epoch": 0.25011255064779153,
      "grad_norm": 20.70949363708496,
      "learning_rate": 4.749887449352209e-05,
      "loss": 1.1598,
      "num_input_tokens_seen": 16633632,
      "step": 5000
    },
    {
      "epoch": 0.27512380571257067,
      "grad_norm": 16.94267463684082,
      "learning_rate": 4.72487619428743e-05,
      "loss": 1.1473,
      "num_input_tokens_seen": 18311672,
      "step": 5500
    },
    {
      "epoch": 0.3001350607773498,
      "grad_norm": 8.609989166259766,
      "learning_rate": 4.69986493922265e-05,
      "loss": 1.1098,
      "num_input_tokens_seen": 19980456,
      "step": 6000
    },
    {
      "epoch": 0.32514631584212894,
      "grad_norm": 9.003643989562988,
      "learning_rate": 4.674853684157871e-05,
      "loss": 1.0973,
      "num_input_tokens_seen": 21646328,
      "step": 6500
    },
    {
      "epoch": 0.35015757090690813,
      "grad_norm": 18.364194869995117,
      "learning_rate": 4.649842429093092e-05,
      "loss": 1.0987,
      "num_input_tokens_seen": 23277400,
      "step": 7000
    },
    {
      "epoch": 0.37516882597168727,
      "grad_norm": 13.544733047485352,
      "learning_rate": 4.624831174028313e-05,
      "loss": 1.0642,
      "num_input_tokens_seen": 24915304,
      "step": 7500
    },
    {
      "epoch": 0.4001800810364664,
      "grad_norm": 14.257452011108398,
      "learning_rate": 4.5998199189635336e-05,
      "loss": 1.0414,
      "num_input_tokens_seen": 26590576,
      "step": 8000
    },
    {
      "epoch": 0.42519133610124554,
      "grad_norm": 10.29515266418457,
      "learning_rate": 4.5748086638987546e-05,
      "loss": 1.0634,
      "num_input_tokens_seen": 28236280,
      "step": 8500
    },
    {
      "epoch": 0.45020259116602473,
      "grad_norm": 13.840631484985352,
      "learning_rate": 4.5497974088339756e-05,
      "loss": 0.9817,
      "num_input_tokens_seen": 29891480,
      "step": 9000
    },
    {
      "epoch": 0.47521384623080387,
      "grad_norm": 12.118327140808105,
      "learning_rate": 4.5247861537691966e-05,
      "loss": 1.0122,
      "num_input_tokens_seen": 31551000,
      "step": 9500
    },
    {
      "epoch": 0.5002251012955831,
      "grad_norm": 8.115203857421875,
      "learning_rate": 4.499774898704417e-05,
      "loss": 0.9802,
      "num_input_tokens_seen": 33221384,
      "step": 10000
    },
    {
      "epoch": 0.5252363563603621,
      "grad_norm": 8.905954360961914,
      "learning_rate": 4.474763643639638e-05,
      "loss": 0.9796,
      "num_input_tokens_seen": 34891392,
      "step": 10500
    },
    {
      "epoch": 0.5502476114251413,
      "grad_norm": 10.70656681060791,
      "learning_rate": 4.449752388574859e-05,
      "loss": 1.0031,
      "num_input_tokens_seen": 36518768,
      "step": 11000
    },
    {
      "epoch": 0.5752588664899204,
      "grad_norm": 12.424896240234375,
      "learning_rate": 4.42474113351008e-05,
      "loss": 0.9591,
      "num_input_tokens_seen": 38147456,
      "step": 11500
    },
    {
      "epoch": 0.6002701215546996,
      "grad_norm": 10.77695083618164,
      "learning_rate": 4.399729878445301e-05,
      "loss": 0.9338,
      "num_input_tokens_seen": 39823976,
      "step": 12000
    },
    {
      "epoch": 0.6252813766194788,
      "grad_norm": 12.77743911743164,
      "learning_rate": 4.374718623380521e-05,
      "loss": 0.9112,
      "num_input_tokens_seen": 41493480,
      "step": 12500
    },
    {
      "epoch": 0.6502926316842579,
      "grad_norm": 16.060897827148438,
      "learning_rate": 4.349707368315742e-05,
      "loss": 0.915,
      "num_input_tokens_seen": 43130832,
      "step": 13000
    },
    {
      "epoch": 0.6753038867490371,
      "grad_norm": 17.562183380126953,
      "learning_rate": 4.324696113250963e-05,
      "loss": 0.9096,
      "num_input_tokens_seen": 44779392,
      "step": 13500
    },
    {
      "epoch": 0.7003151418138163,
      "grad_norm": 12.406323432922363,
      "learning_rate": 4.2996848581861835e-05,
      "loss": 0.9499,
      "num_input_tokens_seen": 46433856,
      "step": 14000
    },
    {
      "epoch": 0.7253263968785953,
      "grad_norm": 15.567843437194824,
      "learning_rate": 4.2746736031214045e-05,
      "loss": 0.923,
      "num_input_tokens_seen": 48102016,
      "step": 14500
    },
    {
      "epoch": 0.7503376519433745,
      "grad_norm": 9.45335578918457,
      "learning_rate": 4.2496623480566255e-05,
      "loss": 0.9285,
      "num_input_tokens_seen": 49796432,
      "step": 15000
    },
    {
      "epoch": 0.7753489070081536,
      "grad_norm": 7.158623695373535,
      "learning_rate": 4.2246510929918465e-05,
      "loss": 0.9023,
      "num_input_tokens_seen": 51432848,
      "step": 15500
    },
    {
      "epoch": 0.8003601620729328,
      "grad_norm": 9.542813301086426,
      "learning_rate": 4.1996398379270675e-05,
      "loss": 0.9237,
      "num_input_tokens_seen": 53083496,
      "step": 16000
    },
    {
      "epoch": 0.825371417137712,
      "grad_norm": 10.027923583984375,
      "learning_rate": 4.1746285828622885e-05,
      "loss": 0.8813,
      "num_input_tokens_seen": 54755032,
      "step": 16500
    },
    {
      "epoch": 0.8503826722024911,
      "grad_norm": 18.8748722076416,
      "learning_rate": 4.1496173277975095e-05,
      "loss": 0.9036,
      "num_input_tokens_seen": 56411184,
      "step": 17000
    },
    {
      "epoch": 0.8753939272672703,
      "grad_norm": 12.792276382446289,
      "learning_rate": 4.12460607273273e-05,
      "loss": 0.8589,
      "num_input_tokens_seen": 58070520,
      "step": 17500
    },
    {
      "epoch": 0.9004051823320495,
      "grad_norm": 6.8420491218566895,
      "learning_rate": 4.09959481766795e-05,
      "loss": 0.8855,
      "num_input_tokens_seen": 59745800,
      "step": 18000
    },
    {
      "epoch": 0.9254164373968286,
      "grad_norm": 9.066823959350586,
      "learning_rate": 4.074583562603171e-05,
      "loss": 0.8773,
      "num_input_tokens_seen": 61457288,
      "step": 18500
    },
    {
      "epoch": 0.9504276924616077,
      "grad_norm": 7.002307415008545,
      "learning_rate": 4.049572307538392e-05,
      "loss": 0.8747,
      "num_input_tokens_seen": 63139928,
      "step": 19000
    },
    {
      "epoch": 0.9754389475263868,
      "grad_norm": 14.685755729675293,
      "learning_rate": 4.024561052473613e-05,
      "loss": 0.8398,
      "num_input_tokens_seen": 64811920,
      "step": 19500
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.6227446794509888,
      "eval_runtime": 96.3481,
      "eval_samples_per_second": 414.964,
      "eval_steps_per_second": 51.874,
      "num_input_tokens_seen": 66451084,
      "step": 19991
    },
    {
      "epoch": 1.0004502025911661,
      "grad_norm": 13.560747146606445,
      "learning_rate": 3.999549797408834e-05,
      "loss": 0.852,
      "num_input_tokens_seen": 66482076,
      "step": 20000
    },
    {
      "epoch": 1.025461457655945,
      "grad_norm": 4.446373462677002,
      "learning_rate": 3.974538542344055e-05,
      "loss": 0.7973,
      "num_input_tokens_seen": 68132180,
      "step": 20500
    },
    {
      "epoch": 1.0504727127207243,
      "grad_norm": 3.456674098968506,
      "learning_rate": 3.949527287279276e-05,
      "loss": 0.8215,
      "num_input_tokens_seen": 69804380,
      "step": 21000
    },
    {
      "epoch": 1.0754839677855035,
      "grad_norm": 8.283075332641602,
      "learning_rate": 3.924516032214497e-05,
      "loss": 0.8081,
      "num_input_tokens_seen": 71452668,
      "step": 21500
    },
    {
      "epoch": 1.1004952228502827,
      "grad_norm": 9.358149528503418,
      "learning_rate": 3.8995047771497175e-05,
      "loss": 0.7991,
      "num_input_tokens_seen": 73104948,
      "step": 22000
    },
    {
      "epoch": 1.1255064779150619,
      "grad_norm": 9.011244773864746,
      "learning_rate": 3.8744935220849385e-05,
      "loss": 0.7839,
      "num_input_tokens_seen": 74751164,
      "step": 22500
    },
    {
      "epoch": 1.1505177329798408,
      "grad_norm": 5.775268077850342,
      "learning_rate": 3.849482267020159e-05,
      "loss": 0.7515,
      "num_input_tokens_seen": 76431460,
      "step": 23000
    },
    {
      "epoch": 1.17552898804462,
      "grad_norm": 13.273436546325684,
      "learning_rate": 3.82447101195538e-05,
      "loss": 0.7821,
      "num_input_tokens_seen": 78092124,
      "step": 23500
    },
    {
      "epoch": 1.2005402431093992,
      "grad_norm": 10.351176261901855,
      "learning_rate": 3.799459756890601e-05,
      "loss": 0.772,
      "num_input_tokens_seen": 79736012,
      "step": 24000
    },
    {
      "epoch": 1.2255514981741784,
      "grad_norm": 14.834792137145996,
      "learning_rate": 3.774448501825822e-05,
      "loss": 0.78,
      "num_input_tokens_seen": 81414220,
      "step": 24500
    },
    {
      "epoch": 1.2505627532389576,
      "grad_norm": 14.160717964172363,
      "learning_rate": 3.749437246761043e-05,
      "loss": 0.7767,
      "num_input_tokens_seen": 83081932,
      "step": 25000
    },
    {
      "epoch": 1.2755740083037366,
      "grad_norm": 8.410615921020508,
      "learning_rate": 3.724425991696264e-05,
      "loss": 0.7665,
      "num_input_tokens_seen": 84745948,
      "step": 25500
    },
    {
      "epoch": 1.3005852633685158,
      "grad_norm": 7.881125450134277,
      "learning_rate": 3.699414736631484e-05,
      "loss": 0.7626,
      "num_input_tokens_seen": 86421180,
      "step": 26000
    },
    {
      "epoch": 1.325596518433295,
      "grad_norm": 21.633901596069336,
      "learning_rate": 3.674403481566705e-05,
      "loss": 0.7645,
      "num_input_tokens_seen": 88075204,
      "step": 26500
    },
    {
      "epoch": 1.3506077734980741,
      "grad_norm": 14.725602149963379,
      "learning_rate": 3.649392226501926e-05,
      "loss": 0.751,
      "num_input_tokens_seen": 89740116,
      "step": 27000
    },
    {
      "epoch": 1.3756190285628533,
      "grad_norm": 6.119060039520264,
      "learning_rate": 3.6243809714371465e-05,
      "loss": 0.756,
      "num_input_tokens_seen": 91410556,
      "step": 27500
    },
    {
      "epoch": 1.4006302836276325,
      "grad_norm": 6.520070552825928,
      "learning_rate": 3.5993697163723675e-05,
      "loss": 0.7526,
      "num_input_tokens_seen": 93116396,
      "step": 28000
    },
    {
      "epoch": 1.4256415386924115,
      "grad_norm": 7.963521480560303,
      "learning_rate": 3.5743584613075885e-05,
      "loss": 0.7645,
      "num_input_tokens_seen": 94761716,
      "step": 28500
    },
    {
      "epoch": 1.4506527937571907,
      "grad_norm": 11.38167953491211,
      "learning_rate": 3.5493472062428095e-05,
      "loss": 0.7624,
      "num_input_tokens_seen": 96449700,
      "step": 29000
    },
    {
      "epoch": 1.4756640488219699,
      "grad_norm": 15.715912818908691,
      "learning_rate": 3.5243359511780305e-05,
      "loss": 0.7509,
      "num_input_tokens_seen": 98102252,
      "step": 29500
    },
    {
      "epoch": 1.500675303886749,
      "grad_norm": 7.735713005065918,
      "learning_rate": 3.499324696113251e-05,
      "loss": 0.7738,
      "num_input_tokens_seen": 99780396,
      "step": 30000
    },
    {
      "epoch": 1.525686558951528,
      "grad_norm": 8.079352378845215,
      "learning_rate": 3.474313441048472e-05,
      "loss": 0.7522,
      "num_input_tokens_seen": 101479956,
      "step": 30500
    },
    {
      "epoch": 1.5506978140163072,
      "grad_norm": 8.290655136108398,
      "learning_rate": 3.449302185983693e-05,
      "loss": 0.7381,
      "num_input_tokens_seen": 103149500,
      "step": 31000
    },
    {
      "epoch": 1.5757090690810864,
      "grad_norm": 8.904264450073242,
      "learning_rate": 3.424290930918914e-05,
      "loss": 0.7467,
      "num_input_tokens_seen": 104812996,
      "step": 31500
    },
    {
      "epoch": 1.6007203241458656,
      "grad_norm": 7.439008712768555,
      "learning_rate": 3.399279675854135e-05,
      "loss": 0.7507,
      "num_input_tokens_seen": 106479036,
      "step": 32000
    },
    {
      "epoch": 1.6257315792106448,
      "grad_norm": 7.584664344787598,
      "learning_rate": 3.374268420789355e-05,
      "loss": 0.7168,
      "num_input_tokens_seen": 108141364,
      "step": 32500
    },
    {
      "epoch": 1.650742834275424,
      "grad_norm": 8.953302383422852,
      "learning_rate": 3.349257165724576e-05,
      "loss": 0.7469,
      "num_input_tokens_seen": 109799916,
      "step": 33000
    },
    {
      "epoch": 1.6757540893402032,
      "grad_norm": 10.678362846374512,
      "learning_rate": 3.324245910659797e-05,
      "loss": 0.7468,
      "num_input_tokens_seen": 111436748,
      "step": 33500
    },
    {
      "epoch": 1.7007653444049824,
      "grad_norm": 11.628217697143555,
      "learning_rate": 3.2992346555950175e-05,
      "loss": 0.7358,
      "num_input_tokens_seen": 113068476,
      "step": 34000
    },
    {
      "epoch": 1.7257765994697614,
      "grad_norm": 12.741203308105469,
      "learning_rate": 3.2742234005302385e-05,
      "loss": 0.7402,
      "num_input_tokens_seen": 114748860,
      "step": 34500
    },
    {
      "epoch": 1.7507878545345406,
      "grad_norm": 9.066828727722168,
      "learning_rate": 3.2492121454654595e-05,
      "loss": 0.7728,
      "num_input_tokens_seen": 116441684,
      "step": 35000
    },
    {
      "epoch": 1.7757991095993197,
      "grad_norm": 7.780086517333984,
      "learning_rate": 3.2242008904006805e-05,
      "loss": 0.7424,
      "num_input_tokens_seen": 118093652,
      "step": 35500
    },
    {
      "epoch": 1.8008103646640987,
      "grad_norm": 5.290003299713135,
      "learning_rate": 3.1991896353359015e-05,
      "loss": 0.7121,
      "num_input_tokens_seen": 119756772,
      "step": 36000
    },
    {
      "epoch": 1.825821619728878,
      "grad_norm": 13.356730461120605,
      "learning_rate": 3.1741783802711225e-05,
      "loss": 0.789,
      "num_input_tokens_seen": 121419852,
      "step": 36500
    },
    {
      "epoch": 1.850832874793657,
      "grad_norm": 4.2140727043151855,
      "learning_rate": 3.149167125206343e-05,
      "loss": 0.7501,
      "num_input_tokens_seen": 123080420,
      "step": 37000
    },
    {
      "epoch": 1.8758441298584363,
      "grad_norm": 15.408193588256836,
      "learning_rate": 3.124155870141564e-05,
      "loss": 0.7576,
      "num_input_tokens_seen": 124733724,
      "step": 37500
    },
    {
      "epoch": 1.9008553849232155,
      "grad_norm": 8.88025951385498,
      "learning_rate": 3.099144615076784e-05,
      "loss": 0.7315,
      "num_input_tokens_seen": 126386636,
      "step": 38000
    },
    {
      "epoch": 1.9258666399879947,
      "grad_norm": 15.850674629211426,
      "learning_rate": 3.074133360012005e-05,
      "loss": 0.7289,
      "num_input_tokens_seen": 128054932,
      "step": 38500
    },
    {
      "epoch": 1.9508778950527739,
      "grad_norm": 10.460667610168457,
      "learning_rate": 3.049122104947226e-05,
      "loss": 0.7375,
      "num_input_tokens_seen": 129731780,
      "step": 39000
    },
    {
      "epoch": 1.975889150117553,
      "grad_norm": 4.816532135009766,
      "learning_rate": 3.024110849882447e-05,
      "loss": 0.7203,
      "num_input_tokens_seen": 131377564,
      "step": 39500
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5678554773330688,
      "eval_runtime": 97.2769,
      "eval_samples_per_second": 411.002,
      "eval_steps_per_second": 51.379,
      "num_input_tokens_seen": 132976438,
      "step": 39982
    },
    {
      "epoch": 2.0009004051823323,
      "grad_norm": 8.531465530395508,
      "learning_rate": 2.999099594817668e-05,
      "loss": 0.7337,
      "num_input_tokens_seen": 133038726,
      "step": 40000
    },
    {
      "epoch": 2.025911660247111,
      "grad_norm": 17.74102783203125,
      "learning_rate": 2.974088339752889e-05,
      "loss": 0.6798,
      "num_input_tokens_seen": 134681590,
      "step": 40500
    },
    {
      "epoch": 2.05092291531189,
      "grad_norm": 16.203670501708984,
      "learning_rate": 2.9490770846881098e-05,
      "loss": 0.692,
      "num_input_tokens_seen": 136354910,
      "step": 41000
    },
    {
      "epoch": 2.0759341703766694,
      "grad_norm": 11.238871574401855,
      "learning_rate": 2.9240658296233308e-05,
      "loss": 0.653,
      "num_input_tokens_seen": 138014246,
      "step": 41500
    },
    {
      "epoch": 2.1009454254414486,
      "grad_norm": 8.781373023986816,
      "learning_rate": 2.899054574558551e-05,
      "loss": 0.6742,
      "num_input_tokens_seen": 139676526,
      "step": 42000
    },
    {
      "epoch": 2.1259566805062278,
      "grad_norm": 7.73007869720459,
      "learning_rate": 2.874043319493772e-05,
      "loss": 0.6739,
      "num_input_tokens_seen": 141326846,
      "step": 42500
    },
    {
      "epoch": 2.150967935571007,
      "grad_norm": 6.6758904457092285,
      "learning_rate": 2.849032064428993e-05,
      "loss": 0.6767,
      "num_input_tokens_seen": 142999126,
      "step": 43000
    },
    {
      "epoch": 2.175979190635786,
      "grad_norm": 9.964508056640625,
      "learning_rate": 2.824020809364214e-05,
      "loss": 0.6649,
      "num_input_tokens_seen": 144643454,
      "step": 43500
    },
    {
      "epoch": 2.2009904457005653,
      "grad_norm": 7.9148664474487305,
      "learning_rate": 2.7990095542994348e-05,
      "loss": 0.678,
      "num_input_tokens_seen": 146327686,
      "step": 44000
    },
    {
      "epoch": 2.2260017007653445,
      "grad_norm": 5.838576316833496,
      "learning_rate": 2.7739982992346558e-05,
      "loss": 0.6629,
      "num_input_tokens_seen": 147996750,
      "step": 44500
    },
    {
      "epoch": 2.2510129558301237,
      "grad_norm": 9.018148422241211,
      "learning_rate": 2.7489870441698768e-05,
      "loss": 0.6673,
      "num_input_tokens_seen": 149658382,
      "step": 45000
    },
    {
      "epoch": 2.276024210894903,
      "grad_norm": 5.56981897354126,
      "learning_rate": 2.7239757891050978e-05,
      "loss": 0.658,
      "num_input_tokens_seen": 151279470,
      "step": 45500
    },
    {
      "epoch": 2.3010354659596817,
      "grad_norm": 3.9373059272766113,
      "learning_rate": 2.698964534040318e-05,
      "loss": 0.6747,
      "num_input_tokens_seen": 152950878,
      "step": 46000
    },
    {
      "epoch": 2.326046721024461,
      "grad_norm": 7.596631050109863,
      "learning_rate": 2.6739532789755388e-05,
      "loss": 0.6824,
      "num_input_tokens_seen": 154603110,
      "step": 46500
    },
    {
      "epoch": 2.35105797608924,
      "grad_norm": 7.714618682861328,
      "learning_rate": 2.6489420239107598e-05,
      "loss": 0.6662,
      "num_input_tokens_seen": 156262254,
      "step": 47000
    },
    {
      "epoch": 2.3760692311540192,
      "grad_norm": 11.400321006774902,
      "learning_rate": 2.6239307688459808e-05,
      "loss": 0.6478,
      "num_input_tokens_seen": 157940526,
      "step": 47500
    },
    {
      "epoch": 2.4010804862187984,
      "grad_norm": 5.944780349731445,
      "learning_rate": 2.5989195137812018e-05,
      "loss": 0.6701,
      "num_input_tokens_seen": 159597926,
      "step": 48000
    },
    {
      "epoch": 2.4260917412835776,
      "grad_norm": 7.971735954284668,
      "learning_rate": 2.5739082587164225e-05,
      "loss": 0.6815,
      "num_input_tokens_seen": 161249054,
      "step": 48500
    },
    {
      "epoch": 2.451102996348357,
      "grad_norm": 8.019645690917969,
      "learning_rate": 2.5488970036516435e-05,
      "loss": 0.6823,
      "num_input_tokens_seen": 162937710,
      "step": 49000
    },
    {
      "epoch": 2.476114251413136,
      "grad_norm": 14.52238655090332,
      "learning_rate": 2.5238857485868645e-05,
      "loss": 0.6662,
      "num_input_tokens_seen": 164579550,
      "step": 49500
    },
    {
      "epoch": 2.501125506477915,
      "grad_norm": 8.065009117126465,
      "learning_rate": 2.498874493522085e-05,
      "loss": 0.6855,
      "num_input_tokens_seen": 166259486,
      "step": 50000
    },
    {
      "epoch": 2.526136761542694,
      "grad_norm": 3.0121171474456787,
      "learning_rate": 2.473863238457306e-05,
      "loss": 0.6597,
      "num_input_tokens_seen": 167925014,
      "step": 50500
    },
    {
      "epoch": 2.551148016607473,
      "grad_norm": 9.93840217590332,
      "learning_rate": 2.4488519833925268e-05,
      "loss": 0.6672,
      "num_input_tokens_seen": 169584230,
      "step": 51000
    },
    {
      "epoch": 2.5761592716722523,
      "grad_norm": 7.8001627922058105,
      "learning_rate": 2.4238407283277475e-05,
      "loss": 0.6419,
      "num_input_tokens_seen": 171205846,
      "step": 51500
    },
    {
      "epoch": 2.6011705267370315,
      "grad_norm": 5.621837139129639,
      "learning_rate": 2.3988294732629685e-05,
      "loss": 0.6679,
      "num_input_tokens_seen": 172867766,
      "step": 52000
    },
    {
      "epoch": 2.6261817818018107,
      "grad_norm": 18.287431716918945,
      "learning_rate": 2.3738182181981895e-05,
      "loss": 0.6601,
      "num_input_tokens_seen": 174508502,
      "step": 52500
    },
    {
      "epoch": 2.65119303686659,
      "grad_norm": 7.687650203704834,
      "learning_rate": 2.34880696313341e-05,
      "loss": 0.6722,
      "num_input_tokens_seen": 176179174,
      "step": 53000
    },
    {
      "epoch": 2.676204291931369,
      "grad_norm": 9.807682037353516,
      "learning_rate": 2.3237957080686308e-05,
      "loss": 0.666,
      "num_input_tokens_seen": 177874198,
      "step": 53500
    },
    {
      "epoch": 2.7012155469961483,
      "grad_norm": 9.2701416015625,
      "learning_rate": 2.2987844530038518e-05,
      "loss": 0.6811,
      "num_input_tokens_seen": 179531678,
      "step": 54000
    },
    {
      "epoch": 2.7262268020609275,
      "grad_norm": 8.37064266204834,
      "learning_rate": 2.2737731979390728e-05,
      "loss": 0.6505,
      "num_input_tokens_seen": 181197542,
      "step": 54500
    },
    {
      "epoch": 2.7512380571257067,
      "grad_norm": 5.556591033935547,
      "learning_rate": 2.2487619428742935e-05,
      "loss": 0.6711,
      "num_input_tokens_seen": 182849270,
      "step": 55000
    },
    {
      "epoch": 2.776249312190486,
      "grad_norm": 7.93866491317749,
      "learning_rate": 2.2237506878095145e-05,
      "loss": 0.6664,
      "num_input_tokens_seen": 184520526,
      "step": 55500
    },
    {
      "epoch": 2.801260567255265,
      "grad_norm": 6.768641471862793,
      "learning_rate": 2.198739432744735e-05,
      "loss": 0.6699,
      "num_input_tokens_seen": 186239974,
      "step": 56000
    },
    {
      "epoch": 2.8262718223200443,
      "grad_norm": 5.911066055297852,
      "learning_rate": 2.173728177679956e-05,
      "loss": 0.6649,
      "num_input_tokens_seen": 187875982,
      "step": 56500
    },
    {
      "epoch": 2.851283077384823,
      "grad_norm": 9.964897155761719,
      "learning_rate": 2.1487169226151768e-05,
      "loss": 0.6874,
      "num_input_tokens_seen": 189505118,
      "step": 57000
    },
    {
      "epoch": 2.876294332449602,
      "grad_norm": 8.109452247619629,
      "learning_rate": 2.1237056675503978e-05,
      "loss": 0.6762,
      "num_input_tokens_seen": 191184886,
      "step": 57500
    },
    {
      "epoch": 2.9013055875143814,
      "grad_norm": 8.556594848632812,
      "learning_rate": 2.0986944124856188e-05,
      "loss": 0.6491,
      "num_input_tokens_seen": 192859070,
      "step": 58000
    },
    {
      "epoch": 2.9263168425791606,
      "grad_norm": 5.430099010467529,
      "learning_rate": 2.0736831574208394e-05,
      "loss": 0.661,
      "num_input_tokens_seen": 194533102,
      "step": 58500
    },
    {
      "epoch": 2.9513280976439398,
      "grad_norm": 9.806259155273438,
      "learning_rate": 2.04867190235606e-05,
      "loss": 0.645,
      "num_input_tokens_seen": 196171870,
      "step": 59000
    },
    {
      "epoch": 2.976339352708719,
      "grad_norm": 8.950848579406738,
      "learning_rate": 2.023660647291281e-05,
      "loss": 0.6479,
      "num_input_tokens_seen": 197877830,
      "step": 59500
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.560497522354126,
      "eval_runtime": 98.7193,
      "eval_samples_per_second": 404.997,
      "eval_steps_per_second": 50.628,
      "num_input_tokens_seen": 199402582,
      "step": 59973
    },
    {
      "epoch": 3.001350607773498,
      "grad_norm": 6.855441093444824,
      "learning_rate": 1.998649392226502e-05,
      "loss": 0.6187,
      "num_input_tokens_seen": 199490934,
      "step": 60000
    },
    {
      "epoch": 3.0263618628382774,
      "grad_norm": 8.57907772064209,
      "learning_rate": 1.973638137161723e-05,
      "loss": 0.652,
      "num_input_tokens_seen": 201164870,
      "step": 60500
    },
    {
      "epoch": 3.0513731179030565,
      "grad_norm": 15.578742027282715,
      "learning_rate": 1.9486268820969438e-05,
      "loss": 0.6127,
      "num_input_tokens_seen": 202824222,
      "step": 61000
    },
    {
      "epoch": 3.0763843729678357,
      "grad_norm": 9.083669662475586,
      "learning_rate": 1.9236156270321644e-05,
      "loss": 0.6146,
      "num_input_tokens_seen": 204495334,
      "step": 61500
    },
    {
      "epoch": 3.1013956280326145,
      "grad_norm": 10.12027359008789,
      "learning_rate": 1.8986043719673854e-05,
      "loss": 0.6341,
      "num_input_tokens_seen": 206136214,
      "step": 62000
    },
    {
      "epoch": 3.1264068830973937,
      "grad_norm": 10.482580184936523,
      "learning_rate": 1.8735931169026064e-05,
      "loss": 0.603,
      "num_input_tokens_seen": 207809294,
      "step": 62500
    },
    {
      "epoch": 3.151418138162173,
      "grad_norm": 7.722796440124512,
      "learning_rate": 1.848581861837827e-05,
      "loss": 0.6184,
      "num_input_tokens_seen": 209485534,
      "step": 63000
    },
    {
      "epoch": 3.176429393226952,
      "grad_norm": 7.449066162109375,
      "learning_rate": 1.8235706067730478e-05,
      "loss": 0.621,
      "num_input_tokens_seen": 211143158,
      "step": 63500
    },
    {
      "epoch": 3.2014406482917313,
      "grad_norm": 8.766199111938477,
      "learning_rate": 1.7985593517082688e-05,
      "loss": 0.6165,
      "num_input_tokens_seen": 212777414,
      "step": 64000
    },
    {
      "epoch": 3.2264519033565104,
      "grad_norm": 4.193557262420654,
      "learning_rate": 1.7735480966434898e-05,
      "loss": 0.6188,
      "num_input_tokens_seen": 214420798,
      "step": 64500
    },
    {
      "epoch": 3.2514631584212896,
      "grad_norm": 6.699706554412842,
      "learning_rate": 1.7485368415787104e-05,
      "loss": 0.6095,
      "num_input_tokens_seen": 216073590,
      "step": 65000
    },
    {
      "epoch": 3.276474413486069,
      "grad_norm": 8.79476547241211,
      "learning_rate": 1.7235255865139314e-05,
      "loss": 0.6208,
      "num_input_tokens_seen": 217746214,
      "step": 65500
    },
    {
      "epoch": 3.301485668550848,
      "grad_norm": 6.685282230377197,
      "learning_rate": 1.698514331449152e-05,
      "loss": 0.6058,
      "num_input_tokens_seen": 219433446,
      "step": 66000
    },
    {
      "epoch": 3.326496923615627,
      "grad_norm": 10.743680953979492,
      "learning_rate": 1.673503076384373e-05,
      "loss": 0.6318,
      "num_input_tokens_seen": 221089694,
      "step": 66500
    },
    {
      "epoch": 3.3515081786804064,
      "grad_norm": 8.36410903930664,
      "learning_rate": 1.6484918213195938e-05,
      "loss": 0.6236,
      "num_input_tokens_seen": 222760502,
      "step": 67000
    },
    {
      "epoch": 3.376519433745185,
      "grad_norm": 7.1238274574279785,
      "learning_rate": 1.6234805662548148e-05,
      "loss": 0.6103,
      "num_input_tokens_seen": 224417582,
      "step": 67500
    },
    {
      "epoch": 3.4015306888099643,
      "grad_norm": 7.042121887207031,
      "learning_rate": 1.5984693111900358e-05,
      "loss": 0.6157,
      "num_input_tokens_seen": 226068982,
      "step": 68000
    },
    {
      "epoch": 3.4265419438747435,
      "grad_norm": 9.31881332397461,
      "learning_rate": 1.5734580561252564e-05,
      "loss": 0.6263,
      "num_input_tokens_seen": 227701038,
      "step": 68500
    },
    {
      "epoch": 3.4515531989395227,
      "grad_norm": 7.049442768096924,
      "learning_rate": 1.548446801060477e-05,
      "loss": 0.6237,
      "num_input_tokens_seen": 229359710,
      "step": 69000
    },
    {
      "epoch": 3.476564454004302,
      "grad_norm": 7.746445178985596,
      "learning_rate": 1.5234355459956981e-05,
      "loss": 0.6376,
      "num_input_tokens_seen": 231028950,
      "step": 69500
    },
    {
      "epoch": 3.501575709069081,
      "grad_norm": 4.588512420654297,
      "learning_rate": 1.4984242909309191e-05,
      "loss": 0.6189,
      "num_input_tokens_seen": 232663446,
      "step": 70000
    },
    {
      "epoch": 3.5265869641338603,
      "grad_norm": 9.873016357421875,
      "learning_rate": 1.47341303586614e-05,
      "loss": 0.5935,
      "num_input_tokens_seen": 234333558,
      "step": 70500
    },
    {
      "epoch": 3.5515982191986395,
      "grad_norm": 8.153191566467285,
      "learning_rate": 1.4484017808013606e-05,
      "loss": 0.6403,
      "num_input_tokens_seen": 236006758,
      "step": 71000
    },
    {
      "epoch": 3.5766094742634187,
      "grad_norm": 5.909561634063721,
      "learning_rate": 1.4233905257365814e-05,
      "loss": 0.6152,
      "num_input_tokens_seen": 237655630,
      "step": 71500
    },
    {
      "epoch": 3.6016207293281974,
      "grad_norm": 9.481532096862793,
      "learning_rate": 1.3983792706718024e-05,
      "loss": 0.5916,
      "num_input_tokens_seen": 239300238,
      "step": 72000
    },
    {
      "epoch": 3.6266319843929766,
      "grad_norm": 4.988440990447998,
      "learning_rate": 1.3733680156070232e-05,
      "loss": 0.6275,
      "num_input_tokens_seen": 240971214,
      "step": 72500
    },
    {
      "epoch": 3.651643239457756,
      "grad_norm": 6.159299850463867,
      "learning_rate": 1.3483567605422439e-05,
      "loss": 0.6101,
      "num_input_tokens_seen": 242634286,
      "step": 73000
    },
    {
      "epoch": 3.676654494522535,
      "grad_norm": 4.264859199523926,
      "learning_rate": 1.3233455054774649e-05,
      "loss": 0.6045,
      "num_input_tokens_seen": 244293870,
      "step": 73500
    },
    {
      "epoch": 3.701665749587314,
      "grad_norm": 5.82095193862915,
      "learning_rate": 1.2983342504126857e-05,
      "loss": 0.624,
      "num_input_tokens_seen": 245956374,
      "step": 74000
    },
    {
      "epoch": 3.7266770046520934,
      "grad_norm": 10.4242525100708,
      "learning_rate": 1.2733229953479067e-05,
      "loss": 0.6231,
      "num_input_tokens_seen": 247566166,
      "step": 74500
    },
    {
      "epoch": 3.7516882597168726,
      "grad_norm": 6.536423206329346,
      "learning_rate": 1.2483117402831276e-05,
      "loss": 0.6159,
      "num_input_tokens_seen": 249233118,
      "step": 75000
    },
    {
      "epoch": 3.776699514781652,
      "grad_norm": 10.467476844787598,
      "learning_rate": 1.2233004852183482e-05,
      "loss": 0.6252,
      "num_input_tokens_seen": 250919822,
      "step": 75500
    },
    {
      "epoch": 3.801710769846431,
      "grad_norm": 13.297423362731934,
      "learning_rate": 1.1982892301535692e-05,
      "loss": 0.6133,
      "num_input_tokens_seen": 252600838,
      "step": 76000
    },
    {
      "epoch": 3.82672202491121,
      "grad_norm": 6.729821681976318,
      "learning_rate": 1.1732779750887899e-05,
      "loss": 0.6201,
      "num_input_tokens_seen": 254292558,
      "step": 76500
    },
    {
      "epoch": 3.8517332799759894,
      "grad_norm": 5.975412845611572,
      "learning_rate": 1.1482667200240109e-05,
      "loss": 0.5976,
      "num_input_tokens_seen": 255961510,
      "step": 77000
    },
    {
      "epoch": 3.8767445350407685,
      "grad_norm": 16.30948257446289,
      "learning_rate": 1.1232554649592317e-05,
      "loss": 0.6023,
      "num_input_tokens_seen": 257630246,
      "step": 77500
    },
    {
      "epoch": 3.9017557901055477,
      "grad_norm": 7.327265739440918,
      "learning_rate": 1.0982442098944526e-05,
      "loss": 0.6145,
      "num_input_tokens_seen": 259305118,
      "step": 78000
    },
    {
      "epoch": 3.9267670451703265,
      "grad_norm": 12.45727825164795,
      "learning_rate": 1.0732329548296734e-05,
      "loss": 0.6311,
      "num_input_tokens_seen": 260978934,
      "step": 78500
    },
    {
      "epoch": 3.9517783002351057,
      "grad_norm": 10.317325592041016,
      "learning_rate": 1.0482216997648942e-05,
      "loss": 0.6346,
      "num_input_tokens_seen": 262670814,
      "step": 79000
    },
    {
      "epoch": 3.976789555299885,
      "grad_norm": 7.8411149978637695,
      "learning_rate": 1.023210444700115e-05,
      "loss": 0.6023,
      "num_input_tokens_seen": 264314614,
      "step": 79500
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.54269939661026,
      "eval_runtime": 96.9182,
      "eval_samples_per_second": 412.523,
      "eval_steps_per_second": 51.569,
      "num_input_tokens_seen": 265875340,
      "step": 79964
    },
    {
      "epoch": 4.0018008103646645,
      "grad_norm": 6.620047569274902,
      "learning_rate": 9.98199189635336e-06,
      "loss": 0.6268,
      "num_input_tokens_seen": 266010940,
      "step": 80000
    },
    {
      "epoch": 4.026812065429443,
      "grad_norm": 10.007366180419922,
      "learning_rate": 9.731879345705567e-06,
      "loss": 0.5924,
      "num_input_tokens_seen": 267660364,
      "step": 80500
    },
    {
      "epoch": 4.051823320494222,
      "grad_norm": 6.680395603179932,
      "learning_rate": 9.481766795057777e-06,
      "loss": 0.5786,
      "num_input_tokens_seen": 269338492,
      "step": 81000
    },
    {
      "epoch": 4.076834575559001,
      "grad_norm": 4.809377670288086,
      "learning_rate": 9.231654244409984e-06,
      "loss": 0.5942,
      "num_input_tokens_seen": 271024236,
      "step": 81500
    },
    {
      "epoch": 4.10184583062378,
      "grad_norm": 8.463695526123047,
      "learning_rate": 8.981541693762194e-06,
      "loss": 0.5796,
      "num_input_tokens_seen": 272672620,
      "step": 82000
    },
    {
      "epoch": 4.12685708568856,
      "grad_norm": 10.12741470336914,
      "learning_rate": 8.731429143114402e-06,
      "loss": 0.5879,
      "num_input_tokens_seen": 274353676,
      "step": 82500
    },
    {
      "epoch": 4.151868340753339,
      "grad_norm": 15.428593635559082,
      "learning_rate": 8.48131659246661e-06,
      "loss": 0.5977,
      "num_input_tokens_seen": 275998164,
      "step": 83000
    },
    {
      "epoch": 4.176879595818118,
      "grad_norm": 10.350814819335938,
      "learning_rate": 8.231204041818819e-06,
      "loss": 0.566,
      "num_input_tokens_seen": 277685356,
      "step": 83500
    },
    {
      "epoch": 4.201890850882897,
      "grad_norm": 11.962939262390137,
      "learning_rate": 7.981091491171027e-06,
      "loss": 0.5671,
      "num_input_tokens_seen": 279358548,
      "step": 84000
    },
    {
      "epoch": 4.226902105947676,
      "grad_norm": 10.32712459564209,
      "learning_rate": 7.730978940523236e-06,
      "loss": 0.5785,
      "num_input_tokens_seen": 280991044,
      "step": 84500
    },
    {
      "epoch": 4.2519133610124555,
      "grad_norm": 5.896986484527588,
      "learning_rate": 7.480866389875445e-06,
      "loss": 0.6051,
      "num_input_tokens_seen": 282646764,
      "step": 85000
    },
    {
      "epoch": 4.276924616077235,
      "grad_norm": 7.187685966491699,
      "learning_rate": 7.230753839227652e-06,
      "loss": 0.5943,
      "num_input_tokens_seen": 284342508,
      "step": 85500
    },
    {
      "epoch": 4.301935871142014,
      "grad_norm": 6.680044174194336,
      "learning_rate": 6.980641288579861e-06,
      "loss": 0.5765,
      "num_input_tokens_seen": 286036340,
      "step": 86000
    },
    {
      "epoch": 4.326947126206793,
      "grad_norm": 4.963362693786621,
      "learning_rate": 6.73052873793207e-06,
      "loss": 0.6137,
      "num_input_tokens_seen": 287681564,
      "step": 86500
    },
    {
      "epoch": 4.351958381271572,
      "grad_norm": 12.112903594970703,
      "learning_rate": 6.480416187284279e-06,
      "loss": 0.5983,
      "num_input_tokens_seen": 289353828,
      "step": 87000
    },
    {
      "epoch": 4.3769696363363515,
      "grad_norm": 5.938944339752197,
      "learning_rate": 6.230303636636486e-06,
      "loss": 0.6017,
      "num_input_tokens_seen": 291011668,
      "step": 87500
    },
    {
      "epoch": 4.401980891401131,
      "grad_norm": 4.485511302947998,
      "learning_rate": 5.980191085988695e-06,
      "loss": 0.5898,
      "num_input_tokens_seen": 292675092,
      "step": 88000
    },
    {
      "epoch": 4.42699214646591,
      "grad_norm": 9.15986442565918,
      "learning_rate": 5.730078535340903e-06,
      "loss": 0.5744,
      "num_input_tokens_seen": 294338212,
      "step": 88500
    },
    {
      "epoch": 4.452003401530689,
      "grad_norm": 3.6591997146606445,
      "learning_rate": 5.479965984693112e-06,
      "loss": 0.5948,
      "num_input_tokens_seen": 296004820,
      "step": 89000
    },
    {
      "epoch": 4.477014656595468,
      "grad_norm": 9.19853401184082,
      "learning_rate": 5.2298534340453205e-06,
      "loss": 0.5838,
      "num_input_tokens_seen": 297661964,
      "step": 89500
    },
    {
      "epoch": 4.5020259116602475,
      "grad_norm": 13.491796493530273,
      "learning_rate": 4.979740883397529e-06,
      "loss": 0.5726,
      "num_input_tokens_seen": 299312356,
      "step": 90000
    },
    {
      "epoch": 4.527037166725027,
      "grad_norm": 6.374147415161133,
      "learning_rate": 4.729628332749737e-06,
      "loss": 0.5728,
      "num_input_tokens_seen": 300978468,
      "step": 90500
    },
    {
      "epoch": 4.552048421789806,
      "grad_norm": 7.507421970367432,
      "learning_rate": 4.479515782101945e-06,
      "loss": 0.5903,
      "num_input_tokens_seen": 302639252,
      "step": 91000
    },
    {
      "epoch": 4.577059676854585,
      "grad_norm": 12.31728744506836,
      "learning_rate": 4.229403231454155e-06,
      "loss": 0.5916,
      "num_input_tokens_seen": 304289124,
      "step": 91500
    },
    {
      "epoch": 4.602070931919363,
      "grad_norm": 11.238248825073242,
      "learning_rate": 3.979290680806363e-06,
      "loss": 0.5617,
      "num_input_tokens_seen": 305968436,
      "step": 92000
    },
    {
      "epoch": 4.6270821869841425,
      "grad_norm": 6.74647331237793,
      "learning_rate": 3.7291781301585712e-06,
      "loss": 0.6249,
      "num_input_tokens_seen": 307616156,
      "step": 92500
    },
    {
      "epoch": 4.652093442048922,
      "grad_norm": 7.845546722412109,
      "learning_rate": 3.4790655795107795e-06,
      "loss": 0.6015,
      "num_input_tokens_seen": 309294188,
      "step": 93000
    },
    {
      "epoch": 4.677104697113701,
      "grad_norm": 5.631568431854248,
      "learning_rate": 3.2289530288629883e-06,
      "loss": 0.5747,
      "num_input_tokens_seen": 310930388,
      "step": 93500
    },
    {
      "epoch": 4.70211595217848,
      "grad_norm": 4.305506229400635,
      "learning_rate": 2.978840478215197e-06,
      "loss": 0.5957,
      "num_input_tokens_seen": 312600876,
      "step": 94000
    },
    {
      "epoch": 4.727127207243259,
      "grad_norm": 12.092133522033691,
      "learning_rate": 2.7287279275674053e-06,
      "loss": 0.5952,
      "num_input_tokens_seen": 314275796,
      "step": 94500
    },
    {
      "epoch": 4.7521384623080385,
      "grad_norm": 7.043518543243408,
      "learning_rate": 2.478615376919614e-06,
      "loss": 0.6013,
      "num_input_tokens_seen": 315945468,
      "step": 95000
    },
    {
      "epoch": 4.777149717372818,
      "grad_norm": 6.208098888397217,
      "learning_rate": 2.2285028262718224e-06,
      "loss": 0.591,
      "num_input_tokens_seen": 317595388,
      "step": 95500
    },
    {
      "epoch": 4.802160972437597,
      "grad_norm": 3.588547706604004,
      "learning_rate": 1.978390275624031e-06,
      "loss": 0.5846,
      "num_input_tokens_seen": 319229212,
      "step": 96000
    },
    {
      "epoch": 4.827172227502376,
      "grad_norm": 10.502739906311035,
      "learning_rate": 1.7282777249762395e-06,
      "loss": 0.5904,
      "num_input_tokens_seen": 320908604,
      "step": 96500
    },
    {
      "epoch": 4.852183482567155,
      "grad_norm": 8.170723915100098,
      "learning_rate": 1.4781651743284478e-06,
      "loss": 0.5925,
      "num_input_tokens_seen": 322558268,
      "step": 97000
    },
    {
      "epoch": 4.8771947376319345,
      "grad_norm": 10.083109855651855,
      "learning_rate": 1.2280526236806563e-06,
      "loss": 0.5977,
      "num_input_tokens_seen": 324205708,
      "step": 97500
    },
    {
      "epoch": 4.902205992696714,
      "grad_norm": 6.591386795043945,
      "learning_rate": 9.779400730328649e-07,
      "loss": 0.5633,
      "num_input_tokens_seen": 325850036,
      "step": 98000
    },
    {
      "epoch": 4.927217247761493,
      "grad_norm": 7.133991241455078,
      "learning_rate": 7.278275223850733e-07,
      "loss": 0.5786,
      "num_input_tokens_seen": 327509276,
      "step": 98500
    },
    {
      "epoch": 4.952228502826272,
      "grad_norm": 5.090227127075195,
      "learning_rate": 4.777149717372818e-07,
      "loss": 0.5886,
      "num_input_tokens_seen": 329175052,
      "step": 99000
    },
    {
      "epoch": 4.977239757891051,
      "grad_norm": 7.157599925994873,
      "learning_rate": 2.276024210894903e-07,
      "loss": 0.5879,
      "num_input_tokens_seen": 330819060,
      "step": 99500
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.5393198132514954,
      "eval_runtime": 97.8527,
      "eval_samples_per_second": 408.584,
      "eval_steps_per_second": 51.077,
      "num_input_tokens_seen": 332318598,
      "step": 99955
    },
    {
      "epoch": 5.0,
      "num_input_tokens_seen": 332318598,
      "step": 99955,
      "total_flos": 1.2062750373789696e+17,
      "train_loss": 0.7601320325591829,
      "train_runtime": 7988.2275,
      "train_samples_per_second": 100.099,
      "train_steps_per_second": 12.513,
      "train_tokens_per_second": 41593.134
    }
  ],
  "logging_steps": 500,
  "max_steps": 99955,
  "num_input_tokens_seen": 332318598,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2062750373789696e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}