| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9287925696594427, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.030959752321981424, | |
| "grad_norm": 9.773098945617676, | |
| "learning_rate": 2.991640866873065e-05, | |
| "loss": 9.245, | |
| "mean_token_accuracy": 0.2123243510723114, | |
| "num_tokens": 5327.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06191950464396285, | |
| "grad_norm": 3.128091335296631, | |
| "learning_rate": 2.9823529411764707e-05, | |
| "loss": 6.1602, | |
| "mean_token_accuracy": 0.25044268518686297, | |
| "num_tokens": 10795.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.09287925696594428, | |
| "grad_norm": 4.203906536102295, | |
| "learning_rate": 2.973065015479876e-05, | |
| "loss": 5.6948, | |
| "mean_token_accuracy": 0.262071692943573, | |
| "num_tokens": 16240.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1238390092879257, | |
| "grad_norm": 4.233511924743652, | |
| "learning_rate": 2.9637770897832817e-05, | |
| "loss": 5.2733, | |
| "mean_token_accuracy": 0.27527774721384046, | |
| "num_tokens": 21582.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.15479876160990713, | |
| "grad_norm": 7.080459117889404, | |
| "learning_rate": 2.9544891640866874e-05, | |
| "loss": 4.9304, | |
| "mean_token_accuracy": 0.2877007365226746, | |
| "num_tokens": 27142.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.18575851393188855, | |
| "grad_norm": 7.273204326629639, | |
| "learning_rate": 2.945201238390093e-05, | |
| "loss": 4.689, | |
| "mean_token_accuracy": 0.28890604972839357, | |
| "num_tokens": 32801.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.21671826625386997, | |
| "grad_norm": 2.2185206413269043, | |
| "learning_rate": 2.9359133126934984e-05, | |
| "loss": 4.3965, | |
| "mean_token_accuracy": 0.28117197155952456, | |
| "num_tokens": 38472.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2476780185758514, | |
| "grad_norm": 2.0464794635772705, | |
| "learning_rate": 2.926625386996904e-05, | |
| "loss": 4.062, | |
| "mean_token_accuracy": 0.299494668841362, | |
| "num_tokens": 43743.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.2786377708978328, | |
| "grad_norm": 1.633155345916748, | |
| "learning_rate": 2.9173374613003097e-05, | |
| "loss": 4.0378, | |
| "mean_token_accuracy": 0.30819864571094513, | |
| "num_tokens": 49087.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.30959752321981426, | |
| "grad_norm": 1.4594247341156006, | |
| "learning_rate": 2.908049535603715e-05, | |
| "loss": 3.8513, | |
| "mean_token_accuracy": 0.3258361428976059, | |
| "num_tokens": 54433.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.34055727554179566, | |
| "grad_norm": 1.5312635898590088, | |
| "learning_rate": 2.898761609907121e-05, | |
| "loss": 3.9162, | |
| "mean_token_accuracy": 0.32140363454818727, | |
| "num_tokens": 59629.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3715170278637771, | |
| "grad_norm": 1.3190491199493408, | |
| "learning_rate": 2.8894736842105263e-05, | |
| "loss": 3.902, | |
| "mean_token_accuracy": 0.3103078156709671, | |
| "num_tokens": 65326.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4024767801857585, | |
| "grad_norm": 1.6095689535140991, | |
| "learning_rate": 2.880185758513932e-05, | |
| "loss": 3.7107, | |
| "mean_token_accuracy": 0.3353793561458588, | |
| "num_tokens": 70440.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.43343653250773995, | |
| "grad_norm": 1.6634972095489502, | |
| "learning_rate": 2.8708978328173377e-05, | |
| "loss": 3.7747, | |
| "mean_token_accuracy": 0.3298566401004791, | |
| "num_tokens": 75712.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.46439628482972134, | |
| "grad_norm": 1.3906605243682861, | |
| "learning_rate": 2.861609907120743e-05, | |
| "loss": 3.7344, | |
| "mean_token_accuracy": 0.34043932259082793, | |
| "num_tokens": 81272.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4953560371517028, | |
| "grad_norm": 1.6273926496505737, | |
| "learning_rate": 2.8523219814241487e-05, | |
| "loss": 3.6722, | |
| "mean_token_accuracy": 0.33802524507045745, | |
| "num_tokens": 86836.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 1.595566987991333, | |
| "learning_rate": 2.8430340557275543e-05, | |
| "loss": 3.5486, | |
| "mean_token_accuracy": 0.36929037272930143, | |
| "num_tokens": 91622.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5572755417956656, | |
| "grad_norm": 1.9571454524993896, | |
| "learning_rate": 2.83374613003096e-05, | |
| "loss": 3.6849, | |
| "mean_token_accuracy": 0.3387055486440659, | |
| "num_tokens": 97019.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 1.6203333139419556, | |
| "learning_rate": 2.8244582043343653e-05, | |
| "loss": 3.5592, | |
| "mean_token_accuracy": 0.36260710954666137, | |
| "num_tokens": 102273.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6191950464396285, | |
| "grad_norm": 1.8625439405441284, | |
| "learning_rate": 2.815170278637771e-05, | |
| "loss": 3.4542, | |
| "mean_token_accuracy": 0.3554231733083725, | |
| "num_tokens": 107847.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6501547987616099, | |
| "grad_norm": 1.5171610116958618, | |
| "learning_rate": 2.8058823529411766e-05, | |
| "loss": 3.6914, | |
| "mean_token_accuracy": 0.3506886214017868, | |
| "num_tokens": 113499.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6811145510835913, | |
| "grad_norm": 1.465408205986023, | |
| "learning_rate": 2.796594427244582e-05, | |
| "loss": 3.6008, | |
| "mean_token_accuracy": 0.3558589071035385, | |
| "num_tokens": 119014.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7120743034055728, | |
| "grad_norm": 1.5382874011993408, | |
| "learning_rate": 2.787306501547988e-05, | |
| "loss": 3.5375, | |
| "mean_token_accuracy": 0.3548148155212402, | |
| "num_tokens": 124170.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7430340557275542, | |
| "grad_norm": 1.773881196975708, | |
| "learning_rate": 2.7780185758513933e-05, | |
| "loss": 3.573, | |
| "mean_token_accuracy": 0.3465736091136932, | |
| "num_tokens": 129487.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.7739938080495357, | |
| "grad_norm": 1.7652744054794312, | |
| "learning_rate": 2.7687306501547986e-05, | |
| "loss": 3.6811, | |
| "mean_token_accuracy": 0.33623204231262205, | |
| "num_tokens": 135007.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.804953560371517, | |
| "grad_norm": 1.7662419080734253, | |
| "learning_rate": 2.7594427244582046e-05, | |
| "loss": 3.505, | |
| "mean_token_accuracy": 0.3567329585552216, | |
| "num_tokens": 140143.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8359133126934984, | |
| "grad_norm": 1.9441474676132202, | |
| "learning_rate": 2.75015479876161e-05, | |
| "loss": 3.4804, | |
| "mean_token_accuracy": 0.36218210160732267, | |
| "num_tokens": 145363.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.8668730650154799, | |
| "grad_norm": 1.745896816253662, | |
| "learning_rate": 2.7408668730650156e-05, | |
| "loss": 3.6519, | |
| "mean_token_accuracy": 0.34941086173057556, | |
| "num_tokens": 150840.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8978328173374613, | |
| "grad_norm": 1.928284764289856, | |
| "learning_rate": 2.7315789473684213e-05, | |
| "loss": 3.6138, | |
| "mean_token_accuracy": 0.34826839864254, | |
| "num_tokens": 156077.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9287925696594427, | |
| "grad_norm": 2.177100896835327, | |
| "learning_rate": 2.722291021671827e-05, | |
| "loss": 3.4666, | |
| "mean_token_accuracy": 0.36537405848503113, | |
| "num_tokens": 160953.0, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3230, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7665993537945600.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |