{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6423841059602649, "eval_steps": 500, "global_step": 97, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033112582781456956, "grad_norm": 26.791088104248047, "learning_rate": 0.0001999783578606323, "loss": 10.8058, "num_input_tokens_seen": 98464, "step": 5 }, { "epoch": 0.06622516556291391, "grad_norm": 22.559255599975586, "learning_rate": 0.00019984613426472932, "loss": 2.5613, "num_input_tokens_seen": 196880, "step": 10 }, { "epoch": 0.09933774834437085, "grad_norm": 8.285969734191895, "learning_rate": 0.00019959386925858942, "loss": 1.4579, "num_input_tokens_seen": 295472, "step": 15 }, { "epoch": 0.13245033112582782, "grad_norm": 7.880527496337891, "learning_rate": 0.0001992218661313415, "loss": 1.1407, "num_input_tokens_seen": 394096, "step": 20 }, { "epoch": 0.16556291390728478, "grad_norm": 14.62392807006836, "learning_rate": 0.00019873057212894398, "loss": 0.7902, "num_input_tokens_seen": 492464, "step": 25 }, { "epoch": 0.1986754966887417, "grad_norm": 12.557646751403809, "learning_rate": 0.00019812057791647686, "loss": 0.983, "num_input_tokens_seen": 590896, "step": 30 }, { "epoch": 0.23178807947019867, "grad_norm": 12.462843894958496, "learning_rate": 0.0001973926168680066, "loss": 0.9299, "num_input_tokens_seen": 689328, "step": 35 }, { "epoch": 0.26490066225165565, "grad_norm": 3.7140164375305176, "learning_rate": 0.00019654756418487667, "loss": 0.6314, "num_input_tokens_seen": 788032, "step": 40 }, { "epoch": 0.2980132450331126, "grad_norm": 16.123748779296875, "learning_rate": 0.00019558643584348476, "loss": 0.8311, "num_input_tokens_seen": 886144, "step": 45 }, { "epoch": 0.33112582781456956, "grad_norm": 7.482938289642334, "learning_rate": 0.00019451038737381077, "loss": 0.5493, "num_input_tokens_seen": 984672, "step": 50 }, { "epoch": 0.36423841059602646, "grad_norm": 4.410764694213867, "learning_rate": 0.00019332071247016476, "loss": 0.4694, "num_input_tokens_seen": 1083232, "step": 55 }, { "epoch": 0.3973509933774834, "grad_norm": 6.899860858917236, "learning_rate": 0.00019201884143582495, "loss": 0.5595, "num_input_tokens_seen": 1181568, "step": 60 }, { "epoch": 0.4304635761589404, "grad_norm": 3.364258050918579, "learning_rate": 0.0001906063394634356, "loss": 0.2787, "num_input_tokens_seen": 1279936, "step": 65 }, { "epoch": 0.46357615894039733, "grad_norm": 20.896175384521484, "learning_rate": 0.00018970643640796642, "loss": 0.5269, "num_input_tokens_seen": 1378544, "step": 70 }, { "epoch": 0.4966887417218543, "grad_norm": 3.4167935848236084, "learning_rate": 0.00018812051176267307, "loss": 0.7782, "num_input_tokens_seen": 1476896, "step": 75 }, { "epoch": 0.5298013245033113, "grad_norm": 10.354905128479004, "learning_rate": 0.00018642864300065767, "loss": 0.5458, "num_input_tokens_seen": 1574912, "step": 80 }, { "epoch": 0.5629139072847682, "grad_norm": 3.3909523487091064, "learning_rate": 0.00018463286419478255, "loss": 0.3148, "num_input_tokens_seen": 1673056, "step": 85 }, { "epoch": 0.5960264900662252, "grad_norm": 13.916143417358398, "learning_rate": 0.00018273533434521263, "loss": 0.301, "num_input_tokens_seen": 1771536, "step": 90 }, { "epoch": 0.6291390728476821, "grad_norm": 4.097564697265625, "learning_rate": 0.0001807383347837268, "loss": 0.6369, "num_input_tokens_seen": 1869952, "step": 95 }, { "epoch": 0.6423841059602649, "num_input_tokens_seen": 1916832, "step": 97, "total_flos": 8.57490291718226e+16, "train_loss": 1.3051114364997627, "train_runtime": 5464.1136, "train_samples_per_second": 1.326, "train_steps_per_second": 0.083 } ], "logging_steps": 5, "max_steps": 453, "num_input_tokens_seen": 1916832, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.57490291718226e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }