Model save

Browse files

Files changed (5) hide show

README.md +57 -0
all_results.json +9 -0
generation_config.json +6 -0
train_results.json +9 -0
trainer_state.json +2418 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: transformers
+model_name: Qwen2.5-7B-Open-R1-Distill
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+# Model Card for Qwen2.5-7B-Open-R1-Distill
+This model is a fine-tuned version of a base model that was not recorded when this card was generated (the model name suggests a Qwen2.5-7B checkpoint).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="howardzhou/Qwen2.5-7B-Open-R1-Distill", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/howardzhou92-nw/huggingface/runs/6rgf4qfj)
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.15.0.dev0
+- Transformers: 4.49.0.dev0
+- PyTorch: 2.5.1
+- Datasets: 3.2.0
+- Tokenizers: 0.21.0
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.996662958843159,
+    "total_flos": 352364519227392.0,
+    "train_loss": 0.5676420692886625,
+    "train_runtime": 10418.0052,
+    "train_samples": 16610,
+    "train_samples_per_second": 4.141,
+    "train_steps_per_second": 0.032
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.49.0.dev0"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.996662958843159,
+    "total_flos": 352364519227392.0,
+    "train_loss": 0.5676420692886625,
+    "train_runtime": 10418.0052,
+    "train_samples": 16610,
+    "train_samples_per_second": 4.141,
+    "train_steps_per_second": 0.032
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2418 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.996662958843159,
+  "eval_steps": 100,
+  "global_step": 336,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005932517612161661,
+      "grad_norm": 5.324305785932815,
+      "learning_rate": 1.999956289272873e-05,
+      "loss": 0.9594,
+      "step": 1
+    },
+    {
+      "epoch": 0.011865035224323322,
+      "grad_norm": 22.84012317380655,
+      "learning_rate": 1.9998251609127465e-05,
+      "loss": 0.9271,
+      "step": 2
+    },
+    {
+      "epoch": 0.017797552836484983,
+      "grad_norm": 4.084146062025191,
+      "learning_rate": 1.9996066263830533e-05,
+      "loss": 0.885,
+      "step": 3
+    },
+    {
+      "epoch": 0.023730070448646643,
+      "grad_norm": 5.3602743742788626,
+      "learning_rate": 1.9993007047883988e-05,
+      "loss": 0.8686,
+      "step": 4
+    },
+    {
+      "epoch": 0.029662588060808306,
+      "grad_norm": 3.8319494768533953,
+      "learning_rate": 1.9989074228728942e-05,
+      "loss": 0.8172,
+      "step": 5
+    },
+    {
+      "epoch": 0.035595105672969966,
+      "grad_norm": 2.175190126654642,
+      "learning_rate": 1.998426815017817e-05,
+      "loss": 0.7451,
+      "step": 6
+    },
+    {
+      "epoch": 0.041527623285131626,
+      "grad_norm": 2.199115330911503,
+      "learning_rate": 1.9978589232386036e-05,
+      "loss": 0.7646,
+      "step": 7
+    },
+    {
+      "epoch": 0.047460140897293286,
+      "grad_norm": 2.1181147808123333,
+      "learning_rate": 1.9972037971811802e-05,
+      "loss": 0.7291,
+      "step": 8
+    },
+    {
+      "epoch": 0.05339265850945495,
+      "grad_norm": 1.7260889336074359,
+      "learning_rate": 1.9964614941176194e-05,
+      "loss": 0.7191,
+      "step": 9
+    },
+    {
+      "epoch": 0.05932517612161661,
+      "grad_norm": 1.1637147793554257,
+      "learning_rate": 1.9956320789411338e-05,
+      "loss": 0.7203,
+      "step": 10
+    },
+    {
+      "epoch": 0.06525769373377827,
+      "grad_norm": 1.4450484122253298,
+      "learning_rate": 1.994715624160405e-05,
+      "loss": 0.6792,
+      "step": 11
+    },
+    {
+      "epoch": 0.07119021134593993,
+      "grad_norm": 1.1620631343638403,
+      "learning_rate": 1.9937122098932428e-05,
+      "loss": 0.7178,
+      "step": 12
+    },
+    {
+      "epoch": 0.0771227289581016,
+      "grad_norm": 0.9627269401137271,
+      "learning_rate": 1.992621923859581e-05,
+      "loss": 0.6847,
+      "step": 13
+    },
+    {
+      "epoch": 0.08305524657026325,
+      "grad_norm": 0.9961377846813068,
+      "learning_rate": 1.9914448613738107e-05,
+      "loss": 0.6762,
+      "step": 14
+    },
+    {
+      "epoch": 0.08898776418242492,
+      "grad_norm": 0.8096822959197216,
+      "learning_rate": 1.9901811253364458e-05,
+      "loss": 0.6961,
+      "step": 15
+    },
+    {
+      "epoch": 0.09492028179458657,
+      "grad_norm": 0.856064052820252,
+      "learning_rate": 1.9888308262251286e-05,
+      "loss": 0.6797,
+      "step": 16
+    },
+    {
+      "epoch": 0.10085279940674824,
+      "grad_norm": 1.0455477055796631,
+      "learning_rate": 1.9873940820849714e-05,
+      "loss": 0.6807,
+      "step": 17
+    },
+    {
+      "epoch": 0.1067853170189099,
+      "grad_norm": 0.7952535188433517,
+      "learning_rate": 1.985871018518236e-05,
+      "loss": 0.6792,
+      "step": 18
+    },
+    {
+      "epoch": 0.11271783463107156,
+      "grad_norm": 0.912745561968978,
+      "learning_rate": 1.9842617686733546e-05,
+      "loss": 0.6581,
+      "step": 19
+    },
+    {
+      "epoch": 0.11865035224323323,
+      "grad_norm": 0.9048458674371787,
+      "learning_rate": 1.9825664732332886e-05,
+      "loss": 0.6593,
+      "step": 20
+    },
+    {
+      "epoch": 0.12458286985539488,
+      "grad_norm": 0.9891444883072399,
+      "learning_rate": 1.9807852804032306e-05,
+      "loss": 0.6646,
+      "step": 21
+    },
+    {
+      "epoch": 0.13051538746755653,
+      "grad_norm": 0.8380876682823516,
+      "learning_rate": 1.9789183458976485e-05,
+      "loss": 0.6555,
+      "step": 22
+    },
+    {
+      "epoch": 0.1364479050797182,
+      "grad_norm": 0.7752881193795825,
+      "learning_rate": 1.9769658329266718e-05,
+      "loss": 0.6423,
+      "step": 23
+    },
+    {
+      "epoch": 0.14238042269187987,
+      "grad_norm": 0.6671431472869794,
+      "learning_rate": 1.9749279121818235e-05,
+      "loss": 0.6663,
+      "step": 24
+    },
+    {
+      "epoch": 0.14831294030404152,
+      "grad_norm": 0.8415919815133858,
+      "learning_rate": 1.9728047618210995e-05,
+      "loss": 0.6716,
+      "step": 25
+    },
+    {
+      "epoch": 0.1542454579162032,
+      "grad_norm": 0.7245934639150007,
+      "learning_rate": 1.970596567453391e-05,
+      "loss": 0.6312,
+      "step": 26
+    },
+    {
+      "epoch": 0.16017797552836485,
+      "grad_norm": 0.9130270576949818,
+      "learning_rate": 1.9683035221222617e-05,
+      "loss": 0.6449,
+      "step": 27
+    },
+    {
+      "epoch": 0.1661104931405265,
+      "grad_norm": 0.6519157743634046,
+      "learning_rate": 1.9659258262890683e-05,
+      "loss": 0.6382,
+      "step": 28
+    },
+    {
+      "epoch": 0.17204301075268819,
+      "grad_norm": 0.831521850696097,
+      "learning_rate": 1.9634636878154393e-05,
+      "loss": 0.6489,
+      "step": 29
+    },
+    {
+      "epoch": 0.17797552836484984,
+      "grad_norm": 1.4176318143795756,
+      "learning_rate": 1.9609173219450998e-05,
+      "loss": 0.6103,
+      "step": 30
+    },
+    {
+      "epoch": 0.1839080459770115,
+      "grad_norm": 0.7923365048789488,
+      "learning_rate": 1.9582869512850576e-05,
+      "loss": 0.6577,
+      "step": 31
+    },
+    {
+      "epoch": 0.18984056358917314,
+      "grad_norm": 0.8145126868212449,
+      "learning_rate": 1.955572805786141e-05,
+      "loss": 0.6601,
+      "step": 32
+    },
+    {
+      "epoch": 0.19577308120133483,
+      "grad_norm": 0.7417273199883386,
+      "learning_rate": 1.9527751227228964e-05,
+      "loss": 0.6357,
+      "step": 33
+    },
+    {
+      "epoch": 0.20170559881349648,
+      "grad_norm": 0.9812326390343167,
+      "learning_rate": 1.9498941466728462e-05,
+      "loss": 0.6435,
+      "step": 34
+    },
+    {
+      "epoch": 0.20763811642565813,
+      "grad_norm": 0.7122362598498115,
+      "learning_rate": 1.946930129495106e-05,
+      "loss": 0.6444,
+      "step": 35
+    },
+    {
+      "epoch": 0.2135706340378198,
+      "grad_norm": 0.6999496527556666,
+      "learning_rate": 1.9438833303083677e-05,
+      "loss": 0.6216,
+      "step": 36
+    },
+    {
+      "epoch": 0.21950315164998146,
+      "grad_norm": 0.9616816358092815,
+      "learning_rate": 1.9407540154682473e-05,
+      "loss": 0.6579,
+      "step": 37
+    },
+    {
+      "epoch": 0.22543566926214312,
+      "grad_norm": 0.7213515842394086,
+      "learning_rate": 1.9375424585439994e-05,
+      "loss": 0.6281,
+      "step": 38
+    },
+    {
+      "epoch": 0.23136818687430477,
+      "grad_norm": 0.9878068930431408,
+      "learning_rate": 1.9342489402945997e-05,
+      "loss": 0.633,
+      "step": 39
+    },
+    {
+      "epoch": 0.23730070448646645,
+      "grad_norm": 0.6631768147177095,
+      "learning_rate": 1.9308737486442045e-05,
+      "loss": 0.6093,
+      "step": 40
+    },
+    {
+      "epoch": 0.2432332220986281,
+      "grad_norm": 0.8581774181959311,
+      "learning_rate": 1.927417178656975e-05,
+      "loss": 0.6579,
+      "step": 41
+    },
+    {
+      "epoch": 0.24916573971078976,
+      "grad_norm": 0.6410107561811785,
+      "learning_rate": 1.9238795325112867e-05,
+      "loss": 0.6203,
+      "step": 42
+    },
+    {
+      "epoch": 0.25509825732295144,
+      "grad_norm": 0.8809363552180719,
+      "learning_rate": 1.9202611194733107e-05,
+      "loss": 0.6034,
+      "step": 43
+    },
+    {
+      "epoch": 0.26103077493511306,
+      "grad_norm": 0.6867316845306615,
+      "learning_rate": 1.9165622558699763e-05,
+      "loss": 0.6516,
+      "step": 44
+    },
+    {
+      "epoch": 0.26696329254727474,
+      "grad_norm": 1.0648968501842426,
+      "learning_rate": 1.912783265061319e-05,
+      "loss": 0.6446,
+      "step": 45
+    },
+    {
+      "epoch": 0.2728958101594364,
+      "grad_norm": 0.7208316977945584,
+      "learning_rate": 1.908924477412211e-05,
+      "loss": 0.6351,
+      "step": 46
+    },
+    {
+      "epoch": 0.27882832777159805,
+      "grad_norm": 0.8089443201366949,
+      "learning_rate": 1.90498623026348e-05,
+      "loss": 0.6386,
+      "step": 47
+    },
+    {
+      "epoch": 0.28476084538375973,
+      "grad_norm": 0.6427781719286114,
+      "learning_rate": 1.900968867902419e-05,
+      "loss": 0.6243,
+      "step": 48
+    },
+    {
+      "epoch": 0.2906933629959214,
+      "grad_norm": 0.7797049642173535,
+      "learning_rate": 1.8968727415326885e-05,
+      "loss": 0.6124,
+      "step": 49
+    },
+    {
+      "epoch": 0.29662588060808304,
+      "grad_norm": 0.5348618501721483,
+      "learning_rate": 1.8926982092436117e-05,
+      "loss": 0.6176,
+      "step": 50
+    },
+    {
+      "epoch": 0.3025583982202447,
+      "grad_norm": 0.8550876198086238,
+      "learning_rate": 1.8884456359788725e-05,
+      "loss": 0.6271,
+      "step": 51
+    },
+    {
+      "epoch": 0.3084909158324064,
+      "grad_norm": 0.6138520011172943,
+      "learning_rate": 1.8841153935046098e-05,
+      "loss": 0.625,
+      "step": 52
+    },
+    {
+      "epoch": 0.314423433444568,
+      "grad_norm": 0.7544109212965426,
+      "learning_rate": 1.8797078603769184e-05,
+      "loss": 0.6451,
+      "step": 53
+    },
+    {
+      "epoch": 0.3203559510567297,
+      "grad_norm": 0.6909872317657667,
+      "learning_rate": 1.8752234219087538e-05,
+      "loss": 0.623,
+      "step": 54
+    },
+    {
+      "epoch": 0.3262884686688914,
+      "grad_norm": 0.6057588839577427,
+      "learning_rate": 1.8706624701362485e-05,
+      "loss": 0.6173,
+      "step": 55
+    },
+    {
+      "epoch": 0.332220986281053,
+      "grad_norm": 0.7753788330958152,
+      "learning_rate": 1.866025403784439e-05,
+      "loss": 0.5968,
+      "step": 56
+    },
+    {
+      "epoch": 0.3381535038932147,
+      "grad_norm": 0.5677646570795517,
+      "learning_rate": 1.8613126282324092e-05,
+      "loss": 0.6083,
+      "step": 57
+    },
+    {
+      "epoch": 0.34408602150537637,
+      "grad_norm": 0.8456419916175821,
+      "learning_rate": 1.8565245554778516e-05,
+      "loss": 0.6222,
+      "step": 58
+    },
+    {
+      "epoch": 0.350018539117538,
+      "grad_norm": 0.7548049715006433,
+      "learning_rate": 1.8516616041010495e-05,
+      "loss": 0.6328,
+      "step": 59
+    },
+    {
+      "epoch": 0.3559510567296997,
+      "grad_norm": 0.7884744995171575,
+      "learning_rate": 1.8467241992282842e-05,
+      "loss": 0.5983,
+      "step": 60
+    },
+    {
+      "epoch": 0.3618835743418613,
+      "grad_norm": 0.7405638018271332,
+      "learning_rate": 1.84171277249467e-05,
+      "loss": 0.6158,
+      "step": 61
+    },
+    {
+      "epoch": 0.367816091954023,
+      "grad_norm": 0.7437562413628,
+      "learning_rate": 1.83662776200642e-05,
+      "loss": 0.6083,
+      "step": 62
+    },
+    {
+      "epoch": 0.37374860956618466,
+      "grad_norm": 0.6809427452609553,
+      "learning_rate": 1.8314696123025456e-05,
+      "loss": 0.6384,
+      "step": 63
+    },
+    {
+      "epoch": 0.3796811271783463,
+      "grad_norm": 0.7150552234310213,
+      "learning_rate": 1.826238774315995e-05,
+      "loss": 0.5962,
+      "step": 64
+    },
+    {
+      "epoch": 0.38561364479050797,
+      "grad_norm": 0.7499268266093695,
+      "learning_rate": 1.8209357053342325e-05,
+      "loss": 0.5992,
+      "step": 65
+    },
+    {
+      "epoch": 0.39154616240266965,
+      "grad_norm": 0.701144986922023,
+      "learning_rate": 1.8155608689592604e-05,
+      "loss": 0.6224,
+      "step": 66
+    },
+    {
+      "epoch": 0.3974786800148313,
+      "grad_norm": 0.7848864580999906,
+      "learning_rate": 1.8101147350670905e-05,
+      "loss": 0.6063,
+      "step": 67
+    },
+    {
+      "epoch": 0.40341119762699296,
+      "grad_norm": 0.5906752612381765,
+      "learning_rate": 1.8045977797666685e-05,
+      "loss": 0.5999,
+      "step": 68
+    },
+    {
+      "epoch": 0.40934371523915464,
+      "grad_norm": 0.8877108267943582,
+      "learning_rate": 1.7990104853582494e-05,
+      "loss": 0.6236,
+      "step": 69
+    },
+    {
+      "epoch": 0.41527623285131626,
+      "grad_norm": 0.6701464198767606,
+      "learning_rate": 1.7933533402912354e-05,
+      "loss": 0.6158,
+      "step": 70
+    },
+    {
+      "epoch": 0.42120875046347794,
+      "grad_norm": 0.8978968992213687,
+      "learning_rate": 1.7876268391214756e-05,
+      "loss": 0.6139,
+      "step": 71
+    },
+    {
+      "epoch": 0.4271412680756396,
+      "grad_norm": 0.5561189587112576,
+      "learning_rate": 1.78183148246803e-05,
+      "loss": 0.5855,
+      "step": 72
+    },
+    {
+      "epoch": 0.43307378568780125,
+      "grad_norm": 0.9537599050583528,
+      "learning_rate": 1.775967776969405e-05,
+      "loss": 0.6143,
+      "step": 73
+    },
+    {
+      "epoch": 0.43900630329996293,
+      "grad_norm": 0.7065253211082851,
+      "learning_rate": 1.7700362352392632e-05,
+      "loss": 0.6316,
+      "step": 74
+    },
+    {
+      "epoch": 0.44493882091212456,
+      "grad_norm": 0.8079873624667518,
+      "learning_rate": 1.7640373758216075e-05,
+      "loss": 0.5903,
+      "step": 75
+    },
+    {
+      "epoch": 0.45087133852428624,
+      "grad_norm": 0.7300425305669899,
+      "learning_rate": 1.757971723145453e-05,
+      "loss": 0.5841,
+      "step": 76
+    },
+    {
+      "epoch": 0.4568038561364479,
+      "grad_norm": 0.6134926917670087,
+      "learning_rate": 1.7518398074789776e-05,
+      "loss": 0.613,
+      "step": 77
+    },
+    {
+      "epoch": 0.46273637374860954,
+      "grad_norm": 0.6589828304739275,
+      "learning_rate": 1.7456421648831658e-05,
+      "loss": 0.595,
+      "step": 78
+    },
+    {
+      "epoch": 0.4686688913607712,
+      "grad_norm": 0.6394731073863064,
+      "learning_rate": 1.739379337164946e-05,
+      "loss": 0.6126,
+      "step": 79
+    },
+    {
+      "epoch": 0.4746014089729329,
+      "grad_norm": 0.615495452837106,
+      "learning_rate": 1.7330518718298263e-05,
+      "loss": 0.5948,
+      "step": 80
+    },
+    {
+      "epoch": 0.48053392658509453,
+      "grad_norm": 0.6010913241719915,
+      "learning_rate": 1.7266603220340273e-05,
+      "loss": 0.6127,
+      "step": 81
+    },
+    {
+      "epoch": 0.4864664441972562,
+      "grad_norm": 0.6447864666738669,
+      "learning_rate": 1.7202052465361268e-05,
+      "loss": 0.5975,
+      "step": 82
+    },
+    {
+      "epoch": 0.4923989618094179,
+      "grad_norm": 0.5299790811179832,
+      "learning_rate": 1.7136872096482123e-05,
+      "loss": 0.6191,
+      "step": 83
+    },
+    {
+      "epoch": 0.4983314794215795,
+      "grad_norm": 0.6051833501420879,
+      "learning_rate": 1.7071067811865477e-05,
+      "loss": 0.616,
+      "step": 84
+    },
+    {
+      "epoch": 0.5042639970337411,
+      "grad_norm": 0.545917313248304,
+      "learning_rate": 1.7004645364217584e-05,
+      "loss": 0.6186,
+      "step": 85
+    },
+    {
+      "epoch": 0.5101965146459029,
+      "grad_norm": 0.6742615641272589,
+      "learning_rate": 1.693761056028542e-05,
+      "loss": 0.6218,
+      "step": 86
+    },
+    {
+      "epoch": 0.5161290322580645,
+      "grad_norm": 0.56943138685776,
+      "learning_rate": 1.686996926034902e-05,
+      "loss": 0.6113,
+      "step": 87
+    },
+    {
+      "epoch": 0.5220615498702261,
+      "grad_norm": 0.5709110972034852,
+      "learning_rate": 1.6801727377709195e-05,
+      "loss": 0.6125,
+      "step": 88
+    },
+    {
+      "epoch": 0.5279940674823879,
+      "grad_norm": 0.5196995804675383,
+      "learning_rate": 1.6732890878170573e-05,
+      "loss": 0.5862,
+      "step": 89
+    },
+    {
+      "epoch": 0.5339265850945495,
+      "grad_norm": 0.5096600514404106,
+      "learning_rate": 1.6663465779520042e-05,
+      "loss": 0.592,
+      "step": 90
+    },
+    {
+      "epoch": 0.5398591027067111,
+      "grad_norm": 0.5781695292378604,
+      "learning_rate": 1.659345815100069e-05,
+      "loss": 0.5741,
+      "step": 91
+    },
+    {
+      "epoch": 0.5457916203188728,
+      "grad_norm": 0.5603815789576477,
+      "learning_rate": 1.6522874112781213e-05,
+      "loss": 0.6247,
+      "step": 92
+    },
+    {
+      "epoch": 0.5517241379310345,
+      "grad_norm": 0.5795708352012364,
+      "learning_rate": 1.645171983542088e-05,
+      "loss": 0.6023,
+      "step": 93
+    },
+    {
+      "epoch": 0.5576566555431961,
+      "grad_norm": 0.543427974530907,
+      "learning_rate": 1.6380001539330088e-05,
+      "loss": 0.5991,
+      "step": 94
+    },
+    {
+      "epoch": 0.5635891731553578,
+      "grad_norm": 0.6346825164978813,
+      "learning_rate": 1.6307725494226586e-05,
+      "loss": 0.6374,
+      "step": 95
+    },
+    {
+      "epoch": 0.5695216907675195,
+      "grad_norm": 0.5799055100493153,
+      "learning_rate": 1.6234898018587336e-05,
+      "loss": 0.6267,
+      "step": 96
+    },
+    {
+      "epoch": 0.5754542083796811,
+      "grad_norm": 0.5921423473197417,
+      "learning_rate": 1.616152547909618e-05,
+      "loss": 0.6164,
+      "step": 97
+    },
+    {
+      "epoch": 0.5813867259918428,
+      "grad_norm": 0.49534355256808843,
+      "learning_rate": 1.608761429008721e-05,
+      "loss": 0.578,
+      "step": 98
+    },
+    {
+      "epoch": 0.5873192436040044,
+      "grad_norm": 0.6107346169575619,
+      "learning_rate": 1.601317091298406e-05,
+      "loss": 0.5756,
+      "step": 99
+    },
+    {
+      "epoch": 0.5932517612161661,
+      "grad_norm": 0.5446618742898293,
+      "learning_rate": 1.5938201855735017e-05,
+      "loss": 0.6025,
+      "step": 100
+    },
+    {
+      "epoch": 0.5932517612161661,
+      "eval_loss": 0.6248904466629028,
+      "eval_runtime": 6.4597,
+      "eval_samples_per_second": 19.66,
+      "eval_steps_per_second": 2.477,
+      "step": 100
+    },
+    {
+      "epoch": 0.5991842788283278,
+      "grad_norm": 0.5859755046139327,
+      "learning_rate": 1.5862713672244092e-05,
+      "loss": 0.5881,
+      "step": 101
+    },
+    {
+      "epoch": 0.6051167964404894,
+      "grad_norm": 0.6191852673376101,
+      "learning_rate": 1.578671296179806e-05,
+      "loss": 0.605,
+      "step": 102
+    },
+    {
+      "epoch": 0.6110493140526511,
+      "grad_norm": 0.7094358933361382,
+      "learning_rate": 1.5710206368489555e-05,
+      "loss": 0.6173,
+      "step": 103
+    },
+    {
+      "epoch": 0.6169818316648128,
+      "grad_norm": 0.5352134780488959,
+      "learning_rate": 1.563320058063622e-05,
+      "loss": 0.5986,
+      "step": 104
+    },
+    {
+      "epoch": 0.6229143492769744,
+      "grad_norm": 0.6198887975123263,
+      "learning_rate": 1.5555702330196024e-05,
+      "loss": 0.5871,
+      "step": 105
+    },
+    {
+      "epoch": 0.628846866889136,
+      "grad_norm": 0.6049898880261826,
+      "learning_rate": 1.5477718392178716e-05,
+      "loss": 0.6041,
+      "step": 106
+    },
+    {
+      "epoch": 0.6347793845012978,
+      "grad_norm": 0.7958720800000618,
+      "learning_rate": 1.5399255584053568e-05,
+      "loss": 0.5951,
+      "step": 107
+    },
+    {
+      "epoch": 0.6407119021134594,
+      "grad_norm": 0.6616316291827746,
+      "learning_rate": 1.5320320765153367e-05,
+      "loss": 0.6084,
+      "step": 108
+    },
+    {
+      "epoch": 0.646644419725621,
+      "grad_norm": 0.5322657642988358,
+      "learning_rate": 1.5240920836074777e-05,
+      "loss": 0.6065,
+      "step": 109
+    },
+    {
+      "epoch": 0.6525769373377828,
+      "grad_norm": 0.6975717651431369,
+      "learning_rate": 1.5161062738075068e-05,
+      "loss": 0.6123,
+      "step": 110
+    },
+    {
+      "epoch": 0.6585094549499444,
+      "grad_norm": 0.5314930484389285,
+      "learning_rate": 1.5080753452465296e-05,
+      "loss": 0.5953,
+      "step": 111
+    },
+    {
+      "epoch": 0.664441972562106,
+      "grad_norm": 0.6439755660279854,
+      "learning_rate": 1.5000000000000002e-05,
+      "loss": 0.5794,
+      "step": 112
+    },
+    {
+      "epoch": 0.6703744901742678,
+      "grad_norm": 0.6572591013210453,
+      "learning_rate": 1.4918809440263435e-05,
+      "loss": 0.6116,
+      "step": 113
+    },
+    {
+      "epoch": 0.6763070077864294,
+      "grad_norm": 0.7834266438815685,
+      "learning_rate": 1.4837188871052399e-05,
+      "loss": 0.5871,
+      "step": 114
+    },
+    {
+      "epoch": 0.682239525398591,
+      "grad_norm": 0.7160570880780417,
+      "learning_rate": 1.4755145427755755e-05,
+      "loss": 0.5677,
+      "step": 115
+    },
+    {
+      "epoch": 0.6881720430107527,
+      "grad_norm": 0.6176131655448136,
+      "learning_rate": 1.4672686282730622e-05,
+      "loss": 0.5804,
+      "step": 116
+    },
+    {
+      "epoch": 0.6941045606229144,
+      "grad_norm": 0.7328033360980243,
+      "learning_rate": 1.4589818644675378e-05,
+      "loss": 0.5827,
+      "step": 117
+    },
+    {
+      "epoch": 0.700037078235076,
+      "grad_norm": 0.5454029774533725,
+      "learning_rate": 1.4506549757999456e-05,
+      "loss": 0.585,
+      "step": 118
+    },
+    {
+      "epoch": 0.7059695958472376,
+      "grad_norm": 0.6681806820396734,
+      "learning_rate": 1.4422886902190014e-05,
+      "loss": 0.5805,
+      "step": 119
+    },
+    {
+      "epoch": 0.7119021134593994,
+      "grad_norm": 0.550161826800762,
+      "learning_rate": 1.4338837391175582e-05,
+      "loss": 0.5798,
+      "step": 120
+    },
+    {
+      "epoch": 0.717834631071561,
+      "grad_norm": 0.5611088668582199,
+      "learning_rate": 1.4254408572686642e-05,
+      "loss": 0.5861,
+      "step": 121
+    },
+    {
+      "epoch": 0.7237671486837226,
+      "grad_norm": 0.5366392258292604,
+      "learning_rate": 1.4169607827613284e-05,
+      "loss": 0.5758,
+      "step": 122
+    },
+    {
+      "epoch": 0.7296996662958843,
+      "grad_norm": 0.6344536914740986,
+      "learning_rate": 1.4084442569359964e-05,
+      "loss": 0.5786,
+      "step": 123
+    },
+    {
+      "epoch": 0.735632183908046,
+      "grad_norm": 0.5387616042555342,
+      "learning_rate": 1.3998920243197408e-05,
+      "loss": 0.5722,
+      "step": 124
+    },
+    {
+      "epoch": 0.7415647015202076,
+      "grad_norm": 0.5579694394513077,
+      "learning_rate": 1.391304832561175e-05,
+      "loss": 0.5907,
+      "step": 125
+    },
+    {
+      "epoch": 0.7474972191323693,
+      "grad_norm": 0.5370248165446521,
+      "learning_rate": 1.3826834323650899e-05,
+      "loss": 0.5838,
+      "step": 126
+    },
+    {
+      "epoch": 0.753429736744531,
+      "grad_norm": 0.503594155073456,
+      "learning_rate": 1.3740285774268282e-05,
+      "loss": 0.6,
+      "step": 127
+    },
+    {
+      "epoch": 0.7593622543566926,
+      "grad_norm": 0.5282433463387846,
+      "learning_rate": 1.3653410243663953e-05,
+      "loss": 0.6192,
+      "step": 128
+    },
+    {
+      "epoch": 0.7652947719688543,
+      "grad_norm": 0.5148036620174011,
+      "learning_rate": 1.3566215326623131e-05,
+      "loss": 0.5844,
+      "step": 129
+    },
+    {
+      "epoch": 0.7712272895810159,
+      "grad_norm": 0.47585872492395137,
+      "learning_rate": 1.3478708645852272e-05,
+      "loss": 0.5903,
+      "step": 130
+    },
+    {
+      "epoch": 0.7771598071931776,
+      "grad_norm": 0.5284323921284109,
+      "learning_rate": 1.3390897851312667e-05,
+      "loss": 0.5822,
+      "step": 131
+    },
+    {
+      "epoch": 0.7830923248053393,
+      "grad_norm": 0.5114134262850306,
+      "learning_rate": 1.3302790619551673e-05,
+      "loss": 0.5914,
+      "step": 132
+    },
+    {
+      "epoch": 0.7890248424175009,
+      "grad_norm": 0.5033307503491072,
+      "learning_rate": 1.3214394653031616e-05,
+      "loss": 0.5822,
+      "step": 133
+    },
+    {
+      "epoch": 0.7949573600296626,
+      "grad_norm": 0.5585716796647099,
+      "learning_rate": 1.3125717679456447e-05,
+      "loss": 0.6147,
+      "step": 134
+    },
+    {
+      "epoch": 0.8008898776418243,
+      "grad_norm": 0.4893700313257534,
+      "learning_rate": 1.3036767451096148e-05,
+      "loss": 0.5923,
+      "step": 135
+    },
+    {
+      "epoch": 0.8068223952539859,
+      "grad_norm": 0.5257982159346182,
+      "learning_rate": 1.2947551744109044e-05,
+      "loss": 0.5715,
+      "step": 136
+    },
+    {
+      "epoch": 0.8127549128661475,
+      "grad_norm": 0.6395398302691117,
+      "learning_rate": 1.2858078357861979e-05,
+      "loss": 0.6105,
+      "step": 137
+    },
+    {
+      "epoch": 0.8186874304783093,
+      "grad_norm": 0.5283010642253179,
+      "learning_rate": 1.2768355114248493e-05,
+      "loss": 0.5826,
+      "step": 138
+    },
+    {
+      "epoch": 0.8246199480904709,
+      "grad_norm": 0.6035790420033149,
+      "learning_rate": 1.2678389857005033e-05,
+      "loss": 0.5748,
+      "step": 139
+    },
+    {
+      "epoch": 0.8305524657026325,
+      "grad_norm": 0.5955335369154928,
+      "learning_rate": 1.2588190451025209e-05,
+      "loss": 0.603,
+      "step": 140
+    },
+    {
+      "epoch": 0.8364849833147943,
+      "grad_norm": 0.5214414900773817,
+      "learning_rate": 1.249776478167227e-05,
+      "loss": 0.5791,
+      "step": 141
+    },
+    {
+      "epoch": 0.8424175009269559,
+      "grad_norm": 0.6417186166595917,
+      "learning_rate": 1.2407120754089733e-05,
+      "loss": 0.5969,
+      "step": 142
+    },
+    {
+      "epoch": 0.8483500185391175,
+      "grad_norm": 0.5136915661544744,
+      "learning_rate": 1.2316266292510305e-05,
+      "loss": 0.5983,
+      "step": 143
+    },
+    {
+      "epoch": 0.8542825361512792,
+      "grad_norm": 0.6533160354469787,
+      "learning_rate": 1.2225209339563144e-05,
+      "loss": 0.594,
+      "step": 144
+    },
+    {
+      "epoch": 0.8602150537634409,
+      "grad_norm": 0.47221905497667216,
+      "learning_rate": 1.2133957855579501e-05,
+      "loss": 0.581,
+      "step": 145
+    },
+    {
+      "epoch": 0.8661475713756025,
+      "grad_norm": 0.6131552653676784,
+      "learning_rate": 1.2042519817896805e-05,
+      "loss": 0.6023,
+      "step": 146
+    },
+    {
+      "epoch": 0.8720800889877642,
+      "grad_norm": 0.5846657268915201,
+      "learning_rate": 1.1950903220161286e-05,
+      "loss": 0.5961,
+      "step": 147
+    },
+    {
+      "epoch": 0.8780126065999259,
+      "grad_norm": 0.5541734985583223,
+      "learning_rate": 1.1859116071629148e-05,
+      "loss": 0.5781,
+      "step": 148
+    },
+    {
+      "epoch": 0.8839451242120875,
+      "grad_norm": 0.5275638237960486,
+      "learning_rate": 1.1767166396466404e-05,
+      "loss": 0.6061,
+      "step": 149
+    },
+    {
+      "epoch": 0.8898776418242491,
+      "grad_norm": 0.48482325077647975,
+      "learning_rate": 1.1675062233047365e-05,
+      "loss": 0.5905,
+      "step": 150
+    },
+    {
+      "epoch": 0.8958101594364108,
+      "grad_norm": 0.42738134564467045,
+      "learning_rate": 1.1582811633251949e-05,
+      "loss": 0.595,
+      "step": 151
+    },
+    {
+      "epoch": 0.9017426770485725,
+      "grad_norm": 0.5452867174600223,
+      "learning_rate": 1.1490422661761744e-05,
+      "loss": 0.6139,
+      "step": 152
+    },
+    {
+      "epoch": 0.9076751946607341,
+      "grad_norm": 0.45866736271381936,
+      "learning_rate": 1.1397903395354996e-05,
+      "loss": 0.5539,
+      "step": 153
+    },
+    {
+      "epoch": 0.9136077122728958,
+      "grad_norm": 0.5301598351347845,
+      "learning_rate": 1.130526192220052e-05,
+      "loss": 0.5902,
+      "step": 154
+    },
+    {
+      "epoch": 0.9195402298850575,
+      "grad_norm": 0.4744630476971996,
+      "learning_rate": 1.1212506341150615e-05,
+      "loss": 0.5791,
+      "step": 155
+    },
+    {
+      "epoch": 0.9254727474972191,
+      "grad_norm": 0.5400171376545928,
+      "learning_rate": 1.1119644761033079e-05,
+      "loss": 0.5879,
+      "step": 156
+    },
+    {
+      "epoch": 0.9314052651093808,
+      "grad_norm": 0.49031695283723725,
+      "learning_rate": 1.1026685299942286e-05,
+      "loss": 0.5786,
+      "step": 157
+    },
+    {
+      "epoch": 0.9373377827215424,
+      "grad_norm": 0.541456950819932,
+      "learning_rate": 1.0933636084529507e-05,
+      "loss": 0.5934,
+      "step": 158
+    },
+    {
+      "epoch": 0.9432703003337041,
+      "grad_norm": 0.5201441726217624,
+      "learning_rate": 1.0840505249292477e-05,
+      "loss": 0.5931,
+      "step": 159
+    },
+    {
+      "epoch": 0.9492028179458658,
+      "grad_norm": 0.5197234596906991,
+      "learning_rate": 1.0747300935864245e-05,
+      "loss": 0.5867,
+      "step": 160
+    },
+    {
+      "epoch": 0.9551353355580274,
+      "grad_norm": 0.6456281680986811,
+      "learning_rate": 1.0654031292301432e-05,
+      "loss": 0.5887,
+      "step": 161
+    },
+    {
+      "epoch": 0.9610678531701891,
+      "grad_norm": 0.498657726991286,
+      "learning_rate": 1.0560704472371919e-05,
+      "loss": 0.596,
+      "step": 162
+    },
+    {
+      "epoch": 0.9670003707823508,
+      "grad_norm": 0.6139669635009651,
+      "learning_rate": 1.0467328634842024e-05,
+      "loss": 0.5827,
+      "step": 163
+    },
+    {
+      "epoch": 0.9729328883945124,
+      "grad_norm": 0.6135223890559531,
+      "learning_rate": 1.037391194276326e-05,
+      "loss": 0.596,
+      "step": 164
+    },
+    {
+      "epoch": 0.978865406006674,
+      "grad_norm": 0.4774291756729163,
+      "learning_rate": 1.028046256275869e-05,
+      "loss": 0.585,
+      "step": 165
+    },
+    {
+      "epoch": 0.9847979236188358,
+      "grad_norm": 0.5242543425803347,
+      "learning_rate": 1.0186988664309023e-05,
+      "loss": 0.577,
+      "step": 166
+    },
+    {
+      "epoch": 0.9907304412309974,
+      "grad_norm": 0.45667298643197274,
+      "learning_rate": 1.0093498419038394e-05,
+      "loss": 0.5936,
+      "step": 167
+    },
+    {
+      "epoch": 0.996662958843159,
+      "grad_norm": 0.5025893794233189,
+      "learning_rate": 1e-05,
+      "loss": 0.5854,
+      "step": 168
+    },
+    {
+      "epoch": 1.0059325176121616,
+      "grad_norm": 1.1546218989986659,
+      "learning_rate": 9.90650158096161e-06,
+      "loss": 1.133,
+      "step": 169
+    },
+    {
+      "epoch": 1.0118650352243233,
+      "grad_norm": 0.640747017349487,
+      "learning_rate": 9.81301133569098e-06,
+      "loss": 0.5353,
+      "step": 170
+    },
+    {
+      "epoch": 1.0177975528364849,
+      "grad_norm": 0.6151332494210663,
+      "learning_rate": 9.719537437241311e-06,
+      "loss": 0.5355,
+      "step": 171
+    },
+    {
+      "epoch": 1.0237300704486467,
+      "grad_norm": 0.6267164652952573,
+      "learning_rate": 9.626088057236745e-06,
+      "loss": 0.5172,
+      "step": 172
+    },
+    {
+      "epoch": 1.0296625880608083,
+      "grad_norm": 0.6416046210551611,
+      "learning_rate": 9.532671365157979e-06,
+      "loss": 0.5105,
+      "step": 173
+    },
+    {
+      "epoch": 1.03559510567297,
+      "grad_norm": 0.7006960546020584,
+      "learning_rate": 9.439295527628083e-06,
+      "loss": 0.5282,
+      "step": 174
+    },
+    {
+      "epoch": 1.0415276232851316,
+      "grad_norm": 0.5780366560749366,
+      "learning_rate": 9.34596870769857e-06,
+      "loss": 0.4976,
+      "step": 175
+    },
+    {
+      "epoch": 1.0474601408972932,
+      "grad_norm": 0.5470594926347001,
+      "learning_rate": 9.252699064135759e-06,
+      "loss": 0.5112,
+      "step": 176
+    },
+    {
+      "epoch": 1.0533926585094548,
+      "grad_norm": 0.6201320684584469,
+      "learning_rate": 9.159494750707527e-06,
+      "loss": 0.5234,
+      "step": 177
+    },
+    {
+      "epoch": 1.0593251761216167,
+      "grad_norm": 0.6107579623832626,
+      "learning_rate": 9.066363915470494e-06,
+      "loss": 0.5137,
+      "step": 178
+    },
+    {
+      "epoch": 1.0652576937337783,
+      "grad_norm": 0.5656928235926393,
+      "learning_rate": 8.973314700057717e-06,
+      "loss": 0.5202,
+      "step": 179
+    },
+    {
+      "epoch": 1.07119021134594,
+      "grad_norm": 0.6419908684973982,
+      "learning_rate": 8.880355238966923e-06,
+      "loss": 0.5234,
+      "step": 180
+    },
+    {
+      "epoch": 1.0771227289581016,
+      "grad_norm": 0.5987046465110973,
+      "learning_rate": 8.787493658849387e-06,
+      "loss": 0.5226,
+      "step": 181
+    },
+    {
+      "epoch": 1.0830552465702632,
+      "grad_norm": 0.6073135501636805,
+      "learning_rate": 8.694738077799487e-06,
+      "loss": 0.5237,
+      "step": 182
+    },
+    {
+      "epoch": 1.0889877641824248,
+      "grad_norm": 0.523955265242911,
+      "learning_rate": 8.602096604645009e-06,
+      "loss": 0.5128,
+      "step": 183
+    },
+    {
+      "epoch": 1.0949202817945867,
+      "grad_norm": 0.5238418004483752,
+      "learning_rate": 8.509577338238255e-06,
+      "loss": 0.5299,
+      "step": 184
+    },
+    {
+      "epoch": 1.1008527994067483,
+      "grad_norm": 0.47473612794789644,
+      "learning_rate": 8.417188366748051e-06,
+      "loss": 0.5087,
+      "step": 185
+    },
+    {
+      "epoch": 1.10678531701891,
+      "grad_norm": 0.48708620030904176,
+      "learning_rate": 8.324937766952638e-06,
+      "loss": 0.4937,
+      "step": 186
+    },
+    {
+      "epoch": 1.1127178346310715,
+      "grad_norm": 0.503979194337666,
+      "learning_rate": 8.232833603533601e-06,
+      "loss": 0.5107,
+      "step": 187
+    },
+    {
+      "epoch": 1.1186503522432332,
+      "grad_norm": 0.4721181859728738,
+      "learning_rate": 8.140883928370855e-06,
+      "loss": 0.5092,
+      "step": 188
+    },
+    {
+      "epoch": 1.1245828698553948,
+      "grad_norm": 0.47519673374496596,
+      "learning_rate": 8.04909677983872e-06,
+      "loss": 0.5093,
+      "step": 189
+    },
+    {
+      "epoch": 1.1305153874675566,
+      "grad_norm": 0.48532470072787237,
+      "learning_rate": 7.957480182103198e-06,
+      "loss": 0.514,
+      "step": 190
+    },
+    {
+      "epoch": 1.1364479050797183,
+      "grad_norm": 0.5558277109069734,
+      "learning_rate": 7.866042144420502e-06,
+      "loss": 0.5114,
+      "step": 191
+    },
+    {
+      "epoch": 1.14238042269188,
+      "grad_norm": 0.5033467701152651,
+      "learning_rate": 7.774790660436857e-06,
+      "loss": 0.5044,
+      "step": 192
+    },
+    {
+      "epoch": 1.1483129403040415,
+      "grad_norm": 0.5253895227761279,
+      "learning_rate": 7.6837337074897e-06,
+      "loss": 0.528,
+      "step": 193
+    },
+    {
+      "epoch": 1.1542454579162031,
+      "grad_norm": 0.5839549683192119,
+      "learning_rate": 7.592879245910273e-06,
+      "loss": 0.5143,
+      "step": 194
+    },
+    {
+      "epoch": 1.1601779755283648,
+      "grad_norm": 0.5025007441342745,
+      "learning_rate": 7.50223521832773e-06,
+      "loss": 0.5431,
+      "step": 195
+    },
+    {
+      "epoch": 1.1661104931405264,
+      "grad_norm": 0.4459742958112407,
+      "learning_rate": 7.411809548974792e-06,
+      "loss": 0.5164,
+      "step": 196
+    },
+    {
+      "epoch": 1.1720430107526882,
+      "grad_norm": 0.49959227007299145,
+      "learning_rate": 7.321610142994971e-06,
+      "loss": 0.504,
+      "step": 197
+    },
+    {
+      "epoch": 1.1779755283648499,
+      "grad_norm": 0.434451715252235,
+      "learning_rate": 7.2316448857515076e-06,
+      "loss": 0.5146,
+      "step": 198
+    },
+    {
+      "epoch": 1.1839080459770115,
+      "grad_norm": 0.41274799335831514,
+      "learning_rate": 7.141921642138025e-06,
+      "loss": 0.4913,
+      "step": 199
+    },
+    {
+      "epoch": 1.1898405635891731,
+      "grad_norm": 0.4537854495083879,
+      "learning_rate": 7.052448255890958e-06,
+      "loss": 0.5147,
+      "step": 200
+    },
+    {
+      "epoch": 1.1898405635891731,
+      "eval_loss": 0.6126826405525208,
+      "eval_runtime": 6.5201,
+      "eval_samples_per_second": 19.478,
+      "eval_steps_per_second": 2.454,
+      "step": 200
+    },
+    {
+      "epoch": 1.1957730812013347,
+      "grad_norm": 0.46234781827027166,
+      "learning_rate": 6.963232548903853e-06,
+      "loss": 0.4987,
+      "step": 201
+    },
+    {
+      "epoch": 1.2017055988134966,
+      "grad_norm": 0.45869290271578933,
+      "learning_rate": 6.874282320543557e-06,
+      "loss": 0.4966,
+      "step": 202
+    },
+    {
+      "epoch": 1.2076381164256582,
+      "grad_norm": 0.460091111016698,
+      "learning_rate": 6.785605346968387e-06,
+      "loss": 0.5141,
+      "step": 203
+    },
+    {
+      "epoch": 1.2135706340378198,
+      "grad_norm": 0.4117798491221172,
+      "learning_rate": 6.697209380448333e-06,
+      "loss": 0.5058,
+      "step": 204
+    },
+    {
+      "epoch": 1.2195031516499815,
+      "grad_norm": 0.45529452974398477,
+      "learning_rate": 6.609102148687333e-06,
+      "loss": 0.5191,
+      "step": 205
+    },
+    {
+      "epoch": 1.225435669262143,
+      "grad_norm": 0.4406074150768268,
+      "learning_rate": 6.521291354147727e-06,
+      "loss": 0.5158,
+      "step": 206
+    },
+    {
+      "epoch": 1.2313681868743047,
+      "grad_norm": 0.4265027031097921,
+      "learning_rate": 6.43378467337687e-06,
+      "loss": 0.492,
+      "step": 207
+    },
+    {
+      "epoch": 1.2373007044864663,
+      "grad_norm": 0.4092683556762198,
+      "learning_rate": 6.34658975633605e-06,
+      "loss": 0.5293,
+      "step": 208
+    },
+    {
+      "epoch": 1.2432332220986282,
+      "grad_norm": 0.4228580623573835,
+      "learning_rate": 6.2597142257317185e-06,
+      "loss": 0.5081,
+      "step": 209
+    },
+    {
+      "epoch": 1.2491657397107898,
+      "grad_norm": 0.44825386735657413,
+      "learning_rate": 6.173165676349103e-06,
+      "loss": 0.5116,
+      "step": 210
+    },
+    {
+      "epoch": 1.2550982573229514,
+      "grad_norm": 0.41296899816759075,
+      "learning_rate": 6.086951674388252e-06,
+      "loss": 0.5091,
+      "step": 211
+    },
+    {
+      "epoch": 1.261030774935113,
+      "grad_norm": 0.4086723147124949,
+      "learning_rate": 6.001079756802592e-06,
+      "loss": 0.5246,
+      "step": 212
+    },
+    {
+      "epoch": 1.2669632925472747,
+      "grad_norm": 0.3966806792909518,
+      "learning_rate": 5.91555743064004e-06,
+      "loss": 0.4979,
+      "step": 213
+    },
+    {
+      "epoch": 1.2728958101594365,
+      "grad_norm": 0.39802032371072127,
+      "learning_rate": 5.830392172386723e-06,
+      "loss": 0.5107,
+      "step": 214
+    },
+    {
+      "epoch": 1.2788283277715982,
+      "grad_norm": 0.4164302108778062,
+      "learning_rate": 5.745591427313365e-06,
+      "loss": 0.5052,
+      "step": 215
+    },
+    {
+      "epoch": 1.2847608453837598,
+      "grad_norm": 0.4289140325941008,
+      "learning_rate": 5.66116260882442e-06,
+      "loss": 0.5002,
+      "step": 216
+    },
+    {
+      "epoch": 1.2906933629959214,
+      "grad_norm": 0.3815632104177572,
+      "learning_rate": 5.5771130978099896e-06,
+      "loss": 0.4984,
+      "step": 217
+    },
+    {
+      "epoch": 1.296625880608083,
+      "grad_norm": 0.43125912042578773,
+      "learning_rate": 5.493450242000546e-06,
+      "loss": 0.512,
+      "step": 218
+    },
+    {
+      "epoch": 1.3025583982202447,
+      "grad_norm": 0.36576046723610733,
+      "learning_rate": 5.410181355324622e-06,
+      "loss": 0.5028,
+      "step": 219
+    },
+    {
+      "epoch": 1.3084909158324063,
+      "grad_norm": 0.41196608301028825,
+      "learning_rate": 5.32731371726938e-06,
+      "loss": 0.5153,
+      "step": 220
+    },
+    {
+      "epoch": 1.314423433444568,
+      "grad_norm": 0.4106672248690276,
+      "learning_rate": 5.244854572244249e-06,
+      "loss": 0.4975,
+      "step": 221
+    },
+    {
+      "epoch": 1.3203559510567298,
+      "grad_norm": 0.407121465820589,
+      "learning_rate": 5.1628111289476025e-06,
+      "loss": 0.526,
+      "step": 222
+    },
+    {
+      "epoch": 1.3262884686688914,
+      "grad_norm": 0.36895605030103884,
+      "learning_rate": 5.081190559736569e-06,
+      "loss": 0.4965,
+      "step": 223
+    },
+    {
+      "epoch": 1.332220986281053,
+      "grad_norm": 0.39832627810457305,
+      "learning_rate": 5.000000000000003e-06,
+      "loss": 0.5171,
+      "step": 224
+    },
+    {
+      "epoch": 1.3381535038932146,
+      "grad_norm": 0.4133707378550398,
+      "learning_rate": 4.919246547534709e-06,
+      "loss": 0.4996,
+      "step": 225
+    },
+    {
+      "epoch": 1.3440860215053765,
+      "grad_norm": 0.35880827901259854,
+      "learning_rate": 4.838937261924933e-06,
+      "loss": 0.5026,
+      "step": 226
+    },
+    {
+      "epoch": 1.350018539117538,
+      "grad_norm": 0.4229197427258153,
+      "learning_rate": 4.759079163925223e-06,
+      "loss": 0.5132,
+      "step": 227
+    },
+    {
+      "epoch": 1.3559510567296997,
+      "grad_norm": 0.4347343479516533,
+      "learning_rate": 4.679679234846636e-06,
+      "loss": 0.5124,
+      "step": 228
+    },
+    {
+      "epoch": 1.3618835743418614,
+      "grad_norm": 0.3927261283383854,
+      "learning_rate": 4.600744415946438e-06,
+      "loss": 0.4997,
+      "step": 229
+    },
+    {
+      "epoch": 1.367816091954023,
+      "grad_norm": 0.39262400622403115,
+      "learning_rate": 4.522281607821288e-06,
+      "loss": 0.4967,
+      "step": 230
+    },
+    {
+      "epoch": 1.3737486095661846,
+      "grad_norm": 0.4223266363182741,
+      "learning_rate": 4.444297669803981e-06,
+      "loss": 0.4837,
+      "step": 231
+    },
+    {
+      "epoch": 1.3796811271783462,
+      "grad_norm": 0.4490330791545468,
+      "learning_rate": 4.3667994193637794e-06,
+      "loss": 0.5067,
+      "step": 232
+    },
+    {
+      "epoch": 1.3856136447905079,
+      "grad_norm": 0.39266902412489757,
+      "learning_rate": 4.289793631510449e-06,
+      "loss": 0.5253,
+      "step": 233
+    },
+    {
+      "epoch": 1.3915461624026697,
+      "grad_norm": 0.4576356901813089,
+      "learning_rate": 4.213287038201943e-06,
+      "loss": 0.5094,
+      "step": 234
+    },
+    {
+      "epoch": 1.3974786800148313,
+      "grad_norm": 0.43913157401185643,
+      "learning_rate": 4.137286327755913e-06,
+      "loss": 0.5074,
+      "step": 235
+    },
+    {
+      "epoch": 1.403411197626993,
+      "grad_norm": 0.3800958684968872,
+      "learning_rate": 4.061798144264986e-06,
+      "loss": 0.5161,
+      "step": 236
+    },
+    {
+      "epoch": 1.4093437152391546,
+      "grad_norm": 0.3966235644138789,
+      "learning_rate": 3.986829087015941e-06,
+      "loss": 0.5135,
+      "step": 237
+    },
+    {
+      "epoch": 1.4152762328513162,
+      "grad_norm": 0.39361049332745907,
+      "learning_rate": 3.912385709912794e-06,
+      "loss": 0.5043,
+      "step": 238
+    },
+    {
+      "epoch": 1.421208750463478,
+      "grad_norm": 0.42148715667996,
+      "learning_rate": 3.838474520903825e-06,
+      "loss": 0.4986,
+      "step": 239
+    },
+    {
+      "epoch": 1.4271412680756397,
+      "grad_norm": 0.39414223898095996,
+      "learning_rate": 3.7651019814126656e-06,
+      "loss": 0.5216,
+      "step": 240
+    },
+    {
+      "epoch": 1.4330737856878013,
+      "grad_norm": 0.37253637577722365,
+      "learning_rate": 3.692274505773419e-06,
+      "loss": 0.5253,
+      "step": 241
+    },
+    {
+      "epoch": 1.439006303299963,
+      "grad_norm": 0.41704046440564047,
+      "learning_rate": 3.619998460669916e-06,
+      "loss": 0.5364,
+      "step": 242
+    },
+    {
+      "epoch": 1.4449388209121246,
+      "grad_norm": 0.3653364189332755,
+      "learning_rate": 3.5482801645791266e-06,
+      "loss": 0.4916,
+      "step": 243
+    },
+    {
+      "epoch": 1.4508713385242862,
+      "grad_norm": 0.35349867698633664,
+      "learning_rate": 3.4771258872187917e-06,
+      "loss": 0.5275,
+      "step": 244
+    },
+    {
+      "epoch": 1.4568038561364478,
+      "grad_norm": 0.41828803136166764,
+      "learning_rate": 3.4065418489993118e-06,
+      "loss": 0.5139,
+      "step": 245
+    },
+    {
+      "epoch": 1.4627363737486094,
+      "grad_norm": 0.35821753699509107,
+      "learning_rate": 3.3365342204799613e-06,
+      "loss": 0.4831,
+      "step": 246
+    },
+    {
+      "epoch": 1.4686688913607713,
+      "grad_norm": 0.38538972239171304,
+      "learning_rate": 3.267109121829428e-06,
+      "loss": 0.5177,
+      "step": 247
+    },
+    {
+      "epoch": 1.474601408972933,
+      "grad_norm": 0.36434237943436254,
+      "learning_rate": 3.1982726222908046e-06,
+      "loss": 0.5134,
+      "step": 248
+    },
+    {
+      "epoch": 1.4805339265850945,
+      "grad_norm": 0.3706860313727835,
+      "learning_rate": 3.1300307396509833e-06,
+      "loss": 0.508,
+      "step": 249
+    },
+    {
+      "epoch": 1.4864664441972562,
+      "grad_norm": 0.38985446887032604,
+      "learning_rate": 3.0623894397145837e-06,
+      "loss": 0.5026,
+      "step": 250
+    },
+    {
+      "epoch": 1.492398961809418,
+      "grad_norm": 0.3921469360946988,
+      "learning_rate": 2.995354635782417e-06,
+      "loss": 0.5165,
+      "step": 251
+    },
+    {
+      "epoch": 1.4983314794215796,
+      "grad_norm": 0.35486752662951043,
+      "learning_rate": 2.9289321881345257e-06,
+      "loss": 0.5148,
+      "step": 252
+    },
+    {
+      "epoch": 1.5042639970337413,
+      "grad_norm": 0.3454065133594964,
+      "learning_rate": 2.8631279035178796e-06,
+      "loss": 0.4874,
+      "step": 253
+    },
+    {
+      "epoch": 1.5101965146459029,
+      "grad_norm": 0.3930097863801726,
+      "learning_rate": 2.7979475346387363e-06,
+      "loss": 0.4974,
+      "step": 254
+    },
+    {
+      "epoch": 1.5161290322580645,
+      "grad_norm": 0.3932165819653655,
+      "learning_rate": 2.7333967796597317e-06,
+      "loss": 0.5134,
+      "step": 255
+    },
+    {
+      "epoch": 1.5220615498702261,
+      "grad_norm": 0.3320234783166823,
+      "learning_rate": 2.669481281701739e-06,
+      "loss": 0.5121,
+      "step": 256
+    },
+    {
+      "epoch": 1.5279940674823878,
+      "grad_norm": 0.3535467409588918,
+      "learning_rate": 2.6062066283505404e-06,
+      "loss": 0.5111,
+      "step": 257
+    },
+    {
+      "epoch": 1.5339265850945494,
+      "grad_norm": 0.3770042051140451,
+      "learning_rate": 2.5435783511683444e-06,
+      "loss": 0.4873,
+      "step": 258
+    },
+    {
+      "epoch": 1.539859102706711,
+      "grad_norm": 0.367146329985044,
+      "learning_rate": 2.4816019252102274e-06,
+      "loss": 0.5083,
+      "step": 259
+    },
+    {
+      "epoch": 1.5457916203188728,
+      "grad_norm": 0.3394325003492519,
+      "learning_rate": 2.420282768545469e-06,
+      "loss": 0.4824,
+      "step": 260
+    },
+    {
+      "epoch": 1.5517241379310345,
+      "grad_norm": 0.3866827025758643,
+      "learning_rate": 2.3596262417839256e-06,
+      "loss": 0.507,
+      "step": 261
+    },
+    {
+      "epoch": 1.557656655543196,
+      "grad_norm": 0.33725647495532407,
+      "learning_rate": 2.2996376476073724e-06,
+      "loss": 0.4927,
+      "step": 262
+    },
+    {
+      "epoch": 1.563589173155358,
+      "grad_norm": 0.34863076052644143,
+      "learning_rate": 2.240322230305951e-06,
+      "loss": 0.4967,
+      "step": 263
+    },
+    {
+      "epoch": 1.5695216907675196,
+      "grad_norm": 0.3588480566227521,
+      "learning_rate": 2.1816851753197023e-06,
+      "loss": 0.4969,
+      "step": 264
+    },
+    {
+      "epoch": 1.5754542083796812,
+      "grad_norm": 0.339836588579076,
+      "learning_rate": 2.1237316087852465e-06,
+      "loss": 0.5104,
+      "step": 265
+    },
+    {
+      "epoch": 1.5813867259918428,
+      "grad_norm": 0.3549261249693355,
+      "learning_rate": 2.0664665970876496e-06,
+      "loss": 0.5063,
+      "step": 266
+    },
+    {
+      "epoch": 1.5873192436040044,
+      "grad_norm": 0.3776247577793457,
+      "learning_rate": 2.009895146417512e-06,
+      "loss": 0.5007,
+      "step": 267
+    },
+    {
+      "epoch": 1.593251761216166,
+      "grad_norm": 0.34878518596843994,
+      "learning_rate": 1.9540222023333165e-06,
+      "loss": 0.477,
+      "step": 268
+    },
+    {
+      "epoch": 1.5991842788283277,
+      "grad_norm": 0.3347286639188288,
+      "learning_rate": 1.8988526493290948e-06,
+      "loss": 0.4978,
+      "step": 269
+    },
+    {
+      "epoch": 1.6051167964404893,
+      "grad_norm": 0.3697740517807442,
+      "learning_rate": 1.8443913104073984e-06,
+      "loss": 0.5155,
+      "step": 270
+    },
+    {
+      "epoch": 1.611049314052651,
+      "grad_norm": 0.34233058020715335,
+      "learning_rate": 1.7906429466576768e-06,
+      "loss": 0.5012,
+      "step": 271
+    },
+    {
+      "epoch": 1.6169818316648128,
+      "grad_norm": 0.3415900695134301,
+      "learning_rate": 1.7376122568400533e-06,
+      "loss": 0.4992,
+      "step": 272
+    },
+    {
+      "epoch": 1.6229143492769744,
+      "grad_norm": 0.3415274161298902,
+      "learning_rate": 1.6853038769745466e-06,
+      "loss": 0.4859,
+      "step": 273
+    },
+    {
+      "epoch": 1.628846866889136,
+      "grad_norm": 0.36778032976021363,
+      "learning_rate": 1.6337223799358025e-06,
+      "loss": 0.4846,
+      "step": 274
+    },
+    {
+      "epoch": 1.634779384501298,
+      "grad_norm": 0.38185068578401754,
+      "learning_rate": 1.582872275053301e-06,
+      "loss": 0.5154,
+      "step": 275
+    },
+    {
+      "epoch": 1.6407119021134595,
+      "grad_norm": 0.33140851685955686,
+      "learning_rate": 1.5327580077171589e-06,
+      "loss": 0.5011,
+      "step": 276
+    },
+    {
+      "epoch": 1.6466444197256211,
+      "grad_norm": 0.36885492758442245,
+      "learning_rate": 1.4833839589895072e-06,
+      "loss": 0.5092,
+      "step": 277
+    },
+    {
+      "epoch": 1.6525769373377828,
+      "grad_norm": 0.36549072606819394,
+      "learning_rate": 1.4347544452214869e-06,
+      "loss": 0.5252,
+      "step": 278
+    },
+    {
+      "epoch": 1.6585094549499444,
+      "grad_norm": 0.36738791918152014,
+      "learning_rate": 1.3868737176759105e-06,
+      "loss": 0.5281,
+      "step": 279
+    },
+    {
+      "epoch": 1.664441972562106,
+      "grad_norm": 0.326412556386957,
+      "learning_rate": 1.339745962155613e-06,
+      "loss": 0.5045,
+      "step": 280
+    },
+    {
+      "epoch": 1.6703744901742676,
+      "grad_norm": 0.3415508112126587,
+      "learning_rate": 1.293375298637518e-06,
+      "loss": 0.4895,
+      "step": 281
+    },
+    {
+      "epoch": 1.6763070077864293,
+      "grad_norm": 0.34722556354132184,
+      "learning_rate": 1.2477657809124632e-06,
+      "loss": 0.4946,
+      "step": 282
+    },
+    {
+      "epoch": 1.682239525398591,
+      "grad_norm": 0.3310077749742758,
+      "learning_rate": 1.2029213962308172e-06,
+      "loss": 0.5023,
+      "step": 283
+    },
+    {
+      "epoch": 1.6881720430107527,
+      "grad_norm": 0.3348390877359583,
+      "learning_rate": 1.1588460649539036e-06,
+      "loss": 0.4781,
+      "step": 284
+    },
+    {
+      "epoch": 1.6941045606229144,
+      "grad_norm": 0.34619840766115634,
+      "learning_rate": 1.1155436402112785e-06,
+      "loss": 0.5277,
+      "step": 285
+    },
+    {
+      "epoch": 1.700037078235076,
+      "grad_norm": 0.33543629443771605,
+      "learning_rate": 1.073017907563887e-06,
+      "loss": 0.5147,
+      "step": 286
+    },
+    {
+      "epoch": 1.7059695958472376,
+      "grad_norm": 0.34460699457893246,
+      "learning_rate": 1.0312725846731174e-06,
+      "loss": 0.5109,
+      "step": 287
+    },
+    {
+      "epoch": 1.7119021134593995,
+      "grad_norm": 0.363368163191836,
+      "learning_rate": 9.903113209758098e-07,
+      "loss": 0.4809,
+      "step": 288
+    },
+    {
+      "epoch": 1.717834631071561,
+      "grad_norm": 0.34297466357856504,
+      "learning_rate": 9.501376973651999e-07,
+      "loss": 0.495,
+      "step": 289
+    },
+    {
+      "epoch": 1.7237671486837227,
+      "grad_norm": 0.35425940666511757,
+      "learning_rate": 9.107552258778907e-07,
+      "loss": 0.5322,
+      "step": 290
+    },
+    {
+      "epoch": 1.7296996662958843,
+      "grad_norm": 0.37065578497405705,
+      "learning_rate": 8.721673493868111e-07,
+      "loss": 0.5168,
+      "step": 291
+    },
+    {
+      "epoch": 1.735632183908046,
+      "grad_norm": 0.34680893408384794,
+      "learning_rate": 8.343774413002382e-07,
+      "loss": 0.5024,
+      "step": 292
+    },
+    {
+      "epoch": 1.7415647015202076,
+      "grad_norm": 0.34059992527417754,
+      "learning_rate": 7.973888052668943e-07,
+      "loss": 0.5185,
+      "step": 293
+    },
+    {
+      "epoch": 1.7474972191323692,
+      "grad_norm": 0.35068499752190946,
+      "learning_rate": 7.612046748871327e-07,
+      "loss": 0.5142,
+      "step": 294
+    },
+    {
+      "epoch": 1.7534297367445308,
+      "grad_norm": 0.3518003616621483,
+      "learning_rate": 7.258282134302519e-07,
+      "loss": 0.5055,
+      "step": 295
+    },
+    {
+      "epoch": 1.7593622543566925,
+      "grad_norm": 0.343801973358475,
+      "learning_rate": 6.912625135579587e-07,
+      "loss": 0.4978,
+      "step": 296
+    },
+    {
+      "epoch": 1.7652947719688543,
+      "grad_norm": 0.3568376912330781,
+      "learning_rate": 6.57510597054003e-07,
+      "loss": 0.5359,
+      "step": 297
+    },
+    {
+      "epoch": 1.771227289581016,
+      "grad_norm": 0.33791018323113514,
+      "learning_rate": 6.245754145600091e-07,
+      "loss": 0.5025,
+      "step": 298
+    },
+    {
+      "epoch": 1.7771598071931776,
+      "grad_norm": 0.33742679787508273,
+      "learning_rate": 5.924598453175278e-07,
+      "loss": 0.4985,
+      "step": 299
+    },
+    {
+      "epoch": 1.7830923248053394,
+      "grad_norm": 0.33858341861628327,
+      "learning_rate": 5.611666969163243e-07,
+      "loss": 0.513,
+      "step": 300
+    },
+    {
+      "epoch": 1.7830923248053394,
+      "eval_loss": 0.6072779297828674,
+      "eval_runtime": 6.923,
+      "eval_samples_per_second": 18.345,
+      "eval_steps_per_second": 2.311,
+      "step": 300
+    },
+    {
+      "epoch": 1.789024842417501,
+      "grad_norm": 0.35027770808668635,
+      "learning_rate": 5.306987050489442e-07,
+      "loss": 0.5016,
+      "step": 301
+    },
+    {
+      "epoch": 1.7949573600296627,
+      "grad_norm": 0.33886481172097505,
+      "learning_rate": 5.010585332715401e-07,
+      "loss": 0.5075,
+      "step": 302
+    },
+    {
+      "epoch": 1.8008898776418243,
+      "grad_norm": 0.361905746068308,
+      "learning_rate": 4.7224877277103673e-07,
+      "loss": 0.5092,
+      "step": 303
+    },
+    {
+      "epoch": 1.806822395253986,
+      "grad_norm": 0.3347249716755872,
+      "learning_rate": 4.4427194213859216e-07,
+      "loss": 0.5221,
+      "step": 304
+    },
+    {
+      "epoch": 1.8127549128661475,
+      "grad_norm": 0.3384027231629407,
+      "learning_rate": 4.171304871494264e-07,
+      "loss": 0.5164,
+      "step": 305
+    },
+    {
+      "epoch": 1.8186874304783092,
+      "grad_norm": 0.3304521483560128,
+      "learning_rate": 3.908267805490051e-07,
+      "loss": 0.51,
+      "step": 306
+    },
+    {
+      "epoch": 1.8246199480904708,
+      "grad_norm": 0.32865263259804217,
+      "learning_rate": 3.6536312184560996e-07,
+      "loss": 0.5185,
+      "step": 307
+    },
+    {
+      "epoch": 1.8305524657026324,
+      "grad_norm": 0.3193148808383708,
+      "learning_rate": 3.4074173710931804e-07,
+      "loss": 0.5031,
+      "step": 308
+    },
+    {
+      "epoch": 1.8364849833147943,
+      "grad_norm": 0.32050879286411965,
+      "learning_rate": 3.1696477877738664e-07,
+      "loss": 0.5087,
+      "step": 309
+    },
+    {
+      "epoch": 1.8424175009269559,
+      "grad_norm": 0.3442859949879679,
+      "learning_rate": 2.940343254660905e-07,
+      "loss": 0.517,
+      "step": 310
+    },
+    {
+      "epoch": 1.8483500185391175,
+      "grad_norm": 0.3273773858356578,
+      "learning_rate": 2.7195238178900685e-07,
+      "loss": 0.4948,
+      "step": 311
+    },
+    {
+      "epoch": 1.8542825361512794,
+      "grad_norm": 0.33321328950736606,
+      "learning_rate": 2.507208781817638e-07,
+      "loss": 0.5236,
+      "step": 312
+    },
+    {
+      "epoch": 1.860215053763441,
+      "grad_norm": 0.3386415168984448,
+      "learning_rate": 2.3034167073328283e-07,
+      "loss": 0.5116,
+      "step": 313
+    },
+    {
+      "epoch": 1.8661475713756026,
+      "grad_norm": 0.32943898290159496,
+      "learning_rate": 2.1081654102351634e-07,
+      "loss": 0.4981,
+      "step": 314
+    },
+    {
+      "epoch": 1.8720800889877642,
+      "grad_norm": 0.31450631617292496,
+      "learning_rate": 1.921471959676957e-07,
+      "loss": 0.4972,
+      "step": 315
+    },
+    {
+      "epoch": 1.8780126065999259,
+      "grad_norm": 0.33778423189464707,
+      "learning_rate": 1.7433526766711727e-07,
+      "loss": 0.5025,
+      "step": 316
+    },
+    {
+      "epoch": 1.8839451242120875,
+      "grad_norm": 0.32927984851113423,
+      "learning_rate": 1.5738231326645758e-07,
+      "loss": 0.5154,
+      "step": 317
+    },
+    {
+      "epoch": 1.889877641824249,
+      "grad_norm": 0.33696222369048556,
+      "learning_rate": 1.4128981481764115e-07,
+      "loss": 0.4804,
+      "step": 318
+    },
+    {
+      "epoch": 1.8958101594364107,
+      "grad_norm": 0.33119042978822105,
+      "learning_rate": 1.2605917915028743e-07,
+      "loss": 0.5099,
+      "step": 319
+    },
+    {
+      "epoch": 1.9017426770485724,
+      "grad_norm": 0.3369916299734094,
+      "learning_rate": 1.1169173774871478e-07,
+      "loss": 0.5009,
+      "step": 320
+    },
+    {
+      "epoch": 1.907675194660734,
+      "grad_norm": 0.32908222754161875,
+      "learning_rate": 9.818874663554356e-08,
+      "loss": 0.5085,
+      "step": 321
+    },
+    {
+      "epoch": 1.9136077122728958,
+      "grad_norm": 0.3305867681142081,
+      "learning_rate": 8.555138626189619e-08,
+      "loss": 0.4843,
+      "step": 322
+    },
+    {
+      "epoch": 1.9195402298850575,
+      "grad_norm": 0.338494816075747,
+      "learning_rate": 7.378076140419188e-08,
+      "loss": 0.5247,
+      "step": 323
+    },
+    {
+      "epoch": 1.925472747497219,
+      "grad_norm": 0.3464955361533102,
+      "learning_rate": 6.287790106757396e-08,
+      "loss": 0.4982,
+      "step": 324
+    },
+    {
+      "epoch": 1.931405265109381,
+      "grad_norm": 0.32358285120905184,
+      "learning_rate": 5.284375839594958e-08,
+      "loss": 0.5097,
+      "step": 325
+    },
+    {
+      "epoch": 1.9373377827215426,
+      "grad_norm": 0.32591557161362456,
+      "learning_rate": 4.367921058866187e-08,
+      "loss": 0.4852,
+      "step": 326
+    },
+    {
+      "epoch": 1.9432703003337042,
+      "grad_norm": 0.3114294363115202,
+      "learning_rate": 3.538505882380916e-08,
+      "loss": 0.482,
+      "step": 327
+    },
+    {
+      "epoch": 1.9492028179458658,
+      "grad_norm": 0.3149973184867936,
+      "learning_rate": 2.796202818819871e-08,
+      "loss": 0.4844,
+      "step": 328
+    },
+    {
+      "epoch": 1.9551353355580274,
+      "grad_norm": 0.3452282568361082,
+      "learning_rate": 2.1410767613965212e-08,
+      "loss": 0.5069,
+      "step": 329
+    },
+    {
+      "epoch": 1.961067853170189,
+      "grad_norm": 0.347397146126861,
+      "learning_rate": 1.5731849821833955e-08,
+      "loss": 0.5167,
+      "step": 330
+    },
+    {
+      "epoch": 1.9670003707823507,
+      "grad_norm": 0.32204047912115363,
+      "learning_rate": 1.0925771271058649e-08,
+      "loss": 0.4928,
+      "step": 331
+    },
+    {
+      "epoch": 1.9729328883945123,
+      "grad_norm": 0.3287263087544649,
+      "learning_rate": 6.992952116013918e-09,
+      "loss": 0.4838,
+      "step": 332
+    },
+    {
+      "epoch": 1.978865406006674,
+      "grad_norm": 0.3269223168333791,
+      "learning_rate": 3.933736169471347e-09,
+      "loss": 0.5086,
+      "step": 333
+    },
+    {
+      "epoch": 1.9847979236188358,
+      "grad_norm": 0.3469969747818369,
+      "learning_rate": 1.7483908725357546e-09,
+      "loss": 0.5039,
+      "step": 334
+    },
+    {
+      "epoch": 1.9907304412309974,
+      "grad_norm": 0.3226354438244203,
+      "learning_rate": 4.3710727127277417e-10,
+      "loss": 0.5337,
+      "step": 335
+    },
+    {
+      "epoch": 1.996662958843159,
+      "grad_norm": 0.3421484877624497,
+      "learning_rate": 0.0,
+      "loss": 0.5223,
+      "step": 336
+    },
+    {
+      "epoch": 1.996662958843159,
+      "step": 336,
+      "total_flos": 352364519227392.0,
+      "train_loss": 0.5676420692886625,
+      "train_runtime": 10418.0052,
+      "train_samples_per_second": 4.141,
+      "train_steps_per_second": 0.032
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 336,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 352364519227392.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}