ambermanijha committed on Jul 22

Commit

7701f44

verified ·

1 Parent(s): c2d1f62

Upload folder using huggingface_hub

Browse files

Files changed (23) hide show

checkpoint-10000/config.json +29 -0
checkpoint-10000/generation_config.json +7 -0
checkpoint-10000/model.safetensors +3 -0
checkpoint-10000/optimizer.pt +3 -0
checkpoint-10000/rng_state.pth +3 -0
checkpoint-10000/scheduler.pt +3 -0
checkpoint-10000/trainer_state.json +733 -0
checkpoint-10000/training_args.bin +3 -0
checkpoint-5000/config.json +29 -0
checkpoint-5000/generation_config.json +7 -0
checkpoint-5000/model.safetensors +3 -0
checkpoint-5000/optimizer.pt +3 -0
checkpoint-5000/rng_state.pth +3 -0
checkpoint-5000/scheduler.pt +3 -0
checkpoint-5000/trainer_state.json +383 -0
checkpoint-5000/training_args.bin +3 -0
config.json +29 -0
generation_config.json +7 -0
model.safetensors +3 -0
special_tokens_map.json +30 -0
tokenizer.json +0 -0
tokenizer_config.json +44 -0
training_args.bin +3 -0

checkpoint-10000/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "relu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 6,
+  "num_heads": 8,
+  "num_layers": 6,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-10000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.49.0"
+}

checkpoint-10000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b89923a516924762c2de248dacf4862b0e13476a487390a50a81cb29fc1bfe2
+size 278643752

checkpoint-10000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adb9858cb4b21765f0075a1542d9d3b1aac139fcff68be457f47f1ad7e1a1578
+size 557367226

checkpoint-10000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5a18679862da768a40e8967bb0dec5995b07cd29a8a6dc7221003e0843572bb
+size 14244

checkpoint-10000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:23bacfc9e0420908692bc36f206e544fd539ad9548d49f18ad4437c98828bc0f
+size 1064

checkpoint-10000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,733 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 10000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6850011944770813,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 2.7276,
+      "step": 100
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.3513778448104858,
+      "learning_rate": 4.9e-05,
+      "loss": 2.2015,
+      "step": 200
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.4596431255340576,
+      "learning_rate": 4.85e-05,
+      "loss": 2.1439,
+      "step": 300
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4662152528762817,
+      "learning_rate": 4.8e-05,
+      "loss": 2.0443,
+      "step": 400
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.5642160177230835,
+      "learning_rate": 4.75e-05,
+      "loss": 2.041,
+      "step": 500
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.5689924955368042,
+      "learning_rate": 4.7e-05,
+      "loss": 1.8782,
+      "step": 600
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.754616141319275,
+      "learning_rate": 4.6500000000000005e-05,
+      "loss": 1.8376,
+      "step": 700
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.4754304885864258,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.9273,
+      "step": 800
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.6210129261016846,
+      "learning_rate": 4.55e-05,
+      "loss": 1.8568,
+      "step": 900
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.5406711101531982,
+      "learning_rate": 4.5e-05,
+      "loss": 1.7329,
+      "step": 1000
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.5473151206970215,
+      "learning_rate": 4.4500000000000004e-05,
+      "loss": 1.6887,
+      "step": 1100
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.7042499780654907,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.6909,
+      "step": 1200
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.6643552780151367,
+      "learning_rate": 4.35e-05,
+      "loss": 1.7344,
+      "step": 1300
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.515936255455017,
+      "learning_rate": 4.3e-05,
+      "loss": 1.6402,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.430012583732605,
+      "learning_rate": 4.25e-05,
+      "loss": 1.595,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.9956995248794556,
+      "learning_rate": 4.2e-05,
+      "loss": 1.5611,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.8397706747055054,
+      "learning_rate": 4.15e-05,
+      "loss": 1.6238,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.8368654251098633,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5441,
+      "step": 1800
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.6586986780166626,
+      "learning_rate": 4.05e-05,
+      "loss": 1.5188,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.7311187982559204,
+      "learning_rate": 4e-05,
+      "loss": 1.516,
+      "step": 2000
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.6303811073303223,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.4869,
+      "step": 2100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.519205927848816,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.4699,
+      "step": 2200
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.656671404838562,
+      "learning_rate": 3.85e-05,
+      "loss": 1.4298,
+      "step": 2300
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.5942782163619995,
+      "learning_rate": 3.8e-05,
+      "loss": 1.4659,
+      "step": 2400
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.5101336240768433,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 1.5001,
+      "step": 2500
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.61963951587677,
+      "learning_rate": 3.7e-05,
+      "loss": 1.4175,
+      "step": 2600
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.7214901447296143,
+      "learning_rate": 3.65e-05,
+      "loss": 1.402,
+      "step": 2700
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.4414921998977661,
+      "learning_rate": 3.6e-05,
+      "loss": 1.4171,
+      "step": 2800
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.7572053670883179,
+      "learning_rate": 3.55e-05,
+      "loss": 1.3965,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.837003469467163,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3753,
+      "step": 3000
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.868739128112793,
+      "learning_rate": 3.45e-05,
+      "loss": 1.3463,
+      "step": 3100
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0180938243865967,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.3115,
+      "step": 3200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.5846806764602661,
+      "learning_rate": 3.35e-05,
+      "loss": 1.3008,
+      "step": 3300
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.7848007678985596,
+      "learning_rate": 3.3e-05,
+      "loss": 1.3129,
+      "step": 3400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.7048262357711792,
+      "learning_rate": 3.2500000000000004e-05,
+      "loss": 1.3294,
+      "step": 3500
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9426958560943604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.2896,
+      "step": 3600
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.048593521118164,
+      "learning_rate": 3.15e-05,
+      "loss": 1.2755,
+      "step": 3700
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.6056958436965942,
+      "learning_rate": 3.1e-05,
+      "loss": 1.2465,
+      "step": 3800
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.5590567588806152,
+      "learning_rate": 3.05e-05,
+      "loss": 1.2514,
+      "step": 3900
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8030258417129517,
+      "learning_rate": 3e-05,
+      "loss": 1.2493,
+      "step": 4000
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.7520445585250854,
+      "learning_rate": 2.95e-05,
+      "loss": 1.2235,
+      "step": 4100
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.6477694511413574,
+      "learning_rate": 2.9e-05,
+      "loss": 1.2036,
+      "step": 4200
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.046046257019043,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 1.2175,
+      "step": 4300
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.769077181816101,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.1884,
+      "step": 4400
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0415198802948,
+      "learning_rate": 2.7500000000000004e-05,
+      "loss": 1.2058,
+      "step": 4500
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9062659740447998,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.1946,
+      "step": 4600
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0025410652160645,
+      "learning_rate": 2.6500000000000004e-05,
+      "loss": 1.1952,
+      "step": 4700
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.717053771018982,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.156,
+      "step": 4800
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.6754528284072876,
+      "learning_rate": 2.5500000000000003e-05,
+      "loss": 1.1314,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.3899461030960083,
+      "learning_rate": 2.5e-05,
+      "loss": 1.1576,
+      "step": 5000
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.6504240036010742,
+      "learning_rate": 2.45e-05,
+      "loss": 1.1773,
+      "step": 5100
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 1.95234215259552,
+      "learning_rate": 2.4e-05,
+      "loss": 1.1109,
+      "step": 5200
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.9345238208770752,
+      "learning_rate": 2.35e-05,
+      "loss": 1.1648,
+      "step": 5300
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 1.8829147815704346,
+      "learning_rate": 2.3000000000000003e-05,
+      "loss": 1.1723,
+      "step": 5400
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.8588093519210815,
+      "learning_rate": 2.25e-05,
+      "loss": 1.1159,
+      "step": 5500
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.7027171850204468,
+      "learning_rate": 2.2000000000000003e-05,
+      "loss": 1.0798,
+      "step": 5600
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.7587237358093262,
+      "learning_rate": 2.15e-05,
+      "loss": 1.1179,
+      "step": 5700
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 2.0505311489105225,
+      "learning_rate": 2.1e-05,
+      "loss": 1.1142,
+      "step": 5800
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 2.03979229927063,
+      "learning_rate": 2.05e-05,
+      "loss": 1.1376,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 1.6683259010314941,
+      "learning_rate": 2e-05,
+      "loss": 1.1018,
+      "step": 6000
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 2.0077784061431885,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 1.1112,
+      "step": 6100
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 2.2150514125823975,
+      "learning_rate": 1.9e-05,
+      "loss": 1.0573,
+      "step": 6200
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 1.886559247970581,
+      "learning_rate": 1.85e-05,
+      "loss": 1.0841,
+      "step": 6300
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 2.1064834594726562,
+      "learning_rate": 1.8e-05,
+      "loss": 1.0877,
+      "step": 6400
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.7919803857803345,
+      "learning_rate": 1.75e-05,
+      "loss": 1.1124,
+      "step": 6500
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 1.6519834995269775,
+      "learning_rate": 1.7000000000000003e-05,
+      "loss": 1.0847,
+      "step": 6600
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 2.4856765270233154,
+      "learning_rate": 1.65e-05,
+      "loss": 1.0531,
+      "step": 6700
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 2.2001657485961914,
+      "learning_rate": 1.6000000000000003e-05,
+      "loss": 1.0623,
+      "step": 6800
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 1.8861461877822876,
+      "learning_rate": 1.55e-05,
+      "loss": 1.066,
+      "step": 6900
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 2.00785756111145,
+      "learning_rate": 1.5e-05,
+      "loss": 1.0618,
+      "step": 7000
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 2.0481674671173096,
+      "learning_rate": 1.45e-05,
+      "loss": 1.0417,
+      "step": 7100
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.885738492012024,
+      "learning_rate": 1.4000000000000001e-05,
+      "loss": 1.0537,
+      "step": 7200
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 2.0609591007232666,
+      "learning_rate": 1.3500000000000001e-05,
+      "loss": 1.0298,
+      "step": 7300
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 1.767872929573059,
+      "learning_rate": 1.3000000000000001e-05,
+      "loss": 1.0427,
+      "step": 7400
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.8356056213378906,
+      "learning_rate": 1.25e-05,
+      "loss": 1.0745,
+      "step": 7500
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 1.8087313175201416,
+      "learning_rate": 1.2e-05,
+      "loss": 1.0031,
+      "step": 7600
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 2.0896551609039307,
+      "learning_rate": 1.1500000000000002e-05,
+      "loss": 1.04,
+      "step": 7700
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 2.0215108394622803,
+      "learning_rate": 1.1000000000000001e-05,
+      "loss": 1.0274,
+      "step": 7800
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.8265337944030762,
+      "learning_rate": 1.05e-05,
+      "loss": 1.0075,
+      "step": 7900
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 1.8919756412506104,
+      "learning_rate": 1e-05,
+      "loss": 1.0197,
+      "step": 8000
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.8041898012161255,
+      "learning_rate": 9.5e-06,
+      "loss": 1.0187,
+      "step": 8100
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 2.347409963607788,
+      "learning_rate": 9e-06,
+      "loss": 1.0439,
+      "step": 8200
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 2.4638195037841797,
+      "learning_rate": 8.500000000000002e-06,
+      "loss": 1.0234,
+      "step": 8300
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 2.1736338138580322,
+      "learning_rate": 8.000000000000001e-06,
+      "loss": 1.0265,
+      "step": 8400
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 2.583214282989502,
+      "learning_rate": 7.5e-06,
+      "loss": 1.0163,
+      "step": 8500
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 2.069798707962036,
+      "learning_rate": 7.000000000000001e-06,
+      "loss": 1.0214,
+      "step": 8600
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 2.100463628768921,
+      "learning_rate": 6.5000000000000004e-06,
+      "loss": 1.0216,
+      "step": 8700
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 1.9746285676956177,
+      "learning_rate": 6e-06,
+      "loss": 0.9824,
+      "step": 8800
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 2.1781320571899414,
+      "learning_rate": 5.500000000000001e-06,
+      "loss": 1.022,
+      "step": 8900
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.9690595865249634,
+      "learning_rate": 5e-06,
+      "loss": 0.979,
+      "step": 9000
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 2.073143243789673,
+      "learning_rate": 4.5e-06,
+      "loss": 1.002,
+      "step": 9100
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 2.178834915161133,
+      "learning_rate": 4.000000000000001e-06,
+      "loss": 1.0046,
+      "step": 9200
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 1.87750244140625,
+      "learning_rate": 3.5000000000000004e-06,
+      "loss": 1.0171,
+      "step": 9300
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.8957669734954834,
+      "learning_rate": 3e-06,
+      "loss": 1.004,
+      "step": 9400
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 2.0873405933380127,
+      "learning_rate": 2.5e-06,
+      "loss": 1.0296,
+      "step": 9500
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 1.8929296731948853,
+      "learning_rate": 2.0000000000000003e-06,
+      "loss": 0.981,
+      "step": 9600
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 2.0239217281341553,
+      "learning_rate": 1.5e-06,
+      "loss": 0.9793,
+      "step": 9700
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 1.8227028846740723,
+      "learning_rate": 1.0000000000000002e-06,
+      "loss": 0.9962,
+      "step": 9800
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.9512608051300049,
+      "learning_rate": 5.000000000000001e-07,
+      "loss": 0.982,
+      "step": 9900
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 2.1475226879119873,
+      "learning_rate": 0.0,
+      "loss": 1.0236,
+      "step": 10000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5413672058880000.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-10000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4db73719714726f2f557eac6d22a8fa80f1a235536b2aea4ec20c0b19b14dbe
+size 5432

checkpoint-5000/config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "relu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 6,
+  "num_heads": 8,
+  "num_layers": 6,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 50000
+}

checkpoint-5000/generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.49.0"
+}

checkpoint-5000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4eeb3aac5a4db5e4cdd17be1c73f12e6c2df4585422b306817d7e4a5117cc1ed
+size 278643752

checkpoint-5000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:113f49eabe05f7b39e29b887c1fb8fb80ccac228936098155e55cb5d58517cd8
+size 557367226

checkpoint-5000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8d179c379a56f256ef3d5cc887682b1d2236d40085a156b1779080652f583fc
+size 14244

checkpoint-5000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a78c8c8a4be6cb8ff4b8d2d9805ee7c29168ba8bd60ae33f3adc872613cd729f
+size 1064

checkpoint-5000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,383 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5,
+  "eval_steps": 500,
+  "global_step": 5000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.6850011944770813,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 2.7276,
+      "step": 100
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 1.3513778448104858,
+      "learning_rate": 4.9e-05,
+      "loss": 2.2015,
+      "step": 200
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 1.4596431255340576,
+      "learning_rate": 4.85e-05,
+      "loss": 2.1439,
+      "step": 300
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.4662152528762817,
+      "learning_rate": 4.8e-05,
+      "loss": 2.0443,
+      "step": 400
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 1.5642160177230835,
+      "learning_rate": 4.75e-05,
+      "loss": 2.041,
+      "step": 500
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.5689924955368042,
+      "learning_rate": 4.7e-05,
+      "loss": 1.8782,
+      "step": 600
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 1.754616141319275,
+      "learning_rate": 4.6500000000000005e-05,
+      "loss": 1.8376,
+      "step": 700
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 1.4754304885864258,
+      "learning_rate": 4.600000000000001e-05,
+      "loss": 1.9273,
+      "step": 800
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 1.6210129261016846,
+      "learning_rate": 4.55e-05,
+      "loss": 1.8568,
+      "step": 900
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 1.5406711101531982,
+      "learning_rate": 4.5e-05,
+      "loss": 1.7329,
+      "step": 1000
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 1.5473151206970215,
+      "learning_rate": 4.4500000000000004e-05,
+      "loss": 1.6887,
+      "step": 1100
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 1.7042499780654907,
+      "learning_rate": 4.4000000000000006e-05,
+      "loss": 1.6909,
+      "step": 1200
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 1.6643552780151367,
+      "learning_rate": 4.35e-05,
+      "loss": 1.7344,
+      "step": 1300
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 1.515936255455017,
+      "learning_rate": 4.3e-05,
+      "loss": 1.6402,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 1.430012583732605,
+      "learning_rate": 4.25e-05,
+      "loss": 1.595,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.9956995248794556,
+      "learning_rate": 4.2e-05,
+      "loss": 1.5611,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 1.8397706747055054,
+      "learning_rate": 4.15e-05,
+      "loss": 1.6238,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.8368654251098633,
+      "learning_rate": 4.1e-05,
+      "loss": 1.5441,
+      "step": 1800
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.6586986780166626,
+      "learning_rate": 4.05e-05,
+      "loss": 1.5188,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.7311187982559204,
+      "learning_rate": 4e-05,
+      "loss": 1.516,
+      "step": 2000
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 1.6303811073303223,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.4869,
+      "step": 2100
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.519205927848816,
+      "learning_rate": 3.9000000000000006e-05,
+      "loss": 1.4699,
+      "step": 2200
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 1.656671404838562,
+      "learning_rate": 3.85e-05,
+      "loss": 1.4298,
+      "step": 2300
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 1.5942782163619995,
+      "learning_rate": 3.8e-05,
+      "loss": 1.4659,
+      "step": 2400
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 1.5101336240768433,
+      "learning_rate": 3.7500000000000003e-05,
+      "loss": 1.5001,
+      "step": 2500
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 1.61963951587677,
+      "learning_rate": 3.7e-05,
+      "loss": 1.4175,
+      "step": 2600
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.7214901447296143,
+      "learning_rate": 3.65e-05,
+      "loss": 1.402,
+      "step": 2700
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 1.4414921998977661,
+      "learning_rate": 3.6e-05,
+      "loss": 1.4171,
+      "step": 2800
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 1.7572053670883179,
+      "learning_rate": 3.55e-05,
+      "loss": 1.3965,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 1.837003469467163,
+      "learning_rate": 3.5e-05,
+      "loss": 1.3753,
+      "step": 3000
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 1.868739128112793,
+      "learning_rate": 3.45e-05,
+      "loss": 1.3463,
+      "step": 3100
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 2.0180938243865967,
+      "learning_rate": 3.4000000000000007e-05,
+      "loss": 1.3115,
+      "step": 3200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.5846806764602661,
+      "learning_rate": 3.35e-05,
+      "loss": 1.3008,
+      "step": 3300
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.7848007678985596,
+      "learning_rate": 3.3e-05,
+      "loss": 1.3129,
+      "step": 3400
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.7048262357711792,
+      "learning_rate": 3.2500000000000004e-05,
+      "loss": 1.3294,
+      "step": 3500
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 1.9426958560943604,
+      "learning_rate": 3.2000000000000005e-05,
+      "loss": 1.2896,
+      "step": 3600
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 2.048593521118164,
+      "learning_rate": 3.15e-05,
+      "loss": 1.2755,
+      "step": 3700
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.6056958436965942,
+      "learning_rate": 3.1e-05,
+      "loss": 1.2465,
+      "step": 3800
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.5590567588806152,
+      "learning_rate": 3.05e-05,
+      "loss": 1.2514,
+      "step": 3900
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.8030258417129517,
+      "learning_rate": 3e-05,
+      "loss": 1.2493,
+      "step": 4000
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.7520445585250854,
+      "learning_rate": 2.95e-05,
+      "loss": 1.2235,
+      "step": 4100
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 1.6477694511413574,
+      "learning_rate": 2.9e-05,
+      "loss": 1.2036,
+      "step": 4200
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 2.046046257019043,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 1.2175,
+      "step": 4300
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.769077181816101,
+      "learning_rate": 2.8000000000000003e-05,
+      "loss": 1.1884,
+      "step": 4400
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 2.0415198802948,
+      "learning_rate": 2.7500000000000004e-05,
+      "loss": 1.2058,
+      "step": 4500
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 1.9062659740447998,
+      "learning_rate": 2.7000000000000002e-05,
+      "loss": 1.1946,
+      "step": 4600
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 2.0025410652160645,
+      "learning_rate": 2.6500000000000004e-05,
+      "loss": 1.1952,
+      "step": 4700
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 1.717053771018982,
+      "learning_rate": 2.6000000000000002e-05,
+      "loss": 1.156,
+      "step": 4800
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.6754528284072876,
+      "learning_rate": 2.5500000000000003e-05,
+      "loss": 1.1314,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.3899461030960083,
+      "learning_rate": 2.5e-05,
+      "loss": 1.1576,
+      "step": 5000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 10000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 5000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2706836029440000.0,
+  "train_batch_size": 32,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-5000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4db73719714726f2f557eac6d22a8fa80f1a235536b2aea4ec20c0b19b14dbe
+size 5432

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "classifier_dropout": 0.0,
+  "d_ff": 2048,
+  "d_kv": 64,
+  "d_model": 512,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "relu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "num_decoder_layers": 6,
+  "num_heads": 8,
+  "num_layers": 6,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0",
+  "use_cache": true,
+  "vocab_size": 50000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.49.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b89923a516924762c2de248dacf4862b0e13476a487390a50a81cb29fc1bfe2
+size 278643752

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizer",
+  "unk_token": "<unk>"
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4db73719714726f2f557eac6d22a8fa80f1a235536b2aea4ec20c0b19b14dbe
+size 5432