Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json +53 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json +1 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json +53 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
- Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt +3 -0
- TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt +3 -0
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e9152cd6bbb46fef658f48645ee6e23ea164ca837939dc4c4472075f850cff6
|
| 3 |
+
size 25187350
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6a39fe2b27eb1705884374389839c276e3e277d48903d17482d810f46a368ae
|
| 3 |
+
size 25187350
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b5161845a4ca972062849f1023167b1d4bf37a764a8fe8a602bd782a3fc39be3
|
| 3 |
+
size 25187350
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:218b3717735f0cbc6bc72c5946422136f80d8b29f44ec4061edf648f2409d014
|
| 3 |
+
size 25187350
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d58a7a54c6e79c03326d5b013a815a3b9d2f74970d97dd8858bff2730b464b4c
|
| 3 |
+
size 25187350
|
BatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a3c72811397cdcf999e03d2ee8955d0712917fbd4662c390ef8e3c5a43a4895
|
| 3 |
+
size 25187350
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4942fe15e99f94e489e241702f3604c71e544c763732a0f73ea9d06404ec12e6
|
| 3 |
+
size 25220118
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ecbc76304e9f267e2704d53589c7bce90f07433db6f5b828bad733d36bd358d
|
| 3 |
+
size 25220118
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c40e0430372222dd534492312ef443e7a5f6e1522b8c5c6e3f947169c9da1ebb
|
| 3 |
+
size 25220118
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0a38db0a1a6556f0131ec4c8e9cd7127422ce43147b27e88f371c14de95a5e1b
|
| 3 |
+
size 25220118
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ccb6735797c3de308bc16ca32206b86e11a53193df3b3b7e4b00a4617d9b3e0a
|
| 3 |
+
size 25220118
|
GatedSAE_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:72c630784e70a267b96d3d53bc0bd511be38fe9c4f62ccb1977f97edf4b0ff9a
|
| 3 |
+
size 25220118
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2f5a5fa9af76200111d6217ac3624e53c9becb7e360746e46aa0f5d584312a91
|
| 3 |
+
size 25203487
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95d6acc935243f87fb597097bd4c4d522c1e7c7dca47e4a395dcd4ec7baff786
|
| 3 |
+
size 25203487
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b5304a167b6359fb73c4520ff0623820cf5fe4f59a8eb03279c34d52779fcb3
|
| 3 |
+
size 25203487
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30728e6484a5b66e9b64e0fdc73384fa894e23eb471bfd1988876d4409d483cc
|
| 3 |
+
size 25203487
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:df9b389961a271e75436332200cdbb201cddfb8a6d15d0d8846abcd61e9f466b
|
| 3 |
+
size 25203487
|
JumpRelu_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:774d4df00cfaad33152c4143fca2e2c0e8589d5ff17a99ce74b91e820227f834
|
| 3 |
+
size 25203487
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5c056e07b96a75af6cd31ab961e24fc82e49114ef0227814a5582f831012e26e
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81fed4d21ff131d65a405e3b6f88a55a64db798a8a7f3edca27eade8db266d00
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 4096,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512,
|
| 34 |
+
1024,
|
| 35 |
+
2176
|
| 36 |
+
],
|
| 37 |
+
"k": 40,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_1",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 7.362171837777803, "l1_loss": 78.93377662427497, "l0": 39.87090717662465, "frac_variance_explained": 0.9211916851274895, "cossim": 0.9494949272184661, "l2_ratio": 0.9577325889558503, "relative_reconstruction_bias": 1.0056351567759658, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.990205157886852, "loss_zero": 12.187079458525687, "frac_recovered": 0.9570405320687727, "frac_alive": 0.996337890625, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0b752ea230a4c9f51a78ca7c08c6fbc203b092c314513ec3fb2b31cf540ef53
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 4096,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512,
|
| 34 |
+
1024,
|
| 35 |
+
2176
|
| 36 |
+
],
|
| 37 |
+
"k": 80,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_2",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 6.5154667334123095, "l1_loss": 130.4948748964252, "l0": 79.62615203857422, "frac_variance_explained": 0.9379528291297682, "cossim": 0.9607100306135236, "l2_ratio": 0.9691500952749541, "relative_reconstruction_bias": 1.0058996894142844, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.842034011176138, "loss_zero": 12.187079458525687, "frac_recovered": 0.9736035881620465, "frac_alive": 0.9912109375, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e984acf7f7b7c1fd93666aa91cd2e32a3592025f68a89d30305b8badf3483ebf
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 4096,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512,
|
| 34 |
+
1024,
|
| 35 |
+
2176
|
| 36 |
+
],
|
| 37 |
+
"k": 160,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_3",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 5.389582446127227, "l1_loss": 193.1514032537287, "l0": 159.14285000887784, "frac_variance_explained": 0.9577755133310953, "cossim": 0.9736721605965586, "l2_ratio": 0.9815338091416792, "relative_reconstruction_bias": 1.003865700779539, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.7269490740515967, "loss_zero": 12.187079458525687, "frac_recovered": 0.9865131269801747, "frac_alive": 0.94580078125, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b16c1393d2ec50c976f03c6fe7f556474bc9b588f80d462df84cb607d4ae052
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 4096,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512,
|
| 34 |
+
1024,
|
| 35 |
+
2176
|
| 36 |
+
],
|
| 37 |
+
"k": 320,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_4",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/eval_results.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"l2_loss": 3.9152750463196724, "l1_loss": 339.17472700639206, "l0": 318.46138416637075, "frac_variance_explained": 0.9781012733777364, "cossim": 0.9863574920278607, "l2_ratio": 0.9901740713552996, "relative_reconstruction_bias": 1.0028689196615508, "loss_original": 2.6064688870401094, "loss_reconstructed": 2.655328530253786, "loss_zero": 12.187079458525687, "frac_recovered": 0.9944809982270906, "frac_alive": 0.57763671875, "hyperparameters": {"n_inputs": 200, "context_length": 1024}}
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ec31077dc93e01e06f3dcf61a60ef59f241cfbcefa4cd171a5bbefbc19de41a
|
| 3 |
+
size 25187597
|
MatryoshkaBatchTopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/config.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"trainer": {
|
| 3 |
+
"trainer_class": "MatryoshkaBatchTopKTrainer",
|
| 4 |
+
"dict_class": "MatryoshkaBatchTopKSAE",
|
| 5 |
+
"lr": 0.0003,
|
| 6 |
+
"steps": 244140,
|
| 7 |
+
"auxk_alpha": 0.03125,
|
| 8 |
+
"warmup_steps": 1000,
|
| 9 |
+
"decay_start": 195312,
|
| 10 |
+
"threshold_beta": 0.999,
|
| 11 |
+
"threshold_start_step": 1000,
|
| 12 |
+
"top_k_aux": 384,
|
| 13 |
+
"seed": 0,
|
| 14 |
+
"activation_dim": 768,
|
| 15 |
+
"dict_size": 4096,
|
| 16 |
+
"group_fractions": [
|
| 17 |
+
0.03125,
|
| 18 |
+
0.0625,
|
| 19 |
+
0.125,
|
| 20 |
+
0.25,
|
| 21 |
+
0.53125
|
| 22 |
+
],
|
| 23 |
+
"group_weights": [
|
| 24 |
+
0.2,
|
| 25 |
+
0.2,
|
| 26 |
+
0.2,
|
| 27 |
+
0.2,
|
| 28 |
+
0.2
|
| 29 |
+
],
|
| 30 |
+
"group_sizes": [
|
| 31 |
+
128,
|
| 32 |
+
256,
|
| 33 |
+
512,
|
| 34 |
+
1024,
|
| 35 |
+
2176
|
| 36 |
+
],
|
| 37 |
+
"k": 640,
|
| 38 |
+
"device": "cuda:0",
|
| 39 |
+
"layer": 8,
|
| 40 |
+
"lm_name": "EleutherAI/pythia-160m-deduped",
|
| 41 |
+
"wandb_name": "MatryoshkaBatchTopKTrainer-EleutherAI/pythia-160m-deduped-resid_post_layer_8_trainer_5",
|
| 42 |
+
"submodule_name": "resid_post_layer_8"
|
| 43 |
+
},
|
| 44 |
+
"buffer": {
|
| 45 |
+
"d_submodule": 768,
|
| 46 |
+
"io": "out",
|
| 47 |
+
"n_ctxs": 244,
|
| 48 |
+
"ctx_len": 1024,
|
| 49 |
+
"refresh_batch_size": 32,
|
| 50 |
+
"out_batch_size": 2048,
|
| 51 |
+
"device": "cuda:0"
|
| 52 |
+
}
|
| 53 |
+
}
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fbfed2bd6a4e0cb3a0dc451f2f48781f0d2430dd8a2c4aa016e9912b55d8099
|
| 3 |
+
size 25186984
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49d5ed8467a1385130ee9ce63e034511e54e60478737435b9c3b33b62d274076
|
| 3 |
+
size 25186984
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1b893965a3bd9f7a065d28a233cbe1102b9d77fc20b35f392ec244e6c8d2cd0
|
| 3 |
+
size 25186984
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:99290b592d7e34ed4bf67ae86078dde28a61aa6fc71fdf36374b39d67bf577fe
|
| 3 |
+
size 25186984
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bed3f186a191764d3e9af5276e8a81b2fcb888063fb4aba30423260469298228
|
| 3 |
+
size 25186984
|
PAnneal_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7cc64def02d43e0445231babf172338c9cbce22b25bb0e4ad8b565a0636ee4f
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3548a791eed29cf0a2545c5eb894a9637c617ecef47724b5531cd13cf3c1c783
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51a3642060da855b1df65b37dd08dcc0f6f1f9391ff66449d11714a477f7585c
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8509bc8a09a9a68fc11c2a03b3780eb6ae432c55ff28991fca0c7e25fa83c925
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcb77b85c9327f7711a15fbdaf115caee1e78e782194db6997e17f736b89bc63
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:807df4e63d0c206d8b4891c6ff84849c28161c13a8d7c272506f9cf3c2cbbaae
|
| 3 |
+
size 25186984
|
Standard_pythia-160m-deduped__0108/resid_post_layer_8/trainer_5/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e39e81c75160b19613fb8d959f57ad63118391a2751c3d80737f86be45d79bae
|
| 3 |
+
size 25186984
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_0/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ea57ec3bd6667ade40f4f0aca0e896e14fe0a529c1594c2d7127565e7f726ce
|
| 3 |
+
size 25187350
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_1/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2b8704d61cc34d1c88bc3b9cf8d0f7b856e8eb7c5d2c09ecd9020ec1e28c5daa
|
| 3 |
+
size 25187350
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_2/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b84cd2a1361674efa24209db17a9bdb112e4a1b4a81c45eba205cad13722d3e9
|
| 3 |
+
size 25187350
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_3/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:920b611a07e9da38dd08b36fca39f442ccdf64aaca75604d9b6ff830b2e6db9d
|
| 3 |
+
size 25187350
|
TopK_pythia-160m-deduped__0108/resid_post_layer_8/trainer_4/ae.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:61ff96e653b38d80fa209cda7058a259193f736e28b979f6d4c020842dd0fea4
|
| 3 |
+
size 25187350
|