Idiap-Data committed · Commit 3c21bee · verified · 1 Parent(s): 3258e34

Upload 8 files
LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Idiap Research Institute
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,54 @@
- ---
- license: mit
- ---
+ ---
+ license: mit
+ ---
+
+ # gated-deltanet-attn-0.4B-10B
+
+ Gated DeltaNet + full attention (0.4B params, 10B tokens)
+
+ ## Overview
+
+ * **Training**: gated-deltanet-attn-0.4B-10B was trained on [FineWeb-Edu](https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu), which is released under [ODC-By v1.0](https://opendatacommons.org/licenses/by/1-0/)
+ * **Parameters**: 0.4B
+ * **Task**: Language modeling
+ * **Framework**: HuggingFace, [flash-linear-attention](https://github.com/fla-org/flash-linear-attention)
+ * **Output structure**: [batch_size, sequence_length, num_logits]
+
+ ## Performance
+
+ Benchmark results are reported in the paper cited below.
+
+ ## Running Code
+
+ Minimal code to instantiate the model and perform inference:
+ ```python
+ # Requires flash-linear-attention (https://github.com/fla-org/flash-linear-attention)
+ import fla  # registers the gated_deltanet architecture with transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # path_to_model: local checkpoint directory or Hugging Face Hub ID
+ model = AutoModelForCausalLM.from_pretrained(path_to_model).cuda()
+ tokenizer = AutoTokenizer.from_pretrained(path_to_model)  # tokenizers have no .cuda()
+ input_ids = tokenizer("All human beings are", return_tensors="pt").input_ids.cuda()
+ outputs = model.generate(input_ids, max_length=15)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```
+
+ ## License
+
+ gated-deltanet-attn-0.4B-10B is released under the [MIT License](LICENSE.txt)
+
+ ## Citation
+
+ If you find our work useful, please cite the following publication:
+
+ ```bibtex
+ @misc{he_alleviating_2025,
+   title = {Alleviating {Forgetfulness} of {Linear} {Attention} by {Hybrid} {Sparse} {Attention} and {Contextualized} {Learnable} {Token} {Eviction}},
+   url = {http://arxiv.org/abs/2510.20787},
+   doi = {10.48550/arXiv.2510.20787},
+   publisher = {arXiv},
+   author = {He, Mutian and Garner, Philip N.},
+   month = oct,
+   year = {2025},
+   note = {arXiv:2510.20787 [cs]},
+ }
+ ```
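The `Output structure` entry in the README above refers to the raw logits of a single forward pass. A minimal sketch to check that shape, assuming `fla` is installed and treating `path_to_model` as a placeholder for the local checkpoint directory or Hub ID:

```python
# Sketch: verify the output structure [batch_size, sequence_length, num_logits].
import fla  # registers the gated_deltanet architecture with transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path_to_model = "gated-deltanet-attn-0.4B-10B"  # placeholder (assumption)
model = AutoModelForCausalLM.from_pretrained(path_to_model)
tokenizer = AutoTokenizer.from_pretrained(path_to_model)

input_ids = tokenizer("All human beings are", return_tensors="pt").input_ids
with torch.no_grad():
    logits = model(input_ids).logits

# Expect [1, sequence_length, 32000]: vocab_size is 32000 in config.json
print(logits.shape)
```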
config.json ADDED
@@ -0,0 +1,53 @@
+ {
+   "_name_or_path": "exp/gated_deltanet-h-Q16_K16-340M-c4K-10B/re2_batch32.seqlen4096.warmup1024.steps20480.lr3e-4/config.json",
+   "architectures": [
+     "GatedDeltaNetForCausalLM"
+   ],
+   "attn": {
+     "layers": [
+       1,
+       3,
+       5,
+       7,
+       9,
+       11,
+       13,
+       15,
+       17,
+       19,
+       21,
+       23
+     ],
+     "num_heads": 16,
+     "num_kv_heads": 16,
+     "qkv_bias": false,
+     "rope_theta": 10000.0,
+     "window_size": null
+   },
+   "attn_mode": "chunk",
+   "bos_token_id": 1,
+   "conv_size": 4,
+   "eos_token_id": 2,
+   "expand_v": 1,
+   "fuse_cross_entropy": true,
+   "fuse_norm": true,
+   "fuse_swiglu": true,
+   "head_dim": 128,
+   "hidden_act": "swish",
+   "hidden_ratio": 4,
+   "hidden_size": 1024,
+   "initializer_range": 0.006,
+   "intermediate_size": null,
+   "max_position_embeddings": 4096,
+   "model_type": "gated_deltanet",
+   "norm_eps": 1e-06,
+   "num_heads": 8,
+   "num_hidden_layers": 24,
+   "tie_word_embeddings": true,
+   "torch_dtype": "float32",
+   "transformers_version": "4.49.0",
+   "use_cache": true,
+   "use_gate": true,
+   "use_short_conv": true,
+   "vocab_size": 32000
+ }
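The `attn.layers` field above encodes the hybrid layout: every odd-indexed layer (1, 3, ..., 23) of the 24 layers uses full softmax attention, and the remaining layers use gated DeltaNet. A minimal sketch that reads this out with the standard `json` module, assuming a local copy of the `config.json` above:

```python
# Sketch: list which layers use full attention vs. gated DeltaNet,
# based on the "attn.layers" field of config.json.
import json

with open("config.json") as f:  # local copy of the file above (assumption)
    cfg = json.load(f)

attn_layers = set(cfg["attn"]["layers"])   # {1, 3, ..., 23}
for i in range(cfg["num_hidden_layers"]):  # 24 layers in total
    kind = "full attention" if i in attn_layers else "gated DeltaNet"
    print(f"layer {i:2d}: {kind}")
```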
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "transformers_version": "4.49.0"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00d0c14cbf6226f68ba64ac5dc77a9fdcd7f6025142607c030016975943e1d95
+ size 1416148088
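The file above is a Git LFS pointer rather than the weights themselves: `oid` is the SHA-256 of the actual ~1.4 GB safetensors file and `size` its byte count. A minimal sketch to verify a downloaded copy against the pointer, assuming the local file name after download:

```python
# Sketch: check a downloaded model.safetensors against the LFS pointer above.
import hashlib
import os

path = "model.safetensors"  # local path after `git lfs pull` or hub download (assumption)
expected_oid = "00d0c14cbf6226f68ba64ac5dc77a9fdcd7f6025142607c030016975943e1d95"
expected_size = 1416148088

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)

assert os.path.getsize(path) == expected_size, "size mismatch"
assert sha.hexdigest() == expected_oid, "sha256 mismatch"
print("model.safetensors matches the LFS pointer")
```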
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [],
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "legacy": true,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": null,
+   "sp_model_kwargs": {},
+   "spaces_between_special_tokens": false,
+   "tokenizer_class": "LlamaTokenizer",
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
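With `add_bos_token` true and `add_eos_token` false, the tokenizer prepends `<s>` (id 1) and appends nothing, matching the `bos_token_id`/`eos_token_id` values in config.json. A minimal sketch to confirm this behavior, assuming the files from this commit sit in the current directory:

```python
# Sketch: confirm add_bos_token=true / add_eos_token=false from tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")  # directory with the files above (assumption)
ids = tok("All human beings are").input_ids

print(ids[0] == tok.bos_token_id)   # True: <s> (id 1) is prepended
print(ids[-1] == tok.eos_token_id)  # False: no </s> (id 2) is appended
```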