jobs-git igitman committed on
Commit
04d9259
·
verified ·
0 Parent(s):

Duplicate from nvidia/OpenMath-Nemotron-14B

Browse files

Co-authored-by: Igor Gitman <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ results.png filter=lfs diff=lfs merge=lfs -text
BIAS.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Field | Response
2
+ :---------------------------------------------------------------------------------------------------|:---------------
3
+ Participation considerations from adversely impacted groups ([protected classes](https://www.senate.ca.gov/content/protected-classes)) in model design and testing: | None
4
+ Measures taken to mitigate against unwanted bias: | None
EXPLAINABILITY.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Field | Response
2
+ :------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------
3
+ Intended Domain: | Text generation, reasoning, solving mathematical problems.
4
+ Model Type: | Text-to-text transformer
5
+ Intended Users: | This model is intended for developers, researchers, and customers building/utilizing LLMs.
6
+ Output: | Text String(s)
7
+ Describe how the model works: | Generates text by predicting the next word or token based on the context provided in the input sequence using multiple self-attention layers.
8
+ Name the adversely impacted groups this has been tested to deliver comparable outcomes regardless of: | Not Applicable
9
+ Technical Limitations & Mitigation: | The model was optimized for solving mathematical problems and thus might not be able to provide adequate answers for non-mathematical queries. The model was trained on questions with verifiable final answers, and thus may not be able to prove theorems.
10
+ Verified to have met prescribed NVIDIA quality standards: | Yes
11
+ Performance Metrics: | Accuracy
12
+ Potential Known Risks: | The model was optimized explicitly for solving mathematical problems and as such is more susceptible to prompt injection and jailbreaking in various forms as a result of its training. This means that the model should be paired with additional rails or system filtering to limit exposure to instructions from malicious sources -- either directly or indirectly by retrieval (e.g. via visiting a website) -- as they may yield outputs that can lead to harmful, system-level outcomes up to and including remote code execution in agentic systems when effective security controls including guardrails are not in place. The model was trained on data that contains toxic language and societal biases originally crawled from the internet. Therefore, the model may amplify those biases and return toxic responses especially when prompted with toxic prompts. The model may generate answers that may be inaccurate, omit key information, or include irrelevant or redundant text producing socially unacceptable or undesirable text, even if the prompt itself does not include anything explicitly offensive.
13
+ Licensing: | Use of this model is governed by [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) license. Additional Information: [Apache License Version 2.0](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/LICENSE)
PRIVACY.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ Field | Response
2
+ :----------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------
3
+ Generatable or reverse engineerable personal data? | None
4
+ Personal data used to create this model? | None Known
5
+ How often is dataset reviewed? | Before Release
6
+ Is there provenance for all datasets used in training? | Yes
7
+ Does data labeling (annotation, metadata) comply with privacy laws? | Yes
8
+ Is data compliant with data subject requests for data correction or removal, if such a request was made? | No, not possible with externally-sourced data.
9
+ Applicable Privacy Policy | https://www.nvidia.com/en-us/about-nvidia/privacy-policy/
README.md ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-4.0
3
+ base_model:
4
+ - Qwen/Qwen2.5-14B
5
+ datasets:
6
+ - nvidia/OpenMathReasoning
7
+ language:
8
+ - en
9
+ tags:
10
+ - nvidia
11
+ - math
12
+ library_name: transformers
13
+ ---
14
+
15
+ # OpenMath-Nemotron-14B
16
+
17
+ OpenMath-Nemotron-14B is created by finetuning [Qwen/Qwen2.5-14B](https://huggingface.co/Qwen/Qwen2.5-14B) on [OpenMathReasoning](https://huggingface.co/datasets/nvidia/OpenMathReasoning) dataset.
18
+ This model is ready for commercial use.
19
+
20
+ ![Evaluation Results](./results.png)
21
+
22
+
23
+ OpenMath-Nemotron models achieve state-of-the-art results on popular mathematical benchmarks. We present metrics as pass@1 (maj@64) where pass@1
24
+ is an average accuracy across 64 generations and maj@64 is the result of majority voting.
25
+ Please see our [paper](https://arxiv.org/abs/2504.16891) for more details on the evaluation setup.
26
+
27
+ | Model | AIME24 | AIME25 | HMMT-24-25 | HLE-Math |
28
+ |-------------------------------|-----------------|-------|-------|-------------|
29
+ | DeepSeek-R1-Distill-Qwen-1.5B | 26.8 (60.0) | 21.4 (36.7) | 14.2 (26.5) | 2.9 (5.0) |
30
+ | [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) CoT | 61.6 (80.0) | 49.5 (66.7) | 39.9 (53.6) | 5.4 (5.4) |
31
+ | [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) TIR | 52.0 (83.3) | 39.7 (70.0) | 37.2 (60.7) | 2.5 (6.2) |
32
+ | + Self GenSelect | 83.3 | 70.0 | 62.2 | 7.9 |
33
+ | + 32B GenSelect | 83.3 | 70.0 | 62.8 | 8.3 |
34
+ | DeepSeek-R1-Distill-Qwen-7B | 54.4 (80.0) | 38.6 (53.3) | 30.6 (42.9) | 3.3 (5.2) |
35
+ | [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) CoT | 74.8 (80.0) | 61.2 (76.7) | 49.7 (57.7) | 6.6 (6.6) |
36
+ | [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) TIR | 72.9 (83.3) | 57.5 (76.7) | 54.6 (66.3) | 7.8 (10.8) |
37
+ | + Self GenSelect | 86.7 | 76.7 | 68.4 | 11.5 |
38
+ | + 32B GenSelect | 86.7 | 76.7 | 69.9 | 11.9 |
39
+ | DeepSeek-R1-Distill-Qwen-14B | 65.8 (80.0) | 48.4 (60.0) | 40.1 (52.0) | 4.2 (4.8) |
40
+ | [OpenMath-Nemotron-14B-MIX (kaggle)](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) | 73.7 (86.7) | 57.9 (73.3) | 50.5 (64.8) | 5.7 (6.5) |
41
+ | [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) CoT | 76.3 (83.3) | 63.0 (76.7) | 52.1 (60.7) | 7.5 (7.6) |
42
+ | [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) TIR | 76.3 (86.7) | 61.3 (76.7) | 58.6 (70.9) | 9.5 (11.5) |
43
+ | + Self GenSelect | 86.7 | 76.7 | 72.4 | 14.1 |
44
+ | + 32B GenSelect | 90.0 | 76.7 | 71.9 | 13.7 |
45
+ | QwQ-32B | 78.1 (86.7) | 66.5 (76.7) | 55.9 (63.3) | 9.0 (9.5) |
46
+ | DeepSeek-R1-Distill-Qwen-32B | 66.9 (83.3) | 51.8 (73.3) | 39.9 (51.0) | 4.8 (6.0) |
47
+ | [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) CoT | 76.5 (86.7) | 62.5 (73.3) | 53.0 (59.2) | 8.3 (8.3) |
48
+ | [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) TIR | 78.4 (93.3) | 64.2 (76.7) | 59.7 (70.9) | 9.2 (12.5) |
49
+ | + Self GenSelect | 93.3 | 80.0 | 73.5 | 15.7 |
50
+ | DeepSeek-R1 | 79.1 (86.7) | 64.3 (73.3) | 53.0 (59.2) | 10.5 (11.4) |
51
+
52
+ We used [a version of OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) model to secure
53
+ the first place in [AIMO-2 Kaggle competition](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard)!
54
+
55
+ ## Reproducing our results
56
+
57
+ The pipeline we used to produce the data and models is fully open-sourced!
58
+
59
+ - [Code](https://github.com/NVIDIA/NeMo-Skills)
60
+ - [Models](https://huggingface.co/collections/nvidia/openmathreasoning-68072c0154a5099573d2e730)
61
+ - [Dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning)
62
+ - [Paper](https://arxiv.org/abs/2504.16891)
63
+
64
+ We provide [all instructions](https://nvidia.github.io/NeMo-Skills/openmathreasoning1/)
65
+ to fully reproduce our results, including data generation.
66
+
67
+ # How to use the models?
68
+
69
+ Our models can be used in 3 inference modes: chain-of-thought (CoT), tool-integrated reasoning (TIR) and generative solution selection (GenSelect).
70
+
71
+ To run inference with CoT mode, you can use this example code snippet.
72
+
73
+ ```python
74
+ import transformers
75
+ import torch
76
+
77
+ model_id = "nvidia/OpenMath-Nemotron-14B"
78
+
79
+ pipeline = transformers.pipeline(
80
+ "text-generation",
81
+ model=model_id,
82
+ model_kwargs={"torch_dtype": torch.bfloat16},
83
+ device_map="auto",
84
+ )
85
+
86
+ messages = [
87
+ {
88
+ "role": "user",
89
+ "content": "Solve the following math problem. Make sure to put the answer (and only answer) inside \\boxed{}.\n\n" +
90
+ "What is the minimum value of $a^2+6a-7$?"},
91
+ ]
92
+
93
+ outputs = pipeline(
94
+ messages,
95
+ max_new_tokens=4096,
96
+ )
97
+ print(outputs[0]["generated_text"][-1]['content'])
98
+ ```
99
+
100
+ To run inference with TIR or GenSelect modes, we highly recommend using our
101
+ [reference implementation in NeMo-Skills](https://nvidia.github.io/NeMo-Skills/openmathreasoning1/evaluation/).
102
+
103
+ Please note that these models have not been instruction tuned on general data and thus might not provide good answers outside of math domain.
104
+
105
+
106
+ ## Citation
107
+
108
+ If you find our work useful, please consider citing us!
109
+
110
+ ```bibtex
111
+ @article{moshkov2025aimo2,
112
+ title = {AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset},
113
+ author = {Ivan Moshkov and Darragh Hanley and Ivan Sorokin and Shubham Toshniwal and Christof Henkel and Benedikt Schifferer and Wei Du and Igor Gitman},
114
+ year = {2025},
115
+ journal = {arXiv preprint arXiv:2504.16891}
116
+ }
117
+ ```
118
+
119
+ ## Additional information
120
+
121
+ ### License/Terms of Use: <br>
122
+
123
+ GOVERNING TERMS: Use of this model is governed by [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/legalcode.en).
124
+ Additional Information: [Apache License Version 2.0](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/LICENSE).
125
+
126
+ ### Deployment Geography:
127
+
128
+ Global <br>
129
+
130
+ ### Use Case: <br>
131
+
132
+ This model is intended to facilitate research in the area of mathematical reasoning.
133
+
134
+
135
+
136
+ ### Release Date:  <br>
137
+
138
+ Huggingface 04/23/2025 <br>
139
+
140
+ ## Model Architecture: <br>
141
+
142
+ **Architecture Type:** Transformer decoder-only language model  <br>
143
+
144
+ **Network Architecture:** Qwen2.5 <br>
145
+
146
+
147
+ **This model was developed based on Qwen2.5-14B.** <br>
148
+
149
+ **This model has 14B of model parameters.** <br>
150
+
151
+ ## Input: <br>
152
+
153
+ **Input Type(s):** Text <br>
154
+
155
+ **Input Format(s):** String <br>
156
+
157
+ **Input Parameters:** One-Dimensional (1D) <br>
158
+
159
+ **Other Properties Related to Input:** Context length up to 131,072 tokens <br>
160
+
161
+
162
+
163
+ ## Output: <br>
164
+
165
+ **Output Type(s):** Text <br>
166
+
167
+ **Output Format:** String <br>
168
+
169
+ **Output Parameters:** One-Dimensional (1D) <br>
170
+
171
+ **Other Properties Related to Output:** Context length up to 131,072 tokens <br>
172
+
173
+
174
+
175
+ Our AI models are designed and/or optimized to run on NVIDIA GPU-accelerated systems. By leveraging NVIDIA’s hardware (e.g. GPU cores) and software frameworks (e.g., CUDA libraries), the model achieves faster training and inference times compared to CPU-only solutions. <br>
176
+
177
+
178
+
179
+ ## Software Integration : <br>
180
+
181
+ **Runtime Engine(s):** <br>
182
+
183
+ * Tensor RT / Triton <br>
184
+
185
+
186
+
187
+ **Supported Hardware Microarchitecture Compatibility:** <br>
188
+
189
+ * NVIDIA Ampere <br>
190
+
191
+ * NVIDIA Hopper <br>
192
+
193
+
194
+
195
+ **Preferred Operating System(s):** <br>
196
+
197
+ * Linux <br>
198
+
199
+
200
+
201
+ ## Model Version(s):
202
+
203
+ [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B)
204
+
205
+ [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B)
206
+
207
+ [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B)
208
+
209
+ [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B)
210
+
211
+
212
+ # Ethical Considerations:
213
+
214
+ NVIDIA believes Trustworthy AI is a shared responsibility and we have established policies and practices to enable development for a wide array of AI applications.  When downloaded or used in accordance with our terms of service, developers should work with their internal model team to ensure this model meets requirements for the relevant industry and use case and addresses unforeseen product misuse.
215
+
216
+ For more detailed information on ethical considerations for this model, please see the Model Card++ [Explainability](./EXPLAINABILITY.md), [Bias](./BIAS.md), [Safety & Security](./SAFETY.md), and [Privacy](./PRIVACY.md) Subcards.
217
+
218
+ Please report security vulnerabilities or NVIDIA AI Concerns [here](https://www.nvidia.com/en-us/support/submit-security-vulnerability/).
SAFETY.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Field | Response
2
+ :---------------------------------------------------|:----------------------------------
3
+ Model Application(s): | Text generation, reasoning, solving mathematical problems.
4
+ Use Case Restrictions: | Use of this model is governed by [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) license
5
+ Model and dataset restrictions: | The Principle of least privilege (PoLP) is applied limiting access for dataset generation. Restrictions enforce dataset access during training, and dataset license constraints adhered to. Model checkpoints are made available on Hugging Face.
6
+ Use Case Restrictions: | Use of this model is governed by [CC-BY-4.0](https://creativecommons.org/licenses/by/4.0/) license. Additional Information: [Apache License Version 2.0](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B/blob/main/LICENSE)
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nvidia/OpenMath-Nemotron-14B",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 5120,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 13824,
13
+ "max_position_embeddings": 131072,
14
+ "max_window_layers": 70,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 40,
17
+ "num_hidden_layers": 48,
18
+ "num_key_value_heads": 8,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": false,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.47.1",
26
+ "use_cache": true,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 152064
29
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "4.47.1"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79511b74a04c18c0753e650960f0dbf169f1dfce90440a128cd792ca4408a763
3
+ size 9941058640
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d8c5fba12c98edb3e992c4bf7c6e5c2e9d93fcac83b93d5245e5f187f58a9ff
3
+ size 9909694792
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59bc733638ba5bd5da0302cfa9505cb4d5542c6ba62bc63250205a029dba3e75
3
+ size 9689380560
model.safetensors.index.json ADDED
@@ -0,0 +1,586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 29540067328
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
43
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
53
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
54
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
55
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
56
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
57
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
58
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
59
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
60
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
61
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
62
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
64
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
65
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
66
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
67
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
68
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
69
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
70
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
72
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
74
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
76
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
77
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
78
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
79
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
80
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors",
81
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
82
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
83
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
84
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
85
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
86
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
87
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
88
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
89
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
90
+ "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
91
+ "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
92
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
98
+ "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
99
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
100
+ "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
101
+ "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
102
+ "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
103
+ "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
104
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
110
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
113
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
114
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
115
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
122
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
124
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
125
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
127
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
134
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
136
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
137
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
139
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
142
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
144
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
146
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
148
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
149
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
151
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
153
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
154
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
155
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
156
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
157
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
158
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
159
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
160
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
161
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
162
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
163
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
164
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
168
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
170
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
173
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
175
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
180
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
182
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
185
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
187
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
190
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
191
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
194
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
196
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
197
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
199
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
200
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
202
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
206
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
208
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
209
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
210
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
211
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
212
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
214
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.24.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
216
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
217
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
218
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
220
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
221
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
222
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
223
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
224
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
226
+ "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.25.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
228
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
230
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
232
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
233
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
234
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
235
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
236
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
238
+ "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.26.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
240
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
242
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
244
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
245
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
246
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
247
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
248
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
250
+ "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.27.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
252
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
254
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
256
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
257
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
258
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
259
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
260
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
261
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
262
+ "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
263
+ "model.layers.28.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
264
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
265
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
266
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
268
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
269
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
270
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
271
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
272
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
273
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
274
+ "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
275
+ "model.layers.29.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
276
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
277
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
278
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
279
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
280
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
281
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
282
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
283
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
284
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
285
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
286
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
287
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
288
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
289
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
290
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
291
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
292
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
293
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
294
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
295
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors",
297
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
298
+ "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.30.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
300
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.30.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
302
+ "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
304
+ "model.layers.30.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
305
+ "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
306
+ "model.layers.30.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
307
+ "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
308
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
310
+ "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
312
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
313
+ "model.layers.31.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
314
+ "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
315
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
316
+ "model.layers.31.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
317
+ "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
318
+ "model.layers.31.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
319
+ "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
320
+ "model.layers.32.input_layernorm.weight": "model-00002-of-00003.safetensors",
321
+ "model.layers.32.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
322
+ "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
323
+ "model.layers.32.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
324
+ "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
325
+ "model.layers.32.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
326
+ "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
327
+ "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
328
+ "model.layers.32.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
329
+ "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
330
+ "model.layers.32.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
331
+ "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
332
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00003.safetensors",
333
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
334
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
335
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
336
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
337
+ "model.layers.33.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
338
+ "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
339
+ "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
340
+ "model.layers.33.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
341
+ "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
342
+ "model.layers.33.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
343
+ "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
344
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00003.safetensors",
345
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
346
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
347
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
348
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
349
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
350
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
351
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
352
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
353
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
354
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
355
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
356
+ "model.layers.35.input_layernorm.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.35.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
358
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.35.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
360
+ "model.layers.35.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
362
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
364
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
365
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
366
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
367
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
368
+ "model.layers.36.input_layernorm.weight": "model-00003-of-00003.safetensors",
369
+ "model.layers.36.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
370
+ "model.layers.36.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
371
+ "model.layers.36.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
372
+ "model.layers.36.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
373
+ "model.layers.36.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
374
+ "model.layers.36.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
375
+ "model.layers.36.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
376
+ "model.layers.36.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
377
+ "model.layers.36.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
378
+ "model.layers.36.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
379
+ "model.layers.36.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
380
+ "model.layers.37.input_layernorm.weight": "model-00003-of-00003.safetensors",
381
+ "model.layers.37.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
382
+ "model.layers.37.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
383
+ "model.layers.37.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
384
+ "model.layers.37.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
385
+ "model.layers.37.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
386
+ "model.layers.37.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
387
+ "model.layers.37.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
388
+ "model.layers.37.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
389
+ "model.layers.37.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
390
+ "model.layers.37.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
391
+ "model.layers.37.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
392
+ "model.layers.38.input_layernorm.weight": "model-00003-of-00003.safetensors",
393
+ "model.layers.38.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
394
+ "model.layers.38.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
395
+ "model.layers.38.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
396
+ "model.layers.38.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
397
+ "model.layers.38.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
398
+ "model.layers.38.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
399
+ "model.layers.38.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
400
+ "model.layers.38.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
401
+ "model.layers.38.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
402
+ "model.layers.38.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
403
+ "model.layers.38.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
404
+ "model.layers.39.input_layernorm.weight": "model-00003-of-00003.safetensors",
405
+ "model.layers.39.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
406
+ "model.layers.39.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
407
+ "model.layers.39.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
408
+ "model.layers.39.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
409
+ "model.layers.39.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
410
+ "model.layers.39.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
411
+ "model.layers.39.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
412
+ "model.layers.39.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
413
+ "model.layers.39.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
414
+ "model.layers.39.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
415
+ "model.layers.39.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
416
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
418
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
420
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
421
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
422
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
424
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
425
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
426
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
427
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
428
+ "model.layers.40.input_layernorm.weight": "model-00003-of-00003.safetensors",
429
+ "model.layers.40.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
430
+ "model.layers.40.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
431
+ "model.layers.40.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
432
+ "model.layers.40.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
433
+ "model.layers.40.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
434
+ "model.layers.40.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
435
+ "model.layers.40.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
436
+ "model.layers.40.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
437
+ "model.layers.40.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
438
+ "model.layers.40.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
439
+ "model.layers.40.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
440
+ "model.layers.41.input_layernorm.weight": "model-00003-of-00003.safetensors",
441
+ "model.layers.41.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
442
+ "model.layers.41.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
443
+ "model.layers.41.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
444
+ "model.layers.41.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
445
+ "model.layers.41.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
446
+ "model.layers.41.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
447
+ "model.layers.41.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
448
+ "model.layers.41.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
449
+ "model.layers.41.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
450
+ "model.layers.41.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
451
+ "model.layers.41.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
452
+ "model.layers.42.input_layernorm.weight": "model-00003-of-00003.safetensors",
453
+ "model.layers.42.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
454
+ "model.layers.42.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
455
+ "model.layers.42.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
456
+ "model.layers.42.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
457
+ "model.layers.42.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
458
+ "model.layers.42.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
459
+ "model.layers.42.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
460
+ "model.layers.42.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
461
+ "model.layers.42.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
462
+ "model.layers.42.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
463
+ "model.layers.42.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
464
+ "model.layers.43.input_layernorm.weight": "model-00003-of-00003.safetensors",
465
+ "model.layers.43.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
466
+ "model.layers.43.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
467
+ "model.layers.43.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
468
+ "model.layers.43.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
469
+ "model.layers.43.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
470
+ "model.layers.43.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
471
+ "model.layers.43.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
472
+ "model.layers.43.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
473
+ "model.layers.43.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
474
+ "model.layers.43.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
475
+ "model.layers.43.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
476
+ "model.layers.44.input_layernorm.weight": "model-00003-of-00003.safetensors",
477
+ "model.layers.44.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
478
+ "model.layers.44.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
479
+ "model.layers.44.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
480
+ "model.layers.44.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
481
+ "model.layers.44.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
482
+ "model.layers.44.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
483
+ "model.layers.44.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
484
+ "model.layers.44.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
485
+ "model.layers.44.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
486
+ "model.layers.44.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
487
+ "model.layers.44.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
488
+ "model.layers.45.input_layernorm.weight": "model-00003-of-00003.safetensors",
489
+ "model.layers.45.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
490
+ "model.layers.45.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
491
+ "model.layers.45.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
492
+ "model.layers.45.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
493
+ "model.layers.45.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
494
+ "model.layers.45.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
495
+ "model.layers.45.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
496
+ "model.layers.45.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
497
+ "model.layers.45.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
498
+ "model.layers.45.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
499
+ "model.layers.45.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
500
+ "model.layers.46.input_layernorm.weight": "model-00003-of-00003.safetensors",
501
+ "model.layers.46.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
502
+ "model.layers.46.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
503
+ "model.layers.46.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
504
+ "model.layers.46.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
505
+ "model.layers.46.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
506
+ "model.layers.46.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
507
+ "model.layers.46.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
508
+ "model.layers.46.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
509
+ "model.layers.46.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
510
+ "model.layers.46.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
511
+ "model.layers.46.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
512
+ "model.layers.47.input_layernorm.weight": "model-00003-of-00003.safetensors",
513
+ "model.layers.47.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
514
+ "model.layers.47.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
515
+ "model.layers.47.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
516
+ "model.layers.47.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
517
+ "model.layers.47.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
518
+ "model.layers.47.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
519
+ "model.layers.47.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
520
+ "model.layers.47.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
521
+ "model.layers.47.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
522
+ "model.layers.47.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
523
+ "model.layers.47.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
524
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
525
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
526
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
527
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
528
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
529
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
530
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
531
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
532
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
533
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
534
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
535
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
536
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
537
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
538
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
539
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
540
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
541
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
542
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
543
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
544
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
545
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
546
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
547
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
548
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
549
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
550
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
551
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
552
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
553
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
554
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
555
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
556
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
557
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
558
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
559
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
560
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
561
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
562
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
563
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
564
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
565
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
566
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
567
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
568
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
569
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
570
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
571
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
572
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
573
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
574
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
575
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
576
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
577
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
578
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
579
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
580
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
581
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
582
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
583
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
584
+ "model.norm.weight": "model-00003-of-00003.safetensors"
585
+ }
586
+ }
results.png ADDED

Git LFS Details

  • SHA256: a8115e5e3cebce5d165e72dc1ea1dde8ba7cf0af837c57f137c891159f5a8176
  • Pointer size: 131 Bytes
  • Size of remote file: 219 kB
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}\n{%- else %}\n {{- '<|im_start|>system\n<|im_end|>\n' }}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == 'user') or (message.role == 'system' and not loop.first) or (message.role == 'assistant') %}\n {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' }}\n{%- endif %}",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 131072,
204
+ "pad_token": "<|endoftext|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff