kshitijthakkar committed
Commit 4fea745 · verified · 1 Parent(s): 41dc755

Upload folder using huggingface_hub

chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>{%- endif -%}{%- if message['role'] == 'system' -%}<|im_system|>{%- endif -%}{%- if message['role'] == 'user' -%}<|im_user|>{%- endif -%}{%- if message['role'] == 'assistant' -%}<|im_assistant|>{%- endif -%}{{ message['role'] }}<|im_middle|>{{message['content']}}<|im_end|>{%- endfor -%}{%- if add_generation_prompt -%}<|im_assistant|>assistant<|im_middle|>{%- endif -%}
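The template above wraps every turn as `<role marker>role<|im_middle|>content<|im_end|>` and prepends a default system turn when the conversation does not start with one. A minimal rendering sketch, using plain `jinja2` rather than `tokenizer.apply_chat_template` (the example message text is made up):

```python
# Hedged sketch: render chat_template.jinja directly with jinja2 to inspect the
# prompt format. The message content is illustrative, not from this repo.
from jinja2 import Template

template = Template(open("chat_template.jinja").read())
messages = [{"role": "user", "content": "What caused the outage?"}]
print(template.render(messages=messages, add_generation_prompt=True))
# -> <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>
#    <|im_user|>user<|im_middle|>What caused the outage?<|im_end|>
#    <|im_assistant|>assistant<|im_middle|>
# (the real output is a single unbroken string; it is wrapped here for readability)
```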
config.json ADDED
@@ -0,0 +1,61 @@
+ {
+   "architectures": [
+     "DeepseekV3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_deepseek.DeepseekV3Config",
+     "AutoModel": "modeling_deepseek.DeepseekV3Model",
+     "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
+   },
+   "aux_loss_alpha": 0.001,
+   "bos_token_id": 163584,
+   "eos_token_id": 163585,
+   "first_k_dense_replace": 1,
+   "hidden_act": "silu",
+   "hidden_size": 512,
+   "initializer_range": 0.02,
+   "intermediate_size": 768,
+   "kv_lora_rank": 128,
+   "max_position_embeddings": 1024,
+   "model_type": "kimi_k2",
+   "moe_intermediate_size": 512,
+   "moe_layer_freq": 1,
+   "n_group": 1,
+   "n_routed_experts": 8,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 8,
+   "num_experts_per_tok": 2,
+   "num_hidden_layers": 8,
+   "num_key_value_heads": 2,
+   "num_nextn_predict_layers": 0,
+   "pretraining_tp": 1,
+   "q_lora_rank": 384,
+   "qk_nope_head_dim": 32,
+   "qk_rope_head_dim": 32,
+   "quantization_config": {
+     "activation_scheme": "dynamic",
+     "fmt": "e4m3",
+     "quant_method": "fp8",
+     "weight_block_size": [
+       128,
+       128
+     ]
+   },
+   "rms_norm_eps": 1e-06,
+   "rope_scaling": null,
+   "rope_theta": 1000000,
+   "routed_scaling_factor": 1.0,
+   "scoring_func": "sigmoid",
+   "seq_aux": true,
+   "tie_word_embeddings": false,
+   "topk_group": 1,
+   "topk_method": "noaux_tc",
+   "torch_dtype": "float32",
+   "transformers_version": "4.54.1",
+   "use_cache": true,
+   "v_head_dim": 64,
+   "vocab_size": 163840
+ }
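This is a small DeepseekV3-style MoE configuration (8 layers, hidden size 512, 8 routed experts, FP8 block quantization) whose `auto_map` points at custom `configuration_deepseek.py` / `modeling_deepseek.py` modules that are not part of this upload. A hedged loading sketch, assuming those modules are resolvable in the repo and taking the repo id from `training_args.json` below:

```python
# Hedged sketch: load the config and model via the remote code referenced in auto_map.
# Assumes configuration_deepseek.py / modeling_deepseek.py exist in the repo; the repo
# id comes from training_args.json in this same commit.
from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "kshitijthakkar/loggenix-nanoKimi2-test"
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
print(config.model_type, config.num_hidden_layers, config.hidden_size)  # kimi_k2 8 512

model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```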
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4d053a8cc99a48964244dd3eef30c4f7fcfbe998f3b4cd1dc03f94dde709c1cd
+ size 831641000
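The safetensors file is stored as a Git LFS pointer: the three lines above record the spec version, the SHA-256 of the real ~832 MB payload, and its size in bytes. A small sketch that checks a downloaded copy against the pointer (the local file path is an assumption):

```python
# Verify a locally downloaded model.safetensors against the LFS pointer's sha256 oid.
# The file path is an assumption; point it at wherever the weights were fetched.
import hashlib

expected = "4d053a8cc99a48964244dd3eef30c4f7fcfbe998f3b4cd1dc03f94dde709c1cd"
sha = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
assert sha.hexdigest() == expected, "checksum mismatch"
```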
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
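These token strings should line up with the IDs declared elsewhere in this commit: `[BOS]`/`[EOS]` map to 163584/163585 in `tokenizer_config.json`, which are also `bos_token_id`/`eos_token_id` in `config.json`. A small cross-check sketch over the uploaded JSON files (paths assume a local clone of the repo):

```python
# Hedged consistency check across the JSON files in this commit; file paths assume a
# local clone of the repo.
import json

special = json.load(open("special_tokens_map.json"))
added = json.load(open("tokenizer_config.json"))["added_tokens_decoder"]
config = json.load(open("config.json"))

token_to_id = {v["content"]: int(k) for k, v in added.items()}
assert token_to_id[special["bos_token"]["content"]] == config["bos_token_id"]  # 163584
assert token_to_id[special["eos_token"]["content"]] == config["eos_token_id"]  # 163585
```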
tiktoken.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
+ size 2795286
tokenization_moonshot.py ADDED
@@ -0,0 +1,301 @@
+ import os
+ import tiktoken
+
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     cast,
+     Tuple,
+     Dict,
+     Iterator,
+     List,
+     Union,
+     Optional,
+ )
+ from shutil import copyfile
+ import numpy as np
+ from tiktoken.load import load_tiktoken_bpe
+ from tokenizers import AddedToken
+ from transformers import PreTrainedTokenizerFast
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+
+
+
+ logger = getLogger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+ SPIECE_UNDERLINE = "▁"
+
+ class TikTokenTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             The path to the Tiktoken model file.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
+             The end of sequence token.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead. The second to last item in special_tokens.
+         pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         additional_special_tokens (list of `str`, *optional*):
+             A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+             skipped when decoding if `skip_special_tokens` is set to `True`.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     special_tokens: Dict[str, int]
+
+     num_reserved_special_tokens = 256
+
+     pat_str = "|".join(
+         [
+             r"""[\p{Han}]+""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""\p{N}{1,3}""",
+             r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+             r"""\s*[\r\n]+""",
+             r"""\s+(?!\S)""",
+             r"""\s+""",
+         ]
+     )
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: Union[str, AddedToken] = "[BOS]",
+         eos_token: Union[str, AddedToken] = "[EOS]",
+         unk_token: Union[str, AddedToken] = "[UNK]",
+         pad_token: Union[str, AddedToken] = "[PAD]",
+         additional_special_tokens: Optional[List[str]] = None,
+         added_tokens_decoder: Optional[dict] = None,
+         **kwargs,
+     ):
+         assert os.path.isfile(vocab_file), vocab_file
+         if additional_special_tokens is None:
+             additional_special_tokens = [
+                 "<|im_end|>",
+                 "<|im_middle|>",
+                 "<|im_user|>",
+                 "<|im_assistant|>",
+                 "<|im_system|>"
+             ]
+         special_tokens_mapping = {i: added_tokens_decoder[i].content for i in added_tokens_decoder}
+
+         special_tokens = [str(bos_token), str(eos_token)] + additional_special_tokens + [str(unk_token), str(pad_token)]
+
+         self.vocab_file = vocab_file
+         mergeable_ranks = load_tiktoken_bpe(vocab_file)
+         num_base_tokens = len(mergeable_ranks)
+         self.special_tokens = {
+             special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+             for i in range(num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2)
+         }
+
+         self.model = tiktoken.Encoding(
+             name=Path(vocab_file).name,
+             pat_str=self.pat_str,
+             mergeable_ranks=mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         logger.info(f"Reloaded tiktoken model from {vocab_file}")
+
+         self.n_words: int = self.model.n_vocab
+         # BOS / EOS token IDs
+         self.bos_id: int = self.special_tokens[str(bos_token)]
+         self.eos_id: int = self.special_tokens[str(eos_token)]
+         logger.info(
+             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+         )
+
+         self.pad_id: int = self.special_tokens[str(pad_token)]
+         self.unk_id: int = self.special_tokens[str(unk_token)]
+
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+
+         self.decoder = {}
+         for i in range(self.n_words):
+             # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = ''.join([
+                 self.byte_encoder[ord(char)] for char in
+                 self.model.decode_single_token_bytes(i).decode('latin-1')
+             ])
+             self.decoder[i] = decoding
+
+         self.encoder = {}
+         for i in range(self.n_words):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             additional_special_tokens=additional_special_tokens,
+             **kwargs,
+         )
+         self.all_special_ids_set = set(self.all_special_ids)
+
+     def encode(
+         self,
+         text: str,
+         allow_special_tokens: bool = True,
+         **kwargs
+     ) -> List[int]:
+         """
+         Encodes a string into a list of token IDs.
+
+         Args:
+             text (str): The input string to be encoded.
+
+         Returns:
+             list[int]: A list of token IDs.
+         """
+         # If there are other args, we should call super().encode because there is a lot of code
+         # to handle those args. super().encode will eventually call _tokenize and _convert_token_to_id.
+         if len(kwargs) > 0:
+             return super().encode(text, **kwargs)
+
+         assert type(text) is str
+
+         # The tiktoken tokenizer can handle <=400k chars without
+         # pyo3_runtime.PanicException.
+         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+
+         # https://github.com/openai/tiktoken/issues/195
+         # Here we iterate over subsequences and split if we exceed the limit
+         # of max consecutive non-whitespace or whitespace characters.
+         MAX_NO_WHITESPACES_CHARS = 25_000
+
+         substrs = (
+             substr
+             for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+             for substr in self._split_whitespaces_or_nonwhitespaces(
+                 text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+             )
+         )
+         t: List[int] = []
+         for substr in substrs:
+             if allow_special_tokens:
+                 t.extend(
+                     # we treat special tokens as common tokens here
+                     self.model.encode(
+                         substr,
+                         allowed_special="all",
+                     )
+                 )
+             else:
+                 t.extend(
+                     # we treat special tokens as common tokens here
+                     self.model.encode(
+                         substr,
+                         disallowed_special=(),
+                     )
+                 )
+         return t
+
+     def decode(
+         self,
+         token_ids: Union[int, List[int]],
+         **kwargs
+     ) -> str:
+         """
+         Decodes a list of token IDs into a string.
+
+         Args:
+             token_ids (List[int]): The list of token IDs to be decoded.
+
+         Returns:
+             str: The decoded string.
+         """
+         # If there are other args, we should call super().decode because there is a lot of code
+         # to handle those args. super().decode will eventually call convert_tokens_to_string and _convert_id_to_token.
+         if len(kwargs) > 0:
+             return super().decode(token_ids, **kwargs)
+
+         if type(token_ids) is int:
+             token_ids = [token_ids]
+
+         return self.model.decode(cast(List[int], token_ids))
+
+     @staticmethod
+     def _split_whitespaces_or_nonwhitespaces(
+         s: str, max_consecutive_slice_len: int
+     ) -> Iterator[str]:
+         """
+         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+         consecutive whitespaces or consecutive non-whitespaces.
+         """
+         current_slice_len = 0
+         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+         slice_start = 0
+
+         for i in range(len(s)):
+             is_now_space = s[i].isspace()
+
+             if current_slice_is_space ^ is_now_space:
+                 current_slice_len = 1
+                 current_slice_is_space = is_now_space
+             else:
+                 current_slice_len += 1
+                 if current_slice_len > max_consecutive_slice_len:
+                     yield s[slice_start:i]
+                     slice_start = i
+                     current_slice_len = 1
+         yield s[slice_start:]
+
+
+     """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
+     @property
+     def vocab_size(self) -> int:
+         return self.n_words
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.encoder
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         return [
+             self.decoder[t]
+             for t in self.encode(text)
+         ]
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.encoder.get(token, self.unk_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.decoder.get(index)
+
+     @staticmethod
+     def clean_up_tokenization(out_string: str) -> str:
+         return out_string
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         text = ''.join(tokens).replace(SPIECE_UNDERLINE, '')
+         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
+         return text
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+             return
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
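`tokenizer_config.json` below maps `AutoTokenizer` to this `TikTokenTokenizer` class, so loading requires `trust_remote_code=True`. A hedged usage sketch (repo id taken from `training_args.json`; the sample text is made up):

```python
# Hedged usage sketch for the custom tiktoken-based tokenizer defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(
    "kshitijthakkar/loggenix-nanoKimi2-test", trust_remote_code=True
)

ids = tok.encode("Root cause analysis of the failed deployment")
print(ids)
print(tok.decode(ids))  # should round-trip back to the input text

# Chat-style prompts go through the jinja template shipped in this commit.
prompt = tok.apply_chat_template(
    [{"role": "user", "content": "Summarise the incident log."}],
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)
```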
tokenizer_config.json ADDED
@@ -0,0 +1,97 @@
+ {
+   "added_tokens_decoder": {
+     "163584": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163585": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163586": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163587": {
+       "content": "<|im_user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163588": {
+       "content": "<|im_assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163594": {
+       "content": "<|im_system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163601": {
+       "content": "<|im_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163838": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163839": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|im_system|>",
+     "<|im_middle|>"
+   ],
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_moonshot.TikTokenTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1048576,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "TikTokenTokenizer",
+   "unk_token": "[UNK]"
+ }
training_args.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "model": "kimi_k2",
+   "optimizer": "muon",
+   "lr": 0.02,
+   "wd": 0.1,
+   "dataset": "loggenix-rca",
+   "hidden_size": 1024,
+   "epochs": 1,
+   "save_path": "./trained_model",
+   "push_to_hub": true,
+   "repo_name": "kshitijthakkar/loggenix-nanoKimi2-test",
+   "hf_token": "hf_token",
+   "run_inference": true
+ }
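These are the run's recorded hyperparameters (Muon optimizer, lr 0.02, weight decay 0.1, one epoch on the `loggenix-rca` dataset); the `hf_token` value is a placeholder. A minimal sketch for reading them back (the namespace wrapper is an assumption about how the original training script consumed the file):

```python
# Hedged sketch: reload the recorded hyperparameters from training_args.json.
import json
from types import SimpleNamespace

with open("training_args.json") as f:
    args = SimpleNamespace(**json.load(f))

print(args.model, args.optimizer, args.lr, args.wd)  # kimi_k2 muon 0.02 0.1
print(args.dataset, args.epochs, args.repo_name)     # loggenix-rca 1 kshitijthakkar/loggenix-nanoKimi2-test
```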