Upload tokenizer

`<|pad|>` at the formerly unused id 2023 replaces `<pad>` (id 131072) as the pad token, the chat template gains a tool-calling branch, and the model-card metadata is updated.

Changed files:
- README.md (+4 -4)
- special_tokens_map.json (+1 -1)
- tokenizer.json (+2 -11)
- tokenizer_config.json (+9 -12)
README.md

```diff
@@ -1,12 +1,12 @@
 ---
+base_model: tiiuae/Falcon3-7B-Instruct
 library_name: transformers
+license: other
+license_name: falcon-llm-license
+license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
 tags:
 - bitnet
 - falcon3
-base_model: tiiuae/Falcon3-7B-Instruct
-license: other
-license_name: falcon-llm-license
-license_link: https://falconllm.tii.ae/falcon-terms-and-conditions.html
 ---
 
 
```
special_tokens_map.json

```diff
@@ -32,7 +32,7 @@
     "single_word": false
   },
   "pad_token": {
-    "content": "<pad>",
+    "content": "<|pad|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
```
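After this change, `AutoTokenizer` surfaces the new pad token directly. A minimal sanity check, assuming the tokenizer is loaded from the base repo named in the model card (substitute the repository this commit was actually pushed to):

```python
from transformers import AutoTokenizer

# Repo id taken from the model card's base_model field; substitute the
# actual repository for this upload if it differs.
tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct")

print(tok.pad_token)     # expected: <|pad|>
print(tok.pad_token_id)  # expected: 2023
```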
    	
tokenizer.json

```diff
@@ -18212,16 +18212,7 @@
     },
     {
       "id": 2023,
-      "content": ">>UNUSED_1897<<",
-      "single_word": false,
-      "lstrip": false,
-      "rstrip": false,
-      "normalized": false,
-      "special": true
-    },
-    {
-      "id": 131072,
-      "content": "<pad>",
+      "content": "<|pad|>",
       "single_word": false,
       "lstrip": false,
       "rstrip": false,
@@ -20289,7 +20280,7 @@
       ">>UNUSED_1894<<": 2020,
       ">>UNUSED_1895<<": 2021,
       ">>UNUSED_1896<<": 2022,
-      ">>UNUSED_1897<<": 2023,
+      "<|pad|>": 2023,
       "!": 2024,
       "\"": 2025,
       "#": 2026,
```
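The likely motivation: the vocabulary holds 131,072 entries (valid ids 0 through 131071), so the old `<pad>` at id 131072 pointed one past the embedding table, while `<|pad|>` reuses an unused in-range slot. A quick check of the new mapping, again with the base repo id as a stand-in:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct")  # stand-in repo id

# "<|pad|>" now occupies the formerly unused slot 2023, safely inside the
# 131072-entry vocabulary, so padded batches index a real embedding row.
assert tok.convert_tokens_to_ids("<|pad|>") == 2023
assert tok.convert_ids_to_tokens(2023) == "<|pad|>"
```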
    	
tokenizer_config.json

```diff
@@ -16186,15 +16186,7 @@
       "special": true
     },
     "2023": {
-      "content": ">>UNUSED_1897<<",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "131072": {
-      "content": "<pad>",
+      "content": "<|pad|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
@@ -16227,10 +16219,15 @@
     ">>PASSWORD<<",
     ">>KEY<<"
   ],
-  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}",
+  "chat_template": "{% if tools %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\nYou are an expert in composing functions. You are given a question and a set of possible functions. \nBased on the question, you will need to make one or more function/tool calls to achieve the purpose. \nIf none of the functions can be used, point it out and refuse to answer. \nIf the given question lacks the parameters required by the function, also point it out.\n\n You have access to the following tools:\n<tools>'  + tools|tojson + '</tools>\n\nThe output MUST strictly adhere to the following format, and NO other text MUST be included.\nThe example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make the tool calls an empty list [].\n<tool_call>[\n{\"name\": \"function_name1\", \"arguments\": {\"argument1\": \"value1\", \"argument2\": \"value2\"}},\n... (more tool calls as required)\n]</tool_call>' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}{% else %}{% for message in messages %}{% if message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'user' %}{{ '<|user|>\n' + message['content'] + '\n' }}{% elif message['role'] == 'assistant' %}{% if not loop.last %}{{ '<|assistant|>\n' + message['content'] + eos_token + '\n' }}{% else %}{{ '<|assistant|>\n' + message['content'] + eos_token }}{% endif %}{% endif %}{% if loop.last and add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}{% endfor %}{% endif %}",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "model_max_length": …,
-  "pad_token": "<pad>",
+  "extra_special_tokens": {},
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 32768,
+  "pad_token": "<|pad|>",
   "tokenizer_class": "PreTrainedTokenizerFast"
 }
```
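The new `chat_template` branches on `tools`: when a tool list is supplied it injects the function-calling instructions plus a `<tools>` block (rendered with `tools|tojson`) into the system turn, and otherwise the else-branch reproduces the previous plain `<|system|>`/`<|user|>`/`<|assistant|>` format. A sketch of exercising that template through `apply_chat_template`; the repo id and the `get_weather` schema are placeholders:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("tiiuae/Falcon3-7B-Instruct")  # stand-in repo id

# Hypothetical tool spec; the template serializes whatever list is passed
# via `tools|tojson`, so any JSON-serializable schema works.
tools = [{
    "name": "get_weather",
    "description": "Get the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}]

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather in Rabat?"},
]

# With tools, the system turn gains the function-calling instructions and the
# <tools>...</tools> block; note the template only attaches tools to a system
# message, so one is included here. add_generation_prompt appends "<|assistant|>\n".
prompt = tok.apply_chat_template(
    messages, tools=tools, tokenize=False, add_generation_prompt=True
)
print(prompt)
```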

