Upload folder using huggingface_hub
- .gitattributes +43 -35
- README.md +135 -0
- config.json +73 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja +12 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/config.json +73 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json +52 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/model.json +18 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx +3 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data +3 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json +30 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json +3 -0
- cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json +0 -0
- cuda/cuda-int4-rtn-block-32/chat_template.jinja +12 -0
- cuda/cuda-int4-rtn-block-32/config.json +73 -0
- cuda/cuda-int4-rtn-block-32/genai_config.json +52 -0
- cuda/cuda-int4-rtn-block-32/model.json +17 -0
- cuda/cuda-int4-rtn-block-32/model.onnx +3 -0
- cuda/cuda-int4-rtn-block-32/model.onnx.data +3 -0
- cuda/cuda-int4-rtn-block-32/special_tokens_map.json +30 -0
- cuda/cuda-int4-rtn-block-32/tokenizer.json +3 -0
- cuda/cuda-int4-rtn-block-32/tokenizer_config.json +0 -0
- directml/dml-int4-rtn-block-32/chat_template.jinja +12 -0
- directml/dml-int4-rtn-block-32/config.json +73 -0
- directml/dml-int4-rtn-block-32/genai_config.json +53 -0
- directml/dml-int4-rtn-block-32/model.json +17 -0
- directml/dml-int4-rtn-block-32/model.onnx +3 -0
- directml/dml-int4-rtn-block-32/model.onnx.data +3 -0
- directml/dml-int4-rtn-block-32/special_tokens_map.json +30 -0
- directml/dml-int4-rtn-block-32/tokenizer.json +3 -0
- directml/dml-int4-rtn-block-32/tokenizer_config.json +0 -0
- gpu/gpu-fp16/chat_template.jinja +12 -0
- gpu/gpu-fp16/config.json +73 -0
- gpu/gpu-fp16/genai_config.json +59 -0
- gpu/gpu-fp16/model.json +17 -0
- gpu/gpu-fp16/model.onnx +3 -0
- gpu/gpu-fp16/model.onnx.data +3 -0
- gpu/gpu-fp16/special_tokens_map.json +30 -0
- gpu/gpu-fp16/tokenizer.json +3 -0
- gpu/gpu-fp16/tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -1,35 +1,43 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cuda/cuda-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+cuda/cuda-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+directml/dml-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+directml/dml-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+gpu/gpu-fp16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+gpu/gpu-fp16/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,3 +1,138 @@
 ---
+library_name: onnx
+pipeline_tag: translation
+language:
+- ar
+- bg
+- zh
+- cs
+- da
+- nl
+- en
+- fi
+- fr
+- de
+- el
+- gu
+- he
+- hi
+- hu
+- id
+- it
+- ja
+- ko
+- fa
+- pl
+- pt
+- ro
+- ru
+- sk
+- es
+- sv
+- tl
+- th
+- tr
+- uk
+- vi
 license: gemma
+tags:
+- onnx
+- onnxruntime
+- optimum
+- translation
+- gemma
+- int4
+- quantized
+- cuda
+- directml
+base_model: google/gemma-3-4b-pt
+base_model_relation: quantized
+model-index:
+- name: YanoljaNEXT-Rosetta-4B-ONNX
+  results:
+  - task:
+      type: translation
+      name: Translation
+    metrics:
+    - type: bleu
+      value: 31.5
+      name: BLEU Score
 ---
+
+# YanoljaNEXT-Rosetta-4B-2510-ONNX
+
+## Introduction
+This repository hosts Pangaia Software's optimized versions of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model, built to accelerate inference with ONNX Runtime.
+
+The optimized models are published in ONNX format to run with ONNX Runtime on CPU and GPU across devices, including server platforms, Windows, Linux, and Mac desktops, and mobile CPUs, with the precision best suited to each target.
+
+Here are some of the optimized configurations we have added:
+
+1. ONNX model for int4 CPU: ONNX model for CPU and mobile using int4 quantization via RTN.
+2. ONNX model for int4 GPU: ONNX model for GPU using int4 quantization via RTN.
+
+## Model Run
+For CPU:
+
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cpu_and_mobile/* --local-dir .
+
+# Install the CPU package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai
+```
+
+For CUDA:
+
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cuda/* --local-dir .
+
+# Install the CUDA package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-cuda
+```
+
+For GPU (FP16; this variant also runs through the CUDA package):
+
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include gpu/* --local-dir .
+
+# Install the CUDA package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-cuda
+```
+
+For DirectML:
+
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include directml/* --local-dir .
+
+# Install the DML package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-directml
+```
+
+Execution:
+
+Refer to the [`ONNX Runtime GenAI`](https://github.com/microsoft/onnxruntime-genai) repo for the latest samples for model execution.
+
+**Note**: since this is a Gemma-based translation model, use the corresponding prompt template:
+
+```
+System = "<start_of_turn>instruction\n{{CONTENT}}<end_of_turn>\n",
+User = "<start_of_turn>source\n{{CONTENT}}<end_of_turn>\n",
+Assistant = "<start_of_turn>translation\n{{CONTENT}}<end_of_turn>\n",
+Stop = ["<end_of_turn>", "<start_of_turn>"]
+```
+
+## Model Description
+- Developed by: Pangaia Software
+- Model type: ONNX
+- License: gemma
+- Model Description: This is a conversion of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model (itself based on [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt)) for ONNX Runtime inference.
+
+**Disclaimer:** This model is only an optimization of the base model; any risk associated with it is the responsibility of the user. Please verify and test for your scenarios. Output may differ slightly from the base model with the optimizations applied.
+
+## License
+This model is released under the Gemma license, inherited from its base model, [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt). Please consult the official [Gemma license terms](https://ai.google.dev/gemma/terms) for detailed usage guidelines.
config.json
ADDED
@@ -0,0 +1,73 @@
+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}
cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja
ADDED
@@ -0,0 +1,12 @@
+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}
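
The template above maps the system/user/assistant roles onto Rosetta's instruction/source/translation turns. As a quick way to inspect the exact prompt string it produces, here is a small sketch that renders the file standalone with the jinja2 package; the message contents are made-up examples, and `bos_token` matches the value in special_tokens_map.json below.

```python
# Render chat_template.jinja on its own with plain jinja2.
from pathlib import Path
from jinja2 import Template

template = Template(Path("cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja").read_text())
print(template.render(
    bos_token="<bos>",
    add_generation_prompt=True,
    messages=[
        {"role": "system", "content": "Translate the source text to French."},
        {"role": "user", "content": "Good morning."},
    ],
))
# Expected output:
# <bos><start_of_turn>instruction
# Translate the source text to French.<end_of_turn>
# <start_of_turn>source
# Good morning.<end_of_turn>
# <start_of_turn>translation
```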
cpu_and_mobile/cpu-int4-rtn-block-32/config.json
ADDED
@@ -0,0 +1,73 @@
+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}
cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json
ADDED
@@ -0,0 +1,52 @@
+{
+  "model": {
+    "bos_token_id": 2,
+    "context_length": 131072,
+    "decoder": {
+      "session_options": {
+        "log_id": "onnxruntime-genai",
+        "provider_options": []
+      },
+      "filename": "model.onnx",
+      "head_size": 256,
+      "hidden_size": 2560,
+      "inputs": {
+        "input_ids": "input_ids",
+        "attention_mask": "attention_mask",
+        "past_key_names": "past_key_values.%d.key",
+        "past_value_names": "past_key_values.%d.value"
+      },
+      "outputs": {
+        "logits": "logits",
+        "present_key_names": "present.%d.key",
+        "present_value_names": "present.%d.value"
+      },
+      "num_attention_heads": 8,
+      "num_hidden_layers": 34,
+      "num_key_value_heads": 4
+    },
+    "eos_token_id": [
+      1,
+      106
+    ],
+    "pad_token_id": 0,
+    "type": "gemma3_text",
+    "vocab_size": 262208
+  },
+  "search": {
+    "diversity_penalty": 0.0,
+    "do_sample": true,
+    "early_stopping": true,
+    "length_penalty": 1.0,
+    "max_length": 131072,
+    "min_length": 0,
+    "no_repeat_ngram_size": 0,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "past_present_share_buffer": true,
+    "repetition_penalty": 1.0,
+    "temperature": 1.0,
+    "top_k": 64,
+    "top_p": 0.95
+  }
+}
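
The `search` block above supplies the default decoding settings (sampling with top_k 64 and top_p 0.95 at temperature 1.0, and a max_length equal to the full 131,072-token context). These defaults can be overridden per run without editing the file; below is a minimal sketch using the onnxruntime-genai Python API, with purely illustrative values.

```python
# A sketch of overriding genai_config.json's "search" defaults at run time.
import onnxruntime_genai as og

model = og.Model("cpu_and_mobile/cpu-int4-rtn-block-32")  # assumed local path
params = og.GeneratorParams(model)
params.set_search_options(
    do_sample=False,         # the config defaults to sampling (top_k=64, top_p=0.95)
    max_length=1024,         # far below the 131072 default, to bound latency
    repetition_penalty=1.1,  # mildly discourage loops in long translations
)
generator = og.Generator(model, params)
```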
cpu_and_mobile/cpu-int4-rtn-block-32/model.json
ADDED
@@ -0,0 +1,18 @@
+{
+  "Id": "Rosetta-4B-CPU-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B CPU INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (CPU, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "CPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 0,
+  "RamUsageInGB": 3,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32",
+  "BaseModel": "yanolja/YanoljaNEXT-Rosetta-4B-2510"
+}
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c299659d7eb9f1e78ef7743b4a6384627799b5defcffadf7d9c15728270fd288
+size 429364
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9caa05aedc17f751abe22fe1f8a5707f1f79a2108d97aab6ff43d4bc677d656a
+size 5379981312
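
The three-line files above are Git LFS pointer stubs rather than the weights themselves: each records the blob's SHA-256 `oid` and byte `size`. Here is a small self-contained sketch for checking that a downloaded blob matches its pointer; the path is an assumption about where the file was saved.

```python
# Verify a downloaded file against the oid/size recorded in its LFS pointer.
import hashlib
from pathlib import Path

def verify_lfs(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    blob = Path(blob_path)
    if blob.stat().st_size != expected_size:
        return False
    digest = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid

# Values copied from the model.onnx.data pointer above.
print(verify_lfs(
    "cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data",
    "9caa05aedc17f751abe22fe1f8a5707f1f79a2108d97aab6ff43d4bc677d656a",
    5379981312,
))
```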
cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
cuda/cuda-int4-rtn-block-32/chat_template.jinja
ADDED
@@ -0,0 +1,12 @@
+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}
cuda/cuda-int4-rtn-block-32/config.json
ADDED
@@ -0,0 +1,73 @@
+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}
cuda/cuda-int4-rtn-block-32/genai_config.json
ADDED
@@ -0,0 +1,52 @@
+{
+  "model": {
+    "bos_token_id": 2,
+    "context_length": 131072,
+    "decoder": {
+      "session_options": {
+        "log_id": "onnxruntime-genai",
+        "provider_options": []
+      },
+      "filename": "model.onnx",
+      "head_size": 256,
+      "hidden_size": 2560,
+      "inputs": {
+        "input_ids": "input_ids",
+        "attention_mask": "attention_mask",
+        "past_key_names": "past_key_values.%d.key",
+        "past_value_names": "past_key_values.%d.value"
+      },
+      "outputs": {
+        "logits": "logits",
+        "present_key_names": "present.%d.key",
+        "present_value_names": "present.%d.value"
+      },
+      "num_attention_heads": 8,
+      "num_hidden_layers": 34,
+      "num_key_value_heads": 4
+    },
+    "eos_token_id": [
+      1,
+      106
+    ],
+    "pad_token_id": 0,
+    "type": "gemma3_text",
+    "vocab_size": 262208
+  },
+  "search": {
+    "diversity_penalty": 0.0,
+    "do_sample": true,
+    "early_stopping": true,
+    "length_penalty": 1.0,
+    "max_length": 131072,
+    "min_length": 0,
+    "no_repeat_ngram_size": 0,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "past_present_share_buffer": true,
+    "repetition_penalty": 1.0,
+    "temperature": 1.0,
+    "top_k": 64,
+    "top_p": 0.95
+  }
+}
cuda/cuda-int4-rtn-block-32/model.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "Id": "Rosetta-4B-CUDA-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B CUDA INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 3,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32-AccLevel2"
+}
cuda/cuda-int4-rtn-block-32/model.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:eb870c287d7e8f29c8c1b4a51fb418e845c0babe65545f92f1e89fb81545970b
+size 536017
cuda/cuda-int4-rtn-block-32/model.onnx.data
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
+size 3660775424
cuda/cuda-int4-rtn-block-32/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
cuda/cuda-int4-rtn-block-32/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556
cuda/cuda-int4-rtn-block-32/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
directml/dml-int4-rtn-block-32/chat_template.jinja
ADDED
@@ -0,0 +1,12 @@
+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}
directml/dml-int4-rtn-block-32/config.json
ADDED
@@ -0,0 +1,73 @@
+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}
directml/dml-int4-rtn-block-32/genai_config.json
ADDED
@@ -0,0 +1,53 @@
+{
+  "model": {
+    "bos_token_id": 2,
+    "context_length": 131072,
+    "decoder": {
+      "session_options": {
+        "log_id": "onnxruntime-genai",
+        "provider_options": []
+      },
+      "filename": "model.onnx",
+      "head_size": 256,
+      "hidden_size": 2560,
+      "inputs": {
+        "input_ids": "input_ids",
+        "attention_mask": "attention_mask",
+        "position_ids": "position_ids",
+        "past_key_names": "past_key_values.%d.key",
+        "past_value_names": "past_key_values.%d.value"
+      },
+      "outputs": {
+        "logits": "logits",
+        "present_key_names": "present.%d.key",
+        "present_value_names": "present.%d.value"
+      },
+      "num_attention_heads": 8,
+      "num_hidden_layers": 34,
+      "num_key_value_heads": 4
+    },
+    "eos_token_id": [
+      1,
+      106
+    ],
+    "pad_token_id": 0,
+    "type": "gemma3_text",
+    "vocab_size": 262208
+  },
+  "search": {
+    "diversity_penalty": 0.0,
+    "do_sample": true,
+    "early_stopping": true,
+    "length_penalty": 1.0,
+    "max_length": 131072,
+    "min_length": 0,
+    "no_repeat_ngram_size": 0,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "past_present_share_buffer": true,
+    "repetition_penalty": 1.0,
+    "temperature": 1.0,
+    "top_k": 64,
+    "top_p": 0.95
+  }
+}
directml/dml-int4-rtn-block-32/model.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "Id": "Rosetta-4B-DML-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B DML INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (DML, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 3,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32-AccLevel2"
+}
directml/dml-int4-rtn-block-32/model.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f64be36fcb9768d8019b0bfc565e2d432cdc1c2ef43e3e3b2021abaf8b82efdf
+size 566579
directml/dml-int4-rtn-block-32/model.onnx.data
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
+size 3660775424
directml/dml-int4-rtn-block-32/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
directml/dml-int4-rtn-block-32/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556
directml/dml-int4-rtn-block-32/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
gpu/gpu-fp16/chat_template.jinja
ADDED
@@ -0,0 +1,12 @@
+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}
gpu/gpu-fp16/config.json
ADDED
@@ -0,0 +1,73 @@
+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}
gpu/gpu-fp16/genai_config.json
ADDED
@@ -0,0 +1,59 @@
+{
+  "model": {
+    "bos_token_id": 2,
+    "context_length": 131072,
+    "decoder": {
+      "session_options": {
+        "log_id": "onnxruntime-genai",
+        "provider_options": [
+          {
+            "cuda": {
+              "enable_cuda_graph": "0",
+              "enable_skip_layer_norm_strict_mode": "1"
+            }
+          }
+        ]
+      },
+      "filename": "model.onnx",
+      "head_size": 256,
+      "hidden_size": 2560,
+      "inputs": {
+        "input_ids": "input_ids",
+        "attention_mask": "attention_mask",
+        "past_key_names": "past_key_values.%d.key",
+        "past_value_names": "past_key_values.%d.value"
+      },
+      "outputs": {
+        "logits": "logits",
+        "present_key_names": "present.%d.key",
+        "present_value_names": "present.%d.value"
+      },
+      "num_attention_heads": 8,
+      "num_hidden_layers": 34,
+      "num_key_value_heads": 4
+    },
+    "eos_token_id": [
+      1,
+      106
+    ],
+    "pad_token_id": 0,
+    "type": "gemma3_text",
+    "vocab_size": 262208
+  },
+  "search": {
+    "diversity_penalty": 0.0,
+    "do_sample": true,
+    "early_stopping": true,
+    "length_penalty": 1.0,
+    "max_length": 131072,
+    "min_length": 0,
+    "no_repeat_ngram_size": 0,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "past_present_share_buffer": true,
+    "repetition_penalty": 1.0,
+    "temperature": 1.0,
+    "top_k": 64,
+    "top_p": 0.95
+  }
+}
gpu/gpu-fp16/model.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "Id": "Rosetta-4B-GPU-FP16",
+  "Name": "YanoljaNEXT Rosetta 4B GPU FP16",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, FP16)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 10,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "FP16"
+}
gpu/gpu-fp16/model.onnx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0964d2a9321c75a36ca568ee7f05b61e752be281b30dd388846e3cb478a1d5ad
+size 450718
gpu/gpu-fp16/model.onnx.data
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:32169c3f554df1afe4e5bd3871e282fcb36f3721e9063e6fe9c04fa4c72a6e3d
+size 9238020096
gpu/gpu-fp16/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
gpu/gpu-fp16/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556
gpu/gpu-fp16/tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.