PangaiaSoftware committed on
Commit
8b63e92
·
verified ·
1 Parent(s): 87dcef3

Upload folder using huggingface_hub

Browse files
Files changed (39) hide show
  1. .gitattributes +43 -35
  2. README.md +135 -0
  3. config.json +73 -0
  4. cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja +12 -0
  5. cpu_and_mobile/cpu-int4-rtn-block-32/config.json +73 -0
  6. cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json +52 -0
  7. cpu_and_mobile/cpu-int4-rtn-block-32/model.json +18 -0
  8. cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx +3 -0
  9. cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data +3 -0
  10. cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json +30 -0
  11. cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json +3 -0
  12. cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json +0 -0
  13. cuda/cuda-int4-rtn-block-32/chat_template.jinja +12 -0
  14. cuda/cuda-int4-rtn-block-32/config.json +73 -0
  15. cuda/cuda-int4-rtn-block-32/genai_config.json +52 -0
  16. cuda/cuda-int4-rtn-block-32/model.json +17 -0
  17. cuda/cuda-int4-rtn-block-32/model.onnx +3 -0
  18. cuda/cuda-int4-rtn-block-32/model.onnx.data +3 -0
  19. cuda/cuda-int4-rtn-block-32/special_tokens_map.json +30 -0
  20. cuda/cuda-int4-rtn-block-32/tokenizer.json +3 -0
  21. cuda/cuda-int4-rtn-block-32/tokenizer_config.json +0 -0
  22. directml/dml-int4-rtn-block-32/chat_template.jinja +12 -0
  23. directml/dml-int4-rtn-block-32/config.json +73 -0
  24. directml/dml-int4-rtn-block-32/genai_config.json +53 -0
  25. directml/dml-int4-rtn-block-32/model.json +17 -0
  26. directml/dml-int4-rtn-block-32/model.onnx +3 -0
  27. directml/dml-int4-rtn-block-32/model.onnx.data +3 -0
  28. directml/dml-int4-rtn-block-32/special_tokens_map.json +30 -0
  29. directml/dml-int4-rtn-block-32/tokenizer.json +3 -0
  30. directml/dml-int4-rtn-block-32/tokenizer_config.json +0 -0
  31. gpu/gpu-fp16/chat_template.jinja +12 -0
  32. gpu/gpu-fp16/config.json +73 -0
  33. gpu/gpu-fp16/genai_config.json +59 -0
  34. gpu/gpu-fp16/model.json +17 -0
  35. gpu/gpu-fp16/model.onnx +3 -0
  36. gpu/gpu-fp16/model.onnx.data +3 -0
  37. gpu/gpu-fp16/special_tokens_map.json +30 -0
  38. gpu/gpu-fp16/tokenizer.json +3 -0
  39. gpu/gpu-fp16/tokenizer_config.json +0 -0
.gitattributes CHANGED
@@ -1,35 +1,43 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
37
+ cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ cuda/cuda-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
39
+ cuda/cuda-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ directml/dml-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
41
+ directml/dml-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ gpu/gpu-fp16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
43
+ gpu/gpu-fp16/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,138 @@
1
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  license: gemma
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ library_name: onnx
3
+ pipeline_tag: translation
4
+ language:
5
+ - ar
6
+ - bg
7
+ - zh
8
+ - cs
9
+ - da
10
+ - nl
11
+ - en
12
+ - fi
13
+ - fr
14
+ - de
15
+ - el
16
+ - gu
17
+ - he
18
+ - hi
19
+ - hu
20
+ - id
21
+ - it
22
+ - ja
23
+ - ko
24
+ - fa
25
+ - pl
26
+ - pt
27
+ - ro
28
+ - ru
29
+ - sk
30
+ - es
31
+ - sv
32
+ - tl
33
+ - th
34
+ - tr
35
+ - uk
36
+ - vi
37
  license: gemma
38
+ tags:
39
+ - onnx
40
+ - onnxruntime
41
+ - optimum
42
+ - translation
43
+ - gemma
44
+ - int4
45
+ - quantized
46
+ - cuda
47
+ - directml
48
+ base_model: google/gemma-3-4b-pt
49
+ base_model_relation: quantized
50
+ model-index:
51
+ - name: YanoljaNEXT-Rosetta-4B-ONNX
52
+ results:
53
+ - task:
54
+ type: translation
55
+ name: Translation
56
+ metrics:
57
+ - type: bleu
58
+ value: 31.5
59
+ name: BLEU Score
60
  ---
61
+
62
+ # YanoljaNEXT-Rosetta-4B-2510-ONNX
63
+
64
+ ## Introduction
65
+ This repository hosts Pangaia Software's optimized versions of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model to accelerate inference with ONNX Runtime.
66
+
67
+ Optimized models are published here in ONNX format to run with ONNX Runtime on CPU and GPU across devices, including server platforms, Windows, Linux and Mac desktops, and mobile CPUs, with the precision best suited to each of these targets.
68
+
69
+ Here are some of the optimized configurations we have added:
70
+
71
+ 1. int4 CPU: ONNX model for CPU and mobile targets, quantized to int4 with RTN (round-to-nearest), block size 32.
72
+ 2. int4 GPU: ONNX models for GPU (CUDA and DirectML), quantized to int4 with RTN (round-to-nearest), block size 32.
73
+
74
+ ## Model Run
75
+ For CPU:
76
+
77
+ ```bash
78
+ # Download the model directly using the Hugging Face CLI
79
+ huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cpu_and_mobile/* --local-dir .
80
+
81
+ # Install the CPU package of ONNX Runtime GenAI
82
+ pip install --pre onnxruntime-genai
83
+ ```
84
+
85
+ For CUDA:
86
+
87
+ ```bash
88
+ # Download the model directly using the Hugging Face CLI
89
+ huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cuda/* --local-dir .
90
+
91
+ # Install the CUDA package of ONNX Runtime GenAI
92
+ pip install --pre onnxruntime-genai-cuda
93
+ ```
94
+
95
+ For GPU (FP16):
96
+
97
+ ```bash
98
+ # Download the model directly using the Hugging Face CLI
99
+ huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include gpu/* --local-dir .
100
+
101
+ # Install the CUDA package of ONNX Runtime GenAI
102
+ pip install --pre onnxruntime-genai-cuda
103
+ ```
104
+
105
+ For DirectML:
106
+
107
+ ```bash
108
+ # Download the model directly using the Hugging Face CLI
109
+ huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include directml/* --local-dir .
110
+
111
+ # Install the DML package of ONNX Runtime GenAI
112
+ pip install --pre onnxruntime-genai-directml
113
+ ```
114
+
115
+ Execution:
116
+
117
+ Refer to the [`ONNX Runtime GenAI`](https://github.com/microsoft/onnxruntime-genai) repo for the latest samples for model execution.
118
+
119
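+ **Note**: this model uses Gemma-style turn markers with its own role names (`instruction`, `source`, `translation`) rather than the standard Gemma chat roles, so use the following prompt template: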
120
+
121
+ ```
122
+ System = "<start_of_turn>instruction\n{{CONTENT}}<end_of_turn>\n",
123
+ User = "<start_of_turn>source\n{{CONTENT}}<end_of_turn>\n",
124
+ Assistant = "<start_of_turn>translation\n{{CONTENT}}<end_of_turn>\n",
125
+ Stop = ["<end_of_turn>", "<start_of_turn>"]
126
+ ```
127
+
128
+
129
+ ## Model Description
130
+ - Developed by: Pangaia Software
131
+ - Model type: ONNX
132
+ - License: gemma
133
+ - Model Description: This is a conversion of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model for ONNX Runtime inference. The original model is in turn based on the [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt) model.
134
+
135
+ **Disclaimer:** This model is only an optimization of the base model; any risk associated with the model is the responsibility of its user. Please verify and test it for your scenarios. With the optimizations applied, the output may differ slightly from that of the base model.
136
+
137
+ ## License
138
+ This model is released under the Gemma license, inherited from its base model, [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt). Please consult the official [Gemma license terms](https://ai.google.dev/gemma/terms) for detailed usage guidelines.
config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 106,
13
+ "final_logit_softcapping": null,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2560,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 10240,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention"
54
+ ],
55
+ "max_position_embeddings": 131072,
56
+ "model_type": "gemma3_text",
57
+ "num_attention_heads": 8,
58
+ "num_hidden_layers": 34,
59
+ "num_key_value_heads": 4,
60
+ "pad_token_id": 0,
61
+ "query_pre_attn_scalar": 256,
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_local_base_freq": 10000.0,
64
+ "rope_scaling": {
65
+ "factor": 8.0,
66
+ "rope_type": "linear"
67
+ },
68
+ "rope_theta": 1000000.0,
69
+ "sliding_window": 1024,
70
+ "transformers_version": "4.56.1",
71
+ "use_cache": false,
72
+ "vocab_size": 262208
73
+ }
cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- for message in messages -%}
3
+ <start_of_turn>
4
+ {%- if message['role']=='system' -%}instruction{{ '\n' }}
5
+ {%- elif message['role']=='user' -%}source{{ '\n' }}
6
+ {%- elif message['role']=='assistant' -%}translation{{ '\n' }}
7
+ {%- endif -%}
8
+ {{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
9
+ {%- endfor -%}
10
+ {%- if add_generation_prompt -%}
11
+ <start_of_turn>translation{{ '\n' }}
12
+ {%- endif -%}
cpu_and_mobile/cpu-int4-rtn-block-32/config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 106,
13
+ "final_logit_softcapping": null,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2560,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 10240,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention"
54
+ ],
55
+ "max_position_embeddings": 131072,
56
+ "model_type": "gemma3_text",
57
+ "num_attention_heads": 8,
58
+ "num_hidden_layers": 34,
59
+ "num_key_value_heads": 4,
60
+ "pad_token_id": 0,
61
+ "query_pre_attn_scalar": 256,
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_local_base_freq": 10000.0,
64
+ "rope_scaling": {
65
+ "factor": 8.0,
66
+ "rope_type": "linear"
67
+ },
68
+ "rope_theta": 1000000.0,
69
+ "sliding_window": 1024,
70
+ "transformers_version": "4.56.1",
71
+ "use_cache": false,
72
+ "vocab_size": 262208
73
+ }
cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 2,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 256,
12
+ "hidden_size": 2560,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "past_key_names": "past_key_values.%d.key",
17
+ "past_value_names": "past_key_values.%d.value"
18
+ },
19
+ "outputs": {
20
+ "logits": "logits",
21
+ "present_key_names": "present.%d.key",
22
+ "present_value_names": "present.%d.value"
23
+ },
24
+ "num_attention_heads": 8,
25
+ "num_hidden_layers": 34,
26
+ "num_key_value_heads": 4
27
+ },
28
+ "eos_token_id": [
29
+ 1,
30
+ 106
31
+ ],
32
+ "pad_token_id": 0,
33
+ "type": "gemma3_text",
34
+ "vocab_size": 262208
35
+ },
36
+ "search": {
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": true,
39
+ "early_stopping": true,
40
+ "length_penalty": 1.0,
41
+ "max_length": 131072,
42
+ "min_length": 0,
43
+ "no_repeat_ngram_size": 0,
44
+ "num_beams": 1,
45
+ "num_return_sequences": 1,
46
+ "past_present_share_buffer": true,
47
+ "repetition_penalty": 1.0,
48
+ "temperature": 1.0,
49
+ "top_k": 64,
50
+ "top_p": 0.95
51
+ }
52
+ }
cpu_and_mobile/cpu-int4-rtn-block-32/model.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Id": "Rosetta-4B-CPU-INT4",
3
+ "Name": "YanoljaNEXT Rosetta 4B CPU INT4",
4
+ "DisplayName": "YanoljaNEXT Rosetta 4B (CPU, INT4)",
5
+ "HuggingFaceRepoId": "",
6
+ "RepoSubFolder": "",
7
+ "Accelerator": "CPU",
8
+ "Architecture": "Gemma3",
9
+ "VramUsageInGB": 0,
10
+ "RamUsageInGB": 3,
11
+ "ContextWindowTokens": 131072,
12
+ "OriginalModelSize": 4000000000,
13
+ "MaxCharacters": 524288,
14
+ "ModelType": "Translation",
15
+ "LanguagePairs": 31,
16
+ "QuantizationType": "INT4-RTN-Block32",
17
+ "BaseModel": "yanolja/YanoljaNEXT-Rosetta-4B-2510"
18
+ }
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c299659d7eb9f1e78ef7743b4a6384627799b5defcffadf7d9c15728270fd288
3
+ size 429364
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9caa05aedc17f751abe22fe1f8a5707f1f79a2108d97aab6ff43d4bc677d656a
3
+ size 5379981312
cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<end_of_turn>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
3
+ size 33384556
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
cuda/cuda-int4-rtn-block-32/chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- for message in messages -%}
3
+ <start_of_turn>
4
+ {%- if message['role']=='system' -%}instruction{{ '\n' }}
5
+ {%- elif message['role']=='user' -%}source{{ '\n' }}
6
+ {%- elif message['role']=='assistant' -%}translation{{ '\n' }}
7
+ {%- endif -%}
8
+ {{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
9
+ {%- endfor -%}
10
+ {%- if add_generation_prompt -%}
11
+ <start_of_turn>translation{{ '\n' }}
12
+ {%- endif -%}
cuda/cuda-int4-rtn-block-32/config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 106,
13
+ "final_logit_softcapping": null,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2560,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 10240,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention"
54
+ ],
55
+ "max_position_embeddings": 131072,
56
+ "model_type": "gemma3_text",
57
+ "num_attention_heads": 8,
58
+ "num_hidden_layers": 34,
59
+ "num_key_value_heads": 4,
60
+ "pad_token_id": 0,
61
+ "query_pre_attn_scalar": 256,
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_local_base_freq": 10000.0,
64
+ "rope_scaling": {
65
+ "factor": 8.0,
66
+ "rope_type": "linear"
67
+ },
68
+ "rope_theta": 1000000.0,
69
+ "sliding_window": 1024,
70
+ "transformers_version": "4.56.1",
71
+ "use_cache": false,
72
+ "vocab_size": 262208
73
+ }
cuda/cuda-int4-rtn-block-32/genai_config.json ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 2,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 256,
12
+ "hidden_size": 2560,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "past_key_names": "past_key_values.%d.key",
17
+ "past_value_names": "past_key_values.%d.value"
18
+ },
19
+ "outputs": {
20
+ "logits": "logits",
21
+ "present_key_names": "present.%d.key",
22
+ "present_value_names": "present.%d.value"
23
+ },
24
+ "num_attention_heads": 8,
25
+ "num_hidden_layers": 34,
26
+ "num_key_value_heads": 4
27
+ },
28
+ "eos_token_id": [
29
+ 1,
30
+ 106
31
+ ],
32
+ "pad_token_id": 0,
33
+ "type": "gemma3_text",
34
+ "vocab_size": 262208
35
+ },
36
+ "search": {
37
+ "diversity_penalty": 0.0,
38
+ "do_sample": true,
39
+ "early_stopping": true,
40
+ "length_penalty": 1.0,
41
+ "max_length": 131072,
42
+ "min_length": 0,
43
+ "no_repeat_ngram_size": 0,
44
+ "num_beams": 1,
45
+ "num_return_sequences": 1,
46
+ "past_present_share_buffer": true,
47
+ "repetition_penalty": 1.0,
48
+ "temperature": 1.0,
49
+ "top_k": 64,
50
+ "top_p": 0.95
51
+ }
52
+ }
cuda/cuda-int4-rtn-block-32/model.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Id": "Rosetta-4B-CUDA-INT4",
3
+ "Name": "YanoljaNEXT Rosetta 4B CUDA INT4",
4
+ "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, INT4)",
5
+ "HuggingFaceRepoId": "",
6
+ "RepoSubFolder": "",
7
+ "Accelerator": "GPU",
8
+ "Architecture": "Gemma3",
9
+ "VramUsageInGB": 3,
10
+ "RamUsageInGB": 1,
11
+ "ContextWindowTokens": 131072,
12
+ "OriginalModelSize": 4000000000,
13
+ "MaxCharacters": 524288,
14
+ "ModelType": "Translation",
15
+ "LanguagePairs": 31,
16
+ "QuantizationType": "INT4-RTN-Block32-AccLevel2"
17
+ }
cuda/cuda-int4-rtn-block-32/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb870c287d7e8f29c8c1b4a51fb418e845c0babe65545f92f1e89fb81545970b
3
+ size 536017
cuda/cuda-int4-rtn-block-32/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
3
+ size 3660775424
cuda/cuda-int4-rtn-block-32/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<end_of_turn>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
cuda/cuda-int4-rtn-block-32/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
3
+ size 33384556
cuda/cuda-int4-rtn-block-32/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
directml/dml-int4-rtn-block-32/chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- for message in messages -%}
3
+ <start_of_turn>
4
+ {%- if message['role']=='system' -%}instruction{{ '\n' }}
5
+ {%- elif message['role']=='user' -%}source{{ '\n' }}
6
+ {%- elif message['role']=='assistant' -%}translation{{ '\n' }}
7
+ {%- endif -%}
8
+ {{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
9
+ {%- endfor -%}
10
+ {%- if add_generation_prompt -%}
11
+ <start_of_turn>translation{{ '\n' }}
12
+ {%- endif -%}
directml/dml-int4-rtn-block-32/config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 106,
13
+ "final_logit_softcapping": null,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2560,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 10240,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention"
54
+ ],
55
+ "max_position_embeddings": 131072,
56
+ "model_type": "gemma3_text",
57
+ "num_attention_heads": 8,
58
+ "num_hidden_layers": 34,
59
+ "num_key_value_heads": 4,
60
+ "pad_token_id": 0,
61
+ "query_pre_attn_scalar": 256,
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_local_base_freq": 10000.0,
64
+ "rope_scaling": {
65
+ "factor": 8.0,
66
+ "rope_type": "linear"
67
+ },
68
+ "rope_theta": 1000000.0,
69
+ "sliding_window": 1024,
70
+ "transformers_version": "4.56.1",
71
+ "use_cache": false,
72
+ "vocab_size": 262208
73
+ }
directml/dml-int4-rtn-block-32/genai_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 2,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": []
9
+ },
10
+ "filename": "model.onnx",
11
+ "head_size": 256,
12
+ "hidden_size": 2560,
13
+ "inputs": {
14
+ "input_ids": "input_ids",
15
+ "attention_mask": "attention_mask",
16
+ "position_ids": "position_ids",
17
+ "past_key_names": "past_key_values.%d.key",
18
+ "past_value_names": "past_key_values.%d.value"
19
+ },
20
+ "outputs": {
21
+ "logits": "logits",
22
+ "present_key_names": "present.%d.key",
23
+ "present_value_names": "present.%d.value"
24
+ },
25
+ "num_attention_heads": 8,
26
+ "num_hidden_layers": 34,
27
+ "num_key_value_heads": 4
28
+ },
29
+ "eos_token_id": [
30
+ 1,
31
+ 106
32
+ ],
33
+ "pad_token_id": 0,
34
+ "type": "gemma3_text",
35
+ "vocab_size": 262208
36
+ },
37
+ "search": {
38
+ "diversity_penalty": 0.0,
39
+ "do_sample": true,
40
+ "early_stopping": true,
41
+ "length_penalty": 1.0,
42
+ "max_length": 131072,
43
+ "min_length": 0,
44
+ "no_repeat_ngram_size": 0,
45
+ "num_beams": 1,
46
+ "num_return_sequences": 1,
47
+ "past_present_share_buffer": true,
48
+ "repetition_penalty": 1.0,
49
+ "temperature": 1.0,
50
+ "top_k": 64,
51
+ "top_p": 0.95
52
+ }
53
+ }
directml/dml-int4-rtn-block-32/model.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Id": "Rosetta-4B-DML-INT4",
3
+ "Name": "YanoljaNEXT Rosetta 4B DML INT4",
4
+ "DisplayName": "YanoljaNEXT Rosetta 4B (DML, INT4)",
5
+ "HuggingFaceRepoId": "",
6
+ "RepoSubFolder": "",
7
+ "Accelerator": "GPU",
8
+ "Architecture": "Gemma3",
9
+ "VramUsageInGB": 3,
10
+ "RamUsageInGB": 1,
11
+ "ContextWindowTokens": 131072,
12
+ "OriginalModelSize": 4000000000,
13
+ "MaxCharacters": 524288,
14
+ "ModelType": "Translation",
15
+ "LanguagePairs": 31,
16
+ "QuantizationType": "INT4-RTN-Block32-AccLevel2"
17
+ }
directml/dml-int4-rtn-block-32/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f64be36fcb9768d8019b0bfc565e2d432cdc1c2ef43e3e3b2021abaf8b82efdf
3
+ size 566579
directml/dml-int4-rtn-block-32/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
3
+ size 3660775424
directml/dml-int4-rtn-block-32/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<end_of_turn>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
directml/dml-int4-rtn-block-32/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
3
+ size 33384556
directml/dml-int4-rtn-block-32/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
gpu/gpu-fp16/chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- for message in messages -%}
3
+ <start_of_turn>
4
+ {%- if message['role']=='system' -%}instruction{{ '\n' }}
5
+ {%- elif message['role']=='user' -%}source{{ '\n' }}
6
+ {%- elif message['role']=='assistant' -%}translation{{ '\n' }}
7
+ {%- endif -%}
8
+ {{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
9
+ {%- endfor -%}
10
+ {%- if add_generation_prompt -%}
11
+ <start_of_turn>translation{{ '\n' }}
12
+ {%- endif -%}
gpu/gpu-fp16/config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 106,
13
+ "final_logit_softcapping": null,
14
+ "head_dim": 256,
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 2560,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 10240,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "sliding_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "sliding_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "sliding_attention"
54
+ ],
55
+ "max_position_embeddings": 131072,
56
+ "model_type": "gemma3_text",
57
+ "num_attention_heads": 8,
58
+ "num_hidden_layers": 34,
59
+ "num_key_value_heads": 4,
60
+ "pad_token_id": 0,
61
+ "query_pre_attn_scalar": 256,
62
+ "rms_norm_eps": 1e-06,
63
+ "rope_local_base_freq": 10000.0,
64
+ "rope_scaling": {
65
+ "factor": 8.0,
66
+ "rope_type": "linear"
67
+ },
68
+ "rope_theta": 1000000.0,
69
+ "sliding_window": 1024,
70
+ "transformers_version": "4.56.1",
71
+ "use_cache": false,
72
+ "vocab_size": 262208
73
+ }
gpu/gpu-fp16/genai_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "bos_token_id": 2,
4
+ "context_length": 131072,
5
+ "decoder": {
6
+ "session_options": {
7
+ "log_id": "onnxruntime-genai",
8
+ "provider_options": [
9
+ {
10
+ "cuda": {
11
+ "enable_cuda_graph": "0",
12
+ "enable_skip_layer_norm_strict_mode": "1"
13
+ }
14
+ }
15
+ ]
16
+ },
17
+ "filename": "model.onnx",
18
+ "head_size": 256,
19
+ "hidden_size": 2560,
20
+ "inputs": {
21
+ "input_ids": "input_ids",
22
+ "attention_mask": "attention_mask",
23
+ "past_key_names": "past_key_values.%d.key",
24
+ "past_value_names": "past_key_values.%d.value"
25
+ },
26
+ "outputs": {
27
+ "logits": "logits",
28
+ "present_key_names": "present.%d.key",
29
+ "present_value_names": "present.%d.value"
30
+ },
31
+ "num_attention_heads": 8,
32
+ "num_hidden_layers": 34,
33
+ "num_key_value_heads": 4
34
+ },
35
+ "eos_token_id": [
36
+ 1,
37
+ 106
38
+ ],
39
+ "pad_token_id": 0,
40
+ "type": "gemma3_text",
41
+ "vocab_size": 262208
42
+ },
43
+ "search": {
44
+ "diversity_penalty": 0.0,
45
+ "do_sample": true,
46
+ "early_stopping": true,
47
+ "length_penalty": 1.0,
48
+ "max_length": 131072,
49
+ "min_length": 0,
50
+ "no_repeat_ngram_size": 0,
51
+ "num_beams": 1,
52
+ "num_return_sequences": 1,
53
+ "past_present_share_buffer": true,
54
+ "repetition_penalty": 1.0,
55
+ "temperature": 1.0,
56
+ "top_k": 64,
57
+ "top_p": 0.95
58
+ }
59
+ }
gpu/gpu-fp16/model.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Id": "Rosetta-4B-GPU-FP16",
3
+ "Name": "YanoljaNEXT Rosetta 4B GPU FP16",
4
+ "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, FP16)",
5
+ "HuggingFaceRepoId": "",
6
+ "RepoSubFolder": "",
7
+ "Accelerator": "GPU",
8
+ "Architecture": "Gemma3",
9
+ "VramUsageInGB": 10,
10
+ "RamUsageInGB": 1,
11
+ "ContextWindowTokens": 131072,
12
+ "OriginalModelSize": 4000000000,
13
+ "MaxCharacters": 524288,
14
+ "ModelType": "Translation",
15
+ "LanguagePairs": 31,
16
+ "QuantizationType": "FP16"
17
+ }
gpu/gpu-fp16/model.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0964d2a9321c75a36ca568ee7f05b61e752be281b30dd388846e3cb478a1d5ad
3
+ size 450718
gpu/gpu-fp16/model.onnx.data ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32169c3f554df1afe4e5bd3871e282fcb36f3721e9063e6fe9c04fa4c72a6e3d
3
+ size 9238020096
gpu/gpu-fp16/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<bos>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<end_of_turn>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<pad>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
gpu/gpu-fp16/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
3
+ size 33384556
gpu/gpu-fp16/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff