PangaiaSoftware committed on Oct 21

Commit

8b63e92

verified ·

1 Parent(s): 87dcef3

Upload folder using huggingface_hub

Browse files

Files changed (39) hide show

.gitattributes +43 -35
README.md +135 -0
config.json +73 -0
cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja +12 -0
cpu_and_mobile/cpu-int4-rtn-block-32/config.json +73 -0
cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json +52 -0
cpu_and_mobile/cpu-int4-rtn-block-32/model.json +18 -0
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json +30 -0
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json +3 -0
cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json +0 -0
cuda/cuda-int4-rtn-block-32/chat_template.jinja +12 -0
cuda/cuda-int4-rtn-block-32/config.json +73 -0
cuda/cuda-int4-rtn-block-32/genai_config.json +52 -0
cuda/cuda-int4-rtn-block-32/model.json +17 -0
cuda/cuda-int4-rtn-block-32/model.onnx +3 -0
cuda/cuda-int4-rtn-block-32/model.onnx.data +3 -0
cuda/cuda-int4-rtn-block-32/special_tokens_map.json +30 -0
cuda/cuda-int4-rtn-block-32/tokenizer.json +3 -0
cuda/cuda-int4-rtn-block-32/tokenizer_config.json +0 -0
directml/dml-int4-rtn-block-32/chat_template.jinja +12 -0
directml/dml-int4-rtn-block-32/config.json +73 -0
directml/dml-int4-rtn-block-32/genai_config.json +53 -0
directml/dml-int4-rtn-block-32/model.json +17 -0
directml/dml-int4-rtn-block-32/model.onnx +3 -0
directml/dml-int4-rtn-block-32/model.onnx.data +3 -0
directml/dml-int4-rtn-block-32/special_tokens_map.json +30 -0
directml/dml-int4-rtn-block-32/tokenizer.json +3 -0
directml/dml-int4-rtn-block-32/tokenizer_config.json +0 -0
gpu/gpu-fp16/chat_template.jinja +12 -0
gpu/gpu-fp16/config.json +73 -0
gpu/gpu-fp16/genai_config.json +59 -0
gpu/gpu-fp16/model.json +17 -0
gpu/gpu-fp16/model.onnx +3 -0
gpu/gpu-fp16/model.onnx.data +3 -0
gpu/gpu-fp16/special_tokens_map.json +30 -0
gpu/gpu-fp16/tokenizer.json +3 -0
gpu/gpu-fp16/tokenizer_config.json +0 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,43 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+cuda/cuda-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+cuda/cuda-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+directml/dml-int4-rtn-block-32/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+directml/dml-int4-rtn-block-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+gpu/gpu-fp16/model.onnx.data filter=lfs diff=lfs merge=lfs -text
+gpu/gpu-fp16/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,138 @@
 ---
 license: gemma
 ---

 ---
+library_name: onnx
+pipeline_tag: translation
+language:
+  - ar
+  - bg
+  - zh
+  - cs
+  - da
+  - nl
+  - en
+  - fi
+  - fr
+  - de
+  - el
+  - gu
+  - he
+  - hi
+  - hu
+  - id
+  - it
+  - ja
+  - ko
+  - fa
+  - pl
+  - pt
+  - ro
+  - ru
+  - sk
+  - es
+  - sv
+  - tl
+  - th
+  - tr
+  - uk
+  - vi
 license: gemma
+tags:
+- onnx
+- onnxruntime
+- optimum
+- translation
+- gemma
+- int4
+- quantized
+- cuda
+- directml
+base_model: yanolja/YanoljaNEXT-Rosetta-4B-2510
+base_model_relation: quantized
+model-index:
+- name: YanoljaNEXT-Rosetta-4B-ONNX
+  results:
+  - task:
+      type: translation
+      name: Translation
+    metrics:
+    - type: bleu
+      value: 31.5
+      name: BLEU Score
 ---
+# YanoljaNEXT-Rosetta-4B-2510-ONNX
+## Introduction
+This repository hosts Pangaia Software's optimized versions of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model to accelerate inference with ONNX Runtime.
+Optimized models are published here in ONNX format to run with ONNX Runtime on CPU and GPU across devices, including server platforms, Windows, Linux and Mac desktops, and mobile CPUs, with the precision best suited to each of these targets.
+Here are some of the optimized configurations we have added:
+1. ONNX model for int4 CPU: ONNX model for CPU and mobile using int4 quantization via RTN.
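+2. ONNX model for int4 CUDA: ONNX model for CUDA GPUs using int4 quantization via RTN.
+3. ONNX model for int4 DirectML: ONNX model for DirectML GPUs using int4 quantization via RTN.
+4. ONNX model for fp16 GPU: ONNX model for GPU using fp16 precision.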
+## Model Run
+For CPU:
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cpu_and_mobile/* --local-dir .
+# Install the CPU package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai
+```
+For CUDA:
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include cuda/* --local-dir .
+# Install the CUDA package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-cuda
+```
+For GPU (FP16, runs on the CUDA execution provider):
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include gpu/* --local-dir .
+# Install the CUDA package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-cuda
+```
+For DirectML:
+```bash
+# Download the model directly using the Hugging Face CLI
+huggingface-cli download PangaiaSoftware/YanoljaNEXT-Rosetta-4B-onnx --include directml/* --local-dir .
+# Install the DML package of ONNX Runtime GenAI
+pip install --pre onnxruntime-genai-directml
+```
+Execution:
+Refer to the [`ONNX Runtime GenAI`](https://github.com/microsoft/onnxruntime-genai) repo for the latest samples for model execution.
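+**Note**: although this is a Gemma-based model, it does not use the standard Gemma `user`/`model` turns. Use the translation-specific prompt template below (`instruction`, `source` and `translation` roles), which matches the bundled `chat_template.jinja`: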
+```
+System = "<start_of_turn>instruction\n{{CONTENT}}<end_of_turn>\n",
+User = "<start_of_turn>source\n{{CONTENT}}<end_of_turn>\n",
+Assistant = "<start_of_turn>translation\n{{CONTENT}}<end_of_turn>\n",
+Stop = ["<end_of_turn>", "<start_of_turn>"]
+```
+## Model Description
+- Developed by: Pangaia Software
+- Model type: ONNX
+- License: gemma
+- Model Description: This is a conversion of the [`YanoljaNEXT-Rosetta-4B-2510`](https://huggingface.co/yanolja/YanoljaNEXT-Rosetta-4B-2510) model for ONNX Runtime inference, which in turn is based on the [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt) model.
+**Disclaimer:** This model is only an optimization of the base model; any risk associated with its use is the responsibility of the user. Please verify and test it for your scenarios. With the optimizations applied, its output may differ slightly from that of the base model.
+## License
+This model is released under the Gemma license, inherited from its base model, [`google/gemma-3-4b-pt`](https://huggingface.co/google/gemma-3-4b-pt). Please consult the official [Gemma license terms](https://ai.google.dev/gemma/terms) for detailed usage guidelines.

config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}

cpu_and_mobile/cpu-int4-rtn-block-32/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,12 @@

+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}

cpu_and_mobile/cpu-int4-rtn-block-32/config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}

cpu_and_mobile/cpu-int4-rtn-block-32/genai_config.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+    "model": {
+        "bos_token_id": 2,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "model.onnx",
+            "head_size": 256,
+            "hidden_size": 2560,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 8,
+            "num_hidden_layers": 34,
+            "num_key_value_heads": 4
+        },
+        "eos_token_id": [
+            1,
+            106
+        ],
+        "pad_token_id": 0,
+        "type": "gemma3_text",
+        "vocab_size": 262208
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": true,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 64,
+        "top_p": 0.95
+    }
+}

cpu_and_mobile/cpu-int4-rtn-block-32/model.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "Id": "Rosetta-4B-CPU-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B CPU INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (CPU, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "CPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 0,
+  "RamUsageInGB": 3,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32",
+  "BaseModel": "yanolja/YanoljaNEXT-Rosetta-4B-2510"
+}

cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c299659d7eb9f1e78ef7743b4a6384627799b5defcffadf7d9c15728270fd288
+size 429364

cpu_and_mobile/cpu-int4-rtn-block-32/model.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9caa05aedc17f751abe22fe1f8a5707f1f79a2108d97aab6ff43d4bc677d656a
+size 5379981312

cpu_and_mobile/cpu-int4-rtn-block-32/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556

cpu_and_mobile/cpu-int4-rtn-block-32/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

cuda/cuda-int4-rtn-block-32/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,12 @@

+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}

cuda/cuda-int4-rtn-block-32/config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}

cuda/cuda-int4-rtn-block-32/genai_config.json ADDED Viewed

	@@ -0,0 +1,52 @@

+{
+    "model": {
+        "bos_token_id": 2,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "model.onnx",
+            "head_size": 256,
+            "hidden_size": 2560,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 8,
+            "num_hidden_layers": 34,
+            "num_key_value_heads": 4
+        },
+        "eos_token_id": [
+            1,
+            106
+        ],
+        "pad_token_id": 0,
+        "type": "gemma3_text",
+        "vocab_size": 262208
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": true,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 64,
+        "top_p": 0.95
+    }
+}

cuda/cuda-int4-rtn-block-32/model.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "Id": "Rosetta-4B-CUDA-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B CUDA INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 3,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32-AccLevel2"
+}

cuda/cuda-int4-rtn-block-32/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eb870c287d7e8f29c8c1b4a51fb418e845c0babe65545f92f1e89fb81545970b
+size 536017

cuda/cuda-int4-rtn-block-32/model.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
+size 3660775424

cuda/cuda-int4-rtn-block-32/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

cuda/cuda-int4-rtn-block-32/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556

cuda/cuda-int4-rtn-block-32/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

directml/dml-int4-rtn-block-32/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,12 @@

+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}

directml/dml-int4-rtn-block-32/config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}

directml/dml-int4-rtn-block-32/genai_config.json ADDED Viewed

	@@ -0,0 +1,53 @@

+{
+    "model": {
+        "bos_token_id": 2,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": []
+            },
+            "filename": "model.onnx",
+            "head_size": 256,
+            "hidden_size": 2560,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "position_ids": "position_ids",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 8,
+            "num_hidden_layers": 34,
+            "num_key_value_heads": 4
+        },
+        "eos_token_id": [
+            1,
+            106
+        ],
+        "pad_token_id": 0,
+        "type": "gemma3_text",
+        "vocab_size": 262208
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": true,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 64,
+        "top_p": 0.95
+    }
+}

directml/dml-int4-rtn-block-32/model.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "Id": "Rosetta-4B-DML-INT4",
+  "Name": "YanoljaNEXT Rosetta 4B DML INT4",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (DML, INT4)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 3,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "INT4-RTN-Block32-AccLevel2"
+}

directml/dml-int4-rtn-block-32/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f64be36fcb9768d8019b0bfc565e2d432cdc1c2ef43e3e3b2021abaf8b82efdf
+size 566579

directml/dml-int4-rtn-block-32/model.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a9d6c064f0f8bbdd81cbaa1b1645b4844219470a6b6e262eb29c44abae22bde9
+size 3660775424

directml/dml-int4-rtn-block-32/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

directml/dml-int4-rtn-block-32/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556

directml/dml-int4-rtn-block-32/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpu/gpu-fp16/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,12 @@

+{{- bos_token -}}
+{%- for message in messages -%}
+<start_of_turn>
+{%- if message['role']=='system' -%}instruction{{ '\n' }}
+{%- elif message['role']=='user' -%}source{{ '\n' }}
+{%- elif message['role']=='assistant' -%}translation{{ '\n' }}
+{%- endif -%}
+{{- message['content'] | trim -}}<end_of_turn>{{ '\n' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+<start_of_turn>translation{{ '\n' }}
+{%- endif -%}

gpu/gpu-fp16/config.json ADDED Viewed

	@@ -0,0 +1,73 @@

+{
+  "_sliding_window_pattern": 6,
+  "architectures": [
+    "Gemma3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attn_logit_softcapping": null,
+  "bos_token_id": 2,
+  "cache_implementation": "hybrid",
+  "dtype": "bfloat16",
+  "eos_token_id": 106,
+  "final_logit_softcapping": null,
+  "head_dim": 256,
+  "hidden_activation": "gelu_pytorch_tanh",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 10240,
+  "layer_types": [
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "full_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention",
+    "sliding_attention"
+  ],
+  "max_position_embeddings": 131072,
+  "model_type": "gemma3_text",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 34,
+  "num_key_value_heads": 4,
+  "pad_token_id": 0,
+  "query_pre_attn_scalar": 256,
+  "rms_norm_eps": 1e-06,
+  "rope_local_base_freq": 10000.0,
+  "rope_scaling": {
+    "factor": 8.0,
+    "rope_type": "linear"
+  },
+  "rope_theta": 1000000.0,
+  "sliding_window": 1024,
+  "transformers_version": "4.56.1",
+  "use_cache": false,
+  "vocab_size": 262208
+}

gpu/gpu-fp16/genai_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+    "model": {
+        "bos_token_id": 2,
+        "context_length": 131072,
+        "decoder": {
+            "session_options": {
+                "log_id": "onnxruntime-genai",
+                "provider_options": [
+                    {
+                        "cuda": {
+                            "enable_cuda_graph": "0",
+                            "enable_skip_layer_norm_strict_mode": "1"
+                        }
+                    }
+                ]
+            },
+            "filename": "model.onnx",
+            "head_size": 256,
+            "hidden_size": 2560,
+            "inputs": {
+                "input_ids": "input_ids",
+                "attention_mask": "attention_mask",
+                "past_key_names": "past_key_values.%d.key",
+                "past_value_names": "past_key_values.%d.value"
+            },
+            "outputs": {
+                "logits": "logits",
+                "present_key_names": "present.%d.key",
+                "present_value_names": "present.%d.value"
+            },
+            "num_attention_heads": 8,
+            "num_hidden_layers": 34,
+            "num_key_value_heads": 4
+        },
+        "eos_token_id": [
+            1,
+            106
+        ],
+        "pad_token_id": 0,
+        "type": "gemma3_text",
+        "vocab_size": 262208
+    },
+    "search": {
+        "diversity_penalty": 0.0,
+        "do_sample": true,
+        "early_stopping": true,
+        "length_penalty": 1.0,
+        "max_length": 131072,
+        "min_length": 0,
+        "no_repeat_ngram_size": 0,
+        "num_beams": 1,
+        "num_return_sequences": 1,
+        "past_present_share_buffer": true,
+        "repetition_penalty": 1.0,
+        "temperature": 1.0,
+        "top_k": 64,
+        "top_p": 0.95
+    }
+}

gpu/gpu-fp16/model.json ADDED Viewed

	@@ -0,0 +1,17 @@

+{
+  "Id": "Rosetta-4B-GPU-FP16",
+  "Name": "YanoljaNEXT Rosetta 4B GPU FP16",
+  "DisplayName": "YanoljaNEXT Rosetta 4B (GPU, FP16)",
+  "HuggingFaceRepoId": "",
+  "RepoSubFolder": "",
+  "Accelerator": "GPU",
+  "Architecture": "Gemma3",
+  "VramUsageInGB": 10,
+  "RamUsageInGB": 1,
+  "ContextWindowTokens": 131072,
+  "OriginalModelSize": 4000000000,
+  "MaxCharacters": 524288,
+  "ModelType": "Translation",
+  "LanguagePairs": 31,
+  "QuantizationType": "FP16"
+}

gpu/gpu-fp16/model.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0964d2a9321c75a36ca568ee7f05b61e752be281b30dd388846e3cb478a1d5ad
+size 450718

gpu/gpu-fp16/model.onnx.data ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32169c3f554df1afe4e5bd3871e282fcb36f3721e9063e6fe9c04fa4c72a6e3d
+size 9238020096

gpu/gpu-fp16/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<end_of_turn>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

gpu/gpu-fp16/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a6b7c9043ba3b559295e6032728ca44ba21879713a32d4a35240794b2ed66d78
+size 33384556

gpu/gpu-fp16/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff