Upload 11 files
uploading models
- .gitattributes +1 -0
- README.md +639 -0
- config.json +10 -0
- generation_config.json +8 -0
- model-00001-of-00002.safetensors +3 -0
- model-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +734 -0
- preprocessor_config.json +25 -0
- special_tokens_map.json +39 -0
- tokenizer.json +3 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,639 @@
---
library_name: transformers
license: gemma
pipeline_tag: image-text-to-text
extra_gated_heading: Access PaliGemma on Hugging Face
extra_gated_prompt: To access PaliGemma on Hugging Face, you’re required to review
  and agree to Google’s usage license. To do this, please ensure you’re logged-in
  to Hugging Face and click below. Requests are processed immediately.
extra_gated_button_content: Acknowledge license
---

# PaliGemma 2 model card

**Model page:** [PaliGemma](https://ai.google.dev/gemma/docs/paligemma)

Transformers PaliGemma 2 3B weights fine-tuned on a mixture of academic tasks using 448x448 input images.
PaliGemma 2 **mix** checkpoints are fine-tuned on a diverse set of tasks and are ready to use out of the box, while **pt** checkpoints are pre-trained and intended for further fine-tuning. These tasks include short and long captioning, optical character recognition, question answering, object detection and segmentation, and more.
The model is available in the `bfloat16` format for research purposes only.

**Resources and technical documentation:**

* [PaliGemma 2 on Kaggle](https://www.kaggle.com/models/google/paligemma-2)
* [Responsible Generative AI Toolkit](https://ai.google.dev/responsible)

**Terms of Use:** [Terms](https://ai.google.dev/gemma/terms)

**Authors:** Google

## Model information

### Model summary

PaliGemma 2 is an update of the [PaliGemma](https://arxiv.org/abs/2407.07726)
vision-language model (VLM) which incorporates the capabilities of the
[Gemma 2](https://arxiv.org/abs/2408.00118) models. The PaliGemma family of
models is inspired by [PaLI-3](https://arxiv.org/abs/2310.09199) and based on
open components such as the [SigLIP](https://arxiv.org/abs/2303.15343) vision
model and [Gemma 2](https://arxiv.org/abs/2408.00118) language models. It takes
both image and text as input and generates text as output, supporting multiple
languages. It is designed for class-leading fine-tune performance on a wide
range of vision-language tasks such as image and short video captioning, visual
question answering, text reading, object detection and object segmentation.

#### Model architecture

PaliGemma 2 is the composition of a
[Transformer decoder](https://arxiv.org/abs/1706.03762) and a
[Vision Transformer image encoder](https://arxiv.org/abs/2010.11929).
The text decoder is initialized from
[Gemma 2](https://ai.google.dev/gemma/docs/base) in the 2B, 9B, and 27B
parameter sizes. The image encoder is initialized from
[SigLIP-So400m/14](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/SigLIP_demo.ipynb).
Similar to the original PaliGemma model, PaliGemma 2 is trained following the
[PaLI-3](https://arxiv.org/abs/2310.09199) recipes.

#### Inputs and outputs

* **Input:** Image and text string, such as a prompt to caption the image, or
  a question.
* **Output:** Generated text in response to the input, such as a caption of
  the image, an answer to a question, a list of object bounding box
  coordinates, or segmentation codewords.

### Model data

#### Pre-train datasets

PaliGemma 2 is pre-trained on the following mixture of datasets:

* **WebLI:** [WebLI (Web Language Image)](https://arxiv.org/abs/2209.06794) is
  a web-scale multilingual image-text dataset built from the public web. A
  wide range of WebLI splits are used to acquire versatile model capabilities,
  such as visual semantic understanding, object localization,
  visually-situated text understanding, and multilinguality.
* **CC3M-35L:** Curated English image-alt_text pairs from webpages
  ([Sharma et al., 2018](https://aclanthology.org/P18-1238/)). We used the
  [Google Cloud Translation API](https://cloud.google.com/translate) to
  translate into 34 additional languages.
* **VQ²A-CC3M-35L/VQG-CC3M-35L:** A subset of VQ2A-CC3M
  ([Changpinyo et al., 2022a](https://aclanthology.org/2022.naacl-main.142/)),
  translated into the same additional 34 languages as CC3M-35L, using the
  [Google Cloud Translation API](https://cloud.google.com/translate).
* **OpenImages:** Detection and object-aware questions and answers
  ([Piergiovanni et al., 2022](https://arxiv.org/abs/2209.04372)) generated by
  handcrafted rules on the [OpenImages dataset].
* **WIT:** Images and texts collected from Wikipedia
  ([Srinivasan et al., 2021](https://arxiv.org/abs/2103.01913)).

[OpenImages dataset]: https://storage.googleapis.com/openimages/web/factsfigures_v7.html

PaliGemma 2 is based on Gemma 2, and you can find information on the
pre-training datasets for Gemma 2 in the
[Gemma 2 model card](https://ai.google.dev/gemma/docs/model_card_2).

#### Data responsibility filtering

The following filters are applied to WebLI, with the goal of training PaliGemma
2 on safe and responsible data:

* **Pornographic image filtering:** This filter removes images deemed to be of
  a pornographic nature.
* **Text safety filtering:** We identify and filter out images that are paired
  with unsafe text. Unsafe text is any text deemed to contain or be about
  child sexual abuse imagery (CSAI), pornography, vulgarities, or to be otherwise
  offensive.
* **Text toxicity filtering:** We further use the [Perspective
  API](https://perspectiveapi.com/) to identify and filter out images that are
  paired with text deemed insulting, obscene, hateful, or otherwise toxic.
* **Text personal information filtering:** We filtered certain personal
  information and other sensitive data using the [Cloud Data Loss Prevention
  (DLP) API](https://cloud.google.com/security/products/dlp) to protect the
  privacy of individuals. Identifiers such as social security numbers and
  [other sensitive information types] were removed.
* **Additional methods:** Filtering based on content quality and safety in
  line with our policies and practices.

[other sensitive information types]: https://cloud.google.com/sensitive-data-protection/docs/high-sensitivity-infotypes-reference?_gl=1*jg604m*_ga*ODk5MzA3ODQyLjE3MTAzMzQ3NTk.*_ga_WH2QY8WWF5*MTcxMDUxNTkxMS4yLjEuMTcxMDUxNjA2NC4wLjAuMA..&_ga=2.172110058.-899307842.1710334759

## Use in Transformers

You can use the following prompt templates to perform different tasks:
- `"cap {lang}"`: Raw short caption (from WebLI-alt)
- `"caption {lang}"`: Nice, COCO-like short captions
- `"describe {lang}"`: Longer, more descriptive captions
- `"ocr"`: Optical character recognition
- `"answer {lang} {question}"`: Question answering about the image contents
- `"question {lang} {answer}"`: Question generation for a given answer
- `"detect {object} ; {object}"`: Locate listed objects in an image and return the bounding boxes for those objects
- `"segment {object}"`: Locate the area occupied by the object in an image to create an image segmentation for that object


```python
from transformers import (
    PaliGemmaProcessor,
    PaliGemmaForConditionalGeneration,
)
from transformers.image_utils import load_image
import torch

model_id = "google/paligemma2-3b-mix-448"

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = load_image(url)

model = PaliGemmaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto").eval()
processor = PaliGemmaProcessor.from_pretrained(model_id)

prompt = "describe en"
model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(torch.bfloat16).to(model.device)
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)
```
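
For the `detect` and `segment` prompts, the generated text is not free-form prose but a sequence of location tokens (and, for segmentation, codeword tokens). The snippet below is a minimal sketch of turning a decoded detection answer into pixel coordinates; it assumes the PaliGemma convention of four `<locYYYY>` tokens per box (y_min, x_min, y_max, x_max on a 0-1023 grid), and the `parse_detection` helper is illustrative rather than part of the library.

```python
import re

def parse_detection(decoded: str, image_width: int, image_height: int):
    """Turn '<loc....><loc....><loc....><loc....> label ; ...' into pixel boxes."""
    results = []
    for chunk in decoded.split(";"):
        # Four <locYYYY> tokens encode y_min, x_min, y_max, x_max on a 0-1023 grid.
        locs = [int(v) for v in re.findall(r"<loc(\d{4})>", chunk)]
        label = re.sub(r"<loc\d{4}>", "", chunk).strip()
        if len(locs) != 4:
            continue
        y_min, x_min, y_max, x_max = locs
        results.append({
            "label": label,
            "box_xyxy": (
                x_min / 1024 * image_width,
                y_min / 1024 * image_height,
                x_max / 1024 * image_width,
                y_max / 1024 * image_height,
            ),
        })
    return results

# For example, after generating with prompt = "detect car" as above:
# print(parse_detection(decoded, image.width, image.height))
```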

Here is a [notebook](https://github.com/merveenoyan/smol-vision/blob/main/Fine_tune_PaliGemma.ipynb)
that showcases fine-tuning PaliGemma 2.
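
For orientation, here is a minimal training-loop sketch in the same spirit; it is an illustration rather than a recipe. It reuses `model` and `processor` from the snippet above, assumes the processor's `suffix` argument produces the label tensors for the loss, and uses a hypothetical `train_pairs` list; batching, learning-rate scheduling, and checkpointing are omitted.

```python
import torch

# Hypothetical toy dataset: (PIL image, prompt prefix, target text) triples.
train_pairs = [
    (image, "caption en", "A photograph of a parked car."),
]

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.train()

for epoch in range(1):
    for img, prefix, target in train_pairs:
        batch = processor(
            text=prefix,
            images=img,
            suffix=target,        # assumption: the suffix becomes the labels
            return_tensors="pt",
        ).to(torch.bfloat16).to(model.device)
        loss = model(**batch).loss   # loss is computed against the labels
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
```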

## Implementation information

### Hardware

PaliGemma 2 was trained using the latest generation of Tensor Processing Unit
(TPU) hardware (TPUv5e).

### Software

Training was completed using [JAX](https://github.com/google/jax),
[Flax](https://github.com/google/flax),
[TFDS](https://github.com/tensorflow/datasets) and
[`big_vision`](https://github.com/google-research/big_vision).

JAX allows researchers to take advantage of the latest generation of hardware,
including TPUs, for faster and more efficient training of large models.

TFDS is used to access datasets and Flax is used for model architecture. The
PaliGemma 2 fine-tune code and inference code are released in the `big_vision`
GitHub repository.

## Evaluation information

### Benchmark results

In order to verify the transferability of PaliGemma 2 to a wide variety of
academic tasks, we fine-tune the pretrained models on each task. We report results at
different resolutions to provide an impression of which tasks benefit from
increased resolution and which tasks benefit from larger models. Importantly, none of these tasks or datasets are part of
the pretraining data mixture, and their images are explicitly removed from the
web-scale pre-training data.

#### PaliGemma 2 results by model resolution and size

| Benchmark | 224-3B | 224-10B | 224-28B | 448-3B | 448-10B | 448-28B |
|-------------------------------|:------:|:-------:|:-------:|:------:|:-------:|:-------:|
| [AI2D][ai2d] | 74.7 | 83.1 | 83.2 | 76.0 | 84.4 | 84.6 |
| [AOKVQA-DA][aokvqa-da] (val) | 64.2 | 68.9 | 70.2 | 67.9 | 70.8 | 71.2 |
| [AOKVQA-MC][aokvqa-mc] (val) | 79.7 | 83.7 | 84.7 | 82.5 | 85.9 | 87.0 |
| [ActivityNet-CAP][anet-cap] | 34.2 | 35.9 | - | - | - | - |
| [ActivityNet-QA][anet-qa] | 51.3 | 53.2 | - | - | - | - |
| [COCO-35L][coco-35l] (avg34) | 113.9 | 115.8 | 116.5 | 115.8 | 117.2 | 117.2 |
| [COCO-35L][coco-35l] (en) | 138.4 | 140.8 | 142.4 | 140.4 | 142.4 | 142.3 |
| [COCOcap][coco-cap] | 141.3 | 143.7 | 144.0 | 143.4 | 145.0 | 145.2 |
| [ChartQA][chartqa] (aug) | 74.4 | 74.2 | 68.9 | 89.2 | 90.1 | 85.1 |
| [ChartQA][chartqa] (human) | 42.0 | 48.4 | 46.8 | 54.0 | 66.4 | 61.3 |
| [CountBenchQA][countbenchqa] | 81.0 | 84.0 | 86.4 | 82.0 | 85.3 | 87.4 |
| [DocVQA][docvqa] (val) | 39.9 | 43.9 | 44.9 | 73.6 | 76.6 | 76.1 |
| [GQA][gqa] | 66.2 | 67.2 | 67.3 | 68.1 | 68.3 | 68.3 |
| [InfoVQA][info-vqa] (val) | 25.2 | 33.6 | 36.4 | 37.5 | 47.8 | 46.7 |
| [MARVL][marvl] (avg5) | 83.5 | 89.5 | 90.6 | 82.7 | 89.1 | 89.7 |
| [MSRVTT-CAP][msrvtt] | 68.5 | 72.1 | - | - | - | - |
| [MSRVTT-QA][msrvtt] | 50.5 | 51.9 | - | - | - | - |
| [MSVD-QA][msvd-qa] | 61.1 | 62.5 | - | - | - | - |
| [NLVR2][nlvr2] | 91.4 | 93.9 | 94.2 | 91.6 | 93.7 | 94.1 |
| [NoCaps][nocaps] | 123.1 | 126.3 | 127.1 | 123.5 | 126.9 | 127.0 |
| [OCR-VQA][ocr-vqa] | 73.4 | 74.7 | 75.3 | 75.7 | 76.3 | 76.6 |
| [OKVQA][okvqa] | 64.2 | 68.0 | 71.2 | 64.1 | 68.6 | 70.6 |
| [RSVQA-hr][rsvqa-hr] (test) | 92.7 | 92.6 | 92.7 | 92.8 | 92.8 | 92.8 |
| [RSVQA-hr][rsvqa-hr] (test2) | 90.9 | 90.8 | 90.9 | 90.7 | 90.7 | 90.8 |
| [RSVQA-lr][rsvqa-lr] | 93.0 | 92.8 | 93.5 | 92.7 | 93.1 | 93.7 |
| [RefCOCO][refcoco] (testA) | 75.7 | 77.2 | 76.8 | 78.6 | 79.7 | 79.3 |
| [RefCOCO][refcoco] (testB) | 71.0 | 74.2 | 73.9 | 73.5 | 76.2 | 74.8 |
| [RefCOCO][refcoco] (val) | 73.4 | 75.9 | 75.0 | 76.3 | 78.2 | 77.3 |
| [RefCOCO+][refcoco+] (testA) | 72.7 | 74.7 | 73.6 | 76.1 | 77.7 | 76.6 |
| [RefCOCO+][refcoco+] (testB) | 64.2 | 68.4 | 67.1 | 67.0 | 71.1 | 68.6 |
| [RefCOCO+][refcoco+] (val) | 68.6 | 72.0 | 70.3 | 72.1 | 74.4 | 72.8 |
| [RefCOCOg][refcocog] (test) | 69.0 | 71.9 | 70.7 | 72.7 | 74.8 | 73.7 |
| [RefCOCOg][refcocog] (val) | 68.3 | 71.4 | 70.5 | 72.3 | 74.4 | 73.0 |
| [ST-VQA][st-vqa] (val) | 61.9 | 64.3 | 65.1 | 80.5 | 82.0 | 81.8 |
| [SciCap][scicap] | 165.1 | 159.5 | 156.9 | 183.3 | 177.2 | 172.7 |
| [ScienceQA][scienceqa] | 96.1 | 98.2 | 98.2 | 96.2 | 98.5 | 98.6 |
| [Screen2Words][screen2words] | 113.3 | 117.8 | 122.8 | 114.0 | 119.1 | 123.4 |
| [TallyQA][tallyqa] (complex) | 70.3 | 73.4 | 74.2 | 73.6 | 76.7 | 76.8 |
| [TallyQA][tallyqa] (simple) | 81.8 | 83.2 | 83.4 | 85.3 | 86.2 | 85.7 |
| [TextCaps][textcaps] | 127.5 | 137.9 | 139.9 | 152.1 | 157.7 | 153.6 |
| [TextVQA][textvqa] (val) | 59.6 | 64.0 | 64.7 | 75.2 | 76.6 | 76.2 |
| [VATEX][vatex] | 80.8 | 82.7 | - | - | - | - |
| [VQAv2][vqav2] (minival) | 83.0 | 84.3 | 84.5 | 84.8 | 85.8 | 85.8 |
| [VizWizVQA][vizwiz-vqa] (val) | 76.4 | 78.1 | 78.7 | 77.5 | 78.6 | 78.9 |
| [WidgetCap][widgetcap] | 138.1 | 139.8 | 138.8 | 151.4 | 151.9 | 148.9 |
| [XM3600][xm3600] (avg35) | 42.8 | 44.5 | 45.2 | 43.2 | 44.6 | 45.2 |
| [XM3600][xm3600] (en) | 79.8 | 80.7 | 81.0 | 80.3 | 81.5 | 81.0 |
| [xGQA][xgqa] (avg7) | 58.6 | 61.4 | 61.1 | 60.4 | 62.6 | 62.1 |


#### Additional Benchmarks

**[ICDAR 2015 Incidental][icdar2015-inc]**

| Model | Precision | Recall | F1 |
|-----------------|-----------|:------:|:-----:|
| PaliGemma 2 3B | 81.9 | 70.7 | 75.9 |

**[Total-Text][total-text]**

| Model | Precision | Recall | F1 |
|-----------------|-----------|:------:|:-----:|
| PaliGemma 2 3B | 73.8 | 74.5 | 74.2 |

**[FinTabNet][fintabnet]**

| Model | S-TEDS | TEDS | GriTS-Top | GriTS-Con |
|-----------------|--------|-------|-----------|-----------|
| PaliGemma 2 3B | 99.2 | 98.9 | 99.4 | 99.2 |

**[PubTabNet][pubtabnet]**

| Model | S-TEDS | TEDS | GriTS-Top | GriTS-Con |
|-----------------|--------|-------|-----------|-----------|
| PaliGemma 2 3B | 97.6 | 97.3 | 97.9 | 97.8 |

**[GrandStaff][grandstaff]**

| Model | CER | LER | SER |
|-----------------|-----|-----|-----|
| PaliGemma 2 3B | 1.6 | 6.7 | 2.3 |

**[PubChem][pubchem]**

* PaliGemma 2 3B, Full Match: 94.8

**[DOCCI][docci]**

| Model | avg#char | avg#sent | NES % |
|-----------------|----------|----------|---------|
| PaliGemma 2 3B | 529 | 7.7 | 28.4 |
| PaliGemma 2 10B | 521 | 7.5 | 20.3 |

- *avg#char*: Average number of characters
- *avg#sent*: Average number of sentences
- *NES*: Non-entailment sentences

**[MIMIC-CXR][mimic-cxr]**

| Model | CIDEr | BLEU4 | Rouge-L | RadGraph F1 |
|-----------------|-------|-------|---------|-------------|
| PaliGemma 2 3B | 19.9 | 14.6 | 31.9 | 28.8 |
| PaliGemma 2 10B | 17.4 | 15.0 | 32.4 | 29.5 |

**[Visual Spatial Reasoning][vsr]**

| Model | VSR zeroshot split (test) | VSR random split (test) |
|-----------------|---------------------------|--------------------------|
| PaliGemma 2 3B | 74.8 | 81.6 |
| PaliGemma 2 10B | 79.8 | 86.8 |

## Ethics and safety

### Evaluation approach

Our evaluation methods include structured ethics and safety evaluations across
relevant content policies, including:

* Human evaluation on prompts covering child safety, content safety and
  representational harms. See the [Gemma model
  card](https://ai.google.dev/gemma/docs/model_card#evaluation_approach) for
  more details on evaluation approach, but with image captioning and visual
  question answering setups.
* Image-to-Text benchmark evaluation: Benchmark against relevant academic
  datasets such as FairFace Dataset ([Karkkainen et al.,
  2021](https://arxiv.org/abs/1908.04913)).

### Evaluation results

* The human evaluation results of ethics and safety evaluations are within
  acceptable thresholds for meeting [internal
  policies](https://storage.googleapis.com/gweb-uniblog-publish-prod/documents/2023_Google_AI_Principles_Progress_Update.pdf#page=11)
  for categories such as child safety, content safety and representational
  harms.
* On top of robust internal evaluations, we also use the Perspective API
  (threshold of 0.8) to measure toxicity, profanity, and other potential
  issues in the generated captions for images sourced from the FairFace
  dataset. We report the maximum and median values observed across subgroups
  for each of the perceived gender, ethnicity, and age attributes.

<table>
  <tr>
    <col>
    <colgroup span="3"></colgroup>
    <colgroup span="3"></colgroup>
    <colgroup span="3"></colgroup>
    <th>Metric</th>
    <th colspan="3" scope="colgroup">Perceived gender</th>
    <th colspan="3" scope="colgroup">Ethnicity</th>
    <th colspan="3" scope="colgroup">Age group</th>
  </tr>
  <tr>
    <th>Model size</th>
    <th scope="col">3B</th>
    <th scope="col">10B</th>
    <th scope="col">28B</th>
    <th scope="col">3B</th>
    <th scope="col">10B</th>
    <th scope="col">28B</th>
    <th scope="col">3B</th>
    <th scope="col">10B</th>
    <th scope="col">28B</th>
  </tr>
  <tr>
    <th></th>
    <th colspan="9" scope="colgroup">Maximum</th>
  </tr>
  <tr>
    <td>Toxicity</td>
    <td>0.14%</td>
    <td>0.15%</td>
    <td>0.19%</td>
    <td>0.29%</td>
    <td>0.39%</td>
    <td>0.39%</td>
    <td>0.26%</td>
    <td>0.18%</td>
    <td>0.32%</td>
  </tr>
  <tr>
    <td>Identity Attack</td>
    <td>0.04%</td>
    <td>0.02%</td>
    <td>0.02%</td>
    <td>0.13%</td>
    <td>0.06%</td>
    <td>0.06%</td>
    <td>0.06%</td>
    <td>0.03%</td>
    <td>0.06%</td>
  </tr>
  <tr>
    <td>Insult</td>
    <td>0.17%</td>
    <td>0.25%</td>
    <td>0.17%</td>
    <td>0.37%</td>
    <td>0.52%</td>
    <td>0.52%</td>
    <td>0.27%</td>
    <td>0.39%</td>
    <td>0.24%</td>
  </tr>
  <tr>
    <td>Threat</td>
    <td>0.55%</td>
    <td>0.43%</td>
    <td>0.57%</td>
    <td>0.83%</td>
    <td>0.48%</td>
    <td>0.48%</td>
    <td>0.64%</td>
    <td>0.43%</td>
    <td>0.64%</td>
  </tr>
  <tr>
    <td>Profanity</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
  </tr>
  <tr>
    <th></th>
    <th colspan="9" scope="colgroup">Median</th>
  </tr>
  <tr>
    <td>Toxicity</td>
    <td>0.13%</td>
    <td>0.10%</td>
    <td>0.18%</td>
    <td>0.07%</td>
    <td>0.07%</td>
    <td>0.14%</td>
    <td>0.12%</td>
    <td>0.08%</td>
    <td>0.12%</td>
  </tr>
  <tr>
    <td>Identity Attack</td>
    <td>0.02%</td>
    <td>0.01%</td>
    <td>0.02%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
  </tr>
  <tr>
    <td>Insult</td>
    <td>0.15%</td>
    <td>0.23%</td>
    <td>0.14%</td>
    <td>0.14%</td>
    <td>0.17%</td>
    <td>0.13%</td>
    <td>0.09%</td>
    <td>0.18%</td>
    <td>0.16%</td>
  </tr>
  <tr>
    <td>Threat</td>
    <td>0.35%</td>
    <td>0.27%</td>
    <td>0.41%</td>
    <td>0.28%</td>
    <td>0.19%</td>
    <td>0.42%</td>
    <td>0.27%</td>
    <td>0.31%</td>
    <td>0.40%</td>
  </tr>
  <tr>
    <td>Profanity</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
    <td>0.00%</td>
  </tr>
</table>

## Usage and limitations

### Intended usage

Open Vision Language Models (VLMs) have a wide range of applications across
various industries and domains. The following list of potential uses is not
comprehensive. The purpose of this list is to provide contextual information
about the possible use-cases that the model creators considered as part of model
training and development. Prohibited uses of Gemma models are outlined in the
[Gemma Prohibited Use Policy](https://ai.google.dev/gemma/prohibited_use_policy).

Fine-tuning on a specific vision-language task:

* The pre-trained models can be fine-tuned on a wide range of vision-language
  tasks such as: image captioning, short video captioning, visual question
  answering, text reading, object detection and object segmentation.
* The pre-trained models can be fine-tuned for specific domains such as remote
  sensing question answering, visual questions from people who are blind,
  science question answering, and describing UI element functionalities.
* The pre-trained models can be fine-tuned for tasks with non-textual outputs
  such as bounding boxes or segmentation masks.

Vision-language research:

* The pre-trained models and fine-tuned models can serve as a foundation for
  researchers to experiment with VLM techniques, develop algorithms, and
  contribute to the advancement of the field.

### Ethical considerations and risks

The development of vision-language models (VLMs) raises several ethical
concerns. In creating an open model, we have carefully considered the following:

* Bias and Fairness
  * VLMs trained on large-scale, real-world image-text data can reflect
    socio-cultural biases embedded in the training material. These models
    underwent careful scrutiny; the input data pre-processing is described and
    posterior evaluations are reported in this card.
* Misinformation and Misuse
  * VLMs can be misused to generate text that is false, misleading, or
    harmful.
  * Guidelines are provided for responsible use with the model; see the
    [Responsible Generative AI Toolkit](https://ai.google.dev/responsible).
* Transparency and Accountability
  * This model card summarizes details on the models' architecture,
    capabilities, limitations, and evaluation processes.
  * A responsibly developed open model offers the opportunity to share
    innovation by making VLM technology accessible to developers and
    researchers across the AI ecosystem.

Risks identified and mitigations:

* **Perpetuation of biases:** It's encouraged to perform continuous monitoring
  (using evaluation metrics, human review) and the exploration of de-biasing
  techniques during model training, fine-tuning, and other use cases.
* **Generation of harmful content:** Mechanisms and guidelines for content
  safety are essential. Developers are encouraged to exercise caution and
  implement appropriate content safety safeguards based on their specific
  product policies and application use cases.
* **Misuse for malicious purposes:** Technical limitations and developer and
  end-user education can help mitigate malicious applications of LLMs.
  Educational resources and reporting mechanisms for users to flag misuse are
  provided: see the [Responsible Generative AI Toolkit](https://ai.google.dev/responsible).
  Prohibited uses of Gemma models are outlined in the
  [Gemma Prohibited Use Policy](https://ai.google.dev/gemma/prohibited_use_policy).
* **Privacy violations:** Models were trained on data filtered to remove
  certain personal information and sensitive data. Developers are encouraged
  to adhere to privacy regulations with privacy-preserving techniques.

### Limitations

* Most limitations inherited from the underlying Gemma 2 models still apply:
  * VLMs are better at tasks that can be framed with clear prompts and
    instructions. Open-ended or highly complex tasks might be challenging.
  * Natural language is inherently complex. VLMs might struggle to grasp
    subtle nuances, sarcasm, or figurative language.
  * VLMs generate responses based on information they learned from their
    training datasets, but they are not knowledge bases. They may generate
    incorrect or outdated factual statements.
  * VLMs rely on statistical patterns in language and images. They might
    lack the ability to apply common sense reasoning in certain situations.
* PaliGemma 2 was designed first and foremost to serve as a general
  pre-trained model for fine-tuning to specialized tasks. Hence, its "out of
  the box" or "zero-shot" performance might lag behind models designed
  specifically for general-purpose use.
* PaliGemma 2 is not a multi-turn chatbot. It is designed for a single round
  of image and text input.


[ai2d]: https://allenai.org/data/diagrams
[aokvqa-da]: https://allenai.org/project/a-okvqa/home
[aokvqa-mc]: https://allenai.org/project/a-okvqa/home
[anet-cap]: https://paperswithcode.com/dataset/activitynet-captions
[anet-qa]: https://arxiv.org/abs/1906.02467
[chartqa]: https://arxiv.org/abs/2203.10244
[coco-35l]: https://arxiv.org/pdf/2205.12522
[coco-cap]: https://cocodataset.org/#home
[countbenchqa]: https://github.com/google-research/big_vision/blob/main/big_vision/datasets/countbenchqa/
[docvqa]: https://www.docvqa.org/
[gqa]: https://cs.stanford.edu/people/dorarad/gqa/about.html
[info-vqa]: https://arxiv.org/abs/2104.12756
[marvl]: https://marvl-challenge.github.io/
[msrvtt]: https://paperswithcode.com/dataset/msr-vtt
[msvd-qa]: https://paperswithcode.com/dataset/msvd-qa
[nlvr2]: https://lil.nlp.cornell.edu/nlvr/
[nocaps]: https://nocaps.org/
[ocr-vqa]: https://ocr-vqa.github.io/
[okvqa]: https://okvqa.allenai.org/
[refcoco]: https://arxiv.org/abs/1608.00272
[refcoco+]: https://aclanthology.org/D14-1086
[refcocog]: https://arxiv.org/abs/1511.02283
[rsvqa-hr]: https://zenodo.org/records/6344367
[rsvqa-lr]: https://zenodo.org/records/6344334
[st-vqa]: https://arxiv.org/abs/1905.13648
[scicap]: https://arxiv.org/abs/2110.11624
[scienceqa]: https://scienceqa.github.io/
[screen2words]: https://arxiv.org/abs/2108.03353
[tallyqa]: https://arxiv.org/abs/1810.12440
[textcaps]: https://textvqa.org/textcaps/
[textvqa]: https://textvqa.org/
[vatex]: https://arxiv.org/abs/1904.03493
[vizwiz-vqa]: https://vizwiz.org/tasks-and-datasets/vqa/
[widgetcap]: https://arxiv.org/abs/2010.04295
[vqav2]: https://visualqa.org/index.html
[xgqa]: https://aclanthology.org/2022.findings-acl.196/
[xm3600]: https://arxiv.org/pdf/2205.12522

[icdar2015-inc]: https://arxiv.org/abs/1511.09207
[total-text]: https://paperswithcode.com/paper/total-text-a-comprehensive-dataset-for-scene
[fintabnet]: https://developer.ibm.com/data/fintabnet/
[pubtabnet]: https://paperswithcode.com/dataset/pubtabnet
[grandstaff]: https://link.springer.com/article/10.1007/s10032-023-00432-z
[pubchem]: https://pmc.ncbi.nlm.nih.gov/articles/PMC7352161/
[docci]: https://research.google/pubs/docci-descriptions-of-connected-and-contrasting-images/
[mimic-cxr]: https://paperswithcode.com/dataset/mimic-cxr
[vsr]: https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00566/116470/Visual-Spatial-Reasoning


## Citation

```bibtex
@article{steiner2024paligemma2,
  title={PaliGemma 2: A Family of Versatile VLMs for Transfer},
  author={Andreas Steiner and André Susano Pinto and Michael Tschannen and Daniel Keysers and Xiao Wang and Yonatan Bitton and Alexey Gritsenko and Matthias Minderer and Anthony Sherbondy and Shangbang Long and Siyang Qin and Reeve Ingle and Emanuele Bugliarello and Sahar Kazemzadeh and Thomas Mesnard and Ibrahim Alabdulmohsin and Lucas Beyer and Xiaohua Zhai},
  year={2024},
  journal={arXiv preprint arXiv:2412.03555}
}
```

Find the paper [here](https://arxiv.org/abs/2412.03555).
config.json
ADDED
@@ -0,0 +1,10 @@
{
  "architectures": [
    "PaliGemmaForConditionalGeneration"
  ],
  "model_type": "paligemma",
  "hidden_size": 4096,
  "image_size": 448,
  "use_flash_attention": false,
  "vocab_size": 257152
}
generation_config.json
ADDED
@@ -0,0 +1,8 @@
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.47.0.dev0"
}
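
These two JSON files are what `from_pretrained` consumes: `config.json` defines the architecture and `generation_config.json` supplies the default decoding settings (here the BOS/EOS/pad token ids and the hybrid cache). Below is a minimal sketch of inspecting them through the Transformers API; the values in the comments are simply the fields shown in the diffs above:

```python
from transformers import AutoConfig, GenerationConfig

model_id = "google/paligemma2-3b-mix-448"

config = AutoConfig.from_pretrained(model_id)              # parses config.json
gen_config = GenerationConfig.from_pretrained(model_id)    # parses generation_config.json

print(config.model_type)        # "paligemma"
print(gen_config.bos_token_id)  # 2
print(gen_config.eos_token_id)  # 1
print(gen_config.pad_token_id)  # 0
```
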
model-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7fd069b5ae0121f20d52e2077ef7c8dd13852ea1e03c6800ef968eb91ef0a434
size 4995089032
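
The two `.safetensors` entries above and below are Git LFS pointer files: the repository itself stores only the SHA-256 digest (`oid`) and byte `size`, while the actual weight shards live in LFS storage. A minimal sketch for checking a downloaded shard against its pointer; the local path is a placeholder:

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream the file so a multi-GB shard does not need to fit in memory."""
    digest = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

shard = Path("model-00001-of-00002.safetensors")  # placeholder local path
expected_oid = "7fd069b5ae0121f20d52e2077ef7c8dd13852ea1e03c6800ef968eb91ef0a434"

assert shard.stat().st_size == 4995089032, "size does not match the pointer"
assert sha256_of(shard) == expected_oid, "sha256 does not match the pointer"
```
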
model-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:56117fdef68235a056bf20626be069e605d82d1e147051c3ae78f49e512351d2
size 1071263816
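
The `model.safetensors.index.json` diff that follows maps every tensor name to the shard that stores it (note, for instance, how layer 19 is split across both shards). A minimal sketch of reading that map with the standard library, using only the `metadata` and `weight_map` keys visible in the diff:

```python
import json
from collections import Counter

with open("model.safetensors.index.json") as f:  # placeholder local path
    index = json.load(f)

print(index["metadata"]["total_size"])  # 6066254304 bytes across both shards

# Count how many tensors each shard file holds.
for shard, n_tensors in sorted(Counter(index["weight_map"].values()).items()):
    print(f"{shard}: {n_tensors} tensors")

# Look up a single tensor, e.g. the token embedding table.
print(index["weight_map"]["language_model.model.embed_tokens.weight"])
```
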
model.safetensors.index.json
ADDED
@@ -0,0 +1,734 @@
| 1 |
+
{
|
| 2 |
+
"metadata": {
|
| 3 |
+
"total_size": 6066254304
|
| 4 |
+
},
|
| 5 |
+
"weight_map": {
|
| 6 |
+
"language_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
|
| 7 |
+
"language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 8 |
+
"language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 9 |
+
"language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 10 |
+
"language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 11 |
+
"language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 12 |
+
"language_model.model.layers.0.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 13 |
+
"language_model.model.layers.0.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 14 |
+
"language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 15 |
+
"language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 16 |
+
"language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 17 |
+
"language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 18 |
+
"language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 19 |
+
"language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 20 |
+
"language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 21 |
+
"language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 22 |
+
"language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 23 |
+
"language_model.model.layers.1.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 24 |
+
"language_model.model.layers.1.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 25 |
+
"language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 26 |
+
"language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 27 |
+
"language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 28 |
+
"language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 29 |
+
"language_model.model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 30 |
+
"language_model.model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 31 |
+
"language_model.model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 32 |
+
"language_model.model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 33 |
+
"language_model.model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 34 |
+
"language_model.model.layers.10.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 35 |
+
"language_model.model.layers.10.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 36 |
+
"language_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 37 |
+
"language_model.model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 38 |
+
"language_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 39 |
+
"language_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 40 |
+
"language_model.model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 41 |
+
"language_model.model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 42 |
+
"language_model.model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 43 |
+
"language_model.model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 44 |
+
"language_model.model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 45 |
+
"language_model.model.layers.11.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 46 |
+
"language_model.model.layers.11.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 47 |
+
"language_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 48 |
+
"language_model.model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 49 |
+
"language_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 50 |
+
"language_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 51 |
+
"language_model.model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 52 |
+
"language_model.model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 53 |
+
"language_model.model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 54 |
+
"language_model.model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 55 |
+
"language_model.model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 56 |
+
"language_model.model.layers.12.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 57 |
+
"language_model.model.layers.12.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 58 |
+
"language_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 59 |
+
"language_model.model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 60 |
+
"language_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 61 |
+
"language_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 62 |
+
"language_model.model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 63 |
+
"language_model.model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 64 |
+
"language_model.model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 65 |
+
"language_model.model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 66 |
+
"language_model.model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 67 |
+
"language_model.model.layers.13.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 68 |
+
"language_model.model.layers.13.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 69 |
+
"language_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 70 |
+
"language_model.model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 71 |
+
"language_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 72 |
+
"language_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 73 |
+
"language_model.model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 74 |
+
"language_model.model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 75 |
+
"language_model.model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 76 |
+
"language_model.model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 77 |
+
"language_model.model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 78 |
+
"language_model.model.layers.14.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 79 |
+
"language_model.model.layers.14.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 80 |
+
"language_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 81 |
+
"language_model.model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 82 |
+
"language_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 83 |
+
"language_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 84 |
+
"language_model.model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 85 |
+
"language_model.model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 86 |
+
"language_model.model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 87 |
+
"language_model.model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
|
| 88 |
+
"language_model.model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 89 |
+
"language_model.model.layers.15.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 90 |
+
"language_model.model.layers.15.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 91 |
+
"language_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 92 |
+
"language_model.model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
|
| 93 |
+
"language_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 94 |
+
"language_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 95 |
+
"language_model.model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
|
| 96 |
+
"language_model.model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
|
| 97 |
+
"language_model.model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
|
| 98 |
+
"language_model.model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.20.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.post_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.pre_feedforward_layernorm.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
"language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.post_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.pre_feedforward_layernorm.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"language_model.model.norm.weight": "model-00002-of-00002.safetensors",
"multi_modal_projector.linear.bias": "model-00001-of-00002.safetensors",
"multi_modal_projector.linear.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
"vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 673 |
+
"vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
|
| 674 |
+
"vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 675 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 676 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 677 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 678 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 679 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 680 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 681 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 682 |
+
"vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 683 |
+
"vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
|
| 684 |
+
"vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
|
| 685 |
+
"vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
|
| 686 |
+
"vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
|
| 687 |
+
"vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
|
| 688 |
+
"vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 689 |
+
"vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
|
| 690 |
+
"vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 691 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 692 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 693 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 694 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 695 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 696 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 697 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 698 |
+
"vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 699 |
+
"vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
|
| 700 |
+
"vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
|
| 701 |
+
"vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
|
| 702 |
+
"vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
|
| 703 |
+
"vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
|
| 704 |
+
"vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 705 |
+
"vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
|
| 706 |
+
"vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 707 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 708 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 709 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 710 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 711 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 712 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 713 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 714 |
+
"vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 715 |
+
"vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
|
| 716 |
+
"vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
|
| 717 |
+
"vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
|
| 718 |
+
"vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
|
| 719 |
+
"vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
|
| 720 |
+
"vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
|
| 721 |
+
"vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
|
| 722 |
+
"vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
|
| 723 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
|
| 724 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
|
| 725 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
|
| 726 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
|
| 727 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
|
| 728 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
|
| 729 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
|
| 730 |
+
"vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
|
| 731 |
+
"vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
|
| 732 |
+
"vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors"
|
| 733 |
+
}
|
| 734 |
+
}
|
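The `weight_map` above pairs every parameter name with the safetensors shard that stores it, which is how loaders open only the shard they need. A minimal sketch of that lookup, assuming a local clone of this repo at a placeholder path and the `safetensors` package:

```python
import json
from safetensors import safe_open

# Placeholder: path to a local clone of this repository.
repo_dir = "./checkpoint"

# The index maps each parameter name to the shard file holding it.
with open(f"{repo_dir}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "vision_tower.vision_model.post_layernorm.weight"
shard = index["weight_map"][name]  # e.g. "model-00001-of-00002.safetensors"

# Open just that shard and read the single tensor lazily.
with safe_open(f"{repo_dir}/{shard}", framework="pt") as f:
    tensor = f.get_tensor(name)

print(name, tuple(tensor.shape))
```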
preprocessor_config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "SiglipImageProcessor",
+  "image_seq_length": 1024,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "processor_class": "PaliGemmaProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 448,
+    "width": 448
+  }
+}
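In effect, this config resizes images to 448×448 with bicubic resampling (`resample: 3`), rescales by 1/255, and normalizes with mean and std of 0.5 per channel, the usual SigLIP convention. A hedged sketch of what that looks like through `transformers` (the local path and image file are placeholders):

```python
from PIL import Image
from transformers import SiglipImageProcessor

# Loads resize/rescale/normalize settings from this preprocessor_config.json.
processor = SiglipImageProcessor.from_pretrained("./checkpoint")  # placeholder path

image = Image.open("example.jpg").convert("RGB")  # placeholder image
inputs = processor(images=image, return_tensors="pt")

# Expect a 1x3x448x448 tensor, normalized to roughly [-1, 1] via (x/255 - 0.5) / 0.5.
print(inputs["pixel_values"].shape)
```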
special_tokens_map.json
ADDED
@@ -0,0 +1,39 @@
+{
+  "additional_special_tokens": [
+    {
+      "content": "<image>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ],
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
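These entries register `<image>` as an extra special token alongside the standard `<bos>`/`<eos>`/`<pad>`/`<unk>` markers, so the tokenizer keeps it atomic and the processor can expand it into the image positions (`image_seq_length` above). A quick check, assuming a local copy of the repo and `transformers`:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./checkpoint")  # placeholder path

# <image> should come back as a single token with its own id, not sub-word pieces.
print(tokenizer.tokenize("<image>"))
print(tokenizer.convert_tokens_to_ids("<image>"))

# The standard special tokens come straight from special_tokens_map.json.
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)
```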
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:172fab587d68c56b63eb3620057c62dfd15e503079ff7fce584692e3fd5bf4da
+size 34600820
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8986bb4f423f07f8c7f70d0dbe3526fb2316056c17bae71b1ea975e77a168fc6
+size 4264023
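Both tokenizer files are tracked through Git LFS, so what is committed here are small pointer files; the `oid sha256:` and `size` lines describe the real blob. A standard-library sketch for checking a downloaded file against its pointer (file paths are placeholders):

```python
import hashlib
import re
from pathlib import Path

def parse_lfs_pointer(pointer_path: str) -> dict:
    """Extract the sha256 oid and byte size from a Git LFS pointer file."""
    text = Path(pointer_path).read_text()
    oid = re.search(r"oid sha256:([0-9a-f]{64})", text).group(1)
    size = int(re.search(r"size (\d+)", text).group(1))
    return {"oid": oid, "size": size}

def verify_lfs_file(pointer_path: str, blob_path: str) -> bool:
    """True if the downloaded blob matches the pointer's size and sha256."""
    meta = parse_lfs_pointer(pointer_path)
    blob = Path(blob_path)
    if blob.stat().st_size != meta["size"]:
        return False
    return hashlib.sha256(blob.read_bytes()).hexdigest() == meta["oid"]

# Example usage (placeholder paths):
# verify_lfs_file("tokenizer.model.pointer", "tokenizer.model")
```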
tokenizer_config.json
ADDED
The diff for this file is too large to render. See raw diff.
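Taken together, the shards, index, preprocessor config, and tokenizer files are what `transformers` needs to load this checkpoint as a PaliGemma-style vision-language model. A hedged end-to-end sketch; the local path, image, and prompt are placeholders, and the exact model class should be confirmed against the repo's `config.json`:

```python
import torch
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

repo_dir = "./checkpoint"  # placeholder: local clone of this repository

processor = AutoProcessor.from_pretrained(repo_dir)
model = PaliGemmaForConditionalGeneration.from_pretrained(
    repo_dir, torch_dtype=torch.bfloat16, device_map="auto"
)

image = Image.open("example.jpg").convert("RGB")  # placeholder image
inputs = processor(text="describe this image", images=image, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)

print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```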