Update README.md
README.md CHANGED

@@ -25,6 +25,16 @@ In the second phase, it was fine-tuned with LLaVA-JP-Instruct-108K.
 resources for more information: https://github.com/tosiyuki/LLaVA-JP/tree/main

 **Comparing VLMs**
+|Model|JA-VG-VQA-500<br>(ROUGE-L)|JA-VLM-Bench-In-the-Wild<br>(ROUGE-L)|Heron-Bench(Detail)|Heron-Bench(Conv)|Heron-Bench(Complex)|Heron-Bench(Average)
+|-|-|-|-|-|-|-|
+|[Japanese Stable VLM](https://huggingface.co/stabilityai/japanese-stable-vlm)|-|40.50|25.15|51.23|37.84|38.07|
+|[EvoVLM-JP-v1-7B](https://huggingface.co/SakanaAI/EvoVLM-JP-v1-7B)|**19.70**|**51.25**|50.31|44.42|40.47|45.07|
+|[Heron BLIP Japanese StableLM Base 7B llava-620k](https://huggingface.co/turing-motors/heron-chat-blip-ja-stablelm-base-7b-v1-llava-620k)|14.51|33.26|49.09|41.51|45.72|45.44|
+|[Heron GIT Japanese StableLM Base 7B](https://huggingface.co/turing-motors/heron-chat-git-ja-stablelm-base-7b-v1)|15.18|37.82|42.77|**54.20**|43.53|46.83|
+|[llava-jp-1.3b-v1.1](https://huggingface.co/toshi456/llava-jp-1.3b-v1.1)|13.33|44.40|50.00|51.83|**48.98**|**50.39**|
+|[llava-jp-1.3b-v1.1-llava-jp-instruct-108k](https://huggingface.co/toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k)|-|17.07|**50.60**|45.31|33.24|41.52|
+
+[image: VLM comparison chart]

 ## How to use the model
 **1. Download dependencies**
@@ -34,7 +44,6 @@ git clone https://github.com/tosiyuki/LLaVA-JP.git

 **2. Inference**
 ```python
-import requests
 import torch
 import transformers
 from PIL import Image
@@ -43,12 +52,11 @@ from transformers.generation.streamers import TextStreamer
 from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
 from llava.conversation import conv_templates, SeparatorStyle
 from llava.model.llava_gpt2 import LlavaGpt2ForCausalLM
-from llava.train.arguments_dataclass import ModelArguments, DataArguments, TrainingArguments
 from llava.train.dataset import tokenizer_image_token


 if __name__ == "__main__":
-    model_path = 'toshi456/llava-jp-1.3b-v1.1'
+    model_path = 'toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k'
     device = "cuda" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.bfloat16 if device=="cuda" else torch.float32

@@ -93,7 +101,7 @@ if __name__ == "__main__":

     # create prompt
     # ユーザー: <image>\n{prompt}
-    prompt = "
+    prompt = "画像について説明してください。"
     inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
     conv.append_message(conv.roles[0], inp)
     conv.append_message(conv.roles[1], None)
@@ -115,18 +123,19 @@

     # predict
     with torch.inference_mode():
-        model.generate(
+        output_id = model.generate(
             inputs=input_ids,
             images=image_tensor,
-            do_sample=
-            temperature=0
+            do_sample=False,
+            temperature=1.0,
             top_p=1.0,
+            no_repeat_ngram_size=2,
             max_new_tokens=256,
             streamer=streamer,
             use_cache=True,
         )
-    """猫の隣にはノートパソコンがあります。"""

+    """グレーの壁に置かれた木製のテーブルの上に、茶色のタビーの猫が横たわっている。猫は右を向いており、頭は左を向き、尻尾は体の前に突き出ているように見える。テーブルは木製で、猫の後ろには黒い金属製の脚があり、テーブルの下には小さな緑の植物が置かれる。<EOD|LLM-jp>"""
 ```

 ## Training dataset
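
The hunks above elide the model- and tokenizer-loading code that sits between the imports and the prompt construction. As rough orientation only, here is a minimal sketch of what that step typically looks like for this model class; the exact arguments are assumptions and are not taken from this commit:

```python
# Sketch only (assumptions, not the commit's code). LlavaGpt2ForCausalLM is
# imported in the hunks above; from_pretrained and AutoTokenizer are standard
# transformers APIs.
model = LlavaGpt2ForCausalLM.from_pretrained(
    model_path,                # 'toshi456/llava-jp-1.3b-v1.1-llava-jp-instruct-108k'
    torch_dtype=torch_dtype,   # bfloat16 on CUDA, float32 on CPU, as set above
    low_cpu_mem_usage=True,
).to(device)
model.eval()

tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
```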
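
One effect of the change from `model.generate(` to `output_id = model.generate(` is that the generated ids are now captured instead of only being printed through `streamer`. A minimal sketch of turning them back into text, assuming the tokenizer loaded earlier in the script; whether the returned ids include the prompt tokens depends on the generate implementation, so this is illustrative only:

```python
# Illustrative sketch (assumption): decode the captured ids with the tokenizer.
# Depending on the generate implementation, the prompt tokens may need to be
# sliced off the front of output_id[0] first.
output_text = tokenizer.decode(output_id[0], skip_special_tokens=True)
print(output_text)
```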