Update README.md

README.md CHANGED

@@ -63,79 +63,30 @@ The model processes images by:

Removed:
### Basic Inference

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, CLIPProcessor
from PIL import Image

clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load the multimodal model (requires custom loading due to architecture)
# See the training code for complete loading implementation

# Prepare conversation
conversation = """<|im_start|>system
You are a helpful assistant trained by Liquid AI. You can see and understand images.<|im_end|>
<image>
<|im_start|>user
What do you see in this image?<|im_end|>
<|im_start|>assistant
"""

# Process inputs
text_inputs = tokenizer(conversation, return_tensors="pt")
image = Image.open("your_image.jpg")
image_inputs = clip_processor(images=image, return_tensors="pt")

# Generate response
with torch.no_grad():
    outputs = model.generate(
        input_ids=text_inputs.input_ids,
        attention_mask=text_inputs.attention_mask,
        images=image_inputs.pixel_values,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7
    )
```

### Advanced Usage with Tools

The model supports tool calling through its chat template:

```python
# Example with tool integration
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that can analyze images and use tools."
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "
            {"type": "text", "text": "What
        ]
    }
]

tools = [
    {
        "type": "function",
        "function": {
            "name": "image_analyzer",
            "description": "Analyze image content in detail",
            "parameters": {"type": "object", "properties": {}}
        }
    }
]

)
```
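The removed snippet above is cut off in the diff: the user-message entries are truncated and the `tools` list is never actually passed anywhere (the block ends with an orphaned `)`). For reference, here is a minimal sketch of how such a tool schema is normally fed through a Hugging Face chat template via the `tools` argument of `apply_chat_template`. The repo id is taken from the updated Basic Inference example below; whether this model's chat template actually renders tool definitions is an assumption, and the message text is a placeholder rather than the original content.

```python
from transformers import AutoTokenizer

# Repo id taken from the updated Basic Inference example; placeholder messages.
tokenizer = AutoTokenizer.from_pretrained("GoofyLM/N2-Eye-v1-1.3B", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant that can analyze images and use tools."},
    {"role": "user", "content": "Analyze the attached image."},  # placeholder text
]

tools = [
    {
        "type": "function",
        "function": {
            "name": "image_analyzer",
            "description": "Analyze image content in detail",
            "parameters": {"type": "object", "properties": {}},
        },
    }
]

# Render a prompt that includes the tool definitions (assumes the chat template
# supports the `tools` argument); generation then proceeds as in Basic Inference.
prompt = tokenizer.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)
```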

### Chat Template

Added:

### Basic Inference

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("GoofyLM/N2-Eye-v1-1.3B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("GoofyLM/N2-Eye-v1-1.3B", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]

inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
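The call above uses the model's default decoding settings and a short token budget. The sampling arguments from the removed example (`do_sample=True`, `temperature=0.7`) can be passed straight to `generate` for more varied output; this is illustrative, any supported `generate` arguments work here:

```python
# Optional: sampled decoding, reusing the settings from the removed example.
outputs = model.generate(**inputs, max_new_tokens=150, do_sample=True, temperature=0.7)
# skip_special_tokens drops the chat-template markers from the printed text.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```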

### Chat Template