A 20M-parameter VLM trained from scratch on ImageNet-1k for 20 epochs, using the ByT5 tokenizer. Based on a custom 22-layer transformer architecture.
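
As a quick sanity check, the minimal sketch below loads the checkpoint and prints the total parameter count; it only reuses the repo id and trust_remote_code flag that also appear in the inference example further down.

import torch
from transformers import AutoModelForCausalLM

# Load the checkpoint (custom architecture, so remote code must be trusted)
model = AutoModelForCausalLM.from_pretrained(
    "mascIT/gradientlab-exp20251025_0_vlm_20m_in1k",
    trust_remote_code=True,
)
# Count parameters; this should land in the low-20M range reported above
print(f"{sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")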

Prerequisites

!pip install gradientlab
or
!uv add gradientlab
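
The inference example below also needs torch and transformers available; if gradientlab does not already pull them in as dependencies (an assumption), they can be installed the same way:

!pip install torch transformers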

Inference

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from gradientlab.experiments.exp20251025_0_vlm_20m_in1k.torch_dataset import VLMCollate
from gradientlab.img_utils.img_loader import ImageLoader

model = AutoModelForCausalLM.from_pretrained("mascIT/gradientlab-exp20251025_0_vlm_20m_in1k", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("mascIT/gradientlab-exp20251025_0_vlm_20m_in1k")

device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

model.eval().to(device)
collate_fn = VLMCollate(tokenizer)

img_size = 64  # input resolution used by this example
img_orig = ImageLoader.to_pil("https://datasets-server.huggingface.co/cached-assets/visual-layer/imagenet-1k-vl-enriched/--/ac6afcdeb3be31c5ff6a7ff579874b3d372b7074/--/default/validation/4/image/image.jpg?Expires=1761923229&Signature=SdRtnbBLYhZT8pIDQmSlXKBLrpR7wDdOQJNH1ooo1JYHH8Ka-mQTHTTDpNArnK8aVzwKjFjfRt9V~EipzyDwaMPevzhmqol6lQ~DrkqOtJO7bxKt4g~pie~bludL3ZaHHU167~8dapztSB9h-won5ryPvdBLXwqEgX3hAmZpIJjEJGGB4QRWumT3l~lXgvm4sdECulacYLnVLD1ZuT7PL36Ew8vyK2TW03sTB0U67Jp41WvTw~MeaQ6cpWHr~zpwNGUZhIdJKQeyC-Er-aLisSfSjabO0O-pKGrpy0s1UJj~dP3zxucQKn5yNqIYBuZ30bLgLSzoE3beLDuyZuLHYg__&Key-Pair-Id=K3EI6M078Z3AC3")
img = ImageLoader.to_tensor(img_orig, img_size)

# Build model inputs from a single (image, prompt) pair, prompting with the <|im_start|> token
inputs = collate_fn([(img, "<|im_start|>")])
inputs = {k: v.to(device) for k, v in inputs.items()}

# Greedy decoding
ids = model.generate(**inputs, do_sample=False, max_length=200)
print(tokenizer.decode(ids[0]))
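
The decoded string will still contain special tokens such as <|im_start|>. If you only want the generated text, transformers' decode accepts skip_special_tokens; a minimal variation on the line above:

print(tokenizer.decode(ids[0], skip_special_tokens=True))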
Model size: 22.7M parameters, F32 (Safetensors)