---
license: apache-2.0
datasets:
- visual-layer/imagenet-1k-vl-enriched
language:
- en
---

A 20M-parameter VLM, trained from scratch on ImageNet-1k for 20 epochs. It uses the ByT5 tokenizer and is based on a custom 22-layer transformer architecture.

## Requirements

```
!pip install gradientlab
# or
!uv add gradientlab
```

## Inference

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from gradientlab.experiments.exp20251025_0_vlm_20m_in1k.torch_dataset import VLMCollate
from gradientlab.img_utils.img_loader import ImageLoader

model = AutoModelForCausalLM.from_pretrained(
    "mascIT/gradientlab-exp20251025_0_vlm_20m_in1k", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained("mascIT/gradientlab-exp20251025_0_vlm_20m_in1k")

# Pick the best available device.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model.eval().to(device)

collate_fn = VLMCollate(tokenizer)

# Download the example image and resize it to the model's input resolution.
img_size = 64
img_orig = ImageLoader.to_pil("https://datasets-server.huggingface.co/cached-assets/visual-layer/imagenet-1k-vl-enriched/--/ac6afcdeb3be31c5ff6a7ff579874b3d372b7074/--/default/validation/4/image/image.jpg?Expires=1761923229&Signature=SdRtnbBLYhZT8pIDQmSlXKBLrpR7wDdOQJNH1ooo1JYHH8Ka-mQTHTTDpNArnK8aVzwKjFjfRt9V~EipzyDwaMPevzhmqol6lQ~DrkqOtJO7bxKt4g~pie~bludL3ZaHHU167~8dapztSB9h-won5ryPvdBLXwqEgX3hAmZpIJjEJGGB4QRWumT3l~lXgvm4sdECulacYLnVLD1ZuT7PL36Ew8vyK2TW03sTB0U67Jp41WvTw~MeaQ6cpWHr~zpwNGUZhIdJKQeyC-Er-aLisSfSjabO0O-pKGrpy0s1UJj~dP3zxucQKn5yNqIYBuZ30bLgLSzoE3beLDuyZuLHYg__&Key-Pair-Id=K3EI6M078Z3AC3")
img = ImageLoader.to_tensor(img_orig, img_size)

# Build model inputs from the (image, prompt) pair and run greedy decoding.
inputs = collate_fn([(img, "<|im_start|>")])
inputs = {k: v.to(device) for k, v in inputs.items()}

ids = model.generate(**inputs, do_sample=False, max_length=200)
print(tokenizer.decode(ids[0]))
```
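The signed URL in the snippet above will expire. As an alternative, here is a minimal sketch that pulls a validation image directly from the visual-layer/imagenet-1k-vl-enriched dataset with the `datasets` library. It assumes the validation split exposes a PIL image under an `image` column and that streaming access works for this dataset; everything else reuses `model`, `tokenizer`, `collate_fn`, `device`, and `img_size` from the snippet above.

```python
# Sketch only: assumes the validation split has a PIL "image" column and
# that streaming access to the dataset is available.
from datasets import load_dataset

ds = load_dataset("visual-layer/imagenet-1k-vl-enriched", split="validation", streaming=True)
sample = next(iter(ds))

# Reuse the preprocessing and collate from the inference snippet above.
img = ImageLoader.to_tensor(sample["image"], img_size)
inputs = collate_fn([(img, "<|im_start|>")])
inputs = {k: v.to(device) for k, v in inputs.items()}

ids = model.generate(**inputs, do_sample=False, max_length=200)
print(tokenizer.decode(ids[0]))
```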