DiffusionVL — model collection (4 items)
DiffusionVL model with SigLIP vision encoder, PoolerProjector, and Qwen2.5 LLM with BD3LM diffusion-based generation.
# Usage example: load a DiffusionVL checkpoint and run image+text generation.
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
# Load model
# NOTE(review): trust_remote_code=True executes Python shipped inside the
# checkpoint repo — only use with checkpoints from a trusted source.
model = AutoModelForCausalLM.from_pretrained(
"path/to/model",
torch_dtype=torch.bfloat16,
device_map="auto",
trust_remote_code=True
)
# Load processor (also remote code; pairs the image processor with the tokenizer)
processor = AutoProcessor.from_pretrained("path/to/model", trust_remote_code=True)
# Prepare inputs: one RGB image plus a chat-style prompt containing an image slot
image = Image.open("image.jpg").convert("RGB")
messages = [
{"role": "user", "content": [
{"type": "image"},
{"type": "text", "text": "Describe this image."}
]}
]
# Render the messages with the model's chat template; add_generation_prompt=True
# appends the assistant-turn prefix so generation starts at the reply.
text = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
# Move only tensor-like entries to the model device; leave everything else as-is.
inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
# Generate
# NOTE(review): gen_length / steps / temperature / remasking_strategy are
# DiffusionVL-specific diffusion-decoding arguments defined by the remote
# modeling code, not standard transformers generate() kwargs — confirm their
# semantics against the checkpoint's generation code. temperature=0.0
# presumably selects deterministic (greedy) decoding.
output_ids = model.generate(
inputs=inputs["input_ids"],
images=inputs.get("pixel_values"),  # may be None if the processor emitted no pixel_values
gen_length=256,
steps=8,
temperature=0.0,
remasking_strategy="low_confidence_static",
)
# Decode
# NOTE(review): whether output_ids includes the prompt tokens depends on the
# remote generate() implementation — verify before stripping the prefix.
output_text = processor.decode(output_ids[0], skip_special_tokens=True)
print(output_text)