# FastVLM Screenshot Explainer (CPU-only, no uploads)
# Space idea: curated gallery → caption / extract numbers / VQA
# Model: apple/FastVLM-0.5B (Research-only license)
# ─────────────────────────────────────────────────────────────────────────────
import time
import io

import requests
from PIL import Image
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "apple/FastVLM-0.5B"
IMAGE_TOKEN_INDEX = -200  # per model card
DEVICE = "cpu"

# A tiny curated gallery (HF/COCO-hosted images)
SAMPLES = {
    # general photo (COCO)
    "Cats on a couch (COCO)": "http://images.cocodataset.org/val2017/000000039769.jpg",
    # charts (ChartMuseum dataset)
    "Chart — Blind wine tasting": "https://huggingface.co/datasets/lytang/ChartMuseum/resolve/main/images/wine_blind_taste.png",
    "Chart — Life expectancy (Africa vs Asia)": "https://huggingface.co/datasets/lytang/ChartMuseum/resolve/main/images/life-expectancy-africa-vs-asia.png",
    # document-like page (HF internal testing)
    "Document page — example": "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/1.jpg",
}

TASK_PROMPTS = {
    "Explain": "Describe this image in detail.",
    "Extract numbers": (
        "Extract every number you can see with its label/context. "
        "Return a concise YAML list with fields: value, what_it_refers_to."
    ),
    "Write alt-text": (
        "Write high-quality alt-text (<=200 chars) that would help a blind user understand "
        "the key content and purpose of this image."
    ),
    "Ask a question…": None,  # free-form
}

# ── Model load (CPU) ─────────────────────────────────────────────────────────
tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU
    device_map={"": DEVICE},
    trust_remote_code=True,
)

# ── Helpers ──────────────────────────────────────────────────────────────────
def _fetch_image(url: str) -> Image.Image:
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    return Image.open(io.BytesIO(r.content)).convert("RGB")


def _build_inputs(prompt: str):
    # Build chat with the <image> placeholder exactly once (per model card),
    # then splice the image token id into the sequence at that position.
    messages = [{"role": "user", "content": f"<image>\n{prompt}"}]
    rendered = tok.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    pre, post = rendered.split("<image>", 1)
    pre_ids = tok(pre, return_tensors="pt", add_special_tokens=False).input_ids
    post_ids = tok(post, return_tensors="pt", add_special_tokens=False).input_ids
    img_tok = torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=pre_ids.dtype)
    input_ids = torch.cat([pre_ids, img_tok, post_ids], dim=1).to(model.device)
    attention_mask = torch.ones_like(input_ids, device=model.device)
    return input_ids, attention_mask


def _prepare_pixels(pil_image: Image.Image):
    # Use the model's own processor from the vision tower
    px = model.get_vision_tower().image_processor(images=pil_image, return_tensors="pt")["pixel_values"]
    return px.to(model.device, dtype=model.dtype)


@torch.inference_mode()
def run_inference(choice: str, task: str, user_q: str, max_new_tokens: int, temperature: float):
    try:
        img = _fetch_image(SAMPLES[choice])
    except Exception as e:
        return None, f"Could not load image: {e}", ""

    # Decide prompt
    if task == "Ask a question…":
        prompt = user_q.strip() or "Answer questions about this image."
    else:
        prompt = TASK_PROMPTS[task]

    # Build model inputs
    input_ids, attention_mask = _build_inputs(prompt)
    px = _prepare_pixels(img)

    # Generate (temperature only takes effect when sampling is enabled)
    do_sample = float(temperature) > 0
    t0 = time.perf_counter()
    out = model.generate(
        inputs=input_ids,
        attention_mask=attention_mask,
        images=px,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        temperature=float(temperature) if do_sample else None,
    )
    t1 = time.perf_counter()
    text = tok.decode(out[0], skip_special_tokens=True)

    # Rough throughput metric (the remote-code generate may return only the new tokens)
    out_len = out.shape[-1]
    gen_len = out_len - input_ids.shape[-1] if out_len > input_ids.shape[-1] else out_len
    elapsed = t1 - t0
    meta = (
        f"⏱️ {elapsed:.2f}s • new tokens: {gen_len} • "
        f"~{(gen_len / elapsed if elapsed > 0 else 0):.1f} tok/s • device: {DEVICE.upper()}"
    )
    return img, text.strip(), meta

# ── Gradio UI ────────────────────────────────────────────────────────────────
with gr.Blocks(title="FastVLM Screenshot Explainer (CPU)") as demo:
    gr.Markdown(
        """
        # ⚡ FastVLM Screenshot Explainer — CPU-only (no uploads)
        Click an example image, pick a task, and go.
        Model: **apple/FastVLM-0.5B** (research license).
        """
    )
    with gr.Row():
        choice = gr.Dropdown(
            label="Choose example image",
            choices=list(SAMPLES.keys()),
            value=list(SAMPLES.keys())[0],
        )
        task = gr.Radio(
            label="Task",
            choices=list(TASK_PROMPTS.keys()),
            value="Explain",
            info="‘Ask a question…’ enables free-form VQA.",
        )
    user_q = gr.Textbox(
        label="If asking a question, type it here",
        placeholder="e.g., What is the trend from 1950 to 2000?",
    )
    with gr.Accordion("Generation settings", open=False):
        max_new = gr.Slider(32, 256, 128, step=8, label="max_new_tokens")
        temp = gr.Slider(0.0, 1.0, 0.2, step=0.05, label="temperature")
    go = gr.Button("Explain / Answer", variant="primary")
    with gr.Row():
        img_out = gr.Image(label="Image", interactive=False)
        txt_out = gr.Textbox(label="Model output", lines=14)
    meta = gr.Markdown()

    go.click(run_inference, [choice, task, user_q, max_new, temp], [img_out, txt_out, meta])

    gr.Markdown(
        """
        **Notes**
        - Runs on CPU in float32. To try a GPU, set `DEVICE = "cuda"` and load the model in `torch.float16`.
        - Model + usage based on the official model card’s `trust_remote_code` API and image-token handling.
        - **License:** Apple AML Research License — *research & non-commercial use only*.
        """
    )

if __name__ == "__main__":
    demo.launch()