Upload app.py with huggingface_hub
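The commit message above is the default message written by the `huggingface_hub` client when a file is pushed programmatically rather than through the web editor. A minimal sketch of such an upload; the `repo_id` is a placeholder, since the actual Space id is not part of this page:

```python
# Sketch of the programmatic upload implied by the commit message.
# The repo_id below is a placeholder, not the actual Space.
from huggingface_hub import HfApi

api = HfApi()  # authenticates via a saved token or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="app.py",
    path_in_repo="app.py",
    repo_id="your-username/your-space",
    repo_type="space",
    commit_message="Upload app.py with huggingface_hub",
)
```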
app.py CHANGED
@@ -2,22 +2,31 @@ import gradio as gr
 import torch
 from PIL import Image
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import spaces
 
 # Model configuration
 MID = "apple/FastVLM-0.5B"
 IMAGE_TOKEN_INDEX = -200
 
-# Load model and tokenizer
-print("Loading model...")
-tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
-model = AutoModelForCausalLM.from_pretrained(
-    MID,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto",
-    trust_remote_code=True,
-)
-print("Model loaded successfully!")
+# Load model and tokenizer (will be loaded on first GPU allocation)
+tok = None
+model = None
 
+def load_model():
+    global tok, model
+    if tok is None or model is None:
+        print("Loading model...")
+        tok = AutoTokenizer.from_pretrained(MID, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            MID,
+            torch_dtype=torch.float16,
+            device_map="cuda",
+            trust_remote_code=True,
+        )
+        print("Model loaded successfully!")
+    return tok, model
+
+@spaces.GPU(duration=60)
 def caption_image(image, custom_prompt=None):
     """
     Generate a caption for the input image.
@@ -33,6 +42,8 @@ def caption_image(image, custom_prompt=None):
         return "Please upload an image first."
 
     try:
+        # Load model if not already loaded
+        tok, model = load_model()
         # Convert image to RGB if needed
         if image.mode != "RGB":
             image = image.convert("RGB")
@@ -149,7 +160,7 @@ with gr.Blocks(title="FastVLM Image Captioning") as demo:
         ---
         **Model:** [apple/FastVLM-0.5B](https://huggingface.co/apple/FastVLM-0.5B)
 
-        **Note:** This
+        **Note:** This Space uses ZeroGPU for dynamic GPU allocation.
         """
     )
 
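For context on the diff above: on ZeroGPU Spaces a GPU is attached only while a function decorated with `@spaces.GPU` is executing (here for up to roughly 60 seconds per call), which is why this commit defers model loading into `load_model()` instead of loading at import time. A stripped-down sketch of the same pattern, independent of this app; the model id, `run_inference`, and the generation call are illustrative placeholders, not code from the Space:

```python
# Generic ZeroGPU lazy-load sketch; SOME_MODEL_ID and run_inference are placeholders.
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

SOME_MODEL_ID = "your-org/your-model"  # hypothetical model id
_tok, _model = None, None

def _load():
    """Load the tokenizer and model once, on the first GPU-decorated call."""
    global _tok, _model
    if _model is None:
        _tok = AutoTokenizer.from_pretrained(SOME_MODEL_ID, trust_remote_code=True)
        _model = AutoModelForCausalLM.from_pretrained(
            SOME_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="cuda",  # fine here: only ever called inside the GPU context
            trust_remote_code=True,
        )
    return _tok, _model

@spaces.GPU(duration=60)  # a GPU is allocated only while this function runs
def run_inference(prompt: str) -> str:
    tok, model = _load()
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(**inputs, max_new_tokens=64)
    return tok.decode(out[0], skip_special_tokens=True)
```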