Update handler.py

handler.py  CHANGED  +126 -39

@@ -69,61 +69,148 @@ class EndpointHandler:
             key=lambda x: x["score"],
             reverse=True
         )
-#             "MobileCLIP-B", pretrained=
-#         model.eval()
-#         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
-#         self.model.to(self.device)
-#         # Fix 2: Explicitly set model to half-precision if on CUDA
-#         # This matches the behavior of torch.set_default_dtype(torch.float16)
-#         img_b64 = payload["image"]
-#         labels = payload.get("candidate_labels", [])
-#         if not labels:
-#             return {"error": "candidate_labels list is empty"}
-#             {"label": l, "score": float(p)}
-#             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
-#         ]
+
+# """
+# MobileCLIP‑B Zero‑Shot Image Classifier (Hugging Face Inference Endpoint)
+# ===========================================================================
+#
+# * One container instance is created per replica; the `EndpointHandler`
+#   object below is instantiated exactly **once** at start‑up.
+#
+# * At request time (`__call__`) we receive a base‑64‑encoded image, run a
+#   **single forward pass**, and return class probabilities.
+#
+# Design choices
+# --------------
+#
+# 1. **Model & transform come from OpenCLIP**
+#    This guarantees we apply **identical preprocessing** to what the model
+#    was trained with (224 × 224 crop + mean/std normalisation).
+#
+# 2. **Re‑parameterisation for inference**
+#    MobileCLIP uses MobileOne blocks that have extra convolution branches
+#    for training; `reparameterize_model` fuses them so inference is fast
+#    and deterministic.
+#
+# 3. **Text embeddings are cached**
+#    The class “prompts” (e.g. `"a photo of a cat"`) are encoded **once at
+#    start‑up**. Each request therefore encodes *only* the image and
+#    performs a single matrix multiplication.
+#
+# 4. **Mixed precision on GPU**
+#    If the container has CUDA, we cast the model **and** inputs to
+#    `float16`. That halves memory and roughly doubles throughput on most
+#    modern GPUs. On CPU we stay in `float32` for numerical stability.
+# """
+
+# import contextlib, io, base64, json
+# from pathlib import Path
+# from typing import Any, Dict, List
+
+# import torch
 # from PIL import Image
 # import open_clip
+
+# from reparam import reparameterize_model  # local copy (~60 LoC) of Apple’s helper
+

 # class EndpointHandler:
+#     """
+#     Hugging Face entry‑point. The toolkit will instantiate this class
+#     once and call it for every HTTP request.
+#
+#     Parameters
+#     ----------
+#     path : str, optional
+#         Root directory of the repository. HF mounts the code under
+#         `/repository`; we use this path to locate `items.json`.
+#     """
+
+#     # ------------------------------------------------------------------ #
+#     # INITIALISATION (runs **once**)                                     #
+#     # ------------------------------------------------------------------ #
+#     def __init__(self, path: str = "") -> None:
 #         self.device = "cuda" if torch.cuda.is_available() else "cpu"

+#         # 1️⃣ Load MobileCLIP‑B weights & transforms -------------------
+#         # `pretrained="datacompdr"` makes OpenCLIP download the
+#         # official checkpoint from the Hub (cached in the image layer).
 #         model, _, self.preprocess = open_clip.create_model_and_transforms(
+#             "MobileCLIP-B", pretrained="datacompdr"
 #         )
+#         model.eval()                          # disable dropout / BN updates
+#         model = reparameterize_model(model)   # fuse MobileOne branches
+#         model.to(self.device)
 #         if self.device == "cuda":
+#             model = model.to(torch.float16)   # FP16 for throughput
+#         self.model = model                    # hold a reference

+#         # 2️⃣ Build the tokenizer once --------------------------------
+#         tokenizer = open_clip.get_tokenizer("MobileCLIP-B")

+#         # 3️⃣ Load class metadata -------------------------------------
+#         # Expect JSON file: [{"id": 3, "name": "cat", "prompt": "cat"}, …]
+#         items_path = Path(path) / "items.json"
+#         with items_path.open("r", encoding="utf-8") as f:
+#             class_defs: List[Dict[str, Any]] = json.load(f)

+#         # Extract the bits we need later
+#         prompts = [item["prompt"] for item in class_defs]
+#         self.class_ids: List[int] = [item["id"] for item in class_defs]
+#         self.class_names: List[str] = [item["name"] for item in class_defs]
+
+#         # 4️⃣ Encode all prompts once ---------------------------------
+#         with torch.no_grad():
+#             text_tokens = tokenizer(prompts).to(self.device)
+#             text_feats = self.model.encode_text(text_tokens)
+#             text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True)
+#         self.text_features = text_feats       # [num_classes, 512]
+
+#     # ------------------------------------------------------------------ #
+#     # INFERENCE CALL                                                     #
+#     # ------------------------------------------------------------------ #
+#     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+#         """
+#         Parameters
+#         ----------
+#         data : dict
+#             Either the raw payload `{"image": "<base64>"}` **or** the
+#             Hugging Face convention `{"inputs": {...}}`.
+#
+#         Returns
+#         -------
+#         list of dict
+#             Sorted list of `{"id": int, "label": str, "score": float}`.
+#             Scores are the softmax probabilities over the *provided*
+#             class list (they sum to 1.0).
+#         """
+#         # 1️⃣ Unpack the request payload ------------------------------
+#         payload: Dict[str, Any] = data.get("inputs", data)
+#         img_b64: str = payload["image"]
+
+#         # 2️⃣ Decode + preprocess -------------------------------------
+#         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)  # [1, 3, 224, 224]
 #         if self.device == "cuda":
 #             img_tensor = img_tensor.to(torch.float16)

+#         # 3️⃣ Forward pass (image only) -------------------------------
+#         with torch.no_grad():                               # no autograd graph
+#             img_feat = self.model.encode_image(img_tensor)  # [1, 512]
+#             img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)  # L2‑normalise

+#             # cosine similarity → logits → softmax probabilities
+#             probs = (100 * img_feat @ self.text_features.T).softmax(dim=-1)[0]  # [num_classes]
+
+#         # 4️⃣ Assemble JSON‑serialisable response ---------------------
+#         results = zip(self.class_ids, self.class_names, probs.cpu().tolist())
+#         return sorted(
+#             [{"id": cid, "label": name, "score": float(p)} for cid, name, p in results],
+#             key=lambda x: x["score"],
+#             reverse=True,
+#         )
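
The commented-out handler above expects an `items.json` file at the repository root, which is not part of this commit. A minimal illustrative example matching the format described in the comments (one object per class with `id`, `name`, and `prompt` keys; the labels below are made up):

[
  {"id": 0, "name": "cat",  "prompt": "a photo of a cat"},
  {"id": 1, "name": "dog",  "prompt": "a photo of a dog"},
  {"id": 2, "name": "bird", "prompt": "a photo of a bird"}
]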
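
For completeness, a minimal client-side sketch of how such an endpoint would be called, following the payload convention documented in `__call__` (`{"inputs": {"image": "<base64>"}}`). The endpoint URL, token, and image filename are placeholders, and the snippet assumes the `requests` package is installed:

import base64
import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."                                                    # placeholder

# Base64-encode the image exactly as the handler expects.
with open("example.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json={"inputs": {"image": img_b64}},
)

# Expected shape of the response: a list sorted by descending score, e.g.
# [{"id": 0, "label": "cat", "score": 0.93}, {"id": 1, "label": "dog", "score": 0.05}, ...]
print(response.json())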