from __future__ import annotations

import csv
import json
import logging
import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional

logger = logging.getLogger(__name__)

# Model identifier used when a CSV row names no model and as the default
# ``model_name`` of ``ClipScorer``.
_DEFAULT_CLIP_MODEL = "jinaai/jina-clip-v2"


@dataclass(frozen=True)
class ImageEntry:
    """Container for image metadata and the path to its embedding file."""

    # Identifier under which the embedding is stored in ``ClipScorer``.
    image_id: str
    # Taken verbatim from the CSV; not used elsewhere in this module.
    image_url: str
    # Compared against ``ClipScorer.model_name`` when embeddings are loaded.
    clip_model: str
    # File read by ``ClipScorer.load_precomputed_embeddings``.
    embedding_path: Path


def load_image_entries(csv_path: Path | str) -> List[ImageEntry]:
    """Read the image list from a CSV file.

    Recognised columns (each with a fallback):

    * ``image_id`` or ``id`` -- required, must be non-empty.
    * ``image_url`` or ``url`` -- passed through unchanged; ``None`` when
      neither column holds a value.
    * ``clip_model`` -- defaults to ``"jinaai/jina-clip-v2"``.
    * ``embedding_path`` -- defaults to ``embeddings/<image_id>.npy``.

    Args:
        csv_path: Location of the CSV file.

    Returns:
        One ``ImageEntry`` per data row, in file order.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If a row has no image ID.
    """
    path = Path(csv_path)
    if not path.exists():
        raise FileNotFoundError(f"Die Datei {path} existiert nicht.")

    entries: List[ImageEntry] = []
    # ``utf-8-sig`` drops a leading BOM (otherwise the first header would be
    # "\ufeffimage_id" and never match) and reads BOM-less files unchanged.
    # ``newline=""`` is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with path.open("r", encoding="utf-8-sig", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            image_id = row.get("image_id") or row.get("id")
            if not image_id:
                # Without an ID the entry could never be looked up and the
                # default embedding path would become "embeddings/None.npy".
                raise ValueError(
                    f"Zeile {reader.line_num} in {path} enthält keine Bild-ID."
                )
            image_url = row.get("image_url") or row.get("url")
            clip_model = row.get("clip_model") or _DEFAULT_CLIP_MODEL
            embedding_path = row.get("embedding_path") or f"embeddings/{image_id}.npy"
            entries.append(
                ImageEntry(
                    image_id=image_id,
                    image_url=image_url,
                    clip_model=clip_model,
                    embedding_path=Path(embedding_path),
                )
            )
    return entries


def similarity_to_score(similarity: float) -> int:
    """Convert a cosine similarity (-1 to 1) into a score from 0 to 1000.

    Values outside ``[-1, 1]`` are clipped to that range before scaling.

    Args:
        similarity: Cosine similarity to convert.

    Returns:
        Integer score: ``-1`` maps to 0, ``0`` to 500 and ``1`` to 1000.

    Raises:
        ValueError: If ``similarity`` is NaN.
    """
    # ``min(1.0, nan)`` evaluates to 1.0, so without this guard a NaN would
    # silently be reported as the best possible score.
    if math.isnan(similarity):
        raise ValueError(
            "Die Ähnlichkeit ist NaN und kann nicht in einen Score umgewandelt werden."
        )
    clipped = max(-1.0, min(1.0, similarity))
    return int(round(((clipped + 1.0) / 2.0) * 1000))


def _require_numpy():
    """Import and return ``numpy``, raising a descriptive error if missing."""
    try:
        import numpy as np  # type: ignore
    except ModuleNotFoundError as exc:  # pragma: no cover - defensive fallback
        raise ModuleNotFoundError("numpy wird benötigt, ist aber nicht installiert.") from exc
    return np


def _require_torch():
    """Import and return ``torch``, raising a descriptive error if missing."""
    try:
        import torch  # type: ignore
    except ModuleNotFoundError as exc:  # pragma: no cover - defensive fallback
        raise ModuleNotFoundError("torch wird benötigt, ist aber nicht installiert.") from exc
    return torch


def _require_transformers():
    """Import ``transformers`` and return ``(AutoModel, AutoProcessor)``."""
    try:
        from transformers import AutoModel, AutoProcessor  # type: ignore
    except ModuleNotFoundError as exc:  # pragma: no cover - defensive fallback
        raise ModuleNotFoundError(
            "transformers wird benötigt, ist aber nicht installiert."
        ) from exc
    return AutoModel, AutoProcessor


class ClipScorer:
    """Wrapper around CLIP for text/image embeddings and scores."""

    def __init__(
        self,
        model_name: str = _DEFAULT_CLIP_MODEL,
        pretrained: Optional[str] = None,
        device: Optional[str] = None,
    ) -> None:
        """Load the model and processor and prepare an empty embedding store.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            pretrained: Stored on the instance but otherwise unused; a
                warning is logged when it is set and differs from
                ``model_name``.
            device: Target device; when omitted, ``"cuda"`` is used if
                ``torch.cuda.is_available()`` and ``"cpu"`` otherwise.

        Raises:
            ModuleNotFoundError: If torch or transformers is not installed.
        """
        self.model_name = model_name
        self.pretrained = pretrained

        torch = _require_torch()
        AutoModel, AutoProcessor = _require_transformers()

        self._torch = torch
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")

        if pretrained and pretrained != model_name:
            logger.warning(
                "Der Parameter 'pretrained' (%s) wird für transformers-basierte Modelle ignoriert.",
                pretrained,
            )

        logger.info("Lade CLIP Modell %s auf %s", model_name, self.device)
        # NOTE(review): trust_remote_code=True allows transformers to run
        # Python code shipped with the model repository; only use model
        # names from sources you trust.
        self.processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        self.model.to(self.device)
        self.model.eval()
        # Inference only: no parameter needs gradients.
        for parameter in self.model.parameters():
            parameter.requires_grad = False

        # Expected embedding width, read from the model config when present:
        # ``projection_dim`` first, ``hidden_size`` as a fallback. Stays
        # ``None`` (dimension check disabled) if neither is available.
        config = getattr(self.model, "config", None)
        embedding_dim = None
        if config is not None:
            embedding_dim = getattr(config, "projection_dim", None)
            if embedding_dim is None:
                embedding_dim = getattr(config, "hidden_size", None)
        self.embedding_dim: Optional[int] = embedding_dim

        # image_id -> L2-normalised embedding tensor on ``self.device``.
        self._image_embeddings: Dict[str, Any] = {}

    def _read_embedding(self, path: Path) -> Any:
        """Read one embedding file and return it as a float32 tensor.

        ``.json`` files may hold either a bare (nested) list or an object
        with the values under ``"embedding"``, ``"values"`` or ``"data"``;
        any other suffix is read with ``numpy.load``.
        """
        torch = self._torch
        if path.suffix.lower() == ".json":
            with path.open("r", encoding="utf-8") as handle:
                payload = json.load(handle)
            if isinstance(payload, dict):
                values = (
                    payload.get("embedding")
                    or payload.get("values")
                    or payload.get("data")
                )
            else:
                values = payload
            if values is None:
                raise ValueError(f"Embedding-Datei {path} enthält keine Werte.")
            tensor = torch.tensor(values, dtype=torch.float32, device=self.device)
            if tensor.ndim > 1:
                tensor = tensor.view(-1)
            return tensor

        np = _require_numpy()
        array = np.load(path)
        if array.ndim > 1:
            # Drop singleton axes such as a leading batch dimension of 1.
            array = array.squeeze()
        # Convert the dtype before moving to the device so a float64 array
        # never reaches a backend that may not support doubles.
        return torch.from_numpy(array).to(dtype=torch.float32).to(self.device)

    def load_precomputed_embeddings(self, entries: Iterable[ImageEntry]) -> None:
        """Load embeddings from files and store them, normalised, by image ID.

        Entries whose ``clip_model`` differs from ``self.model_name`` are
        skipped with a warning. Each loaded vector is divided by its L2 norm
        before being stored.

        Args:
            entries: Image entries whose ``embedding_path`` should be read.

        Raises:
            FileNotFoundError: If an embedding file does not exist.
            ValueError: If a file holds no values, the embedding is not a
                vector, its length differs from ``self.embedding_dim``, its
                norm is zero or non-finite, or no embedding was loaded.
        """
        torch = self._torch
        expected_dim = self.embedding_dim
        loaded = 0
        for entry in entries:
            if entry.clip_model != self.model_name:
                logger.warning(
                    "Überspringe Bild %s: erwartet Modell %s, gefunden %s",
                    entry.image_id,
                    self.model_name,
                    entry.clip_model,
                )
                continue

            if not entry.embedding_path.exists():
                raise FileNotFoundError(
                    f"Embedding-Datei für {entry.image_id} fehlt: {entry.embedding_path}"
                )

            tensor = self._read_embedding(entry.embedding_path)

            # Scalars and multi-row arrays cannot be used as one embedding;
            # reject them here instead of failing later in the dot product.
            if tensor.ndim != 1:
                raise ValueError(
                    f"Embedding für {entry.image_id} muss ein Vektor sein, "
                    f"erhalten: Form {tuple(tensor.shape)}."
                )

            if expected_dim is not None and tensor.shape[-1] != expected_dim:
                raise ValueError(
                    "Embedding-Dimension stimmt nicht mit dem geladenen Modell überein. "
                    f"Erwartet: {expected_dim}, erhalten: {tensor.shape[-1]} für {entry.image_id}."
                )

            norm = torch.linalg.norm(tensor)
            norm_value = float(norm)
            if not math.isfinite(norm_value):
                # NaN/Inf components would turn every later similarity into NaN.
                raise ValueError(
                    f"Embedding für {entry.image_id} enthält ungültige Werte (NaN oder Inf)."
                )
            if norm_value == 0:
                raise ValueError(f"Embedding für {entry.image_id} hat Norm 0.")

            self._image_embeddings[entry.image_id] = tensor / norm
            loaded += 1

        if loaded == 0:
            raise ValueError("Keine Embeddings konnten geladen werden.")

        logger.info("%d Embeddings geladen.", loaded)

    def _tensors_on_device(self, inputs: Any) -> Dict[str, Any]:
        """Keep only the tensor values of ``inputs`` and move them to the device."""
        tensor_type = self._torch.Tensor
        return {
            key: value.to(self.device)
            for key, value in inputs.items()
            if isinstance(value, tensor_type)
        }

    def encode_text(self, text: str) -> Any:
        """Return the L2-normalised text embedding of ``text`` as a 1-D tensor."""
        torch = self._torch
        inputs = self.processor(text=[text], return_tensors="pt", padding=True, truncation=True)
        inputs = self._tensors_on_device(inputs)
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs).float()
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        # A single text was passed in, so take the first (only) row.
        return text_features[0]

    def encode_image(self, image: Any) -> Any:
        """Return the L2-normalised embedding of ``image``.

        ``image`` is handed to the processor unchanged; the first row of the
        resulting feature batch is returned.
        """
        torch = self._torch
        inputs = self.processor(images=image, return_tensors="pt")
        inputs = self._tensors_on_device(inputs)
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs).float()
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features[0]

    def get_image_embedding(self, image_id: str) -> Any:
        """Return the stored embedding for ``image_id``.

        Raises:
            KeyError: If no embedding has been loaded for ``image_id``.
        """
        try:
            return self._image_embeddings[image_id]
        except KeyError as exc:
            raise KeyError(f"Kein Embedding für Bild-ID {image_id} geladen.") from exc

    def compute_similarity(self, text_embedding: Any, image_embedding: Any) -> float:
        """Return the dot product of the two embeddings as a Python float.

        For the unit-length vectors produced by this class the dot product
        equals the cosine similarity.
        """
        torch = self._torch
        similarity = torch.matmul(text_embedding, image_embedding)
        return float(similarity.item())

    def score_text_for_image(self, text: str, image_id: str) -> tuple[float, int]:
        """Score ``text`` against the stored embedding of ``image_id``.

        Returns:
            ``(similarity, score)`` where ``score`` is the similarity mapped
            to 0..1000 by ``similarity_to_score``.

        Raises:
            KeyError: If no embedding has been loaded for ``image_id``.
        """
        text_embedding = self.encode_text(text)
        image_embedding = self.get_image_embedding(image_id)
        similarity = self.compute_similarity(text_embedding, image_embedding)
        score = similarity_to_score(similarity)
        return similarity, score


__all__ = [
    "ClipScorer",
    "ImageEntry",
    "load_image_entries",
    "similarity_to_score",
]