import logging
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer

logger = logging.getLogger(__name__)


class EmbeddingService:
    """HuggingFace sentence-transformers wrapper for generating embeddings."""

    # Class-level cache so repeated instantiations with the same
    # (model_name, device) pair share one loaded model instead of
    # paying the load cost again.
    _model_cache = {}

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: str = "cpu",
        batch_size: int = 32,
    ):
        """
        Initialize the embedding service.

        Args:
            model_name: HuggingFace model name
            device: Device to run the model on ('cpu' or 'cuda')
            batch_size: Batch size for processing multiple texts
        """
        self.model_name = model_name
        self.device = device
        self.batch_size = batch_size

        # Load model (with class-level caching).
        self.model = self._load_model()

        logger.info(
            "Initialized EmbeddingService with model '%s' on device '%s'",
            model_name,
            device,
        )

    def _load_model(self) -> SentenceTransformer:
        """Load the sentence transformer model, reusing a cached instance
        when the same (model_name, device) pair was loaded before."""
        cache_key = f"{self.model_name}_{self.device}"

        if cache_key not in self._model_cache:
            logger.info(
                "Loading model '%s' on device '%s'...", self.model_name, self.device
            )
            model = SentenceTransformer(self.model_name, device=self.device)
            self._model_cache[cache_key] = model
            logger.info("Model loaded successfully")
        else:
            logger.info("Using cached model '%s'", self.model_name)

        return self._model_cache[cache_key]

    @staticmethod
    def _clean(text: str) -> str:
        """Return *text*, or a single space when it is empty/whitespace-only,
        so the model never receives completely empty input."""
        return text if text.strip() else " "

    def embed_text(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Args:
            text: Text to embed

        Returns:
            List of float values representing the embedding

        Raises:
            Exception: re-raised from the underlying model on encode failure.
        """
        try:
            embedding = self.model.encode(self._clean(text), convert_to_numpy=True)
            # Convert numpy array to a plain Python list of floats.
            return embedding.tolist()
        except Exception:
            # logger.exception records the full traceback; bare `raise`
            # preserves the original exception chain for the caller.
            logger.exception("Failed to generate embedding for text")
            raise

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """
        Generate embeddings for multiple texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of embeddings (each embedding is a list of floats)

        Raises:
            Exception: re-raised from the underlying model on encode failure.
        """
        if not texts:
            return []

        try:
            # Replace empty texts with a single space so encoding never
            # sees a fully empty input.
            processed_texts = [self._clean(text) for text in texts]

            # Encode in explicit batches of self.batch_size.
            all_embeddings: List[List[float]] = []
            for start in range(0, len(processed_texts), self.batch_size):
                batch_texts = processed_texts[start : start + self.batch_size]
                batch_embeddings = self.model.encode(
                    batch_texts,
                    convert_to_numpy=True,
                    show_progress_bar=False,  # keep output clean
                )
                all_embeddings.extend(emb.tolist() for emb in batch_embeddings)

            logger.info("Generated embeddings for %d texts", len(texts))
            return all_embeddings
        except Exception:
            logger.exception("Failed to generate embeddings for texts")
            raise

    def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by this model."""
        return self.model.get_sentence_embedding_dimension()

    def encode_batch(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings and return as numpy array (for efficiency).

        Args:
            texts: List of texts to embed

        Returns:
            NumPy array of embeddings (empty array for empty input)
        """
        if not texts:
            return np.array([])

        processed_texts = [self._clean(text) for text in texts]
        # Honor the configured batch size (model.encode batches internally).
        return self.model.encode(
            processed_texts,
            convert_to_numpy=True,
            batch_size=self.batch_size,
        )

    def similarity(self, text1: str, text2: str) -> float:
        """
        Calculate cosine similarity between two texts.

        Args:
            text1: First text
            text2: Second text

        Returns:
            Cosine similarity score; 0.0 on failure or when either
            embedding has zero norm (avoids a NaN from 0/0).
        """
        try:
            embeddings = self.embed_texts([text1, text2])
            embed1 = np.array(embeddings[0])
            embed2 = np.array(embeddings[1])

            denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
            # Guard the zero-vector case: numpy would produce nan/inf with
            # only a warning, silently escaping the except clause below.
            if denom == 0.0:
                return 0.0
            return float(np.dot(embed1, embed2) / denom)
        except Exception:
            logger.exception("Failed to calculate similarity")
            return 0.0