"""
SearchService - Semantic document search functionality.

This module provides semantic search capabilities for the document corpus
using embeddings and vector similarity search through ChromaDB integration.

Classes:
    SearchService: Main class for performing semantic search operations
"""

import logging
from typing import Any, Dict, List, Optional

from src.embedding.embedding_service import EmbeddingService
from src.vector_store.vector_db import VectorDatabase

logger = logging.getLogger(__name__)


class SearchService:
    """
    Semantic search service for finding relevant documents using embeddings.

    This service combines text embedding generation with vector similarity
    search to provide relevant document retrieval based on semantic similarity
    rather than keyword matching.

    Attributes:
        vector_db: VectorDatabase instance for similarity search
        embedding_service: EmbeddingService instance for query embedding
    """

    def __init__(
        self,
        vector_db: Optional[VectorDatabase],
        embedding_service: Optional[EmbeddingService],
    ):
        """
        Initialize SearchService with required dependencies.

        Both parameters are annotated ``Optional`` only so that a ``None``
        argument is rejected here with a clear message instead of failing
        later on first use.

        Args:
            vector_db: VectorDatabase instance for storing and searching
                embeddings
            embedding_service: EmbeddingService instance for generating
                embeddings

        Raises:
            ValueError: If either vector_db or embedding_service is None
        """
        if vector_db is None:
            raise ValueError("vector_db cannot be None")

        if embedding_service is None:
            raise ValueError("embedding_service cannot be None")

        self.vector_db = vector_db
        self.embedding_service = embedding_service
        logger.info("SearchService initialized successfully")

    def search(
        self, query: str, top_k: int = 5, threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Perform semantic search for relevant documents.

        The query is stripped of surrounding whitespace, embedded with the
        embedding service, and passed to the vector database; the raw hits
        are then converted and filtered by ``_format_search_results``.

        Args:
            query: Text query to search for
            top_k: Maximum number of results to request (must be positive)
            threshold: Minimum similarity score threshold (0.0 to 1.0)

        Returns:
            List of search results, each containing:
                - chunk_id: Unique identifier for the document chunk
                - content: Text content of the document chunk
                - similarity_score: ``1.0 - distance`` (higher is better)
                - metadata: Additional metadata (filename, chunk_index, etc.)

        Raises:
            ValueError: If query is empty, top_k is not positive, or
                threshold is outside the range 0.0 to 1.0
            Exception: Any error raised while embedding the query or
                searching the vector database is logged and re-raised
                unchanged (it is not wrapped in another exception type)
        """
        # Validate input parameters before doing any work. These checks sit
        # outside the try block so that validation errors are not logged as
        # search failures.
        if not query or not query.strip():
            raise ValueError("Query cannot be empty")

        if top_k <= 0:
            raise ValueError("top_k must be positive")

        # Written as a negated chained comparison so that NaN is rejected too.
        if not (0.0 <= threshold <= 1.0):
            raise ValueError("threshold must be between 0 and 1")

        try:
            # Generate embedding for the query (only a 50-character preview
            # of the query is written to the debug log).
            logger.debug("Generating embedding for query: '%s...'", query[:50])
            query_embedding = self.embedding_service.embed_text(query.strip())

            # Perform vector similarity search
            logger.debug("Searching vector database with top_k=%s", top_k)
            raw_results = self.vector_db.search(
                query_embedding=query_embedding, top_k=top_k
            )

            # Format and filter results
            formatted_results = self._format_search_results(raw_results, threshold)

            logger.info(
                "Search completed: %s results returned", len(formatted_results)
            )
            return formatted_results

        except Exception as e:
            # Log for diagnostics, then let the original exception propagate.
            logger.error("Search failed for query '%s': %s", query, e)
            raise

    @staticmethod
    def _field_or_default(result: Dict[str, Any], key: str, default: Any) -> Any:
        """Return ``result[key]``, or ``default`` if the key is missing or None."""
        value = result.get(key)
        return default if value is None else value

    def _format_search_results(
        self, raw_results: List[Dict[str, Any]], threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Format VectorDatabase results into standardized search result format.

        Each raw result is read through the keys ``id``, ``document``,
        ``distance`` and ``metadata``. A key that is missing *or* explicitly
        ``None`` falls back to a default (``""``, ``""``, ``1.0`` and ``{}``
        respectively), so a record without a distance gets a similarity of
        0.0 rather than raising ``TypeError``.
        NOTE(review): whether VectorDatabase.search() can actually return
        None for these fields is not visible from this module - confirm.

        Args:
            raw_results: Results from VectorDatabase.search()
            threshold: Minimum similarity score threshold

        Returns:
            List of formatted search results whose similarity score is at
            least ``threshold``, in the same order as ``raw_results``
        """
        formatted_results = []

        # Process each result from VectorDatabase format
        for result in raw_results:
            # Convert distance to similarity score (higher is better). The
            # value is not clamped, so distances outside 0.0-1.0 produce
            # scores outside that range as well.
            distance = self._field_or_default(result, "distance", 1.0)
            similarity_score = 1.0 - distance

            # Apply threshold filtering
            if similarity_score >= threshold:
                formatted_results.append(
                    {
                        "chunk_id": self._field_or_default(result, "id", ""),
                        "content": self._field_or_default(result, "document", ""),
                        "similarity_score": similarity_score,
                        "metadata": self._field_or_default(result, "metadata", {}),
                    }
                )

        logger.debug(
            "Formatted %s results above threshold %s",
            len(formatted_results),
            threshold,
        )
        return formatted_results