""" Tests for SearchService - Semantic document search functionality. This test suite covers: - SearchService initialization and configuration - Query embedding generation - Similarity search with ChromaDB integration - Result formatting and metadata handling - Error handling and edge cases - Performance and parameter validation """ import shutil import tempfile from unittest.mock import Mock import pytest from src.embedding.embedding_service import EmbeddingService from src.search.search_service import SearchService from src.vector_store.vector_db import VectorDatabase class TestSearchServiceInitialization: """Test SearchService initialization and configuration.""" def test_search_service_initialization(self): """Test that SearchService initializes correctly with required dependencies.""" mock_vector_db = Mock(spec=VectorDatabase) mock_embedding_service = Mock(spec=EmbeddingService) search_service = SearchService( vector_db=mock_vector_db, embedding_service=mock_embedding_service ) assert search_service.vector_db == mock_vector_db assert search_service.embedding_service == mock_embedding_service def test_search_service_with_none_dependencies(self): """Test that SearchService raises appropriate error with None dependencies.""" with pytest.raises(ValueError, match="vector_db cannot be None"): SearchService(vector_db=None, embedding_service=Mock()) with pytest.raises(ValueError, match="embedding_service cannot be None"): SearchService(vector_db=Mock(), embedding_service=None) class TestSearchFunctionality: """Test core search functionality.""" def setup_method(self): """Set up test fixtures for search functionality tests.""" self.mock_vector_db = Mock(spec=VectorDatabase) self.mock_embedding_service = Mock(spec=EmbeddingService) self.search_service = SearchService( vector_db=self.mock_vector_db, embedding_service=self.mock_embedding_service, enable_query_expansion=False, # Disable for unit tests ) def test_search_with_valid_query(self): """Test search functionality with a valid text query.""" # Mock embedding generation mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock vector database search results (VectorDatabase format) mock_raw_results = [ { "id": "doc_1", "document": "Remote work policy content...", "distance": 0.15, "metadata": {"filename": "remote_work_policy.md", "chunk_index": 2}, }, { "id": "doc_2", "document": "PTO policy content...", "distance": 0.25, "metadata": {"filename": "pto_policy.md", "chunk_index": 1}, }, ] self.mock_vector_db.search.return_value = mock_raw_results # Perform search results = self.search_service.search("remote work policy", top_k=2) # Verify embedding service was called self.mock_embedding_service.embed_text.assert_called_once_with( "remote work policy" ) # Verify vector database search was called self.mock_vector_db.search.assert_called_once_with( query_embedding=mock_embedding, top_k=2 ) # Verify results structure assert len(results) == 2 assert results[0]["chunk_id"] == "doc_1" assert results[0]["content"] == "Remote work policy content..." assert results[0]["similarity_score"] == pytest.approx( 0.925, abs=0.01 ) # max(0.0, 1.0 - (0.15 / 2.0)) = 0.925 assert results[0]["metadata"]["filename"] == "remote_work_policy.md" def test_search_with_empty_query(self): """Test search behavior with empty query string.""" with pytest.raises(ValueError, match="Query cannot be empty"): self.search_service.search("") with pytest.raises(ValueError, match="Query cannot be empty"): self.search_service.search(" ") # whitespace only def test_search_with_no_results(self): """Test search behavior when no results are found.""" # Mock embedding generation mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock empty search results (VectorDatabase format) mock_raw_results = [] self.mock_vector_db.search.return_value = mock_raw_results # Perform search results = self.search_service.search("non-existent topic") # Verify empty results assert results == [] def test_search_with_top_k_parameter(self): """Test search with different top_k values.""" mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock results for top_k=1 (VectorDatabase format) mock_raw_results = [ { "id": "doc_1", "document": "Content 1", "distance": 0.15, "metadata": {"filename": "file1.md", "chunk_index": 0}, } ] self.mock_vector_db.search.return_value = mock_raw_results # Test with top_k=1 results = self.search_service.search("test query", top_k=1) self.mock_vector_db.search.assert_called_with( query_embedding=mock_embedding, top_k=1 ) assert len(results) == 1 # Test with top_k=10 self.search_service.search("test query", top_k=10) self.mock_vector_db.search.assert_called_with( query_embedding=mock_embedding, top_k=10 ) def test_search_with_threshold_filtering(self): """Test search with similarity threshold filtering.""" # Mock embedding generation mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock results with varying distances (VectorDatabase format) mock_raw_results = [ { "id": "doc_1", "document": "High match", "distance": 0.1, # similarity: max(0.0, 1.0 - (0.1 / 2.0)) = 0.95 "metadata": {"filename": "file1.md", "chunk_index": 0}, }, { "id": "doc_2", "document": "Medium match", "distance": 0.5, # similarity: max(0.0, 1.0 - (0.5 / 2.0)) = 0.75 "metadata": {"filename": "file2.md", "chunk_index": 0}, }, { "id": "doc_3", "document": "Low match", "distance": 0.8, # similarity: max(0.0, 1.0 - (0.8 / 2.0)) = 0.6 "metadata": {"filename": "file3.md", "chunk_index": 0}, }, ] self.mock_vector_db.search.return_value = mock_raw_results # Search with threshold=0.7 (should return only first two results) results = self.search_service.search("test query", top_k=5, threshold=0.7) # Verify only results above threshold are returned assert len(results) == 2 assert results[0]["similarity_score"] == pytest.approx(0.95, abs=0.01) assert results[1]["similarity_score"] == pytest.approx(0.75, abs=0.01) class TestErrorHandling: """Test error handling and edge cases.""" def setup_method(self): """Set up test fixtures for error handling tests.""" self.mock_vector_db = Mock(spec=VectorDatabase) self.mock_embedding_service = Mock(spec=EmbeddingService) self.search_service = SearchService( vector_db=self.mock_vector_db, embedding_service=self.mock_embedding_service ) def test_search_with_embedding_service_error(self): """Test search behavior when embedding service fails.""" # Mock embedding service to raise an exception self.mock_embedding_service.embed_text.side_effect = RuntimeError( "Embedding model failed" ) with pytest.raises(RuntimeError, match="Embedding model failed"): self.search_service.search("test query") def test_search_with_vector_db_error(self): """Test search behavior when vector database fails.""" # Mock successful embedding but failed vector search self.mock_embedding_service.embed_text.return_value = [0.1, 0.2, 0.3] self.mock_vector_db.search.side_effect = RuntimeError( "Vector DB connection failed" ) with pytest.raises(RuntimeError, match="Vector DB connection failed"): self.search_service.search("test query") def test_search_with_invalid_parameters(self): """Test search with invalid parameter values.""" with pytest.raises(ValueError, match="top_k must be positive"): self.search_service.search("query", top_k=0) with pytest.raises(ValueError, match="top_k must be positive"): self.search_service.search("query", top_k=-1) with pytest.raises(ValueError, match="threshold must be between 0 and 1"): self.search_service.search("query", threshold=-0.1) with pytest.raises(ValueError, match="threshold must be between 0 and 1"): self.search_service.search("query", threshold=1.1) class TestIntegrationWithRealComponents: """Test SearchService integration with real VectorDatabase and EmbeddingService.""" def setup_method(self): """Set up real components for integration testing.""" # Create temporary directory for ChromaDB self.temp_dir = tempfile.mkdtemp() # Initialize real components self.embedding_service = EmbeddingService() self.vector_db = VectorDatabase( persist_path=self.temp_dir, collection_name="test_collection" ) self.search_service = SearchService( vector_db=self.vector_db, embedding_service=self.embedding_service ) def teardown_method(self): """Clean up temporary directory.""" shutil.rmtree(self.temp_dir, ignore_errors=True) def test_search_integration_with_real_data(self): """Test search functionality with real embedding and vector storage.""" # Add some test documents to the vector database test_texts = [ "Remote work policy allows employees to work from home", "Employee benefits include health insurance and vacation time", "Code of conduct requires professional behavior at all times", ] test_metadatas = [ {"filename": "remote_work.md", "chunk_index": 0}, {"filename": "benefits.md", "chunk_index": 0}, {"filename": "conduct.md", "chunk_index": 0}, ] # Generate embeddings and store in vector database embeddings = [] for text in test_texts: embedding = self.embedding_service.embed_text(text) embeddings.append(embedding) # Add to vector database using the bulk add_embeddings method chunk_ids = [f"doc_{i}" for i in range(len(test_texts))] self.vector_db.add_embeddings( embeddings=embeddings, chunk_ids=chunk_ids, documents=test_texts, metadatas=test_metadatas, ) # Test search functionality results = self.search_service.search("work from home", top_k=2) # Verify results assert len(results) > 0 assert "chunk_id" in results[0] assert "content" in results[0] assert "similarity_score" in results[0] assert "metadata" in results[0] # Verify similarity scores are reasonable for result in results: assert 0.0 <= result["similarity_score"] <= 1.0 # Verify results are ordered by similarity (highest first) if len(results) > 1: assert results[0]["similarity_score"] >= results[1]["similarity_score"] def test_search_quality_validation(self): """Test that search returns relevant results for policy queries.""" # This is a simplified test to verify basic search functionality # More complex relevance testing can be done in manual/integration testing # Add a simple test document test_text = "Remote work policy allows employees to work from home" embedding = self.embedding_service.embed_text(test_text) # Store document in vector database self.vector_db.add_embeddings( embeddings=[embedding], chunk_ids=["test_doc"], documents=[test_text], metadatas=[{"filename": "test.md", "chunk_index": 0}], ) # Verify we can search and get results results = self.search_service.search("remote work", top_k=1) # Basic validation assert len(results) > 0 assert results[0]["chunk_id"] == "test_doc" class TestQueryExpansion: """Test query expansion functionality.""" def setup_method(self): """Set up test fixtures for query expansion tests.""" self.mock_vector_db = Mock(spec=VectorDatabase) self.mock_embedding_service = Mock(spec=EmbeddingService) # Enable query expansion for these tests self.search_service = SearchService( vector_db=self.mock_vector_db, embedding_service=self.mock_embedding_service, enable_query_expansion=True, ) def test_query_expansion_enabled(self): """Test that query expansion works when enabled.""" # Mock embedding generation mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock vector database search results mock_raw_results = [ { "id": "doc_1", "document": "Remote work policy content...", "distance": 0.15, "metadata": {"filename": "remote_work_policy.md", "chunk_index": 0}, } ] self.mock_vector_db.search.return_value = mock_raw_results # Perform search with query that should be expanded results = self.search_service.search("work from home", top_k=1) # Verify that the query was expanded (should contain more than original query) actual_call = self.mock_embedding_service.embed_text.call_args[0][0] assert "work from home" in actual_call # Check that expansion terms were added assert any( term in actual_call for term in ["remote work", "telecommuting", "WFH"] ) # Verify results are still returned correctly assert len(results) == 1 assert results[0]["chunk_id"] == "doc_1" def test_query_expansion_disabled(self): """Test that query expansion can be disabled.""" # Create search service with expansion disabled search_service_no_expansion = SearchService( vector_db=self.mock_vector_db, embedding_service=self.mock_embedding_service, enable_query_expansion=False, ) # Mock embedding generation mock_embedding = [0.1, 0.2, 0.3, 0.4] self.mock_embedding_service.embed_text.return_value = mock_embedding # Mock vector database search results mock_raw_results = [ { "id": "doc_1", "document": "Content...", "distance": 0.15, "metadata": {"filename": "test.md", "chunk_index": 0}, } ] self.mock_vector_db.search.return_value = mock_raw_results # Perform search original_query = "work from home" results = search_service_no_expansion.search(original_query, top_k=1) # Verify that the original query was used without expansion self.mock_embedding_service.embed_text.assert_called_with(original_query) # Verify results are returned assert len(results) == 1 assert 0.0 <= results[0]["similarity_score"] <= 1.0