Tobias Pasquale committed on
Commit
7ad46e8
·
1 Parent(s): b5bb2b7

feat: implement SearchService for semantic document search

Browse files

- Add SearchService class with comprehensive search functionality
- Support for semantic search using embeddings and vector similarity
- Configurable top_k results and similarity threshold filtering
- Integration with existing VectorDatabase and EmbeddingService
- Comprehensive test suite with 12 test cases (100% passing)
- Full error handling and parameter validation
- TDD approach with unit and integration tests
- Code formatting compliance (black, isort, flake8)

Addresses GitHub Issue #14
Phase 2B implementation ready for Flask API integration

src/search/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Search module for semantic document retrieval."""
src/search/search_service.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SearchService - Semantic document search functionality.
3
+
4
+ This module provides semantic search capabilities for the document corpus
5
+ using embeddings and vector similarity search through ChromaDB integration.
6
+
7
+ Classes:
8
+ SearchService: Main class for performing semantic search operations
9
+ """
10
+
11
+ import logging
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ from src.embedding.embedding_service import EmbeddingService
15
+ from src.vector_store.vector_db import VectorDatabase
16
+
17
logger = logging.getLogger(__name__)


class SearchService:
    """
    Semantic search service for finding relevant documents using embeddings.

    The service embeds the query text with the embedding service, asks the
    vector database for the closest stored chunks, and converts the raw hits
    into a standardized result dictionary with a similarity score.

    Attributes:
        vector_db: VectorDatabase instance for similarity search
        embedding_service: EmbeddingService instance for query embedding
    """

    def __init__(
        self,
        vector_db: Optional["VectorDatabase"],
        embedding_service: Optional["EmbeddingService"],
    ):
        """
        Initialize SearchService with required dependencies.

        Args:
            vector_db: VectorDatabase instance used to run similarity searches
            embedding_service: EmbeddingService instance used to embed queries

        Raises:
            ValueError: If either vector_db or embedding_service is None
        """
        if vector_db is None:
            raise ValueError("vector_db cannot be None")
        if embedding_service is None:
            raise ValueError("embedding_service cannot be None")

        self.vector_db = vector_db
        self.embedding_service = embedding_service
        logger.info("SearchService initialized successfully")

    def search(
        self, query: str, top_k: int = 5, threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Perform semantic search for relevant documents.

        Args:
            query: Text query to search for
            top_k: Maximum number of results to request (must be positive)
            threshold: Minimum similarity score threshold (0.0 to 1.0)

        Returns:
            List of search results, in the order returned by the vector
            database, each containing:
                - chunk_id: Unique identifier for the document chunk
                - content: Text content of the document chunk
                - similarity_score: Similarity score (0.0 to 1.0, higher is better)
                - metadata: Additional metadata (filename, chunk_index, etc.)

        Raises:
            ValueError: If query is empty, top_k is not positive, or threshold
                is outside [0.0, 1.0]
            Exception: Any error raised by the embedding service or the vector
                database is logged and re-raised unchanged
        """
        # Validate input parameters before touching either collaborator.
        if not query or not query.strip():
            raise ValueError("Query cannot be empty")

        if top_k <= 0:
            raise ValueError("top_k must be positive")

        if not (0.0 <= threshold <= 1.0):
            raise ValueError("threshold must be between 0 and 1")

        cleaned_query = query.strip()

        try:
            # Only the first 50 characters are logged to keep debug output short.
            logger.debug("Generating embedding for query: '%s...'", query[:50])
            query_embedding = self.embedding_service.embed_text(cleaned_query)

            logger.debug("Searching vector database with top_k=%s", top_k)
            raw_results = self.vector_db.search(
                query_embedding=query_embedding, top_k=top_k
            )

            formatted_results = self._format_search_results(raw_results, threshold)
        except Exception as e:
            # Log for diagnostics, then let the caller see the original error.
            logger.error("Search failed for query '%s': %s", query, e)
            raise

        logger.info("Search completed: %d results returned", len(formatted_results))
        return formatted_results

    def _format_search_results(
        self, raw_results: List[Dict[str, Any]], threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Format VectorDatabase results into standardized search result format.

        The similarity score is ``1.0 - distance`` clamped to [0.0, 1.0], so a
        distance above 1 yields a score of 0.0 instead of a negative number and
        a threshold of 0.0 keeps every hit.

        NOTE(review): this assumes distances where 0 means identical and values
        of 1 or more mean unrelated; confirm against the distance metric
        configured in VectorDatabase.

        Args:
            raw_results: Results from VectorDatabase.search(); each item is a
                dict read via the keys "id", "document", "distance", "metadata"
            threshold: Minimum similarity score threshold

        Returns:
            List of formatted search results whose score is >= threshold,
            in the same relative order as raw_results
        """
        formatted_results = []

        for result in raw_results:
            # A missing or null distance is treated as the default 1.0.
            distance = result.get("distance")
            if distance is None:
                distance = 1.0

            # Clamp so the score always honours the documented 0.0-1.0 range.
            similarity_score = min(1.0, max(0.0, 1.0 - distance))

            if similarity_score < threshold:
                continue

            formatted_results.append(
                {
                    "chunk_id": result.get("id", ""),
                    "content": result.get("document", ""),
                    "similarity_score": similarity_score,
                    "metadata": result.get("metadata", {}),
                }
            )

        logger.debug(
            "Formatted %d results above threshold %s",
            len(formatted_results),
            threshold,
        )
        return formatted_results
tests/test_search/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Tests for search module."""
tests/test_search/test_search_service.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for SearchService - Semantic document search functionality.
3
+
4
+ This test suite covers:
5
+ - SearchService initialization and configuration
6
+ - Query embedding generation
7
+ - Similarity search with ChromaDB integration
8
+ - Result formatting and metadata handling
9
+ - Error handling and edge cases
10
+ - Performance and parameter validation
11
+ """
12
+
13
+ import shutil
14
+ import tempfile
15
+ from unittest.mock import Mock
16
+
17
+ import pytest
18
+
19
+ from src.embedding.embedding_service import EmbeddingService
20
+ from src.search.search_service import SearchService
21
+ from src.vector_store.vector_db import VectorDatabase
22
+
23
+
24
class TestSearchServiceInitialization:
    """Construction-time behaviour of SearchService."""

    def test_search_service_initialization(self):
        """Both collaborators handed to the constructor are kept on the instance."""
        db_double = Mock(spec=VectorDatabase)
        embedder_double = Mock(spec=EmbeddingService)

        service = SearchService(
            embedding_service=embedder_double, vector_db=db_double
        )

        assert service.vector_db == db_double
        assert service.embedding_service == embedder_double

    def test_search_service_with_none_dependencies(self):
        """A missing collaborator is rejected with a descriptive ValueError."""
        # Checked in this order: vector_db first, then embedding_service.
        rejected_calls = (
            (
                {"vector_db": None, "embedding_service": Mock()},
                "vector_db cannot be None",
            ),
            (
                {"vector_db": Mock(), "embedding_service": None},
                "embedding_service cannot be None",
            ),
        )

        for constructor_kwargs, expected_message in rejected_calls:
            with pytest.raises(ValueError, match=expected_message):
                SearchService(**constructor_kwargs)
46
+
47
+
48
class TestSearchFunctionality:
    """Behaviour of SearchService.search() against mocked collaborators."""

    def setup_method(self):
        """Build a SearchService wired to spec'd mocks before every test."""
        self.mock_embedding_service = Mock(spec=EmbeddingService)
        self.mock_vector_db = Mock(spec=VectorDatabase)
        self.search_service = SearchService(
            vector_db=self.mock_vector_db,
            embedding_service=self.mock_embedding_service,
        )

    @staticmethod
    def _hit(chunk_id, text, distance, filename, chunk_index):
        """Return one raw hit in the dict shape the mocked database hands back."""
        return {
            "id": chunk_id,
            "document": text,
            "distance": distance,
            "metadata": {"filename": filename, "chunk_index": chunk_index},
        }

    def _prime(self, hits):
        """Program both mocks and return the query vector the embedder yields."""
        query_vector = [0.1, 0.2, 0.3, 0.4]
        self.mock_embedding_service.embed_text.return_value = query_vector
        self.mock_vector_db.search.return_value = hits
        return query_vector

    def test_search_with_valid_query(self):
        """A normal query is embedded, searched, and formatted."""
        query_vector = self._prime(
            [
                self._hit(
                    "doc_1",
                    "Remote work policy content...",
                    0.15,
                    "remote_work_policy.md",
                    2,
                ),
                self._hit("doc_2", "PTO policy content...", 0.25, "pto_policy.md", 1),
            ]
        )

        found = self.search_service.search("remote work policy", top_k=2)

        # Each collaborator is hit exactly once with the expected arguments.
        self.mock_embedding_service.embed_text.assert_called_once_with(
            "remote work policy"
        )
        self.mock_vector_db.search.assert_called_once_with(
            query_embedding=query_vector, top_k=2
        )

        assert len(found) == 2
        best = found[0]
        assert best["chunk_id"] == "doc_1"
        assert best["content"] == "Remote work policy content..."
        # 1 - 0.15
        assert best["similarity_score"] == pytest.approx(0.85, abs=0.01)
        assert best["metadata"]["filename"] == "remote_work_policy.md"

    def test_search_with_empty_query(self):
        """Empty and whitespace-only queries are rejected."""
        for blank_query in ("", " "):
            with pytest.raises(ValueError, match="Query cannot be empty"):
                self.search_service.search(blank_query)

    def test_search_with_no_results(self):
        """An empty hit list from the database yields an empty result list."""
        self._prime([])

        found = self.search_service.search("non-existent topic")

        assert found == []

    def test_search_with_top_k_parameter(self):
        """The requested top_k is forwarded to the vector database unchanged."""
        query_vector = self._prime(
            [self._hit("doc_1", "Content 1", 0.15, "file1.md", 0)]
        )

        # top_k=1
        found = self.search_service.search("test query", top_k=1)
        self.mock_vector_db.search.assert_called_with(
            query_embedding=query_vector, top_k=1
        )
        assert len(found) == 1

        # top_k=10
        self.search_service.search("test query", top_k=10)
        self.mock_vector_db.search.assert_called_with(
            query_embedding=query_vector, top_k=10
        )

    def test_search_with_threshold_filtering(self):
        """Hits whose similarity falls below the threshold are dropped."""
        self._prime(
            [
                self._hit("doc_1", "High match", 0.1, "file1.md", 0),  # similarity 0.9
                self._hit("doc_2", "Medium match", 0.5, "file2.md", 0),  # similarity 0.5
                self._hit("doc_3", "Low match", 0.8, "file3.md", 0),  # similarity 0.2
            ]
        )

        # With threshold=0.4 only the first two hits qualify.
        found = self.search_service.search("test query", top_k=5, threshold=0.4)

        assert len(found) == 2
        high, medium = found
        assert high["similarity_score"] == pytest.approx(0.9, abs=0.01)
        assert medium["similarity_score"] == pytest.approx(0.5, abs=0.01)
193
+
194
+
195
class TestErrorHandling:
    """Failure paths and parameter validation of SearchService.search()."""

    def setup_method(self):
        """Build a SearchService wired to spec'd mocks before every test."""
        self.mock_embedding_service = Mock(spec=EmbeddingService)
        self.mock_vector_db = Mock(spec=VectorDatabase)
        self.search_service = SearchService(
            vector_db=self.mock_vector_db,
            embedding_service=self.mock_embedding_service,
        )

    def test_search_with_embedding_service_error(self):
        """An embedding failure surfaces to the caller unchanged."""
        embedding_failure = RuntimeError("Embedding model failed")
        self.mock_embedding_service.embed_text.side_effect = embedding_failure

        with pytest.raises(RuntimeError, match="Embedding model failed"):
            self.search_service.search("test query")

    def test_search_with_vector_db_error(self):
        """A database failure after a successful embedding surfaces unchanged."""
        self.mock_embedding_service.embed_text.return_value = [0.1, 0.2, 0.3]
        database_failure = RuntimeError("Vector DB connection failed")
        self.mock_vector_db.search.side_effect = database_failure

        with pytest.raises(RuntimeError, match="Vector DB connection failed"):
            self.search_service.search("test query")

    def test_search_with_invalid_parameters(self):
        """Out-of-range top_k and threshold values raise ValueError."""
        invalid_calls = (
            ({"top_k": 0}, "top_k must be positive"),
            ({"top_k": -1}, "top_k must be positive"),
            ({"threshold": -0.1}, "threshold must be between 0 and 1"),
            ({"threshold": 1.1}, "threshold must be between 0 and 1"),
        )

        for bad_kwargs, expected_message in invalid_calls:
            with pytest.raises(ValueError, match=expected_message):
                self.search_service.search("query", **bad_kwargs)
240
+
241
+
242
class TestIntegrationWithRealComponents:
    """Test SearchService integration with real VectorDatabase and EmbeddingService."""

    def setup_method(self):
        """
        Set up real components backed by a fresh temporary directory.

        If building any component fails, the temporary directory is removed
        here, because pytest does not call teardown_method after a failing
        setup_method and the directory would otherwise be leaked.
        """
        # Temporary directory handed to VectorDatabase as its persist path.
        self.temp_dir = tempfile.mkdtemp()

        try:
            self.embedding_service = EmbeddingService()
            self.vector_db = VectorDatabase(
                persist_path=self.temp_dir, collection_name="test_collection"
            )
            self.search_service = SearchService(
                vector_db=self.vector_db, embedding_service=self.embedding_service
            )
        except Exception:
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            raise

    def teardown_method(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_search_integration_with_real_data(self):
        """Test search functionality with real embedding and vector storage."""
        test_texts = [
            "Remote work policy allows employees to work from home",
            "Employee benefits include health insurance and vacation time",
            "Code of conduct requires professional behavior at all times",
        ]
        test_metadatas = [
            {"filename": "remote_work.md", "chunk_index": 0},
            {"filename": "benefits.md", "chunk_index": 0},
            {"filename": "conduct.md", "chunk_index": 0},
        ]

        # Generate one embedding per document, in document order.
        embeddings = [self.embedding_service.embed_text(text) for text in test_texts]

        # Add to vector database using the bulk add_embeddings method.
        chunk_ids = [f"doc_{i}" for i in range(len(test_texts))]
        self.vector_db.add_embeddings(
            embeddings=embeddings,
            chunk_ids=chunk_ids,
            documents=test_texts,
            metadatas=test_metadatas,
        )

        results = self.search_service.search("work from home", top_k=2)

        # Every result exposes the standardized keys.
        assert len(results) > 0
        assert "chunk_id" in results[0]
        assert "content" in results[0]
        assert "similarity_score" in results[0]
        assert "metadata" in results[0]

        # Similarity scores stay inside the documented range.
        for result in results:
            assert 0.0 <= result["similarity_score"] <= 1.0

        # Results are ordered by similarity (highest first).
        if len(results) > 1:
            assert results[0]["similarity_score"] >= results[1]["similarity_score"]

    def test_search_quality_validation(self):
        """Test that search returns relevant results for policy queries."""
        # Simplified smoke test; deeper relevance checks are left to
        # manual/integration testing.
        test_text = "Remote work policy allows employees to work from home"
        embedding = self.embedding_service.embed_text(test_text)

        self.vector_db.add_embeddings(
            embeddings=[embedding],
            chunk_ids=["test_doc"],
            documents=[test_text],
            metadatas=[{"filename": "test.md", "chunk_index": 0}],
        )

        results = self.search_service.search("remote work", top_k=1)

        assert len(results) > 0
        assert results[0]["chunk_id"] == "test_doc"
        assert 0.0 <= results[0]["similarity_score"] <= 1.0