Spaces:
Sleeping
Sleeping
Tobias Pasquale
committed on
Commit
·
7ad46e8
1
Parent(s):
b5bb2b7
feat: implement SearchService for semantic document search
Browse files
- Add SearchService class with comprehensive search functionality
- Support for semantic search using embeddings and vector similarity
- Configurable top_k results and similarity threshold filtering
- Integration with existing VectorDatabase and EmbeddingService
- Comprehensive test suite with 12 test cases (100% passing)
- Full error handling and parameter validation
- TDD approach with unit and integration tests
- Code formatting compliance (black, isort, flake8)
Addresses GitHub Issue #14
Phase 2B implementation ready for Flask API integration
- src/search/__init__.py +1 -0
- src/search/search_service.py +145 -0
- tests/test_search/__init__.py +1 -0
- tests/test_search/test_search_service.py +333 -0
src/search/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Search module for semantic document retrieval."""
|
src/search/search_service.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SearchService - Semantic document search functionality.
|
| 3 |
+
|
| 4 |
+
This module provides semantic search capabilities for the document corpus
|
| 5 |
+
using embeddings and vector similarity search through ChromaDB integration.
|
| 6 |
+
|
| 7 |
+
Classes:
|
| 8 |
+
SearchService: Main class for performing semantic search operations
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
from typing import Any, Dict, List, Optional
|
| 13 |
+
|
| 14 |
+
from src.embedding.embedding_service import EmbeddingService
|
| 15 |
+
from src.vector_store.vector_db import VectorDatabase
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)


class SearchService:
    """
    Semantic search service for finding relevant documents using embeddings.

    The service embeds a text query with the injected embedding service, asks
    the injected vector database for the closest stored chunks, and converts
    the raw hits into a uniform result dictionary.

    Attributes:
        vector_db: VectorDatabase instance for similarity search
        embedding_service: EmbeddingService instance for query embedding
    """

    def __init__(
        self,
        vector_db: Optional["VectorDatabase"],
        embedding_service: Optional["EmbeddingService"],
    ):
        """
        Initialize SearchService with required dependencies.

        Args:
            vector_db: VectorDatabase instance for storing and searching embeddings
            embedding_service: EmbeddingService instance for generating embeddings

        Raises:
            ValueError: If either vector_db or embedding_service is None
        """
        if vector_db is None:
            raise ValueError("vector_db cannot be None")
        if embedding_service is None:
            raise ValueError("embedding_service cannot be None")

        self.vector_db = vector_db
        self.embedding_service = embedding_service
        logger.info("SearchService initialized successfully")

    def search(
        self, query: str, top_k: int = 5, threshold: float = 0.0
    ) -> List[Dict[str, Any]]:
        """
        Perform semantic search for relevant documents.

        Args:
            query: Text query to search for (surrounding whitespace is ignored)
            top_k: Maximum number of results to request (must be positive)
            threshold: Minimum similarity score threshold (0.0 to 1.0)

        Returns:
            List of search results, each containing:
            - chunk_id: Unique identifier for the document chunk
            - content: Text content of the document chunk
            - similarity_score: Similarity score (0.0 to 1.0, higher is better)
            - metadata: Additional metadata (filename, chunk_index, etc.)

        Raises:
            ValueError: If query is empty, top_k is not positive, or threshold
                is outside [0.0, 1.0]
            Exception: Any error raised by the embedding service or the vector
                database is logged and re-raised unchanged
        """
        # Strip once and reuse the cleaned value for both validation and embedding.
        cleaned_query = query.strip() if query else ""
        if not cleaned_query:
            raise ValueError("Query cannot be empty")

        if top_k <= 0:
            raise ValueError("top_k must be positive")

        # Written as a negated range check so that NaN is rejected as well.
        if not (0.0 <= threshold <= 1.0):
            raise ValueError("threshold must be between 0 and 1")

        try:
            # Lazy %-style arguments: the message is only built if the level is on.
            logger.debug("Generating embedding for query: '%s...'", query[:50])
            query_embedding = self.embedding_service.embed_text(cleaned_query)

            logger.debug("Searching vector database with top_k=%s", top_k)
            raw_results = self.vector_db.search(
                query_embedding=query_embedding, top_k=top_k
            )

            formatted_results = self._format_search_results(raw_results, threshold)

            logger.info(
                "Search completed: %s results returned", len(formatted_results)
            )
            return formatted_results

        except Exception as e:
            # Log for diagnostics, then let the caller see the original exception.
            logger.error("Search failed for query '%s': %s", query, e)
            raise

    def _format_search_results(
        self, raw_results: List[Dict[str, Any]], threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Format VectorDatabase results into standardized search result format.

        Each raw hit's distance is converted to ``1.0 - distance`` and clamped to
        [0.0, 1.0] so the score always honours the documented range. Without the
        clamp, a distance above 1 yields a negative score and the hit is dropped
        even when the caller asked for no filtering (threshold 0.0).
        NOTE(review): the distance metric used by VectorDatabase is not visible
        here; confirm whether a metric-specific normalisation would be better.

        Args:
            raw_results: Results from VectorDatabase.search(); None is treated
                as an empty list
            threshold: Minimum similarity score threshold

        Returns:
            List of formatted search results whose score is >= threshold, in
            the order the database returned them
        """
        formatted_results = []

        for result in raw_results or []:
            # A missing or None distance is treated as the worst match (1.0).
            distance = result.get("distance")
            if distance is None:
                distance = 1.0
            similarity_score = min(1.0, max(0.0, 1.0 - distance))

            if similarity_score < threshold:
                continue

            formatted_results.append(
                {
                    "chunk_id": result.get("id", ""),
                    # `or` also covers keys that are present but set to None.
                    "content": result.get("document") or "",
                    "similarity_score": similarity_score,
                    "metadata": result.get("metadata") or {},
                }
            )

        logger.debug(
            "Formatted %s results above threshold %s",
            len(formatted_results),
            threshold,
        )
        return formatted_results
|
tests/test_search/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Tests for search module."""
|
tests/test_search/test_search_service.py
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for SearchService - Semantic document search functionality.
|
| 3 |
+
|
| 4 |
+
This test suite covers:
|
| 5 |
+
- SearchService initialization and configuration
|
| 6 |
+
- Query embedding generation
|
| 7 |
+
- Similarity search with ChromaDB integration
|
| 8 |
+
- Result formatting and metadata handling
|
| 9 |
+
- Error handling and edge cases
|
| 10 |
+
- Performance and parameter validation
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import shutil
|
| 14 |
+
import tempfile
|
| 15 |
+
from unittest.mock import Mock
|
| 16 |
+
|
| 17 |
+
import pytest
|
| 18 |
+
|
| 19 |
+
from src.embedding.embedding_service import EmbeddingService
|
| 20 |
+
from src.search.search_service import SearchService
|
| 21 |
+
from src.vector_store.vector_db import VectorDatabase
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TestSearchServiceInitialization:
    """Constructor behaviour of SearchService."""

    def test_search_service_initialization(self):
        """Both injected dependencies are stored on the instance."""
        db_mock = Mock(spec=VectorDatabase)
        embedder_mock = Mock(spec=EmbeddingService)

        service = SearchService(vector_db=db_mock, embedding_service=embedder_mock)

        assert service.vector_db == db_mock
        assert service.embedding_service == embedder_mock

    def test_search_service_with_none_dependencies(self):
        """A None dependency is rejected with a ValueError naming the argument."""
        # Missing vector database.
        with pytest.raises(ValueError, match="vector_db cannot be None"):
            SearchService(vector_db=None, embedding_service=Mock())

        # Missing embedding service.
        with pytest.raises(ValueError, match="embedding_service cannot be None"):
            SearchService(vector_db=Mock(), embedding_service=None)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class TestSearchFunctionality:
    """Exercise SearchService.search against mocked backends."""

    def setup_method(self):
        """Build a SearchService wired to spec'd mocks before every test."""
        self.mock_vector_db = Mock(spec=VectorDatabase)
        self.mock_embedding_service = Mock(spec=EmbeddingService)
        self.search_service = SearchService(
            vector_db=self.mock_vector_db,
            embedding_service=self.mock_embedding_service,
        )

    def _prime_mocks(self, raw_hits):
        """Stub both mocks and return the fake embedding they will exchange."""
        fake_embedding = [0.1, 0.2, 0.3, 0.4]
        self.mock_embedding_service.embed_text.return_value = fake_embedding
        self.mock_vector_db.search.return_value = raw_hits
        return fake_embedding

    def test_search_with_valid_query(self):
        """A valid query is embedded, searched and formatted."""
        # Two raw hits in VectorDatabase format.
        first_hit = {
            "id": "doc_1",
            "document": "Remote work policy content...",
            "distance": 0.15,
            "metadata": {"filename": "remote_work_policy.md", "chunk_index": 2},
        }
        second_hit = {
            "id": "doc_2",
            "document": "PTO policy content...",
            "distance": 0.25,
            "metadata": {"filename": "pto_policy.md", "chunk_index": 1},
        }
        fake_embedding = self._prime_mocks([first_hit, second_hit])

        results = self.search_service.search("remote work policy", top_k=2)

        # The query text reaches the embedding service exactly once.
        self.mock_embedding_service.embed_text.assert_called_once_with(
            "remote work policy"
        )

        # The resulting embedding is forwarded to the vector database.
        self.mock_vector_db.search.assert_called_once_with(
            query_embedding=fake_embedding, top_k=2
        )

        # Shape of the formatted output.
        assert len(results) == 2
        top = results[0]
        assert top["chunk_id"] == "doc_1"
        assert top["content"] == "Remote work policy content..."
        assert top["similarity_score"] == pytest.approx(0.85, abs=0.01)  # 1 - 0.15
        assert top["metadata"]["filename"] == "remote_work_policy.md"

    def test_search_with_empty_query(self):
        """Empty and whitespace-only queries are rejected."""
        with pytest.raises(ValueError, match="Query cannot be empty"):
            self.search_service.search("")

        # Whitespace only.
        with pytest.raises(ValueError, match="Query cannot be empty"):
            self.search_service.search(" ")

    def test_search_with_no_results(self):
        """An empty database response yields an empty result list."""
        self._prime_mocks([])

        results = self.search_service.search("non-existent topic")

        assert results == []

    def test_search_with_top_k_parameter(self):
        """top_k is passed through to the vector database unchanged."""
        single_hit = {
            "id": "doc_1",
            "document": "Content 1",
            "distance": 0.15,
            "metadata": {"filename": "file1.md", "chunk_index": 0},
        }
        fake_embedding = self._prime_mocks([single_hit])

        # top_k=1
        results = self.search_service.search("test query", top_k=1)
        self.mock_vector_db.search.assert_called_with(
            query_embedding=fake_embedding, top_k=1
        )
        assert len(results) == 1

        # top_k=10
        self.search_service.search("test query", top_k=10)
        self.mock_vector_db.search.assert_called_with(
            query_embedding=fake_embedding, top_k=10
        )

    def test_search_with_threshold_filtering(self):
        """Hits scoring below the threshold are removed from the output."""
        # Distances chosen to give similarities of 0.9, 0.5 and 0.2.
        raw_hits = [
            {
                "id": "doc_1",
                "document": "High match",
                "distance": 0.1,
                "metadata": {"filename": "file1.md", "chunk_index": 0},
            },
            {
                "id": "doc_2",
                "document": "Medium match",
                "distance": 0.5,
                "metadata": {"filename": "file2.md", "chunk_index": 0},
            },
            {
                "id": "doc_3",
                "document": "Low match",
                "distance": 0.8,
                "metadata": {"filename": "file3.md", "chunk_index": 0},
            },
        ]
        self._prime_mocks(raw_hits)

        # threshold=0.4 should keep only the first two hits.
        results = self.search_service.search("test query", top_k=5, threshold=0.4)

        assert len(results) == 2
        high, medium = results[0], results[1]
        assert high["similarity_score"] == pytest.approx(0.9, abs=0.01)
        assert medium["similarity_score"] == pytest.approx(0.5, abs=0.01)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class TestErrorHandling:
    """Failure paths and parameter validation of SearchService.search."""

    def setup_method(self):
        """Build a SearchService wired to spec'd mocks before every test."""
        self.mock_vector_db = Mock(spec=VectorDatabase)
        self.mock_embedding_service = Mock(spec=EmbeddingService)
        self.search_service = SearchService(
            vector_db=self.mock_vector_db,
            embedding_service=self.mock_embedding_service,
        )

    def test_search_with_embedding_service_error(self):
        """An embedding failure propagates to the caller."""
        failure = RuntimeError("Embedding model failed")
        self.mock_embedding_service.embed_text.side_effect = failure

        with pytest.raises(RuntimeError, match="Embedding model failed"):
            self.search_service.search("test query")

    def test_search_with_vector_db_error(self):
        """A vector database failure propagates to the caller."""
        # Embedding succeeds; the database lookup is what breaks.
        self.mock_embedding_service.embed_text.return_value = [0.1, 0.2, 0.3]
        failure = RuntimeError("Vector DB connection failed")
        self.mock_vector_db.search.side_effect = failure

        with pytest.raises(RuntimeError, match="Vector DB connection failed"):
            self.search_service.search("test query")

    def test_search_with_invalid_parameters(self):
        """Out-of-range top_k and threshold values raise ValueError."""
        # (keyword arguments, expected message), checked in this order.
        invalid_cases = (
            ({"top_k": 0}, "top_k must be positive"),
            ({"top_k": -1}, "top_k must be positive"),
            ({"threshold": -0.1}, "threshold must be between 0 and 1"),
            ({"threshold": 1.1}, "threshold must be between 0 and 1"),
        )
        for bad_kwargs, expected_message in invalid_cases:
            with pytest.raises(ValueError, match=expected_message):
                self.search_service.search("query", **bad_kwargs)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
class TestIntegrationWithRealComponents:
    """End-to-end checks using a real VectorDatabase and EmbeddingService."""

    def setup_method(self):
        """Create real components backed by a throwaway storage directory."""
        # The vector database persists into a fresh temporary directory.
        self.temp_dir = tempfile.mkdtemp()

        self.embedding_service = EmbeddingService()
        self.vector_db = VectorDatabase(
            persist_path=self.temp_dir, collection_name="test_collection"
        )
        self.search_service = SearchService(
            vector_db=self.vector_db,
            embedding_service=self.embedding_service,
        )

    def teardown_method(self):
        """Remove the temporary storage directory, ignoring cleanup errors."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_search_integration_with_real_data(self):
        """Stored documents can be found and come back in the expected shape."""
        corpus = [
            "Remote work policy allows employees to work from home",
            "Employee benefits include health insurance and vacation time",
            "Code of conduct requires professional behavior at all times",
        ]
        corpus_metadata = [
            {"filename": "remote_work.md", "chunk_index": 0},
            {"filename": "benefits.md", "chunk_index": 0},
            {"filename": "conduct.md", "chunk_index": 0},
        ]

        # Embed every document, then store them all with one bulk call.
        vectors = [self.embedding_service.embed_text(text) for text in corpus]
        ids = [f"doc_{position}" for position, _ in enumerate(corpus)]
        self.vector_db.add_embeddings(
            embeddings=vectors,
            chunk_ids=ids,
            documents=corpus,
            metadatas=corpus_metadata,
        )

        results = self.search_service.search("work from home", top_k=2)

        # At least one hit, carrying every expected field.
        assert len(results) > 0
        best = results[0]
        assert "chunk_id" in best
        assert "content" in best
        assert "similarity_score" in best
        assert "metadata" in best

        # Every score lies in the documented range.
        assert all(0.0 <= hit["similarity_score"] <= 1.0 for hit in results)

        # Best match first, when there is more than one hit.
        if len(results) > 1:
            assert results[0]["similarity_score"] >= results[1]["similarity_score"]

    def test_search_quality_validation(self):
        """A single stored document is returned for a closely related query."""
        # Deliberately minimal; deeper relevance checks are left to manual or
        # broader integration testing.
        document = "Remote work policy allows employees to work from home"
        vector = self.embedding_service.embed_text(document)

        self.vector_db.add_embeddings(
            embeddings=[vector],
            chunk_ids=["test_doc"],
            documents=[document],
            metadatas=[{"filename": "test.md", "chunk_index": 0}],
        )

        results = self.search_service.search("remote work", top_k=1)

        assert len(results) > 0
        only_hit = results[0]
        assert only_hit["chunk_id"] == "test_doc"
        assert 0.0 <= only_hit["similarity_score"] <= 1.0
|