Tobias Pasquale committed on
Commit
a3dfc07
·
1 Parent(s): aff5d04

fix: apply formatting and linting fixes for CI/CD compliance

Browse files

- Remove unused Union import from typing
- Fix E501 line length violations by breaking long lines
- Apply black and isort formatting
- Fix flake8 compliance issues

All pre-commit hooks now pass locally

app.py CHANGED
@@ -24,6 +24,7 @@ def ingest():
24
  """Endpoint to trigger document ingestion with embeddings"""
25
  try:
26
  from flask import request
 
27
  from src.config import (
28
  CORPUS_DIRECTORY,
29
  DEFAULT_CHUNK_SIZE,
@@ -35,12 +36,12 @@ def ingest():
35
  # Get optional parameters from request
36
  data = request.get_json() if request.is_json else {}
37
  store_embeddings = data.get("store_embeddings", True)
38
-
39
  pipeline = IngestionPipeline(
40
- chunk_size=DEFAULT_CHUNK_SIZE,
41
- overlap=DEFAULT_OVERLAP,
42
  seed=RANDOM_SEED,
43
- store_embeddings=store_embeddings
44
  )
45
 
46
  result = pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
@@ -52,13 +53,18 @@ def ingest():
52
  "files_processed": result["files_processed"],
53
  "embeddings_stored": result["embeddings_stored"],
54
  "store_embeddings": result["store_embeddings"],
55
- "message": f"Successfully processed {result['chunks_processed']} chunks from {result['files_processed']} files"
 
 
 
56
  }
57
-
58
  # Include failed files info if any
59
  if result["failed_files"]:
60
  response["failed_files"] = result["failed_files"]
61
- response["warnings"] = f"{len(result['failed_files'])} files failed to process"
 
 
62
 
63
  return jsonify(response)
64
 
 
24
  """Endpoint to trigger document ingestion with embeddings"""
25
  try:
26
  from flask import request
27
+
28
  from src.config import (
29
  CORPUS_DIRECTORY,
30
  DEFAULT_CHUNK_SIZE,
 
36
  # Get optional parameters from request
37
  data = request.get_json() if request.is_json else {}
38
  store_embeddings = data.get("store_embeddings", True)
39
+
40
  pipeline = IngestionPipeline(
41
+ chunk_size=DEFAULT_CHUNK_SIZE,
42
+ overlap=DEFAULT_OVERLAP,
43
  seed=RANDOM_SEED,
44
+ store_embeddings=store_embeddings,
45
  )
46
 
47
  result = pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
 
53
  "files_processed": result["files_processed"],
54
  "embeddings_stored": result["embeddings_stored"],
55
  "store_embeddings": result["store_embeddings"],
56
+ "message": (
57
+ f"Successfully processed {result['chunks_processed']} chunks "
58
+ f"from {result['files_processed']} files"
59
+ ),
60
  }
61
+
62
  # Include failed files info if any
63
  if result["failed_files"]:
64
  response["failed_files"] = result["failed_files"]
65
+ response[
66
+ "warnings"
67
+ ] = f"{len(result['failed_files'])} files failed to process"
68
 
69
  return jsonify(response)
70
 
src/ingestion/ingestion_pipeline.py CHANGED
@@ -1,5 +1,5 @@
1
  from pathlib import Path
2
- from typing import Any, Dict, List, Optional, Union
3
 
4
  from ..embedding.embedding_service import EmbeddingService
5
  from ..vector_store.vector_db import VectorDatabase
@@ -11,13 +11,13 @@ class IngestionPipeline:
11
  """Complete ingestion pipeline for processing document corpus with embeddings"""
12
 
13
  def __init__(
14
- self,
15
- chunk_size: int = 1000,
16
- overlap: int = 200,
17
  seed: int = 42,
18
  store_embeddings: bool = True,
19
  vector_db: Optional[VectorDatabase] = None,
20
- embedding_service: Optional[EmbeddingService] = None
21
  ):
22
  """
23
  Initialize the ingestion pipeline
@@ -36,15 +36,15 @@ class IngestionPipeline:
36
  )
37
  self.seed = seed
38
  self.store_embeddings = store_embeddings
39
-
40
  # Initialize embedding components if storing embeddings
41
  if store_embeddings:
42
  self.embedding_service = embedding_service or EmbeddingService()
43
  if vector_db is None:
44
  from ..config import COLLECTION_NAME, VECTOR_DB_PERSIST_PATH
 
45
  self.vector_db = VectorDatabase(
46
- persist_path=VECTOR_DB_PERSIST_PATH,
47
- collection_name=COLLECTION_NAME
48
  )
49
  else:
50
  self.vector_db = vector_db
@@ -118,7 +118,12 @@ class IngestionPipeline:
118
  continue
119
 
120
  # Generate and store embeddings if enabled
121
- if self.store_embeddings and all_chunks and self.embedding_service and self.vector_db:
 
 
 
 
 
122
  try:
123
  embeddings_stored = self._store_embeddings_batch(all_chunks)
124
  except Exception as e:
@@ -131,7 +136,7 @@ class IngestionPipeline:
131
  "failed_files": failed_files,
132
  "embeddings_stored": embeddings_stored,
133
  "store_embeddings": self.store_embeddings,
134
- "chunks": all_chunks # Include chunks for backward compatibility
135
  }
136
 
137
  def process_file(self, file_path: str) -> List[Dict[str, Any]]:
@@ -157,44 +162,47 @@ class IngestionPipeline:
157
  def _store_embeddings_batch(self, chunks: List[Dict[str, Any]]) -> int:
158
  """
159
  Generate embeddings and store chunks in vector database
160
-
161
  Args:
162
  chunks: List of text chunks with metadata
163
-
164
  Returns:
165
  Number of embeddings stored successfully
166
  """
167
  if not self.embedding_service or not self.vector_db:
168
  return 0
169
-
170
  stored_count = 0
171
  batch_size = 32 # Process in batches for memory efficiency
172
-
173
  for i in range(0, len(chunks), batch_size):
174
- batch = chunks[i:i + batch_size]
175
-
176
  try:
177
  # Extract texts and prepare data for vector storage
178
  texts = [chunk["content"] for chunk in batch]
179
  chunk_ids = [chunk["metadata"]["chunk_id"] for chunk in batch]
180
  metadatas = [chunk["metadata"] for chunk in batch]
181
-
182
  # Generate embeddings for the batch
183
  embeddings = self.embedding_service.embed_texts(texts)
184
-
185
  # Store in vector database
186
  self.vector_db.add_embeddings(
187
  embeddings=embeddings,
188
  chunk_ids=chunk_ids,
189
  documents=texts,
190
- metadatas=metadatas
191
  )
192
-
193
  stored_count += len(batch)
194
- print(f"Stored embeddings for batch {i // batch_size + 1}: {len(batch)} chunks")
195
-
 
 
 
196
  except Exception as e:
197
  print(f"Warning: Failed to store batch {i // batch_size + 1}: {e}")
198
  continue
199
-
200
  return stored_count
 
1
  from pathlib import Path
2
+ from typing import Any, Dict, List, Optional
3
 
4
  from ..embedding.embedding_service import EmbeddingService
5
  from ..vector_store.vector_db import VectorDatabase
 
11
  """Complete ingestion pipeline for processing document corpus with embeddings"""
12
 
13
  def __init__(
14
+ self,
15
+ chunk_size: int = 1000,
16
+ overlap: int = 200,
17
  seed: int = 42,
18
  store_embeddings: bool = True,
19
  vector_db: Optional[VectorDatabase] = None,
20
+ embedding_service: Optional[EmbeddingService] = None,
21
  ):
22
  """
23
  Initialize the ingestion pipeline
 
36
  )
37
  self.seed = seed
38
  self.store_embeddings = store_embeddings
39
+
40
  # Initialize embedding components if storing embeddings
41
  if store_embeddings:
42
  self.embedding_service = embedding_service or EmbeddingService()
43
  if vector_db is None:
44
  from ..config import COLLECTION_NAME, VECTOR_DB_PERSIST_PATH
45
+
46
  self.vector_db = VectorDatabase(
47
+ persist_path=VECTOR_DB_PERSIST_PATH, collection_name=COLLECTION_NAME
 
48
  )
49
  else:
50
  self.vector_db = vector_db
 
118
  continue
119
 
120
  # Generate and store embeddings if enabled
121
+ if (
122
+ self.store_embeddings
123
+ and all_chunks
124
+ and self.embedding_service
125
+ and self.vector_db
126
+ ):
127
  try:
128
  embeddings_stored = self._store_embeddings_batch(all_chunks)
129
  except Exception as e:
 
136
  "failed_files": failed_files,
137
  "embeddings_stored": embeddings_stored,
138
  "store_embeddings": self.store_embeddings,
139
+ "chunks": all_chunks, # Include chunks for backward compatibility
140
  }
141
 
142
  def process_file(self, file_path: str) -> List[Dict[str, Any]]:
 
162
  def _store_embeddings_batch(self, chunks: List[Dict[str, Any]]) -> int:
163
  """
164
  Generate embeddings and store chunks in vector database
165
+
166
  Args:
167
  chunks: List of text chunks with metadata
168
+
169
  Returns:
170
  Number of embeddings stored successfully
171
  """
172
  if not self.embedding_service or not self.vector_db:
173
  return 0
174
+
175
  stored_count = 0
176
  batch_size = 32 # Process in batches for memory efficiency
177
+
178
  for i in range(0, len(chunks), batch_size):
179
+ batch = chunks[i : i + batch_size]
180
+
181
  try:
182
  # Extract texts and prepare data for vector storage
183
  texts = [chunk["content"] for chunk in batch]
184
  chunk_ids = [chunk["metadata"]["chunk_id"] for chunk in batch]
185
  metadatas = [chunk["metadata"] for chunk in batch]
186
+
187
  # Generate embeddings for the batch
188
  embeddings = self.embedding_service.embed_texts(texts)
189
+
190
  # Store in vector database
191
  self.vector_db.add_embeddings(
192
  embeddings=embeddings,
193
  chunk_ids=chunk_ids,
194
  documents=texts,
195
+ metadatas=metadatas,
196
  )
197
+
198
  stored_count += len(batch)
199
+ print(
200
+ f"Stored embeddings for batch {i // batch_size + 1}: "
201
+ f"{len(batch)} chunks"
202
+ )
203
+
204
  except Exception as e:
205
  print(f"Warning: Failed to store batch {i // batch_size + 1}: {e}")
206
  continue
207
+
208
  return stored_count
tests/test_enhanced_app.py CHANGED
@@ -16,24 +16,26 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
16
 
17
  def setUp(self):
18
  """Set up test fixtures"""
19
- app.config['TESTING'] = True
20
  self.app = app.test_client()
21
-
22
  # Create temporary directory and files for testing
23
  self.temp_dir = tempfile.mkdtemp()
24
  self.test_dir = Path(self.temp_dir)
25
-
26
  self.test_file = self.test_dir / "test.md"
27
- self.test_file.write_text("# Test Document\n\nThis is test content for enhanced ingestion.")
 
 
28
 
29
  def test_ingest_endpoint_with_embeddings_default(self):
30
  """Test ingestion endpoint with default embeddings enabled"""
31
- with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
32
- response = self.app.post('/ingest')
33
-
34
  self.assertEqual(response.status_code, 200)
35
  data = json.loads(response.data)
36
-
37
  # Check enhanced response structure
38
  self.assertEqual(data["status"], "success")
39
  self.assertIn("chunks_processed", data)
@@ -46,14 +48,16 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
46
 
47
  def test_ingest_endpoint_with_embeddings_disabled(self):
48
  """Test ingestion endpoint with embeddings disabled"""
49
- with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
50
- response = self.app.post('/ingest',
51
- data=json.dumps({"store_embeddings": False}),
52
- content_type='application/json')
53
-
 
 
54
  self.assertEqual(response.status_code, 200)
55
  data = json.loads(response.data)
56
-
57
  # Check response structure with embeddings disabled
58
  self.assertEqual(data["status"], "success")
59
  self.assertIn("chunks_processed", data)
@@ -67,31 +71,32 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
67
 
68
  def test_ingest_endpoint_with_no_json(self):
69
  """Test ingestion endpoint with no JSON payload (should default to embeddings enabled)"""
70
- with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
71
- response = self.app.post('/ingest')
72
-
73
  self.assertEqual(response.status_code, 200)
74
  data = json.loads(response.data)
75
-
76
  # Should default to embeddings enabled
77
  self.assertTrue(data["store_embeddings"])
78
 
79
  def test_ingest_endpoint_error_handling(self):
80
  """Test ingestion endpoint error handling"""
81
- with patch('src.config.CORPUS_DIRECTORY', '/nonexistent/directory'):
82
- response = self.app.post('/ingest')
83
-
84
  self.assertEqual(response.status_code, 500)
85
  data = json.loads(response.data)
86
-
87
  self.assertEqual(data["status"], "error")
88
  self.assertIn("message", data)
89
 
90
  def tearDown(self):
91
  """Clean up test fixtures"""
92
  import shutil
 
93
  shutil.rmtree(self.temp_dir, ignore_errors=True)
94
 
95
 
96
  if __name__ == "__main__":
97
- unittest.main()
 
16
 
17
  def setUp(self):
18
  """Set up test fixtures"""
19
+ app.config["TESTING"] = True
20
  self.app = app.test_client()
21
+
22
  # Create temporary directory and files for testing
23
  self.temp_dir = tempfile.mkdtemp()
24
  self.test_dir = Path(self.temp_dir)
25
+
26
  self.test_file = self.test_dir / "test.md"
27
+ self.test_file.write_text(
28
+ "# Test Document\n\nThis is test content for enhanced ingestion."
29
+ )
30
 
31
  def test_ingest_endpoint_with_embeddings_default(self):
32
  """Test ingestion endpoint with default embeddings enabled"""
33
+ with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
34
+ response = self.app.post("/ingest")
35
+
36
  self.assertEqual(response.status_code, 200)
37
  data = json.loads(response.data)
38
+
39
  # Check enhanced response structure
40
  self.assertEqual(data["status"], "success")
41
  self.assertIn("chunks_processed", data)
 
48
 
49
  def test_ingest_endpoint_with_embeddings_disabled(self):
50
  """Test ingestion endpoint with embeddings disabled"""
51
+ with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
52
+ response = self.app.post(
53
+ "/ingest",
54
+ data=json.dumps({"store_embeddings": False}),
55
+ content_type="application/json",
56
+ )
57
+
58
  self.assertEqual(response.status_code, 200)
59
  data = json.loads(response.data)
60
+
61
  # Check response structure with embeddings disabled
62
  self.assertEqual(data["status"], "success")
63
  self.assertIn("chunks_processed", data)
 
71
 
72
  def test_ingest_endpoint_with_no_json(self):
73
  """Test ingestion endpoint with no JSON payload (should default to embeddings enabled)"""
74
+ with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
75
+ response = self.app.post("/ingest")
76
+
77
  self.assertEqual(response.status_code, 200)
78
  data = json.loads(response.data)
79
+
80
  # Should default to embeddings enabled
81
  self.assertTrue(data["store_embeddings"])
82
 
83
  def test_ingest_endpoint_error_handling(self):
84
  """Test ingestion endpoint error handling"""
85
+ with patch("src.config.CORPUS_DIRECTORY", "/nonexistent/directory"):
86
+ response = self.app.post("/ingest")
87
+
88
  self.assertEqual(response.status_code, 500)
89
  data = json.loads(response.data)
90
+
91
  self.assertEqual(data["status"], "error")
92
  self.assertIn("message", data)
93
 
94
  def tearDown(self):
95
  """Clean up test fixtures"""
96
  import shutil
97
+
98
  shutil.rmtree(self.temp_dir, ignore_errors=True)
99
 
100
 
101
  if __name__ == "__main__":
102
+ unittest.main()
tests/test_ingestion/test_enhanced_ingestion_pipeline.py CHANGED
@@ -17,14 +17,16 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
17
  """Set up test fixtures"""
18
  self.temp_dir = tempfile.mkdtemp()
19
  self.test_dir = Path(self.temp_dir)
20
-
21
  # Create test files
22
  self.test_file1 = self.test_dir / "test1.md"
23
- self.test_file1.write_text("# Test Document 1\n\nThis is test content for document 1.")
24
-
 
 
25
  self.test_file2 = self.test_dir / "test2.txt"
26
  self.test_file2.write_text("This is test content for document 2.")
27
-
28
  # Create an unsupported file (should be skipped)
29
  self.test_file3 = self.test_dir / "test3.pdf"
30
  self.test_file3.write_text("PDF content")
@@ -32,7 +34,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
32
  def test_initialization_without_embeddings(self):
33
  """Test pipeline initialization without embeddings"""
34
  pipeline = IngestionPipeline(store_embeddings=False)
35
-
36
  self.assertIsNotNone(pipeline.parser)
37
  self.assertIsNotNone(pipeline.chunker)
38
  self.assertFalse(pipeline.store_embeddings)
@@ -42,7 +44,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
42
  def test_initialization_with_embeddings(self):
43
  """Test pipeline initialization with embeddings"""
44
  pipeline = IngestionPipeline(store_embeddings=True)
45
-
46
  self.assertIsNotNone(pipeline.parser)
47
  self.assertIsNotNone(pipeline.chunker)
48
  self.assertTrue(pipeline.store_embeddings)
@@ -53,13 +55,13 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
53
  """Test pipeline initialization with custom embedding components"""
54
  mock_embedding_service = Mock()
55
  mock_vector_db = Mock()
56
-
57
  pipeline = IngestionPipeline(
58
  store_embeddings=True,
59
  embedding_service=mock_embedding_service,
60
- vector_db=mock_vector_db
61
  )
62
-
63
  self.assertEqual(pipeline.embedding_service, mock_embedding_service)
64
  self.assertEqual(pipeline.vector_db, mock_vector_db)
65
 
@@ -67,7 +69,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
67
  """Test directory processing without embeddings"""
68
  pipeline = IngestionPipeline(store_embeddings=False)
69
  result = pipeline.process_directory_with_embeddings(str(self.test_dir))
70
-
71
  # Check response structure
72
  self.assertIsInstance(result, dict)
73
  self.assertEqual(result["status"], "success")
@@ -77,25 +79,30 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
77
  self.assertFalse(result["store_embeddings"])
78
  self.assertIn("chunks", result)
79
 
80
- @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
81
- @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
82
- def test_process_directory_with_embeddings(self, mock_embedding_service_class, mock_vector_db_class):
 
 
83
  """Test directory processing with embeddings"""
84
  # Mock the classes to return mock instances
85
  mock_embedding_service = Mock()
86
  mock_vector_db = Mock()
87
  mock_embedding_service_class.return_value = mock_embedding_service
88
  mock_vector_db_class.return_value = mock_vector_db
89
-
90
  # Configure mock embedding service
91
- mock_embedding_service.embed_texts.return_value = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
92
-
 
 
 
93
  # Configure mock vector database
94
  mock_vector_db.add_embeddings.return_value = True
95
-
96
  pipeline = IngestionPipeline(store_embeddings=True)
97
  result = pipeline.process_directory_with_embeddings(str(self.test_dir))
98
-
99
  # Check response structure
100
  self.assertIsInstance(result, dict)
101
  self.assertEqual(result["status"], "success")
@@ -103,7 +110,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
103
  self.assertEqual(result["files_processed"], 2)
104
  self.assertGreater(result["embeddings_stored"], 0)
105
  self.assertTrue(result["store_embeddings"])
106
-
107
  # Verify embedding service was called
108
  mock_embedding_service.embed_texts.assert_called()
109
  mock_vector_db.add_embeddings.assert_called()
@@ -111,82 +118,89 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
111
  def test_process_directory_nonexistent(self):
112
  """Test processing non-existent directory"""
113
  pipeline = IngestionPipeline(store_embeddings=False)
114
-
115
  with self.assertRaises(FileNotFoundError):
116
  pipeline.process_directory("/nonexistent/directory")
117
 
118
  def test_store_embeddings_batch_without_components(self):
119
  """Test batch embedding storage without embedding components"""
120
  pipeline = IngestionPipeline(store_embeddings=False)
121
-
122
  chunks = [
123
  {
124
  "content": "Test content 1",
125
- "metadata": {"chunk_id": "test1", "source": "test1.txt"}
126
  }
127
  ]
128
-
129
  result = pipeline._store_embeddings_batch(chunks)
130
  self.assertEqual(result, 0)
131
 
132
- @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
133
- @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
134
- def test_store_embeddings_batch_success(self, mock_embedding_service_class, mock_vector_db_class):
 
 
135
  """Test successful batch embedding storage"""
136
  # Mock the classes to return mock instances
137
  mock_embedding_service = Mock()
138
  mock_vector_db = Mock()
139
  mock_embedding_service_class.return_value = mock_embedding_service
140
  mock_vector_db_class.return_value = mock_vector_db
141
-
142
  # Configure mocks
143
- mock_embedding_service.embed_texts.return_value = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
 
 
 
144
  mock_vector_db.add_embeddings.return_value = True
145
-
146
  pipeline = IngestionPipeline(store_embeddings=True)
147
-
148
  chunks = [
149
  {
150
  "content": "Test content 1",
151
- "metadata": {"chunk_id": "test1", "source": "test1.txt"}
152
  },
153
  {
154
  "content": "Test content 2",
155
- "metadata": {"chunk_id": "test2", "source": "test2.txt"}
156
- }
157
  ]
158
-
159
  result = pipeline._store_embeddings_batch(chunks)
160
  self.assertEqual(result, 2)
161
-
162
  # Verify method calls
163
  mock_embedding_service.embed_texts.assert_called_once_with(
164
  ["Test content 1", "Test content 2"]
165
  )
166
  mock_vector_db.add_embeddings.assert_called_once()
167
 
168
- @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
169
- @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
170
- def test_store_embeddings_batch_error_handling(self, mock_embedding_service_class, mock_vector_db_class):
 
 
171
  """Test error handling in batch embedding storage"""
172
  # Mock the classes to return mock instances
173
  mock_embedding_service = Mock()
174
  mock_vector_db = Mock()
175
  mock_embedding_service_class.return_value = mock_embedding_service
176
  mock_vector_db_class.return_value = mock_vector_db
177
-
178
  # Configure embedding service to raise an error
179
  mock_embedding_service.embed_texts.side_effect = Exception("Embedding error")
180
-
181
  pipeline = IngestionPipeline(store_embeddings=True)
182
-
183
  chunks = [
184
  {
185
  "content": "Test content 1",
186
- "metadata": {"chunk_id": "test1", "source": "test1.txt"}
187
  }
188
  ]
189
-
190
  # Should handle error gracefully and return 0
191
  result = pipeline._store_embeddings_batch(chunks)
192
  self.assertEqual(result, 0)
@@ -195,11 +209,11 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
195
  """Test that enhanced pipeline maintains backward compatibility"""
196
  pipeline = IngestionPipeline(store_embeddings=False)
197
  result = pipeline.process_directory(str(self.test_dir))
198
-
199
  # Should return list for backward compatibility
200
  self.assertIsInstance(result, list)
201
  self.assertGreater(len(result), 0)
202
-
203
  # First chunk should have expected structure
204
  chunk = result[0]
205
  self.assertIn("content", chunk)
@@ -209,8 +223,9 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
209
  def tearDown(self):
210
  """Clean up test fixtures"""
211
  import shutil
 
212
  shutil.rmtree(self.temp_dir, ignore_errors=True)
213
 
214
 
215
  if __name__ == "__main__":
216
- unittest.main()
 
17
  """Set up test fixtures"""
18
  self.temp_dir = tempfile.mkdtemp()
19
  self.test_dir = Path(self.temp_dir)
20
+
21
  # Create test files
22
  self.test_file1 = self.test_dir / "test1.md"
23
+ self.test_file1.write_text(
24
+ "# Test Document 1\n\nThis is test content for document 1."
25
+ )
26
+
27
  self.test_file2 = self.test_dir / "test2.txt"
28
  self.test_file2.write_text("This is test content for document 2.")
29
+
30
  # Create an unsupported file (should be skipped)
31
  self.test_file3 = self.test_dir / "test3.pdf"
32
  self.test_file3.write_text("PDF content")
 
34
  def test_initialization_without_embeddings(self):
35
  """Test pipeline initialization without embeddings"""
36
  pipeline = IngestionPipeline(store_embeddings=False)
37
+
38
  self.assertIsNotNone(pipeline.parser)
39
  self.assertIsNotNone(pipeline.chunker)
40
  self.assertFalse(pipeline.store_embeddings)
 
44
  def test_initialization_with_embeddings(self):
45
  """Test pipeline initialization with embeddings"""
46
  pipeline = IngestionPipeline(store_embeddings=True)
47
+
48
  self.assertIsNotNone(pipeline.parser)
49
  self.assertIsNotNone(pipeline.chunker)
50
  self.assertTrue(pipeline.store_embeddings)
 
55
  """Test pipeline initialization with custom embedding components"""
56
  mock_embedding_service = Mock()
57
  mock_vector_db = Mock()
58
+
59
  pipeline = IngestionPipeline(
60
  store_embeddings=True,
61
  embedding_service=mock_embedding_service,
62
+ vector_db=mock_vector_db,
63
  )
64
+
65
  self.assertEqual(pipeline.embedding_service, mock_embedding_service)
66
  self.assertEqual(pipeline.vector_db, mock_vector_db)
67
 
 
69
  """Test directory processing without embeddings"""
70
  pipeline = IngestionPipeline(store_embeddings=False)
71
  result = pipeline.process_directory_with_embeddings(str(self.test_dir))
72
+
73
  # Check response structure
74
  self.assertIsInstance(result, dict)
75
  self.assertEqual(result["status"], "success")
 
79
  self.assertFalse(result["store_embeddings"])
80
  self.assertIn("chunks", result)
81
 
82
+ @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
83
+ @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
84
+ def test_process_directory_with_embeddings(
85
+ self, mock_embedding_service_class, mock_vector_db_class
86
+ ):
87
  """Test directory processing with embeddings"""
88
  # Mock the classes to return mock instances
89
  mock_embedding_service = Mock()
90
  mock_vector_db = Mock()
91
  mock_embedding_service_class.return_value = mock_embedding_service
92
  mock_vector_db_class.return_value = mock_vector_db
93
+
94
  # Configure mock embedding service
95
+ mock_embedding_service.embed_texts.return_value = [
96
+ [0.1, 0.2, 0.3],
97
+ [0.4, 0.5, 0.6],
98
+ ]
99
+
100
  # Configure mock vector database
101
  mock_vector_db.add_embeddings.return_value = True
102
+
103
  pipeline = IngestionPipeline(store_embeddings=True)
104
  result = pipeline.process_directory_with_embeddings(str(self.test_dir))
105
+
106
  # Check response structure
107
  self.assertIsInstance(result, dict)
108
  self.assertEqual(result["status"], "success")
 
110
  self.assertEqual(result["files_processed"], 2)
111
  self.assertGreater(result["embeddings_stored"], 0)
112
  self.assertTrue(result["store_embeddings"])
113
+
114
  # Verify embedding service was called
115
  mock_embedding_service.embed_texts.assert_called()
116
  mock_vector_db.add_embeddings.assert_called()
 
118
  def test_process_directory_nonexistent(self):
119
  """Test processing non-existent directory"""
120
  pipeline = IngestionPipeline(store_embeddings=False)
121
+
122
  with self.assertRaises(FileNotFoundError):
123
  pipeline.process_directory("/nonexistent/directory")
124
 
125
  def test_store_embeddings_batch_without_components(self):
126
  """Test batch embedding storage without embedding components"""
127
  pipeline = IngestionPipeline(store_embeddings=False)
128
+
129
  chunks = [
130
  {
131
  "content": "Test content 1",
132
+ "metadata": {"chunk_id": "test1", "source": "test1.txt"},
133
  }
134
  ]
135
+
136
  result = pipeline._store_embeddings_batch(chunks)
137
  self.assertEqual(result, 0)
138
 
139
+ @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
140
+ @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
141
+ def test_store_embeddings_batch_success(
142
+ self, mock_embedding_service_class, mock_vector_db_class
143
+ ):
144
  """Test successful batch embedding storage"""
145
  # Mock the classes to return mock instances
146
  mock_embedding_service = Mock()
147
  mock_vector_db = Mock()
148
  mock_embedding_service_class.return_value = mock_embedding_service
149
  mock_vector_db_class.return_value = mock_vector_db
150
+
151
  # Configure mocks
152
+ mock_embedding_service.embed_texts.return_value = [
153
+ [0.1, 0.2, 0.3],
154
+ [0.4, 0.5, 0.6],
155
+ ]
156
  mock_vector_db.add_embeddings.return_value = True
157
+
158
  pipeline = IngestionPipeline(store_embeddings=True)
159
+
160
  chunks = [
161
  {
162
  "content": "Test content 1",
163
+ "metadata": {"chunk_id": "test1", "source": "test1.txt"},
164
  },
165
  {
166
  "content": "Test content 2",
167
+ "metadata": {"chunk_id": "test2", "source": "test2.txt"},
168
+ },
169
  ]
170
+
171
  result = pipeline._store_embeddings_batch(chunks)
172
  self.assertEqual(result, 2)
173
+
174
  # Verify method calls
175
  mock_embedding_service.embed_texts.assert_called_once_with(
176
  ["Test content 1", "Test content 2"]
177
  )
178
  mock_vector_db.add_embeddings.assert_called_once()
179
 
180
+ @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
181
+ @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
182
+ def test_store_embeddings_batch_error_handling(
183
+ self, mock_embedding_service_class, mock_vector_db_class
184
+ ):
185
  """Test error handling in batch embedding storage"""
186
  # Mock the classes to return mock instances
187
  mock_embedding_service = Mock()
188
  mock_vector_db = Mock()
189
  mock_embedding_service_class.return_value = mock_embedding_service
190
  mock_vector_db_class.return_value = mock_vector_db
191
+
192
  # Configure embedding service to raise an error
193
  mock_embedding_service.embed_texts.side_effect = Exception("Embedding error")
194
+
195
  pipeline = IngestionPipeline(store_embeddings=True)
196
+
197
  chunks = [
198
  {
199
  "content": "Test content 1",
200
+ "metadata": {"chunk_id": "test1", "source": "test1.txt"},
201
  }
202
  ]
203
+
204
  # Should handle error gracefully and return 0
205
  result = pipeline._store_embeddings_batch(chunks)
206
  self.assertEqual(result, 0)
 
209
  """Test that enhanced pipeline maintains backward compatibility"""
210
  pipeline = IngestionPipeline(store_embeddings=False)
211
  result = pipeline.process_directory(str(self.test_dir))
212
+
213
  # Should return list for backward compatibility
214
  self.assertIsInstance(result, list)
215
  self.assertGreater(len(result), 0)
216
+
217
  # First chunk should have expected structure
218
  chunk = result[0]
219
  self.assertIn("content", chunk)
 
223
  def tearDown(self):
224
  """Clean up test fixtures"""
225
  import shutil
226
+
227
  shutil.rmtree(self.temp_dir, ignore_errors=True)
228
 
229
 
230
  if __name__ == "__main__":
231
+ unittest.main()