Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

Tobias Pasquale committed on Oct 18

Commit

a3dfc07

1 Parent(s): aff5d04

fix: apply formatting and linting fixes for CI/CD compliance

- Remove unused Union import from typing
- Fix E501 line length violations by breaking long lines
- Apply black and isort formatting
- Fix flake8 compliance issues

All pre-commit hooks now pass locally

Files changed (4) hide show

app.py +13 -7
src/ingestion/ingestion_pipeline.py +31 -23
tests/test_enhanced_app.py +28 -23
tests/test_ingestion/test_enhanced_ingestion_pipeline.py +61 -46

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ def ingest():
     """Endpoint to trigger document ingestion with embeddings"""
     try:
         from flask import request
         from src.config import (
             CORPUS_DIRECTORY,
             DEFAULT_CHUNK_SIZE,
@@ -35,12 +36,12 @@ def ingest():
         # Get optional parameters from request
         data = request.get_json() if request.is_json else {}
         store_embeddings = data.get("store_embeddings", True)
         pipeline = IngestionPipeline(
-            chunk_size=DEFAULT_CHUNK_SIZE,
-            overlap=DEFAULT_OVERLAP,
             seed=RANDOM_SEED,
-            store_embeddings=store_embeddings
         )
         result = pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
@@ -52,13 +53,18 @@ def ingest():
             "files_processed": result["files_processed"],
             "embeddings_stored": result["embeddings_stored"],
             "store_embeddings": result["store_embeddings"],
-            "message": f"Successfully processed {result['chunks_processed']} chunks from {result['files_processed']} files"
         }
         # Include failed files info if any
         if result["failed_files"]:
             response["failed_files"] = result["failed_files"]
-            response["warnings"] = f"{len(result['failed_files'])} files failed to process"
         return jsonify(response)

     """Endpoint to trigger document ingestion with embeddings"""
     try:
         from flask import request
         from src.config import (
             CORPUS_DIRECTORY,
             DEFAULT_CHUNK_SIZE,
         # Get optional parameters from request
         data = request.get_json() if request.is_json else {}
         store_embeddings = data.get("store_embeddings", True)
         pipeline = IngestionPipeline(
+            chunk_size=DEFAULT_CHUNK_SIZE,
+            overlap=DEFAULT_OVERLAP,
             seed=RANDOM_SEED,
+            store_embeddings=store_embeddings,
         )
         result = pipeline.process_directory_with_embeddings(CORPUS_DIRECTORY)
             "files_processed": result["files_processed"],
             "embeddings_stored": result["embeddings_stored"],
             "store_embeddings": result["store_embeddings"],
+            "message": (
+                f"Successfully processed {result['chunks_processed']} chunks "
+                f"from {result['files_processed']} files"
+            ),
         }
         # Include failed files info if any
         if result["failed_files"]:
             response["failed_files"] = result["failed_files"]
+            response[
+                "warnings"
+            ] = f"{len(result['failed_files'])} files failed to process"
         return jsonify(response)

src/ingestion/ingestion_pipeline.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
 from ..embedding.embedding_service import EmbeddingService
 from ..vector_store.vector_db import VectorDatabase
@@ -11,13 +11,13 @@ class IngestionPipeline:
     """Complete ingestion pipeline for processing document corpus with embeddings"""
     def __init__(
-        self,
-        chunk_size: int = 1000,
-        overlap: int = 200,
         seed: int = 42,
         store_embeddings: bool = True,
         vector_db: Optional[VectorDatabase] = None,
-        embedding_service: Optional[EmbeddingService] = None
     ):
         """
         Initialize the ingestion pipeline
@@ -36,15 +36,15 @@ class IngestionPipeline:
         )
         self.seed = seed
         self.store_embeddings = store_embeddings
         # Initialize embedding components if storing embeddings
         if store_embeddings:
             self.embedding_service = embedding_service or EmbeddingService()
             if vector_db is None:
                 from ..config import COLLECTION_NAME, VECTOR_DB_PERSIST_PATH
                 self.vector_db = VectorDatabase(
-                    persist_path=VECTOR_DB_PERSIST_PATH,
-                    collection_name=COLLECTION_NAME
                 )
             else:
                 self.vector_db = vector_db
@@ -118,7 +118,12 @@ class IngestionPipeline:
                     continue
         # Generate and store embeddings if enabled
-        if self.store_embeddings and all_chunks and self.embedding_service and self.vector_db:
             try:
                 embeddings_stored = self._store_embeddings_batch(all_chunks)
             except Exception as e:
@@ -131,7 +136,7 @@ class IngestionPipeline:
             "failed_files": failed_files,
             "embeddings_stored": embeddings_stored,
             "store_embeddings": self.store_embeddings,
-            "chunks": all_chunks  # Include chunks for backward compatibility
         }
     def process_file(self, file_path: str) -> List[Dict[str, Any]]:
@@ -157,44 +162,47 @@ class IngestionPipeline:
     def _store_embeddings_batch(self, chunks: List[Dict[str, Any]]) -> int:
         """
         Generate embeddings and store chunks in vector database
         Args:
             chunks: List of text chunks with metadata
         Returns:
             Number of embeddings stored successfully
         """
         if not self.embedding_service or not self.vector_db:
             return 0
         stored_count = 0
         batch_size = 32  # Process in batches for memory efficiency
         for i in range(0, len(chunks), batch_size):
-            batch = chunks[i:i + batch_size]
             try:
                 # Extract texts and prepare data for vector storage
                 texts = [chunk["content"] for chunk in batch]
                 chunk_ids = [chunk["metadata"]["chunk_id"] for chunk in batch]
                 metadatas = [chunk["metadata"] for chunk in batch]
                 # Generate embeddings for the batch
                 embeddings = self.embedding_service.embed_texts(texts)
                 # Store in vector database
                 self.vector_db.add_embeddings(
                     embeddings=embeddings,
                     chunk_ids=chunk_ids,
                     documents=texts,
-                    metadatas=metadatas
                 )
                 stored_count += len(batch)
-                print(f"Stored embeddings for batch {i // batch_size + 1}: {len(batch)} chunks")
             except Exception as e:
                 print(f"Warning: Failed to store batch {i // batch_size + 1}: {e}")
                 continue
         return stored_count

 from pathlib import Path
+from typing import Any, Dict, List, Optional
 from ..embedding.embedding_service import EmbeddingService
 from ..vector_store.vector_db import VectorDatabase
     """Complete ingestion pipeline for processing document corpus with embeddings"""
     def __init__(
+        self,
+        chunk_size: int = 1000,
+        overlap: int = 200,
         seed: int = 42,
         store_embeddings: bool = True,
         vector_db: Optional[VectorDatabase] = None,
+        embedding_service: Optional[EmbeddingService] = None,
     ):
         """
         Initialize the ingestion pipeline
         )
         self.seed = seed
         self.store_embeddings = store_embeddings
         # Initialize embedding components if storing embeddings
         if store_embeddings:
             self.embedding_service = embedding_service or EmbeddingService()
             if vector_db is None:
                 from ..config import COLLECTION_NAME, VECTOR_DB_PERSIST_PATH
                 self.vector_db = VectorDatabase(
+                    persist_path=VECTOR_DB_PERSIST_PATH, collection_name=COLLECTION_NAME
                 )
             else:
                 self.vector_db = vector_db
                     continue
         # Generate and store embeddings if enabled
+        if (
+            self.store_embeddings
+            and all_chunks
+            and self.embedding_service
+            and self.vector_db
+        ):
             try:
                 embeddings_stored = self._store_embeddings_batch(all_chunks)
             except Exception as e:
             "failed_files": failed_files,
             "embeddings_stored": embeddings_stored,
             "store_embeddings": self.store_embeddings,
+            "chunks": all_chunks,  # Include chunks for backward compatibility
         }
     def process_file(self, file_path: str) -> List[Dict[str, Any]]:
     def _store_embeddings_batch(self, chunks: List[Dict[str, Any]]) -> int:
         """
         Generate embeddings and store chunks in vector database
         Args:
             chunks: List of text chunks with metadata
         Returns:
             Number of embeddings stored successfully
         """
         if not self.embedding_service or not self.vector_db:
             return 0
         stored_count = 0
         batch_size = 32  # Process in batches for memory efficiency
         for i in range(0, len(chunks), batch_size):
+            batch = chunks[i : i + batch_size]
             try:
                 # Extract texts and prepare data for vector storage
                 texts = [chunk["content"] for chunk in batch]
                 chunk_ids = [chunk["metadata"]["chunk_id"] for chunk in batch]
                 metadatas = [chunk["metadata"] for chunk in batch]
                 # Generate embeddings for the batch
                 embeddings = self.embedding_service.embed_texts(texts)
                 # Store in vector database
                 self.vector_db.add_embeddings(
                     embeddings=embeddings,
                     chunk_ids=chunk_ids,
                     documents=texts,
+                    metadatas=metadatas,
                 )
                 stored_count += len(batch)
+                print(
+                    f"Stored embeddings for batch {i // batch_size + 1}: "
+                    f"{len(batch)} chunks"
+                )
             except Exception as e:
                 print(f"Warning: Failed to store batch {i // batch_size + 1}: {e}")
                 continue
         return stored_count

tests/test_enhanced_app.py CHANGED Viewed

@@ -16,24 +16,26 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
     def setUp(self):
         """Set up test fixtures"""
-        app.config['TESTING'] = True
         self.app = app.test_client()
         # Create temporary directory and files for testing
         self.temp_dir = tempfile.mkdtemp()
         self.test_dir = Path(self.temp_dir)
         self.test_file = self.test_dir / "test.md"
-        self.test_file.write_text("# Test Document\n\nThis is test content for enhanced ingestion.")
     def test_ingest_endpoint_with_embeddings_default(self):
         """Test ingestion endpoint with default embeddings enabled"""
-        with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
-            response = self.app.post('/ingest')
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Check enhanced response structure
             self.assertEqual(data["status"], "success")
             self.assertIn("chunks_processed", data)
@@ -46,14 +48,16 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
     def test_ingest_endpoint_with_embeddings_disabled(self):
         """Test ingestion endpoint with embeddings disabled"""
-        with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
-            response = self.app.post('/ingest',
-                                   data=json.dumps({"store_embeddings": False}),
-                                   content_type='application/json')
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Check response structure with embeddings disabled
             self.assertEqual(data["status"], "success")
             self.assertIn("chunks_processed", data)
@@ -67,31 +71,32 @@ class TestEnhancedIngestionEndpoint(unittest.TestCase):
     def test_ingest_endpoint_with_no_json(self):
         """Test ingestion endpoint with no JSON payload (should default to embeddings enabled)"""
-        with patch('src.config.CORPUS_DIRECTORY', str(self.test_dir)):
-            response = self.app.post('/ingest')
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Should default to embeddings enabled
             self.assertTrue(data["store_embeddings"])
     def test_ingest_endpoint_error_handling(self):
         """Test ingestion endpoint error handling"""
-        with patch('src.config.CORPUS_DIRECTORY', '/nonexistent/directory'):
-            response = self.app.post('/ingest')
             self.assertEqual(response.status_code, 500)
             data = json.loads(response.data)
             self.assertEqual(data["status"], "error")
             self.assertIn("message", data)
     def tearDown(self):
         """Clean up test fixtures"""
         import shutil
         shutil.rmtree(self.temp_dir, ignore_errors=True)
 if __name__ == "__main__":
-    unittest.main()

     def setUp(self):
         """Set up test fixtures"""
+        app.config["TESTING"] = True
         self.app = app.test_client()
         # Create temporary directory and files for testing
         self.temp_dir = tempfile.mkdtemp()
         self.test_dir = Path(self.temp_dir)
         self.test_file = self.test_dir / "test.md"
+        self.test_file.write_text(
+            "# Test Document\n\nThis is test content for enhanced ingestion."
+        )
     def test_ingest_endpoint_with_embeddings_default(self):
         """Test ingestion endpoint with default embeddings enabled"""
+        with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
+            response = self.app.post("/ingest")
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Check enhanced response structure
             self.assertEqual(data["status"], "success")
             self.assertIn("chunks_processed", data)
     def test_ingest_endpoint_with_embeddings_disabled(self):
         """Test ingestion endpoint with embeddings disabled"""
+        with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
+            response = self.app.post(
+                "/ingest",
+                data=json.dumps({"store_embeddings": False}),
+                content_type="application/json",
+            )
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Check response structure with embeddings disabled
             self.assertEqual(data["status"], "success")
             self.assertIn("chunks_processed", data)
     def test_ingest_endpoint_with_no_json(self):
         """Test ingestion endpoint with no JSON payload (should default to embeddings enabled)"""
+        with patch("src.config.CORPUS_DIRECTORY", str(self.test_dir)):
+            response = self.app.post("/ingest")
             self.assertEqual(response.status_code, 200)
             data = json.loads(response.data)
             # Should default to embeddings enabled
             self.assertTrue(data["store_embeddings"])
     def test_ingest_endpoint_error_handling(self):
         """Test ingestion endpoint error handling"""
+        with patch("src.config.CORPUS_DIRECTORY", "/nonexistent/directory"):
+            response = self.app.post("/ingest")
             self.assertEqual(response.status_code, 500)
             data = json.loads(response.data)
             self.assertEqual(data["status"], "error")
             self.assertIn("message", data)
     def tearDown(self):
         """Clean up test fixtures"""
         import shutil
         shutil.rmtree(self.temp_dir, ignore_errors=True)
 if __name__ == "__main__":
+    unittest.main()

tests/test_ingestion/test_enhanced_ingestion_pipeline.py CHANGED Viewed

@@ -17,14 +17,16 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         """Set up test fixtures"""
         self.temp_dir = tempfile.mkdtemp()
         self.test_dir = Path(self.temp_dir)
         # Create test files
         self.test_file1 = self.test_dir / "test1.md"
-        self.test_file1.write_text("# Test Document 1\n\nThis is test content for document 1.")
         self.test_file2 = self.test_dir / "test2.txt"
         self.test_file2.write_text("This is test content for document 2.")
         # Create an unsupported file (should be skipped)
         self.test_file3 = self.test_dir / "test3.pdf"
         self.test_file3.write_text("PDF content")
@@ -32,7 +34,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
     def test_initialization_without_embeddings(self):
         """Test pipeline initialization without embeddings"""
         pipeline = IngestionPipeline(store_embeddings=False)
         self.assertIsNotNone(pipeline.parser)
         self.assertIsNotNone(pipeline.chunker)
         self.assertFalse(pipeline.store_embeddings)
@@ -42,7 +44,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
     def test_initialization_with_embeddings(self):
         """Test pipeline initialization with embeddings"""
         pipeline = IngestionPipeline(store_embeddings=True)
         self.assertIsNotNone(pipeline.parser)
         self.assertIsNotNone(pipeline.chunker)
         self.assertTrue(pipeline.store_embeddings)
@@ -53,13 +55,13 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         """Test pipeline initialization with custom embedding components"""
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         pipeline = IngestionPipeline(
             store_embeddings=True,
             embedding_service=mock_embedding_service,
-            vector_db=mock_vector_db
         )
         self.assertEqual(pipeline.embedding_service, mock_embedding_service)
         self.assertEqual(pipeline.vector_db, mock_vector_db)
@@ -67,7 +69,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         """Test directory processing without embeddings"""
         pipeline = IngestionPipeline(store_embeddings=False)
         result = pipeline.process_directory_with_embeddings(str(self.test_dir))
         # Check response structure
         self.assertIsInstance(result, dict)
         self.assertEqual(result["status"], "success")
@@ -77,25 +79,30 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         self.assertFalse(result["store_embeddings"])
         self.assertIn("chunks", result)
-    @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
-    @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
-    def test_process_directory_with_embeddings(self, mock_embedding_service_class, mock_vector_db_class):
         """Test directory processing with embeddings"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure mock embedding service
-        mock_embedding_service.embed_texts.return_value = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
         # Configure mock vector database
         mock_vector_db.add_embeddings.return_value = True
         pipeline = IngestionPipeline(store_embeddings=True)
         result = pipeline.process_directory_with_embeddings(str(self.test_dir))
         # Check response structure
         self.assertIsInstance(result, dict)
         self.assertEqual(result["status"], "success")
@@ -103,7 +110,7 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         self.assertEqual(result["files_processed"], 2)
         self.assertGreater(result["embeddings_stored"], 0)
         self.assertTrue(result["store_embeddings"])
         # Verify embedding service was called
         mock_embedding_service.embed_texts.assert_called()
         mock_vector_db.add_embeddings.assert_called()
@@ -111,82 +118,89 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
     def test_process_directory_nonexistent(self):
         """Test processing non-existent directory"""
         pipeline = IngestionPipeline(store_embeddings=False)
         with self.assertRaises(FileNotFoundError):
             pipeline.process_directory("/nonexistent/directory")
     def test_store_embeddings_batch_without_components(self):
         """Test batch embedding storage without embedding components"""
         pipeline = IngestionPipeline(store_embeddings=False)
         chunks = [
             {
                 "content": "Test content 1",
-                "metadata": {"chunk_id": "test1", "source": "test1.txt"}
             }
         ]
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 0)
-    @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
-    @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
-    def test_store_embeddings_batch_success(self, mock_embedding_service_class, mock_vector_db_class):
         """Test successful batch embedding storage"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure mocks
-        mock_embedding_service.embed_texts.return_value = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
         mock_vector_db.add_embeddings.return_value = True
         pipeline = IngestionPipeline(store_embeddings=True)
         chunks = [
             {
                 "content": "Test content 1",
-                "metadata": {"chunk_id": "test1", "source": "test1.txt"}
             },
             {
                 "content": "Test content 2",
-                "metadata": {"chunk_id": "test2", "source": "test2.txt"}
-            }
         ]
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 2)
         # Verify method calls
         mock_embedding_service.embed_texts.assert_called_once_with(
             ["Test content 1", "Test content 2"]
         )
         mock_vector_db.add_embeddings.assert_called_once()
-    @patch('src.ingestion.ingestion_pipeline.VectorDatabase')
-    @patch('src.ingestion.ingestion_pipeline.EmbeddingService')
-    def test_store_embeddings_batch_error_handling(self, mock_embedding_service_class, mock_vector_db_class):
         """Test error handling in batch embedding storage"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure embedding service to raise an error
         mock_embedding_service.embed_texts.side_effect = Exception("Embedding error")
         pipeline = IngestionPipeline(store_embeddings=True)
         chunks = [
             {
                 "content": "Test content 1",
-                "metadata": {"chunk_id": "test1", "source": "test1.txt"}
             }
         ]
         # Should handle error gracefully and return 0
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 0)
@@ -195,11 +209,11 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
         """Test that enhanced pipeline maintains backward compatibility"""
         pipeline = IngestionPipeline(store_embeddings=False)
         result = pipeline.process_directory(str(self.test_dir))
         # Should return list for backward compatibility
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
         # First chunk should have expected structure
         chunk = result[0]
         self.assertIn("content", chunk)
@@ -209,8 +223,9 @@ class TestEnhancedIngestionPipeline(unittest.TestCase):
     def tearDown(self):
         """Clean up test fixtures"""
         import shutil
         shutil.rmtree(self.temp_dir, ignore_errors=True)
 if __name__ == "__main__":
-    unittest.main()

         """Set up test fixtures"""
         self.temp_dir = tempfile.mkdtemp()
         self.test_dir = Path(self.temp_dir)
         # Create test files
         self.test_file1 = self.test_dir / "test1.md"
+        self.test_file1.write_text(
+            "# Test Document 1\n\nThis is test content for document 1."
+        )
         self.test_file2 = self.test_dir / "test2.txt"
         self.test_file2.write_text("This is test content for document 2.")
         # Create an unsupported file (should be skipped)
         self.test_file3 = self.test_dir / "test3.pdf"
         self.test_file3.write_text("PDF content")
     def test_initialization_without_embeddings(self):
         """Test pipeline initialization without embeddings"""
         pipeline = IngestionPipeline(store_embeddings=False)
         self.assertIsNotNone(pipeline.parser)
         self.assertIsNotNone(pipeline.chunker)
         self.assertFalse(pipeline.store_embeddings)
     def test_initialization_with_embeddings(self):
         """Test pipeline initialization with embeddings"""
         pipeline = IngestionPipeline(store_embeddings=True)
         self.assertIsNotNone(pipeline.parser)
         self.assertIsNotNone(pipeline.chunker)
         self.assertTrue(pipeline.store_embeddings)
         """Test pipeline initialization with custom embedding components"""
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         pipeline = IngestionPipeline(
             store_embeddings=True,
             embedding_service=mock_embedding_service,
+            vector_db=mock_vector_db,
         )
         self.assertEqual(pipeline.embedding_service, mock_embedding_service)
         self.assertEqual(pipeline.vector_db, mock_vector_db)
         """Test directory processing without embeddings"""
         pipeline = IngestionPipeline(store_embeddings=False)
         result = pipeline.process_directory_with_embeddings(str(self.test_dir))
         # Check response structure
         self.assertIsInstance(result, dict)
         self.assertEqual(result["status"], "success")
         self.assertFalse(result["store_embeddings"])
         self.assertIn("chunks", result)
+    @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
+    @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
+    def test_process_directory_with_embeddings(
+        self, mock_embedding_service_class, mock_vector_db_class
+    ):
         """Test directory processing with embeddings"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure mock embedding service
+        mock_embedding_service.embed_texts.return_value = [
+            [0.1, 0.2, 0.3],
+            [0.4, 0.5, 0.6],
+        ]
         # Configure mock vector database
         mock_vector_db.add_embeddings.return_value = True
         pipeline = IngestionPipeline(store_embeddings=True)
         result = pipeline.process_directory_with_embeddings(str(self.test_dir))
         # Check response structure
         self.assertIsInstance(result, dict)
         self.assertEqual(result["status"], "success")
         self.assertEqual(result["files_processed"], 2)
         self.assertGreater(result["embeddings_stored"], 0)
         self.assertTrue(result["store_embeddings"])
         # Verify embedding service was called
         mock_embedding_service.embed_texts.assert_called()
         mock_vector_db.add_embeddings.assert_called()
     def test_process_directory_nonexistent(self):
         """Test processing non-existent directory"""
         pipeline = IngestionPipeline(store_embeddings=False)
         with self.assertRaises(FileNotFoundError):
             pipeline.process_directory("/nonexistent/directory")
     def test_store_embeddings_batch_without_components(self):
         """Test batch embedding storage without embedding components"""
         pipeline = IngestionPipeline(store_embeddings=False)
         chunks = [
             {
                 "content": "Test content 1",
+                "metadata": {"chunk_id": "test1", "source": "test1.txt"},
             }
         ]
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 0)
+    @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
+    @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
+    def test_store_embeddings_batch_success(
+        self, mock_embedding_service_class, mock_vector_db_class
+    ):
         """Test successful batch embedding storage"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure mocks
+        mock_embedding_service.embed_texts.return_value = [
+            [0.1, 0.2, 0.3],
+            [0.4, 0.5, 0.6],
+        ]
         mock_vector_db.add_embeddings.return_value = True
         pipeline = IngestionPipeline(store_embeddings=True)
         chunks = [
             {
                 "content": "Test content 1",
+                "metadata": {"chunk_id": "test1", "source": "test1.txt"},
             },
             {
                 "content": "Test content 2",
+                "metadata": {"chunk_id": "test2", "source": "test2.txt"},
+            },
         ]
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 2)
         # Verify method calls
         mock_embedding_service.embed_texts.assert_called_once_with(
             ["Test content 1", "Test content 2"]
         )
         mock_vector_db.add_embeddings.assert_called_once()
+    @patch("src.ingestion.ingestion_pipeline.VectorDatabase")
+    @patch("src.ingestion.ingestion_pipeline.EmbeddingService")
+    def test_store_embeddings_batch_error_handling(
+        self, mock_embedding_service_class, mock_vector_db_class
+    ):
         """Test error handling in batch embedding storage"""
         # Mock the classes to return mock instances
         mock_embedding_service = Mock()
         mock_vector_db = Mock()
         mock_embedding_service_class.return_value = mock_embedding_service
         mock_vector_db_class.return_value = mock_vector_db
         # Configure embedding service to raise an error
         mock_embedding_service.embed_texts.side_effect = Exception("Embedding error")
         pipeline = IngestionPipeline(store_embeddings=True)
         chunks = [
             {
                 "content": "Test content 1",
+                "metadata": {"chunk_id": "test1", "source": "test1.txt"},
             }
         ]
         # Should handle error gracefully and return 0
         result = pipeline._store_embeddings_batch(chunks)
         self.assertEqual(result, 0)
         """Test that enhanced pipeline maintains backward compatibility"""
         pipeline = IngestionPipeline(store_embeddings=False)
         result = pipeline.process_directory(str(self.test_dir))
         # Should return list for backward compatibility
         self.assertIsInstance(result, list)
         self.assertGreater(len(result), 0)
         # First chunk should have expected structure
         chunk = result[0]
         self.assertIn("content", chunk)
     def tearDown(self):
         """Clean up test fixtures"""
         import shutil
         shutil.rmtree(self.temp_dir, ignore_errors=True)
 if __name__ == "__main__":
+    unittest.main()