File size: 6,123 Bytes
ffa0f3d
7793bb6
ffa0f3d
7793bb6
 
 
ffa0f3d
 
7793bb6
ffa0f3d
 
 
 
 
 
 
7793bb6
 
 
 
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
 
 
 
 
ffa0f3d
 
 
 
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
7793bb6
 
 
ffa0f3d
 
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
 
 
 
 
 
 
ffa0f3d
 
 
 
 
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
ffa0f3d
 
 
 
7793bb6
 
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
 
7793bb6
 
 
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
 
 
7793bb6
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
ffa0f3d
7793bb6
 
ffa0f3d
7793bb6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import tempfile
from pathlib import Path

import pytest

from src.ingestion.ingestion_pipeline import IngestionPipeline


def test_full_ingestion_pipeline():
    """End-to-end check: the pipeline chunks a mixed .txt/.md directory."""
    with tempfile.TemporaryDirectory() as workdir:
        # Seed the directory with one plain-text and one markdown document.
        root = Path(workdir)
        (root / "policy1.txt").write_text(
            "This is a text policy document with important information."
        )
        (root / "policy2.md").write_text("# Markdown Policy\n\nThis is markdown content.")

        pipeline = IngestionPipeline(chunk_size=50, overlap=10, seed=42)
        chunks = pipeline.process_directory(workdir)

        # Two input files should yield at least two chunks in total.
        assert len(chunks) >= 2

        # Every chunk carries content plus the expected metadata keys.
        for chunk in chunks:
            assert "content" in chunk
            assert "metadata" in chunk
            meta = chunk["metadata"]
            assert "chunk_id" in meta
            assert "filename" in meta


def test_pipeline_reproducibility():
    """Two identically-configured pipelines must emit identical chunks."""
    with tempfile.TemporaryDirectory() as temp_dir:
        (Path(temp_dir) / "test.txt").write_text(
            "Test content for reproducibility. " * 20
        )

        # Run the same directory through two pipelines built with the same
        # chunking parameters and seed.
        first, second = (
            IngestionPipeline(chunk_size=100, overlap=20, seed=42).process_directory(
                temp_dir
            )
            for _ in range(2)
        )

        assert len(first) == len(second)
        for a, b in zip(first, second):
            assert a["content"] == b["content"]
            assert a["metadata"]["chunk_id"] == b["metadata"]["chunk_id"]


def test_pipeline_with_real_corpus():
    """Test pipeline with actual policy documents.

    Processes the whole synthetic_policies/ corpus and verifies chunk
    count and metadata structure. Skipped when the corpus is absent.
    """
    corpus_dir = "synthetic_policies"

    # Guard clause first: skip before constructing the pipeline so no
    # setup work is wasted when the corpus directory is missing.
    if not Path(corpus_dir).exists():
        pytest.skip("Corpus directory not found - test requires synthetic_policies/")

    pipeline = IngestionPipeline(chunk_size=1000, overlap=200, seed=42)
    results = pipeline.process_directory(corpus_dir)

    # 22 source documents should produce well over 20 chunks.
    assert len(results) > 20

    # Every chunk must expose the full metadata contract; the corpus is
    # markdown-only, so file_type is always "md".
    for result in results:
        assert "content" in result
        assert "metadata" in result
        meta = result["metadata"]
        assert "chunk_id" in meta
        assert "filename" in meta
        assert "file_type" in meta
        assert meta["file_type"] == "md"
        assert "chunk_index" in meta


def test_pipeline_error_handling():
    """Unsupported file types are skipped instead of aborting the run."""
    with tempfile.TemporaryDirectory() as temp_dir:
        base = Path(temp_dir)
        # One supported markdown file and one with an unknown extension.
        (base / "valid.md").write_text("# Valid Policy\n\nThis is valid content.")
        (base / "invalid.xyz").write_text("This file has unsupported format.")

        pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)
        results = pipeline.process_directory(temp_dir)

        # The markdown file alone should contribute at least one chunk,
        # and nothing from the unsupported file should appear.
        assert len(results) >= 1
        assert all(r["metadata"]["filename"] == "valid.md" for r in results)


def test_pipeline_single_file():
    """process_file splits one long markdown file into tagged chunks."""
    pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)

    # delete=False so the file survives the context manager and can be
    # re-opened by the pipeline; it is removed explicitly in the finally.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write("# Test Policy\n\n" + "Content section. " * 20)
        temp_path = handle.name

    try:
        chunks = pipeline.process_file(temp_path)

        # Content is longer than chunk_size, so multiple chunks must result.
        assert len(chunks) > 1

        expected_name = Path(temp_path).name
        for chunk in chunks:
            meta = chunk["metadata"]
            assert meta["filename"] == expected_name
            assert meta["file_type"] == "md"
            assert "chunk_index" in meta
    finally:
        os.unlink(temp_path)


def test_pipeline_empty_directory():
    """A directory containing no files yields an empty result list."""
    pipeline = IngestionPipeline(chunk_size=100, overlap=20, seed=42)
    with tempfile.TemporaryDirectory() as empty_dir:
        assert len(pipeline.process_directory(empty_dir)) == 0


def test_pipeline_nonexistent_directory():
    """Processing a path that does not exist raises FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        IngestionPipeline(chunk_size=100, overlap=20, seed=42).process_directory(
            "/nonexistent/directory"
        )


def test_pipeline_configuration():
    """Smaller chunk_size produces more chunks, and all chunks obey the limit."""
    # Keyed by chunk_size so the size-limit check below can reuse the key.
    pipelines = {
        50: IngestionPipeline(chunk_size=50, overlap=10, seed=42),
        200: IngestionPipeline(chunk_size=200, overlap=50, seed=42),
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.write("Policy content goes here. " * 30)  # 26 chars x 30 = 780
        temp_path = handle.name

    try:
        outputs = {
            size: pipe.process_file(temp_path) for size, pipe in pipelines.items()
        }

        # The 50-char configuration must split the text into more pieces
        # than the 200-char one.
        assert len(outputs[50]) > len(outputs[200])

        # No chunk may exceed its pipeline's configured size limit.
        for size, chunks in outputs.items():
            for chunk in chunks:
                assert len(chunk["content"]) <= size
    finally:
        os.unlink(temp_path)