"""Tests for ``DocumentChunker``: chunk sizing, overlap, metadata, determinism."""

from src.ingestion.document_chunker import DocumentChunker


def _tail_reappears(earlier, later, tail_len, head_len):
    """Check that the end of one chunk shows up near the start of the next.

    Args:
        earlier: Chunk dict whose trailing ``tail_len`` characters are looked up.
        later: Chunk dict whose leading ``head_len`` characters are searched.
        tail_len: Number of trailing characters taken from ``earlier``.
        head_len: Size of the leading window of ``later`` that is searched.

    Returns:
        True when the tail of ``earlier["content"]`` is a substring of the
        leading window of ``later["content"]``.
    """
    return earlier["content"][-tail_len:] in later["content"][:head_len]


def test_chunk_by_characters():
    """Long text is split into several size-bounded, overlapping chunks."""
    size_limit, shared = 50, 10
    splitter = DocumentChunker(chunk_size=size_limit, overlap=shared)
    sample = "This is a test document. " * 10  # 25 chars x 10 = 250 characters

    pieces = splitter.chunk_text(sample)

    # The sample is five times the size limit, so one chunk cannot hold it.
    assert len(pieces) > 1
    for piece in pieces:
        assert len(piece["content"]) <= size_limit

    # The assertion above guarantees a second chunk exists, so the first
    # pair can be compared without a further length guard.
    assert _tail_reappears(pieces[0], pieces[1], tail_len=shared, head_len=20)


def test_chunk_with_metadata():
    """Every chunk carries the document's metadata plus chunk identifiers."""
    splitter = DocumentChunker(chunk_size=100, overlap=20)
    source_info = {
        "filename": "test.txt",
        "file_type": "txt",
        "source_id": "doc_001",
    }
    body = "Content that will be chunked. " * 20

    for piece in splitter.chunk_document(body, source_info):
        meta = piece["metadata"]
        assert meta["filename"] == "test.txt"
        assert meta["file_type"] == "txt"
        assert "chunk_id" in meta
        assert "chunk_index" in meta


def test_reproducible_chunking():
    """Two chunkers built with the same seed yield identical chunk content."""
    first, second = (
        DocumentChunker(chunk_size=100, overlap=20, seed=42) for _ in range(2)
    )
    sample = "This text will be chunked reproducibly. " * 30

    run_a = first.chunk_text(sample)
    run_b = second.chunk_text(sample)

    # Lengths are compared first so the content comparison below is pairwise.
    assert len(run_a) == len(run_b)
    assert [c["content"] for c in run_a] == [c["content"] for c in run_b]


def test_empty_text_chunking():
    """Empty input gives no chunks; input below chunk size gives exactly one."""
    splitter = DocumentChunker(chunk_size=100, overlap=20)

    # Empty text
    assert len(splitter.chunk_text("")) == 0

    # Very short text
    tiny = splitter.chunk_text("Short")
    assert len(tiny) == 1
    assert tiny[0]["content"] == "Short"


def test_chunk_real_policy_content():
    """Policy-style markdown is chunked with correct metadata and overlap."""
    size_limit, shared = 500, 100
    splitter = DocumentChunker(chunk_size=size_limit, overlap=shared, seed=42)

    # Fixture text modelled on the project's policy documents.
    # NOTE(review): section breaks are single spaces, exactly as they appear
    # in the reviewed text -- confirm whether newlines were intended here.
    policy_intro = (
        "# HR-POL-001: Employee Handbook "
        "**Effective Date:** 2025-01-01 "
        "**Revision:** 1.1 "
        "**Owner:** Human Resources "
        "## 1. Introduction "
        "### 1.1. A Message from Our CEO "
        "Welcome to Innovate Inc.! "
        "We are thrilled to have you as part of our team."
    )
    policy_conduct = (
        " ## 2. Company Policies "
        "### 2.1. Code of Conduct "
        "All employees must adhere to our code of conduct."
    )
    handbook = (policy_intro + policy_conduct) * 3

    source_info = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    pieces = splitter.chunk_document(handbook, source_info)

    # Chunking must actually have split the document.
    assert len(pieces) > 1

    # Each chunk keeps the document metadata and is numbered in order.
    for position, piece in enumerate(pieces):
        meta = piece["metadata"]
        assert meta["filename"] == "employee_handbook.md"
        assert meta["file_type"] == "md"
        assert meta["chunk_index"] == position
        assert "chunk_id" in meta
        assert len(piece["content"]) <= size_limit

    # A second chunk is guaranteed by the length assertion above.
    assert _tail_reappears(pieces[0], pieces[1], tail_len=shared, head_len=200)


def test_chunk_metadata_inheritance():
    """Chunks keep every document metadata field and gain positional fields."""
    splitter = DocumentChunker(chunk_size=100, overlap=20)

    # Kept separate from the dict handed to the chunker so the expectations
    # cannot be affected by anything the chunker does to its argument.
    inherited = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }
    added_keys = ("chunk_index", "chunk_id", "start_pos", "end_pos")
    body = "Policy content goes here. " * 20

    pieces = splitter.chunk_document(body, dict(inherited))

    for piece in pieces:
        meta = piece["metadata"]

        # Original metadata should be preserved
        for field, expected in inherited.items():
            assert meta[field] == expected

        # New chunk-specific metadata should be added
        for field in added_keys:
            assert field in meta