Spaces:
Sleeping
Sleeping
| from src.ingestion.document_chunker import DocumentChunker | |
def test_chunk_by_characters():
    """Check that character-based chunking splits text and overlaps chunks.

    A 250-character text is chunked with ``chunk_size=50`` and ``overlap=10``.
    The test expects more than one chunk, no chunk longer than 50 characters,
    and the tail of the first chunk to reappear near the start of the second.
    """
    chunker = DocumentChunker(chunk_size=50, overlap=10)
    text = "This is a test document. " * 10  # 250 characters

    chunks = chunker.chunk_text(text)

    assert len(chunks) > 1  # Should create multiple chunks
    assert all(len(chunk["content"]) <= 50 for chunk in chunks)

    # Test overlap. The length assertion above already guarantees a second
    # chunk, so no extra guard is needed: the last 10 characters of the first
    # chunk must appear within the first 20 characters of the second.
    assert chunks[0]["content"][-10:] in chunks[1]["content"][:20]
def test_chunk_with_metadata():
    """Every chunk keeps the document's metadata and gains chunk fields."""
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {
        "filename": "test.txt",
        "file_type": "txt",
        "source_id": "doc_001",
    }
    text = "Content that will be chunked. " * 20

    for piece in chunker.chunk_document(text, doc_metadata):
        meta = piece["metadata"]
        # Fields copied from the document-level metadata.
        assert meta["filename"] == "test.txt"
        assert meta["file_type"] == "txt"
        # Fields expected to be added per chunk.
        assert "chunk_id" in meta
        assert "chunk_index" in meta
def test_reproducible_chunking():
    """Two chunkers configured with the same seed yield identical chunks."""
    # Both chunkers are built before either is used, with equal settings.
    first_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)
    second_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)
    text = "This text will be chunked reproducibly. " * 30

    first_run = first_chunker.chunk_text(text)
    second_run = second_chunker.chunk_text(text)

    # Same number of chunks, and pairwise identical content.
    assert len(first_run) == len(second_run)
    for left, right in zip(first_run, second_run):
        assert left["content"] == right["content"]
def test_empty_text_chunking():
    """Empty text yields no chunks; text shorter than a chunk yields one."""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    # Empty text
    empty_result = chunker.chunk_text("")
    assert len(empty_result) == 0

    # Very short text
    short_result = chunker.chunk_text("Short")
    assert len(short_result) == 1
    assert short_result[0]["content"] == "Short"
def test_chunk_real_policy_content():
    """Chunk policy-style Markdown and verify size, metadata, and overlap.

    The sample text imitates a policy document (headings, bold fields, body
    text) and is repeated so that it is longer than one 500-character chunk.
    The test expects several chunks, each carrying the document metadata, a
    sequential ``chunk_index``, a ``chunk_id``, and at most 500 characters,
    and expects the end of the first chunk to reappear early in the second.
    """
    chunker = DocumentChunker(chunk_size=500, overlap=100, seed=42)

    # Use content that resembles our policy documents
    policy_intro = (
        "# HR-POL-001: Employee Handbook\n"
        "**Effective Date:** 2025-01-01\n"
        "**Revision:** 1.1\n"
        "**Owner:** Human Resources\n"
        "## 1. Introduction\n"
        "### 1.1. A Message from Our CEO\n"
        "Welcome to Innovate Inc.! We are thrilled to have you as part of our team."
    )
    policy_conduct = (
        "\n"
        "## 2. Company Policies\n"
        "### 2.1. Code of Conduct\n"
        "All employees must adhere to our code of conduct."
    )
    # Three copies make the document span more than one chunk.
    policy_content = (policy_intro + policy_conduct) * 3

    doc_metadata = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    chunks = chunker.chunk_document(policy_content, doc_metadata)

    # Verify chunking worked
    assert len(chunks) > 1

    # Verify all chunks have proper metadata
    for i, chunk in enumerate(chunks):
        assert chunk["metadata"]["filename"] == "employee_handbook.md"
        assert chunk["metadata"]["file_type"] == "md"
        assert chunk["metadata"]["chunk_index"] == i
        assert "chunk_id" in chunk["metadata"]
        assert len(chunk["content"]) <= 500

    # Verify overlap exists between consecutive chunks. A second chunk is
    # guaranteed by the length assertion above, so no extra guard is needed:
    # the last 100 characters of the first chunk must appear within the
    # first 200 characters of the second.
    assert chunks[0]["content"][-100:] in chunks[1]["content"][:200]
def test_chunk_metadata_inheritance():
    """Chunks keep all document metadata and add chunk-specific fields."""
    chunker = DocumentChunker(chunk_size=100, overlap=20)
    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }
    text = "Policy content goes here. " * 20

    for piece in chunker.chunk_document(text, doc_metadata):
        meta = piece["metadata"]

        # Original metadata should be preserved
        assert meta["filename"] == "test_policy.md"
        assert meta["file_type"] == "md"
        assert meta["file_size"] == 1500
        assert meta["file_path"] == "/absolute/path/to/test_policy.md"

        # New chunk-specific metadata should be added
        for added_key in ("chunk_index", "chunk_id", "start_pos", "end_pos"):
            assert added_key in meta