"""Tests for ``DocumentParser.parse_document``.

Covers plain-text and markdown input, the unsupported-format and
missing-file error paths, and one document from the policy corpus.
"""

import contextlib
import os
import tempfile
from pathlib import Path
from typing import Iterator

import pytest


def _new_parser():
    """Return a fresh ``DocumentParser`` instance.

    The import is deferred to call time so that a missing or broken parser
    module fails the individual test that needs it instead of aborting
    collection of the whole file.
    """
    from src.ingestion.document_parser import DocumentParser

    return DocumentParser()


@contextlib.contextmanager
def _temp_document(content: str, suffix: str) -> Iterator[str]:
    """Write *content* to a temporary file and yield its path.

    Args:
        content: Text to store in the file.
        suffix: File extension including the dot (e.g. ``".txt"``).

    Yields:
        The path of the temporary file, as a string.

    The file is removed when the ``with`` block exits, including when
    writing the content or the body of the block raises.
    """
    # delete=False: the file must outlive the handle so the parser can
    # reopen it by name; removal is done explicitly in ``finally`` below.
    handle = tempfile.NamedTemporaryFile(
        mode="w", suffix=suffix, delete=False, encoding="utf-8"
    )
    try:
        with handle:
            handle.write(content)
        yield handle.name
    finally:
        os.unlink(handle.name)


def test_parse_txt_file():
    """Test parsing a simple text file"""
    parser = _new_parser()
    content = "This is a test policy document.\nIt has multiple lines."

    with _temp_document(content, ".txt") as temp_path:
        result = parser.parse_document(temp_path)

        assert result["content"] == content
        assert result["metadata"]["filename"] == Path(temp_path).name
        assert result["metadata"]["file_type"] == "txt"


def test_parse_markdown_file():
    """Test parsing a markdown file"""
    parser = _new_parser()
    # One construct per line: markdown headings are only headings when they
    # start a line, so the fixture is built from explicit lines.
    markdown_content = "\n".join(
        [
            "# Policy Title",
            "",
            "## Section 1",
            "This is section content.",
            "",
            "### Subsection",
            "More content here.",
        ]
    )

    with _temp_document(markdown_content, ".md") as temp_path:
        result = parser.parse_document(temp_path)

        assert "Policy Title" in result["content"]
        assert "Section 1" in result["content"]
        assert result["metadata"]["file_type"] == "md"


def test_parse_unsupported_format():
    """Test handling of unsupported file formats"""
    parser = _new_parser()

    # NOTE(review): "test.xyz" does not exist on disk, so this expects the
    # parser to reject the extension before checking for the file - confirm.
    with pytest.raises(ValueError, match="Unsupported file format"):
        parser.parse_document("test.xyz")


def test_parse_nonexistent_file():
    """Test handling of non-existent files"""
    parser = _new_parser()

    with pytest.raises(FileNotFoundError):
        parser.parse_document("nonexistent.txt")


def test_parse_real_policy_document():
    """Test parsing an actual policy document from our corpus"""
    parser = _new_parser()

    # Use a real policy document from our corpus.
    # The path is relative, so it is resolved against the current working
    # directory of the pytest process.
    policy_path = "synthetic_policies/employee_handbook.md"
    result = parser.parse_document(policy_path)
    metadata = result["metadata"]

    # Verify content structure
    assert "employee_handbook.md" in metadata["filename"]
    assert metadata["file_type"] == "md"
    assert "Employee Handbook" in result["content"]
    assert "HR-POL-001" in result["content"]
    assert len(result["content"]) > 100  # Should have substantial content

    # Verify metadata completeness
    assert "file_size" in metadata
    assert "file_path" in metadata
    assert metadata["file_size"] > 0