from src.ingestion.document_chunker import DocumentChunker


def test_chunk_by_characters():
    """Long text is split into several size-bounded, overlapping chunks."""
    sample = "This is a test document. " * 10  # 25 chars x 10 = 250 characters
    splitter = DocumentChunker(chunk_size=50, overlap=10)

    pieces = splitter.chunk_text(sample)

    # 250 characters cannot fit into a single 50-character chunk.
    assert len(pieces) > 1

    # No chunk may be longer than the configured chunk_size.
    for piece in pieces:
        assert len(piece["content"]) <= 50

    # The last 10 characters of the first chunk (the configured overlap)
    # must reappear near the start of the second chunk.
    tail_of_first = pieces[0]["content"][-10:]
    head_of_second = pieces[1]["content"][:20]
    assert tail_of_first in head_of_second


def test_chunk_with_metadata():
    """Test that chunks preserve document metadata.

    Every chunk returned by ``chunk_document`` must carry the document's
    ``filename`` and ``file_type`` and gain ``chunk_id`` / ``chunk_index``
    entries of its own.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    doc_metadata = {"filename": "test.txt", "file_type": "txt", "source_id": "doc_001"}

    text = "Content that will be chunked. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    # Guard against a vacuous pass: without this, an empty result would skip
    # every assertion in the loop below and the test would still succeed.
    assert len(chunks) > 0, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        assert chunk["metadata"]["filename"] == "test.txt"
        assert chunk["metadata"]["file_type"] == "txt"
        assert "chunk_id" in chunk["metadata"]
        assert "chunk_index" in chunk["metadata"]


def test_reproducible_chunking():
    """Two chunkers built with the same seed yield identical chunk contents."""
    text = "This text will be chunked reproducibly. " * 30

    first_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)
    second_chunker = DocumentChunker(chunk_size=100, overlap=20, seed=42)

    first_run = first_chunker.chunk_text(text)
    second_run = second_chunker.chunk_text(text)

    # Same number of chunks first, so the pairwise comparison below cannot
    # silently ignore trailing chunks.
    assert len(first_run) == len(second_run)

    # Chunk-by-chunk, the text content must match exactly.
    for index in range(len(first_run)):
        assert first_run[index]["content"] == second_run[index]["content"]


def test_empty_text_chunking():
    """Empty input yields no chunks; input shorter than a chunk yields one."""
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    # An empty string produces nothing to chunk.
    from_empty = chunker.chunk_text("")
    assert len(from_empty) == 0

    # Text well under chunk_size comes back as a single, unmodified chunk.
    from_short = chunker.chunk_text("Short")
    assert len(from_short) == 1
    only_chunk = from_short[0]
    assert only_chunk["content"] == "Short"


def test_chunk_real_policy_content():
    """Chunk markdown resembling a real policy document and check the result."""
    # Content that resembles our policy documents.
    policy_intro = """# HR-POL-001: Employee Handbook

**Effective Date:** 2025-01-01
**Revision:** 1.1
**Owner:** Human Resources

## 1. Introduction

### 1.1. A Message from Our CEO

Welcome to Innovate Inc.! We are thrilled to have you as part of our team."""

    policy_conduct = """
## 2. Company Policies

### 2.1. Code of Conduct

All employees must adhere to our code of conduct."""

    # Repeat the two sections so the document spans several chunks.
    one_pass = policy_intro + policy_conduct
    policy_content = one_pass * 3

    doc_metadata = {
        "filename": "employee_handbook.md",
        "file_type": "md",
        "file_path": "/path/to/employee_handbook.md",
    }

    chunker = DocumentChunker(chunk_size=500, overlap=100, seed=42)
    chunks = chunker.chunk_document(policy_content, doc_metadata)

    # The document must have been split into more than one chunk.
    assert len(chunks) > 1

    # Each chunk carries the document metadata, a sequential index, an id,
    # and content no longer than the configured chunk size.
    position = 0
    for chunk in chunks:
        meta = chunk["metadata"]
        assert meta["filename"] == "employee_handbook.md"
        assert meta["file_type"] == "md"
        assert meta["chunk_index"] == position
        assert "chunk_id" in meta
        assert len(chunk["content"]) <= 500
        position += 1

    # The trailing 100 characters of the first chunk (the configured overlap)
    # must show up within the opening 200 characters of the second chunk.
    trailing = chunks[0]["content"][-100:]
    leading = chunks[1]["content"][:200]
    assert trailing in leading


def test_chunk_metadata_inheritance():
    """Test that document metadata is properly inherited by chunks.

    Each chunk must keep every document-level field unchanged and gain the
    chunk-specific ``chunk_index``, ``chunk_id``, ``start_pos`` and
    ``end_pos`` entries.
    """
    chunker = DocumentChunker(chunk_size=100, overlap=20)

    doc_metadata = {
        "filename": "test_policy.md",
        "file_type": "md",
        "file_size": 1500,
        "file_path": "/absolute/path/to/test_policy.md",
    }

    text = "Policy content goes here. " * 20
    chunks = chunker.chunk_document(text, doc_metadata)

    # Guard against a vacuous pass: without this, an empty result would skip
    # every assertion in the loop below and the test would still succeed.
    assert len(chunks) > 0, "chunk_document returned no chunks for non-empty text"

    for chunk in chunks:
        # Original metadata should be preserved
        assert chunk["metadata"]["filename"] == "test_policy.md"
        assert chunk["metadata"]["file_type"] == "md"
        assert chunk["metadata"]["file_size"] == 1500
        expected_path = "/absolute/path/to/test_policy.md"
        assert chunk["metadata"]["file_path"] == expected_path

        # New chunk-specific metadata should be added
        assert "chunk_index" in chunk["metadata"]
        assert "chunk_id" in chunk["metadata"]
        assert "start_pos" in chunk["metadata"]
        assert "end_pos" in chunk["metadata"]