import hashlib
import random
from typing import Any, Dict, List, Optional


class DocumentChunker:
    """Split text into fixed-size, overlapping chunks with deterministic chunk IDs.

    Chunks are produced by a sliding window of ``chunk_size`` characters that
    steps forward by ``chunk_size - overlap`` each time, so consecutive chunks
    share ``overlap`` characters of context.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, seed: Optional[int] = None):
        """
        Initialize the document chunker.

        Args:
            chunk_size: Maximum characters per chunk. Must be positive.
            overlap: Number of overlapping characters between consecutive
                chunks. Must satisfy ``0 <= overlap < chunk_size``; otherwise
                the sliding window would never advance (infinite loop) or
                would skip text.
            seed: Random seed for reproducibility.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.seed = seed
        if seed is not None:
            # NOTE: seeds the *global* random module state, not a private
            # Random instance — preserved for backward compatibility with
            # any downstream code relying on this side effect.
            random.seed(seed)

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """
        Chunk text into overlapping segments.

        Args:
            text: Input text to chunk.

        Returns:
            List of chunk dictionaries, each with a ``"content"`` string and
            a ``"metadata"`` dict holding ``chunk_index``, ``start_pos``,
            ``end_pos`` (character offsets into ``text``) and a 12-hex-char
            ``chunk_id``. Empty or whitespace-only input yields ``[]``.
        """
        if not text.strip():
            return []

        chunks: List[Dict[str, Any]] = []
        start = 0
        chunk_index = 0

        while start < len(text):
            end = start + self.chunk_size
            chunk_content = text[start:end]

            chunks.append(
                {
                    "content": chunk_content,
                    "metadata": {
                        "chunk_index": chunk_index,
                        "start_pos": start,
                        # end may overshoot the text; clamp for accurate metadata.
                        "end_pos": min(end, len(text)),
                        "chunk_id": self._generate_chunk_id(chunk_content, chunk_index),
                    },
                }
            )

            # Step the window forward, keeping `overlap` characters of context.
            # Constructor validation guarantees this always makes progress.
            start = end - self.overlap
            chunk_index += 1

            # The final (possibly short) chunk has been emitted.
            if end >= len(text):
                break

        return chunks

    def chunk_document(self, text: str, doc_metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Chunk a document while preserving document metadata.

        Args:
            text: Document text content.
            doc_metadata: Document metadata merged into each chunk's metadata.
                If it contains a ``"filename"`` key, that value namespaces the
                chunk IDs so identical content in different documents gets
                distinct IDs.

        Returns:
            List of chunks with combined chunk- and document-level metadata.
        """
        chunks = self.chunk_text(text)

        for chunk in chunks:
            chunk["metadata"].update(doc_metadata)
            # Regenerate the ID with the filename so it is unique per document.
            chunk["metadata"]["chunk_id"] = self._generate_chunk_id(
                chunk["content"],
                chunk["metadata"]["chunk_index"],
                doc_metadata.get("filename", "unknown"),
            )

        return chunks

    def _generate_chunk_id(self, content: str, chunk_index: int, filename: str = "") -> str:
        """Generate a deterministic 12-hex-char chunk ID.

        Fix: the filename parameter was previously ignored (the literal
        string ``"(unknown)"`` was hard-coded), so chunks with the same
        content and index collided across documents.
        """
        id_string = f"{filename}_{chunk_index}_{content[:50]}"
        # MD5 is used only as a cheap fingerprint here, not for security.
        return hashlib.md5(id_string.encode()).hexdigest()[:12]