msse-ai-engineering / src /ingestion /document_chunker.py
sethmcknight
Refactor test cases for improved readability and consistency
159faf0
raw
history blame
2.98 kB
import hashlib
import random
from typing import Any, Dict, List, Optional
class DocumentChunker:
    """Split text into fixed-size, overlapping character chunks.

    Every chunk is a dict with a ``"content"`` string and a ``"metadata"``
    dict holding the chunk index, its character offsets in the source text
    and a deterministic chunk ID.
    """

    def __init__(self, chunk_size: int = 1000, overlap: int = 200, seed: Optional[int] = None):
        """Initialize the document chunker.

        Args:
            chunk_size: Maximum characters per chunk.
            overlap: Number of characters shared by consecutive chunks.
            seed: Optional random seed. When given, it is passed to
                ``random.seed`` and therefore reseeds the process-wide
                ``random`` module state.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.seed = seed
        if seed is not None:
            # Side effect on the global RNG; the chunking methods of this
            # class do not draw random numbers themselves.
            random.seed(seed)

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Chunk text into overlapping segments.

        Args:
            text: Input text to chunk.

        Returns:
            List of chunk dictionaries. Each has ``"content"`` (the slice of
            ``text``) and ``"metadata"`` with ``chunk_index``, ``start_pos``,
            ``end_pos`` (clamped to ``len(text)``) and ``chunk_id``. An empty
            list is returned when ``text`` is empty or whitespace-only.

        Raises:
            ValueError: If ``overlap >= chunk_size`` and ``text`` is longer
                than one chunk. The window could never move forward in that
                configuration, so chunking would not terminate.
        """
        if not text.strip():
            return []

        text_length = len(text)
        # Distance the window start moves on each iteration.
        step = self.chunk_size - self.overlap
        if step <= 0 and self.chunk_size < text_length:
            # Without this guard the loop below would spin forever, appending
            # the same chunk until memory is exhausted.
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than chunk_size "
                f"({self.chunk_size}) to chunk text of length {text_length}"
            )

        chunks: List[Dict[str, Any]] = []
        start = 0
        chunk_index = 0
        while start < text_length:
            end = start + self.chunk_size
            chunk_content = text[start:end]
            chunks.append(
                {
                    "content": chunk_content,
                    "metadata": {
                        "chunk_index": chunk_index,
                        "start_pos": start,
                        "end_pos": min(end, text_length),
                        "chunk_id": self._generate_chunk_id(chunk_content, chunk_index),
                    },
                }
            )
            # The chunk just emitted already reaches the end of the text;
            # stopping here avoids a trailing chunk made only of overlap.
            if end >= text_length:
                break
            start += step
            chunk_index += 1
        return chunks

    def chunk_document(self, text: str, doc_metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk a document while preserving document metadata.

        Args:
            text: Document text content.
            doc_metadata: Document metadata merged into every chunk's
                metadata. Keys that collide with the chunk's own metadata
                keys replace them. The ``"filename"`` entry (default
                ``"unknown"``) is folded into the chunk ID.

        Returns:
            List of chunks with combined metadata and a chunk ID derived
            from the filename, the chunk index and the chunk content.

        Raises:
            ValueError: Propagated from ``chunk_text`` when the configured
                overlap makes chunking impossible for this text.
        """
        chunks = self.chunk_text(text)
        filename = doc_metadata.get("filename", "unknown")
        for chunk in chunks:
            metadata = chunk["metadata"]
            metadata.update(doc_metadata)
            # Recompute the ID so it also depends on the source document,
            # not only on the chunk's position and content.
            metadata["chunk_id"] = self._generate_chunk_id(
                chunk["content"],
                metadata["chunk_index"],
                filename,
            )
        return chunks

    def _generate_chunk_id(self, content: str, chunk_index: int, filename: str = "") -> str:
        """Generate a deterministic chunk ID.

        Args:
            content: Chunk text; only its first 50 characters are hashed.
            chunk_index: Position of the chunk within its text.
            filename: Optional source filename mixed into the ID.

        Returns:
            The first 12 hex digits of the MD5 digest of
            ``"{filename}_{chunk_index}_{content[:50]}"``.
        """
        id_string = f"{filename}_{chunk_index}_{content[:50]}"
        return hashlib.md5(id_string.encode()).hexdigest()[:12]