Spaces:
Sleeping
Sleeping
| import hashlib | |
| import random | |
| from typing import Any, Dict, List, Optional | |
class DocumentChunker:
    """Split text into fixed-size, overlapping character chunks.

    Every chunk is a dict with a ``"content"`` string and a ``"metadata"``
    dict holding the chunk index, its character offsets in the source text
    and a deterministic chunk ID.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        overlap: int = 200,
        seed: Optional[int] = None,
    ):
        """Initialize the document chunker.

        Args:
            chunk_size: Maximum characters per chunk.
            overlap: Number of characters shared by consecutive chunks.
            seed: Optional seed. When given it is passed to ``random.seed``;
                the chunking logic itself does not draw random numbers.
        """
        self.chunk_size = chunk_size
        self.overlap = overlap
        self.seed = seed
        if seed is not None:
            # NOTE: this reseeds the process-wide ``random`` generator, not a
            # private one. Kept as-is because callers may rely on it.
            random.seed(seed)

    def chunk_text(self, text: str) -> List[Dict[str, Any]]:
        """Chunk text into overlapping segments.

        Args:
            text: Input text to chunk.

        Returns:
            List of chunk dictionaries with content and basic metadata
            (``chunk_index``, ``start_pos``, ``end_pos``, ``chunk_id``).
            Empty, whitespace-only or ``None`` input yields an empty list.

        Raises:
            ValueError: If ``chunk_size`` is not positive, or if ``overlap``
                is not smaller than ``chunk_size`` while the text needs more
                than one chunk (the window could never advance).
        """
        if not text or not text.strip():
            return []

        text_len = len(text)
        if self.chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive integer, got {self.chunk_size}"
            )

        # Distance the window moves between consecutive chunks.
        stride = self.chunk_size - self.overlap
        if stride <= 0 and text_len > self.chunk_size:
            # Without this guard the loop below would re-emit the same
            # window forever, because the start position never moves forward.
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

        chunks: List[Dict[str, Any]] = []
        start = 0
        chunk_index = 0
        while start < text_len:
            end = start + self.chunk_size
            chunk_content = text[start:end]
            chunks.append(
                {
                    "content": chunk_content,
                    "metadata": {
                        "chunk_index": chunk_index,
                        "start_pos": start,
                        # Clamp so the last chunk reports the real text end.
                        "end_pos": min(end, text_len),
                        "chunk_id": self._generate_chunk_id(chunk_content, chunk_index),
                    },
                }
            )

            # The window already reached the end of the text: stop here so no
            # trailing chunk made only of overlap is produced.
            if end >= text_len:
                break

            start += stride
            chunk_index += 1

        return chunks

    def chunk_document(self, text: str, doc_metadata: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Chunk a document while preserving document metadata.

        Args:
            text: Document text content.
            doc_metadata: Document metadata merged into every chunk's
                metadata. Its keys overwrite chunk-level keys of the same
                name; ``chunk_id`` is always regenerated afterwards.

        Returns:
            List of chunks with combined metadata.

        Raises:
            ValueError: Propagated from ``chunk_text`` for an invalid
                ``chunk_size`` / ``overlap`` configuration.
        """
        chunks = self.chunk_text(text)
        # The filename is the same for every chunk, so look it up once.
        filename = doc_metadata.get("filename", "unknown")

        for chunk in chunks:
            metadata = chunk["metadata"]
            metadata.update(doc_metadata)
            # Rebuild the ID so it also depends on the source document.
            metadata["chunk_id"] = self._generate_chunk_id(
                chunk["content"],
                metadata["chunk_index"],
                filename,
            )

        return chunks

    def _generate_chunk_id(self, content: str, chunk_index: int, filename: str = "") -> str:
        """Generate a deterministic chunk ID.

        The ID is the first 12 hex digits of the MD5 digest of the filename,
        the chunk index and the first 50 characters of the content. MD5 is
        used here only as a fingerprint, not for security.
        """
        id_string = f"{filename}_{chunk_index}_{content[:50]}"
        return hashlib.md5(id_string.encode()).hexdigest()[:12]