# preprocess.py import re import unicodedata def clean_text_english(text): """ 1) Normalize unicode 2) Keep only English letters, digits, punctuation, basic symbols 3) Remove extra spaces / newlines """ # Normalize unicode text = unicodedata.normalize("NFKC", text) # Remove non-English characters (keep basic punctuation and digits) text = re.sub(r"[^A-Za-z0-9\s.,;:!?()'\-\"@%$&]", " ", text) # Replace multiple spaces/newlines with single space text = re.sub(r'\s+', ' ', text) return text.strip() def chunk_text(text, max_length=1000, overlap=200): """ Chunk text into overlapping windows """ chunks = [] start = 0 while start < len(text): end = min(start + max_length, len(text)) chunks.append(text[start:end]) start += max_length - overlap return chunks