Spaces:
Sleeping
Sleeping
| # preprocess.py | |
| import re | |
| import unicodedata | |
def clean_text_english(text):
    """Reduce *text* to plain English characters with tidy spacing.

    Steps, in order:
      1. Apply Unicode NFKC normalization to the input.
      2. Replace every character that is not an ASCII letter, a digit,
         whitespace, or one of ``. , ; : ! ? ( ) ' - " @ % $ &`` with a
         single space.
      3. Collapse each run of whitespace (including newlines) to one
         space and trim both ends.

    Returns the cleaned string.
    """
    normalized = unicodedata.normalize("NFKC", text)

    # Anything outside the allowed set becomes a space (not removed), so
    # the words on either side of a dropped character stay separated.
    disallowed = re.compile(r"[^A-Za-z0-9\s.,;:!?()'\-\"@%$&]")
    english_only = disallowed.sub(" ", normalized)

    # The substitution above can leave several spaces in a row; squeeze
    # them together with any original whitespace runs.
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', english_only)
    return collapsed.strip()
def chunk_text(text, max_length=1000, overlap=200):
    """Split *text* into overlapping windows.

    Windows start at offsets ``0, step, 2 * step, ...`` where
    ``step = max_length - overlap``, and each window holds at most
    ``max_length`` characters. Consecutive windows therefore share
    ``overlap`` characters. The final window may be shorter than
    ``max_length``.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per window; must be
            positive.
        overlap: Number of characters shared between consecutive
            windows; must be smaller than ``max_length``.

    Returns:
        A list of substrings in order of their start offset. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``max_length`` is not positive, or if ``overlap``
            is not smaller than ``max_length``.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if overlap >= max_length:
        # A step of zero or less would never advance the window start,
        # so the loop over the text could not terminate.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than max_length ({max_length})"
        )

    step = max_length - overlap
    # Slicing clamps the end index to len(text), so the last window is
    # simply truncated at the end of the string.
    return [text[start:start + max_length] for start in range(0, len(text), step)]