kochi-metro-backend / preprocess.py
yashita13's picture
Upload 17 files
e69d432 verified
raw
history blame contribute delete
897 Bytes
# preprocess.py
import re
import unicodedata
def clean_text_english(text) -> str:
    """
    Reduce text to a single-line, English-only form.

    1) Normalize unicode (NFKC), which folds compatibility characters such
       as full-width letters into their plain equivalents
    2) Keep only English letters, digits, whitespace and the symbols
       . , ; : ! ? ( ) ' - " @ % $ &  (every other character becomes a space)
    3) Collapse runs of spaces / newlines into one space and trim both ends

    Args:
        text: The raw string to clean. None or "" is accepted.

    Returns:
        The cleaned string; "" when the input is None or empty.
    """
    # unicodedata.normalize raises TypeError on None, so bail out early.
    if not text:
        return ""
    # Normalize unicode
    normalized = unicodedata.normalize("NFKC", text)
    # Replace with a space (rather than delete) so that words separated
    # only by a dropped character do not get glued together.
    ascii_only = re.sub(r"[^A-Za-z0-9\s.,;:!?()'\-\"@%$&]", " ", normalized)
    # Replace multiple spaces/newlines with single space
    collapsed = re.sub(r'\s+', ' ', ascii_only)
    return collapsed.strip()
def chunk_text(text, max_length: int = 1000, overlap: int = 200) -> list:
    """
    Chunk text into overlapping windows.

    Each window holds at most ``max_length`` characters and starts
    ``max_length - overlap`` characters after the previous one, so that
    consecutive windows share ``overlap`` characters.

    Args:
        text: The string to split. Empty (or None) yields an empty list.
        max_length: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy 0 <= overlap < max_length.

    Returns:
        A list of chunks in order of appearance. The last chunk ends exactly
        at the end of ``text`` and may be shorter than ``max_length``.

    Raises:
        ValueError: If ``max_length`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_length``.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    # A step of zero or less would never advance the window (endless loop);
    # a negative overlap would silently skip characters between chunks.
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")
    if not text:
        return []

    total = len(text)
    step = max_length - overlap
    chunks = []
    start = 0
    while start < total:
        end = min(start + max_length, total)
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window
        # would only repeat the tail of this one, so stop here.
        if end == total:
            break
        start += step
    return chunks