# preprocess.py
"""Helpers for loading the Telugu subset of Kathbath and cleaning transcripts."""
import re

from datasets import Audio, load_dataset

# Anything that is neither a Telugu-block code point (U+0C00-U+0C7F) nor
# whitespace.  Whitespace is kept here so it can act as a word separator and be
# collapsed afterwards; ASCII punctuation needs no separate pass because it is
# already outside the Telugu block.
_NON_TELUGU = re.compile(r"[^\u0C00-\u0C7F\s]+")


def _is_telugu(example):
    """Return True when the example's ``language`` field equals "telugu".

    The comparison is case-insensitive.  A missing or ``None`` value counts as
    "not Telugu" rather than raising ``AttributeError`` on ``.lower()``.
    """
    return (example.get("language") or "").lower() == "telugu"


def load_telugu_dataset(sampling_rate=16000):
    """Load Kathbath, keep the Telugu examples and resample their audio.

    The ``train``, ``validation`` and ``test`` splits are concatenated into a
    single dataset before filtering.

    NOTE(review): this assumes the loaded dataset exposes a ``language`` column
    and an ``audio`` column -- confirm against the dataset card.

    Args:
        sampling_rate: Target sampling rate in Hz for the ``audio`` column.
            Defaults to 16000, the value that was previously hard-coded.

    Returns:
        The filtered dataset with its ``audio`` column cast to
        ``Audio(sampling_rate=sampling_rate)``.
    """
    corpus = load_dataset("ai4bharat/Kathbath", split="train+validation+test")
    telugu = corpus.filter(_is_telugu)
    return telugu.cast_column("audio", Audio(sampling_rate=sampling_rate))


def normalize_text(text: str) -> str:
    """Reduce *text* to Telugu characters separated by single spaces.

    Every character outside the Telugu Unicode block (U+0C00-U+0C7F) is
    removed, including punctuation, Latin letters and ASCII digits.
    Whitespace of any kind is treated as a word separator: runs are collapsed
    to one space and leading/trailing whitespace is dropped.

    Args:
        text: Raw transcript string.

    Returns:
        The cleaned transcript; an empty string if nothing Telugu remains.
    """
    telugu_only = _NON_TELUGU.sub("", text)
    # Trim and collapse *after* filtering: deleting a foreign word leaves its
    # neighbouring spaces behind, which would otherwise survive as leading,
    # trailing or doubled spaces.
    return " ".join(telugu_only.split())