# preprocess.py
import re
from datasets import load_dataset, Audio

def load_telugu_dataset():
    """Load the Kathbath corpus and return only its Telugu examples.

    The "train", "validation" and "test" splits are requested as one
    combined split. Examples are kept when their "language" field equals
    "telugu" case-insensitively; examples without that field are dropped.
    The "audio" column is then cast to an ``Audio`` feature with
    ``sampling_rate=16000``.

    Returns:
        The filtered dataset with the re-cast "audio" column.
    """

    def _is_telugu(example):
        # A missing "language" key falls back to "" and is rejected.
        return example.get("language", "").lower() == "telugu"

    combined = load_dataset("ai4bharat/Kathbath", split="train+validation+test")
    telugu_only = combined.filter(_is_telugu)
    return telugu_only.cast_column("audio", Audio(sampling_rate=16000))

# Anything that is neither a Telugu-block code point (U+0C00-U+0C7F) nor
# whitespace. Punctuation, Latin letters, ASCII digits, etc. all match.
_NON_TELUGU_RE = re.compile(r"[^\u0C00-\u0C7F\s]+")
# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text):
    """Reduce *text* to Telugu characters separated by single spaces.

    Every character outside the Telugu Unicode block (U+0C00-U+0C7F) that
    is not whitespace is deleted without inserting a separator, so
    punctuation between two Telugu letters joins them. Whitespace runs of
    any kind are then collapsed to one space and the result is trimmed.

    Args:
        text: The raw transcript string.

    Returns:
        The normalized string; empty if no Telugu characters remain.
    """
    telugu_only = _NON_TELUGU_RE.sub("", text)
    # Collapse and trim AFTER filtering: removing non-Telugu tokens can
    # expose leading/trailing whitespace or leave doubled spaces behind.
    return _WHITESPACE_RE.sub(" ", telugu_only).strip()