telugu-asr-xlsr / vocab.py
saikamal1108's picture
Create vocab.py
b109215 verified
raw
history blame
443 Bytes
# vocab.py
import json
def build_vocab(dataset, text_col="text", save_path="vocab.json"):
    """Build a character-level vocabulary from *dataset* and save it as JSON.

    Every row of *dataset* is queried with ``row.get(text_col)``; rows whose
    value is missing or falsy (``None``, ``""``) are skipped. The distinct
    characters of the remaining texts are sorted and numbered from 0, then
    the special tokens ``"[PAD]"`` and ``"[UNK]"`` are appended, in that
    order, with the next two free indices.

    A single space is part of the vocabulary whenever at least two non-empty
    texts are present, even if no text contains a space itself (the texts
    are treated as if they were joined with ``" "``).

    Args:
        dataset: Iterable of mapping-like rows that support ``.get``.
            It is consumed in a single pass.
        text_col: Key under which each row stores its text.
        save_path: Destination of the JSON file (written as UTF-8, with
            non-ASCII characters kept verbatim rather than escaped).

    Returns:
        The ``dict`` mapping each character / special token to its index.

    Raises:
        TypeError: If a row holds a truthy, non-string value in *text_col*.
    """
    chars = set()
    text_count = 0
    for row in dataset:
        text = row.get(text_col)
        if not text:
            continue
        if not isinstance(text, str):
            raise TypeError(
                f"expected str in column {text_col!r}, "
                f"got {type(text).__name__}"
            )
        # Feed characters straight into the set instead of concatenating the
        # whole corpus into one string first; memory stays proportional to
        # the alphabet size plus one row.
        chars.update(text)
        text_count += 1

    # Texts are conceptually joined by a single space, so the separator only
    # exists once there are at least two texts to separate.
    if text_count > 1:
        chars.add(" ")

    vocab = {char: index for index, char in enumerate(sorted(chars))}
    # Special tokens go last so the character indices stay contiguous from 0.
    vocab["[PAD]"] = len(vocab)
    vocab["[UNK]"] = len(vocab)

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(vocab, f, ensure_ascii=False, indent=2)
    return vocab