# vocab.py
import json


def build_vocab(dataset, text_col="text", save_path="vocab.json"):
    """Build a character-level vocabulary from a dataset and save it as JSON.

    Every distinct character found in the ``text_col`` field of the rows is
    assigned an integer id in sorted (code point) order, starting at 0. The
    special tokens ``"[PAD]"`` and ``"[UNK]"`` are then appended, in that
    order, with the next two ids.

    Args:
        dataset: Iterable of mapping-like rows supporting ``.get``. Rows whose
            ``text_col`` value is missing or falsy (``None``, ``""``) are
            skipped. The iterable is consumed exactly once, so a generator
            works.
        text_col: Key under which each row stores its text.
        save_path: Destination of the JSON file (UTF-8, non-ASCII characters
            written verbatim). Pass ``None`` to skip writing a file.

    Returns:
        dict mapping each character (and the two special tokens) to its id.

    Raises:
        TypeError: If a row holds a truthy, non-string value under
            ``text_col``.
    """
    chars = set()
    text_count = 0
    for row in dataset:
        text = row.get(text_col)
        if not text:
            continue
        if not isinstance(text, str):
            raise TypeError(
                f"expected str under {text_col!r}, got {type(text).__name__}"
            )
        # Update the set row by row instead of concatenating the whole
        # corpus into one huge string first.
        chars.update(text)
        text_count += 1

    # Texts are treated as if joined by single spaces, so the separator is
    # part of the vocabulary as soon as there are two or more texts.
    if text_count > 1:
        chars.add(" ")

    vocab = {char: idx for idx, char in enumerate(sorted(chars))}
    # Special tokens are multi-character, so they cannot collide with the
    # single-character entries above.
    vocab["[PAD]"] = len(vocab)
    vocab["[UNK]"] = len(vocab)

    if save_path is not None:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(vocab, f, ensure_ascii=False, indent=2)
    return vocab