Spaces:
Runtime error
Runtime error
| # vocab.py | |
| import json | |
def build_vocab(dataset, text_col="text", save_path="vocab.json"):
    """Build a character-level vocabulary from ``dataset`` and save it as JSON.

    Args:
        dataset: Iterable of mapping-like rows; each row must support
            ``.get(text_col)`` and ``row[text_col]``.
        text_col: Key under which each row stores its text.
        save_path: Destination file for the JSON-encoded vocabulary
            (opened for writing as UTF-8, replacing any existing content).

    Returns:
        A dict mapping every distinct character (in sorted order) to a
        consecutive integer index starting at 0, followed by the special
        entries ``"[PAD]"`` and ``"[UNK]"`` at the next two indices.
    """
    # Keep only rows whose text field is present and truthy.
    texts = []
    for row in dataset:
        if row.get(text_col):
            texts.append(row[text_col])

    # Rows are joined with a single space, so " " becomes a vocabulary
    # symbol whenever two or more rows contribute text.
    corpus = " ".join(texts)
    symbols = list(set(corpus))
    symbols.sort()

    vocab = dict(zip(symbols, range(len(symbols))))
    # Special tokens take the next free indices, in this order.
    for special in ("[PAD]", "[UNK]"):
        vocab[special] = len(vocab)

    with open(save_path, "w", encoding="utf-8") as out:
        out.write(json.dumps(vocab, ensure_ascii=False, indent=2))
    return vocab