File size: 443 Bytes
b109215
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# vocab.py
import json

def build_vocab(dataset, text_col="text", save_path="vocab.json"):
    """Build a character-level vocabulary from a dataset and optionally save it.

    Every item of ``dataset`` must support ``.get(text_col)`` and
    ``[text_col]`` (for example a ``dict``). Items whose text is missing or
    falsy (``None``, ``""``) are skipped.

    The remaining texts are joined with a single space before the character
    set is taken, so the space character is part of the vocabulary whenever
    two or more texts contribute, even if no individual text contains one.

    Characters are numbered in sorted (code point) order starting at 0, and
    the special tokens ``"[PAD]"`` and ``"[UNK]"`` receive the next two ids.

    Args:
        dataset: Iterable of mapping-like rows.
        text_col: Key under which each row stores its text.
        save_path: Destination for the JSON dump of the vocabulary. Pass
            ``None`` to skip writing and only return the mapping.

    Returns:
        A ``dict`` mapping each character / special token to its integer id.
    """
    texts = [row[text_col] for row in dataset if row.get(text_col)]
    corpus = " ".join(texts)

    vocab = {char: index for index, char in enumerate(sorted(set(corpus)))}

    # Special tokens are multi-character keys, so they can never collide with
    # the single-character entries above; they always take the two last ids.
    vocab["[PAD]"] = len(vocab)
    vocab["[UNK]"] = len(vocab)

    if save_path is not None:
        # ensure_ascii=False keeps non-ASCII characters readable in the file,
        # which is why the file is opened explicitly as UTF-8.
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(vocab, f, ensure_ascii=False, indent=2)
    return vocab