Christopher Akiki commited on
Commit
6e44597
·
1 Parent(s): 57855ac

Add tokenizer script

Browse files
Files changed (1) hide show
  1. src/train_tokenizer.py +26 -0
src/train_tokenizer.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
3
+
4
+
5
+
6
+ # load dataset
7
+ dataset = load_dataset("german-nlp-group/german_common_crawl")
8
+
9
+ # Instantiate tokenizer
10
+ tokenizer = ByteLevelBPETokenizer()
11
+
12
+ def batch_iterator(batch_size=1000):
13
+ for i in range(0, len(dataset), batch_size):
14
+ yield dataset[i: i + batch_size]["text"]
15
+
16
+ # Customized training
17
+ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=2, special_tokens=[
18
+ "<s>",
19
+ "<pad>",
20
+ "</s>",
21
+ "<unk>",
22
+ "<mask>",
23
+ ])
24
+
25
+ # Save files to disk
26
+ tokenizer.save(f"./tokenizer.json")