| import json | |
| import kenlm | |
| from tqdm import tqdm | |
# Load the KenLM model once at module level; get_perplexity() reads this
# global for every line it scores.
model = kenlm.Model("../es.arpa.bin")
def get_perplexity(doc):
    """Return the perplexity of ``doc`` under the module-level ``model``.

    The document is split on ``"\\n"`` and each resulting line is scored
    separately with ``model.score``. The per-line scores are summed and
    normalised by the total length, where every line contributes its
    whitespace-separated word count plus one. The summed score is treated
    as a base-10 logarithm, so the result is ``10 ** (-score / length)``.
    """
    total_log_score = 0
    total_length = 0
    for segment in doc.split("\n"):
        total_log_score += model.score(segment)
        # The extra 1 per line presumably accounts for the end-of-sentence
        # token added by the scorer -- confirm against model.score defaults.
        total_length += len(segment.split()) + 1
    return 10.0 ** (-total_log_score / total_length)
# Write one "word_count,perplexity" row per JSONL document.
#
# Both files are opened explicitly as UTF-8 so that decoding the documents
# does not depend on the platform's default locale encoding. The output
# handle is not called ``csv`` to avoid shadowing the standard-library
# module of that name.
with open("mc4-es-train-50M-stats.csv", "w", encoding="utf-8") as stats_file, \
        open("mc4-es-train-50M-steps.jsonl", "r", encoding="utf-8") as data:
    for line in tqdm(data):
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue
        text = json.loads(line)["text"]
        stats_file.write(f"{len(text.split())},{get_perplexity(text)}\n")