Adding NumPy random number generator
Browse files - mc4/mc4.py +5 -4
mc4/mc4.py
CHANGED
|
@@ -7,6 +7,8 @@ import json
|
|
| 7 |
import datasets
|
| 8 |
import kenlm
|
| 9 |
import numpy as np
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
logger = datasets.logging.get_logger(__name__)
|
|
@@ -309,7 +311,6 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
| 309 |
doc_length += length
|
| 310 |
return 10.0 ** (-doc_log_score / doc_length)
|
| 311 |
|
| 312 |
-
|
| 313 |
def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
|
| 314 |
perplexity = self.get_perplexity(doc)
|
| 315 |
if boundaries is None:
|
|
@@ -323,7 +324,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
| 323 |
elif perplexity >= boundaries[2]:
|
| 324 |
quartile_range = 100 * boundaries[2]
|
| 325 |
probability = factor / quartile_range
|
| 326 |
-
return
|
| 327 |
|
| 328 |
def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
|
| 329 |
perplexity = self.get_perplexity(doc)
|
|
@@ -332,10 +333,10 @@ class Mc4(datasets.GeneratorBasedBuilder):
|
|
| 332 |
else:
|
| 333 |
m = 662247.50212365
|
| 334 |
weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
|
| 335 |
-
return
|
| 336 |
|
| 337 |
def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
|
| 338 |
-
return
|
| 339 |
|
| 340 |
def _info(self):
|
| 341 |
return datasets.DatasetInfo(
|
|
|
|
| 7 |
import datasets
|
| 8 |
import kenlm
|
| 9 |
import numpy as np
|
| 10 |
+
from numpy.random import default_rng
|
| 11 |
+
rng = default_rng()
|
| 12 |
|
| 13 |
|
| 14 |
logger = datasets.logging.get_logger(__name__)
|
|
|
|
| 311 |
doc_length += length
|
| 312 |
return 10.0 ** (-doc_log_score / doc_length)
|
| 313 |
|
|
|
|
| 314 |
def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
|
| 315 |
perplexity = self.get_perplexity(doc)
|
| 316 |
if boundaries is None:
|
|
|
|
| 324 |
elif perplexity >= boundaries[2]:
|
| 325 |
quartile_range = 100 * boundaries[2]
|
| 326 |
probability = factor / quartile_range
|
| 327 |
+
return rng.uniform() < probability
|
| 328 |
|
| 329 |
def _should_keep_doc_gaussian(self, doc, factor=0.4, boundaries=None):
|
| 330 |
perplexity = self.get_perplexity(doc)
|
|
|
|
| 333 |
else:
|
| 334 |
m = 662247.50212365
|
| 335 |
weighted_perplexity = factor * np.exp(-9/2*((perplexity-m)/m)**2)
|
| 336 |
+
return rng.uniform() < weighted_perplexity
|
| 337 |
|
| 338 |
def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
|
| 339 |
+
return rng.uniform() <= 0.5
|
| 340 |
|
| 341 |
def _info(self):
|
| 342 |
return datasets.DatasetInfo(
|