Fixes and defaults
- mc4/mc4.py +18 -13
- run_mlm_flax_stream.py +8 -8
mc4/mc4.py
CHANGED

@@ -8,7 +8,6 @@ import datasets
 import kenlm
 import numpy as np
 from numpy.random import default_rng
-rng = default_rng()


 logger = datasets.logging.get_logger(__name__)

@@ -284,11 +283,16 @@ class Mc4(datasets.GeneratorBasedBuilder):
     BUILDER_CONFIG_CLASS = Mc4Config

     def __init__(self, *args, writer_batch_size=None, **kwargs):
-        self.sampling_method = kwargs.pop("sampling_method")
+        self.sampling_method = kwargs.pop("sampling_method", None)
         if self.sampling_method:
-            self.perplexity_model = kwargs.pop("perplexity_model")
-            self.sampling_factor = kwargs.pop("sampling_factor")
-            self.boundaries = kwargs.pop("boundaries")
+            seed = kwargs.pop("seed", None)
+            if seed is not None:
+                self.rng = default_rng(seed)
+            else:
+                self.rng = default_rng()
+            self.perplexity_model = kwargs.pop("perplexity_model", None)
+            self.sampling_factor = kwargs.pop("sampling_factor", None)
+            self.boundaries = kwargs.pop("boundaries", None)
             # Loading 5-gram model
             # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
             logger.info("loading model = %s", self.perplexity_model)

@@ -298,7 +302,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
             elif self.sampling_method == "random":
                 self.should_keep_doc = self._should_keep_doc_random
             else:
-                self.should_keep_doc = self.
+                self.should_keep_doc = self._should_keep_doc_step

         super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)

@@ -311,7 +315,7 @@ class Mc4(datasets.GeneratorBasedBuilder):
             doc_length += length
         return 10.0 ** (-doc_log_score / doc_length)

-    def _should_keep_doc_step(self, doc, factor=1, boundaries=None):
+    def _should_keep_doc_step(self, doc, factor=1.5e5, boundaries=None):
        perplexity = self.get_perplexity(doc)
        if boundaries is None:
            boundaries = [536394.99320948, 662247.50212365, 919250.87225178]

@@ -322,21 +326,22 @@ class Mc4(datasets.GeneratorBasedBuilder):
        elif boundaries[1] < perplexity < boundaries[2]:
            quartile_range = boundaries[2] - boundaries[1]
        elif perplexity >= boundaries[2]:
-            quartile_range =
+            quartile_range = 10 * boundaries[2]
        probability = factor / quartile_range
-        return rng.uniform() < probability
+        return self.rng.uniform() < probability

-    def _should_keep_doc_gaussian(self, doc, factor=0.
+    def _should_keep_doc_gaussian(self, doc, factor=0.78, boundaries=None):
        perplexity = self.get_perplexity(doc)
        if boundaries is not None:
            m = boundaries[1]
        else:
            m = 662247.50212365
-
-
+        exponential = np.exp(-9/2 * ((perplexity - m) / m) ** 2)
+        weighted_perplexity = factor * exponential
+        return self.rng.uniform() < weighted_perplexity

    def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
-        return rng.uniform() <= 0.5
+        return self.rng.uniform() <= 0.5

    def _info(self):
        return datasets.DatasetInfo(
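For reference, here is a minimal, self-contained sketch of the sampling behaviour these functions implement after the change. The helper names and the first two branches of the step function are illustrative assumptions (the hunks above only show the tail of that if/elif chain); the boundaries, default factors, and formulas mirror the diff.

import numpy as np
from numpy.random import default_rng

# Default quartile boundaries from the diff above (perplexity values).
BOUNDARIES = [536394.99320948, 662247.50212365, 919250.87225178]

def should_keep_step(perplexity, rng, factor=1.5e5, boundaries=BOUNDARIES):
    # Each perplexity bucket is kept with probability factor / quartile_range,
    # so the very wide top bucket (perplexity >= boundaries[2]) is sampled
    # far less often than the middle quartiles.
    if perplexity <= boundaries[0]:  # assumed branch, not shown in the hunk
        quartile_range = boundaries[0]
    elif boundaries[0] < perplexity < boundaries[1]:  # assumed branch
        quartile_range = boundaries[1] - boundaries[0]
    elif boundaries[1] < perplexity < boundaries[2]:
        quartile_range = boundaries[2] - boundaries[1]
    else:  # perplexity >= boundaries[2]
        quartile_range = 10 * boundaries[2]
    probability = factor / quartile_range
    return rng.uniform() < probability

def should_keep_gaussian(perplexity, rng, factor=0.78, boundaries=BOUNDARIES):
    # Keep probability peaks at `factor` when the document's perplexity equals
    # the median boundary m and decays like a Gaussian away from it.
    m = boundaries[1]
    exponential = np.exp(-9 / 2 * ((perplexity - m) / m) ** 2)
    return rng.uniform() < factor * exponential

# Seeding the generator, as the builder now does via the `seed` kwarg,
# makes the stream of keep/drop decisions reproducible.
rng = default_rng(42)
print(should_keep_step(4.0e5, rng), should_keep_gaussian(6.6e5, rng))

With the module-level `rng = default_rng()` removed, two runs that pass the same `seed` now make identical keep/drop decisions, which the old unseeded generator did not guarantee.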
run_mlm_flax_stream.py
CHANGED

@@ -256,28 +256,27 @@ class FlaxDataCollatorForLanguageModeling:
         return inputs, labels


-
 @dataclass
 class SamplingArguments:
     """
     Arguments pertaining to how to perform sampling of the dataset.
     """

-    perplexity_model: Optional[str]
-        default="es.arpa.bin", metadata={"help": "
+    perplexity_model: Optional[str] = field(
+        default="./es.arpa.bin", metadata={"help": "Path to KenLM model to use to get perplexity values."}
     )
-    sampling_method: Optional[str]
-        default=None, metadata={"help": "Sample using a 'step' or 'gaussian' perplexity function per document."}
+    sampling_method: Optional[str] = field(
+        default=None, metadata={"help": "Sample using a 'step' or 'gaussian' perplexity function per document, or 'random'."}
     )
-    sampling_factor: Optional[
-        default=
+    sampling_factor: Optional[float] = field(
+        default=None, metadata={"help": "Sampling factor. Integers for step function, decimals for gaussian."}
     )
     boundaries: Optional[str] = field(
         default="536394.99320948,662247.50212365,919250.87225178", metadata={"help": "Quartile boundaries"}
     )

     def __post_init__(self):
-        self.boundaries = [float(q) for q in self.boundaries.split(",")]
+        self.boundaries = [float(q.strip()) for q in self.boundaries.split(",")]


 def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:

@@ -397,6 +396,7 @@ if __name__ == "__main__":
         sampling_factor=sampling_args.sampling_factor,
         boundaries=sampling_args.boundaries,
         perplexity_model=sampling_args.perplexity_model,
+        seed=training_args.seed,
     )

     if model_args.config_name:
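The second hunk forwards the new sampling keywords, including the training seed, to the dataset builder. A rough end-to-end sketch of that wiring is below; the parser setup, the local "./mc4" script path, and the "es" configuration are simplified assumptions (the real script also parses model and data arguments), while the keyword names match the diff.

from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset
from transformers import HfArgumentParser, TrainingArguments

@dataclass
class SamplingArguments:
    # Condensed version of the dataclass defined above.
    perplexity_model: Optional[str] = field(default="./es.arpa.bin")
    sampling_method: Optional[str] = field(default=None)
    sampling_factor: Optional[float] = field(default=None)
    boundaries: Optional[str] = field(
        default="536394.99320948,662247.50212365,919250.87225178"
    )

    def __post_init__(self):
        self.boundaries = [float(q.strip()) for q in self.boundaries.split(",")]

# Run with CLI flags, e.g.:
#   python train.py --output_dir ./out --sampling_method gaussian --sampling_factor 0.78
parser = HfArgumentParser((SamplingArguments, TrainingArguments))
sampling_args, training_args = parser.parse_args_into_dataclasses()

# Extra keyword arguments reach Mc4.__init__, which pops them before calling
# super(); `seed` now controls the sampling RNG inside the builder.
dataset = load_dataset(
    "./mc4",                      # assumed local path to the dataset script
    "es",                         # assumed language configuration
    split="train",
    streaming=True,
    sampling_method=sampling_args.sampling_method,
    sampling_factor=sampling_args.sampling_factor,
    boundaries=sampling_args.boundaries,
    perplexity_model=sampling_args.perplexity_model,
    seed=training_args.seed,
)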