pushing tokenizer

- config.json +38 -0
- merges.txt +0 -0
- src/__pycache__/data_utils.cpython-38.pyc +0 -0
- src/__pycache__/dictionary.cpython-38.pyc +0 -0
- src/__pycache__/normalizer.cpython-38.pyc +0 -0
- src/data_utils.py +9 -5
- src/normalizer.py +1 -0
- src/regexes/__pycache__/__init__.cpython-38.pyc +0 -0
- src/regexes/__pycache__/currency.cpython-38.pyc +0 -0
- src/regexes/__pycache__/email.cpython-38.pyc +0 -0
- src/regexes/__pycache__/latin.cpython-38.pyc +0 -0
- src/regexes/__pycache__/number.cpython-38.pyc +0 -0
- src/regexes/__pycache__/persian.cpython-38.pyc +0 -0
- src/regexes/__pycache__/phone.cpython-38.pyc +0 -0
- src/regexes/__pycache__/punk.cpython-38.pyc +0 -0
- src/regexes/__pycache__/quote.cpython-38.pyc +0 -0
- src/regexes/__pycache__/url.cpython-38.pyc +0 -0
- src/run.sh +9 -9
- src/run_clm_flax.py +1 -1
- src/run_config.sh +3 -3
- src/run_tokenizer.sh +1 -1
- tokenizer.json +0 -0
- vocab.json +0 -0
config.json
ADDED
@@ -0,0 +1,38 @@
+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 5,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 5,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 24,
+  "n_positions": 1024,
+  "n_special": 0,
+  "predict_special_tokens": true,
+  "resid_pdrop": 0.1,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "transformers_version": "4.9.0.dev0",
+  "use_cache": true,
+  "vocab_size": 50000
+}
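These are GPT-2 medium hyperparameters (24 layers, 16 heads, 1024-dim embeddings) paired with a 50k-token Persian vocabulary and token id 5 for both BOS and EOS. A back-of-the-envelope parameter count from the values above; the per-block formula is the standard GPT-2 accounting, not something taken from this repo:

# Rough GPT-2 parameter count from the config values above.
vocab_size, n_positions, n_embd, n_layer = 50000, 1024, 1024, 24

embeddings = (vocab_size + n_positions) * n_embd
per_block = 12 * n_embd**2 + 13 * n_embd   # attention + MLP + layer norms, approx.
total = embeddings + n_layer * per_block + 2 * n_embd  # + final layer norm
print(f"~{total / 1e6:.0f}M parameters")   # ~355M

That matches the ~355M usually quoted for gpt2-medium; the larger vocabulary mostly shifts weight into the embedding matrix.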
merges.txt
ADDED
The diff for this file is too large to render.
src/__pycache__/data_utils.cpython-38.pyc
ADDED
Binary file (1.42 kB).

src/__pycache__/dictionary.cpython-38.pyc
ADDED
Binary file (2.06 kB).

src/__pycache__/normalizer.cpython-38.pyc
ADDED
Binary file (3.46 kB).
src/data_utils.py
CHANGED

@@ -32,10 +32,14 @@ def filter_by_adv(text, ratio=50):
     return length_add < ratio
 
 
-def normalizer(text, do_lowercase=False):
-    text = normalize(text)
-
-    if do_lowercase:
-        text = text.lower()
-
-    return text
+# def normalizer(text, do_lowercase=False):
+#     text = normalize(text)
+
+#     if do_lowercase:
+#         text = text.lower()
+
+#     return text
+def normalizer(example):
+    example["text"] = normalize(example["text"])
+    return example
+
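The rewritten normalizer takes and returns a whole example dict, which is the shape datasets' map() expects, so normalization can run over the OSCAR corpus without a wrapper. A minimal usage sketch, assuming it is run from src/ so the imports resolve:

from datasets import load_dataset

from data_utils import normalizer

# map() passes each example dict through normalizer, which rewrites
# example["text"] in place via normalize().
raw = load_dataset("oscar", "unshuffled_deduplicated_fa", split="train")
clean = raw.map(normalizer)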
src/normalizer.py
CHANGED
@@ -127,6 +127,7 @@ def normalize(text, zwnj="\u200c", tokenized=False):
     return " ".join(tokens)
 
 
+
 if __name__ == '__main__':
     import textwrap
 
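Only a blank line changes here, but the hunk header shows normalize()'s signature. A quick smoke test with an illustrative input (run from src/; the actual cleanup rules live in src/regexes/ and are not shown in this diff):

from normalizer import normalize

# Illustrative Persian input with irregular spacing.
print(normalize("این  متن   تست است"))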
src/regexes/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (151 Bytes).

src/regexes/__pycache__/currency.cpython-38.pyc
ADDED
Binary file (674 Bytes).

src/regexes/__pycache__/email.cpython-38.pyc
ADDED
Binary file (465 Bytes).

src/regexes/__pycache__/latin.cpython-38.pyc
ADDED
Binary file (365 Bytes).

src/regexes/__pycache__/number.cpython-38.pyc
ADDED
Binary file (331 Bytes).

src/regexes/__pycache__/persian.cpython-38.pyc
ADDED
Binary file (532 Bytes).

src/regexes/__pycache__/phone.cpython-38.pyc
ADDED
Binary file (361 Bytes).

src/regexes/__pycache__/punk.cpython-38.pyc
ADDED
Binary file (292 Bytes).

src/regexes/__pycache__/quote.cpython-38.pyc
ADDED
Binary file (572 Bytes).

src/regexes/__pycache__/url.cpython-38.pyc
ADDED
Binary file (760 Bytes).
src/run.sh
CHANGED
@@ -4,23 +4,23 @@ export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
 #export MODEL_NAME_OR_PATH=t5-base
-export OUTPUT_DIR=/home/
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export MODEL_TYPE=gpt2
-export CONFIG_NAME=/home/
-export TOKENIZER_NAME=/home/
+export CONFIG_NAME=/home/saied/code/gpt2-medium-persian
+export TOKENIZER_NAME=/home/saied/code/gpt2-medium-persian
 
-#export TRAIN_FILE=/home/
-#export VALIDATION_FILE=/home/
-#export TEST_FILE=/home/
+#export TRAIN_FILE=/home/saied/code/data/...csv
+#export VALIDATION_FILE=/home/saied/code/data/...csv
+#export TEST_FILE=/home/saied/code/data/...csv
 export DATASET_NAME=oscar
 export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
-export MAX_SEQUENCE_LENGTH=
+export MAX_SEQUENCE_LENGTH=512
 
 #export MAX_TRAIN_SAMPLE=5000
 #export MAX_EVAL_SAMPLES=5000
 
-export PER_DEVICE_TRAIN_BATCH_SIZE=
-export PER_DEVICE_EVAL_BATCH_SIZE=
+export PER_DEVICE_TRAIN_BATCH_SIZE=16
+export PER_DEVICE_EVAL_BATCH_SIZE=16
 export NUM_TRAIN_EPOCHS=10.0
 export LEARNING_RATE=1e-3
 export WARMUP_STEPS=5000
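LEARNING_RATE and WARMUP_STEPS feed the optimizer schedule inside run_clm_flax.py. The stock Flax causal-LM example builds it with optax roughly as below; this is a sketch of that standard helper, on the assumption that this fork keeps it:

import optax

def create_learning_rate_fn(num_train_steps, num_warmup_steps=5000, learning_rate=1e-3):
    # Linear warmup from 0 to the peak LR, then linear decay back to 0,
    # as in the stock run_clm_flax.py example.
    warmup_fn = optax.linear_schedule(
        init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps
    )
    decay_fn = optax.linear_schedule(
        init_value=learning_rate,
        end_value=0.0,
        transition_steps=num_train_steps - num_warmup_steps,
    )
    return optax.join_schedules(
        schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps]
    )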
src/run_clm_flax.py
CHANGED
@@ -158,7 +158,7 @@ class DataTrainingArguments:
         default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
     )
     validation_split_percentage: Optional[int] = field(
-        default=5,
+        default=1,
         metadata={
             "help": "The percentage of the train set used as validation set in case there's no validation split"
         },
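OSCAR ships only a train split, so validation_split_percentage controls how much of it is held out; lowering the default to 1 keeps 99% of the corpus for training. Assuming this fork follows the stock example, the two splits are materialized with datasets slicing, roughly:

from datasets import load_dataset

split = 1  # validation_split_percentage
eval_dataset = load_dataset(
    "oscar", "unshuffled_deduplicated_fa", split=f"train[:{split}%]"
)
train_dataset = load_dataset(
    "oscar", "unshuffled_deduplicated_fa", split=f"train[{split}%:]"
)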
src/run_config.sh
CHANGED
@@ -3,11 +3,11 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-export OUTPUT_DIR=./
-
+# export OUTPUT_DIR=./
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export NAME_OR_PATH=gpt2-medium
 
 python src/create_config.py \
     --output_dir="$OUTPUT_DIR" \
     --name_or_path="$NAME_OR_PATH" \
-    --params='{"vocab_size": 50000}'
+    --params='{"vocab_size": 50000,"bos_token_id": 5,"eos_token_id": 5}'
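create_config.py itself is outside this diff. Given the flags used above, a plausible minimal implementation loads the gpt2-medium config and applies the JSON overrides; names and structure here are assumptions, not the repo's actual code:

import argparse
import json

from transformers import AutoConfig

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", required=True)
parser.add_argument("--name_or_path", required=True)
parser.add_argument("--params", default="{}")
args = parser.parse_args()

# Start from the base config, then apply the overrides, e.g.
# vocab_size/bos_token_id/eos_token_id from run_config.sh.
config = AutoConfig.from_pretrained(args.name_or_path, **json.loads(args.params))
config.save_pretrained(args.output_dir)

Those overrides account for the config.json added above: vocab_size 50000 with BOS and EOS both remapped to token id 5.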
src/run_tokenizer.sh
CHANGED
@@ -3,7 +3,7 @@
 export LC_ALL=C.UTF-8
 export LANG=C.UTF-8
 
-export OUTPUT_DIR=/home/
+export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
 export DATASET_NAME=oscar
 export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
 export VOCAB_SIZE=50000
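The tokenizer-training script that run_tokenizer.sh drives is not part of this commit. A hedged sketch of training a byte-level BPE with these settings (the standard GPT-2 tokenizer type, consistent with the vocab.json and merges.txt added below; the special-token list is an illustrative assumption):

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

dataset = load_dataset("oscar", "unshuffled_deduplicated_fa", split="train")

def batch_iterator(batch_size=1000):
    # Stream the corpus in chunks so the whole split never sits in memory.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],  # assumption
)
tokenizer.save("tokenizer.json")  # the tokenizer.json in this commit
tokenizer.save_model(".")         # writes vocab.json + merges.txt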
tokenizer.json
ADDED
The diff for this file is too large to render.
vocab.json
ADDED
The diff for this file is too large to render.