Add normalization steps, fix some bugs, add tfboard tracker
- .gitattributes +1 -0
- README.md +2 -2
- src/data_utils.py +3 -7
- src/requirements.txt +2 -1
.gitattributes
CHANGED
@@ -14,3 +14,4 @@
 *.pb filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
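TensorBoard writes its event files with names like `events.out.tfevents.<timestamp>.<hostname>`, so the new `*tfevents*` pattern routes those logs through Git LFS the same way as the model binaries above it.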
README.md
CHANGED
@@ -33,9 +33,9 @@ python create_config.py --name_or_path gpt2-medium --params '{"vocab_size": 4200
 
 Steps:
 
-- [ ] Remove stretched words such as ســــــــــلام
+- [x] Remove stretched words such as ســــــــــلام
 
-- [ ] Remove links, user-mentioning (such as @jane_doe)
+- [x] Remove links, user-mentioning (such as @jane_doe)
 
 - [ ] Remove Telegram, Instagram advertisements, or posts (a whole record)
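The two newly checked steps live in the repo's normalizer, which this commit does not show; a rough sketch of what they might involve, with `strip_stretching` and `strip_links_and_mentions` as hypothetical names:

```python
# Hedged sketch of the two checked normalization steps; the real logic is in
# the repo's normalizer module (not shown in this diff). Names are hypothetical.
import re

def strip_stretching(text):
    # Drop the tatweel/kashida character (U+0640) used to stretch words,
    # e.g. "ســــــــــلام" -> "سلام".
    return text.replace("\u0640", "")

def strip_links_and_mentions(text):
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)  # links
    text = re.sub(r"@\w+", " ", text)                   # mentions such as @jane_doe
    return re.sub(r"\s+", " ", text).strip()            # collapse leftover spaces
```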
src/data_utils.py
CHANGED
@@ -2,7 +2,6 @@ from hazm import word_tokenize
 from hazm import sent_tokenize
 import re
 import six
-import string
 
 from normalizer import normalize
 
@@ -13,15 +12,15 @@ def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئا
     candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text)).replace(" ", "")
     text = text.replace(" ", "")
 
-    return
+    return (len(candidate_text) / len(text)) > ratio
 
 
 def filter_by_num_tokens(text, gt=64):
-    return
+    return len(word_tokenize(text)) > gt
 
 
 def filter_by_num_sents(text, gt=2):
-    return
+    return len(sent_tokenize(text)) > gt
 
 
 def normalizer(text, do_lowercase=False):
@@ -31,6 +30,3 @@ def normalizer(text, do_lowercase=False):
     text = text.lower()
 
     return text
-
-
-
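With the bare `return` stubs filled in, the three filters are boolean predicates and `normalizer` returns cleaned text. A minimal usage sketch, assuming `src/` is on the import path; `sample_texts` is illustrative, not from the repository:

```python
# Minimal usage sketch: chain the fixed filters, then normalize survivors.
from data_utils import (filter_by_lang_regex, filter_by_num_tokens,
                        filter_by_num_sents, normalizer)

sample_texts = ["..."]  # illustrative raw records

cleaned = [
    normalizer(text)
    for text in sample_texts
    if filter_by_lang_regex(text)    # > 70% of non-space chars in the Persian set
    and filter_by_num_tokens(text)   # more than 64 hazm word tokens
    and filter_by_num_sents(text)    # more than 2 hazm sentences
]
```

`filter_by_lang_regex` strips everything outside the allowed character set, drops spaces on both sides, and passes a record only if the surviving characters make up more than `ratio=0.7` of the original.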
src/requirements.txt
CHANGED
@@ -3,4 +3,5 @@ jax>=0.2.8
 jaxlib>=0.1.59
 flax>=0.3.4
 optax>=0.0.8
-hazm
+hazm
+tensorboard
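The new `tensorboard` dependency backs the "tfboard tracker" from the commit title. A minimal sketch of what that tracking could look like with Flax's TensorBoard wrapper, assuming the usual Flax training-loop setup; the log dir and tag names are illustrative, since the actual training script is not part of this diff:

```python
# Hedged sketch of TensorBoard tracking in a Flax project; the real training
# loop is not shown in this commit, so all names here are illustrative.
from flax.metrics.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="logs")   # event files land under logs/
for step in range(100):
    loss = 1.0 / (step + 1)              # placeholder for a real train loss
    writer.scalar("train_loss", loss, step)
writer.close()
```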