| from hazm import word_tokenize | |
| from hazm import sent_tokenize | |
| import re | |
| import six | |
| from normalizer import normalize | |
# Character-class whitelist for Persian text: ASCII digits, Persian/Arabic digits,
# Persian/Arabic letters, and the zero-width non-joiner (\u200c).
# NOTE(review): filter_by_lang_regex duplicates this string as its default
# rather than referencing this constant — keep the two in sync.
persian_regex = "0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"
def filter_by_lang_regex(text, ratio=0.7, regex="0-9۰۱۲۳۴۵۶۷۸۹ءآئابتثجحخدذرزسشصضطظعغفقلمنهوپچژکگیە\u200c"):
    """Return True if more than `ratio` of the non-space characters of `text`
    fall inside the character class given by `regex` (defaults to the Persian
    alphabet plus ASCII/Persian digits and ZWNJ).

    Accepts `str` or UTF-8 `bytes`. Returns False for empty / all-space input
    (previously raised ZeroDivisionError).
    """
    # Decode once up front; replaces the py2-compat six.ensure_str and also
    # fixes the crash the original had on bytes input at text.replace(" ", "").
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # Collapse every run of out-of-class characters to a space, then drop all
    # spaces so only in-class characters remain.
    candidate_text = re.sub(r"[^" + regex + "]+", " ", text).replace(" ", "")
    text = text.replace(" ", "")
    if not text:  # empty or whitespace-only input: nothing to measure
        return False
    return (len(candidate_text) / len(text)) > ratio
def filter_by_num_tokens(text, gt=64):
    """Keep only texts that contain more than `gt` word tokens.

    Token count comes from hazm's word_tokenize.
    """
    tokens = word_tokenize(text)
    return len(tokens) > gt
def filter_by_num_sents(text, gt=2):
    """Keep only texts that contain more than `gt` sentences.

    Sentence count comes from hazm's sent_tokenize.
    """
    sentences = sent_tokenize(text)
    return len(sentences) > gt
def filter_by_adv(text, ratio=50):
    """Heuristic "advertisement" filter: reject texts overloaded with
    punctuation-separated fragments.

    Counts comma-split segments, Persian-comma (،) segments, and word:word
    colon pairs; returns True (keep) when their sum stays below `ratio`.
    """
    comma_segments = len(text.split(","))
    virgool_segments = len(text.split("،"))
    colon_pairs = len(re.findall(r"""(?:([^\W]+):([^\W]+))""", text))
    return (comma_segments + colon_pairs + virgool_segments) < ratio
def normalizer(text, do_lowercase=False):
    """Normalize `text` with the project's normalize(); optionally lowercase
    the result when `do_lowercase` is True.
    """
    normalized = normalize(text)
    if do_lowercase:
        return normalized.lower()
    return normalized