```bash
export LC_ALL=C.UTF-8
export LANG=C.UTF-8

export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
export SPECIAL_TOKENS='<s>','<pad>','</s>','<unk>','<mask>','<|endoftext|>','<|startoftext|>','<sep>','<cls>','<nl>','<tab>','<zwnj>','[U1]','[U2]','[U3]','[U4]','[U5]','[U6]','[U7]','[U8]','[U9]','[U10]','[U11]','[U12]','[U13]','[U14]','[U15]','[U16]','[U17]','[U18]','[U19]','[U20]'

python src/train_tokenizer.py \
    --output_dir="$OUTPUT_DIR" \
    --dataset_name="$DATASET_NAME" \
    --dataset_config_name="$DATASET_CONFIG_NAME" \
    --vocab_size=$VOCAB_SIZE \
    --min_frequency=$MIN_FREQUENCY \
    --special_tokens="$SPECIAL_TOKENS"
```
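
For context, here is a minimal sketch of what a script like `src/train_tokenizer.py` could look like. This is an assumption, not the repo's actual implementation: it presumes the Hugging Face `datasets` and `tokenizers` libraries, and the `batch_iterator` helper is hypothetical. Note that after shell quoting, `$SPECIAL_TOKENS` arrives as a single comma-separated string, so the script splits it back into a list.

```python
# Hypothetical sketch of src/train_tokenizer.py -- assumes the Hugging Face
# `datasets` and `tokenizers` libraries; the repo's real script may differ.
import argparse

from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--output_dir", required=True)
parser.add_argument("--dataset_name", required=True)
parser.add_argument("--dataset_config_name", required=True)
parser.add_argument("--vocab_size", type=int, default=50000)
parser.add_argument("--min_frequency", type=int, default=2)
parser.add_argument("--special_tokens", default="")
args = parser.parse_args()

# Load the training split of the configured corpus (here: OSCAR Persian).
dataset = load_dataset(args.dataset_name, args.dataset_config_name, split="train")

def batch_iterator(batch_size=1000):
    # Yield batches of raw text so training never materializes the whole corpus.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

# Train a byte-level BPE vocabulary, reserving the requested special tokens.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=args.vocab_size,
    min_frequency=args.min_frequency,
    special_tokens=args.special_tokens.split(","),
)
tokenizer.save(f"{args.output_dir}/tokenizer.json")
```

Byte-level BPE is the standard choice for GPT-2-style models because it needs no `<unk>` fallback: any UTF-8 string, including Persian script and the ZWNJ character, decomposes into known byte-level symbols.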