from collections import OrderedDict
import sympy as sp
from transformers import PreTrainedTokenizer
import json
import os
from huggingface_hub import upload_folder
# Words that LyapunovTokenizer places at the very start of its vocabulary:
# five base entries followed by ten extra reserved slots.
# NOTE(review): every entry except "(" and ")" is the empty string, and the
# f-string below has no placeholder -- it looks like the token names may have
# been lost at some point. Confirm the intended values before relying on them.
SPECIAL_WORDS = ["", "", "", "(", ")"]
SPECIAL_WORDS.extend(f"" for _ in range(10))
class LyapunovTokenizer(PreTrainedTokenizer):
    """Whitespace tokenizer over a fixed, hard-coded symbolic-math vocabulary.

    The vocabulary is assembled in ``__init__`` from, in order: the
    module-level ``SPECIAL_WORDS``, the constants, the variable names
    ``x0`` .. ``x11``, the operator names, the number-encoding symbols, the
    integers ``0`` .. ``999`` as strings, and the mask symbol.

    Text is tokenized by splitting on whitespace; a token that is not in the
    vocabulary maps to the unknown-token id.
    """

    def __init__(self):
        """Build the fixed vocabulary, then initialise the base tokenizer."""
        # SymPy node type -> operator token.
        # NOTE(review): "delta0" has no entry in the operator table below, so
        # this class never adds it to the vocabulary -- confirm that is intended.
        self.SYMPY_OPERATORS = {
            sp.Add: "+",
            sp.Mul: "*",
            sp.Pow: "^",
            sp.exp: "exp",
            sp.log: "ln",
            sp.Abs: "Abs",
            sp.sin: "sin",
            sp.cos: "cos",
            sp.tan: "tan",
            sp.asin: "asin",
            sp.acos: "acos",
            sp.atan: "atan",
            sp.DiracDelta: "delta0",
        }

        # Operator names grouped by family.
        self.trig_ops = ["sin", "cos", "tan"]
        self.arctrig_ops = ["asin", "acos", "atan"]
        self.exp_ops = ["exp", "ln"]
        self.other_ops = ["sqrt"]

        # Operator token -> integer; presumably the operand count
        # (2 for the binary operators, 1 for the unary functions).
        op_set = {
            "+": 2,
            "-": 2,
            "*": 2,
            "/": 2,
            "^": 2,
            "sqrt": 1,
            "exp": 1,
            "ln": 1,
            "sin": 1,
            "cos": 1,
            "tan": 1,
            "asin": 1,
            "acos": 1,
            "atan": 1,
            "Abs": 1,
        }

        self.int_base = 1000
        self.max_degree = 6

        # Both attributes refer to the same dict object as ``op_set``.
        self.operators_lyap = op_set
        self.operators = self.operators_lyap

        # Variable name -> SymPy symbol, for x0 .. x(2 * max_degree - 1).
        self.variables = OrderedDict(
            {f"x{i}": sp.Symbol(f"x{i}") for i in range(2 * self.max_degree)}
        )
        self.constants = ["pi", "E"]
        self.symbols = ["I", "INT+", "INT-", "FLOAT+", "FLOAT-", ".", "10^"]
        # One token per integer in [0, max(10, int_base)).
        self.elements = [str(i) for i in range(max(10, self.int_base))]
        self.mask_symbol = [""]

        # The concatenation order fixes every token id; do not reorder.
        self.words = (
            SPECIAL_WORDS
            + self.constants
            + list(self.variables.keys())
            + list(self.operators.keys())
            + self.symbols
            + self.elements
            + self.mask_symbol
        )

        # If a word occurs more than once in ``self.words`` the dict keeps only
        # its last index, so ids may be non-contiguous and ``vocab_size`` may be
        # smaller than the largest id + 1.
        self.vocab = {s: i for i, s in enumerate(self.words)}
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

        # The vocabulary is built before the base-class constructor runs.
        # NOTE(review): presumably because the base constructor may query it
        # (e.g. through get_vocab) -- confirm against the transformers version in use.
        super().__init__(
            model_max_length=2048, bos_token="", eos_token="", unk_token="", mask_token=""
        )

    def _tokenize(self, text):
        """Split ``text`` on runs of whitespace and return the pieces."""
        return text.split()

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, or the unknown-token id if it is not in the vocabulary."""
        token_id = self.vocab.get(token)
        if token_id is None:
            # Resolve the unknown id only on a miss: it is a base-class lookup
            # that is wasted work (and can re-enter this method) when the token
            # is already known.
            return self.unk_token_id
        return token_id

    def _convert_id_to_token(self, index):
        """Return the token stored for ``index``, or the unknown token if there is none."""
        return self.inv_vocab.get(index, self.unk_token)

    def get_vocab(self):
        """Return the token -> id mapping (the internal dict itself, not a copy)."""
        return self.vocab

    @property
    def vocab_size(self):
        """Number of distinct tokens in the vocabulary."""
        return len(self.vocab)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary as JSON into ``save_directory``.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; when given, the file is named
                ``"<prefix>-vocab.json"`` instead of ``"vocab.json"``.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        vocab_file = os.path.join(save_directory, prefix + "vocab.json")
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f)
        return (vocab_file,)