Add smiles normalization (#11)
- Added SMILES normalization (d823faeb77bf4d6ea7f0795d5cebd5ff2fb32344)
Co-authored-by: Victor Yukio Shirasuna <[email protected]>
smi-ted/inference/smi_ted_large/load.py (CHANGED)
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
 import numpy as np
 import pandas as pd
 
+# Chemistry
+from rdkit import Chem
+from rdkit.Chem import PandasTools
+from rdkit.Chem import Descriptors
+PandasTools.RenderImagesInAllDataFrames(True)
+
 # Standard library
 from functools import partial
 import regex as re
@@ -29,6 +35,17 @@ from tqdm import tqdm
 tqdm.pandas()
 
 
+# function to canonicalize SMILES
+def normalize_smiles(smi, canonical=True, isomeric=False):
+    try:
+        normalized = Chem.MolToSmiles(
+            Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric
+        )
+    except:
+        normalized = None
+    return normalized
+
+
 class MolTranBertTokenizer(BertTokenizer):
     def __init__(self, vocab_file: str = '',
                  do_lower_case=False,
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
         if self.is_cuda_available:
             self.encoder.cuda()
             self.decoder.cuda()
+
+        # handle single str or a list of str
+        smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
+        smiles = smiles.apply(normalize_smiles)
 
         # tokenizer
-        idx, mask = self.tokenize(smiles)
+        idx, mask = self.tokenize(smiles.to_list())
 
         ###########
         # Encoder #
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
 
         # handle single str or a list of str
         smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
+        smiles = smiles.apply(normalize_smiles)
         n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
 
         # process in batches
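Reviewer note: the new normalize_smiles helper maps every spelling of a molecule to one canonical string (dropping stereochemistry by default, since isomeric=False), so the tokenizer sees a single representation per structure. A minimal, self-contained sketch of the behavior; the test molecules are illustrative and not part of the commit:

    from rdkit import Chem

    # copy of the helper introduced in this commit
    def normalize_smiles(smi, canonical=True, isomeric=False):
        try:
            normalized = Chem.MolToSmiles(
                Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric
            )
        except:
            normalized = None
        return normalized

    # two spellings of phenol collapse to the same canonical SMILES
    assert normalize_smiles("C1=CC=CC=C1O") == normalize_smiles("Oc1ccccc1")

    # isomeric=False strips stereo markers, e.g. 'CC(N)C(=O)O' for L-alanine
    print(normalize_smiles("C[C@H](N)C(=O)O"))

    # unparseable input is swallowed by the bare except and becomes None
    print(normalize_smiles("not_a_smiles"))  # None (RDKit also logs a parse error)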
smi-ted/inference/smi_ted_light/load.py (CHANGED)
@@ -19,6 +19,12 @@ from transformers import BertTokenizer
 import numpy as np
 import pandas as pd
 
+# Chemistry
+from rdkit import Chem
+from rdkit.Chem import PandasTools
+from rdkit.Chem import Descriptors
+PandasTools.RenderImagesInAllDataFrames(True)
+
 # Standard library
 from functools import partial
 import regex as re
@@ -29,6 +35,17 @@ from tqdm import tqdm
 tqdm.pandas()
 
 
+# function to canonicalize SMILES
+def normalize_smiles(smi, canonical=True, isomeric=False):
+    try:
+        normalized = Chem.MolToSmiles(
+            Chem.MolFromSmiles(smi), canonical=canonical, isomericSmiles=isomeric
+        )
+    except:
+        normalized = None
+    return normalized
+
+
 class MolTranBertTokenizer(BertTokenizer):
     def __init__(self, vocab_file: str = '',
                  do_lower_case=False,
@@ -476,9 +493,13 @@ class Smi_ted(nn.Module):
         if self.is_cuda_available:
             self.encoder.cuda()
             self.decoder.cuda()
+
+        # handle single str or a list of str
+        smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
+        smiles = smiles.apply(normalize_smiles)
 
         # tokenizer
-        idx, mask = self.tokenize(smiles)
+        idx, mask = self.tokenize(smiles.to_list())
 
         ###########
         # Encoder #
@@ -547,6 +568,7 @@ class Smi_ted(nn.Module):
 
         # handle single str or a list of str
         smiles = pd.Series(smiles) if isinstance(smiles, str) else pd.Series(list(smiles))
+        smiles = smiles.apply(normalize_smiles)
         n_split = smiles.shape[0] // batch_size if smiles.shape[0] >= batch_size else smiles.shape[0]
 
         # process in batches
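Reviewer note: normalize_smiles returns None for any input RDKit cannot parse, and both call sites pass the normalized Series straight on to self.tokenize, so a None can now reach the tokenizer. A caller-side guard is one way to surface bad inputs early; the sketch below is an illustration under that assumption, not part of the commit (normalize_smiles is the helper defined in load.py above):

    import pandas as pd

    inputs = pd.Series(["CCO", "Oc1ccccc1", "not_a_smiles"])
    normalized = inputs.apply(normalize_smiles)

    # drop (and report) entries RDKit could not parse before tokenization
    bad = normalized.isna()
    if bad.any():
        print(f"dropping {int(bad.sum())} unparseable SMILES: {inputs[bad].to_list()}")
    normalized = normalized[~bad]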