Upload 3 files
- pubchem_experiment/data_preprocess.py +197 -0
- pubchem_experiment/make_predictions.py +172 -0
- pubchem_experiment/metrics.py +163 -0
pubchem_experiment/data_preprocess.py
ADDED
@@ -0,0 +1,197 @@
import json
import sys

import pandas as pd
import tqdm
import swifter  # noqa: F401 -- registers the .swifter accessor on pandas objects
from rdkit import Chem

# Disable RDKit informational and warning messages
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

PUBCHEM_DIR = ''      # todo: pubchem_path + 'pubchem24/'
FSMOL_UID_PATH = ''   # todo: fsmol_path + '/fsmol/fsmol_train_accession_keys.json'
PROT_CLASS_PATH = ''  # todo: chembl_path + 'chembl33/uniprot_pclass_mapping.csv'
MHNFS_PATH = ''       # todo: mhnfs_path + '/mhnfs'

sys.path.append(MHNFS_PATH)
from src.data_preprocessing.utils import Standardizer


class PubChemFilter:

    def __init__(self, pubchem_dir, fsmol_uid_path, prot_class_path, mhnfs_path, debug=False):
        self.pubchem_dir = pubchem_dir
        self.fsmol_uid_path = fsmol_uid_path
        self.prot_class_path = prot_class_path
        self.mhnfs_path = mhnfs_path
        self.debug = debug

    def load_and_filter_assays(self):
        """
        Load PubChem assay data from file and filter it:
        1. Drop all assays without protein accession keys
        2. Drop all assays linked to multiple accession keys
        3. Drop all assays with accession keys in the FS-Mol training data

        Returns:
            df_assays (pd.DataFrame)
        """
        print('Load assays...')
        df_assays = pd.read_table(
            f'{self.pubchem_dir}/bioassays.tsv.gz',
            usecols=['AID', 'UniProts IDs']
        ).rename(columns={'UniProts IDs': 'UID'})

        # Load FS-Mol training data accession keys
        with open(self.fsmol_uid_path, 'r') as f:
            fs_train_targets = json.load(f).values()
        fs_train_targets = list(set(key for sublist in fs_train_targets for key in sublist))

        print('Filter assays...')
        df_assays = df_assays.dropna(subset=['UID'])
        df_assays = df_assays[~df_assays['UID'].str.contains(r'\|')]  # multiple accession keys
        df_assays = df_assays[~df_assays['UID'].str.contains('|'.join(fs_train_targets))]
        self.df_assays = df_assays

    def load_and_filter_bioactivities(self, chunk_size=10_000_000):
        """
        Load bioactivity data in chunks and filter out datapoints with
        1. assay not in aids
        2. outcome not 'Active'/'Inactive'
        """
        print('Load bioactivities...')
        aids = self.df_assays.AID.tolist()
        filtered_chunks = []
        for chunk in pd.read_csv(f'{self.pubchem_dir}/bioactivities.tsv.gz', sep='\t',
                                 chunksize=chunk_size, usecols=['AID', 'CID', 'Activity Outcome']):
            filtered_chunk = chunk[chunk['AID'].isin(aids)]
            filtered_chunk = filtered_chunk[filtered_chunk['Activity Outcome'].isin(['Inactive', 'Active'])]
            filtered_chunks.append(filtered_chunk)
            if self.debug:
                break  # For debugging
        df_bio = pd.concat(filtered_chunks)
        df_bio = df_bio[df_bio.CID.notna()]
        df_bio['Activity'] = df_bio['Activity Outcome'].swifter.apply(lambda x: 1 if x == 'Active' else 0)
        self.df_bio = df_bio.drop('Activity Outcome', axis=1).astype(int)

    def merge_assay_and_activity_data(self):
        print('Merge...')
        self.df = self.df_bio.merge(self.df_assays, on='AID', how='left')
        convert_dict = {col: 'int32' if col != 'UID' else 'str' for col in self.df.columns}
        self.df = self.df.astype(convert_dict)
        del self.df_assays, self.df_bio

    def drop_hts_assays(self):
        print('Drop HTS assays...')
        aid_counts = self.df.groupby('AID').size()
        filtered_aids = aid_counts[aid_counts <= 100_000].index
        self.df = self.df[self.df['AID'].isin(filtered_aids)]

    def drop_targets_with_limited_data(self, na_min=50, ni_min=50):
        print('Drop targets with not enough datapoints...')
        unique_uids = self.df['UID'].sort_values().unique()  # sorted unique targets
        activity_counts = self.df.groupby('UID')['Activity'].value_counts().unstack().fillna(0)  # rows: sorted targets, columns: 0 = n_inactives, 1 = n_actives
        mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)  # both counts above their minimum
        self.df = self.df[self.df['UID'].isin(unique_uids[mask])]

    def drop_conflicting_bioactivity_measures(self, target_col='UID', compound_col='CID'):
        """
        Check that each target-compound pair is associated with a unique activity value,
        i.e. every measurement is either active or inactive. If not, drop the pair.
        """
        def process_group(group):
            if group['Activity'].nunique() == 1:
                return group.head(1)
            return None

        print('Drop conflicting datapoints...')
        # Split unique UID-CID pairs from duplicated ones
        df_uniques = self.df.drop_duplicates(subset=[target_col, compound_col], keep=False)
        df_duplicates = self.df[~self.df.index.isin(df_uniques.index)]

        # Keep one row per duplicated pair if all its measurements agree
        groups = df_duplicates.groupby([target_col, compound_col])
        rows = []
        for _, group in tqdm.tqdm(groups):
            rows.append(process_group(group))
        df_rows = pd.concat([row for row in rows if row is not None])
        self.df = pd.concat([df_uniques, df_rows])

    def add_smiles(self, chunk_size=10_000_000):
        print('Retrieve SMILES...')
        cids = self.df.CID.astype(int).unique()
        filtered_chunks = []
        for chunk in pd.read_table(f'{self.pubchem_dir}/smiles.tsv.gz',
                                   chunksize=chunk_size, names=['CID', 'SMILES']):
            filtered_chunk = chunk[chunk['CID'].isin(cids)]
            filtered_chunks.append(filtered_chunk)
            if self.debug:
                break
        df_smiles = pd.concat(filtered_chunks)

        sm = Standardizer(metal_disconnect=True, canon_taut=True)

        def cleanup(smiles):
            mol = Chem.MolFromSmiles(smiles)
            try:
                standardized_mol, _ = sm.standardize_mol(mol)
                return Chem.MolToSmiles(standardized_mol)
            except Exception:
                print(smiles)
                return None

        df_smiles['SMILES'] = df_smiles['SMILES'].swifter.apply(cleanup)
        df_smiles.dropna(inplace=True)

        self.df = self.df.merge(df_smiles, on='CID', how='left').dropna(subset=['SMILES'])

    def print_stats(self):
        nassays = self.df['AID'].nunique()
        ntargets = self.df['UID'].nunique()
        ncompounds = self.df['CID'].nunique()
        nactivities = self.df.shape[0]
        print(f'{ntargets: >5,} targets | {nassays: >6,} assays | {ncompounds: >9,} compounds | {nactivities: >10,} activity data points')

    def save(self, fname='data/pubchem24_preprocessed.csv.gz'):
        print(f'Save to {fname}...')
        self.df.to_csv(fname, index=False)

    def load(self, fname):
        print(f'Load from {fname}...')
        self.df = pd.read_csv(fname)

    def add_protein_classifications(self):
        """
        Retrieve protein classifications and merge them onto the dataset.
        """
        print('Retrieve protein classifications...')
        protein_class = pd.read_csv(self.prot_class_path)
        # protein_class['UID'] = protein_class['target_id'].swifter.apply(lambda x: x.split('_')[0])
        self.df = self.df.merge(protein_class[['UID', 'Organism', 'L1', 'L2']], on='UID', how='left')


if __name__ == '__main__':
    # Create an instance of the PubChemFilter class
    pubchem_filter = PubChemFilter(PUBCHEM_DIR, FSMOL_UID_PATH, PROT_CLASS_PATH, MHNFS_PATH, debug=False)

    # Call methods of the class as needed
    pubchem_filter.load_and_filter_assays()
    pubchem_filter.load_and_filter_bioactivities()
    pubchem_filter.merge_assay_and_activity_data()
    pubchem_filter.print_stats()
    pubchem_filter.drop_hts_assays()
    pubchem_filter.print_stats()
    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()
    pubchem_filter.drop_conflicting_bioactivity_measures()
    pubchem_filter.print_stats()
    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()
    pubchem_filter.add_smiles()
    pubchem_filter.print_stats()
    pubchem_filter.drop_conflicting_bioactivity_measures(compound_col='SMILES')
    pubchem_filter.print_stats()
    pubchem_filter.drop_targets_with_limited_data()
    pubchem_filter.print_stats()
    pubchem_filter.add_protein_classifications()
    pubchem_filter.save(fname='data/pubchem24/preprocessed.csv.gz')
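Note on drop_conflicting_bioactivity_measures: the per-group Python loop can be slow when millions of target-compound pairs are duplicated. A minimal vectorized sketch of the same filter, assuming only a dataframe with 'UID', 'CID' and binary 'Activity' columns (this helper is illustrative, not part of the committed code):

    import pandas as pd

    def drop_conflicting(df, target_col='UID', compound_col='CID'):
        # Number of distinct activity values per target-compound pair, broadcast to each row
        nuniq = df.groupby([target_col, compound_col])['Activity'].transform('nunique')
        # Keep pairs whose measurements all agree, then collapse each pair to one row
        return df[nuniq == 1].drop_duplicates(subset=[target_col, compound_col])

This yields the same result as the unique/duplicate split above: conflicting pairs are dropped entirely, and consistent duplicates are reduced to a single row.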
pubchem_experiment/make_predictions.py
ADDED
@@ -0,0 +1,172 @@
import os
import sys
import warnings

import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")


def generate_morgan_fingerprints(smiles_list, radius=4, n_bits=4048):
    """
    Generate Morgan fingerprints for a list of SMILES.
    """
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    fps = []
    for smiles, mol in zip(smiles_list, mols):
        if mol is None:
            print(smiles)
            fps.append(None)
        else:
            fps.append(mfpgen.GetFingerprintAsNumPy(mol))
    return fps


def rf(df, train_smiles, test_smiles):
    """
    Train and test the RF baseline model.

    Parameters:
        df : pd.DataFrame with 'SMILES' and 'Activity' columns
        train_smiles : list of training set SMILES
        test_smiles : list of test set SMILES
    Returns:
        preds : predicted probabilities for the test set
    """
    train_df = df[df['SMILES'].isin(train_smiles)]
    test_df = df[df['SMILES'].isin(test_smiles)]

    # Generate Morgan fingerprints for training and test sets
    X_train = generate_morgan_fingerprints(train_df['SMILES'])
    X_test = generate_morgan_fingerprints(test_df['SMILES'])

    # Extract labels
    y_train = train_df['Activity'].values

    # Train a Random Forest classifier
    clf = RandomForestClassifier(n_estimators=200, random_state=82)
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    try:
        preds = clf.predict_proba(X_test)[:, 1]
    except Exception as e:
        print(e)
        print(test_df)
        print(X_test)
        raise

    return preds


def fh(smiles_list):
    df = pd.read_csv('data/fh_predictions.csv')
    preds = df[df['SMILES'].isin(smiles_list)]['Prediction'].tolist()
    return preds


def drop_assays_with_limited_data(df, na_min=50, ni_min=100):
    print('Drop assays with not enough datapoints...')
    unique_aids = df['AID'].sort_values().unique()  # sorted unique assays
    activity_counts = df.groupby('AID')['Activity'].value_counts().unstack().fillna(0)  # rows: sorted assays, columns: 0 = n_inactives, 1 = n_actives
    mask = (activity_counts[1] >= na_min) & (activity_counts[0] >= ni_min)  # both counts above their minimum
    df = df[df['AID'].isin(unique_aids[mask])]
    return df


def run(
    n_actives: int,
    n_inactives: int,
    model: str = 'MHNfs',
    task: str = 'UID',
    input_file: str = '',  # todo add path
    output_dir: str = '',  # todo add path
    n_repeats: int = 3,
    seed: int = 42
):
    # Load data
    data = pd.read_csv(input_file)

    if task == 'AID':
        data = drop_assays_with_limited_data(data, 30, 30)

    # Output dir
    output_dir = os.path.join(output_dir, model, task, f'{n_actives}+{n_inactives}x{n_repeats}')
    print(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Tasks
    tasks = data[task].value_counts(ascending=True).index.tolist()

    if model == 'MHNfs':
        # ActivityPredictor is imported in the __main__ block once mhnfs_path is on sys.path
        predictor = ActivityPredictor()

    # Iterate over tasks
    for t in tqdm(tasks):

        # Output file
        output_file = os.path.join(output_dir, f'{t}.csv')
        if os.path.exists(output_file):
            continue

        # Data for task
        df = data[data[task] == t]

        # Iterate over replicates
        results = []
        for i in range(n_repeats):
            # Select support sets and test molecules
            actives = df.loc[df['Activity'] == 1, 'SMILES'].sample(n=n_actives, random_state=seed + i).tolist()
            inactives = df.loc[df['Activity'] == 0, 'SMILES'].sample(n=n_inactives, random_state=seed + i).tolist()
            test_smiles = df[~df.SMILES.isin(actives + inactives)].SMILES.tolist()

            if model == 'RF':
                preds = rf(df, actives + inactives, test_smiles)
            else:
                if len(test_smiles) > 10_000:
                    # MHNfs breaks for over 20_000 datapoints -> use chunks to make predictions
                    chunk_size = 10_000
                    chunks = [test_smiles[j:j + chunk_size] for j in range(0, len(test_smiles), chunk_size)]
                    preds = []
                    for chunk in chunks:
                        preds.extend(predictor.predict(chunk, actives, inactives))
                else:
                    preds = predictor.predict(test_smiles, actives, inactives)

            d = {
                'SMILES': test_smiles,
                'Label': df.loc[df.SMILES.isin(test_smiles), 'Activity'].tolist(),
                'Prediction': preds,
                'Fold': [i] * len(test_smiles)
            }
            results.append(pd.DataFrame(d))

        results = pd.concat(results)
        results.to_csv(output_file, index=False)


if __name__ == '__main__':

    mhnfs_path = ''  # todo: mhnfs_path + '/mhnfs'
    benchmark_path = ''  # todo: project_path

    sys.path.append(mhnfs_path)
    from src.prediction_pipeline import ActivityPredictor

    support_sets = [(1, 7), (2, 6), (4, 4)]
    models = ['RF', 'MHNfs']
    tasks = ['AID', 'UID']

    input_file = ''  # todo: preprocessed_data path + '/pubchem24_preprocessed_2.csv.gz'

    for support_set in support_sets:
        for model in models:
            for task in tasks:
                run(*support_set, task=task, model=model, input_file=input_file)
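Note on the chunked MHNfs predictions in run(): the inline chunking can be factored into a small reusable helper. A sketch under the assumption that predict_fn maps a list of SMILES to a list of scores (the helper and its name are illustrative, not part of the committed code):

    def predict_in_chunks(predict_fn, smiles, chunk_size=10_000):
        # Score the query molecules in fixed-size slices to stay under the model's input limit
        preds = []
        for start in range(0, len(smiles), chunk_size):
            preds.extend(predict_fn(smiles[start:start + chunk_size]))
        return preds

    # e.g. inside run():
    # preds = predict_in_chunks(lambda s: predictor.predict(s, actives, inactives), test_smiles)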
pubchem_experiment/metrics.py
ADDED
@@ -0,0 +1,163 @@
import os
import math

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from rdkit.ML.Scoring.Scoring import CalcBEDROC
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, average_precision_score, \
    matthews_corrcoef, precision_score, recall_score, f1_score, confusion_matrix

MAIN_DIR = ''  # todo add project dir


def specificity_score(true_labels, predicted_labels):
    tn, fp, _, _ = confusion_matrix(true_labels, predicted_labels).ravel()
    specificity = tn / (tn + fp)
    return specificity


def balanced_mcc_score(sensitivity, specificity, prevalence):
    """Returns the Matthews correlation coefficient at the given
    sensitivity, specificity and prevalence.

    Parameters
    ----------
    sensitivity : float
        The sensitivity of the model
    specificity : float
        The specificity of the model
    prevalence : float
        The prevalence of the test set

    Returns
    -------
    float
        Matthews correlation coefficient as a float
    """
    numerator = sensitivity + specificity - 1
    denominator_first_term = sensitivity + (1 - specificity) * (1 - prevalence) / prevalence
    denominator_second_term = specificity + (1 - sensitivity) * prevalence / (1 - prevalence)
    denominator = math.sqrt(denominator_first_term * denominator_second_term)

    # Avoid 0/0 at the degenerate corners
    if sensitivity == 1 and specificity == 0:
        denominator = 1.
    if sensitivity == 0 and specificity == 1:
        denominator = 1.

    return numerator / denominator


def ef_top_per(true_labels, predictions, prevalence, top_frac=0.01):
    """Enrichment factor: active rate among the top `top_frac` ranked
    compounds divided by the active rate of the whole set."""
    labels = np.asarray(true_labels)
    scores = np.asarray(predictions)
    n = max(int(len(scores) * top_frac), 1)
    top = np.argsort(scores)[::-1][:n]  # indices of the top-scoring compounds
    return (labels[top].sum() / n) / prevalence


def compute_metrics(df):
    """
    Compute a set of classification metrics for a single set of predictions.

    Args:
        df : dataframe with true labels in the 'Label' column and probabilistic predictions in the 'Prediction' column

    Returns:
        df_metrics : dataframe with metric names in the 'Metric' column and values in the 'Value' column
    """
    true_labels = df['Label']
    prevalence = sum(true_labels) / len(true_labels)
    predictions = df['Prediction']

    acc = accuracy_score(true_labels, predictions.round())
    bacc = balanced_accuracy_score(true_labels, predictions.round())
    precision = precision_score(true_labels, predictions.round(), zero_division=0.0)
    recall = recall_score(true_labels, predictions.round())
    specificity = specificity_score(true_labels, predictions.round())
    mcc = matthews_corrcoef(true_labels, predictions.round())
    bmcc = balanced_mcc_score(recall, specificity, prevalence)
    f1 = f1_score(true_labels, predictions.round())

    auc = roc_auc_score(true_labels, predictions)
    ap = average_precision_score(true_labels, predictions)
    dap = ap - prevalence
    scores = df.sort_values(by='Prediction', ascending=False)[['Label', 'Prediction']].values
    bedroc = CalcBEDROC(scores, 0, 20)  # labels in column 0, alpha = 20
    ef = ef_top_per(true_labels, predictions, prevalence, 0.01)

    metrics_dict = {'ACC': acc, 'BACC': bacc, 'MCC': mcc, 'BMCC': bmcc, 'Precision': precision,
                    'Recall': recall, 'F1-score': f1, 'AUC': auc, 'dAP': dap, 'BEDROC': bedroc, 'EF-1%': ef}
    df_metrics = pd.DataFrame(metrics_dict.items(), columns=['Metric', 'Value'])

    return df_metrics


def get_metrics(
        tasks: list[str] = ['AID', 'UID'],
        models: list[str] = ['MHNfs', 'RF'],
        settings: list[str] = ['1+1x3', '1+3x3', '1+7x3', '2+2x3', '2+6x3', '2+14x3',
                               '4+4x3', '4+12x3', '4+28x3', '8+8x3', '8+24x3', '8+56x3'],
        overwrite: bool = False):
    """
    Compute classification metrics for each task-model-setting combination.
    """
    file = f'{MAIN_DIR}/results_used.csv.gz'

    if overwrite:
        df = pd.DataFrame()
    else:
        df = pd.read_csv(file)

    path_preprocessed = ''  # todo
    df_pubchem = pd.read_csv(path_preprocessed)

    for task in tasks:
        for model in models:
            for setting in settings:
                pred_dir = f'{MAIN_DIR}/predictions/{model}/{task}/{setting}'
                try:
                    targets = [x[:-4] for x in os.listdir(pred_dir)]  # strip the '.csv' extension
                    pubchem_targets = df_pubchem[task].astype(str).unique().tolist()

                    for target in tqdm(targets, desc=f'{task} - {model} - {setting}'):

                        if target not in pubchem_targets:
                            continue

                        # Skip already computed targets
                        if not overwrite and any((df['Model'] == model) & (df['Setting'] == setting) & (df['Task'] == task) & (df['TID'] == target)):
                            continue

                        # Load predictions
                        df_task = pd.read_csv(f'{pred_dir}/{target}.csv')

                        # Retrieve organism and L1 protein classification
                        try:
                            org = df_pubchem.loc[df_pubchem[task] == target, 'Organism'].values[0]
                            l1 = df_pubchem.loc[df_pubchem[task] == target, 'L1'].values[0]
                        except IndexError:
                            # AIDs are stored as integers in the preprocessed table
                            org = df_pubchem.loc[df_pubchem[task] == int(target), 'Organism'].values[0]
                            l1 = df_pubchem.loc[df_pubchem[task] == int(target), 'L1'].values[0]
                        if pd.isna(l1):
                            print(target, l1)

                        # Compute metrics for each fold
                        for fold in df_task.Fold.unique():
                            metrics = compute_metrics(df_task[df_task.Fold == fold]).assign(
                                Model=model, Task=task, TID=target, Organism=org, L1=l1, Setting=setting, Fold=fold,
                            )
                            df = pd.concat([df, metrics], ignore_index=True)
                except Exception as e:
                    print(e)
                    raise

    df.to_csv(file, index=False)


if __name__ == '__main__':
    # get_metrics()
    get_metrics(settings=['1+7x3', '2+6x3', '4+4x3', '2+14x3', '4+12x3', '8+8x3'], overwrite=True)
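For reference, balanced_mcc_score implements the Matthews correlation coefficient rewritten in terms of sensitivity Se, specificity Sp and prevalence pi; this LaTeX restatement mirrors the code above:

    \mathrm{MCC} = \frac{Se + Sp - 1}{\sqrt{\left(Se + \frac{(1 - Sp)(1 - \pi)}{\pi}\right)\left(Sp + \frac{(1 - Se)\,\pi}{1 - \pi}\right)}}

A minimal usage sketch for compute_metrics, with illustrative values rather than data from the experiments:

    import pandas as pd

    df = pd.DataFrame({
        'Label':      [1, 0, 1, 0, 0, 1, 0, 0],
        'Prediction': [0.9, 0.2, 0.7, 0.4, 0.1, 0.6, 0.55, 0.3],
    })
    print(compute_metrics(df))  # one row per metric: ACC, BACC, MCC, ..., EF-1%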