Commit d587b0b
Parent(s): Initial Commit

Files changed:
- .gitignore +45 -0
- backend/Procfile +1 -0
- backend/__init__.py +0 -0
- backend/main.py +25 -0
- backend/models/biobert_model.py +23 -0
- backend/pipelines/__init__.py +1 -0
- backend/pipelines/preprocessor_pipeline.py +62 -0
- backend/pipelines/run_inference.py +99 -0
- backend/preprocessing/__init__.py +1 -0
- backend/preprocessing/categorical.py +75 -0
- backend/preprocessing/cleaning.py +42 -0
- backend/preprocessing/embeddings.py +70 -0
- backend/preprocessing/globals.py +17 -0
- backend/preprocessing/preprocessing_all.py +276 -0
- backend/preprocessing/scaling.py +19 -0
- backend/preprocessing/text_processing.py +66 -0
- data/.gitkeep +0 -0
- embeddings/.gitkeep +0 -0
- frontend/app.py +149 -0
- notebooks/clinical-trial-outcome-prediction.ipynb +0 -0
- requirements.txt +25 -0
- save_preprocessor.py +53 -0
- tests/test_inference.py +50 -0
.gitignore
ADDED
@@ -0,0 +1,45 @@
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Virtual environment
+venv/
+venv311/
+.env/
+
+# Data folder - keep folder, ignore all files
+/data/*
+!/data/.gitkeep
+
+# Embeddings folder - keep folder, ignore all files
+embeddings/*
+!embeddings/.gitkeep
+
+
+# Ignore locally downloaded or generated models in the backend
+/backend/models/*.pth
+/backend/models/*.joblib
+/backend/models/*.pkl
+
+# Ignore cloned Hugging Face repo
+/backend/Novartis-models/
+
+# Jupyter notebooks checkpoints
+*.ipynb_checkpoints/
+
+# --- Tools & OS ---
+# IDE / Editor specific
+.vscode/
+.idea/
+
+# Operating System generated files
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Streamlit cache
+.streamlit/

backend/Procfile
ADDED
@@ -0,0 +1 @@
+web: uvicorn main:app --host 0.0.0.0 --port $PORT

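Note: backend/main.py (below) imports its pipeline with a package-relative import, so this Procfile's main:app target only resolves if uvicorn can import the module as part of the backend package; for a local run, uvicorn backend.main:app --reload from the repository root is the safer invocation.
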
backend/__init__.py
ADDED
File without changes

backend/main.py
ADDED
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+import pandas as pd
+from .pipelines.run_inference import predict
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI(title="Study Status Prediction API")
+
+# CORS for frontend
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
+
+@app.post("/predict")
+def predict_endpoint(data: dict | list[dict]):
+    # Convert single row or multiple rows to DataFrame
+    df = pd.DataFrame(data if isinstance(data, list) else [data])
+    predictions = predict(df)
+    return predictions

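For reference, a minimal client-side sketch of the /predict contract: the body is a single JSON object or a list of objects whose keys match the training columns, and the response carries "final_predictions". This assumes the API is served on localhost:8000 and that the requests package is installed (it is not pinned in requirements.txt):

import requests

# Hypothetical, truncated payload -- real requests need every column the
# saved preprocessor expects (see save_preprocessor.py and tests/test_inference.py).
row = {"Brief Summary": "A phase 3 trial of Drug X.", "Enrollment": 500}

resp = requests.post("http://localhost:8000/predict", json=row)
print(resp.json())  # e.g. {"final_predictions": ["COMPLETED"]}
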
backend/models/biobert_model.py
ADDED
@@ -0,0 +1,23 @@
+# biobert_model.py
+# rebuilding the same model architecture as training
+
+import torch
+import torch.nn as nn
+
+class BioBERTClassifier(nn.Module):
+    def __init__(self, embedding_dim=768, num_embeddings=5, num_classes=2, hidden_dim=256):
+        super(BioBERTClassifier, self).__init__()
+        # input size = 5 * 768 = 3840
+        self.fc1 = nn.Linear(embedding_dim * num_embeddings, hidden_dim)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(0.3)
+        self.fc2 = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, e1, e2, e3, e4, e5):
+        # Concatenate all embeddings
+        x = torch.cat((e1, e2, e3, e4, e5), dim=1)  # shape (batch, 3840)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        logits = self.fc2(x)
+        return logits

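A quick shape check of this head with random tensors, as a sketch (not part of the commit):

import torch
from backend.models.biobert_model import BioBERTClassifier

model = BioBERTClassifier()
dummy = [torch.randn(4, 768) for _ in range(5)]  # batch of 4, one 768-d vector per text column
logits = model(*dummy)
print(logits.shape)  # torch.Size([4, 2])
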
backend/pipelines/__init__.py
ADDED
@@ -0,0 +1 @@
+# __init__.py

backend/pipelines/preprocessor_pipeline.py
ADDED
@@ -0,0 +1,62 @@
+# preprocessor_pipeline.py
+import joblib
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+# Import all helper funcs & globals from preprocessing package
+from ..preprocessing.cleaning import (
+    drop_duplicates, select_required_columns, transform_numeric,
+    fill_missing_numerical, fill_missing_categorical, drop_irrelevant_columns
+)
+from ..preprocessing.categorical import expand_study_design, encode_categorical, clean_categorical_columns
+from ..preprocessing.scaling import scale_numeric
+from ..preprocessing.text_processing import preprocess_text_columns, tokenize_text_columns
+from ..preprocessing.embeddings import extract_text_embeddings
+from ..preprocessing.globals import scaler, label_encoders, unique_attributes
+
+
+class Preprocessor:
+    def __init__(self, required_cols, categorical_cols, columns_to_drop, text_columns,
+                 tokenizer=None, biobert_model=None, device="cpu"):
+        self.required_cols = required_cols
+        self.categorical_cols = categorical_cols
+        self.columns_to_drop = columns_to_drop
+        self.text_columns = text_columns
+        self.tokenizer = tokenizer
+        self.biobert_model = biobert_model
+        self.device = device
+
+    def transform(self, df: pd.DataFrame):
+        """Run full preprocessing on a dataframe."""
+        df = drop_duplicates(df)
+        df = select_required_columns(df, self.required_cols)
+        df = transform_numeric(df)
+        df = fill_missing_numerical(df, ["Enrollment"])
+        df = fill_missing_categorical(df, self.categorical_cols)
+        df = expand_study_design(df, unique_attributes)
+        df = drop_irrelevant_columns(df, self.columns_to_drop)
+        df = clean_categorical_columns(df)
+        df = encode_categorical(df, label_encoders)
+        df = scale_numeric(df, scaler)
+        df = preprocess_text_columns(df, self.text_columns)
+
+        embeddings = None
+        if self.tokenizer is not None and self.biobert_model is not None:
+            tokenized_dict = tokenize_text_columns(df, self.text_columns, self.tokenizer)
+            embeddings = extract_text_embeddings(
+                tokenized_dict,
+                self.biobert_model,
+                device=self.device
+            )
+
+        return df, embeddings
+
+    def save(self, path="models/preprocessor.pkl"):
+        """Save preprocessor object."""
+        joblib.dump(self, path)
+
+    @staticmethod
+    def load(path="models/preprocessor.pkl"):
+        """Load preprocessor object."""
+        return joblib.load(path)

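A minimal usage sketch, assuming the artifacts referenced in globals.py exist under backend/models/ and the input file is hypothetical:

import pandas as pd
from backend.pipelines.preprocessor_pipeline import Preprocessor

preprocessor = Preprocessor.load("backend/models/preprocessor.pkl")
df = pd.read_csv("data/new_studies.csv")            # hypothetical input
X_tabular, embeddings = preprocessor.transform(df)
# X_tabular: encoded/scaled DataFrame; embeddings: dict of (n_rows, 768) tensors, or None
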
backend/pipelines/run_inference.py
ADDED
@@ -0,0 +1,99 @@
+import torch
+import os
+import joblib
+import pickle
+import pandas as pd
+import numpy as np
+from huggingface_hub import hf_hub_download
+from pathlib import Path
+from ..models.biobert_model import BioBERTClassifier
+
+# Directory to store downloaded models
+MODEL_DIR = Path(__file__).parent.parent / "models"
+os.makedirs(MODEL_DIR, exist_ok=True)
+
+# Hugging Face repo and filenames
+HF_REPO = "archis99/Novartis-models"
+BIOBERT_FILE = "biobert_classifier.pth"
+RF_FILE = "random_forest_model.joblib"
+PREPROCESSOR_FILE = "preprocessor.pkl"
+
+# Paths for local files
+biobert_path = os.path.join(MODEL_DIR, BIOBERT_FILE)
+rf_path = os.path.join(MODEL_DIR, RF_FILE)
+preprocessor_path = os.path.join(MODEL_DIR, PREPROCESSOR_FILE)
+
+# Download if not present locally
+for file_name, local_path in [(BIOBERT_FILE, biobert_path),
+                              (RF_FILE, rf_path),
+                              (PREPROCESSOR_FILE, preprocessor_path)]:
+    if not os.path.exists(local_path):
+        print(f"Downloading {file_name} from Hugging Face...")
+        hf_hub_download(repo_id=HF_REPO, filename=file_name, local_dir=MODEL_DIR, local_dir_use_symlinks=False)
+
+# Load preprocessor
+with open(preprocessor_path, "rb") as f:
+    preprocessor = pickle.load(f)
+
+# Load Random Forest model
+rf_model = joblib.load(rf_path)
+
+# Load BioBERT model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+biobert_model = BioBERTClassifier()
+biobert_model.load_state_dict(torch.load(biobert_path, map_location=device))
+biobert_model.to(device)
+biobert_model.eval()
+
+# Thresholds & weights from training
+RF_THRESHOLD = 0.1
+BIOBERT_THRESHOLD = 0.3
+ENSEMBLE_THRESHOLD = 0.22999999999999995
+W1, W2 = 2.0, 0.5
+
+# Label mapping
+LABEL_MAP = {0: "COMPLETED", 1: "NOT COMPLETED"}
+
+# Inference function
+def predict(df_new: pd.DataFrame):
+    # Preprocess input
+    X_tabular, embeddings = preprocessor.transform(df_new)
+
+    # Columns to drop for RF
+    textual_columns = [
+        "Brief Summary",
+        "Conditions",
+        "Interventions",
+        "Primary Outcome Measures",
+        "Secondary Outcome Measures"
+    ]
+
+    # Keep only RF-relevant features
+    X_tabular_rf = X_tabular.drop(columns=textual_columns, errors="ignore")
+
+    # RF prediction (probabilities)
+    rf_probs = rf_model.predict_proba(X_tabular_rf)[:, 1]
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # BioBERT prediction
+    e1, e2, e3, e4, e5 = [embeddings[col].to(device) for col in textual_columns]  # unpack embeddings
+    with torch.no_grad():
+        logits = biobert_model(e1, e2, e3, e4, e5)
+        biobert_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
+
+    # Ensemble (soft voting with weights)
+    combined_probs = (W1 * rf_probs + W2 * biobert_probs) / (W1 + W2)
+
+    # Final binary predictions using tuned threshold
+    final_preds = (combined_probs > ENSEMBLE_THRESHOLD).astype(int)
+
+    # Map to human-readable labels
+    final_labels = [LABEL_MAP[p] for p in final_preds]
+
+    return {
+        # "rf_probs": rf_probs.tolist(),
+        # "biobert_probs": biobert_probs.tolist(),
+        # "combined_probs": combined_probs.tolist(),
+        "final_predictions": final_labels
+    }

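To make the soft-voting step concrete, a worked example with the committed weights (the probabilities are made up):

rf_prob, biobert_prob = 0.40, 0.20                              # illustrative class-1 probabilities
combined = (2.0 * rf_prob + 0.5 * biobert_prob) / (2.0 + 0.5)   # = 0.90 / 2.5 = 0.36
# 0.36 > ENSEMBLE_THRESHOLD (~0.23), so the prediction is 1 -> "NOT COMPLETED"
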
backend/preprocessing/__init__.py
ADDED
@@ -0,0 +1 @@
+# preprocessing/__init__.py

backend/preprocessing/categorical.py
ADDED
@@ -0,0 +1,75 @@
+# categorical.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Study Design Parsing
+# ------------------------
+
+def parse_study_design(study_design, all_attributes):
+    # Initialize all allowed attributes as "Unknown"
+    attributes = {attr: "Unknown" for attr in all_attributes}
+
+    if study_design and study_design != "Unknown" and pd.notna(study_design):
+        for part in study_design.split('|'):
+            if ':' in part:
+                key, value = part.split(':', 1)
+                key, value = key.strip(), value.strip()
+
+                # Only keep keys that are in our unique_attributes list
+                if key in all_attributes:
+                    attributes[key] = value
+                # else: ignore unknown keys (do not create new columns)
+
+    return attributes
+
+def expand_study_design(df, unique_attributes):
+    parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
+    study_df = pd.DataFrame(parsed.tolist(), index=df.index)
+
+    # Merge parsed attributes back with df
+    df = pd.concat([df, study_df], axis=1)
+
+    # Drop original Study Design column
+    df = df.drop(columns=['Study Design'], errors='ignore')
+
+    return df
+
+# ------------------------
+# Encoding Categorical Columns
+# ------------------------
+
+def encode_categorical(df, label_encoders):
+    for col, le in label_encoders.items():
+        # Transform using saved encoder; handle unseen labels
+        df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
+        df[col] = le.transform(df[col])
+    return df
+
+def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean and standardize certain categorical columns for inference.
+
+    Replaces missing or malformed values with 'Unknown' to match training preprocessing.
+
+    Args:
+        df (pd.DataFrame): Input dataframe with user data.
+
+    Returns:
+        pd.DataFrame: DataFrame with cleaned categorical columns.
+    """
+    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
+
+    for col in columns_to_clean:
+        # Replace known missing/malformed values with 'Unknown'
+        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+        # Replace actual NaN values with 'Unknown'
+        df[col] = df[col].fillna('Unknown')
+
+    return df

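A small sketch of what the parser returns (the attribute list here is illustrative):

from backend.preprocessing.categorical import parse_study_design

attrs = ["Allocation", "Intervention Model", "Masking", "Primary Purpose"]
design = "Allocation: RANDOMIZED|Intervention Model: PARALLEL|Name: ignored"
print(parse_study_design(design, attrs))
# {'Allocation': 'RANDOMIZED', 'Intervention Model': 'PARALLEL',
#  'Masking': 'Unknown', 'Primary Purpose': 'Unknown'}   ('Name' is not in attrs, so it is dropped)
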
backend/preprocessing/cleaning.py
ADDED
@@ -0,0 +1,42 @@
+# cleaning.py
+
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    return df.drop_duplicates()
+
+def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
+    return df[required_cols].copy()
+
+def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Apply sqrt transform to 'Enrollment' column
+    """
+    df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
+    return df
+
+def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
+    """
+    Fill missing numerical values with the median of each column.
+    """
+    for col in numerical_cols:
+        df[col] = df[col].fillna(df[col].median())
+    return df
+
+def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
+    """
+    Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
+    """
+    for col in columns_to_clean:
+        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+        df[col] = df[col].fillna('Unknown')
+    return df
+
+def drop_irrelevant_columns(df, columns_to_drop):
+    return df.drop(columns=columns_to_drop, errors='ignore')

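For instance, the sqrt transform and median fill compose like this on a toy frame (the transform runs before the fill, as in the pipeline):

import numpy as np
import pandas as pd
from backend.preprocessing.cleaning import transform_numeric, fill_missing_numerical

df = pd.DataFrame({"Enrollment": [100.0, np.nan, 400.0]})
df = transform_numeric(df)                        # sqrt: ~10, NaN, ~20
df = fill_missing_numerical(df, ["Enrollment"])   # NaN -> median of [~10, ~20] = ~15
print(df["Enrollment"].round(3).tolist())         # [10.0, 15.0, 20.0]
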
backend/preprocessing/embeddings.py
ADDED
@@ -0,0 +1,70 @@
+# embeddings.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+
+# ------------------------
+# Extract Embeddings
+# ------------------------
+
+
+def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
+    """
+    Extract embeddings from tokenized textual data using BioBERT.
+
+    Args:
+        tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
+        model (transformers.PreTrainedModel): BioBERT model (without classification head).
+        device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
+        batch_size (int): Batch size for embedding extraction.
+        save_to_disk (bool): Whether to save embeddings as .pt files for each column.
+
+    Returns:
+        dict: Dictionary of embeddings for each column.
+    """
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()  # Ensure model is in evaluation mode
+
+    embeddings_dict = {}
+
+    for col, tokenized_data in tokenized_data_dict.items():
+        print(f"Extracting embeddings for column: {col}")
+
+        input_ids = tokenized_data["input_ids"]
+        attention_mask = tokenized_data["attention_mask"]
+
+        dataset = TensorDataset(input_ids, attention_mask)
+        dataloader = DataLoader(dataset, batch_size=batch_size)
+
+        all_embeddings = []
+
+        with torch.no_grad():
+            for batch in dataloader:
+                input_ids_batch, attention_mask_batch = batch
+                input_ids_batch = input_ids_batch.to(device)
+                attention_mask_batch = attention_mask_batch.to(device)
+
+                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
+                hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
+
+                # Mean pooling across sequence length
+                embeddings = hidden_states.mean(dim=1)
+                all_embeddings.append(embeddings.cpu())
+
+        embeddings_col = torch.cat(all_embeddings, dim=0)
+        embeddings_dict[col] = embeddings_col
+
+        if save_to_disk:
+            torch.save(embeddings_col, f"{col}_embeddings.pt")
+            print(f"Saved embeddings for column: {col}")
+
+        print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
+
+    return embeddings_dict

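A usage sketch pairing this with the tokenizer helper from text_processing.py; the checkpoint name is the one used in save_preprocessor.py:

import pandas as pd
from transformers import AutoTokenizer, AutoModel
from backend.preprocessing.text_processing import tokenize_text_columns
from backend.preprocessing.embeddings import extract_text_embeddings

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

df = pd.DataFrame({"Brief Summary": ["a phase 3 trial of drug x"]})
tokenized = tokenize_text_columns(df, ["Brief Summary"], tokenizer)
emb = extract_text_embeddings(tokenized, bert)
print(emb["Brief Summary"].shape)  # torch.Size([1, 768])
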
backend/preprocessing/globals.py
ADDED
@@ -0,0 +1,17 @@
+# globals.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+from pathlib import Path
+
+BACKEND_DIR = Path(__file__).parent.parent
+
+# --- Load saved artifacts using the absolute path ---
+scaler = joblib.load(BACKEND_DIR / "models/scaler_enrollment.pkl")
+label_encoders = joblib.load(BACKEND_DIR / "models/feature_label_encoders.pkl")
+unique_attributes = joblib.load(BACKEND_DIR / "models/study_design_attributes.pkl")

ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# preprocessing_all.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import joblib
|
| 5 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
| 6 |
+
import torch
|
| 7 |
+
from torch.utils.data import TensorDataset, DataLoader
|
| 8 |
+
from transformers import AutoModel
|
| 9 |
+
|
| 10 |
+
# ------------------------
|
| 11 |
+
# Load saved artifacts
|
| 12 |
+
# ------------------------
|
| 13 |
+
|
| 14 |
+
scaler = joblib.load("models\scaler_enrollment.pkl") # StandardScaler for 'Enrollment'
|
| 15 |
+
label_encoders = joblib.load("models\label_encoders.pkl") # Dict of LabelEncoders for categorical columns
|
| 16 |
+
unique_attributes = joblib.load("models\study_design_attributes.pkl") # List of Study Design attributes
|
| 17 |
+
|
| 18 |
+
def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
|
| 19 |
+
return df.drop_duplicates()
|
| 20 |
+
|
| 21 |
+
def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
|
| 22 |
+
return df[required_cols].copy()
|
| 23 |
+
|
| 24 |
+
def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
|
| 25 |
+
"""
|
| 26 |
+
Apply sqrt transform to 'Enrollment' column
|
| 27 |
+
"""
|
| 28 |
+
df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
|
| 29 |
+
return df
|
| 30 |
+
|
| 31 |
+
def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
|
| 32 |
+
"""
|
| 33 |
+
Fill missing numerical values with the median of each column.
|
| 34 |
+
"""
|
| 35 |
+
for col in numerical_cols:
|
| 36 |
+
df[col] = df[col].fillna(df[col].median())
|
| 37 |
+
return df
|
| 38 |
+
|
| 39 |
+
def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
|
| 40 |
+
"""
|
| 41 |
+
Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
|
| 42 |
+
"""
|
| 43 |
+
for col in columns_to_clean:
|
| 44 |
+
df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
|
| 45 |
+
df[col] = df[col].fillna('Unknown')
|
| 46 |
+
return df
|
| 47 |
+
|
| 48 |
+
def drop_irrelevant_columns(df, columns_to_drop):
|
| 49 |
+
return df.drop(columns=columns_to_drop, errors='ignore')
|
| 50 |
+
|
| 51 |
+
# ------------------------
|
| 52 |
+
# Study Design Parsing
|
| 53 |
+
# ------------------------
|
| 54 |
+
|
| 55 |
+
def parse_study_design(study_design, all_attributes):
|
| 56 |
+
attributes = {attr: "Unknown" for attr in all_attributes}
|
| 57 |
+
if study_design != "Unknown" and pd.notna(study_design):
|
| 58 |
+
for part in study_design.split('|'):
|
| 59 |
+
if ':' in part:
|
| 60 |
+
key, value = part.split(':', 1)
|
| 61 |
+
attributes[key.strip()] = value.strip()
|
| 62 |
+
return attributes
|
| 63 |
+
|
| 64 |
+
def expand_study_design(df, unique_attributes):
|
| 65 |
+
parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
|
| 66 |
+
study_df = pd.DataFrame(parsed.tolist(), index=df.index)
|
| 67 |
+
df = pd.concat([df, study_df], axis=1)
|
| 68 |
+
df = df.drop(columns=['Study Design'], errors='ignore')
|
| 69 |
+
return df
|
| 70 |
+
|
| 71 |
+
# ------------------------
|
| 72 |
+
# Encoding Categorical Columns
|
| 73 |
+
# ------------------------
|
| 74 |
+
|
| 75 |
+
def encode_categorical(df, label_encoders):
|
| 76 |
+
for col, le in label_encoders.items():
|
| 77 |
+
# Transform using saved encoder; handle unseen labels
|
| 78 |
+
df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
|
| 79 |
+
df[col] = le.transform(df[col])
|
| 80 |
+
return df
|
| 81 |
+
|
| 82 |
+
def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 83 |
+
"""
|
| 84 |
+
Clean and standardize certain categorical columns for inference.
|
| 85 |
+
|
| 86 |
+
Replaces missing or malformed values with 'Unknown' to match training preprocessing.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
df (pd.DataFrame): Input dataframe with user data.
|
| 90 |
+
|
| 91 |
+
Returns:
|
| 92 |
+
pd.DataFrame: DataFrame with cleaned categorical columns.
|
| 93 |
+
"""
|
| 94 |
+
columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
|
| 95 |
+
|
| 96 |
+
for col in columns_to_clean:
|
| 97 |
+
# Replace known missing/malformed values with 'Unknown'
|
| 98 |
+
df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
|
| 99 |
+
# Replace actual NaN values with 'Unknown'
|
| 100 |
+
df[col] = df[col].fillna('Unknown')
|
| 101 |
+
|
| 102 |
+
return df
|
| 103 |
+
|
| 104 |
+
# ------------------------
|
| 105 |
+
# Scaling numeric columns
|
| 106 |
+
# ------------------------
|
| 107 |
+
|
| 108 |
+
def scale_numeric(df, scaler):
|
| 109 |
+
"""
|
| 110 |
+
Standardize numerical columns using StandardScaler.
|
| 111 |
+
"""
|
| 112 |
+
df['Enrollment'] = scaler.transform(df[['Enrollment']])
|
| 113 |
+
return df
|
| 114 |
+
|
| 115 |
+
# ------------------------
|
| 116 |
+
# Text preprocessing
|
| 117 |
+
# ------------------------
|
| 118 |
+
|
| 119 |
+
def clean_text(text):
|
| 120 |
+
if pd.isna(text): # Handle missing values
|
| 121 |
+
return ""
|
| 122 |
+
text = text.lower() # Convert to lowercase
|
| 123 |
+
text = ''.join(char for char in text if char.isalnum() or char.isspace()) # Remove special characters
|
| 124 |
+
return ' '.join(text.split()) # Remove extra whitespaces
|
| 125 |
+
|
| 126 |
+
def preprocess_text_columns(df, text_columns):
|
| 127 |
+
for col in text_columns:
|
| 128 |
+
df[col] = df[col].fillna("No info provided")
|
| 129 |
+
df[col] = df[col].apply(clean_text)
|
| 130 |
+
return df
|
| 131 |
+
|
| 132 |
+
# ------------------------
|
| 133 |
+
# Tokenization of textual Columns
|
| 134 |
+
# ------------------------
|
| 135 |
+
|
| 136 |
+
def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
|
| 137 |
+
"""
|
| 138 |
+
Tokenizes multiple textual columns in batches for inference.
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
df (pd.DataFrame): DataFrame containing textual columns.
|
| 142 |
+
textual_columns (list): List of column names to tokenize.
|
| 143 |
+
tokenizer: HuggingFace tokenizer.
|
| 144 |
+
batch_size (int): Number of samples per batch.
|
| 145 |
+
max_length (int): Maximum token length per sequence.
|
| 146 |
+
|
| 147 |
+
Returns:
|
| 148 |
+
dict: Dictionary with column names as keys and tokenized tensors as values.
|
| 149 |
+
"""
|
| 150 |
+
def tokenize_in_batches(column_texts):
|
| 151 |
+
tokenized_batches = []
|
| 152 |
+
for i in range(0, len(column_texts), batch_size):
|
| 153 |
+
batch = column_texts[i:i + batch_size].tolist()
|
| 154 |
+
tokenized_batch = tokenizer(
|
| 155 |
+
batch,
|
| 156 |
+
padding="max_length",
|
| 157 |
+
truncation=True,
|
| 158 |
+
max_length=max_length,
|
| 159 |
+
return_tensors="pt"
|
| 160 |
+
)
|
| 161 |
+
tokenized_batches.append(tokenized_batch)
|
| 162 |
+
# Combine batches
|
| 163 |
+
return {
|
| 164 |
+
"input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
|
| 165 |
+
"attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
tokenized_data = {}
|
| 169 |
+
for col in textual_columns:
|
| 170 |
+
tokenized_data[col] = tokenize_in_batches(df[col])
|
| 171 |
+
return tokenized_data
|
| 172 |
+
|
| 173 |
+
# ------------------------
|
| 174 |
+
# Extract Embeddings
|
| 175 |
+
# ------------------------
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
|
| 179 |
+
"""
|
| 180 |
+
Extract embeddings from tokenized textual data using BioBERT.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
|
| 184 |
+
model (transformers.PreTrainedModel): BioBERT model (without classification head).
|
| 185 |
+
device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
|
| 186 |
+
batch_size (int): Batch size for embedding extraction.
|
| 187 |
+
save_to_disk (bool): Whether to save embeddings as .pt files for each column.
|
| 188 |
+
|
| 189 |
+
Returns:
|
| 190 |
+
dict: Dictionary of embeddings for each column.
|
| 191 |
+
"""
|
| 192 |
+
if device is None:
|
| 193 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 194 |
+
model.to(device)
|
| 195 |
+
model.eval() # Ensure model is in evaluation mode
|
| 196 |
+
|
| 197 |
+
embeddings_dict = {}
|
| 198 |
+
|
| 199 |
+
for col, tokenized_data in tokenized_data_dict.items():
|
| 200 |
+
print(f"Extracting embeddings for column: {col}")
|
| 201 |
+
|
| 202 |
+
input_ids = tokenized_data["input_ids"]
|
| 203 |
+
attention_mask = tokenized_data["attention_mask"]
|
| 204 |
+
|
| 205 |
+
dataset = TensorDataset(input_ids, attention_mask)
|
| 206 |
+
dataloader = DataLoader(dataset, batch_size=batch_size)
|
| 207 |
+
|
| 208 |
+
all_embeddings = []
|
| 209 |
+
|
| 210 |
+
with torch.no_grad():
|
| 211 |
+
for batch in dataloader:
|
| 212 |
+
input_ids_batch, attention_mask_batch = batch
|
| 213 |
+
input_ids_batch = input_ids_batch.to(device)
|
| 214 |
+
attention_mask_batch = attention_mask_batch.to(device)
|
| 215 |
+
|
| 216 |
+
outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
|
| 217 |
+
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_dim]
|
| 218 |
+
|
| 219 |
+
# Mean pooling across sequence length
|
| 220 |
+
embeddings = hidden_states.mean(dim=1)
|
| 221 |
+
all_embeddings.append(embeddings.cpu())
|
| 222 |
+
|
| 223 |
+
embeddings_col = torch.cat(all_embeddings, dim=0)
|
| 224 |
+
embeddings_dict[col] = embeddings_col
|
| 225 |
+
|
| 226 |
+
if save_to_disk:
|
| 227 |
+
torch.save(embeddings_col, f"{col}_embeddings.pt")
|
| 228 |
+
print(f"Saved embeddings for column: {col}")
|
| 229 |
+
|
| 230 |
+
print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
|
| 231 |
+
|
| 232 |
+
return embeddings_dict
|
| 233 |
+
|
| 234 |
+
# ------------------------
|
| 235 |
+
# Main preprocessing function
|
| 236 |
+
# ------------------------
|
| 237 |
+
|
| 238 |
+
def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns,
|
| 239 |
+
tokenizer=None, biobert_model=None, device='cpu'):
|
| 240 |
+
"""
|
| 241 |
+
Full preprocessing pipeline.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
df (pd.DataFrame): Input DataFrame (single row or batch).
|
| 245 |
+
required_cols (list): Columns to select from df.
|
| 246 |
+
categorical_cols (list): Categorical columns to encode.
|
| 247 |
+
columns_to_drop (list): Columns to drop from df.
|
| 248 |
+
text_columns (list): Textual columns to preprocess.
|
| 249 |
+
tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text.
|
| 250 |
+
biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head).
|
| 251 |
+
device (str): 'cpu' or 'cuda'.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
df (pd.DataFrame): Preprocessed tabular DataFrame.
|
| 255 |
+
embeddings (dict or None): Dict of embeddings for text columns, if model provided.
|
| 256 |
+
"""
|
| 257 |
+
# Tabular preprocessing
|
| 258 |
+
df = drop_duplicates(df)
|
| 259 |
+
df = select_required_columns(df, required_cols)
|
| 260 |
+
df = transform_numeric(df)
|
| 261 |
+
df = fill_missing_numerical(df, ["Enrollment"]) # median fill for Enrollment
|
| 262 |
+
df = fill_missing_categorical(df, categorical_cols)
|
| 263 |
+
df = drop_irrelevant_columns(df, columns_to_drop)
|
| 264 |
+
df = expand_study_design(df, unique_attributes)
|
| 265 |
+
df = clean_categorical_columns(df)
|
| 266 |
+
df = encode_categorical(df, label_encoders)
|
| 267 |
+
df = scale_numeric(df, scaler)
|
| 268 |
+
df = preprocess_text_columns(df, text_columns)
|
| 269 |
+
|
| 270 |
+
embeddings = None
|
| 271 |
+
if tokenizer is not None and biobert_model is not None:
|
| 272 |
+
tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer)
|
| 273 |
+
embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device)
|
| 274 |
+
|
| 275 |
+
return df, embeddings
|
| 276 |
+
|
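This module bundles the same helpers as the split preprocessing files above into one script, ending in a functional preprocess() entry point. A usage sketch, assuming the pickled artifacts it loads at import time resolve from the working directory (the column lists are taken from save_preprocessor.py):

df_out, embeddings = preprocess(
    df,                                        # some input DataFrame
    required_cols=df.columns.tolist(),
    categorical_cols=["Study Results", "Sex", "Age", "Funder Type", "Phases", "Study Type"],
    columns_to_drop=["Sponsor", "Observational Model", "Time Perspective"],
    text_columns=["Brief Summary", "Conditions", "Interventions",
                  "Primary Outcome Measures", "Secondary Outcome Measures"],
)  # tokenizer/biobert_model omitted, so embeddings is None
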
backend/preprocessing/scaling.py
ADDED
@@ -0,0 +1,19 @@
+# scaling.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Scaling numeric columns
+# ------------------------
+
+def scale_numeric(df, scaler):
+    """
+    Standardize numerical columns using StandardScaler.
+    """
+    df['Enrollment'] = scaler.transform(df[['Enrollment']])
+    return df

backend/preprocessing/text_processing.py
ADDED
@@ -0,0 +1,66 @@
+# text_processing.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Text preprocessing
+# ------------------------
+
+def clean_text(text):
+    if pd.isna(text):  # Handle missing values
+        return ""
+    text = text.lower()  # Convert to lowercase
+    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
+    return ' '.join(text.split())  # Remove extra whitespaces
+
+def preprocess_text_columns(df, text_columns):
+    for col in text_columns:
+        df[col] = df[col].fillna("No info provided")
+        df[col] = df[col].apply(clean_text)
+    return df
+
+# ------------------------
+# Tokenization of textual Columns
+# ------------------------
+
+def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
+    """
+    Tokenizes multiple textual columns in batches for inference.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing textual columns.
+        textual_columns (list): List of column names to tokenize.
+        tokenizer: HuggingFace tokenizer.
+        batch_size (int): Number of samples per batch.
+        max_length (int): Maximum token length per sequence.
+
+    Returns:
+        dict: Dictionary with column names as keys and tokenized tensors as values.
+    """
+    def tokenize_in_batches(column_texts):
+        tokenized_batches = []
+        for i in range(0, len(column_texts), batch_size):
+            batch = column_texts[i:i + batch_size].tolist()
+            tokenized_batch = tokenizer(
+                batch,
+                padding="max_length",
+                truncation=True,
+                max_length=max_length,
+                return_tensors="pt"
+            )
+            tokenized_batches.append(tokenized_batch)
+        # Combine batches
+        return {
+            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
+            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
+        }
+
+    tokenized_data = {}
+    for col in textual_columns:
+        tokenized_data[col] = tokenize_in_batches(df[col])
+    return tokenized_data

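For example, clean_text lowercases, strips non-alphanumerics, and collapses whitespace:

from backend.preprocessing.text_processing import clean_text

print(clean_text("Phase-3 Trial:  Drug X!!"))  # -> "phase3 trial drug x"
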
data/.gitkeep
ADDED
File without changes

embeddings/.gitkeep
ADDED
File without changes

frontend/app.py
ADDED
@@ -0,0 +1,149 @@
+import sys
+import os
+from pathlib import Path
+
+# PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+# if PROJECT_ROOT not in sys.path:
+#     sys.path.insert(0, PROJECT_ROOT)
+
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+import streamlit as st
+import pandas as pd
+from backend.pipelines.run_inference import predict
+from io import BytesIO
+
+st.set_page_config(page_title="Study Status Prediction", page_icon="📊", layout="wide")
+
+st.title("📊 Study Status Prediction")
+st.markdown("Upload a CSV file or manually enter study details to predict whether a study is **COMPLETED** or **NOT COMPLETED**.")
+
+# Tabs for CSV Upload and Manual Entry
+tab1, tab2 = st.tabs(["📂 Upload CSV", "✍️ Manual Entry"])
+
+# --- Option 1: CSV Upload ---
+with tab1:
+    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
+
+    if uploaded_file:
+        df_new = pd.read_csv(uploaded_file)
+        preds = predict(df_new)
+
+        # Only keep final predictions
+        final_preds = pd.DataFrame({"Final Prediction": preds["final_predictions"]})
+
+        # Display a preview
+        st.subheader("🔎 Predictions Preview")
+        st.dataframe(final_preds.head())
+
+        # Download button for predictions
+        csv_buffer = BytesIO()
+        final_preds.to_csv(csv_buffer, index=False)
+        st.download_button(
+            label="📥 Download Predictions CSV",
+            data=csv_buffer.getvalue(),
+            file_name="predictions.csv",
+            mime="text/csv"
+        )
+
+# --- Option 2: Manual Entry ---
+with tab2:
+    st.subheader("✍️ Enter Study Details")
+    st.markdown("Fill in the fields below to predict the study status.")
+
+    # Placeholders for all the features
+    nct_number = st.text_input("NCT Number", placeholder="e.g., NCT01234567")
+    study_title = st.text_area("Study Title", placeholder="e.g., A Study of Drug X in Treating Lung Cancer")
+    study_url = st.text_input("Study URL", placeholder="e.g., https://clinicaltrials.gov/ct2/show/NCT01234567")
+    acronym = st.text_input("Acronym", placeholder="e.g., LUNG-X")
+    brief_summary = st.text_area("Brief Summary", placeholder="e.g., This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.")
+    study_results = st.selectbox("Study Results", ["YES", "NO"])
+    conditions = st.text_input("Conditions", placeholder="e.g., Lung Cancer")
+    interventions = st.text_input("Interventions", placeholder="e.g., Drug Y")
+    primary_outcome = st.text_input("Primary Outcome Measures", placeholder="e.g., Survival rate")
+    secondary_outcome = st.text_input("Secondary Outcome Measures", placeholder="e.g., Side effects")
+    other_outcome = st.text_input("Other Outcome Measures", placeholder="Optional")
+    sponsor = st.text_input("Sponsor", placeholder="e.g., ABC Research")
+    collaborators = st.text_input("Collaborators", placeholder="e.g., University of SFX")
+    sex = st.selectbox("Sex", ["ALL", "MALE", "FEMALE"])
+    age = st.selectbox("Age", ["ADULT, OLDER_ADULT",
+                               "ADULT",
+                               "CHILD, ADULT, OLDER_ADULT",
+                               "CHILD",
+                               "CHILD, ADULT",
+                               "OLDER_ADULT"])
+    phases = st.selectbox("Phases", ["PHASE2",
+                                     "PHASE1",
+                                     "PHASE4",
+                                     "PHASE3",
+                                     "PHASE1|PHASE2",
+                                     "PHASE2|PHASE3",
+                                     "EARLY_PHASE1"])
+    enrollment = st.number_input("Enrollment", min_value=0, step=1, placeholder="e.g., 500")
+    funder_type = st.selectbox("Funder Type", ["OTHER",
+                                               "INDUSTRY",
+                                               "NIH",
+                                               "OTHER_GOV",
+                                               "NETWORK",
+                                               "FED",
+                                               "INDIV",
+                                               "UNKNOWN",
+                                               "AMBIG"])
+    study_type = st.selectbox("Study Type", ["INTERVENTIONAL", "OBSERVATIONAL"])
+    study_design = st.text_area("Study Design", placeholder="e.g., Intervention Model: PARALLEL | Masking: SINGLE (INVESTIGATOR)")
+    other_ids = st.text_input("Other IDs", placeholder="e.g., ABC-123")
+    start_date = st.text_input("Start Date", placeholder="e.g., January 2023")
+    primary_completion_date = st.text_input("Primary Completion Date", placeholder="e.g., December 2025")
+    completion_date = st.text_input("Completion Date", placeholder="e.g., June 2026")
+    first_posted = st.text_input("First Posted", placeholder="e.g., February 2023")
+    results_first_posted = st.text_input("Results First Posted", placeholder="e.g., N/A")
+    last_update_posted = st.text_input("Last Update Posted", placeholder="e.g., September 2025")
+    locations = st.text_area("Locations", placeholder="e.g., New York, USA")
+    study_documents = st.text_area("Study Documents", placeholder="e.g., Protocol PDF")
+
+
+
+    if st.button("🔮 Predict Status"):
+        single_data = {
+            "NCT Number": nct_number,
+            "Study Title": study_title,
+            "Study URL": study_url,
+            "Acronym": acronym,
+            "Brief Summary": brief_summary,
+            "Study Results": study_results,
+            "Conditions": conditions,
+            "Interventions": interventions,
+            "Primary Outcome Measures": primary_outcome,
+            "Secondary Outcome Measures": secondary_outcome,
+            "Other Outcome Measures": other_outcome,
+            "Sponsor": sponsor,
+            "Collaborators": collaborators,
+            "Sex": sex,
+            "Age": age,
+            "Phases": phases,
+            "Enrollment": enrollment,
+            "Funder Type": funder_type,
+            "Study Type": study_type,
+            "Study Design": study_design,
+            "Other IDs": other_ids,
+            "Start Date": start_date,
+            "Primary Completion Date": primary_completion_date,
+            "Completion Date": completion_date,
+            "First Posted": first_posted,
+            "Results First Posted": results_first_posted,
+            "Last Update Posted": last_update_posted,
+            "Locations": locations,
+            "Study Documents": study_documents,
+        }
+
+        df_single = pd.DataFrame([single_data])
+
+        preds = predict(df_single)
+
+        # Show final prediction with animation
+        final_label = preds["final_predictions"][0]
+        if final_label == "COMPLETED":
+            st.success(f"✅ Prediction: **{final_label}**", icon="🎉")
+        else:
+            st.error(f"❌ Prediction: **{final_label}**", icon="⚠️")

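To try the UI locally, the usual invocation is streamlit run frontend/app.py from the repository root. The sys.path insertion at the top of the file lets the app call backend.pipelines.run_inference.predict in-process (downloading the model artifacts from the Hugging Face repo on first import) rather than going through the FastAPI service.
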
notebooks/clinical-trial-outcome-prediction.ipynb
ADDED
The diff for this file is too large to render.

requirements.txt
ADDED
@@ -0,0 +1,25 @@
+# Core libraries
+numpy>=1.26.4,<2.2
+pandas==2.2.2
+scikit-learn==1.2.2
+joblib==1.3.2
+
+# FastAPI and server
+fastapi==0.111.0
+uvicorn==0.29.0
+
+# Data visualization / plotting
+matplotlib==3.9.0
+seaborn==0.13.2
+
+# Streamlit
+streamlit==1.35.0
+
+# # PyTorch
+# torch==2.3.0
+# torchvision==0.18.0
+# torchaudio==2.3.0
+
+# Transformers / HuggingFace
+transformers==4.43.0
+huggingface_hub==0.23.4

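Note that the PyTorch pins are commented out even though the backend imports torch at module load, so an environment built from this file alone cannot run inference; presumably torch is installed separately on the deployment target (for example, a CPU-only wheel).
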
save_preprocessor.py
ADDED
@@ -0,0 +1,53 @@
+# save_preprocessor.py
+from transformers import AutoTokenizer, AutoModel
+from backend.pipelines.preprocessor_pipeline import Preprocessor
+import pandas as pd
+
+# Define dataset columns (adapt to your dataset)
+sample_df = pd.DataFrame([{
+    "Brief Summary": "This is a sample study.",
+    "Study Results": "Has Results",
+    "Conditions": "Condition A",
+    "Interventions": "Drug X",
+    "Primary Outcome Measures": "Outcome 1",
+    "Secondary Outcome Measures": "Outcome 2",
+    "Sponsor": "XYZ Corp",
+    "Sex": "All",
+    "Age": "Adult",
+    "Funder Type": "Industry",
+    "Phases": "Phase 2",
+    "Enrollment": 120,
+    "Study Type": "Interventional",
+    "Study Design": "Intervention: Randomized|Masking: Double",
+}])
+
+required_cols = sample_df.columns.tolist()
+categorical_cols = [
+    "Study Results", "Sex", "Age", "Funder Type", "Phases",
+    "Study Type"
+]
+columns_to_drop = ["Sponsor", "Observational Model", "Time Perspective"]
+text_columns = [
+    "Brief Summary", "Conditions", "Interventions",
+    "Primary Outcome Measures", "Secondary Outcome Measures"
+]
+
+# Load BioBERT
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+
+# Ensure pad token exists
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.cls_token or "[PAD]"
+
+# Create and save preprocessor
+preprocessor = Preprocessor(
+    required_cols,
+    categorical_cols,
+    columns_to_drop,
+    text_columns,
+    tokenizer=tokenizer,
+    biobert_model=model,
+    device="cpu"
+)
+preprocessor.save("backend/models/preprocessor.pkl")

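This script is presumably run from the repository root (python save_preprocessor.py) so that the backend package import and the backend/models/ output path both resolve; the Bio_ClinicalBERT weights are downloaded on first run.
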
tests/test_inference.py
ADDED
@@ -0,0 +1,50 @@
+# test_inference.py
+import pandas as pd
+from backend.pipelines.preprocessor_pipeline import Preprocessor
+
+# Load saved preprocessor
+preprocessor = Preprocessor.load("backend/models/preprocessor.pkl")
+
+# Sample new data for inference
+df_new = pd.DataFrame([{
+    "NCT Number": "NCT01234567",
+    "Study Title": "A Study of Drug X in Treating Lung Cancer",
+    "Study URL": "https://clinicaltrials.gov/ct2/show/NCT01234567",
+    "Acronym": "LUNG-X",
+    "Brief Summary": "This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.",
+    "Study Results": "NO",
+    "Conditions": "Lung Cancer",
+    "Interventions": "Drug Y",
+    "Primary Outcome Measures": "Survival rate",
+    "Secondary Outcome Measures": "Side effects",
+    "Other Outcome Measures": "",
+    "Sponsor": "ABC Research",
+    "Collaborators": "University of SFX",
+    "Sex": "MALE",
+    "Age": "garbage value - jhfkjahfaiueuw",
+    "Phases": "Phase 3",
+    "Enrollment": 500,
+    "Funder Type": "Government",
+    "Study Type": "Archchisman",
+    "Study Design": "Intervention Model: Randomized|Masking: QUADRUPLE (PARTICIPANT, CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR)|Observational Model: Observing|Name: Archchisman Banerjee",
+    "Other IDs": "ABC-123",
+    "Start Date": "January 2023",
+    "Primary Completion Date": "December 2025",
+    "Completion Date": "June 2026",
+    "First Posted": "February 2023",
+    "Results First Posted": "N/A",
+    "Last Update Posted": "September 2025",
+    "Locations": "New York, USA",
+    "Study Documents": "Protocol PDF"
+}])
+
+X_tabular, embeddings = preprocessor.transform(df_new)
+
+print("Processed Tabular Features:")
+print(X_tabular.head())
+X_tabular.to_csv("test.csv")
+
+if embeddings:
+    for col, emb in embeddings.items():
+        print(f"Embeddings for {col}: {emb.shape}")