Commit d587b0b
Parent(s): Initial Commit

Files changed:
- .gitignore +45 -0
- backend/Procfile +1 -0
- backend/__init__.py +0 -0
- backend/main.py +25 -0
- backend/models/biobert_model.py +23 -0
- backend/pipelines/__init__.py +1 -0
- backend/pipelines/preprocessor_pipeline.py +62 -0
- backend/pipelines/run_inference.py +99 -0
- backend/preprocessing/__init__.py +1 -0
- backend/preprocessing/categorical.py +75 -0
- backend/preprocessing/cleaning.py +42 -0
- backend/preprocessing/embeddings.py +70 -0
- backend/preprocessing/globals.py +17 -0
- backend/preprocessing/preprocessing_all.py +276 -0
- backend/preprocessing/scaling.py +19 -0
- backend/preprocessing/text_processing.py +66 -0
- data/.gitkeep +0 -0
- embeddings/.gitkeep +0 -0
- frontend/app.py +149 -0
- notebooks/clinical-trial-outcome-prediction.ipynb +0 -0
- requirements.txt +25 -0
- save_preprocessor.py +53 -0
- tests/test_inference.py +50 -0
.gitignore
ADDED
@@ -0,0 +1,45 @@
+# Python cache
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+
+# Virtual environment
+venv/
+venv311/
+.env/
+
+# Data folder - keep folder, ignore all files
+/data/*
+!/data/.gitkeep
+
+# Embeddings folder - keep folder, ignore all files
+embeddings/*
+!embeddings/.gitkeep
+
+
+# Ignore locally downloaded or generated models in the backend
+/backend/models/*.pth
+/backend/models/*.joblib
+/backend/models/*.pkl
+
+# Ignore cloned Hugging Face repo
+/backend/Novartis-models/
+
+# Jupyter notebooks checkpoints
+*.ipynb_checkpoints/
+
+# --- Tools & OS ---
+# IDE / Editor specific
+.vscode/
+.idea/
+
+# Operating System generated files
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Streamlit cache
+.streamlit/

backend/Procfile
ADDED
@@ -0,0 +1 @@
+web: uvicorn main:app --host 0.0.0.0 --port $PORT

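Note: backend/main.py (below) imports its pipeline with a package-relative import, so this Procfile's main:app target only resolves if uvicorn can import the module as part of the backend package; for a local run, uvicorn backend.main:app --reload from the repository root is the safer invocation.
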
backend/__init__.py
ADDED
File without changes

backend/main.py
ADDED
@@ -0,0 +1,25 @@
+from fastapi import FastAPI
+import pandas as pd
+from .pipelines.run_inference import predict
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI(title="Study Status Prediction API")
+
+# CORS for frontend
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+@app.get("/health")
+def health_check():
+    return {"status": "ok"}
+
+@app.post("/predict")
+def predict_endpoint(data: dict | list[dict]):
+    # Convert single row or multiple rows to DataFrame
+    df = pd.DataFrame(data if isinstance(data, list) else [data])
+    predictions = predict(df)
+    return predictions

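For reference, a minimal client-side sketch of the /predict contract: the body is a single JSON object or a list of objects whose keys match the training columns, and the response carries "final_predictions". This assumes the API is served on localhost:8000 and that the requests package is installed (it is not pinned in requirements.txt):

import requests

# Hypothetical, truncated payload -- real requests need every column the
# saved preprocessor expects (see save_preprocessor.py and tests/test_inference.py).
row = {"Brief Summary": "A phase 3 trial of Drug X.", "Enrollment": 500}

resp = requests.post("http://localhost:8000/predict", json=row)
print(resp.json())  # e.g. {"final_predictions": ["COMPLETED"]}
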
backend/models/biobert_model.py
ADDED
@@ -0,0 +1,23 @@
+# biobert_model.py
+# rebuilding the same model architecture as training
+
+import torch
+import torch.nn as nn
+
+class BioBERTClassifier(nn.Module):
+    def __init__(self, embedding_dim=768, num_embeddings=5, num_classes=2, hidden_dim=256):
+        super(BioBERTClassifier, self).__init__()
+        # input size = 5 * 768 = 3840
+        self.fc1 = nn.Linear(embedding_dim * num_embeddings, hidden_dim)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(0.3)
+        self.fc2 = nn.Linear(hidden_dim, num_classes)
+
+    def forward(self, e1, e2, e3, e4, e5):
+        # Concatenate all embeddings
+        x = torch.cat((e1, e2, e3, e4, e5), dim=1)  # shape (batch, 3840)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.dropout(x)
+        logits = self.fc2(x)
+        return logits

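A quick shape check of this head with random tensors, as a sketch (not part of the commit):

import torch
from backend.models.biobert_model import BioBERTClassifier

model = BioBERTClassifier()
dummy = [torch.randn(4, 768) for _ in range(5)]  # batch of 4, one 768-d vector per text column
logits = model(*dummy)
print(logits.shape)  # torch.Size([4, 2])
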
backend/pipelines/__init__.py
ADDED
@@ -0,0 +1 @@
+# __init__.py

backend/pipelines/preprocessor_pipeline.py
ADDED
@@ -0,0 +1,62 @@
+# preprocessor_pipeline.py
+import joblib
+import pandas as pd
+import torch
+from transformers import AutoTokenizer, AutoModel
+
+# Import all helper funcs & globals from preprocessing package
+from ..preprocessing.cleaning import (
+    drop_duplicates, select_required_columns, transform_numeric,
+    fill_missing_numerical, fill_missing_categorical, drop_irrelevant_columns
+)
+from ..preprocessing.categorical import expand_study_design, encode_categorical, clean_categorical_columns
+from ..preprocessing.scaling import scale_numeric
+from ..preprocessing.text_processing import preprocess_text_columns, tokenize_text_columns
+from ..preprocessing.embeddings import extract_text_embeddings
+from ..preprocessing.globals import scaler, label_encoders, unique_attributes
+
+
+class Preprocessor:
+    def __init__(self, required_cols, categorical_cols, columns_to_drop, text_columns,
+                 tokenizer=None, biobert_model=None, device="cpu"):
+        self.required_cols = required_cols
+        self.categorical_cols = categorical_cols
+        self.columns_to_drop = columns_to_drop
+        self.text_columns = text_columns
+        self.tokenizer = tokenizer
+        self.biobert_model = biobert_model
+        self.device = device
+
+    def transform(self, df: pd.DataFrame):
+        """Run full preprocessing on a dataframe."""
+        df = drop_duplicates(df)
+        df = select_required_columns(df, self.required_cols)
+        df = transform_numeric(df)
+        df = fill_missing_numerical(df, ["Enrollment"])
+        df = fill_missing_categorical(df, self.categorical_cols)
+        df = expand_study_design(df, unique_attributes)
+        df = drop_irrelevant_columns(df, self.columns_to_drop)
+        df = clean_categorical_columns(df)
+        df = encode_categorical(df, label_encoders)
+        df = scale_numeric(df, scaler)
+        df = preprocess_text_columns(df, self.text_columns)
+
+        embeddings = None
+        if self.tokenizer is not None and self.biobert_model is not None:
+            tokenized_dict = tokenize_text_columns(df, self.text_columns, self.tokenizer)
+            embeddings = extract_text_embeddings(
+                tokenized_dict,
+                self.biobert_model,
+                device=self.device
+            )
+
+        return df, embeddings
+
+    def save(self, path="models/preprocessor.pkl"):
+        """Save preprocessor object."""
+        joblib.dump(self, path)
+
+    @staticmethod
+    def load(path="models/preprocessor.pkl"):
+        """Load preprocessor object."""
+        return joblib.load(path)

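A minimal usage sketch, assuming the artifacts referenced in globals.py exist under backend/models/ and the input file is hypothetical:

import pandas as pd
from backend.pipelines.preprocessor_pipeline import Preprocessor

preprocessor = Preprocessor.load("backend/models/preprocessor.pkl")
df = pd.read_csv("data/new_studies.csv")            # hypothetical input
X_tabular, embeddings = preprocessor.transform(df)
# X_tabular: encoded/scaled DataFrame; embeddings: dict of (n_rows, 768) tensors, or None
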
backend/pipelines/run_inference.py
ADDED
@@ -0,0 +1,99 @@
+import torch
+import os
+import joblib
+import pickle
+import pandas as pd
+import numpy as np
+from huggingface_hub import hf_hub_download
+from pathlib import Path
+from ..models.biobert_model import BioBERTClassifier
+
+# Directory to store downloaded models
+MODEL_DIR = Path(__file__).parent.parent / "models"
+os.makedirs(MODEL_DIR, exist_ok=True)
+
+# Hugging Face repo and filenames
+HF_REPO = "archis99/Novartis-models"
+BIOBERT_FILE = "biobert_classifier.pth"
+RF_FILE = "random_forest_model.joblib"
+PREPROCESSOR_FILE = "preprocessor.pkl"
+
+# Paths for local files
+biobert_path = os.path.join(MODEL_DIR, BIOBERT_FILE)
+rf_path = os.path.join(MODEL_DIR, RF_FILE)
+preprocessor_path = os.path.join(MODEL_DIR, PREPROCESSOR_FILE)
+
+# Download if not present locally
+for file_name, local_path in [(BIOBERT_FILE, biobert_path),
+                              (RF_FILE, rf_path),
+                              (PREPROCESSOR_FILE, preprocessor_path)]:
+    if not os.path.exists(local_path):
+        print(f"Downloading {file_name} from Hugging Face...")
+        hf_hub_download(repo_id=HF_REPO, filename=file_name, local_dir=MODEL_DIR, local_dir_use_symlinks=False)
+
+# Load preprocessor
+with open(preprocessor_path, "rb") as f:
+    preprocessor = pickle.load(f)
+
+# Load Random Forest model
+rf_model = joblib.load(rf_path)
+
+# Load BioBERT model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+biobert_model = BioBERTClassifier()
+biobert_model.load_state_dict(torch.load(biobert_path, map_location=device))
+biobert_model.to(device)
+biobert_model.eval()
+
+# Thresholds & weights from training
+RF_THRESHOLD = 0.1
+BIOBERT_THRESHOLD = 0.3
+ENSEMBLE_THRESHOLD = 0.22999999999999995
+W1, W2 = 2.0, 0.5
+
+# Label mapping
+LABEL_MAP = {0: "COMPLETED", 1: "NOT COMPLETED"}
+
+# Inference function
+def predict(df_new: pd.DataFrame):
+    # Preprocess input
+    X_tabular, embeddings = preprocessor.transform(df_new)
+
+    # Columns to drop for RF
+    textual_columns = [
+        "Brief Summary",
+        "Conditions",
+        "Interventions",
+        "Primary Outcome Measures",
+        "Secondary Outcome Measures"
+    ]
+
+    # Keep only RF-relevant features
+    X_tabular_rf = X_tabular.drop(columns=textual_columns, errors="ignore")
+
+    # RF prediction (probabilities)
+    rf_probs = rf_model.predict_proba(X_tabular_rf)[:, 1]
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # BioBERT prediction
+    e1, e2, e3, e4, e5 = [embeddings[col].to(device) for col in textual_columns]  # unpack embeddings
+    with torch.no_grad():
+        logits = biobert_model(e1, e2, e3, e4, e5)
+        biobert_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
+
+    # Ensemble (soft voting with weights)
+    combined_probs = (W1 * rf_probs + W2 * biobert_probs) / (W1 + W2)
+
+    # Final binary predictions using tuned threshold
+    final_preds = (combined_probs > ENSEMBLE_THRESHOLD).astype(int)
+
+    # Map to human-readable labels
+    final_labels = [LABEL_MAP[p] for p in final_preds]
+
+    return {
+        # "rf_probs": rf_probs.tolist(),
+        # "biobert_probs": biobert_probs.tolist(),
+        # "combined_probs": combined_probs.tolist(),
+        "final_predictions": final_labels
+    }

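To make the soft-voting step concrete, a worked example with the committed weights (the probabilities are made up):

rf_prob, biobert_prob = 0.40, 0.20                              # illustrative class-1 probabilities
combined = (2.0 * rf_prob + 0.5 * biobert_prob) / (2.0 + 0.5)   # = 0.90 / 2.5 = 0.36
# 0.36 > ENSEMBLE_THRESHOLD (~0.23), so the prediction is 1 -> "NOT COMPLETED"
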
backend/preprocessing/__init__.py
ADDED
@@ -0,0 +1 @@
+# preprocessing/__init__.py

backend/preprocessing/categorical.py
ADDED
@@ -0,0 +1,75 @@
+# categorical.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Study Design Parsing
+# ------------------------
+
+def parse_study_design(study_design, all_attributes):
+    # Initialize all allowed attributes as "Unknown"
+    attributes = {attr: "Unknown" for attr in all_attributes}
+
+    if study_design and study_design != "Unknown" and pd.notna(study_design):
+        for part in study_design.split('|'):
+            if ':' in part:
+                key, value = part.split(':', 1)
+                key, value = key.strip(), value.strip()
+
+                # Only keep keys that are in our unique_attributes list
+                if key in all_attributes:
+                    attributes[key] = value
+                # else: ignore unknown keys (do not create new columns)
+
+    return attributes
+
+def expand_study_design(df, unique_attributes):
+    parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
+    study_df = pd.DataFrame(parsed.tolist(), index=df.index)
+
+    # Merge parsed attributes back with df
+    df = pd.concat([df, study_df], axis=1)
+
+    # Drop original Study Design column
+    df = df.drop(columns=['Study Design'], errors='ignore')
+
+    return df
+
+# ------------------------
+# Encoding Categorical Columns
+# ------------------------
+
+def encode_categorical(df, label_encoders):
+    for col, le in label_encoders.items():
+        # Transform using saved encoder; handle unseen labels
+        df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
+        df[col] = le.transform(df[col])
+    return df
+
+def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean and standardize certain categorical columns for inference.
+
+    Replaces missing or malformed values with 'Unknown' to match training preprocessing.
+
+    Args:
+        df (pd.DataFrame): Input dataframe with user data.
+
+    Returns:
+        pd.DataFrame: DataFrame with cleaned categorical columns.
+    """
+    columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
+
+    for col in columns_to_clean:
+        # Replace known missing/malformed values with 'Unknown'
+        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+        # Replace actual NaN values with 'Unknown'
+        df[col] = df[col].fillna('Unknown')
+
+    return df

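A small sketch of what the parser returns (the attribute list here is illustrative):

from backend.preprocessing.categorical import parse_study_design

attrs = ["Allocation", "Intervention Model", "Masking", "Primary Purpose"]
design = "Allocation: RANDOMIZED|Intervention Model: PARALLEL|Name: ignored"
print(parse_study_design(design, attrs))
# {'Allocation': 'RANDOMIZED', 'Intervention Model': 'PARALLEL',
#  'Masking': 'Unknown', 'Primary Purpose': 'Unknown'}   ('Name' is not in attrs, so it is dropped)
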
backend/preprocessing/cleaning.py
ADDED
@@ -0,0 +1,42 @@
+# cleaning.py
+
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    return df.drop_duplicates()
+
+def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
+    return df[required_cols].copy()
+
+def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Apply sqrt transform to 'Enrollment' column
+    """
+    df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
+    return df
+
+def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
+    """
+    Fill missing numerical values with the median of each column.
+    """
+    for col in numerical_cols:
+        df[col] = df[col].fillna(df[col].median())
+    return df
+
+def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
+    """
+    Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
+    """
+    for col in columns_to_clean:
+        df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+        df[col] = df[col].fillna('Unknown')
+    return df
+
+def drop_irrelevant_columns(df, columns_to_drop):
+    return df.drop(columns=columns_to_drop, errors='ignore')

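For instance, the sqrt transform and median fill compose like this on a toy frame (the transform runs before the fill, as in the pipeline):

import numpy as np
import pandas as pd
from backend.preprocessing.cleaning import transform_numeric, fill_missing_numerical

df = pd.DataFrame({"Enrollment": [100.0, np.nan, 400.0]})
df = transform_numeric(df)                        # sqrt: ~10, NaN, ~20
df = fill_missing_numerical(df, ["Enrollment"])   # NaN -> median of [~10, ~20] = ~15
print(df["Enrollment"].round(3).tolist())         # [10.0, 15.0, 20.0]
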
backend/preprocessing/embeddings.py
ADDED
@@ -0,0 +1,70 @@
+# embeddings.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+
+# ------------------------
+# Extract Embeddings
+# ------------------------
+
+
+def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
+    """
+    Extract embeddings from tokenized textual data using BioBERT.
+
+    Args:
+        tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
+        model (transformers.PreTrainedModel): BioBERT model (without classification head).
+        device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
+        batch_size (int): Batch size for embedding extraction.
+        save_to_disk (bool): Whether to save embeddings as .pt files for each column.
+
+    Returns:
+        dict: Dictionary of embeddings for each column.
+    """
+    if device is None:
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    model.eval()  # Ensure model is in evaluation mode
+
+    embeddings_dict = {}
+
+    for col, tokenized_data in tokenized_data_dict.items():
+        print(f"Extracting embeddings for column: {col}")
+
+        input_ids = tokenized_data["input_ids"]
+        attention_mask = tokenized_data["attention_mask"]
+
+        dataset = TensorDataset(input_ids, attention_mask)
+        dataloader = DataLoader(dataset, batch_size=batch_size)
+
+        all_embeddings = []
+
+        with torch.no_grad():
+            for batch in dataloader:
+                input_ids_batch, attention_mask_batch = batch
+                input_ids_batch = input_ids_batch.to(device)
+                attention_mask_batch = attention_mask_batch.to(device)
+
+                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
+                hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
+
+                # Mean pooling across sequence length
+                embeddings = hidden_states.mean(dim=1)
+                all_embeddings.append(embeddings.cpu())
+
+        embeddings_col = torch.cat(all_embeddings, dim=0)
+        embeddings_dict[col] = embeddings_col
+
+        if save_to_disk:
+            torch.save(embeddings_col, f"{col}_embeddings.pt")
+            print(f"Saved embeddings for column: {col}")
+
+        print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
+
+    return embeddings_dict

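A usage sketch pairing this with the tokenizer helper from text_processing.py; the checkpoint name is the one used in save_preprocessor.py:

import pandas as pd
from transformers import AutoTokenizer, AutoModel
from backend.preprocessing.text_processing import tokenize_text_columns
from backend.preprocessing.embeddings import extract_text_embeddings

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
bert = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

df = pd.DataFrame({"Brief Summary": ["a phase 3 trial of drug x"]})
tokenized = tokenize_text_columns(df, ["Brief Summary"], tokenizer)
emb = extract_text_embeddings(tokenized, bert)
print(emb["Brief Summary"].shape)  # torch.Size([1, 768])
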
backend/preprocessing/globals.py
ADDED
@@ -0,0 +1,17 @@
+# globals.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+from pathlib import Path
+
+BACKEND_DIR = Path(__file__).parent.parent
+
+# --- Load saved artifacts using the absolute path ---
+scaler = joblib.load(BACKEND_DIR / "models/scaler_enrollment.pkl")
+label_encoders = joblib.load(BACKEND_DIR / "models/feature_label_encoders.pkl")
+unique_attributes = joblib.load(BACKEND_DIR / "models/study_design_attributes.pkl")

ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# preprocessing_all.py
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
import joblib
|
| 5 |
+
from sklearn.preprocessing import LabelEncoder, StandardScaler
|
| 6 |
+
import torch
|
| 7 |
+
from torch.utils.data import TensorDataset, DataLoader
|
| 8 |
+
from transformers import AutoModel
|
| 9 |
+
|
| 10 |
+
# ------------------------
|
| 11 |
+
# Load saved artifacts
|
| 12 |
+
# ------------------------
|
| 13 |
+
|
| 14 |
+
scaler = joblib.load("models\scaler_enrollment.pkl") # StandardScaler for 'Enrollment'
|
| 15 |
+
label_encoders = joblib.load("models\label_encoders.pkl") # Dict of LabelEncoders for categorical columns
|
| 16 |
+
unique_attributes = joblib.load("models\study_design_attributes.pkl") # List of Study Design attributes
|
| 17 |
+
|
| 18 |
+
def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
|
| 19 |
+
return df.drop_duplicates()
|
| 20 |
+
|
| 21 |
+
def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
|
| 22 |
+
return df[required_cols].copy()
|
| 23 |
+
|
| 24 |
+
def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
|
| 25 |
+
"""
|
| 26 |
+
Apply sqrt transform to 'Enrollment' column
|
| 27 |
+
"""
|
| 28 |
+
df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
|
| 29 |
+
return df
|
| 30 |
+
|
| 31 |
+
def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
|
| 32 |
+
"""
|
| 33 |
+
Fill missing numerical values with the median of each column.
|
| 34 |
+
"""
|
| 35 |
+
for col in numerical_cols:
|
| 36 |
+
df[col] = df[col].fillna(df[col].median())
|
| 37 |
+
return df
|
| 38 |
+
|
| 39 |
+
def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
|
| 40 |
+
"""
|
| 41 |
+
Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
|
| 42 |
+
"""
|
| 43 |
+
for col in columns_to_clean:
|
| 44 |
+
df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
|
| 45 |
+
df[col] = df[col].fillna('Unknown')
|
| 46 |
+
return df
|
| 47 |
+
|
| 48 |
+
def drop_irrelevant_columns(df, columns_to_drop):
|
| 49 |
+
return df.drop(columns=columns_to_drop, errors='ignore')
|
| 50 |
+
|
| 51 |
+
# ------------------------
|
| 52 |
+
# Study Design Parsing
|
| 53 |
+
# ------------------------
|
| 54 |
+
|
| 55 |
+
def parse_study_design(study_design, all_attributes):
|
| 56 |
+
attributes = {attr: "Unknown" for attr in all_attributes}
|
| 57 |
+
if study_design != "Unknown" and pd.notna(study_design):
|
| 58 |
+
for part in study_design.split('|'):
|
| 59 |
+
if ':' in part:
|
| 60 |
+
key, value = part.split(':', 1)
|
| 61 |
+
attributes[key.strip()] = value.strip()
|
| 62 |
+
return attributes
|
| 63 |
+
|
| 64 |
+
def expand_study_design(df, unique_attributes):
|
| 65 |
+
parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
|
| 66 |
+
study_df = pd.DataFrame(parsed.tolist(), index=df.index)
|
| 67 |
+
df = pd.concat([df, study_df], axis=1)
|
| 68 |
+
df = df.drop(columns=['Study Design'], errors='ignore')
|
| 69 |
+
return df
|
| 70 |
+
|
| 71 |
+
# ------------------------
|
| 72 |
+
# Encoding Categorical Columns
|
| 73 |
+
# ------------------------
|
| 74 |
+
|
| 75 |
+
def encode_categorical(df, label_encoders):
|
| 76 |
+
for col, le in label_encoders.items():
|
| 77 |
+
# Transform using saved encoder; handle unseen labels
|
| 78 |
+
df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
|
| 79 |
+
df[col] = le.transform(df[col])
|
| 80 |
+
return df
|
| 81 |
+
|
| 82 |
+
def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
|
| 83 |
+
"""
|
| 84 |
+
Clean and standardize certain categorical columns for inference.
|
| 85 |
+
|
| 86 |
+
Replaces missing or malformed values with 'Unknown' to match training preprocessing.
|
| 87 |
+
|
| 88 |
+
Args:
|
| 89 |
+
df (pd.DataFrame): Input dataframe with user data.
|
| 90 |
+
|
| 91 |
+
Returns:
|
| 92 |
+
pd.DataFrame: DataFrame with cleaned categorical columns.
|
| 93 |
+
"""
|
| 94 |
+
columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
|
| 95 |
+
|
| 96 |
+
for col in columns_to_clean:
|
| 97 |
+
# Replace known missing/malformed values with 'Unknown'
|
| 98 |
+
df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
|
| 99 |
+
# Replace actual NaN values with 'Unknown'
|
| 100 |
+
df[col] = df[col].fillna('Unknown')
|
| 101 |
+
|
| 102 |
+
return df
|
| 103 |
+
|
| 104 |
+
# ------------------------
|
| 105 |
+
# Scaling numeric columns
|
| 106 |
+
# ------------------------
|
| 107 |
+
|
| 108 |
+
def scale_numeric(df, scaler):
|
| 109 |
+
"""
|
| 110 |
+
Standardize numerical columns using StandardScaler.
|
| 111 |
+
"""
|
| 112 |
+
df['Enrollment'] = scaler.transform(df[['Enrollment']])
|
| 113 |
+
return df
|
| 114 |
+
|
| 115 |
+
# ------------------------
|
| 116 |
+
# Text preprocessing
|
| 117 |
+
# ------------------------
|
| 118 |
+
|
| 119 |
+
def clean_text(text):
|
| 120 |
+
if pd.isna(text): # Handle missing values
|
| 121 |
+
return ""
|
| 122 |
+
text = text.lower() # Convert to lowercase
|
| 123 |
+
text = ''.join(char for char in text if char.isalnum() or char.isspace()) # Remove special characters
|
| 124 |
+
return ' '.join(text.split()) # Remove extra whitespaces
|
| 125 |
+
|
| 126 |
+
def preprocess_text_columns(df, text_columns):
|
| 127 |
+
for col in text_columns:
|
| 128 |
+
df[col] = df[col].fillna("No info provided")
|
| 129 |
+
df[col] = df[col].apply(clean_text)
|
| 130 |
+
return df
|
| 131 |
+
|
| 132 |
+
# ------------------------
|
| 133 |
+
# Tokenization of textual Columns
|
| 134 |
+
# ------------------------
|
| 135 |
+
|
| 136 |
+
def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
|
| 137 |
+
"""
|
| 138 |
+
Tokenizes multiple textual columns in batches for inference.
|
| 139 |
+
|
| 140 |
+
Args:
|
| 141 |
+
df (pd.DataFrame): DataFrame containing textual columns.
|
| 142 |
+
textual_columns (list): List of column names to tokenize.
|
| 143 |
+
tokenizer: HuggingFace tokenizer.
|
| 144 |
+
batch_size (int): Number of samples per batch.
|
| 145 |
+
max_length (int): Maximum token length per sequence.
|
| 146 |
+
|
| 147 |
+
Returns:
|
| 148 |
+
dict: Dictionary with column names as keys and tokenized tensors as values.
|
| 149 |
+
"""
|
| 150 |
+
def tokenize_in_batches(column_texts):
|
| 151 |
+
tokenized_batches = []
|
| 152 |
+
for i in range(0, len(column_texts), batch_size):
|
| 153 |
+
batch = column_texts[i:i + batch_size].tolist()
|
| 154 |
+
tokenized_batch = tokenizer(
|
| 155 |
+
batch,
|
| 156 |
+
padding="max_length",
|
| 157 |
+
truncation=True,
|
| 158 |
+
max_length=max_length,
|
| 159 |
+
return_tensors="pt"
|
| 160 |
+
)
|
| 161 |
+
tokenized_batches.append(tokenized_batch)
|
| 162 |
+
# Combine batches
|
| 163 |
+
return {
|
| 164 |
+
"input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
|
| 165 |
+
"attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
tokenized_data = {}
|
| 169 |
+
for col in textual_columns:
|
| 170 |
+
tokenized_data[col] = tokenize_in_batches(df[col])
|
| 171 |
+
return tokenized_data
|
| 172 |
+
|
| 173 |
+
# ------------------------
|
| 174 |
+
# Extract Embeddings
|
| 175 |
+
# ------------------------
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
|
| 179 |
+
"""
|
| 180 |
+
Extract embeddings from tokenized textual data using BioBERT.
|
| 181 |
+
|
| 182 |
+
Args:
|
| 183 |
+
tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
|
| 184 |
+
model (transformers.PreTrainedModel): BioBERT model (without classification head).
|
| 185 |
+
device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
|
| 186 |
+
batch_size (int): Batch size for embedding extraction.
|
| 187 |
+
save_to_disk (bool): Whether to save embeddings as .pt files for each column.
|
| 188 |
+
|
| 189 |
+
Returns:
|
| 190 |
+
dict: Dictionary of embeddings for each column.
|
| 191 |
+
"""
|
| 192 |
+
if device is None:
|
| 193 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 194 |
+
model.to(device)
|
| 195 |
+
model.eval() # Ensure model is in evaluation mode
|
| 196 |
+
|
| 197 |
+
embeddings_dict = {}
|
| 198 |
+
|
| 199 |
+
for col, tokenized_data in tokenized_data_dict.items():
|
| 200 |
+
print(f"Extracting embeddings for column: {col}")
|
| 201 |
+
|
| 202 |
+
input_ids = tokenized_data["input_ids"]
|
| 203 |
+
attention_mask = tokenized_data["attention_mask"]
|
| 204 |
+
|
| 205 |
+
dataset = TensorDataset(input_ids, attention_mask)
|
| 206 |
+
dataloader = DataLoader(dataset, batch_size=batch_size)
|
| 207 |
+
|
| 208 |
+
all_embeddings = []
|
| 209 |
+
|
| 210 |
+
with torch.no_grad():
|
| 211 |
+
for batch in dataloader:
|
| 212 |
+
input_ids_batch, attention_mask_batch = batch
|
| 213 |
+
input_ids_batch = input_ids_batch.to(device)
|
| 214 |
+
attention_mask_batch = attention_mask_batch.to(device)
|
| 215 |
+
|
| 216 |
+
outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
|
| 217 |
+
hidden_states = outputs.last_hidden_state # Shape: [batch_size, seq_len, hidden_dim]
|
| 218 |
+
|
| 219 |
+
# Mean pooling across sequence length
|
| 220 |
+
embeddings = hidden_states.mean(dim=1)
|
| 221 |
+
all_embeddings.append(embeddings.cpu())
|
| 222 |
+
|
| 223 |
+
embeddings_col = torch.cat(all_embeddings, dim=0)
|
| 224 |
+
embeddings_dict[col] = embeddings_col
|
| 225 |
+
|
| 226 |
+
if save_to_disk:
|
| 227 |
+
torch.save(embeddings_col, f"{col}_embeddings.pt")
|
| 228 |
+
print(f"Saved embeddings for column: {col}")
|
| 229 |
+
|
| 230 |
+
print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
|
| 231 |
+
|
| 232 |
+
return embeddings_dict
|
| 233 |
+
|
| 234 |
+
# ------------------------
|
| 235 |
+
# Main preprocessing function
|
| 236 |
+
# ------------------------
|
| 237 |
+
|
| 238 |
+
def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns,
|
| 239 |
+
tokenizer=None, biobert_model=None, device='cpu'):
|
| 240 |
+
"""
|
| 241 |
+
Full preprocessing pipeline.
|
| 242 |
+
|
| 243 |
+
Args:
|
| 244 |
+
df (pd.DataFrame): Input DataFrame (single row or batch).
|
| 245 |
+
required_cols (list): Columns to select from df.
|
| 246 |
+
categorical_cols (list): Categorical columns to encode.
|
| 247 |
+
columns_to_drop (list): Columns to drop from df.
|
| 248 |
+
text_columns (list): Textual columns to preprocess.
|
| 249 |
+
tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text.
|
| 250 |
+
biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head).
|
| 251 |
+
device (str): 'cpu' or 'cuda'.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
df (pd.DataFrame): Preprocessed tabular DataFrame.
|
| 255 |
+
embeddings (dict or None): Dict of embeddings for text columns, if model provided.
|
| 256 |
+
"""
|
| 257 |
+
# Tabular preprocessing
|
| 258 |
+
df = drop_duplicates(df)
|
| 259 |
+
df = select_required_columns(df, required_cols)
|
| 260 |
+
df = transform_numeric(df)
|
| 261 |
+
df = fill_missing_numerical(df, ["Enrollment"]) # median fill for Enrollment
|
| 262 |
+
df = fill_missing_categorical(df, categorical_cols)
|
| 263 |
+
df = drop_irrelevant_columns(df, columns_to_drop)
|
| 264 |
+
df = expand_study_design(df, unique_attributes)
|
| 265 |
+
df = clean_categorical_columns(df)
|
| 266 |
+
df = encode_categorical(df, label_encoders)
|
| 267 |
+
df = scale_numeric(df, scaler)
|
| 268 |
+
df = preprocess_text_columns(df, text_columns)
|
| 269 |
+
|
| 270 |
+
embeddings = None
|
| 271 |
+
if tokenizer is not None and biobert_model is not None:
|
| 272 |
+
tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer)
|
| 273 |
+
embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device)
|
| 274 |
+
|
| 275 |
+
return df, embeddings
|
| 276 |
+
|
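This module bundles the same helpers as the split preprocessing files above into one script, ending in a functional preprocess() entry point. A usage sketch, assuming the pickled artifacts it loads at import time resolve from the working directory (the column lists are taken from save_preprocessor.py):

df_out, embeddings = preprocess(
    df,                                        # some input DataFrame
    required_cols=df.columns.tolist(),
    categorical_cols=["Study Results", "Sex", "Age", "Funder Type", "Phases", "Study Type"],
    columns_to_drop=["Sponsor", "Observational Model", "Time Perspective"],
    text_columns=["Brief Summary", "Conditions", "Interventions",
                  "Primary Outcome Measures", "Secondary Outcome Measures"],
)  # tokenizer/biobert_model omitted, so embeddings is None
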
backend/preprocessing/scaling.py
ADDED
@@ -0,0 +1,19 @@
+# scaling.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Scaling numeric columns
+# ------------------------
+
+def scale_numeric(df, scaler):
+    """
+    Standardize numerical columns using StandardScaler.
+    """
+    df['Enrollment'] = scaler.transform(df[['Enrollment']])
+    return df

backend/preprocessing/text_processing.py
ADDED
@@ -0,0 +1,66 @@
+# text_processing.py
+import pandas as pd
+import numpy as np
+import joblib
+from sklearn.preprocessing import LabelEncoder, StandardScaler
+import torch
+from torch.utils.data import TensorDataset, DataLoader
+from transformers import AutoModel
+
+# ------------------------
+# Text preprocessing
+# ------------------------
+
+def clean_text(text):
+    if pd.isna(text):  # Handle missing values
+        return ""
+    text = text.lower()  # Convert to lowercase
+    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
+    return ' '.join(text.split())  # Remove extra whitespaces
+
+def preprocess_text_columns(df, text_columns):
+    for col in text_columns:
+        df[col] = df[col].fillna("No info provided")
+        df[col] = df[col].apply(clean_text)
+    return df
+
+# ------------------------
+# Tokenization of textual Columns
+# ------------------------
+
+def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
+    """
+    Tokenizes multiple textual columns in batches for inference.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing textual columns.
+        textual_columns (list): List of column names to tokenize.
+        tokenizer: HuggingFace tokenizer.
+        batch_size (int): Number of samples per batch.
+        max_length (int): Maximum token length per sequence.
+
+    Returns:
+        dict: Dictionary with column names as keys and tokenized tensors as values.
+    """
+    def tokenize_in_batches(column_texts):
+        tokenized_batches = []
+        for i in range(0, len(column_texts), batch_size):
+            batch = column_texts[i:i + batch_size].tolist()
+            tokenized_batch = tokenizer(
+                batch,
+                padding="max_length",
+                truncation=True,
+                max_length=max_length,
+                return_tensors="pt"
+            )
+            tokenized_batches.append(tokenized_batch)
+        # Combine batches
+        return {
+            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
+            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
+        }
+
+    tokenized_data = {}
+    for col in textual_columns:
+        tokenized_data[col] = tokenize_in_batches(df[col])
+    return tokenized_data

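For example, clean_text lowercases, strips non-alphanumerics, and collapses whitespace:

from backend.preprocessing.text_processing import clean_text

print(clean_text("Phase-3 Trial:  Drug X!!"))  # -> "phase3 trial drug x"
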
data/.gitkeep
ADDED
File without changes

embeddings/.gitkeep
ADDED
File without changes

frontend/app.py
ADDED
@@ -0,0 +1,149 @@
+import sys
+import os
+from pathlib import Path
+
+# PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+# if PROJECT_ROOT not in sys.path:
+#     sys.path.insert(0, PROJECT_ROOT)
+
+project_root = Path(__file__).parent.parent
+sys.path.insert(0, str(project_root))
+
+import streamlit as st
+import pandas as pd
+from backend.pipelines.run_inference import predict
+from io import BytesIO
+
+st.set_page_config(page_title="Study Status Prediction", page_icon="📊", layout="wide")
+
+st.title("📊 Study Status Prediction")
+st.markdown("Upload a CSV file or manually enter study details to predict whether a study is **COMPLETED** or **NOT COMPLETED**.")
+
+# Tabs for CSV Upload and Manual Entry
+tab1, tab2 = st.tabs(["📂 Upload CSV", "✍️ Manual Entry"])
+
+# --- Option 1: CSV Upload ---
+with tab1:
+    uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
+
+    if uploaded_file:
+        df_new = pd.read_csv(uploaded_file)
+        preds = predict(df_new)
+
+        # Only keep final predictions
+        final_preds = pd.DataFrame({"Final Prediction": preds["final_predictions"]})
+
+        # Display a preview
+        st.subheader("🔎 Predictions Preview")
+        st.dataframe(final_preds.head())
+
+        # Download button for predictions
+        csv_buffer = BytesIO()
+        final_preds.to_csv(csv_buffer, index=False)
+        st.download_button(
+            label="📥 Download Predictions CSV",
+            data=csv_buffer.getvalue(),
+            file_name="predictions.csv",
+            mime="text/csv"
+        )
+
+# --- Option 2: Manual Entry ---
+with tab2:
+    st.subheader("✍️ Enter Study Details")
+    st.markdown("Fill in the fields below to predict the study status.")
+
+    # Placeholders for all the features
+    nct_number = st.text_input("NCT Number", placeholder="e.g., NCT01234567")
+    study_title = st.text_area("Study Title", placeholder="e.g., A Study of Drug X in Treating Lung Cancer")
+    study_url = st.text_input("Study URL", placeholder="e.g., https://clinicaltrials.gov/ct2/show/NCT01234567")
+    acronym = st.text_input("Acronym", placeholder="e.g., LUNG-X")
+    brief_summary = st.text_area("Brief Summary", placeholder="e.g., This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.")
+    study_results = st.selectbox("Study Results", ["YES", "NO"])
+    conditions = st.text_input("Conditions", placeholder="e.g., Lung Cancer")
+    interventions = st.text_input("Interventions", placeholder="e.g., Drug Y")
+    primary_outcome = st.text_input("Primary Outcome Measures", placeholder="e.g., Survival rate")
+    secondary_outcome = st.text_input("Secondary Outcome Measures", placeholder="e.g., Side effects")
+    other_outcome = st.text_input("Other Outcome Measures", placeholder="Optional")
+    sponsor = st.text_input("Sponsor", placeholder="e.g., ABC Research")
+    collaborators = st.text_input("Collaborators", placeholder="e.g., University of SFX")
+    sex = st.selectbox("Sex", ["ALL", "MALE", "FEMALE"])
+    age = st.selectbox("Age", ["ADULT, OLDER_ADULT",
+                               "ADULT",
+                               "CHILD, ADULT, OLDER_ADULT",
+                               "CHILD",
+                               "CHILD, ADULT",
+                               "OLDER_ADULT"])
+    phases = st.selectbox("Phases", ["PHASE2",
+                                     "PHASE1",
+                                     "PHASE4",
+                                     "PHASE3",
+                                     "PHASE1|PHASE2",
+                                     "PHASE2|PHASE3",
+                                     "EARLY_PHASE1"])
+    enrollment = st.number_input("Enrollment", min_value=0, step=1, placeholder="e.g., 500")
+    funder_type = st.selectbox("Funder Type", ["OTHER",
+                                               "INDUSTRY",
+                                               "NIH",
+                                               "OTHER_GOV",
+                                               "NETWORK",
+                                               "FED",
+                                               "INDIV",
+                                               "UNKNOWN",
+                                               "AMBIG"])
+    study_type = st.selectbox("Study Type", ["INTERVENTIONAL", "OBSERVATIONAL"])
+    study_design = st.text_area("Study Design", placeholder="e.g., Intervention Model: PARALLEL | Masking: SINGLE (INVESTIGATOR)")
+    other_ids = st.text_input("Other IDs", placeholder="e.g., ABC-123")
+    start_date = st.text_input("Start Date", placeholder="e.g., January 2023")
+    primary_completion_date = st.text_input("Primary Completion Date", placeholder="e.g., December 2025")
+    completion_date = st.text_input("Completion Date", placeholder="e.g., June 2026")
+    first_posted = st.text_input("First Posted", placeholder="e.g., February 2023")
+    results_first_posted = st.text_input("Results First Posted", placeholder="e.g., N/A")
+    last_update_posted = st.text_input("Last Update Posted", placeholder="e.g., September 2025")
+    locations = st.text_area("Locations", placeholder="e.g., New York, USA")
+    study_documents = st.text_area("Study Documents", placeholder="e.g., Protocol PDF")
+
+
+
+    if st.button("🔮 Predict Status"):
+        single_data = {
+            "NCT Number": nct_number,
+            "Study Title": study_title,
+            "Study URL": study_url,
+            "Acronym": acronym,
+            "Brief Summary": brief_summary,
+            "Study Results": study_results,
+            "Conditions": conditions,
+            "Interventions": interventions,
+            "Primary Outcome Measures": primary_outcome,
+            "Secondary Outcome Measures": secondary_outcome,
+            "Other Outcome Measures": other_outcome,
+            "Sponsor": sponsor,
+            "Collaborators": collaborators,
+            "Sex": sex,
+            "Age": age,
+            "Phases": phases,
+            "Enrollment": enrollment,
+            "Funder Type": funder_type,
+            "Study Type": study_type,
+            "Study Design": study_design,
+            "Other IDs": other_ids,
+            "Start Date": start_date,
+            "Primary Completion Date": primary_completion_date,
+            "Completion Date": completion_date,
+            "First Posted": first_posted,
+            "Results First Posted": results_first_posted,
+            "Last Update Posted": last_update_posted,
+            "Locations": locations,
+            "Study Documents": study_documents,
+        }
+
+        df_single = pd.DataFrame([single_data])
+
+        preds = predict(df_single)
+
+        # Show final prediction with animation
+        final_label = preds["final_predictions"][0]
+        if final_label == "COMPLETED":
+            st.success(f"✅ Prediction: **{final_label}**", icon="🎉")
+        else:
+            st.error(f"❌ Prediction: **{final_label}**", icon="⚠️")

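To try the UI locally, the usual invocation is streamlit run frontend/app.py from the repository root. The sys.path insertion at the top of the file lets the app call backend.pipelines.run_inference.predict in-process (downloading the model artifacts from the Hugging Face repo on first import) rather than going through the FastAPI service.
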
notebooks/clinical-trial-outcome-prediction.ipynb
ADDED
The diff for this file is too large to render.

requirements.txt
ADDED
@@ -0,0 +1,25 @@
+# Core libraries
+numpy>=1.26.4,<2.2
+pandas==2.2.2
+scikit-learn==1.2.2
+joblib==1.3.2
+
+# FastAPI and server
+fastapi==0.111.0
+uvicorn==0.29.0
+
+# Data visualization / plotting
+matplotlib==3.9.0
+seaborn==0.13.2
+
+# Streamlit
+streamlit==1.35.0
+
+# # PyTorch
+# torch==2.3.0
+# torchvision==0.18.0
+# torchaudio==2.3.0
+
+# Transformers / HuggingFace
+transformers==4.43.0
+huggingface_hub==0.23.4

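Note that the PyTorch pins are commented out even though the backend imports torch at module load, so an environment built from this file alone cannot run inference; presumably torch is installed separately on the deployment target (for example, a CPU-only wheel).
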
save_preprocessor.py
ADDED
@@ -0,0 +1,53 @@
+# save_preprocessor.py
+from transformers import AutoTokenizer, AutoModel
+from backend.pipelines.preprocessor_pipeline import Preprocessor
+import pandas as pd
+
+# Define dataset columns (adapt to your dataset)
+sample_df = pd.DataFrame([{
+    "Brief Summary": "This is a sample study.",
+    "Study Results": "Has Results",
+    "Conditions": "Condition A",
+    "Interventions": "Drug X",
+    "Primary Outcome Measures": "Outcome 1",
+    "Secondary Outcome Measures": "Outcome 2",
+    "Sponsor": "XYZ Corp",
+    "Sex": "All",
+    "Age": "Adult",
+    "Funder Type": "Industry",
+    "Phases": "Phase 2",
+    "Enrollment": 120,
+    "Study Type": "Interventional",
+    "Study Design": "Intervention: Randomized|Masking: Double",
+}])
+
+required_cols = sample_df.columns.tolist()
+categorical_cols = [
+    "Study Results", "Sex", "Age", "Funder Type", "Phases",
+    "Study Type"
+]
+columns_to_drop = ["Sponsor", "Observational Model", "Time Perspective"]
+text_columns = [
+    "Brief Summary", "Conditions", "Interventions",
+    "Primary Outcome Measures", "Secondary Outcome Measures"
+]
+
+# Load BioBERT
+tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+
+# Ensure pad token exists
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.cls_token or "[PAD]"
+
+# Create and save preprocessor
+preprocessor = Preprocessor(
+    required_cols,
+    categorical_cols,
+    columns_to_drop,
+    text_columns,
+    tokenizer=tokenizer,
+    biobert_model=model,
+    device="cpu"
+)
+preprocessor.save("backend/models/preprocessor.pkl")

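This script is presumably run from the repository root (python save_preprocessor.py) so that the backend package import and the backend/models/ output path both resolve; the Bio_ClinicalBERT weights are downloaded on first run.
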
tests/test_inference.py
ADDED
@@ -0,0 +1,50 @@
+# test_inference.py
+import pandas as pd
+from backend.pipelines.preprocessor_pipeline import Preprocessor
+
+# Load saved preprocessor
+preprocessor = Preprocessor.load("backend/models/preprocessor.pkl")
+
+# Sample new data for inference
+df_new = pd.DataFrame([{
+    "NCT Number": "NCT01234567",
+    "Study Title": "A Study of Drug X in Treating Lung Cancer",
+    "Study URL": "https://clinicaltrials.gov/ct2/show/NCT01234567",
+    "Acronym": "LUNG-X",
+    "Brief Summary": "This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.",
+    "Study Results": "NO",
+    "Conditions": "Lung Cancer",
+    "Interventions": "Drug Y",
+    "Primary Outcome Measures": "Survival rate",
+    "Secondary Outcome Measures": "Side effects",
+    "Other Outcome Measures": "",
+    "Sponsor": "ABC Research",
+    "Collaborators": "University of SFX",
+    "Sex": "MALE",
+    "Age": "garbage value - jhfkjahfaiueuw",
+    "Phases": "Phase 3",
+    "Enrollment": 500,
+    "Funder Type": "Government",
+    "Study Type": "Archchisman",
+    "Study Design": "Intervention Model: Randomized|Masking: QUADRUPLE (PARTICIPANT, CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR)|Observational Model: Observing|Name: Archchisman Banerjee",
+    "Other IDs": "ABC-123",
+    "Start Date": "January 2023",
+    "Primary Completion Date": "December 2025",
+    "Completion Date": "June 2026",
+    "First Posted": "February 2023",
+    "Results First Posted": "N/A",
+    "Last Update Posted": "September 2025",
+    "Locations": "New York, USA",
+    "Study Documents": "Protocol PDF"
+}])
+
+X_tabular, embeddings = preprocessor.transform(df_new)
+
+print("Processed Tabular Features:")
+print(X_tabular.head())
+X_tabular.to_csv("test.csv")
+
+if embeddings:
+    for col, emb in embeddings.items():
+        print(f"Embeddings for {col}: {emb.shape}")