archis99 committed
Commit d587b0b · 0 Parent(s)

Initial Commit

.gitignore ADDED
@@ -0,0 +1,45 @@
+ # Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+
+ # Virtual environment
+ venv/
+ venv311/
+ .env/
+
+ # Data folder - keep folder, ignore all files
+ /data/*
+ !/data/.gitkeep
+
+ # Embeddings folder - keep folder, ignore all files
+ embeddings/*
+ !embeddings/.gitkeep
+
+
+ # Ignore locally downloaded or generated models in the backend
+ /backend/models/*.pth
+ /backend/models/*.joblib
+ /backend/models/*.pkl
+
+ # Ignore cloned Hugging Face repo
+ /backend/Novartis-models/
+
+ # Jupyter notebooks checkpoints
+ *.ipynb_checkpoints/
+
+ # --- Tools & OS ---
+ # IDE / Editor specific
+ .vscode/
+ .idea/
+
+ # Operating System generated files
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+
+ # Streamlit cache
+ .streamlit/
backend/Procfile ADDED
@@ -0,0 +1 @@
+ web: uvicorn main:app --host 0.0.0.0 --port $PORT
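The Procfile above is the platform start command and binds to the platform-provided $PORT. For local testing, a small helper like the sketch below could serve the same FastAPI app from the repository root; the file name, host, and port are assumptions, not part of this commit.

# run_local.py (hypothetical helper, not part of this commit)
import uvicorn

if __name__ == "__main__":
    # Serve the app defined in backend/main.py; `backend` is importable as a package from the repo root.
    uvicorn.run("backend.main:app", host="127.0.0.1", port=8000, reload=True)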
backend/__init__.py ADDED
File without changes
backend/main.py ADDED
@@ -0,0 +1,25 @@
+ from fastapi import FastAPI
+ import pandas as pd
+ from .pipelines.run_inference import predict
+ from fastapi.middleware.cors import CORSMiddleware
+
+ app = FastAPI(title="Study Status Prediction API")
+
+ # CORS for frontend
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ @app.get("/health")
+ def health_check():
+     return {"status": "ok"}
+
+ @app.post("/predict")
+ def predict_endpoint(data: dict | list[dict]):
+     # Convert single row or multiple rows to DataFrame
+     df = pd.DataFrame(data if isinstance(data, list) else [data])
+     predictions = predict(df)
+     return predictions
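For reference, a minimal client call against the /predict route might look like the sketch below. The field names mirror the training columns referenced elsewhere in this commit (see save_preprocessor.py); the host, port, and exact required column set are assumptions.

import requests

row = {
    "Brief Summary": "A phase 3 trial of Drug X in lung cancer.",
    "Study Results": "NO",
    "Conditions": "Lung Cancer",
    "Interventions": "Drug X",
    "Primary Outcome Measures": "Survival rate",
    "Secondary Outcome Measures": "Side effects",
    "Sponsor": "ABC Research",
    "Sex": "ALL",
    "Age": "ADULT",
    "Funder Type": "INDUSTRY",
    "Phases": "PHASE3",
    "Enrollment": 500,
    "Study Type": "INTERVENTIONAL",
    "Study Design": "Intervention Model: PARALLEL|Masking: DOUBLE",
}

# A single dict or a list of dicts is accepted; the endpoint wraps it in a DataFrame.
resp = requests.post("http://127.0.0.1:8000/predict", json=row)
print(resp.json())  # e.g. {"final_predictions": ["COMPLETED"]}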
backend/models/biobert_model.py ADDED
@@ -0,0 +1,23 @@
+ # biobert_model.py
+ # rebuilding the same model architecture as training
+
+ import torch
+ import torch.nn as nn
+
+ class BioBERTClassifier(nn.Module):
+     def __init__(self, embedding_dim=768, num_embeddings=5, num_classes=2, hidden_dim=256):
+         super(BioBERTClassifier, self).__init__()
+         # input size = 5 * 768 = 3840
+         self.fc1 = nn.Linear(embedding_dim * num_embeddings, hidden_dim)
+         self.relu = nn.ReLU()
+         self.dropout = nn.Dropout(0.3)
+         self.fc2 = nn.Linear(hidden_dim, num_classes)
+
+     def forward(self, e1, e2, e3, e4, e5):
+         # Concatenate all embeddings
+         x = torch.cat((e1, e2, e3, e4, e5), dim=1)  # shape (batch, 3840)
+         x = self.fc1(x)
+         x = self.relu(x)
+         x = self.dropout(x)
+         logits = self.fc2(x)
+         return logits
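As a quick shape check (a sketch, not part of the commit): the classifier consumes five 768-dimensional embedding tensors and returns one logit pair per row.

import torch
# from backend.models.biobert_model import BioBERTClassifier  # when run from the repo root

model = BioBERTClassifier()
dummy = [torch.randn(4, 768) for _ in range(5)]  # batch of 4 rows, five text columns
logits = model(*dummy)
print(logits.shape)  # torch.Size([4, 2])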
backend/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
+ # __init__.py
backend/pipelines/preprocessor_pipeline.py ADDED
@@ -0,0 +1,62 @@
+ # preprocessor_pipeline.py
+ import joblib
+ import pandas as pd
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+
+ # Import all helper funcs & globals from preprocessing package
+ from ..preprocessing.cleaning import (
+     drop_duplicates, select_required_columns, transform_numeric,
+     fill_missing_numerical, fill_missing_categorical, drop_irrelevant_columns
+ )
+ from ..preprocessing.categorical import expand_study_design, encode_categorical, clean_categorical_columns
+ from ..preprocessing.scaling import scale_numeric
+ from ..preprocessing.text_processing import preprocess_text_columns, tokenize_text_columns
+ from ..preprocessing.embeddings import extract_text_embeddings
+ from ..preprocessing.globals import scaler, label_encoders, unique_attributes
+
+
+ class Preprocessor:
+     def __init__(self, required_cols, categorical_cols, columns_to_drop, text_columns,
+                  tokenizer=None, biobert_model=None, device="cpu"):
+         self.required_cols = required_cols
+         self.categorical_cols = categorical_cols
+         self.columns_to_drop = columns_to_drop
+         self.text_columns = text_columns
+         self.tokenizer = tokenizer
+         self.biobert_model = biobert_model
+         self.device = device
+
+     def transform(self, df: pd.DataFrame):
+         """Run full preprocessing on a dataframe."""
+         df = drop_duplicates(df)
+         df = select_required_columns(df, self.required_cols)
+         df = transform_numeric(df)
+         df = fill_missing_numerical(df, ["Enrollment"])
+         df = fill_missing_categorical(df, self.categorical_cols)
+         df = expand_study_design(df, unique_attributes)
+         df = drop_irrelevant_columns(df, self.columns_to_drop)
+         df = clean_categorical_columns(df)
+         df = encode_categorical(df, label_encoders)
+         df = scale_numeric(df, scaler)
+         df = preprocess_text_columns(df, self.text_columns)
+
+         embeddings = None
+         if self.tokenizer is not None and self.biobert_model is not None:
+             tokenized_dict = tokenize_text_columns(df, self.text_columns, self.tokenizer)
+             embeddings = extract_text_embeddings(
+                 tokenized_dict,
+                 self.biobert_model,
+                 device=self.device
+             )
+
+         return df, embeddings
+
+     def save(self, path="models/preprocessor.pkl"):
+         """Save preprocessor object."""
+         joblib.dump(self, path)
+
+     @staticmethod
+     def load(path="models/preprocessor.pkl"):
+         """Load preprocessor object."""
+         return joblib.load(path)
backend/pipelines/run_inference.py ADDED
@@ -0,0 +1,99 @@
+ import torch
+ import os
+ import joblib
+ import pickle
+ import pandas as pd
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from pathlib import Path
+ from ..models.biobert_model import BioBERTClassifier
+
+ # Directory to store downloaded models
+ MODEL_DIR = Path(__file__).parent.parent / "models"
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ # Hugging Face repo and filenames
+ HF_REPO = "archis99/Novartis-models"
+ BIOBERT_FILE = "biobert_classifier.pth"
+ RF_FILE = "random_forest_model.joblib"
+ PREPROCESSOR_FILE = "preprocessor.pkl"
+
+ # Paths for local files
+ biobert_path = os.path.join(MODEL_DIR, BIOBERT_FILE)
+ rf_path = os.path.join(MODEL_DIR, RF_FILE)
+ preprocessor_path = os.path.join(MODEL_DIR, PREPROCESSOR_FILE)
+
+ # Download if not present locally
+ for file_name, local_path in [(BIOBERT_FILE, biobert_path),
+                               (RF_FILE, rf_path),
+                               (PREPROCESSOR_FILE, preprocessor_path)]:
+     if not os.path.exists(local_path):
+         print(f"Downloading {file_name} from Hugging Face...")
+         hf_hub_download(repo_id=HF_REPO, filename=file_name, local_dir=MODEL_DIR, local_dir_use_symlinks=False)
+
+ # Load preprocessor
+ with open(preprocessor_path, "rb") as f:
+     preprocessor = pickle.load(f)
+
+ # Load Random Forest model
+ rf_model = joblib.load(rf_path)
+
+ # Load BioBERT model
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ biobert_model = BioBERTClassifier()
+ biobert_model.load_state_dict(torch.load(biobert_path, map_location=device))
+ biobert_model.to(device)
+ biobert_model.eval()
+
+ # Thresholds & weights from training
+ RF_THRESHOLD = 0.1
+ BIOBERT_THRESHOLD = 0.3
+ ENSEMBLE_THRESHOLD = 0.22999999999999995
+ W1, W2 = 2.0, 0.5
+
+ # Label mapping
+ LABEL_MAP = {0: "COMPLETED", 1: "NOT COMPLETED"}
+
+ # Inference function
+ def predict(df_new: pd.DataFrame):
+     # Preprocess input
+     X_tabular, embeddings = preprocessor.transform(df_new)
+
+     # Columns to drop for RF
+     textual_columns = [
+         "Brief Summary",
+         "Conditions",
+         "Interventions",
+         "Primary Outcome Measures",
+         "Secondary Outcome Measures"
+     ]
+
+     # Keep only RF-relevant features
+     X_tabular_rf = X_tabular.drop(columns=textual_columns, errors="ignore")
+
+     # RF prediction (probabilities)
+     rf_probs = rf_model.predict_proba(X_tabular_rf)[:, 1]
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     # BioBERT prediction
+     e1, e2, e3, e4, e5 = [embeddings[col].to(device) for col in textual_columns]  # unpack embeddings
+     with torch.no_grad():
+         logits = biobert_model(e1, e2, e3, e4, e5)
+         biobert_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
+
+     # Ensemble (soft voting with weights)
+     combined_probs = (W1 * rf_probs + W2 * biobert_probs) / (W1 + W2)
+
+     # Final binary predictions using tuned threshold
+     final_preds = (combined_probs > ENSEMBLE_THRESHOLD).astype(int)
+
+     # Map to human-readable labels
+     final_labels = [LABEL_MAP[p] for p in final_preds]
+
+     return {
+         # "rf_probs": rf_probs.tolist(),
+         # "biobert_probs": biobert_probs.tolist(),
+         # "combined_probs": combined_probs.tolist(),
+         "final_predictions": final_labels
+     }
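To make the weighted soft-voting step concrete (illustrative numbers only, not taken from the repo):

import numpy as np

rf_probs = np.array([0.30])
biobert_probs = np.array([0.10])
combined = (2.0 * rf_probs + 0.5 * biobert_probs) / 2.5  # -> [0.26]
print((combined > 0.23).astype(int))                     # [1] -> "NOT COMPLETED"

With weights of 2.0 and 0.5, the Random Forest probability dominates the combined score before the tuned ~0.23 threshold is applied.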
backend/preprocessing/__init__.py ADDED
@@ -0,0 +1 @@
+ # preprocessing/__init__.py
backend/preprocessing/categorical.py ADDED
@@ -0,0 +1,75 @@
+ # categorical.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+ # ------------------------
+ # Study Design Parsing
+ # ------------------------
+
+ def parse_study_design(study_design, all_attributes):
+     # Initialize all allowed attributes as "Unknown"
+     attributes = {attr: "Unknown" for attr in all_attributes}
+
+     if study_design and study_design != "Unknown" and pd.notna(study_design):
+         for part in study_design.split('|'):
+             if ':' in part:
+                 key, value = part.split(':', 1)
+                 key, value = key.strip(), value.strip()
+
+                 # Only keep keys that are in our unique_attributes list
+                 if key in all_attributes:
+                     attributes[key] = value
+                 # else: ignore unknown keys (do not create new columns)
+
+     return attributes
+
+ def expand_study_design(df, unique_attributes):
+     parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
+     study_df = pd.DataFrame(parsed.tolist(), index=df.index)
+
+     # Merge parsed attributes back with df
+     df = pd.concat([df, study_df], axis=1)
+
+     # Drop original Study Design column
+     df = df.drop(columns=['Study Design'], errors='ignore')
+
+     return df
+
+ # ------------------------
+ # Encoding Categorical Columns
+ # ------------------------
+
+ def encode_categorical(df, label_encoders):
+     for col, le in label_encoders.items():
+         # Transform using saved encoder; handle unseen labels
+         df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
+         df[col] = le.transform(df[col])
+     return df
+
+ def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Clean and standardize certain categorical columns for inference.
+
+     Replaces missing or malformed values with 'Unknown' to match training preprocessing.
+
+     Args:
+         df (pd.DataFrame): Input dataframe with user data.
+
+     Returns:
+         pd.DataFrame: DataFrame with cleaned categorical columns.
+     """
+     columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
+
+     for col in columns_to_clean:
+         # Replace known missing/malformed values with 'Unknown'
+         df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+         # Replace actual NaN values with 'Unknown'
+         df[col] = df[col].fillna('Unknown')
+
+     return df
+
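For illustration (assumed attribute list, not the saved study_design_attributes.pkl), the parser keeps only known keys and leaves everything else as 'Unknown':

attrs = ["Allocation", "Intervention Model", "Masking", "Primary Purpose"]
parse_study_design("Allocation: RANDOMIZED|Masking: DOUBLE|Foo: BAR", attrs)
# -> {'Allocation': 'RANDOMIZED', 'Intervention Model': 'Unknown',
#     'Masking': 'DOUBLE', 'Primary Purpose': 'Unknown'}
# The unrecognized key 'Foo' is ignored, so no new columns appear downstream.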
backend/preprocessing/cleaning.py ADDED
@@ -0,0 +1,42 @@
+ # cleaning.py
+
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+     return df.drop_duplicates()
+
+ def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
+     return df[required_cols].copy()
+
+ def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Apply sqrt transform to 'Enrollment' column
+     """
+     df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
+     return df
+
+ def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
+     """
+     Fill missing numerical values with the median of each column.
+     """
+     for col in numerical_cols:
+         df[col] = df[col].fillna(df[col].median())
+     return df
+
+ def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
+     """
+     Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
+     """
+     for col in columns_to_clean:
+         df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+         df[col] = df[col].fillna('Unknown')
+     return df
+
+ def drop_irrelevant_columns(df, columns_to_drop):
+     return df.drop(columns=columns_to_drop, errors='ignore')
backend/preprocessing/embeddings.py ADDED
@@ -0,0 +1,70 @@
+ # embeddings.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+
+ # ------------------------
+ # Extract Embeddings
+ # ------------------------
+
+
+ def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
+     """
+     Extract embeddings from tokenized textual data using BioBERT.
+
+     Args:
+         tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
+         model (transformers.PreTrainedModel): BioBERT model (without classification head).
+         device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
+         batch_size (int): Batch size for embedding extraction.
+         save_to_disk (bool): Whether to save embeddings as .pt files for each column.
+
+     Returns:
+         dict: Dictionary of embeddings for each column.
+     """
+     if device is None:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     model.eval()  # Ensure model is in evaluation mode
+
+     embeddings_dict = {}
+
+     for col, tokenized_data in tokenized_data_dict.items():
+         print(f"Extracting embeddings for column: {col}")
+
+         input_ids = tokenized_data["input_ids"]
+         attention_mask = tokenized_data["attention_mask"]
+
+         dataset = TensorDataset(input_ids, attention_mask)
+         dataloader = DataLoader(dataset, batch_size=batch_size)
+
+         all_embeddings = []
+
+         with torch.no_grad():
+             for batch in dataloader:
+                 input_ids_batch, attention_mask_batch = batch
+                 input_ids_batch = input_ids_batch.to(device)
+                 attention_mask_batch = attention_mask_batch.to(device)
+
+                 outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
+                 hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
+
+                 # Mean pooling across sequence length
+                 embeddings = hidden_states.mean(dim=1)
+                 all_embeddings.append(embeddings.cpu())
+
+         embeddings_col = torch.cat(all_embeddings, dim=0)
+         embeddings_dict[col] = embeddings_col
+
+         if save_to_disk:
+             torch.save(embeddings_col, f"{col}_embeddings.pt")
+             print(f"Saved embeddings for column: {col}")
+
+         print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
+
+     return embeddings_dict
backend/preprocessing/globals.py ADDED
@@ -0,0 +1,17 @@
+ # globals.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+ from pathlib import Path
+
+ BACKEND_DIR = Path(__file__).parent.parent
+
+ # --- Load saved artifacts using the absolute path ---
+ scaler = joblib.load(BACKEND_DIR / "models/scaler_enrollment.pkl")
+ label_encoders = joblib.load(BACKEND_DIR / "models/feature_label_encoders.pkl")
+ unique_attributes = joblib.load(BACKEND_DIR / "models/study_design_attributes.pkl")
+
backend/preprocessing/preprocessing_all.py ADDED
@@ -0,0 +1,276 @@
+ # preprocessing_all.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+ # ------------------------
+ # Load saved artifacts
+ # ------------------------
+
+ scaler = joblib.load("models/scaler_enrollment.pkl")  # StandardScaler for 'Enrollment'
+ label_encoders = joblib.load("models/label_encoders.pkl")  # Dict of LabelEncoders for categorical columns
+ unique_attributes = joblib.load("models/study_design_attributes.pkl")  # List of Study Design attributes
+
+ def drop_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+     return df.drop_duplicates()
+
+ def select_required_columns(df: pd.DataFrame, required_cols: list) -> pd.DataFrame:
+     return df[required_cols].copy()
+
+ def transform_numeric(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Apply sqrt transform to 'Enrollment' column
+     """
+     df['Enrollment'] = np.sqrt(df['Enrollment'] + 1e-6)
+     return df
+
+ def fill_missing_numerical(df: pd.DataFrame, numerical_cols: list) -> pd.DataFrame:
+     """
+     Fill missing numerical values with the median of each column.
+     """
+     for col in numerical_cols:
+         df[col] = df[col].fillna(df[col].median())
+     return df
+
+ def fill_missing_categorical(df: pd.DataFrame, columns_to_clean: list) -> pd.DataFrame:
+     """
+     Replace 'Unknown', 'NA', '', ' ' and NaN with 'Unknown' in given categorical columns.
+     """
+     for col in columns_to_clean:
+         df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+         df[col] = df[col].fillna('Unknown')
+     return df
+
+ def drop_irrelevant_columns(df, columns_to_drop):
+     return df.drop(columns=columns_to_drop, errors='ignore')
+
+ # ------------------------
+ # Study Design Parsing
+ # ------------------------
+
+ def parse_study_design(study_design, all_attributes):
+     attributes = {attr: "Unknown" for attr in all_attributes}
+     if study_design != "Unknown" and pd.notna(study_design):
+         for part in study_design.split('|'):
+             if ':' in part:
+                 key, value = part.split(':', 1)
+                 attributes[key.strip()] = value.strip()
+     return attributes
+
+ def expand_study_design(df, unique_attributes):
+     parsed = df['Study Design'].apply(lambda x: parse_study_design(x, unique_attributes))
+     study_df = pd.DataFrame(parsed.tolist(), index=df.index)
+     df = pd.concat([df, study_df], axis=1)
+     df = df.drop(columns=['Study Design'], errors='ignore')
+     return df
+
+ # ------------------------
+ # Encoding Categorical Columns
+ # ------------------------
+
+ def encode_categorical(df, label_encoders):
+     for col, le in label_encoders.items():
+         # Transform using saved encoder; handle unseen labels
+         df[col] = df[col].map(lambda x: x if x in le.classes_ else "Unknown")
+         df[col] = le.transform(df[col])
+     return df
+
+ def clean_categorical_columns(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     Clean and standardize certain categorical columns for inference.
+
+     Replaces missing or malformed values with 'Unknown' to match training preprocessing.
+
+     Args:
+         df (pd.DataFrame): Input dataframe with user data.
+
+     Returns:
+         pd.DataFrame: DataFrame with cleaned categorical columns.
+     """
+     columns_to_clean = ['Allocation', 'Intervention Model', 'Masking', 'Primary Purpose']
+
+     for col in columns_to_clean:
+         # Replace known missing/malformed values with 'Unknown'
+         df[col] = df[col].replace(['Unknown', 'NA', '', ' '], 'Unknown')
+         # Replace actual NaN values with 'Unknown'
+         df[col] = df[col].fillna('Unknown')
+
+     return df
+
+ # ------------------------
+ # Scaling numeric columns
+ # ------------------------
+
+ def scale_numeric(df, scaler):
+     """
+     Standardize numerical columns using StandardScaler.
+     """
+     df['Enrollment'] = scaler.transform(df[['Enrollment']])
+     return df
+
+ # ------------------------
+ # Text preprocessing
+ # ------------------------
+
+ def clean_text(text):
+     if pd.isna(text):  # Handle missing values
+         return ""
+     text = text.lower()  # Convert to lowercase
+     text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
+     return ' '.join(text.split())  # Remove extra whitespaces
+
+ def preprocess_text_columns(df, text_columns):
+     for col in text_columns:
+         df[col] = df[col].fillna("No info provided")
+         df[col] = df[col].apply(clean_text)
+     return df
+
+ # ------------------------
+ # Tokenization of textual Columns
+ # ------------------------
+
+ def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
+     """
+     Tokenizes multiple textual columns in batches for inference.
+
+     Args:
+         df (pd.DataFrame): DataFrame containing textual columns.
+         textual_columns (list): List of column names to tokenize.
+         tokenizer: HuggingFace tokenizer.
+         batch_size (int): Number of samples per batch.
+         max_length (int): Maximum token length per sequence.
+
+     Returns:
+         dict: Dictionary with column names as keys and tokenized tensors as values.
+     """
+     def tokenize_in_batches(column_texts):
+         tokenized_batches = []
+         for i in range(0, len(column_texts), batch_size):
+             batch = column_texts[i:i + batch_size].tolist()
+             tokenized_batch = tokenizer(
+                 batch,
+                 padding="max_length",
+                 truncation=True,
+                 max_length=max_length,
+                 return_tensors="pt"
+             )
+             tokenized_batches.append(tokenized_batch)
+         # Combine batches
+         return {
+             "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
+             "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
+         }
+
+     tokenized_data = {}
+     for col in textual_columns:
+         tokenized_data[col] = tokenize_in_batches(df[col])
+     return tokenized_data
+
+ # ------------------------
+ # Extract Embeddings
+ # ------------------------
+
+
+ def extract_text_embeddings(tokenized_data_dict, model, device=None, batch_size=32, save_to_disk=False):
+     """
+     Extract embeddings from tokenized textual data using BioBERT.
+
+     Args:
+         tokenized_data_dict (dict): Dictionary of tokenized columns (output of `tokenize_text_columns`).
+         model (transformers.PreTrainedModel): BioBERT model (without classification head).
+         device (torch.device, optional): Device to run the model on. Defaults to GPU if available.
+         batch_size (int): Batch size for embedding extraction.
+         save_to_disk (bool): Whether to save embeddings as .pt files for each column.
+
+     Returns:
+         dict: Dictionary of embeddings for each column.
+     """
+     if device is None:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+     model.eval()  # Ensure model is in evaluation mode
+
+     embeddings_dict = {}
+
+     for col, tokenized_data in tokenized_data_dict.items():
+         print(f"Extracting embeddings for column: {col}")
+
+         input_ids = tokenized_data["input_ids"]
+         attention_mask = tokenized_data["attention_mask"]
+
+         dataset = TensorDataset(input_ids, attention_mask)
+         dataloader = DataLoader(dataset, batch_size=batch_size)
+
+         all_embeddings = []
+
+         with torch.no_grad():
+             for batch in dataloader:
+                 input_ids_batch, attention_mask_batch = batch
+                 input_ids_batch = input_ids_batch.to(device)
+                 attention_mask_batch = attention_mask_batch.to(device)
+
+                 outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)
+                 hidden_states = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]
+
+                 # Mean pooling across sequence length
+                 embeddings = hidden_states.mean(dim=1)
+                 all_embeddings.append(embeddings.cpu())
+
+         embeddings_col = torch.cat(all_embeddings, dim=0)
+         embeddings_dict[col] = embeddings_col
+
+         if save_to_disk:
+             torch.save(embeddings_col, f"{col}_embeddings.pt")
+             print(f"Saved embeddings for column: {col}")
+
+         print(f"Shape of embeddings for column {col}: {embeddings_col.shape}")
+
+     return embeddings_dict
+
+ # ------------------------
+ # Main preprocessing function
+ # ------------------------
+
+ def preprocess(df, required_cols, categorical_cols, columns_to_drop, text_columns,
+                tokenizer=None, biobert_model=None, device='cpu'):
+     """
+     Full preprocessing pipeline.
+
+     Args:
+         df (pd.DataFrame): Input DataFrame (single row or batch).
+         required_cols (list): Columns to select from df.
+         categorical_cols (list): Categorical columns to encode.
+         columns_to_drop (list): Columns to drop from df.
+         text_columns (list): Textual columns to preprocess.
+         tokenizer (transformers.AutoTokenizer, optional): BioBERT tokenizer for text.
+         biobert_model (transformers.AutoModel, optional): BioBERT model (no classification head).
+         device (str): 'cpu' or 'cuda'.
+
+     Returns:
+         df (pd.DataFrame): Preprocessed tabular DataFrame.
+         embeddings (dict or None): Dict of embeddings for text columns, if model provided.
+     """
+     # Tabular preprocessing
+     df = drop_duplicates(df)
+     df = select_required_columns(df, required_cols)
+     df = transform_numeric(df)
+     df = fill_missing_numerical(df, ["Enrollment"])  # median fill for Enrollment
+     df = fill_missing_categorical(df, categorical_cols)
+     df = drop_irrelevant_columns(df, columns_to_drop)
+     df = expand_study_design(df, unique_attributes)
+     df = clean_categorical_columns(df)
+     df = encode_categorical(df, label_encoders)
+     df = scale_numeric(df, scaler)
+     df = preprocess_text_columns(df, text_columns)
+
+     embeddings = None
+     if tokenizer is not None and biobert_model is not None:
+         tokenized_dict = tokenize_text_columns(df, text_columns, tokenizer)
+         embeddings = extract_text_embeddings(tokenized_dict, biobert_model, device=device)
+
+     return df, embeddings
+
backend/preprocessing/scaling.py ADDED
@@ -0,0 +1,19 @@
+ # scaling.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+ # ------------------------
+ # Scaling numeric columns
+ # ------------------------
+
+ def scale_numeric(df, scaler):
+     """
+     Standardize numerical columns using StandardScaler.
+     """
+     df['Enrollment'] = scaler.transform(df[['Enrollment']])
+     return df
backend/preprocessing/text_processing.py ADDED
@@ -0,0 +1,66 @@
+ # text_processing.py
+ import pandas as pd
+ import numpy as np
+ import joblib
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
+ import torch
+ from torch.utils.data import TensorDataset, DataLoader
+ from transformers import AutoModel
+
+ # ------------------------
+ # Text preprocessing
+ # ------------------------
+
+ def clean_text(text):
+     if pd.isna(text):  # Handle missing values
+         return ""
+     text = text.lower()  # Convert to lowercase
+     text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
+     return ' '.join(text.split())  # Remove extra whitespaces
+
+ def preprocess_text_columns(df, text_columns):
+     for col in text_columns:
+         df[col] = df[col].fillna("No info provided")
+         df[col] = df[col].apply(clean_text)
+     return df
+
+ # ------------------------
+ # Tokenization of textual Columns
+ # ------------------------
+
+ def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
+     """
+     Tokenizes multiple textual columns in batches for inference.
+
+     Args:
+         df (pd.DataFrame): DataFrame containing textual columns.
+         textual_columns (list): List of column names to tokenize.
+         tokenizer: HuggingFace tokenizer.
+         batch_size (int): Number of samples per batch.
+         max_length (int): Maximum token length per sequence.
+
+     Returns:
+         dict: Dictionary with column names as keys and tokenized tensors as values.
+     """
+     def tokenize_in_batches(column_texts):
+         tokenized_batches = []
+         for i in range(0, len(column_texts), batch_size):
+             batch = column_texts[i:i + batch_size].tolist()
+             tokenized_batch = tokenizer(
+                 batch,
+                 padding="max_length",
+                 truncation=True,
+                 max_length=max_length,
+                 return_tensors="pt"
+             )
+             tokenized_batches.append(tokenized_batch)
+         # Combine batches
+         return {
+             "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
+             "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
+         }
+
+     tokenized_data = {}
+     for col in textual_columns:
+         tokenized_data[col] = tokenize_in_batches(df[col])
+     return tokenized_data
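A small example of the text normalization performed by clean_text (illustrative input):

clean_text("Phase-3   Trial of Drug X!")  # -> "phase3 trial of drug x"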
data/.gitkeep ADDED
File without changes
embeddings/.gitkeep ADDED
File without changes
frontend/app.py ADDED
@@ -0,0 +1,149 @@
+ import sys
+ import os
+ from pathlib import Path
+
+ # PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+ # if PROJECT_ROOT not in sys.path:
+ #     sys.path.insert(0, PROJECT_ROOT)
+
+ project_root = Path(__file__).parent.parent
+ sys.path.insert(0, str(project_root))
+
+ import streamlit as st
+ import pandas as pd
+ from backend.pipelines.run_inference import predict
+ from io import BytesIO
+
+ st.set_page_config(page_title="Study Status Prediction", page_icon="📊", layout="wide")
+
+ st.title("📊 Study Status Prediction")
+ st.markdown("Upload a CSV file or manually enter study details to predict whether a study is **COMPLETED** or **NOT COMPLETED**.")
+
+ # Tabs for CSV Upload and Manual Entry
+ tab1, tab2 = st.tabs(["📂 Upload CSV", "✍️ Manual Entry"])
+
+ # --- Option 1: CSV Upload ---
+ with tab1:
+     uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
+
+     if uploaded_file:
+         df_new = pd.read_csv(uploaded_file)
+         preds = predict(df_new)
+
+         # Only keep final predictions
+         final_preds = pd.DataFrame({"Final Prediction": preds["final_predictions"]})
+
+         # Display a preview
+         st.subheader("🔎 Predictions Preview")
+         st.dataframe(final_preds.head())
+
+         # Download button for predictions
+         csv_buffer = BytesIO()
+         final_preds.to_csv(csv_buffer, index=False)
+         st.download_button(
+             label="📥 Download Predictions CSV",
+             data=csv_buffer.getvalue(),
+             file_name="predictions.csv",
+             mime="text/csv"
+         )
+
+ # --- Option 2: Manual Entry ---
+ with tab2:
+     st.subheader("✍️ Enter Study Details")
+     st.markdown("Fill in the fields below to predict the study status.")
+
+     # Placeholders for all the features
+     nct_number = st.text_input("NCT Number", placeholder="e.g., NCT01234567")
+     study_title = st.text_area("Study Title", placeholder="e.g., A Study of Drug X in Treating Lung Cancer")
+     study_url = st.text_input("Study URL", placeholder="e.g., https://clinicaltrials.gov/ct2/show/NCT01234567")
+     acronym = st.text_input("Acronym", placeholder="e.g., LUNG-X")
+     brief_summary = st.text_area("Brief Summary", placeholder="e.g., This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.")
+     study_results = st.selectbox("Study Results", ["YES", "NO"])
+     conditions = st.text_input("Conditions", placeholder="e.g., Lung Cancer")
+     interventions = st.text_input("Interventions", placeholder="e.g., Drug Y")
+     primary_outcome = st.text_input("Primary Outcome Measures", placeholder="e.g., Survival rate")
+     secondary_outcome = st.text_input("Secondary Outcome Measures", placeholder="e.g., Side effects")
+     other_outcome = st.text_input("Other Outcome Measures", placeholder="Optional")
+     sponsor = st.text_input("Sponsor", placeholder="e.g., ABC Research")
+     collaborators = st.text_input("Collaborators", placeholder="e.g., University of SFX")
+     sex = st.selectbox("Sex", ["ALL", "MALE", "FEMALE"])
+     age = st.selectbox("Age", ["ADULT, OLDER_ADULT",
+                                "ADULT",
+                                "CHILD, ADULT, OLDER_ADULT",
+                                "CHILD",
+                                "CHILD, ADULT",
+                                "OLDER_ADULT"])
+     phases = st.selectbox("Phases", ["PHASE2",
+                                      "PHASE1",
+                                      "PHASE4",
+                                      "PHASE3",
+                                      "PHASE1|PHASE2",
+                                      "PHASE2|PHASE3",
+                                      "EARLY_PHASE1"])
+     enrollment = st.number_input("Enrollment", min_value=0, step=1, placeholder="e.g., 500")
+     funder_type = st.selectbox("Funder Type", ["OTHER",
+                                                "INDUSTRY",
+                                                "NIH",
+                                                "OTHER_GOV",
+                                                "NETWORK",
+                                                "FED",
+                                                "INDIV",
+                                                "UNKNOWN",
+                                                "AMBIG"])
+     study_type = st.selectbox("Study Type", ["INTERVENTIONAL", "OBSERVATIONAL"])
+     study_design = st.text_area("Study Design", placeholder="e.g., Intervention Model: PARALLEL | Masking: SINGLE (INVESTIGATOR)")
+     other_ids = st.text_input("Other IDs", placeholder="e.g., ABC-123")
+     start_date = st.text_input("Start Date", placeholder="e.g., January 2023")
+     primary_completion_date = st.text_input("Primary Completion Date", placeholder="e.g., December 2025")
+     completion_date = st.text_input("Completion Date", placeholder="e.g., June 2026")
+     first_posted = st.text_input("First Posted", placeholder="e.g., February 2023")
+     results_first_posted = st.text_input("Results First Posted", placeholder="e.g., N/A")
+     last_update_posted = st.text_input("Last Update Posted", placeholder="e.g., September 2025")
+     locations = st.text_area("Locations", placeholder="e.g., New York, USA")
+     study_documents = st.text_area("Study Documents", placeholder="e.g., Protocol PDF")
+
+
+
+     if st.button("🔮 Predict Status"):
+         single_data = {
+             "NCT Number": nct_number,
+             "Study Title": study_title,
+             "Study URL": study_url,
+             "Acronym": acronym,
+             "Brief Summary": brief_summary,
+             "Study Results": study_results,
+             "Conditions": conditions,
+             "Interventions": interventions,
+             "Primary Outcome Measures": primary_outcome,
+             "Secondary Outcome Measures": secondary_outcome,
+             "Other Outcome Measures": other_outcome,
+             "Sponsor": sponsor,
+             "Collaborators": collaborators,
+             "Sex": sex,
+             "Age": age,
+             "Phases": phases,
+             "Enrollment": enrollment,
+             "Funder Type": funder_type,
+             "Study Type": study_type,
+             "Study Design": study_design,
+             "Other IDs": other_ids,
+             "Start Date": start_date,
+             "Primary Completion Date": primary_completion_date,
+             "Completion Date": completion_date,
+             "First Posted": first_posted,
+             "Results First Posted": results_first_posted,
+             "Last Update Posted": last_update_posted,
+             "Locations": locations,
+             "Study Documents": study_documents,
+         }
+
+         df_single = pd.DataFrame([single_data])
+
+         preds = predict(df_single)
+
+         # Show final prediction with animation
+         final_label = preds["final_predictions"][0]
+         if final_label == "COMPLETED":
+             st.success(f"✅ Prediction: **{final_label}**", icon="🎉")
+         else:
+             st.error(f"❌ Prediction: **{final_label}**", icon="⚠️")
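Usage note: the UI is launched with Streamlit's standard CLI from the repository root, i.e. `streamlit run frontend/app.py` (assumed invocation); the sys.path insertion at the top of the file is what makes the `backend` package importable in that context.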
notebooks/clinical-trial-outcome-prediction.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,25 @@
+ # Core libraries
+ numpy>=1.26.4,<2.2
+ pandas==2.2.2
+ scikit-learn==1.2.2
+ joblib==1.3.2
+
+ # FastAPI and server
+ fastapi==0.111.0
+ uvicorn==0.29.0
+
+ # Data visualization / plotting
+ matplotlib==3.9.0
+ seaborn==0.13.2
+
+ # Streamlit
+ streamlit==1.35.0
+
+ # # PyTorch
+ # torch==2.3.0
+ # torchvision==0.18.0
+ # torchaudio==2.3.0
+
+ # Transformers / HuggingFace
+ transformers==4.43.0
+ huggingface_hub==0.23.4
save_preprocessor.py ADDED
@@ -0,0 +1,53 @@
+ # save_preprocessor.py
+ from transformers import AutoTokenizer, AutoModel
+ from backend.pipelines.preprocessor_pipeline import Preprocessor
+ import pandas as pd
+
+ # Define dataset columns (adapt to your dataset)
+ sample_df = pd.DataFrame([{
+     "Brief Summary": "This is a sample study.",
+     "Study Results": "Has Results",
+     "Conditions": "Condition A",
+     "Interventions": "Drug X",
+     "Primary Outcome Measures": "Outcome 1",
+     "Secondary Outcome Measures": "Outcome 2",
+     "Sponsor": "XYZ Corp",
+     "Sex": "All",
+     "Age": "Adult",
+     "Funder Type": "Industry",
+     "Phases": "Phase 2",
+     "Enrollment": 120,
+     "Study Type": "Interventional",
+     "Study Design": "Intervention: Randomized|Masking: Double",
+ }])
+
+ required_cols = sample_df.columns.tolist()
+ categorical_cols = [
+     "Study Results", "Sex", "Age", "Funder Type", "Phases",
+     "Study Type"
+ ]
+ columns_to_drop = ["Sponsor", "Observational Model", "Time Perspective"]
+ text_columns = [
+     "Brief Summary", "Conditions", "Interventions",
+     "Primary Outcome Measures", "Secondary Outcome Measures"
+ ]
+
+ # Load BioBERT
+ tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+ model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
+
+ # Ensure pad token exists
+ if tokenizer.pad_token is None:
+     tokenizer.pad_token = tokenizer.cls_token or "[PAD]"
+
+ # Create and save preprocessor
+ preprocessor = Preprocessor(
+     required_cols,
+     categorical_cols,
+     columns_to_drop,
+     text_columns,
+     tokenizer=tokenizer,
+     biobert_model=model,
+     device="cpu"
+ )
+ preprocessor.save("backend/models/preprocessor.pkl")
tests/test_inference.py ADDED
@@ -0,0 +1,50 @@
+ # test_inference.py
+ # Run from the repository root so that `backend` is importable as a package.
+ import pandas as pd
+ from backend.pipelines.preprocessor_pipeline import Preprocessor
+
+ # Load saved preprocessor
+ preprocessor = Preprocessor.load("backend/models/preprocessor.pkl")
+
+ # Sample new data for inference (includes deliberately unseen/garbage values)
+ df_new = pd.DataFrame([{
+     "NCT Number": "NCT01234567",
+     "Study Title": "A Study of Drug X in Treating Lung Cancer",
+     "Study URL": "https://clinicaltrials.gov/ct2/show/NCT01234567",
+     "Acronym": "LUNG-X",
+     "Brief Summary": "This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.",
+     "Study Results": "NO",
+     "Conditions": "Lung Cancer",
+     "Interventions": "Drug Y",
+     "Primary Outcome Measures": "Survival rate",
+     "Secondary Outcome Measures": "Side effects",
+     "Other Outcome Measures": "",
+     "Sponsor": "ABC Research",
+     "Collaborators": "University of SFX",
+     "Sex": "MALE",
+     "Age": "garbage value - jhfkjahfaiueuw",
+     "Phases": "Phase 3",
+     "Enrollment": 500,
+     "Funder Type": "Government",
+     "Study Type": "Archchisman",
+     "Study Design": "Intervention Model: Randomized|Masking: QUADRUPLE (PARTICIPANT, CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR)|Observational Model: Observing|Name: Archchisman Banerjee",
+     "Other IDs": "ABC-123",
+     "Start Date": "January 2023",
+     "Primary Completion Date": "December 2025",
+     "Completion Date": "June 2026",
+     "First Posted": "February 2023",
+     "Results First Posted": "N/A",
+     "Last Update Posted": "September 2025",
+     "Locations": "New York, USA",
+     "Study Documents": "Protocol PDF"
+ }])
+
+ X_tabular, embeddings = preprocessor.transform(df_new)
+
+ print("Processed Tabular Features:")
+ print(X_tabular.head())
+ X_tabular.to_csv("test.csv")
+
+ if embeddings:
+     for col, emb in embeddings.items():
+         print(f"Embeddings for {col}: {emb.shape}")