# test_inference.py
import pandas as pd
from pipelines.preprocessor_pipeline import Preprocessor

# Load the saved preprocessor from its on-disk artifact.
# NOTE(review): the ".pkl" suffix suggests a pickle-based format — if so,
# only load artifacts from a trusted source. Confirm against Preprocessor.load.
preprocessor_path = "models/preprocessor.pkl"
preprocessor = Preprocessor.load(preprocessor_path)

# Sample new data for inference: a single-row DataFrame whose columns are the
# clinical-trial fields listed below. Several values are intentionally odd so
# the preprocessor is exercised on imperfect input.
df_new = pd.DataFrame([{
    "NCT Number": "NCT01234567",
    "Study Title": "A Study of Drug X in Treating Lung Cancer",
    "Study URL": "https://clinicaltrials.gov/ct2/show/NCT01234567",
    "Acronym": "LUNG-X",
    "Brief Summary": "This is a phase 3 trial evaluating the effectiveness of Drug X for lung cancer.",
    "Study Results": "NO",
    "Conditions": "Lung Cancer",
    "Interventions": "Drug Y",
    "Primary Outcome Measures": "Survival rate",
    "Secondary Outcome Measures": "Side effects",
    # Empty string on purpose: checks handling of a blank text field.
    "Other Outcome Measures": "",
    "Sponsor": "ABC Research",
    "Collaborators": "University of SFX",
    "Sex": "MALE",
    # Deliberately unparseable value, as the literal itself states.
    "Age": "garbage value - jhfkjahfaiueuw",
    "Phases": "Phase 3",
    "Enrollment": 500,
    "Funder Type": "Government",
    # NOTE(review): not a recognisable study type — presumably probes
    # unseen-category handling; confirm against the preprocessor.
    "Study Type": "Archchisman",
    # Pipe-separated "key: value" segments, including extra segments
    # ("Observational Model", "Name") alongside the interventional ones.
    "Study Design": "Intervention Model: Randomized|Masking: QUADRUPLE (PARTICIPANT, CARE_PROVIDER, INVESTIGATOR, OUTCOMES_ASSESSOR)|Observational Model: Observing|Name: Archchisman Banerjee",
    "Other IDs": "ABC-123",
    "Start Date": "January 2023",
    "Primary Completion Date": "December 2025",
    "Completion Date": "June 2026",
    "First Posted": "February 2023",
    "Results First Posted": "N/A",
    "Last Update Posted": "September 2025",
    "Locations": "New York, USA",
    "Study Documents": "Protocol PDF",
}])

# Run the sample row through the preprocessor; transform() hands back a pair:
# the tabular feature frame and a collection of per-column embeddings.
X_tabular, embeddings = preprocessor.transform(df_new)

print("Processed Tabular Features:")
preview = X_tabular.head()
print(preview)
# Write the full tabular output to disk for manual inspection.
X_tabular.to_csv("test.csv")

# `embeddings or {}` skips the loop when nothing (None / empty) was returned.
for col, emb in (embeddings or {}).items():
    print(f"Embeddings for {col}: {emb.shape}")