from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate from sklearn.metrics import accuracy_score from flask import Flask, request, jsonify from flask_cors import CORS import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from xgboost import XGBClassifier from sklearn.metrics import precision_score, recall_score, f1_score from io import StringIO import os # =============================================================================== # Input Validation Functions # =============================================================================== def validate_input(data, trips_col='Number of Trips', earnings_col='Earnings', min_trips=0, max_trips=1000, min_earnings=0, max_earnings=100000): """ Validates input data for negative trips and unrealistic earnings. Returns (True, None) if valid, else (False, error_message). """ # Check for single row (dict or DataFrame) if isinstance(data, dict): trips = data.get(trips_col, None) earnings = data.get(earnings_col, None) if trips is not None and (trips < min_trips or trips > max_trips): return False, f"Invalid number of trips: {trips}. Must be between {min_trips} and {max_trips}." if earnings is not None and (earnings < min_earnings or earnings > max_earnings): return False, f"Invalid earnings: {earnings}. Must be between {min_earnings} and {max_earnings}." elif isinstance(data, pd.DataFrame): if trips_col in data.columns: invalid_trips = data[(data[trips_col] < min_trips) | (data[trips_col] > max_trips)] if not invalid_trips.empty: return False, f"Invalid number of trips in rows: {invalid_trips.index.tolist()}" if earnings_col in data.columns: invalid_earnings = data[(data[earnings_col] < min_earnings) | (data[earnings_col] > max_earnings)] if not invalid_earnings.empty: return False, f"Invalid earnings in rows: {invalid_earnings.index.tolist()}" return True, None # ============================================================================== # Step 1: Initialize Flask App and Model Variables # ============================================================================== app = Flask(__name__) CORS(app) # Enable CORS to allow the frontend to access this API # Global variables to hold the trained model and features model = None train_features_columns = None evaluation_metrics = {} # ============================================================================== # Step 2: Core ML Functions (from your original script) # ============================================================================== def load_and_preprocess_data(csv_path): """ Loads and preprocesses the dataset. """ try: df = pd.read_csv(csv_path) except FileNotFoundError: print(f"Error: The file {csv_path} was not found.") return None, None target_column = 'Creditworthy' # Drop columns that are not features for the model df = df.drop(columns=['Partner ID'], errors='ignore') # Identify non-numeric columns categorical_cols = df.select_dtypes(include=['object']).columns.tolist() # One-hot encode categorical features df = pd.get_dummies(df, columns=categorical_cols, drop_first=True) # Ensure all remaining feature columns are numeric for col in df.columns: if col != target_column: df[col] = pd.to_numeric(df[col], errors='coerce') # Drop any rows that now have NaN values after the coercion df = df.dropna() return df, target_column def train_model(df, target_column): """ Splits data and trains an XGBoost classifier. """ X = df.drop(target_column, axis=1) y = df[target_column] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model = XGBClassifier(eval_metric='logloss') model.fit(X_train, y_train) return model, X_test, y_test def evaluate_model(model, X_test, y_test): """ Evaluates the trained model using key metrics. Returns the metrics as a dictionary. """ y_pred = model.predict(X_test) evaluation_metrics = { 'accuracy': accuracy_score(y_test, y_pred), 'precision': precision_score(y_test, y_pred), 'recall': recall_score(y_test, y_pred), 'f1_score': f1_score(y_test, y_pred) } # Fairness metrics using Fairlearn (if sensitive attribute exists) sensitive_attr = None # Try common sensitive attribute names for col in ['gender', 'Gender', 'partner_gender', 'Partner Gender']: if col in X_test.columns: sensitive_attr = X_test[col] break if sensitive_attr is not None: mf = MetricFrame(metrics={'accuracy': accuracy_score, 'selection_rate': selection_rate}, y_true=y_test, y_pred=y_pred, sensitive_features=sensitive_attr) print("\nFairness metrics by group (Fairlearn):") print(mf.by_group) else: print("No sensitive attribute found for group fairness metrics.") return evaluation_metrics def preprocess_user_data(user_df, train_columns): """ Prepares the user's data to match the format of the training data. """ # Identify and one-hot encode categorical features from the user's data categorical_cols = user_df.select_dtypes(include=['object']).columns.tolist() user_df = pd.get_dummies(user_df, columns=categorical_cols, drop_first=True) # Identify which columns are in the training data but not the user data missing_cols = set(train_columns) - set(user_df.columns) # Add any missing columns from the training data with default value 0 for c in missing_cols: user_df[c] = 0 # Drop any extra columns from the user data that were not in the training data # This is crucial for single-entry data extra_cols = set(user_df.columns) - set(train_columns) user_df = user_df.drop(columns=list(extra_cols), errors='ignore') # Reorder columns to match the training data user_df = user_df[train_columns] return user_df # ============================================================================== # Step 2.5: New Function to Save Data to CSV # ============================================================================== def save_to_csv(data_df, filename='online_testcases.csv'): """ Saves a DataFrame to a CSV file. Removes any empty columns (like 'Creditworthy') before saving. """ # Drop 'Creditworthy' if it exists and is empty or all NaN if 'Creditworthy' in data_df.columns and data_df['Creditworthy'].isnull().all(): data_df = data_df.drop(columns=['Creditworthy']) # Drop any other columns that are all NaN data_df = data_df.dropna(axis=1, how='all') file_exists = os.path.isfile(filename) data_df.to_csv(filename, mode='a', header=not file_exists, index=False) print(f"Data successfully saved to {filename}") # ============================================================================== # Step 3: API Endpoint for Prediction (Single Input) # ============================================================================== @app.route('/predict', methods=['POST']) def predict(): """ Endpoint to receive a single user input, make a prediction, and return metrics. """ # Check if global variables are None. This is the correct way to handle this. if model is None or train_features_columns is None or evaluation_metrics is None: return jsonify({'error': 'Model is not trained or loaded. Please check backend logs.'}), 500 try: user_input = request.json # Input validation valid, error_msg = validate_input(user_input) if not valid: return jsonify({'error': error_msg}), 400 user_df = pd.DataFrame([user_input]) # Preprocess the user's data to match the training data format user_features_processed = preprocess_user_data(user_df.copy(), train_features_columns) # Make the prediction prediction = model.predict(user_features_processed) result = "Eligible" if prediction[0] == 1 else "Not Eligible" # Add prediction to the original DataFrame for logging user_df['Creditworthy_Prediction'] = result # Save the original user input plus prediction to the CSV file save_to_csv(user_df) # Return the prediction and evaluation metrics return jsonify({ 'prediction': result, 'metrics': evaluation_metrics }) except Exception as e: # Gracefully handle any errors during the process return jsonify({'error': str(e)}), 500 # ============================================================================== # Step 4: API Endpoint for Bulk Prediction (CSV Upload) # ============================================================================== @app.route('/predict_csv', methods=['POST']) def predict_csv(): """ Endpoint to receive a CSV file, make bulk predictions, and return results. """ if 'file' not in request.files: return jsonify({'error': 'No file part in the request'}), 400 file = request.files['file'] if file.filename == '': return jsonify({'error': 'No selected file'}), 400 if file: try: # Read the CSV file from the request csv_data = StringIO(file.read().decode('utf-8')) input_df = pd.read_csv(csv_data) # Check if ground truth is present has_ground_truth = 'Creditworthy' in input_df.columns # Remove 'Creditworthy' column from features for prediction if has_ground_truth: y_true = input_df['Creditworthy'] input_df_features = input_df.drop(columns=['Creditworthy']) else: input_df_features = input_df # Remove any other empty columns input_df_features = input_df_features.dropna(axis=1, how='all') # Input validation for all rows valid, error_msg = validate_input(input_df_features) if not valid: return jsonify({'error': error_msg}), 400 # Preprocess the entire DataFrame user_features_processed = preprocess_user_data(input_df_features.copy(), train_features_columns) # Make the predictions predictions = model.predict(user_features_processed) # Add the predictions to the original DataFrame input_df['Creditworthy_Prediction'] = np.where(predictions == 1, 'Eligible', 'Not Eligible') # Remove any empty columns again before saving/returning input_df = input_df.dropna(axis=1, how='all') # Save the entire DataFrame to the CSV file save_to_csv(input_df) # --- Fairness & Bias Reporting --- fairness_metrics = {} fairness_observation = "Fairness metrics require ground truth labels and are not available for this upload." if has_ground_truth: # Only compute fairness if ground truth is present sensitive_col = 'Partner Type' if sensitive_col in input_df.columns: y_pred = (input_df['Creditworthy_Prediction'] == 'Eligible').astype(int) # If Creditworthy is string, convert to binary if y_true.dtype == object: y_true_bin = y_true.map(lambda x: 1 if str(x).lower() in ['eligible', '1', 'true', 'yes'] else 0) else: y_true_bin = y_true sensitive_features = input_df[sensitive_col] mf = MetricFrame( metrics={ 'selection_rate': selection_rate, 'equal_opportunity': true_positive_rate }, y_true=y_true_bin, y_pred=y_pred, sensitive_features=sensitive_features ) fairness_metrics = { 'selection_rate': mf.by_group['selection_rate'].to_dict(), 'equal_opportunity': mf.by_group['equal_opportunity'].to_dict() } # Observations rates = mf.by_group['selection_rate'] max_group = rates.idxmax() min_group = rates.idxmin() diff = rates[max_group] - rates[min_group] fairness_observation = f"{max_group} group approval rate is {diff:.2%} higher than {min_group} group." if abs(diff) > 0.1: fairness_observation += " Mitigation recommended: Consider reweighting or post-processing." # Convert DataFrame to a list of dictionaries for JSON response results = input_df.to_dict('records') return jsonify({ 'predictions': results, 'metrics': evaluation_metrics, 'fairness_metrics': fairness_metrics, 'fairness_observation': fairness_observation }) except Exception as e: import traceback print(traceback.format_exc()) return jsonify({'error': f"Error processing file: {str(e)}"}), 500 return jsonify({'error': 'An unknown error occurred.'}), 500 # ============================================================================== # Step 5: Main function to train the model once and run the server # ============================================================================== def main(): """ Initializes the model and runs the Flask server. """ global model, train_features_columns, evaluation_metrics print("--- Starting the Nova Backend ---") print("Step 1: Loading and preprocessing data...") train_df, target_column = load_and_preprocess_data('catalyst_train.csv') if train_df is None: print("Please ensure 'catalyst_train.csv' exists. Exiting.") return print("Step 2: Training the model and evaluating performance...") model, X_test, y_test = train_model(train_df, target_column) train_features_columns = train_df.drop(columns=[target_column]).columns evaluation_metrics = evaluate_model(model, X_test, y_test) print("\nModel trained successfully! Metrics:") for key, value in evaluation_metrics.items(): print(f"- {key.capitalize()}: {value:.4f}") print("\n--- Starting Flask server on http://127.0.0.1:5000 ---") # This will serve the API, ready to accept requests from the frontend app.run(debug=True, port=5000, use_reloader=False) if __name__ == "__main__": main()