import streamlit as st import plotly.express as px import plotly.graph_objects as go import pandas as pd import numpy as np import pickle import sklearn from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from wordcloud import WordCloud from PIL import Image from imblearn.pipeline import Pipeline as ImbPipeline from imblearn.over_sampling import RandomOverSampler MODEL_PATHS = { "Random Forest": "models/rf_model_3.pkl", "Multinomial Naive Bayes": "models/mNB_model_3.pkl", "Support Vector Machine": "models/svm_model_3.pkl", "Complement Naive Bayes": "models/cNB_model_3.pkl", } @st.cache_resource def load_model(model_name): model_path = MODEL_PATHS[model_name] with open(model_path, "rb") as f: return pickle.load(f) sample_df = pd.read_csv("src/test.csv") eda_df = pd.read_csv("src/eda_df.csv") ml_df = pd.read_csv("src/cleaned_sms_text_labels_only_v2.csv") ## define feature importance per model and put it into a plotly chart when classifying def explain_prediction(model, text, model_name, top_n=10): # Unpack classifier as saved models are in made pipeline saved in mlflow if hasattr(model, "named_steps"): clf = model.named_steps.get("classifier") vectorizer = model.named_steps.get( "vectorizer" ) # i was having errors since im calling a different vectorizer! else: clf = model # Compute feature contributions for the chosen models; make contributions in absolute values X_input = vectorizer.transform([text]) feature_names = np.array(vectorizer.get_feature_names_out()) if not X_input.nnz: return pd.DataFrame( columns=["Token", "Contribution"] ) ## Handles edge cases like empty input if model_name == "Multinomial Naive Bayes": class_log_prob = clf.feature_log_prob_[1] - clf.feature_log_prob_[0] contributions = X_input.toarray()[0] * class_log_prob contributions = np.abs(contributions) elif model_name == "Complement Naive Bayes": class_log_prob = clf.feature_log_prob_[1] - clf.feature_log_prob_[0] contributions = X_input.toarray()[0] * class_log_prob contributions = np.abs(contributions) elif model_name == "Random Forest": contributions = clf.feature_importances_ * X_input.toarray()[0] contributions = np.abs(contributions) elif model_name == "Support Vector Machine": contributions = clf.coef_.toarray()[0] * X_input.toarray()[0] contributions = np.abs(contributions) else: return st.error("Invalid model name") token_scores = list(zip(feature_names, contributions)) token_scores = sorted(token_scores, key=lambda x: abs(x[1]), reverse=True) return pd.DataFrame(token_scores[:top_n], columns=["Token", "Contribution"]) ## --FOR THE WORDCLOUD AT EdATAB-- def get_wordcloud_data(text): wc = WordCloud( width=1200, height=600, background_color="white", colormap="viridis", max_font_size=60, prefer_horizontal=0.8, collocations=False, min_word_length=3, max_words=75, relative_scaling=0.5, contour_color="black", contour_width=0.2, ).generate(text) elements = [] for (word, freq), font_size, position, orientation, color in wc.layout_: elements.append((word, freq, position)) # position is already a tuple (x, y) return elements ## --FOR THE WORDCLOUD AT EdATAB-- def plot_wordcloud(elements, title): words, frequencies, positions = zip(*elements) x = [pos[0] for pos in positions] y = [-pos[1] for pos in positions] # Flip Y to display properly sizes = [freq * 200 for freq in frequencies] fig = go.Figure( data=[ go.Scatter( x=x, y=y, mode="text", text=words, textfont=dict(size=sizes), hoverinfo="text", textposition="middle center", ) ] ) fig.update_layout( title=dict(text=title, x=0.5), showlegend=False, xaxis=dict(showgrid=False, visible=False), yaxis=dict(showgrid=False, visible=False), margin=dict(l=20, r=20, t=50, b=20), ) st.plotly_chart(fig, use_container_width=True) with open("src/combined_stopwords.pkl", "rb") as f: combined_stopwords = pickle.load(f) vectorizer = TfidfVectorizer(stop_words=list(combined_stopwords), max_features=20) bow_cv = CountVectorizer(stop_words=list(combined_stopwords), max_features=1000) tfidf = TfidfVectorizer(stop_words=list(combined_stopwords), max_features=1000) ## ----- START OF APP ------ st.set_page_config( layout="wide", page_title="DATA103_filipino_spam_detection", page_icon="๐Ÿ˜Ž" ) st.title("Spam SMS Detection in the Filipino Context") ## SIDEBAR st.sidebar.markdown("---") st.sidebar.markdown( """ About the Team: Demo created by [Ferds Magallanes](https://ferds003.github.io), Hitika Motwani, Neil Penaflor, and Mark Abergos using Streamlit and Hugging Face Spaces. Purposes to demonstrate their NLP classification project for their minor data science class. Contributions: - Data Curation - Ferds, Hitika - EDA - Hitika, Neil - Features Selection and NLP_Training - Mark, Ferds - Eval and Demo - Ferds Acknowledgements: The team would like to thank Doc Jet Virtusio for the support and teachings he gave in our minor class :)) """ ) ## TABS PER PROJECT TASK ( DemoTAB, DataCurationTAB, EdATAB, FeatureSelectionTAB, TrainingPipelineTAB, ModelEvaluationTAB, ConTAB, ) = st.tabs( [ "Demo", "Data Curation", "EDA", "Feature Selection", "Training Pipeline", "Model Evaluation", "Conclusion and Recommendations", ] ) with DemoTAB: st.write("") st.markdown(""" Hi there! Input your sample sms messages for us to classify if it is spam or not. Correspondingly, we will provide what text (tokens) signify in percentage is spam or not. """) ## Provide user with sample spam and ham messages with st.expander( "๐Ÿ“‹ Try a sample message! This is from our test.csv so this data is not trained on our model" ): st.markdown( "Select a sample SMS message from below: Label is 0 for ham ๐Ÿ— and 1 for spam ๐Ÿฅซ" ) label_map = {0: "Ham", 1: "Spam"} sample_index = st.selectbox( "Select a sample SMS message", sample_df.index, format_func=lambda x: f"SMS {x} - {label_map[sample_df.loc[x, 'label']]}: {sample_df.loc[x, 'text'][:50]}", ) if st.button("Use this sample"): st.session_state["1"] = sample_df.loc[sample_index, "text"] ## Model selection selected_model_name = st.selectbox( "Select Classification Model", list(MODEL_PATHS.keys()) ) text = st.text_area("Enter SMS to classify here!", height=100, key="1") ## CASE WHEN BUTTON IS PRESSED if st.button("Classify"): if text: with st.spinner("Analyzing..."): clf = load_model(selected_model_name) prediction = clf.predict([text])[0] pred_proba = clf.predict_proba([text])[0] st.success(f"Prediction: {'Spam' if prediction == 1 else 'Not Spam'}") st.info(f"Probability of Spam: {pred_proba[1]:.2%}") st.info(f"Probability of Not Spam: {pred_proba[0]:.2%}") st.markdown("### Feature Importance") explain_df = explain_prediction(clf, text, selected_model_name) if ( explain_df is not None and not explain_df.empty ): ## calling the function fig1 = px.bar( explain_df, x="Contribution", y="Token", orientation="h", title="Top Contributing Tokens to Prediction", labels={"Contribution": "Impact Score"}, color="Contribution", color_continuous_scale="RdBu", template="plotly_dark", ) st.plotly_chart(fig1, use_container_width=True) print( "Top tokens:", explain_df.head(8) ) ## DEBUGGING LINE; Can be checked on streamlit terminal else: st.warning("Unable to compute token contribution for this model.") else: st.warning("Please input text to classify.") st.markdown("---") st.markdown(""" ## Changelogs: - Version 2 (August 2, 2025): Improvements across `precision` and `recall` metrics on training by random oversampling ham classes on `X_train` in training pipeline using `imbalanced-learn`package. Latest deployed models trained under these run params. - Version 1 (July 28, 2025): Initial demo of the project with 4 traditional ML classifiers using TFIDF vectorizer. """) with DataCurationTAB: st.markdown(""" Data cleaning and pre-processing is necessary as we are considering three datasets with different contexts. Below is a summary of the data treatment and insights done to make the versions of the dataset. We avoided the use of the UCL SMS repository for this project as this does not capture the filipino context. - For [Dataset 1](https://www.kaggle.com/datasets/scottleechua/ph-spam-marketing-sms-w-timestamps): - drop any null values; drop any full redactions done in `text` column through regex. Drops 74% of the dataset as text sms data is salient to the project. - checked any redactions of the similar <> format within `text` feature. Concluded that any other text with <> format are coming from spam and ads category - Drops `date_read` column. Renamed `date_received` column - Made a label columns that considers the text as label `spam` if it is within the category `spam` and `ads` based on `category` column - applied `get_carrier` function to get sms local provider. - For [Dataset 2](https://www.kaggle.com/datasets/bwandowando/philippine-spam-sms-messages): - drop any null values; all data will be considered under the label `spam` for its sms text messages data. - checked any redactions of the similar <> format within `text` feature. found `` redactions; replaced it with a blankspace - dropped `hashed_cellphone_number` and `carrier` column to apply own `get_carrier` function that considers also DITO sms provider. - renamed column `masked_celphone_number` to `sender` and `date` to `date_received` similar to dataset 1. - For [Dataset 3](https://github.com/Yissuh/Filipino-Spam-SMS-Detection-Model/blob/main/data-set.csv): - drop any null values; dropped any `<>|<#>` tags and any other tags that are labeled under ham messages. - renamed column `message` to `text` in conformity with other datasets. - checked any redactions of the similar <> format within `text` feature. found ``, ``, ``, ``, `