Spaces:
Sleeping
Sleeping
soupstick
commited on
Commit
·
ccb470a
1
Parent(s):
112f78a
refactor: scaffold modular split (agent, tools, modules/*, llm_provider, mcp, validation, security)
Browse files- agent.py +1 -0
- app.py +1 -670
- app_monolith_backup.py +670 -0
- llm_provider.py +1 -0
- mcp.py +1 -0
- modules/__init__.py +0 -0
- modules/credit.py +1 -0
- modules/kyc.py +1 -0
- modules/sanctions.py +1 -0
- modules/transactions.py +1 -0
- threat_intel.py +1 -0
- tools.py +1 -0
- ttp_guard.py +1 -0
- validation.py +1 -0
agent.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement agent.py (split from app_monolith_backup.py)"""
|
app.py
CHANGED
|
@@ -1,670 +1 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Fraud Detector Analyst — LangChain + (optional) MCP
|
| 3 |
-
Advanced “prototype-first” build:
|
| 4 |
-
- Chat uses chat-completion models (LangChain ChatHuggingFace).
|
| 5 |
-
- AI Summary shows a notice when no inference is connected.
|
| 6 |
-
|
| 7 |
-
LLM env (serverless friendly):
|
| 8 |
-
HF_TOKEN (or HF_SPACES)
|
| 9 |
-
LC_CHAT_MODEL (default: "Qwen/Qwen2.5-0.5B-Instruct")
|
| 10 |
-
LC_CHAT_MODEL_FALLBACK (default: "mistralai/Mistral-7B-Instruct")
|
| 11 |
-
|
| 12 |
-
Summary behavior:
|
| 13 |
-
If no working inference/token -> summary fields display:
|
| 14 |
-
"🔌 Please connect to an inference point to generate summary."
|
| 15 |
-
|
| 16 |
-
Optional MCP:
|
| 17 |
-
ENABLE_MCP=1
|
| 18 |
-
MCP_SANCTIONS_URL, MCP_HIGH_RISK_MCC_URL
|
| 19 |
-
MCP_AUTH_HEADER="Authorization: Bearer <token>"
|
| 20 |
-
|
| 21 |
-
Run:
|
| 22 |
-
pip install -r requirements.txt
|
| 23 |
-
python app.py
|
| 24 |
-
On Spaces:
|
| 25 |
-
Add secret HF_TOKEN (or HF_SPACES). Launch.
|
| 26 |
-
"""
|
| 27 |
-
|
| 28 |
-
from __future__ import annotations
|
| 29 |
-
|
| 30 |
-
import os, io, re, json, math, unicodedata, logging
|
| 31 |
-
from typing import Optional, Tuple, List, Dict
|
| 32 |
-
|
| 33 |
-
import numpy as np
|
| 34 |
-
import pandas as pd
|
| 35 |
-
import gradio as gr
|
| 36 |
-
from dotenv import load_dotenv
|
| 37 |
-
|
| 38 |
-
# LangChain
|
| 39 |
-
from langchain.tools import tool
|
| 40 |
-
from langchain_core.tools import Tool
|
| 41 |
-
from langchain.agents import initialize_agent, AgentType
|
| 42 |
-
from langchain.schema import HumanMessage, SystemMessage
|
| 43 |
-
|
| 44 |
-
from pydantic import BaseModel, Field
|
| 45 |
-
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
|
| 46 |
-
|
| 47 |
-
# Phone normalization
|
| 48 |
-
try:
|
| 49 |
-
import phonenumbers
|
| 50 |
-
HAVE_PHONENUM = True
|
| 51 |
-
except Exception:
|
| 52 |
-
HAVE_PHONENUM = False
|
| 53 |
-
|
| 54 |
-
# ------------------------
|
| 55 |
-
# Setup
|
| 56 |
-
# ------------------------
|
| 57 |
-
load_dotenv()
|
| 58 |
-
logging.basicConfig(level=logging.INFO)
|
| 59 |
-
log = logging.getLogger("fraud-analyst")
|
| 60 |
-
|
| 61 |
-
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HF_SPACES")
|
| 62 |
-
|
| 63 |
-
# Chat models (chat-completions)
|
| 64 |
-
DEFAULT_CHAT_MODEL = os.getenv("LC_CHAT_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
|
| 65 |
-
FALLBACK_CHAT_MODEL = os.getenv("LC_CHAT_MODEL_FALLBACK", "mistralai/Mistral-7B-Instruct")
|
| 66 |
-
|
| 67 |
-
SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
|
| 68 |
-
CHAT_NOTICE = "🔌 Chat model not configured. Set HF_TOKEN and LC_CHAT_MODEL to enable chat."
|
| 69 |
-
|
| 70 |
-
# ------------------------
|
| 71 |
-
# LLM builders
|
| 72 |
-
# ------------------------
|
| 73 |
-
def _mk_chat_llm(model_id: str) -> ChatHuggingFace:
|
| 74 |
-
"""
|
| 75 |
-
ChatHuggingFace uses HF Inference under the hood.
|
| 76 |
-
Although the backend task is 'text-generation', this wrapper handles chat-style messages.
|
| 77 |
-
"""
|
| 78 |
-
base = HuggingFaceEndpoint(
|
| 79 |
-
repo_id=model_id,
|
| 80 |
-
task="text-generation",
|
| 81 |
-
huggingfacehub_api_token=HF_TOKEN,
|
| 82 |
-
max_new_tokens=256,
|
| 83 |
-
temperature=0.2,
|
| 84 |
-
repetition_penalty=1.05,
|
| 85 |
-
timeout=60,
|
| 86 |
-
)
|
| 87 |
-
return ChatHuggingFace(llm=base)
|
| 88 |
-
|
| 89 |
-
def _heartbeat_chat(model_id: str) -> bool:
|
| 90 |
-
try:
|
| 91 |
-
chat = _mk_chat_llm(model_id)
|
| 92 |
-
_ = chat.invoke([HumanMessage(content="ok")])
|
| 93 |
-
return True
|
| 94 |
-
except Exception as e:
|
| 95 |
-
log.warning(f"Heartbeat failed for {model_id}: {str(e)[:160]}")
|
| 96 |
-
return False
|
| 97 |
-
|
| 98 |
-
def build_chat_llm() -> Optional[ChatHuggingFace]:
|
| 99 |
-
"""
|
| 100 |
-
Returns a working ChatHuggingFace or None (if token/permissions missing).
|
| 101 |
-
"""
|
| 102 |
-
log.info(f"HF token present: {bool(HF_TOKEN)} len={len(HF_TOKEN) if HF_TOKEN else 0}")
|
| 103 |
-
if HF_TOKEN and _heartbeat_chat(DEFAULT_CHAT_MODEL):
|
| 104 |
-
log.info(f"Using chat model: {DEFAULT_CHAT_MODEL}")
|
| 105 |
-
return _mk_chat_llm(DEFAULT_CHAT_MODEL)
|
| 106 |
-
if HF_TOKEN and _heartbeat_chat(FALLBACK_CHAT_MODEL):
|
| 107 |
-
log.info(f"Using fallback chat model: {FALLBACK_CHAT_MODEL}")
|
| 108 |
-
return _mk_chat_llm(FALLBACK_CHAT_MODEL)
|
| 109 |
-
log.warning("No working chat model; chat will show a notice.")
|
| 110 |
-
return None
|
| 111 |
-
|
| 112 |
-
CHAT_LLM = build_chat_llm()
|
| 113 |
-
|
| 114 |
-
# ------------------------
|
| 115 |
-
# Normalization helpers
|
| 116 |
-
# ------------------------
|
| 117 |
-
def _norm_colname(c: str) -> str:
|
| 118 |
-
c = c.strip().lower()
|
| 119 |
-
c = re.sub(r"\s+", "_", c)
|
| 120 |
-
c = re.sub(r"[^\w]+", "_", c)
|
| 121 |
-
return c.strip("_")
|
| 122 |
-
|
| 123 |
-
def _nfkc(s: str) -> str:
|
| 124 |
-
return unicodedata.normalize("NFKC", s)
|
| 125 |
-
|
| 126 |
-
def _collapse_ws(s: str) -> str:
|
| 127 |
-
return re.sub(r"\s+", " ", s).strip()
|
| 128 |
-
|
| 129 |
-
def _clean_str(x):
|
| 130 |
-
if pd.isna(x): return x
|
| 131 |
-
return _collapse_ws(_nfkc(str(x)))
|
| 132 |
-
|
| 133 |
-
def _is_email(s: str) -> bool:
|
| 134 |
-
return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
|
| 135 |
-
|
| 136 |
-
def _clean_phone(s: str, default_region: str = "IN"):
|
| 137 |
-
if s is None or str(s).strip() == "":
|
| 138 |
-
return None, "missing_phone"
|
| 139 |
-
raw = re.sub(r"[^\d+]", "", str(s))
|
| 140 |
-
if HAVE_PHONENUM:
|
| 141 |
-
try:
|
| 142 |
-
pn = phonenumbers.parse(raw, default_region)
|
| 143 |
-
if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
|
| 144 |
-
return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
|
| 145 |
-
return raw, "invalid_phone"
|
| 146 |
-
except Exception:
|
| 147 |
-
return raw, "invalid_phone"
|
| 148 |
-
digits = re.sub(r"\D", "", raw)
|
| 149 |
-
return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
|
| 150 |
-
|
| 151 |
-
def _parse_datetime(s):
|
| 152 |
-
try:
|
| 153 |
-
return pd.to_datetime(s, errors="coerce", utc=True)
|
| 154 |
-
except Exception:
|
| 155 |
-
return pd.NaT
|
| 156 |
-
|
| 157 |
-
def _to_numeric(series: pd.Series):
|
| 158 |
-
coerced = pd.to_numeric(series, errors="coerce")
|
| 159 |
-
return coerced, (coerced.isna() & series.notna())
|
| 160 |
-
|
| 161 |
-
def _read_csv_any(file_obj) -> pd.DataFrame:
|
| 162 |
-
if file_obj is None:
|
| 163 |
-
raise ValueError("No file uploaded.")
|
| 164 |
-
if hasattr(file_obj, "name"):
|
| 165 |
-
p = file_obj.name
|
| 166 |
-
try: return pd.read_csv(p)
|
| 167 |
-
except Exception: return pd.read_csv(p, encoding="latin-1")
|
| 168 |
-
try: return pd.read_csv(file_obj)
|
| 169 |
-
except Exception:
|
| 170 |
-
file_obj.seek(0)
|
| 171 |
-
return pd.read_csv(file_obj, encoding="latin-1")
|
| 172 |
-
|
| 173 |
-
def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
|
| 174 |
-
df = df.copy()
|
| 175 |
-
df.columns = [_norm_colname(c) for c in df.columns]
|
| 176 |
-
for c in df.select_dtypes(include=["object"]).columns:
|
| 177 |
-
df[c] = df[c].apply(_clean_str)
|
| 178 |
-
return df
|
| 179 |
-
|
| 180 |
-
def _prepare_generic(df: pd.DataFrame, expected: Dict[str, List[str]]):
|
| 181 |
-
issues = []
|
| 182 |
-
df0 = _standardize_df(df)
|
| 183 |
-
|
| 184 |
-
# Synonym mapping
|
| 185 |
-
colmap = {}
|
| 186 |
-
cols = set(df0.columns)
|
| 187 |
-
for canon, syns in expected.items():
|
| 188 |
-
found = None
|
| 189 |
-
for s in [canon] + syns:
|
| 190 |
-
s = _norm_colname(s)
|
| 191 |
-
if s in cols:
|
| 192 |
-
found = s; break
|
| 193 |
-
if found: colmap[canon] = found
|
| 194 |
-
|
| 195 |
-
# Email/phone quality
|
| 196 |
-
for c in list(df0.columns):
|
| 197 |
-
if "email" in c:
|
| 198 |
-
df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
|
| 199 |
-
for idx, v in df0[c].items():
|
| 200 |
-
if pd.isna(v) or str(v).strip()=="":
|
| 201 |
-
issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
|
| 202 |
-
elif not _is_email(v):
|
| 203 |
-
issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
|
| 204 |
-
if "phone" in c or "mobile" in c:
|
| 205 |
-
vals = []
|
| 206 |
-
for idx, v in df0[c].items():
|
| 207 |
-
e164, prob = _clean_phone(v)
|
| 208 |
-
vals.append(e164)
|
| 209 |
-
if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
|
| 210 |
-
df0[c] = vals
|
| 211 |
-
|
| 212 |
-
# Datetime parsing
|
| 213 |
-
for c in df0.columns:
|
| 214 |
-
if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
|
| 215 |
-
parsed = _parse_datetime(df0[c])
|
| 216 |
-
bad = parsed.isna() & df0[c].notna()
|
| 217 |
-
for idx in df0.index[bad]:
|
| 218 |
-
issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
|
| 219 |
-
df0[c] = parsed
|
| 220 |
-
|
| 221 |
-
# Numeric coercions for common fields
|
| 222 |
-
for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
|
| 223 |
-
for c in df0.columns:
|
| 224 |
-
if c == nc or c.endswith("_"+nc) or nc in c:
|
| 225 |
-
coerced, badmask = _to_numeric(df0[c])
|
| 226 |
-
for idx in df0.index[badmask]:
|
| 227 |
-
issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
|
| 228 |
-
df0[c] = coerced
|
| 229 |
-
|
| 230 |
-
issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
|
| 231 |
-
missing = [k for k in expected.keys() if k not in colmap]
|
| 232 |
-
quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
|
| 233 |
-
return df0, issues_df, quality_summary, colmap
|
| 234 |
-
|
| 235 |
-
# ------------------------
|
| 236 |
-
# Modules & Rules
|
| 237 |
-
# ------------------------
|
| 238 |
-
TX_EXPECTED = {
|
| 239 |
-
"transaction_id":["txn_id","transactionid","id","tx_id"],
|
| 240 |
-
"customer_id":["cust_id","user_id","client_id"],
|
| 241 |
-
"amount":["amt","amount_inr","value"],
|
| 242 |
-
"timestamp":["date","event_time","created_at","tx_time"],
|
| 243 |
-
"merchant_category":["mcc","merchant_cat","category"]
|
| 244 |
-
}
|
| 245 |
-
def prepare_transactions(df): return _prepare_generic(df, TX_EXPECTED)
|
| 246 |
-
|
| 247 |
-
def detect_transactions(clean_df, colmap, high_risk_mcc: Optional[List[str]]=None):
|
| 248 |
-
high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
|
| 249 |
-
if high_risk_mcc:
|
| 250 |
-
high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
|
| 251 |
-
if not all(k in colmap for k in ["customer_id","amount"]):
|
| 252 |
-
return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
|
| 253 |
-
df = clean_df.copy()
|
| 254 |
-
reasons = []
|
| 255 |
-
amtcol = colmap.get("amount")
|
| 256 |
-
if amtcol is not None:
|
| 257 |
-
reasons.append(("large_amount>10k", df[amtcol] > 10000))
|
| 258 |
-
reasons.append(("negative_amount", df[amtcol] < 0))
|
| 259 |
-
if "merchant_category" in colmap:
|
| 260 |
-
mcc = colmap["merchant_category"]
|
| 261 |
-
high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
|
| 262 |
-
reasons.append(("merchant_category_high_risk", high))
|
| 263 |
-
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
| 264 |
-
cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
|
| 265 |
-
daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
|
| 266 |
-
reasons.append(("daily_sum_per_customer>50k", daily > 50000))
|
| 267 |
-
mask = None
|
| 268 |
-
for _, m in reasons:
|
| 269 |
-
mask = m if mask is None else (mask | m)
|
| 270 |
-
flagged = df[mask] if mask is not None else pd.DataFrame()
|
| 271 |
-
if not flagged.empty:
|
| 272 |
-
rr=[]
|
| 273 |
-
for _, row in flagged.iterrows():
|
| 274 |
-
hits=[]
|
| 275 |
-
a = row[amtcol] if amtcol in flagged.columns else None
|
| 276 |
-
if pd.notna(a) and a>10000: hits.append("large_amount")
|
| 277 |
-
if pd.notna(a) and a<0: hits.append("negative_amount")
|
| 278 |
-
if "merchant_category" in colmap:
|
| 279 |
-
val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
|
| 280 |
-
if val in high_risk: hits.append("mcc_high_risk")
|
| 281 |
-
# daily sum check reconstructed
|
| 282 |
-
try:
|
| 283 |
-
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
| 284 |
-
sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
|
| 285 |
-
(df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
|
| 286 |
-
if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
|
| 287 |
-
except Exception: pass
|
| 288 |
-
rr.append(", ".join(sorted(set(hits))) or "rule_hit")
|
| 289 |
-
flagged = flagged.assign(risk_reason=rr)
|
| 290 |
-
stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
|
| 291 |
-
return flagged, stats
|
| 292 |
-
|
| 293 |
-
KYC_EXPECTED = {
|
| 294 |
-
"customer_id":["cust_id","user_id","client_id"],
|
| 295 |
-
"name":["full_name","customer_name"],
|
| 296 |
-
"email":["email_address","mail"],
|
| 297 |
-
"phone":["phone_number","mobile","contact"],
|
| 298 |
-
"dob":["date_of_birth","birthdate"]
|
| 299 |
-
}
|
| 300 |
-
def prepare_kyc(df): return _prepare_generic(df, KYC_EXPECTED)
|
| 301 |
-
|
| 302 |
-
def _age_years(dob: pd.Series) -> pd.Series:
|
| 303 |
-
now = pd.Timestamp.utcnow()
|
| 304 |
-
return (now - dob).dt.days / 365.25
|
| 305 |
-
|
| 306 |
-
def detect_kyc(clean_df, colmap):
|
| 307 |
-
if not all(k in colmap for k in ["customer_id","name"]):
|
| 308 |
-
return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
|
| 309 |
-
df = clean_df.copy()
|
| 310 |
-
reasons=[]
|
| 311 |
-
if "email" in colmap:
|
| 312 |
-
dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
|
| 313 |
-
reasons.append(("duplicate_email", dupe_email))
|
| 314 |
-
if "phone" in colmap:
|
| 315 |
-
dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
|
| 316 |
-
reasons.append(("duplicate_phone", dupe_phone))
|
| 317 |
-
if "dob" in colmap:
|
| 318 |
-
age = _age_years(df[colmap["dob"]])
|
| 319 |
-
invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
|
| 320 |
-
reasons.append(("invalid_dob", invalid))
|
| 321 |
-
if "name" in colmap:
|
| 322 |
-
name = df[colmap["name"]].astype(str)
|
| 323 |
-
susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
|
| 324 |
-
reasons.append(("suspicious_name", susp))
|
| 325 |
-
mask = None
|
| 326 |
-
for _, m in reasons:
|
| 327 |
-
mask = m if mask is None else (mask | m)
|
| 328 |
-
flagged = df[mask] if mask is not None else pd.DataFrame()
|
| 329 |
-
if not flagged.empty:
|
| 330 |
-
flagged = flagged.assign(risk_reason="kyc_rule_hit")
|
| 331 |
-
stats = f"KYC flagged: {len(flagged)} of {len(df)}."
|
| 332 |
-
return flagged, stats
|
| 333 |
-
|
| 334 |
-
SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
|
| 335 |
-
def prepare_sanctions(df): return _prepare_generic(df, SAN_EXPECTED)
|
| 336 |
-
|
| 337 |
-
DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
|
| 338 |
-
|
| 339 |
-
def token_overlap(a: str, b: str) -> int:
|
| 340 |
-
at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
|
| 341 |
-
bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
|
| 342 |
-
return len(at & bt)
|
| 343 |
-
|
| 344 |
-
def detect_sanctions(clean_df, colmap, sanctions_df: Optional[pd.DataFrame]=None):
|
| 345 |
-
if "name" not in colmap:
|
| 346 |
-
return pd.DataFrame(), "Required column missing for Sanctions (need name)."
|
| 347 |
-
df = clean_df.copy()
|
| 348 |
-
sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
|
| 349 |
-
sanc = _standardize_df(sanc)
|
| 350 |
-
if "name" not in sanc.columns:
|
| 351 |
-
for c in sanc.columns:
|
| 352 |
-
if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
|
| 353 |
-
sanc_names = sanc["name"].dropna().astype(str).tolist()
|
| 354 |
-
matches=[]
|
| 355 |
-
for idx, row in df.iterrows():
|
| 356 |
-
nm = str(row[colmap["name"]] or "").strip()
|
| 357 |
-
if not nm: continue
|
| 358 |
-
if any(nm.lower()==s.lower() for s in sanc_names):
|
| 359 |
-
matches.append((idx,"exact")); continue
|
| 360 |
-
if any(token_overlap(nm, s) >= 2 for s in sanc_names):
|
| 361 |
-
matches.append((idx,"fuzzy"))
|
| 362 |
-
flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
|
| 363 |
-
if not flagged.empty:
|
| 364 |
-
mt = {i:t for i,t in matches}
|
| 365 |
-
flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
|
| 366 |
-
stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
|
| 367 |
-
return flagged, stats
|
| 368 |
-
|
| 369 |
-
CR_EXPECTED = {
|
| 370 |
-
"customer_id":["cust_id","user_id","client_id"],
|
| 371 |
-
"credit_score":["creditscore","score"],
|
| 372 |
-
"utilization":["util","credit_utilization","utilization_ratio"],
|
| 373 |
-
"dti":["debt_to_income","debt_to_income_ratio"],
|
| 374 |
-
"recent_defaults":["defaults","recentdefaults"],
|
| 375 |
-
"income":["annual_income","salary"]
|
| 376 |
-
}
|
| 377 |
-
def prepare_credit(df): return _prepare_generic(df, CR_EXPECTED)
|
| 378 |
-
|
| 379 |
-
def detect_credit(clean_df, colmap):
|
| 380 |
-
needed = ["credit_score","utilization","dti","recent_defaults","income"]
|
| 381 |
-
if not any(k in colmap for k in needed):
|
| 382 |
-
return pd.DataFrame(), "Required columns missing for Credit Risk."
|
| 383 |
-
df = clean_df.copy()
|
| 384 |
-
cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
|
| 385 |
-
util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
|
| 386 |
-
dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
|
| 387 |
-
rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
|
| 388 |
-
inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
|
| 389 |
-
out=[]
|
| 390 |
-
for i in range(len(df)):
|
| 391 |
-
hits=0; reasons=[]
|
| 392 |
-
if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
|
| 393 |
-
if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
|
| 394 |
-
if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
|
| 395 |
-
if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
|
| 396 |
-
if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
|
| 397 |
-
level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
|
| 398 |
-
out.append((hits, level, ", ".join(reasons)))
|
| 399 |
-
risk_score=[x[0] for x in out]; risk_level=[x[1] for x in out]; reason=[x[2] for x in out]
|
| 400 |
-
res = df.assign(risk_score=risk_score, risk_level=risk_level, risk_reason=reason)
|
| 401 |
-
flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
|
| 402 |
-
stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
|
| 403 |
-
return flagged, stats
|
| 404 |
-
|
| 405 |
-
# ------------------------
|
| 406 |
-
# Summarizer (notice-first)
|
| 407 |
-
# ------------------------
|
| 408 |
-
SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
|
| 409 |
-
|
| 410 |
-
def summarize_ai(context: str) -> str:
|
| 411 |
-
"""
|
| 412 |
-
If chat LLM is available, use it to generate a short summary.
|
| 413 |
-
Otherwise return the prototype notice string.
|
| 414 |
-
"""
|
| 415 |
-
if CHAT_LLM is None:
|
| 416 |
-
return SUMMARY_NOTICE
|
| 417 |
-
try:
|
| 418 |
-
out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
|
| 419 |
-
if hasattr(out, "content"): return out.content
|
| 420 |
-
return str(out)
|
| 421 |
-
except Exception as e:
|
| 422 |
-
msg = str(e)
|
| 423 |
-
if "401" in msg or "403" in msg:
|
| 424 |
-
return SUMMARY_NOTICE
|
| 425 |
-
return SUMMARY_NOTICE
|
| 426 |
-
|
| 427 |
-
# ------------------------
|
| 428 |
-
# Optional MCP
|
| 429 |
-
# ------------------------
|
| 430 |
-
from urllib.request import Request, urlopen
|
| 431 |
-
def _mcp_get_json(url: str, auth_header: Optional[str]):
|
| 432 |
-
try:
|
| 433 |
-
req = Request(url)
|
| 434 |
-
if auth_header:
|
| 435 |
-
k, v = auth_header.split(":", 1)
|
| 436 |
-
req.add_header(k.strip(), v.strip())
|
| 437 |
-
with urlopen(req, timeout=10) as r:
|
| 438 |
-
return json.loads(r.read().decode("utf-8"))
|
| 439 |
-
except Exception as e:
|
| 440 |
-
log.warning(f"MCP fetch failed: {e}")
|
| 441 |
-
return None
|
| 442 |
-
|
| 443 |
-
def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
|
| 444 |
-
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
| 445 |
-
url = os.getenv("MCP_SANCTIONS_URL")
|
| 446 |
-
if not url: return None
|
| 447 |
-
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
| 448 |
-
if not data: return None
|
| 449 |
-
if isinstance(data, list):
|
| 450 |
-
if all(isinstance(x, dict) for x in data):
|
| 451 |
-
rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
|
| 452 |
-
return pd.DataFrame(rows) if rows else None
|
| 453 |
-
if all(isinstance(x, str) for x in data):
|
| 454 |
-
return pd.DataFrame({"name": data})
|
| 455 |
-
return None
|
| 456 |
-
|
| 457 |
-
def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
|
| 458 |
-
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
| 459 |
-
url = os.getenv("MCP_HIGH_RISK_MCC_URL")
|
| 460 |
-
if not url: return None
|
| 461 |
-
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
| 462 |
-
return [str(x) for x in data] if isinstance(data, list) else None
|
| 463 |
-
|
| 464 |
-
# ------------------------
|
| 465 |
-
# Pipelines (per tab)
|
| 466 |
-
# ------------------------
|
| 467 |
-
def run_transactions(file):
|
| 468 |
-
try:
|
| 469 |
-
df = _read_csv_any(file)
|
| 470 |
-
clean, issues, quality, colmap = prepare_transactions(df)
|
| 471 |
-
mcc = mcp_fetch_high_risk_mcc()
|
| 472 |
-
flagged, stats = detect_transactions(clean, colmap, mcc)
|
| 473 |
-
ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 474 |
-
ai = summarize_ai(ctx)
|
| 475 |
-
return ai, stats, flagged, issues
|
| 476 |
-
except Exception as e:
|
| 477 |
-
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 478 |
-
|
| 479 |
-
def run_kyc(file):
|
| 480 |
-
try:
|
| 481 |
-
df = _read_csv_any(file)
|
| 482 |
-
clean, issues, quality, colmap = prepare_kyc(df)
|
| 483 |
-
flagged, stats = detect_kyc(clean, colmap)
|
| 484 |
-
ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 485 |
-
ai = summarize_ai(ctx)
|
| 486 |
-
return ai, stats, flagged, issues
|
| 487 |
-
except Exception as e:
|
| 488 |
-
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 489 |
-
|
| 490 |
-
def run_sanctions(customers_file, sanctions_file):
|
| 491 |
-
try:
|
| 492 |
-
df = _read_csv_any(customers_file)
|
| 493 |
-
clean, issues, quality, colmap = prepare_sanctions(df)
|
| 494 |
-
sanc_df = mcp_fetch_sanctions()
|
| 495 |
-
if sanc_df is None and sanctions_file is not None:
|
| 496 |
-
sanc_df = _read_csv_any(sanctions_file)
|
| 497 |
-
flagged, stats = detect_sanctions(clean, colmap, sanc_df)
|
| 498 |
-
ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
|
| 499 |
-
ai = summarize_ai(ctx)
|
| 500 |
-
return ai, stats, flagged, issues
|
| 501 |
-
except Exception as e:
|
| 502 |
-
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 503 |
-
|
| 504 |
-
def run_credit(file):
|
| 505 |
-
try:
|
| 506 |
-
df = _read_csv_any(file)
|
| 507 |
-
clean, issues, quality, colmap = prepare_credit(df)
|
| 508 |
-
flagged, stats = detect_credit(clean, colmap)
|
| 509 |
-
ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 510 |
-
ai = summarize_ai(ctx)
|
| 511 |
-
return ai, stats, flagged, issues
|
| 512 |
-
except Exception as e:
|
| 513 |
-
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 514 |
-
|
| 515 |
-
# ------------------------
|
| 516 |
-
# Tools (CSV text in → concise text out)
|
| 517 |
-
# ------------------------
|
| 518 |
-
def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
|
| 519 |
-
return pd.read_csv(io.StringIO(csv_text))
|
| 520 |
-
|
| 521 |
-
class TransactionCSVInput(BaseModel):
|
| 522 |
-
csv_text: str = Field(..., description="Transactions CSV text")
|
| 523 |
-
|
| 524 |
-
@tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
|
| 525 |
-
def transactions_fraud_tool(csv_text: str) -> str:
|
| 526 |
-
df = _csv_text_to_df(csv_text)
|
| 527 |
-
clean, issues, quality, colmap = prepare_transactions(df)
|
| 528 |
-
flagged, stats = detect_transactions(clean, colmap)
|
| 529 |
-
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 530 |
-
|
| 531 |
-
class KYCCSVInput(BaseModel):
|
| 532 |
-
csv_text: str = Field(..., description="KYC CSV text")
|
| 533 |
-
|
| 534 |
-
@tool("kyc_fraud_tool", args_schema=KYCCSVInput)
|
| 535 |
-
def kyc_fraud_tool(csv_text: str) -> str:
|
| 536 |
-
df = _csv_text_to_df(csv_text)
|
| 537 |
-
clean, issues, quality, colmap = prepare_kyc(df)
|
| 538 |
-
flagged, stats = detect_kyc(clean, colmap)
|
| 539 |
-
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 540 |
-
|
| 541 |
-
class SanctionsCSVInput(BaseModel):
|
| 542 |
-
csv_text: str = Field(..., description="Customers CSV text with a 'name' column")
|
| 543 |
-
|
| 544 |
-
@tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
|
| 545 |
-
def sanctions_pep_tool(csv_text: str) -> str:
|
| 546 |
-
df = _csv_text_to_df(csv_text)
|
| 547 |
-
clean, issues, quality, colmap = prepare_sanctions(df)
|
| 548 |
-
flagged, stats = detect_sanctions(clean, colmap)
|
| 549 |
-
return f"{stats}\nDQ issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 550 |
-
|
| 551 |
-
class CreditCSVInput(BaseModel):
|
| 552 |
-
csv_text: str = Field(..., description="Credit CSV text")
|
| 553 |
-
|
| 554 |
-
@tool("credit_risk_tool", args_schema=CreditCSVInput)
|
| 555 |
-
def credit_risk_tool(csv_text: str) -> str:
|
| 556 |
-
df = _csv_text_to_df(csv_text)
|
| 557 |
-
clean, issues, quality, colmap = prepare_credit(df)
|
| 558 |
-
flagged, stats = detect_credit(clean, colmap)
|
| 559 |
-
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 560 |
-
|
| 561 |
-
TOOLS: List[Tool] = [
|
| 562 |
-
transactions_fraud_tool,
|
| 563 |
-
kyc_fraud_tool,
|
| 564 |
-
sanctions_pep_tool,
|
| 565 |
-
credit_risk_tool,
|
| 566 |
-
]
|
| 567 |
-
|
| 568 |
-
# ------------------------
|
| 569 |
-
# Agent (chat-completions)
|
| 570 |
-
# ------------------------
|
| 571 |
-
AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
|
| 572 |
-
You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
|
| 573 |
-
If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
|
| 574 |
-
Be concise and actionable."""
|
| 575 |
-
|
| 576 |
-
def build_agent():
|
| 577 |
-
if CHAT_LLM is None:
|
| 578 |
-
class Stub:
|
| 579 |
-
def invoke(self, prompt): return CHAT_NOTICE
|
| 580 |
-
return Stub()
|
| 581 |
-
return initialize_agent(
|
| 582 |
-
TOOLS,
|
| 583 |
-
CHAT_LLM,
|
| 584 |
-
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
| 585 |
-
verbose=False,
|
| 586 |
-
agent_kwargs={"system_message": AGENT_SYSTEM},
|
| 587 |
-
handle_parsing_errors=True,
|
| 588 |
-
)
|
| 589 |
-
|
| 590 |
-
AGENT = build_agent()
|
| 591 |
-
|
| 592 |
-
def agent_reply(history: List[Dict], user_msg: str):
|
| 593 |
-
try:
|
| 594 |
-
looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
|
| 595 |
-
prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
|
| 596 |
-
res = AGENT.invoke(prompt)
|
| 597 |
-
if isinstance(res, dict) and "output" in res: return res["output"]
|
| 598 |
-
return str(res)
|
| 599 |
-
except Exception as e:
|
| 600 |
-
return f"Agent error: {e}"
|
| 601 |
-
|
| 602 |
-
# ------------------------
|
| 603 |
-
# UI
|
| 604 |
-
# ------------------------
|
| 605 |
-
with gr.Blocks(title="Fraud Detector Analyst — LangChain + MCP", theme=gr.themes.Soft()) as demo:
|
| 606 |
-
gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + MCP")
|
| 607 |
-
gr.Markdown(
|
| 608 |
-
"This prototype runs **rules & data checks locally**. "
|
| 609 |
-
"Chat + AI summaries require a remote inference provider (HF Inference)."
|
| 610 |
-
)
|
| 611 |
-
|
| 612 |
-
with gr.Tabs():
|
| 613 |
-
with gr.Tab("Transactions"):
|
| 614 |
-
gr.Markdown("Upload a **transactions** CSV.")
|
| 615 |
-
tx_file = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
|
| 616 |
-
tx_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 617 |
-
tx_stats = gr.Textbox(label="Stats", lines=3)
|
| 618 |
-
tx_flagged = gr.Dataframe(label="Flagged Transactions")
|
| 619 |
-
tx_issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
|
| 620 |
-
tx_file.upload(run_transactions, inputs=[tx_file], outputs=[tx_ai, tx_stats, tx_flagged, tx_issues])
|
| 621 |
-
|
| 622 |
-
with gr.Tab("KYC"):
|
| 623 |
-
gr.Markdown("Upload a **KYC** CSV.")
|
| 624 |
-
kyc_file = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
|
| 625 |
-
kyc_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 626 |
-
kyc_stats = gr.Textbox(label="Stats", lines=3)
|
| 627 |
-
kyc_flagged = gr.Dataframe(label="Flagged KYC Rows")
|
| 628 |
-
kyc_issues = gr.Dataframe(label="Data Quality Issues")
|
| 629 |
-
kyc_file.upload(run_kyc, inputs=[kyc_file], outputs=[kyc_ai, kyc_stats, kyc_flagged, kyc_issues])
|
| 630 |
-
|
| 631 |
-
with gr.Tab("Sanctions/PEP"):
|
| 632 |
-
gr.Markdown("Upload **customers** CSV (+ optional sanctions CSV).")
|
| 633 |
-
san_customers = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
|
| 634 |
-
san_list = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
|
| 635 |
-
san_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 636 |
-
san_stats = gr.Textbox(label="Stats", lines=3)
|
| 637 |
-
san_flagged = gr.Dataframe(label="Matches")
|
| 638 |
-
san_issues = gr.Dataframe(label="Data Quality Issues")
|
| 639 |
-
san_customers.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
|
| 640 |
-
san_list.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
|
| 641 |
-
|
| 642 |
-
with gr.Tab("Credit Risk"):
|
| 643 |
-
gr.Markdown("Upload a **credit** CSV.")
|
| 644 |
-
cr_file = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
|
| 645 |
-
cr_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 646 |
-
cr_stats = gr.Textbox(label="Stats", lines=3)
|
| 647 |
-
cr_flagged = gr.Dataframe(label="Flagged Applicants")
|
| 648 |
-
cr_issues = gr.Dataframe(label="Data Quality Issues")
|
| 649 |
-
cr_file.upload(run_credit, inputs=[cr_file], outputs=[cr_ai, cr_stats, cr_flagged, cr_issues])
|
| 650 |
-
|
| 651 |
-
with gr.Tab("AI Consultant (Agent)"):
|
| 652 |
-
gr.Markdown("Paste a small CSV snippet or ask questions. Uses chat-completions when configured.")
|
| 653 |
-
chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
|
| 654 |
-
user_in = gr.Textbox(label="Message or CSV snippet")
|
| 655 |
-
send_btn = gr.Button("Send")
|
| 656 |
-
def _chat_fn(history, msg):
|
| 657 |
-
reply = agent_reply(history, msg)
|
| 658 |
-
history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
|
| 659 |
-
return history, ""
|
| 660 |
-
send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
|
| 661 |
-
|
| 662 |
-
gr.Markdown(
|
| 663 |
-
"### ⚙️ Enable inference\n"
|
| 664 |
-
"- Set **HF_TOKEN** (or HF_SPACES on Spaces)\n"
|
| 665 |
-
"- Optional: **LC_CHAT_MODEL** (default Qwen 0.5B Instruct), **LC_CHAT_MODEL_FALLBACK** (default Mistral 7B Instruct)\n"
|
| 666 |
-
"- Optional MCP: `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`"
|
| 667 |
-
)
|
| 668 |
-
|
| 669 |
-
if __name__ == "__main__":
|
| 670 |
-
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
| 1 |
+
"""TODO: implement app.py (split from app_monolith_backup.py)"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app_monolith_backup.py
ADDED
|
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fraud Detector Analyst — LangChain + (optional) MCP
|
| 3 |
+
Advanced “prototype-first” build:
|
| 4 |
+
- Chat uses chat-completion models (LangChain ChatHuggingFace).
|
| 5 |
+
- AI Summary shows a notice when no inference is connected.
|
| 6 |
+
|
| 7 |
+
LLM env (serverless friendly):
|
| 8 |
+
HF_TOKEN (or HF_SPACES)
|
| 9 |
+
LC_CHAT_MODEL (default: "Qwen/Qwen2.5-0.5B-Instruct")
|
| 10 |
+
LC_CHAT_MODEL_FALLBACK (default: "mistralai/Mistral-7B-Instruct")
|
| 11 |
+
|
| 12 |
+
Summary behavior:
|
| 13 |
+
If no working inference/token -> summary fields display:
|
| 14 |
+
"🔌 Please connect to an inference point to generate summary."
|
| 15 |
+
|
| 16 |
+
Optional MCP:
|
| 17 |
+
ENABLE_MCP=1
|
| 18 |
+
MCP_SANCTIONS_URL, MCP_HIGH_RISK_MCC_URL
|
| 19 |
+
MCP_AUTH_HEADER="Authorization: Bearer <token>"
|
| 20 |
+
|
| 21 |
+
Run:
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
python app.py
|
| 24 |
+
On Spaces:
|
| 25 |
+
Add secret HF_TOKEN (or HF_SPACES). Launch.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
import os, io, re, json, math, unicodedata, logging
|
| 31 |
+
from typing import Optional, Tuple, List, Dict
|
| 32 |
+
|
| 33 |
+
import numpy as np
|
| 34 |
+
import pandas as pd
|
| 35 |
+
import gradio as gr
|
| 36 |
+
from dotenv import load_dotenv
|
| 37 |
+
|
| 38 |
+
# LangChain
|
| 39 |
+
from langchain.tools import tool
|
| 40 |
+
from langchain_core.tools import Tool
|
| 41 |
+
from langchain.agents import initialize_agent, AgentType
|
| 42 |
+
from langchain.schema import HumanMessage, SystemMessage
|
| 43 |
+
|
| 44 |
+
from pydantic import BaseModel, Field
|
| 45 |
+
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
|
| 46 |
+
|
| 47 |
+
# Phone normalization
|
| 48 |
+
try:
|
| 49 |
+
import phonenumbers
|
| 50 |
+
HAVE_PHONENUM = True
|
| 51 |
+
except Exception:
|
| 52 |
+
HAVE_PHONENUM = False
|
| 53 |
+
|
| 54 |
+
# ------------------------
|
| 55 |
+
# Setup
|
| 56 |
+
# ------------------------
|
| 57 |
+
load_dotenv()
|
| 58 |
+
logging.basicConfig(level=logging.INFO)
|
| 59 |
+
log = logging.getLogger("fraud-analyst")
|
| 60 |
+
|
| 61 |
+
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HF_SPACES")
|
| 62 |
+
|
| 63 |
+
# Chat models (chat-completions)
|
| 64 |
+
DEFAULT_CHAT_MODEL = os.getenv("LC_CHAT_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
|
| 65 |
+
FALLBACK_CHAT_MODEL = os.getenv("LC_CHAT_MODEL_FALLBACK", "mistralai/Mistral-7B-Instruct")
|
| 66 |
+
|
| 67 |
+
SUMMARY_NOTICE = "🔌 Please connect to an inference point to generate summary."
|
| 68 |
+
CHAT_NOTICE = "🔌 Chat model not configured. Set HF_TOKEN and LC_CHAT_MODEL to enable chat."
|
| 69 |
+
|
| 70 |
+
# ------------------------
|
| 71 |
+
# LLM builders
|
| 72 |
+
# ------------------------
|
| 73 |
+
def _mk_chat_llm(model_id: str) -> ChatHuggingFace:
|
| 74 |
+
"""
|
| 75 |
+
ChatHuggingFace uses HF Inference under the hood.
|
| 76 |
+
Although the backend task is 'text-generation', this wrapper handles chat-style messages.
|
| 77 |
+
"""
|
| 78 |
+
base = HuggingFaceEndpoint(
|
| 79 |
+
repo_id=model_id,
|
| 80 |
+
task="text-generation",
|
| 81 |
+
huggingfacehub_api_token=HF_TOKEN,
|
| 82 |
+
max_new_tokens=256,
|
| 83 |
+
temperature=0.2,
|
| 84 |
+
repetition_penalty=1.05,
|
| 85 |
+
timeout=60,
|
| 86 |
+
)
|
| 87 |
+
return ChatHuggingFace(llm=base)
|
| 88 |
+
|
| 89 |
+
def _heartbeat_chat(model_id: str) -> bool:
|
| 90 |
+
try:
|
| 91 |
+
chat = _mk_chat_llm(model_id)
|
| 92 |
+
_ = chat.invoke([HumanMessage(content="ok")])
|
| 93 |
+
return True
|
| 94 |
+
except Exception as e:
|
| 95 |
+
log.warning(f"Heartbeat failed for {model_id}: {str(e)[:160]}")
|
| 96 |
+
return False
|
| 97 |
+
|
| 98 |
+
def build_chat_llm() -> Optional[ChatHuggingFace]:
|
| 99 |
+
"""
|
| 100 |
+
Returns a working ChatHuggingFace or None (if token/permissions missing).
|
| 101 |
+
"""
|
| 102 |
+
log.info(f"HF token present: {bool(HF_TOKEN)} len={len(HF_TOKEN) if HF_TOKEN else 0}")
|
| 103 |
+
if HF_TOKEN and _heartbeat_chat(DEFAULT_CHAT_MODEL):
|
| 104 |
+
log.info(f"Using chat model: {DEFAULT_CHAT_MODEL}")
|
| 105 |
+
return _mk_chat_llm(DEFAULT_CHAT_MODEL)
|
| 106 |
+
if HF_TOKEN and _heartbeat_chat(FALLBACK_CHAT_MODEL):
|
| 107 |
+
log.info(f"Using fallback chat model: {FALLBACK_CHAT_MODEL}")
|
| 108 |
+
return _mk_chat_llm(FALLBACK_CHAT_MODEL)
|
| 109 |
+
log.warning("No working chat model; chat will show a notice.")
|
| 110 |
+
return None
|
| 111 |
+
|
| 112 |
+
CHAT_LLM = build_chat_llm()
|
| 113 |
+
|
| 114 |
+
# ------------------------
|
| 115 |
+
# Normalization helpers
|
| 116 |
+
# ------------------------
|
| 117 |
+
def _norm_colname(c: str) -> str:
|
| 118 |
+
c = c.strip().lower()
|
| 119 |
+
c = re.sub(r"\s+", "_", c)
|
| 120 |
+
c = re.sub(r"[^\w]+", "_", c)
|
| 121 |
+
return c.strip("_")
|
| 122 |
+
|
| 123 |
+
def _nfkc(s: str) -> str:
|
| 124 |
+
return unicodedata.normalize("NFKC", s)
|
| 125 |
+
|
| 126 |
+
def _collapse_ws(s: str) -> str:
|
| 127 |
+
return re.sub(r"\s+", " ", s).strip()
|
| 128 |
+
|
| 129 |
+
def _clean_str(x):
|
| 130 |
+
if pd.isna(x): return x
|
| 131 |
+
return _collapse_ws(_nfkc(str(x)))
|
| 132 |
+
|
| 133 |
+
def _is_email(s: str) -> bool:
|
| 134 |
+
return bool(re.match(r"^[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}$", s or ""))
|
| 135 |
+
|
| 136 |
+
def _clean_phone(s: str, default_region: str = "IN"):
|
| 137 |
+
if s is None or str(s).strip() == "":
|
| 138 |
+
return None, "missing_phone"
|
| 139 |
+
raw = re.sub(r"[^\d+]", "", str(s))
|
| 140 |
+
if HAVE_PHONENUM:
|
| 141 |
+
try:
|
| 142 |
+
pn = phonenumbers.parse(raw, default_region)
|
| 143 |
+
if phonenumbers.is_possible_number(pn) and phonenumbers.is_valid_number(pn):
|
| 144 |
+
return phonenumbers.format_number(pn, phonenumbers.PhoneNumberFormat.E164), None
|
| 145 |
+
return raw, "invalid_phone"
|
| 146 |
+
except Exception:
|
| 147 |
+
return raw, "invalid_phone"
|
| 148 |
+
digits = re.sub(r"\D", "", raw)
|
| 149 |
+
return (digits, None) if 8 <= len(digits) <= 15 else (digits, "invalid_phone")
|
| 150 |
+
|
| 151 |
+
def _parse_datetime(s):
|
| 152 |
+
try:
|
| 153 |
+
return pd.to_datetime(s, errors="coerce", utc=True)
|
| 154 |
+
except Exception:
|
| 155 |
+
return pd.NaT
|
| 156 |
+
|
| 157 |
+
def _to_numeric(series: pd.Series):
|
| 158 |
+
coerced = pd.to_numeric(series, errors="coerce")
|
| 159 |
+
return coerced, (coerced.isna() & series.notna())
|
| 160 |
+
|
| 161 |
+
def _read_csv_any(file_obj) -> pd.DataFrame:
|
| 162 |
+
if file_obj is None:
|
| 163 |
+
raise ValueError("No file uploaded.")
|
| 164 |
+
if hasattr(file_obj, "name"):
|
| 165 |
+
p = file_obj.name
|
| 166 |
+
try: return pd.read_csv(p)
|
| 167 |
+
except Exception: return pd.read_csv(p, encoding="latin-1")
|
| 168 |
+
try: return pd.read_csv(file_obj)
|
| 169 |
+
except Exception:
|
| 170 |
+
file_obj.seek(0)
|
| 171 |
+
return pd.read_csv(file_obj, encoding="latin-1")
|
| 172 |
+
|
| 173 |
+
def _standardize_df(df: pd.DataFrame) -> pd.DataFrame:
|
| 174 |
+
df = df.copy()
|
| 175 |
+
df.columns = [_norm_colname(c) for c in df.columns]
|
| 176 |
+
for c in df.select_dtypes(include=["object"]).columns:
|
| 177 |
+
df[c] = df[c].apply(_clean_str)
|
| 178 |
+
return df
|
| 179 |
+
|
| 180 |
+
def _prepare_generic(df: pd.DataFrame, expected: Dict[str, List[str]]):
|
| 181 |
+
issues = []
|
| 182 |
+
df0 = _standardize_df(df)
|
| 183 |
+
|
| 184 |
+
# Synonym mapping
|
| 185 |
+
colmap = {}
|
| 186 |
+
cols = set(df0.columns)
|
| 187 |
+
for canon, syns in expected.items():
|
| 188 |
+
found = None
|
| 189 |
+
for s in [canon] + syns:
|
| 190 |
+
s = _norm_colname(s)
|
| 191 |
+
if s in cols:
|
| 192 |
+
found = s; break
|
| 193 |
+
if found: colmap[canon] = found
|
| 194 |
+
|
| 195 |
+
# Email/phone quality
|
| 196 |
+
for c in list(df0.columns):
|
| 197 |
+
if "email" in c:
|
| 198 |
+
df0[c] = df0[c].apply(lambda x: str(x).lower().strip() if pd.notna(x) else x)
|
| 199 |
+
for idx, v in df0[c].items():
|
| 200 |
+
if pd.isna(v) or str(v).strip()=="":
|
| 201 |
+
issues.append({"row": idx, "field": c, "issue":"missing_email","value":""})
|
| 202 |
+
elif not _is_email(v):
|
| 203 |
+
issues.append({"row": idx, "field": c, "issue":"invalid_email","value":str(v)})
|
| 204 |
+
if "phone" in c or "mobile" in c:
|
| 205 |
+
vals = []
|
| 206 |
+
for idx, v in df0[c].items():
|
| 207 |
+
e164, prob = _clean_phone(v)
|
| 208 |
+
vals.append(e164)
|
| 209 |
+
if prob: issues.append({"row": idx, "field": c, "issue":prob, "value":str(v)})
|
| 210 |
+
df0[c] = vals
|
| 211 |
+
|
| 212 |
+
# Datetime parsing
|
| 213 |
+
for c in df0.columns:
|
| 214 |
+
if any(k in c for k in ["date","time","timestamp","created_at","updated_at"]):
|
| 215 |
+
parsed = _parse_datetime(df0[c])
|
| 216 |
+
bad = parsed.isna() & df0[c].notna()
|
| 217 |
+
for idx in df0.index[bad]:
|
| 218 |
+
issues.append({"row": int(idx), "field": c, "issue":"unparseable_timestamp", "value":str(df0.loc[idx, c])})
|
| 219 |
+
df0[c] = parsed
|
| 220 |
+
|
| 221 |
+
# Numeric coercions for common fields
|
| 222 |
+
for nc in ["amount","credit_score","utilization","dti","recent_defaults","income"]:
|
| 223 |
+
for c in df0.columns:
|
| 224 |
+
if c == nc or c.endswith("_"+nc) or nc in c:
|
| 225 |
+
coerced, badmask = _to_numeric(df0[c])
|
| 226 |
+
for idx in df0.index[badmask]:
|
| 227 |
+
issues.append({"row": int(idx), "field": c, "issue":"non_numeric", "value":str(df0.loc[idx, c])})
|
| 228 |
+
df0[c] = coerced
|
| 229 |
+
|
| 230 |
+
issues_df = pd.DataFrame(issues, columns=["row","field","issue","value"])
|
| 231 |
+
missing = [k for k in expected.keys() if k not in colmap]
|
| 232 |
+
quality_summary = f"Rows={len(df0)}, Cols={len(df0.columns)}; Missing required fields: {missing if missing else 'None'}"
|
| 233 |
+
return df0, issues_df, quality_summary, colmap
|
| 234 |
+
|
| 235 |
+
# ------------------------
|
| 236 |
+
# Modules & Rules
|
| 237 |
+
# ------------------------
|
| 238 |
+
TX_EXPECTED = {
|
| 239 |
+
"transaction_id":["txn_id","transactionid","id","tx_id"],
|
| 240 |
+
"customer_id":["cust_id","user_id","client_id"],
|
| 241 |
+
"amount":["amt","amount_inr","value"],
|
| 242 |
+
"timestamp":["date","event_time","created_at","tx_time"],
|
| 243 |
+
"merchant_category":["mcc","merchant_cat","category"]
|
| 244 |
+
}
|
| 245 |
+
def prepare_transactions(df): return _prepare_generic(df, TX_EXPECTED)
|
| 246 |
+
|
| 247 |
+
def detect_transactions(clean_df, colmap, high_risk_mcc: Optional[List[str]]=None):
|
| 248 |
+
high_risk = set(["HIGH_RISK","GAMBLING","CRYPTO_EXCHANGE","ESCORTS","CASINO"])
|
| 249 |
+
if high_risk_mcc:
|
| 250 |
+
high_risk.update([_nfkc(x).strip().upper().replace(" ","_") for x in high_risk_mcc])
|
| 251 |
+
if not all(k in colmap for k in ["customer_id","amount"]):
|
| 252 |
+
return pd.DataFrame(), "Required columns missing for detection (need at least customer_id, amount)."
|
| 253 |
+
df = clean_df.copy()
|
| 254 |
+
reasons = []
|
| 255 |
+
amtcol = colmap.get("amount")
|
| 256 |
+
if amtcol is not None:
|
| 257 |
+
reasons.append(("large_amount>10k", df[amtcol] > 10000))
|
| 258 |
+
reasons.append(("negative_amount", df[amtcol] < 0))
|
| 259 |
+
if "merchant_category" in colmap:
|
| 260 |
+
mcc = colmap["merchant_category"]
|
| 261 |
+
high = df[mcc].astype(str).str.upper().str.replace(" ","_", regex=False).isin(high_risk)
|
| 262 |
+
reasons.append(("merchant_category_high_risk", high))
|
| 263 |
+
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
| 264 |
+
cid, ts, amt = colmap["customer_id"], colmap["timestamp"], colmap["amount"]
|
| 265 |
+
daily = df.groupby([cid, df[ts].dt.date])[amt].transform("sum")
|
| 266 |
+
reasons.append(("daily_sum_per_customer>50k", daily > 50000))
|
| 267 |
+
mask = None
|
| 268 |
+
for _, m in reasons:
|
| 269 |
+
mask = m if mask is None else (mask | m)
|
| 270 |
+
flagged = df[mask] if mask is not None else pd.DataFrame()
|
| 271 |
+
if not flagged.empty:
|
| 272 |
+
rr=[]
|
| 273 |
+
for _, row in flagged.iterrows():
|
| 274 |
+
hits=[]
|
| 275 |
+
a = row[amtcol] if amtcol in flagged.columns else None
|
| 276 |
+
if pd.notna(a) and a>10000: hits.append("large_amount")
|
| 277 |
+
if pd.notna(a) and a<0: hits.append("negative_amount")
|
| 278 |
+
if "merchant_category" in colmap:
|
| 279 |
+
val = str(row[colmap["merchant_category"]]).upper().replace(" ","_")
|
| 280 |
+
if val in high_risk: hits.append("mcc_high_risk")
|
| 281 |
+
# daily sum check reconstructed
|
| 282 |
+
try:
|
| 283 |
+
if all(k in colmap for k in ["customer_id","timestamp","amount"]):
|
| 284 |
+
sub = df[(df[colmap["customer_id"]]==row[colmap["customer_id"]]) &
|
| 285 |
+
(df[colmap["timestamp"]].dt.date==pd.to_datetime(row[colmap["timestamp"]], errors="coerce").date())]
|
| 286 |
+
if sub[colmap["amount"]].sum() > 50000: hits.append("daily_sum>50k")
|
| 287 |
+
except Exception: pass
|
| 288 |
+
rr.append(", ".join(sorted(set(hits))) or "rule_hit")
|
| 289 |
+
flagged = flagged.assign(risk_reason=rr)
|
| 290 |
+
stats = f"Transactions flagged: {len(flagged)} of {len(df)}."
|
| 291 |
+
return flagged, stats
|
| 292 |
+
|
| 293 |
+
KYC_EXPECTED = {
|
| 294 |
+
"customer_id":["cust_id","user_id","client_id"],
|
| 295 |
+
"name":["full_name","customer_name"],
|
| 296 |
+
"email":["email_address","mail"],
|
| 297 |
+
"phone":["phone_number","mobile","contact"],
|
| 298 |
+
"dob":["date_of_birth","birthdate"]
|
| 299 |
+
}
|
| 300 |
+
def prepare_kyc(df): return _prepare_generic(df, KYC_EXPECTED)
|
| 301 |
+
|
| 302 |
+
def _age_years(dob: pd.Series) -> pd.Series:
|
| 303 |
+
now = pd.Timestamp.utcnow()
|
| 304 |
+
return (now - dob).dt.days / 365.25
|
| 305 |
+
|
| 306 |
+
def detect_kyc(clean_df, colmap):
|
| 307 |
+
if not all(k in colmap for k in ["customer_id","name"]):
|
| 308 |
+
return pd.DataFrame(), "Required columns missing for KYC (need at least customer_id, name)."
|
| 309 |
+
df = clean_df.copy()
|
| 310 |
+
reasons=[]
|
| 311 |
+
if "email" in colmap:
|
| 312 |
+
dupe_email = df.duplicated(subset=[colmap["email"]], keep=False) & df[colmap["email"]].notna()
|
| 313 |
+
reasons.append(("duplicate_email", dupe_email))
|
| 314 |
+
if "phone" in colmap:
|
| 315 |
+
dupe_phone = df.duplicated(subset=[colmap["phone"]], keep=False) & df[colmap["phone"]].notna()
|
| 316 |
+
reasons.append(("duplicate_phone", dupe_phone))
|
| 317 |
+
if "dob" in colmap:
|
| 318 |
+
age = _age_years(df[colmap["dob"]])
|
| 319 |
+
invalid = (df[colmap["dob"]].isna()) | (df[colmap["dob"]] > pd.Timestamp.utcnow()) | (age > 120)
|
| 320 |
+
reasons.append(("invalid_dob", invalid))
|
| 321 |
+
if "name" in colmap:
|
| 322 |
+
name = df[colmap["name"]].astype(str)
|
| 323 |
+
susp = name.str.isupper() | name.str.contains(r"\d") | (name.str.len()<3)
|
| 324 |
+
reasons.append(("suspicious_name", susp))
|
| 325 |
+
mask = None
|
| 326 |
+
for _, m in reasons:
|
| 327 |
+
mask = m if mask is None else (mask | m)
|
| 328 |
+
flagged = df[mask] if mask is not None else pd.DataFrame()
|
| 329 |
+
if not flagged.empty:
|
| 330 |
+
flagged = flagged.assign(risk_reason="kyc_rule_hit")
|
| 331 |
+
stats = f"KYC flagged: {len(flagged)} of {len(df)}."
|
| 332 |
+
return flagged, stats
|
| 333 |
+
|
| 334 |
+
SAN_EXPECTED = {"customer_id":["cust_id","user_id","client_id"], "name":["full_name","customer_name"]}
|
| 335 |
+
def prepare_sanctions(df): return _prepare_generic(df, SAN_EXPECTED)
|
| 336 |
+
|
| 337 |
+
DEMO_SANCTIONS = pd.DataFrame({"name":["Ivan Petrov","Global Terror Org","Acme Front LLC","John Doe (PEP)","Shadow Brokers"]})
|
| 338 |
+
|
| 339 |
+
def token_overlap(a: str, b: str) -> int:
|
| 340 |
+
at = set(re.findall(r"[A-Za-z0-9]+", a.lower()))
|
| 341 |
+
bt = set(re.findall(r"[A-Za-z0-9]+", b.lower()))
|
| 342 |
+
return len(at & bt)
|
| 343 |
+
|
| 344 |
+
def detect_sanctions(clean_df, colmap, sanctions_df: Optional[pd.DataFrame]=None):
|
| 345 |
+
if "name" not in colmap:
|
| 346 |
+
return pd.DataFrame(), "Required column missing for Sanctions (need name)."
|
| 347 |
+
df = clean_df.copy()
|
| 348 |
+
sanc = sanctions_df if sanctions_df is not None else DEMO_SANCTIONS.copy()
|
| 349 |
+
sanc = _standardize_df(sanc)
|
| 350 |
+
if "name" not in sanc.columns:
|
| 351 |
+
for c in sanc.columns:
|
| 352 |
+
if "name" in c: sanc = sanc.rename(columns={c:"name"}); break
|
| 353 |
+
sanc_names = sanc["name"].dropna().astype(str).tolist()
|
| 354 |
+
matches=[]
|
| 355 |
+
for idx, row in df.iterrows():
|
| 356 |
+
nm = str(row[colmap["name"]] or "").strip()
|
| 357 |
+
if not nm: continue
|
| 358 |
+
if any(nm.lower()==s.lower() for s in sanc_names):
|
| 359 |
+
matches.append((idx,"exact")); continue
|
| 360 |
+
if any(token_overlap(nm, s) >= 2 for s in sanc_names):
|
| 361 |
+
matches.append((idx,"fuzzy"))
|
| 362 |
+
flagged = df.loc[[i for i,_ in matches]].copy() if matches else pd.DataFrame()
|
| 363 |
+
if not flagged.empty:
|
| 364 |
+
mt = {i:t for i,t in matches}
|
| 365 |
+
flagged = flagged.assign(match_type=[mt.get(i,"") for i in flagged.index])
|
| 366 |
+
stats = f"Sanctions matches: {len(flagged)} of {len(df)}. (Using {'uploaded/MCP' if sanctions_df is not None else 'demo'} list)"
|
| 367 |
+
return flagged, stats
|
| 368 |
+
|
| 369 |
+
CR_EXPECTED = {
|
| 370 |
+
"customer_id":["cust_id","user_id","client_id"],
|
| 371 |
+
"credit_score":["creditscore","score"],
|
| 372 |
+
"utilization":["util","credit_utilization","utilization_ratio"],
|
| 373 |
+
"dti":["debt_to_income","debt_to_income_ratio"],
|
| 374 |
+
"recent_defaults":["defaults","recentdefaults"],
|
| 375 |
+
"income":["annual_income","salary"]
|
| 376 |
+
}
|
| 377 |
+
def prepare_credit(df): return _prepare_generic(df, CR_EXPECTED)
|
| 378 |
+
|
| 379 |
+
def detect_credit(clean_df, colmap):
|
| 380 |
+
needed = ["credit_score","utilization","dti","recent_defaults","income"]
|
| 381 |
+
if not any(k in colmap for k in needed):
|
| 382 |
+
return pd.DataFrame(), "Required columns missing for Credit Risk."
|
| 383 |
+
df = clean_df.copy()
|
| 384 |
+
cs = df[colmap.get("credit_score","credit_score")] if "credit_score" in colmap else pd.Series([np.nan]*len(df))
|
| 385 |
+
util= df[colmap.get("utilization","utilization")] if "utilization" in colmap else pd.Series([np.nan]*len(df))
|
| 386 |
+
dti = df[colmap.get("dti","dti")] if "dti" in colmap else pd.Series([np.nan]*len(df))
|
| 387 |
+
rde = df[colmap.get("recent_defaults","recent_defaults")] if "recent_defaults" in colmap else pd.Series([np.nan]*len(df))
|
| 388 |
+
inc = df[colmap.get("income","income")] if "income" in colmap else pd.Series([np.nan]*len(df))
|
| 389 |
+
out=[]
|
| 390 |
+
for i in range(len(df)):
|
| 391 |
+
hits=0; reasons=[]
|
| 392 |
+
if pd.notna(cs.iloc[i]) and cs.iloc[i] < 600: hits+=1; reasons.append("credit_score<600")
|
| 393 |
+
if pd.notna(util.iloc[i]) and util.iloc[i] > 0.8: hits+=1; reasons.append("utilization>0.8")
|
| 394 |
+
if pd.notna(dti.iloc[i]) and dti.iloc[i] > 0.4: hits+=1; reasons.append("DTI>0.4")
|
| 395 |
+
if pd.notna(rde.iloc[i]) and rde.iloc[i] > 0: hits+=1; reasons.append("recent_defaults>0")
|
| 396 |
+
if pd.notna(inc.iloc[i]) and inc.iloc[i] < 30000: hits+=1; reasons.append("income<30000")
|
| 397 |
+
level = "High" if hits>=3 else ("Medium" if hits==2 else ("Low" if hits==1 else "None"))
|
| 398 |
+
out.append((hits, level, ", ".join(reasons)))
|
| 399 |
+
risk_score=[x[0] for x in out]; risk_level=[x[1] for x in out]; reason=[x[2] for x in out]
|
| 400 |
+
res = df.assign(risk_score=risk_score, risk_level=risk_level, risk_reason=reason)
|
| 401 |
+
flagged = res[res["risk_level"].isin(["High","Medium","Low"]) & (res["risk_level"]!="None")]
|
| 402 |
+
stats = f"Credit Risk flagged: {len(flagged)} of {len(df)}. Distribution: High={(res['risk_level']=='High').sum()}, Medium={(res['risk_level']=='Medium').sum()}, Low={(res['risk_level']=='Low').sum()}."
|
| 403 |
+
return flagged, stats
|
| 404 |
+
|
| 405 |
+
# ------------------------
|
| 406 |
+
# Summarizer (notice-first)
|
| 407 |
+
# ------------------------
|
| 408 |
+
SUMMARY_SYS = "You are a helpful Fraud/Risk analyst. Be concise (<120 words), list key counts, drivers, and data quality caveats."
|
| 409 |
+
|
| 410 |
+
def summarize_ai(context: str) -> str:
|
| 411 |
+
"""
|
| 412 |
+
If chat LLM is available, use it to generate a short summary.
|
| 413 |
+
Otherwise return the prototype notice string.
|
| 414 |
+
"""
|
| 415 |
+
if CHAT_LLM is None:
|
| 416 |
+
return SUMMARY_NOTICE
|
| 417 |
+
try:
|
| 418 |
+
out = CHAT_LLM.invoke([SystemMessage(content=SUMMARY_SYS), HumanMessage(content=context[:4000])])
|
| 419 |
+
if hasattr(out, "content"): return out.content
|
| 420 |
+
return str(out)
|
| 421 |
+
except Exception as e:
|
| 422 |
+
msg = str(e)
|
| 423 |
+
if "401" in msg or "403" in msg:
|
| 424 |
+
return SUMMARY_NOTICE
|
| 425 |
+
return SUMMARY_NOTICE
|
| 426 |
+
|
| 427 |
+
# ------------------------
|
| 428 |
+
# Optional MCP
|
| 429 |
+
# ------------------------
|
| 430 |
+
from urllib.request import Request, urlopen
|
| 431 |
+
def _mcp_get_json(url: str, auth_header: Optional[str]):
|
| 432 |
+
try:
|
| 433 |
+
req = Request(url)
|
| 434 |
+
if auth_header:
|
| 435 |
+
k, v = auth_header.split(":", 1)
|
| 436 |
+
req.add_header(k.strip(), v.strip())
|
| 437 |
+
with urlopen(req, timeout=10) as r:
|
| 438 |
+
return json.loads(r.read().decode("utf-8"))
|
| 439 |
+
except Exception as e:
|
| 440 |
+
log.warning(f"MCP fetch failed: {e}")
|
| 441 |
+
return None
|
| 442 |
+
|
| 443 |
+
def mcp_fetch_sanctions() -> Optional[pd.DataFrame]:
|
| 444 |
+
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
| 445 |
+
url = os.getenv("MCP_SANCTIONS_URL")
|
| 446 |
+
if not url: return None
|
| 447 |
+
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
| 448 |
+
if not data: return None
|
| 449 |
+
if isinstance(data, list):
|
| 450 |
+
if all(isinstance(x, dict) for x in data):
|
| 451 |
+
rows = [{"name": x.get("name") or x.get("Name")} for x in data if x.get("name") or x.get("Name")]
|
| 452 |
+
return pd.DataFrame(rows) if rows else None
|
| 453 |
+
if all(isinstance(x, str) for x in data):
|
| 454 |
+
return pd.DataFrame({"name": data})
|
| 455 |
+
return None
|
| 456 |
+
|
| 457 |
+
def mcp_fetch_high_risk_mcc() -> Optional[List[str]]:
|
| 458 |
+
if os.getenv("ENABLE_MCP","0") not in ("1","true","TRUE"): return None
|
| 459 |
+
url = os.getenv("MCP_HIGH_RISK_MCC_URL")
|
| 460 |
+
if not url: return None
|
| 461 |
+
data = _mcp_get_json(url, os.getenv("MCP_AUTH_HEADER"))
|
| 462 |
+
return [str(x) for x in data] if isinstance(data, list) else None
|
| 463 |
+
|
| 464 |
+
# ------------------------
|
| 465 |
+
# Pipelines (per tab)
|
| 466 |
+
# ------------------------
|
| 467 |
+
def run_transactions(file):
|
| 468 |
+
try:
|
| 469 |
+
df = _read_csv_any(file)
|
| 470 |
+
clean, issues, quality, colmap = prepare_transactions(df)
|
| 471 |
+
mcc = mcp_fetch_high_risk_mcc()
|
| 472 |
+
flagged, stats = detect_transactions(clean, colmap, mcc)
|
| 473 |
+
ctx = f"[Transactions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 474 |
+
ai = summarize_ai(ctx)
|
| 475 |
+
return ai, stats, flagged, issues
|
| 476 |
+
except Exception as e:
|
| 477 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 478 |
+
|
| 479 |
+
def run_kyc(file):
|
| 480 |
+
try:
|
| 481 |
+
df = _read_csv_any(file)
|
| 482 |
+
clean, issues, quality, colmap = prepare_kyc(df)
|
| 483 |
+
flagged, stats = detect_kyc(clean, colmap)
|
| 484 |
+
ctx = f"[KYC]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 485 |
+
ai = summarize_ai(ctx)
|
| 486 |
+
return ai, stats, flagged, issues
|
| 487 |
+
except Exception as e:
|
| 488 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 489 |
+
|
| 490 |
+
def run_sanctions(customers_file, sanctions_file):
|
| 491 |
+
try:
|
| 492 |
+
df = _read_csv_any(customers_file)
|
| 493 |
+
clean, issues, quality, colmap = prepare_sanctions(df)
|
| 494 |
+
sanc_df = mcp_fetch_sanctions()
|
| 495 |
+
if sanc_df is None and sanctions_file is not None:
|
| 496 |
+
sanc_df = _read_csv_any(sanctions_file)
|
| 497 |
+
flagged, stats = detect_sanctions(clean, colmap, sanc_df)
|
| 498 |
+
ctx = f"[Sanctions]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nMatches:\n{flagged.head(5).to_csv(index=False)}"
|
| 499 |
+
ai = summarize_ai(ctx)
|
| 500 |
+
return ai, stats, flagged, issues
|
| 501 |
+
except Exception as e:
|
| 502 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 503 |
+
|
| 504 |
+
def run_credit(file):
|
| 505 |
+
try:
|
| 506 |
+
df = _read_csv_any(file)
|
| 507 |
+
clean, issues, quality, colmap = prepare_credit(df)
|
| 508 |
+
flagged, stats = detect_credit(clean, colmap)
|
| 509 |
+
ctx = f"[Credit]\n{stats}\nQuality: {quality}\nHead:\n{clean.head(5).to_csv(index=False)}\nFlagged:\n{flagged.head(5).to_csv(index=False)}"
|
| 510 |
+
ai = summarize_ai(ctx)
|
| 511 |
+
return ai, stats, flagged, issues
|
| 512 |
+
except Exception as e:
|
| 513 |
+
return f"Error: {e}", "Validation failed.", pd.DataFrame(), pd.DataFrame()
|
| 514 |
+
|
| 515 |
+
# ------------------------
|
| 516 |
+
# Tools (CSV text in → concise text out)
|
| 517 |
+
# ------------------------
|
| 518 |
+
def _csv_text_to_df(csv_text: str) -> pd.DataFrame:
|
| 519 |
+
return pd.read_csv(io.StringIO(csv_text))
|
| 520 |
+
|
| 521 |
+
class TransactionCSVInput(BaseModel):
|
| 522 |
+
csv_text: str = Field(..., description="Transactions CSV text")
|
| 523 |
+
|
| 524 |
+
@tool("transactions_fraud_tool", args_schema=TransactionCSVInput)
|
| 525 |
+
def transactions_fraud_tool(csv_text: str) -> str:
|
| 526 |
+
df = _csv_text_to_df(csv_text)
|
| 527 |
+
clean, issues, quality, colmap = prepare_transactions(df)
|
| 528 |
+
flagged, stats = detect_transactions(clean, colmap)
|
| 529 |
+
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 530 |
+
|
| 531 |
+
class KYCCSVInput(BaseModel):
|
| 532 |
+
csv_text: str = Field(..., description="KYC CSV text")
|
| 533 |
+
|
| 534 |
+
@tool("kyc_fraud_tool", args_schema=KYCCSVInput)
|
| 535 |
+
def kyc_fraud_tool(csv_text: str) -> str:
|
| 536 |
+
df = _csv_text_to_df(csv_text)
|
| 537 |
+
clean, issues, quality, colmap = prepare_kyc(df)
|
| 538 |
+
flagged, stats = detect_kyc(clean, colmap)
|
| 539 |
+
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 540 |
+
|
| 541 |
+
class SanctionsCSVInput(BaseModel):
|
| 542 |
+
csv_text: str = Field(..., description="Customers CSV text with a 'name' column")
|
| 543 |
+
|
| 544 |
+
@tool("sanctions_pep_tool", args_schema=SanctionsCSVInput)
|
| 545 |
+
def sanctions_pep_tool(csv_text: str) -> str:
|
| 546 |
+
df = _csv_text_to_df(csv_text)
|
| 547 |
+
clean, issues, quality, colmap = prepare_sanctions(df)
|
| 548 |
+
flagged, stats = detect_sanctions(clean, colmap)
|
| 549 |
+
return f"{stats}\nDQ issues: {len(issues)}\nFirst matches:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 550 |
+
|
| 551 |
+
class CreditCSVInput(BaseModel):
|
| 552 |
+
csv_text: str = Field(..., description="Credit CSV text")
|
| 553 |
+
|
| 554 |
+
@tool("credit_risk_tool", args_schema=CreditCSVInput)
|
| 555 |
+
def credit_risk_tool(csv_text: str) -> str:
|
| 556 |
+
df = _csv_text_to_df(csv_text)
|
| 557 |
+
clean, issues, quality, colmap = prepare_credit(df)
|
| 558 |
+
flagged, stats = detect_credit(clean, colmap)
|
| 559 |
+
return f"{stats}\nDQ issues: {len(issues)}\nFirst flagged:\n{flagged.head(5).to_csv(index=False)}"[:2800]
|
| 560 |
+
|
| 561 |
+
TOOLS: List[Tool] = [
|
| 562 |
+
transactions_fraud_tool,
|
| 563 |
+
kyc_fraud_tool,
|
| 564 |
+
sanctions_pep_tool,
|
| 565 |
+
credit_risk_tool,
|
| 566 |
+
]
|
| 567 |
+
|
| 568 |
+
# ------------------------
|
| 569 |
+
# Agent (chat-completions)
|
| 570 |
+
# ------------------------
|
| 571 |
+
AGENT_SYSTEM = """You are an AI Consultant for Fraud/Risk.
|
| 572 |
+
You have tools for Transactions, KYC, Sanctions/PEP, and Credit Risk.
|
| 573 |
+
If the user pastes a small CSV snippet, pick the relevant tool and analyze it.
|
| 574 |
+
Be concise and actionable."""
|
| 575 |
+
|
| 576 |
+
def build_agent():
|
| 577 |
+
if CHAT_LLM is None:
|
| 578 |
+
class Stub:
|
| 579 |
+
def invoke(self, prompt): return CHAT_NOTICE
|
| 580 |
+
return Stub()
|
| 581 |
+
return initialize_agent(
|
| 582 |
+
TOOLS,
|
| 583 |
+
CHAT_LLM,
|
| 584 |
+
agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
|
| 585 |
+
verbose=False,
|
| 586 |
+
agent_kwargs={"system_message": AGENT_SYSTEM},
|
| 587 |
+
handle_parsing_errors=True,
|
| 588 |
+
)
|
| 589 |
+
|
| 590 |
+
AGENT = build_agent()
|
| 591 |
+
|
| 592 |
+
def agent_reply(history: List[Dict], user_msg: str):
|
| 593 |
+
try:
|
| 594 |
+
looks_like_csv = ("," in user_msg) and ("\n" in user_msg) and (user_msg.count(",") >= 2)
|
| 595 |
+
prompt = f"CSV snippet detected. Decide tool and analyze:\n\n{user_msg}" if looks_like_csv else user_msg
|
| 596 |
+
res = AGENT.invoke(prompt)
|
| 597 |
+
if isinstance(res, dict) and "output" in res: return res["output"]
|
| 598 |
+
return str(res)
|
| 599 |
+
except Exception as e:
|
| 600 |
+
return f"Agent error: {e}"
|
| 601 |
+
|
| 602 |
+
# ------------------------
|
| 603 |
+
# UI
|
| 604 |
+
# ------------------------
|
| 605 |
+
with gr.Blocks(title="Fraud Detector Analyst — LangChain + MCP", theme=gr.themes.Soft()) as demo:
|
| 606 |
+
gr.Markdown("# 🛡️ Fraud Detector Analyst — LangChain + MCP")
|
| 607 |
+
gr.Markdown(
|
| 608 |
+
"This prototype runs **rules & data checks locally**. "
|
| 609 |
+
"Chat + AI summaries require a remote inference provider (HF Inference)."
|
| 610 |
+
)
|
| 611 |
+
|
| 612 |
+
with gr.Tabs():
|
| 613 |
+
with gr.Tab("Transactions"):
|
| 614 |
+
gr.Markdown("Upload a **transactions** CSV.")
|
| 615 |
+
tx_file = gr.File(file_types=[".csv"], label="Transactions CSV", type="binary")
|
| 616 |
+
tx_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 617 |
+
tx_stats = gr.Textbox(label="Stats", lines=3)
|
| 618 |
+
tx_flagged = gr.Dataframe(label="Flagged Transactions")
|
| 619 |
+
tx_issues = gr.Dataframe(label="Data Quality Issues (row, field, issue, value)")
|
| 620 |
+
tx_file.upload(run_transactions, inputs=[tx_file], outputs=[tx_ai, tx_stats, tx_flagged, tx_issues])
|
| 621 |
+
|
| 622 |
+
with gr.Tab("KYC"):
|
| 623 |
+
gr.Markdown("Upload a **KYC** CSV.")
|
| 624 |
+
kyc_file = gr.File(file_types=[".csv"], label="KYC CSV", type="binary")
|
| 625 |
+
kyc_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 626 |
+
kyc_stats = gr.Textbox(label="Stats", lines=3)
|
| 627 |
+
kyc_flagged = gr.Dataframe(label="Flagged KYC Rows")
|
| 628 |
+
kyc_issues = gr.Dataframe(label="Data Quality Issues")
|
| 629 |
+
kyc_file.upload(run_kyc, inputs=[kyc_file], outputs=[kyc_ai, kyc_stats, kyc_flagged, kyc_issues])
|
| 630 |
+
|
| 631 |
+
with gr.Tab("Sanctions/PEP"):
|
| 632 |
+
gr.Markdown("Upload **customers** CSV (+ optional sanctions CSV).")
|
| 633 |
+
san_customers = gr.File(file_types=[".csv"], label="Customers CSV", type="binary")
|
| 634 |
+
san_list = gr.File(file_types=[".csv"], label="Sanctions/PEP CSV (optional)", type="binary")
|
| 635 |
+
san_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 636 |
+
san_stats = gr.Textbox(label="Stats", lines=3)
|
| 637 |
+
san_flagged = gr.Dataframe(label="Matches")
|
| 638 |
+
san_issues = gr.Dataframe(label="Data Quality Issues")
|
| 639 |
+
san_customers.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
|
| 640 |
+
san_list.upload(run_sanctions, inputs=[san_customers, san_list], outputs=[san_ai, san_stats, san_flagged, san_issues])
|
| 641 |
+
|
| 642 |
+
with gr.Tab("Credit Risk"):
|
| 643 |
+
gr.Markdown("Upload a **credit** CSV.")
|
| 644 |
+
cr_file = gr.File(file_types=[".csv"], label="Credit CSV", type="binary")
|
| 645 |
+
cr_ai = gr.Textbox(label="AI Summary (requires inference)", value=SUMMARY_NOTICE, lines=6)
|
| 646 |
+
cr_stats = gr.Textbox(label="Stats", lines=3)
|
| 647 |
+
cr_flagged = gr.Dataframe(label="Flagged Applicants")
|
| 648 |
+
cr_issues = gr.Dataframe(label="Data Quality Issues")
|
| 649 |
+
cr_file.upload(run_credit, inputs=[cr_file], outputs=[cr_ai, cr_stats, cr_flagged, cr_issues])
|
| 650 |
+
|
| 651 |
+
with gr.Tab("AI Consultant (Agent)"):
|
| 652 |
+
gr.Markdown("Paste a small CSV snippet or ask questions. Uses chat-completions when configured.")
|
| 653 |
+
chatbot = gr.Chatbot(type="messages", label="Fraud AI Consultant")
|
| 654 |
+
user_in = gr.Textbox(label="Message or CSV snippet")
|
| 655 |
+
send_btn = gr.Button("Send")
|
| 656 |
+
def _chat_fn(history, msg):
|
| 657 |
+
reply = agent_reply(history, msg)
|
| 658 |
+
history = (history or []) + [{"role":"user","content":msg}, {"role":"assistant","content":reply}]
|
| 659 |
+
return history, ""
|
| 660 |
+
send_btn.click(_chat_fn, inputs=[chatbot, user_in], outputs=[chatbot, user_in])
|
| 661 |
+
|
| 662 |
+
gr.Markdown(
|
| 663 |
+
"### ⚙️ Enable inference\n"
|
| 664 |
+
"- Set **HF_TOKEN** (or HF_SPACES on Spaces)\n"
|
| 665 |
+
"- Optional: **LC_CHAT_MODEL** (default Qwen 0.5B Instruct), **LC_CHAT_MODEL_FALLBACK** (default Mistral 7B Instruct)\n"
|
| 666 |
+
"- Optional MCP: `ENABLE_MCP=1`, `MCP_SANCTIONS_URL`, `MCP_HIGH_RISK_MCC_URL`, `MCP_AUTH_HEADER`"
|
| 667 |
+
)
|
| 668 |
+
|
| 669 |
+
if __name__ == "__main__":
|
| 670 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
llm_provider.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement llm_provider.py (split from app_monolith_backup.py)"""
|
mcp.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement mcp.py (split from app_monolith_backup.py)"""
|
modules/__init__.py
ADDED
|
File without changes
|
modules/credit.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement modules/credit.py (split from app_monolith_backup.py)"""
|
modules/kyc.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement modules/kyc.py (split from app_monolith_backup.py)"""
|
modules/sanctions.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement modules/sanctions.py (split from app_monolith_backup.py)"""
|
modules/transactions.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement modules/transactions.py (split from app_monolith_backup.py)"""
|
threat_intel.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement threat_intel.py (split from app_monolith_backup.py)"""
|
tools.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement tools.py (split from app_monolith_backup.py)"""
|
ttp_guard.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement ttp_guard.py (split from app_monolith_backup.py)"""
|
validation.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""TODO: implement validation.py (split from app_monolith_backup.py)"""
|