Update app.py
app.py
CHANGED
@@ -3,27 +3,25 @@ import gradio as gr
 from transformers import pipeline
 from langdetect import detect
 
-
-
-# smaller multilingual toxicity model
+# ---------- Model (small, CPU-friendly) ----------
+# Binary outputs like: [{'label': 'toxic', 'score': ...}, {'label':'not_toxic', ...}]
 clf = pipeline(
     task="text-classification",
     model="citizenlab/distilbert-base-multilingual-cased-toxicity",
     top_k=None
 )
 
-#
+# Warm-up once so the first click isn't "dead" while weights download
 try:
     _ = clf("hello")
 except Exception:
     pass
 
-
-
-# Lightweight cues (demo only)
+# ---------- Heuristic cues (demo only) ----------
 THREAT_PATTERNS = re.compile(r"\b(kill|stab|shoot|bomb|burn|hang|slap|attack|kuua|kukuchoma|nikukate)\b", re.I)
 MINOR_PATTERNS = re.compile(r"\b(minor|under\s?age|under\s?18|child|mtoto|1[0-7]\s?yo|15yo|16yo|17yo)\b", re.I)
-
+
+# Swahili insult/profanity cues (non-exhaustive)
 INSULT_SW = re.compile(
     r"\b(mjinga|pumbavu|mshenzi|kumbafu|takataka|malaya|bwege|fala|mbwa|mbuzi|chizi|zezeta)\b",
     re.I,
@@ -35,8 +33,7 @@ HATE_GROUP = re.compile(
     re.I,
 )
 
-
-# Base thresholds
+# ---------- Policy thresholds/actions ----------
 BASE_POLICY = {
     "harassment_toxicity": {"label": "toxicity", "low": 0.40, "high": 0.75,
                             "actions": {"low": "limit_reach", "high": "remove_and_escalate"}},
@@ -48,31 +45,18 @@ BASE_POLICY = {
                             "actions": {"low": "escalate_child_safety_team", "high": "remove_and_notify_csirt"}},
 }
 
+# ---------- Helpers ----------
 def _scores_from_model(text: str):
     raw = clf(text)
-
-    # Normalize to a list[dict] no matter what the pipeline returns
+    # Normalize pipeline output into list[dict]
     if isinstance(raw, dict):
         outs = [raw]
     elif isinstance(raw, list):
-        if len(raw) and isinstance(raw[0], dict):
-            outs = raw # [ {label, score}, ... ]
-        elif len(raw) and isinstance(raw[0], list):
-            outs = raw[0] # [ [ {label, score}, ... ] ]
-        else:
-            outs = []
+        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
     else:
         outs = []
-
-
-    for d in outs:
-        lab = str(d.get("label", "")).lower()
-        sc = float(d.get("score", 0.0))
-        if lab:
-            scores[lab] = sc
-
-    # CitizenLab model is binary (e.g., toxic / not_toxic or similar)
-    # compute a robust toxicity score
+    scores = {str(d.get("label", "")).lower(): float(d.get("score", 0.0)) for d in outs if isinstance(d, dict)}
+    # Robust toxicity from binary labels
     if "toxic" in scores:
         toxicity = scores["toxic"]
     elif "not_toxic" in scores:
@@ -81,13 +65,11 @@ def _scores_from_model(text: str):
         toxicity = 1.0 - scores["non_toxic"]
     else:
         toxicity = max(scores.values()) if scores else 0.0
-
-    # This model doesn't output hate/threat directly; keep zeros here
+    # This model doesn't output hate/threat explicitly
     hate = 0.0
     threats = 0.0
     return toxicity, hate, threats
 
-
 def score_labels(text: str):
     try:
         lang = detect(text)
@@ -97,7 +79,8 @@ def score_labels(text: str):
     tox, hate, thr_model = _scores_from_model(text)
     thr_rule = 0.9 if THREAT_PATTERNS.search(text or "") else 0.0
     minors = 0.9 if MINOR_PATTERNS.search(text or "") else 0.01
-
+
+    probs = {
         "toxicity": float(tox),
         "hate_targeted": float(hate),
         "threats": float(max(thr_model, thr_rule)),
@@ -105,21 +88,20 @@ probs = {
         "misinfo_flag": 0.0
     }
 
-    # --- lexical boosts for Swahili/code-switch (demo heuristic) ---
-    if INSULT_SW.search(text or ""):
-
+    # --- lexical boosts for Swahili/code-switch (demo heuristic) ---
+    if INSULT_SW.search(text or ""):
+        probs["toxicity"] = max(probs["toxicity"], 0.75)
 
-    # If toxic AND a group term appears → likely hate-targeted
-    if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
-
-
-
-
+    # If toxic AND a group term appears → likely hate-targeted
+    if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
+        probs["hate_targeted"] = max(
+            probs.get("hate_targeted", 0.0),
+            min(0.2 + 0.8 * probs["toxicity"], 0.95)
+        )
 
     return lang, probs
 
 def scaled_policy(scale: float):
-    # scale thresholds (e.g., 1.2 = stricter, 0.8 = looser)
     pol = {}
     for sec, spec in BASE_POLICY.items():
         low = min(round(spec["low"] * scale, 3), 0.95)
@@ -140,7 +122,6 @@ def route_policy(probs: dict, policy: dict):
     if not decisions:
         return {"action": "allow", "reason": "no policy thresholds met", "decisions": []}
     best = max(decisions, key=lambda d: d["score"])
-    # human-readable reason
     label_map = {
         "toxicity": "harassment/toxicity",
        "hate_targeted": "hate speech",
@@ -158,6 +139,7 @@ def moderate(text: str, strictness: float):
     scores_view = {"language": lang, **{k: round(v,3) for k,v in probs.items()}}
     return scores_view, decision
 
+# ---------- UI ----------
 examples = [
     ["Wewe ni mjinga kabisa", 1.0],
     ["Hawa watu wanastahili kuondolewa kabisa", 1.0],
@@ -168,10 +150,9 @@ examples = [
 
 with gr.Blocks(analytics_enabled=False) as demo:
     gr.Markdown("# Bilingual Moderation (Swahili + English) · Policy-aware Routing")
-    gr.Markdown("Single multilingual classifier (toxicity
-                "Use **Strictness** to tune thresholds.")
-
-    inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
+    gr.Markdown("Single multilingual classifier (toxicity) + simple cues for threats/child-safety, "
+                "with **policy-aware routing**. Use **Strictness** to tune thresholds.")
+    inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
     strict = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Strictness (threshold scale)")
     btn = gr.Button("Moderate", variant="primary")
     scores = gr.JSON(label="Scores (probabilities)")
@@ -179,7 +160,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
 
     inp.submit(moderate, inputs=[inp, strict], outputs=[scores, decision])
     gr.Examples(examples=examples, inputs=[inp, strict], fn=moderate, outputs=[scores, decision],
-                label="Try examples (auto-run)",cache_examples=False)
+                label="Try examples (auto-run)", cache_examples=False)
     btn.click(moderate, inputs=[inp, strict], outputs=[scores, decision])
 
 demo.queue().launch()
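
For reference, a minimal, self-contained sketch of the output normalization that _scores_from_model performs, run against a hand-written stand-in for the pipeline result. The normalize_scores name and the shape of fake_raw are assumptions for illustration; only the normalization logic mirrors the diff above.

# Sketch only: fake_raw imitates what the pipeline might return with top_k=None
from typing import Any, Dict, List, Tuple

def normalize_scores(raw: Any) -> Tuple[float, float, float]:
    # Flatten dict / list[dict] / list[list[dict]] into a single list[dict]
    if isinstance(raw, dict):
        outs: List[dict] = [raw]
    elif isinstance(raw, list):
        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
    else:
        outs = []
    scores: Dict[str, float] = {
        str(d.get("label", "")).lower(): float(d.get("score", 0.0))
        for d in outs
        if isinstance(d, dict)
    }
    # Binary toxicity model: use "toxic" directly, otherwise invert the negative label
    if "toxic" in scores:
        toxicity = scores["toxic"]
    elif "not_toxic" in scores:
        toxicity = 1.0 - scores["not_toxic"]
    elif "non_toxic" in scores:
        toxicity = 1.0 - scores["non_toxic"]
    else:
        toxicity = max(scores.values()) if scores else 0.0
    # Hate/threat scores are not produced by this model; the app derives them from regex cues
    return toxicity, 0.0, 0.0

# Assumed output shape: one inner list of label/score dicts per input text
fake_raw = [[{"label": "toxic", "score": 0.83}, {"label": "not_toxic", "score": 0.17}]]
print(normalize_scores(fake_raw))  # (0.83, 0.0, 0.0)

With top_k=None the model is expected to return both labels with their probabilities, which is why the inverted not_toxic/non_toxic branches exist.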
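
Similarly, a rough sketch of the threshold scaling behind the Strictness slider and the policy routing it feeds. The diff shows only fragments of scaled_policy and route_policy, so the cap on "high", the routing loop, and the reason string below are assumptions rather than the app's exact code.

# Sketch only: one policy section from the diff; route_policy body is an approximation
BASE_POLICY = {
    "harassment_toxicity": {"label": "toxicity", "low": 0.40, "high": 0.75,
                            "actions": {"low": "limit_reach", "high": "remove_and_escalate"}},
}

def scaled_policy(scale: float) -> dict:
    # Multiply each threshold by the slider value; cap so it stays below 1.0
    pol = {}
    for sec, spec in BASE_POLICY.items():
        low = min(round(spec["low"] * scale, 3), 0.95)    # shown in the diff
        high = min(round(spec["high"] * scale, 3), 0.97)  # cap value assumed
        pol[sec] = {**spec, "low": low, "high": high}
    return pol

def route_policy(probs: dict, policy: dict) -> dict:
    # Collect every section whose score crosses a threshold, then act on the highest score
    decisions = []
    for sec, spec in policy.items():
        score = probs.get(spec["label"], 0.0)
        if score >= spec["high"]:
            decisions.append({"section": sec, "score": score, "action": spec["actions"]["high"]})
        elif score >= spec["low"]:
            decisions.append({"section": sec, "score": score, "action": spec["actions"]["low"]})
    if not decisions:
        return {"action": "allow", "reason": "no policy thresholds met", "decisions": []}
    best = max(decisions, key=lambda d: d["score"])
    return {"action": best["action"],
            "reason": f"{best['section']} score {best['score']:.2f}",
            "decisions": decisions}

print(route_policy({"toxicity": 0.80}, scaled_policy(1.0)))
# -> remove_and_escalate, since 0.80 >= the scaled 0.75 "high" threshold

Because the slider multiplies the thresholds, values above 1.0 raise the bar a score must clear before any action fires, and values below 1.0 lower it.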