jelagat committed
Commit 2a261a2 · verified · Parent(s): 99b46b9

Update app.py

Files changed (1)
  1. app.py (+29, -48)
app.py CHANGED
@@ -3,27 +3,25 @@ import gradio as gr
 from transformers import pipeline
 from langdetect import detect
 
-from transformers import pipeline
-
-# smaller multilingual toxicity model
+# ---------- Model (small, CPU-friendly) ----------
+# Binary outputs like: [{'label': 'toxic', 'score': ...}, {'label':'not_toxic', ...}]
 clf = pipeline(
     task="text-classification",
     model="citizenlab/distilbert-base-multilingual-cased-toxicity",
     top_k=None
 )
 
-# optional warm-up
+# Warm-up once so the first click isn't "dead" while weights download
 try:
     _ = clf("hello")
 except Exception:
     pass
 
-
-
-# Lightweight cues (demo only)
+# ---------- Heuristic cues (demo only) ----------
 THREAT_PATTERNS = re.compile(r"\b(kill|stab|shoot|bomb|burn|hang|slap|attack|kuua|kukuchoma|nikukate)\b", re.I)
 MINOR_PATTERNS = re.compile(r"\b(minor|under\s?age|under\s?18|child|mtoto|1[0-7]\s?yo|15yo|16yo|17yo)\b", re.I)
-# Swahili insult/profanity cues (demo only)
+
+# Swahili insult/profanity cues (non-exhaustive)
 INSULT_SW = re.compile(
     r"\b(mjinga|pumbavu|mshenzi|kumbafu|takataka|malaya|bwege|fala|mbwa|mbuzi|chizi|zezeta)\b",
     re.I,
@@ -35,8 +33,7 @@ HATE_GROUP = re.compile(
     re.I,
 )
 
-
-# Base thresholds
+# ---------- Policy thresholds/actions ----------
 BASE_POLICY = {
     "harassment_toxicity": {"label": "toxicity", "low": 0.40, "high": 0.75,
                             "actions": {"low": "limit_reach", "high": "remove_and_escalate"}},
@@ -48,31 +45,18 @@ BASE_POLICY = {
                             "actions": {"low": "escalate_child_safety_team", "high": "remove_and_notify_csirt"}},
 }
 
+# ---------- Helpers ----------
 def _scores_from_model(text: str):
     raw = clf(text)
-
-    # Normalize to a list[dict] no matter what the pipeline returns
+    # Normalize pipeline output into list[dict]
     if isinstance(raw, dict):
         outs = [raw]
     elif isinstance(raw, list):
-        if len(raw) and isinstance(raw[0], dict):
-            outs = raw  # [ {label, score}, ... ]
-        elif len(raw) and isinstance(raw[0], list):
-            outs = raw[0]  # [ [ {label, score}, ... ] ]
-        else:
-            outs = []
+        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
     else:
         outs = []
-
-    scores = {}
-    for d in outs:
-        lab = str(d.get("label", "")).lower()
-        sc = float(d.get("score", 0.0))
-        if lab:
-            scores[lab] = sc
-
-    # CitizenLab model is binary (e.g., toxic / not_toxic or similar)
-    # compute a robust toxicity score
+    scores = {str(d.get("label", "")).lower(): float(d.get("score", 0.0)) for d in outs if isinstance(d, dict)}
+    # Robust toxicity from binary labels
     if "toxic" in scores:
         toxicity = scores["toxic"]
     elif "not_toxic" in scores:
@@ -81,13 +65,11 @@ def _scores_from_model(text: str):
         toxicity = 1.0 - scores["non_toxic"]
     else:
         toxicity = max(scores.values()) if scores else 0.0
-
-    # This model doesn't output hate/threat directly; keep zeros here
+    # This model doesn't output hate/threat explicitly
     hate = 0.0
     threats = 0.0
     return toxicity, hate, threats
 
-
 def score_labels(text: str):
     try:
         lang = detect(text)
@@ -97,7 +79,8 @@
     tox, hate, thr_model = _scores_from_model(text)
     thr_rule = 0.9 if THREAT_PATTERNS.search(text or "") else 0.0
     minors = 0.9 if MINOR_PATTERNS.search(text or "") else 0.01
-probs = {
+
+    probs = {
         "toxicity": float(tox),
         "hate_targeted": float(hate),
         "threats": float(max(thr_model, thr_rule)),
@@ -105,21 +88,20 @@ probs = {
         "misinfo_flag": 0.0
     }
 
-# --- lexical boosts for Swahili/code-switch (demo heuristic) ---
-if INSULT_SW.search(text or ""):
-    probs["toxicity"] = max(probs["toxicity"], 0.75)
+    # --- lexical boosts for Swahili/code-switch (demo heuristic) ---
+    if INSULT_SW.search(text or ""):
+        probs["toxicity"] = max(probs["toxicity"], 0.75)
 
-# If toxic AND a group term appears → likely hate-targeted
-if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
-    probs["hate_targeted"] = max(
-        probs.get("hate_targeted", 0.0),
-        min(0.2 + 0.8 * probs["toxicity"], 0.95)
-    )
+    # If toxic AND a group term appears → likely hate-targeted
+    if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
+        probs["hate_targeted"] = max(
+            probs.get("hate_targeted", 0.0),
+            min(0.2 + 0.8 * probs["toxicity"], 0.95)
+        )
 
     return lang, probs
 
 def scaled_policy(scale: float):
-    # scale thresholds (e.g., 1.2 = stricter, 0.8 = looser)
     pol = {}
     for sec, spec in BASE_POLICY.items():
         low = min(round(spec["low"] * scale, 3), 0.95)
@@ -140,7 +122,6 @@ def route_policy(probs: dict, policy: dict):
     if not decisions:
         return {"action": "allow", "reason": "no policy thresholds met", "decisions": []}
     best = max(decisions, key=lambda d: d["score"])
-    # human-readable reason
    label_map = {
         "toxicity": "harassment/toxicity",
         "hate_targeted": "hate speech",
@@ -158,6 +139,7 @@ def moderate(text: str, strictness: float):
     scores_view = {"language": lang, **{k: round(v,3) for k,v in probs.items()}}
     return scores_view, decision
 
+# ---------- UI ----------
 examples = [
     ["Wewe ni mjinga kabisa", 1.0],
     ["Hawa watu wanastahili kuondolewa kabisa", 1.0],
@@ -168,10 +150,9 @@ examples = [
 
 with gr.Blocks(analytics_enabled=False) as demo:
     gr.Markdown("# Bilingual Moderation (Swahili + English) · Policy-aware Routing")
-    gr.Markdown("Single multilingual classifier (toxicity/hate/threat) + simple cues for child-safety → policy action. "
-                "Use **Strictness** to tune thresholds.")
-    with gr.Row():
-        inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
+    gr.Markdown("Single multilingual classifier (toxicity) + simple cues for threats/child-safety, "
+                "with **policy-aware routing**. Use **Strictness** to tune thresholds.")
+    inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
     strict = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Strictness (threshold scale)")
     btn = gr.Button("Moderate", variant="primary")
     scores = gr.JSON(label="Scores (probabilities)")
@@ -179,7 +160,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
 
     inp.submit(moderate, inputs=[inp, strict], outputs=[scores, decision])
     gr.Examples(examples=examples, inputs=[inp, strict], fn=moderate, outputs=[scores, decision],
-                label="Try examples (auto-run)",cache_examples=False)
+                label="Try examples (auto-run)", cache_examples=False)
     btn.click(moderate, inputs=[inp, strict], outputs=[scores, decision])
 
 demo.queue().launch()
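
The main behavioural change above is the compact output normalization in _scores_from_model. The sketch below is a minimal, standalone way to check that logic without downloading the model: normalize is a hypothetical stand-in mirroring the new code, and the sample payloads only imitate the shapes a text-classification pipeline may return (they are not real model output).

# Standalone sketch of the normalization introduced in this commit (no transformers needed).
# `normalize` is a hypothetical helper copied from the new _scores_from_model body;
# the payloads below are illustrative stand-ins for possible pipeline return shapes.

def normalize(raw):
    # Collapse dict / list[dict] / list[list[dict]] into a flat {label: score} mapping
    if isinstance(raw, dict):
        outs = [raw]
    elif isinstance(raw, list):
        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
    else:
        outs = []
    return {str(d.get("label", "")).lower(): float(d.get("score", 0.0))
            for d in outs if isinstance(d, dict)}

nested = [[{"label": "toxic", "score": 0.91}, {"label": "not_toxic", "score": 0.09}]]
flat   = [{"label": "toxic", "score": 0.91}, {"label": "not_toxic", "score": 0.09}]
single = {"label": "toxic", "score": 0.91}

for payload in (nested, flat, single):
    print(normalize(payload))
# -> {'toxic': 0.91, 'not_toxic': 0.09} for the first two, {'toxic': 0.91} for the last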