jelagat committed
Commit 2a261a2 · verified · Parent(s): 99b46b9

Update app.py

Files changed (1)
  1. app.py (+29, -48)
app.py CHANGED
@@ -3,27 +3,25 @@ import gradio as gr
 from transformers import pipeline
 from langdetect import detect
 
-from transformers import pipeline
-
-# smaller multilingual toxicity model
+# ---------- Model (small, CPU-friendly) ----------
+# Binary outputs like: [{'label': 'toxic', 'score': ...}, {'label':'not_toxic', ...}]
 clf = pipeline(
     task="text-classification",
     model="citizenlab/distilbert-base-multilingual-cased-toxicity",
     top_k=None
 )
 
-# optional warm-up
+# Warm-up once so the first click isn't "dead" while weights download
 try:
     _ = clf("hello")
 except Exception:
     pass
 
-
-
-# Lightweight cues (demo only)
+# ---------- Heuristic cues (demo only) ----------
 THREAT_PATTERNS = re.compile(r"\b(kill|stab|shoot|bomb|burn|hang|slap|attack|kuua|kukuchoma|nikukate)\b", re.I)
 MINOR_PATTERNS = re.compile(r"\b(minor|under\s?age|under\s?18|child|mtoto|1[0-7]\s?yo|15yo|16yo|17yo)\b", re.I)
-# Swahili insult/profanity cues (demo only)
+
+# Swahili insult/profanity cues (non-exhaustive)
 INSULT_SW = re.compile(
     r"\b(mjinga|pumbavu|mshenzi|kumbafu|takataka|malaya|bwege|fala|mbwa|mbuzi|chizi|zezeta)\b",
     re.I,
@@ -35,8 +33,7 @@ HATE_GROUP = re.compile(
     re.I,
 )
 
-
-# Base thresholds
+# ---------- Policy thresholds/actions ----------
 BASE_POLICY = {
     "harassment_toxicity": {"label": "toxicity", "low": 0.40, "high": 0.75,
                             "actions": {"low": "limit_reach", "high": "remove_and_escalate"}},
@@ -48,31 +45,18 @@ BASE_POLICY = {
                             "actions": {"low": "escalate_child_safety_team", "high": "remove_and_notify_csirt"}},
 }
 
+# ---------- Helpers ----------
 def _scores_from_model(text: str):
     raw = clf(text)
-
-    # Normalize to a list[dict] no matter what the pipeline returns
+    # Normalize pipeline output into list[dict]
     if isinstance(raw, dict):
         outs = [raw]
     elif isinstance(raw, list):
-        if len(raw) and isinstance(raw[0], dict):
-            outs = raw  # [ {label, score}, ... ]
-        elif len(raw) and isinstance(raw[0], list):
-            outs = raw[0]  # [ [ {label, score}, ... ] ]
-        else:
-            outs = []
+        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
     else:
         outs = []
-
-    scores = {}
-    for d in outs:
-        lab = str(d.get("label", "")).lower()
-        sc = float(d.get("score", 0.0))
-        if lab:
-            scores[lab] = sc
-
-    # CitizenLab model is binary (e.g., toxic / not_toxic or similar)
-    # compute a robust toxicity score
+    scores = {str(d.get("label", "")).lower(): float(d.get("score", 0.0)) for d in outs if isinstance(d, dict)}
+    # Robust toxicity from binary labels
     if "toxic" in scores:
         toxicity = scores["toxic"]
     elif "not_toxic" in scores:
@@ -81,13 +65,11 @@ def _scores_from_model(text: str):
         toxicity = 1.0 - scores["non_toxic"]
     else:
         toxicity = max(scores.values()) if scores else 0.0
-
-    # This model doesn't output hate/threat directly; keep zeros here
+    # This model doesn't output hate/threat explicitly
     hate = 0.0
     threats = 0.0
     return toxicity, hate, threats
 
-
 def score_labels(text: str):
     try:
         lang = detect(text)
@@ -97,7 +79,8 @@
     tox, hate, thr_model = _scores_from_model(text)
     thr_rule = 0.9 if THREAT_PATTERNS.search(text or "") else 0.0
     minors = 0.9 if MINOR_PATTERNS.search(text or "") else 0.01
-probs = {
+
+    probs = {
         "toxicity": float(tox),
         "hate_targeted": float(hate),
         "threats": float(max(thr_model, thr_rule)),
@@ -105,21 +88,20 @@ probs = {
         "misinfo_flag": 0.0
     }
 
-# --- lexical boosts for Swahili/code-switch (demo heuristic) ---
-if INSULT_SW.search(text or ""):
-    probs["toxicity"] = max(probs["toxicity"], 0.75)
+    # --- lexical boosts for Swahili/code-switch (demo heuristic) ---
+    if INSULT_SW.search(text or ""):
+        probs["toxicity"] = max(probs["toxicity"], 0.75)
 
-# If toxic AND a group term appears → likely hate-targeted
-if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
-    probs["hate_targeted"] = max(
-        probs.get("hate_targeted", 0.0),
-        min(0.2 + 0.8 * probs["toxicity"], 0.95)
-    )
+    # If toxic AND a group term appears → likely hate-targeted
+    if probs["toxicity"] >= 0.35 and HATE_GROUP.search(text or ""):
+        probs["hate_targeted"] = max(
+            probs.get("hate_targeted", 0.0),
+            min(0.2 + 0.8 * probs["toxicity"], 0.95)
+        )
 
     return lang, probs
 
 def scaled_policy(scale: float):
-    # scale thresholds (e.g., 1.2 = stricter, 0.8 = looser)
     pol = {}
     for sec, spec in BASE_POLICY.items():
         low = min(round(spec["low"] * scale, 3), 0.95)
@@ -140,7 +122,6 @@ def route_policy(probs: dict, policy: dict):
     if not decisions:
         return {"action": "allow", "reason": "no policy thresholds met", "decisions": []}
     best = max(decisions, key=lambda d: d["score"])
-    # human-readable reason
    label_map = {
         "toxicity": "harassment/toxicity",
         "hate_targeted": "hate speech",
@@ -158,6 +139,7 @@ def moderate(text: str, strictness: float):
     scores_view = {"language": lang, **{k: round(v,3) for k,v in probs.items()}}
     return scores_view, decision
 
+# ---------- UI ----------
 examples = [
     ["Wewe ni mjinga kabisa", 1.0],
     ["Hawa watu wanastahili kuondolewa kabisa", 1.0],
@@ -168,10 +150,9 @@ examples = [
 
 with gr.Blocks(analytics_enabled=False) as demo:
     gr.Markdown("# Bilingual Moderation (Swahili + English) · Policy-aware Routing")
-    gr.Markdown("Single multilingual classifier (toxicity/hate/threat) + simple cues for child-safety → policy action. "
-                "Use **Strictness** to tune thresholds.")
-    with gr.Row():
-        inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
+    gr.Markdown("Single multilingual classifier (toxicity) + simple cues for threats/child-safety, "
+                "with **policy-aware routing**. Use **Strictness** to tune thresholds.")
+    inp = gr.Textbox(label="Text", lines=4, placeholder="Andika / Type here…")
     strict = gr.Slider(0.7, 1.3, value=1.0, step=0.05, label="Strictness (threshold scale)")
     btn = gr.Button("Moderate", variant="primary")
     scores = gr.JSON(label="Scores (probabilities)")
@@ -179,7 +160,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
 
     inp.submit(moderate, inputs=[inp, strict], outputs=[scores, decision])
     gr.Examples(examples=examples, inputs=[inp, strict], fn=moderate, outputs=[scores, decision],
-                label="Try examples (auto-run)",cache_examples=False)
+                label="Try examples (auto-run)", cache_examples=False)
     btn.click(moderate, inputs=[inp, strict], outputs=[scores, decision])
 
 demo.queue().launch()
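
The main behavioural change above is the compact output normalization in _scores_from_model. The sketch below is a minimal, standalone way to check that logic without downloading the model: normalize is a hypothetical stand-in mirroring the new code, and the sample payloads only imitate the shapes a text-classification pipeline may return (they are not real model output).

# Standalone sketch of the normalization introduced in this commit (no transformers needed).
# `normalize` is a hypothetical helper copied from the new _scores_from_model body;
# the payloads below are illustrative stand-ins for possible pipeline return shapes.

def normalize(raw):
    # Collapse dict / list[dict] / list[list[dict]] into a flat {label: score} mapping
    if isinstance(raw, dict):
        outs = [raw]
    elif isinstance(raw, list):
        outs = raw[0] if (len(raw) and isinstance(raw[0], list)) else raw
    else:
        outs = []
    return {str(d.get("label", "")).lower(): float(d.get("score", 0.0))
            for d in outs if isinstance(d, dict)}

nested = [[{"label": "toxic", "score": 0.91}, {"label": "not_toxic", "score": 0.09}]]
flat   = [{"label": "toxic", "score": 0.91}, {"label": "not_toxic", "score": 0.09}]
single = {"label": "toxic", "score": 0.91}

for payload in (nested, flat, single):
    print(normalize(payload))
# -> {'toxic': 0.91, 'not_toxic': 0.09} for the first two, {'toxic': 0.91} for the last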