Samuel Oberhofer
committed on
Commit
·
d7d98fc
1
Parent(s):
37c1f37
feat: Add SVNR redaction from output
Browse files- .github/workflows/push-to-hf-space.yml +1 -1
- app.py +1 -0
- database/setup_db.py +32 -2
- guards/svnr.py +27 -0
- rails/output.py +15 -0
.github/workflows/push-to-hf-space.yml
CHANGED
|
@@ -2,7 +2,7 @@ name: Push to Hugging Face Space
|
|
| 2 |
|
| 3 |
on:
|
| 4 |
push:
|
| 5 |
-
branches: [ master
|
| 6 |
workflow_dispatch:
|
| 7 |
|
| 8 |
jobs:
|
|
|
|
| 2 |
|
| 3 |
on:
|
| 4 |
push:
|
| 5 |
+
branches: [ master ] # change if you use another default branch
|
| 6 |
workflow_dispatch:
|
| 7 |
|
| 8 |
jobs:
|
app.py
CHANGED
|
@@ -77,6 +77,7 @@ def query_rag_pipeline(user_query: str, model: RAGModel, output_guardRails: Outp
|
|
| 77 |
# 4. Output Guardrails
|
| 78 |
if output_guardrails_active:
|
| 79 |
gr_result = output_guardRails.check(user_query, response, retrieved_docs)
|
|
|
|
| 80 |
|
| 81 |
else:
|
| 82 |
end_time = time.time()
|
|
|
|
| 77 |
# 4. Output Guardrails
|
| 78 |
if output_guardrails_active:
|
| 79 |
gr_result = output_guardRails.check(user_query, response, retrieved_docs)
|
| 80 |
+
response = output_guardRails.redact_svnrs(response)
|
| 81 |
|
| 82 |
else:
|
| 83 |
end_time = time.time()
|
database/setup_db.py
CHANGED
|
@@ -1,5 +1,33 @@
|
|
| 1 |
import sqlite3
|
| 2 |
from faker import Faker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def setup_database():
|
| 5 |
"""
|
|
@@ -13,7 +41,8 @@ def setup_database():
|
|
| 13 |
CREATE TABLE IF NOT EXISTS students (
|
| 14 |
id INTEGER PRIMARY KEY,
|
| 15 |
name TEXT NOT NULL,
|
| 16 |
-
email TEXT NOT NULL UNIQUE
|
|
|
|
| 17 |
)
|
| 18 |
''')
|
| 19 |
|
|
@@ -51,7 +80,8 @@ def setup_database():
|
|
| 51 |
# Add students
|
| 52 |
for _ in range(500):
|
| 53 |
try:
|
| 54 |
-
cursor.execute("INSERT INTO students (name, email) VALUES (?, ?)",
|
|
|
|
| 55 |
except sqlite3.IntegrityError:
|
| 56 |
pass
|
| 57 |
|
|
|
|
| 1 |
import sqlite3
|
| 2 |
from faker import Faker
|
| 3 |
+
import random
|
| 4 |
+
|
| 5 |
+
def generate_svnr():
    """
    Build a random 10-digit string that passes the Austrian SVNR checksum.

    Nine payload digits are drawn at random (the leading one is never 0).
    The check digit is the weighted sum of the payload digits modulo 11 and
    is inserted at index 3 of the result. Draws whose checksum is 10 cannot
    be expressed as a single digit and are discarded; the loop retries until
    a usable draw occurs.

    Returns:
        str: a 10-character digit string whose 4th character is the check digit.
    """
    weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6]
    # Index 3 belongs to the check digit itself (weight 0), so the nine
    # payload digits use the weights on either side of it.
    payload_weights = weights[:3] + weights[4:]

    while True:
        # Leading digit is drawn from 1-9 first, then the remaining eight digits.
        leading = str(random.randint(1, 9))
        trailing = "".join([str(random.randint(0, 9)) for _ in range(8)])
        main_part = leading + trailing

        partial_sum = sum(
            int(digit) * weight
            for digit, weight in zip(main_part, payload_weights)
        )
        checksum = partial_sum % 11

        # A remainder of 10 has no single-digit representation: draw again.
        if checksum >= 10:
            continue

        return main_part[:3] + str(checksum) + main_part[3:]
|
| 31 |
|
| 32 |
def setup_database():
|
| 33 |
"""
|
|
|
|
| 41 |
CREATE TABLE IF NOT EXISTS students (
|
| 42 |
id INTEGER PRIMARY KEY,
|
| 43 |
name TEXT NOT NULL,
|
| 44 |
+
email TEXT NOT NULL UNIQUE,
|
| 45 |
+
svnr TEXT NOT NULL UNIQUE
|
| 46 |
)
|
| 47 |
''')
|
| 48 |
|
|
|
|
| 80 |
# Add students
|
| 81 |
for _ in range(500):
|
| 82 |
try:
|
| 83 |
+
cursor.execute("INSERT INTO students (name, email, svnr) VALUES (?, ?, ?)",
|
| 84 |
+
(fake.name(), fake.email(), generate_svnr()))
|
| 85 |
except sqlite3.IntegrityError:
|
| 86 |
pass
|
| 87 |
|
guards/svnr.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
# Positional weights for the ten SVNR digits. Index 3 is the check digit
# itself, so its weight is 0 and it never contributes to the weighted sum.
WEIGHTS = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6]
# Exactly ten ASCII digits. Applied with fullmatch() below, because "$" on
# its own also matches just before a trailing newline.
SVNR_REGEX = re.compile(r"^[0-9]{10}$")


def is_valid_svnr(svnr: str) -> bool:
    """
    Validates an Austrian social security number (SVNR) based on its checksum.

    A value is accepted only when it is a string of exactly ten ASCII digits,
    does not start with '0', and its digit at index 3 equals the checksum
    returned by ``calculate_checksum``.

    Args:
        svnr: Candidate number without separators or surrounding whitespace.

    Returns:
        True when the candidate passes the format and checksum checks,
        False otherwise (including for non-string input).
    """
    # fullmatch() rejects inputs such as "1237010180\n", which match() with a
    # "$" anchor would let through and which would then crash in int().
    if not isinstance(svnr, str) or not SVNR_REGEX.fullmatch(svnr):
        return False
    if svnr.startswith('0'):
        return False

    # A checksum of 10 can never equal a single digit, so such numbers are
    # rejected by the comparison itself.
    return calculate_checksum(svnr) == int(svnr[3])


def calculate_checksum(svnr: str) -> int:
    """
    Calculates the checksum for a 10-digit SVNR string.

    The checksum is the modulo 11 of the sum of each digit multiplied by its
    corresponding weight in ``WEIGHTS``.

    Args:
        svnr: String of decimal digits. Digits are paired with ``WEIGHTS``
            positionally, so anything beyond the tenth character is ignored.

    Returns:
        An integer in the range 0-10.

    Raises:
        ValueError: if ``svnr`` contains a character ``int()`` cannot parse.
    """
    digits = [int(d) for d in svnr]

    weighted_sum = sum(digit * weight for digit, weight in zip(digits, WEIGHTS))

    return weighted_sum % 11
|
rails/output.py
CHANGED
|
@@ -7,6 +7,7 @@ from typing import Dict, List
|
|
| 7 |
from dataclasses import dataclass
|
| 8 |
from enum import Enum
|
| 9 |
import re
|
|
|
|
| 10 |
|
| 11 |
@dataclass
|
| 12 |
class GuardrailResult:
|
|
@@ -24,6 +25,20 @@ class GuardrailType(Enum):
|
|
| 24 |
|
| 25 |
class OutputGuardrails:
|
| 26 |
"""Guardrails for LLM output validation"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __init__(self):
|
| 29 |
|
|
|
|
| 7 |
from dataclasses import dataclass
|
| 8 |
from enum import Enum
|
| 9 |
import re
|
| 10 |
+
from guards.svnr import is_valid_svnr
|
| 11 |
|
| 12 |
@dataclass
|
| 13 |
class GuardrailResult:
|
|
|
|
| 25 |
|
| 26 |
class OutputGuardrails:
|
| 27 |
"""Guardrails for LLM output validation"""
|
| 28 |
+
|
| 29 |
+
def redact_svnrs(self, text: str) -> str:
    """
    Finds and redacts valid Austrian social security numbers (SVNRs) in the text.

    A candidate is a standalone run of exactly ten digits (delimited by word
    boundaries). Each candidate is checked with ``is_valid_svnr``; those that
    pass are replaced by the literal ``[REDACTED SVNR]`` and all other text is
    left untouched.

    Args:
        text: The text to scan, e.g. a generated model response.

    Returns:
        The text with every valid standalone SVNR replaced. Empty or missing
        input is returned unchanged.
    """
    if not text:
        return text

    def _mask(match):
        # Decide per match, so only the exact span that was validated is replaced.
        candidate = match.group(0)
        return "[REDACTED SVNR]" if is_valid_svnr(candidate) else candidate

    # A single substitution pass: unlike str.replace() on each hit, this never
    # rewrites the same digit sequence where it appears inside a longer number.
    return re.sub(r'\b\d{10}\b', _mask, text)
|
| 42 |
|
| 43 |
def __init__(self):
|
| 44 |
|