Samuel Oberhofer committed on
Commit
d7d98fc
·
1 Parent(s): 37c1f37

feat: Add SVNR redaction from output

Browse files
.github/workflows/push-to-hf-space.yml CHANGED
@@ -2,7 +2,7 @@ name: Push to Hugging Face Space
2
 
3
  on:
4
  push:
5
- branches: [ master, feature/env-secrets ] # change if you use another default branch
6
  workflow_dispatch:
7
 
8
  jobs:
 
2
 
3
  on:
4
  push:
5
+ branches: [ master ] # change if you use another default branch
6
  workflow_dispatch:
7
 
8
  jobs:
app.py CHANGED
@@ -77,6 +77,7 @@ def query_rag_pipeline(user_query: str, model: RAGModel, output_guardRails: Outp
77
  # 4. Output Guardrails
78
  if output_guardrails_active:
79
  gr_result = output_guardRails.check(user_query, response, retrieved_docs)
 
80
 
81
  else:
82
  end_time = time.time()
 
77
  # 4. Output Guardrails
78
  if output_guardrails_active:
79
  gr_result = output_guardRails.check(user_query, response, retrieved_docs)
80
+ response = output_guardRails.redact_svnrs(response)
81
 
82
  else:
83
  end_time = time.time()
database/setup_db.py CHANGED
@@ -1,5 +1,33 @@
1
  import sqlite3
2
  from faker import Faker
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def setup_database():
5
  """
@@ -13,7 +41,8 @@ def setup_database():
13
  CREATE TABLE IF NOT EXISTS students (
14
  id INTEGER PRIMARY KEY,
15
  name TEXT NOT NULL,
16
- email TEXT NOT NULL UNIQUE
 
17
  )
18
  ''')
19
 
@@ -51,7 +80,8 @@ def setup_database():
51
  # Add students
52
  for _ in range(500):
53
  try:
54
- cursor.execute("INSERT INTO students (name, email) VALUES (?, ?)", (fake.name(), fake.email()))
 
55
  except sqlite3.IntegrityError:
56
  pass
57
 
 
1
  import sqlite3
2
  from faker import Faker
3
+ import random
4
+
5
def generate_svnr():
    """
    Generates a checksum-valid Austrian social security number (SVNR).

    The result is a 10-character string of ASCII digits laid out as
    ``SSS C DDDDDD``: three leading digits (the first is never 0), the
    check digit at index 3, and six trailing digits.

    Only the checksum rule is enforced here. The six trailing digits are
    drawn at random and are not constrained to any particular format.
    NOTE(review): real SVNRs presumably encode a birth date in that part;
    confirm whether the fake data needs to honour that.

    Returns:
        str: a 10-digit SVNR whose digit at index 3 equals the weighted sum
        of the other nine digits modulo 11.
    """
    weights = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6]
    # Index 3 is the slot of the check digit itself (weight 0), so the nine
    # generated digits pair with every weight except that one.
    payload_weights = weights[:3] + weights[4:]

    while True:
        # First digit is 1-9 so the number never starts with 0; the
        # remaining eight digits are unrestricted.
        digits = [random.randint(1, 9)] + [random.randint(0, 9) for _ in range(8)]

        checksum = sum(d * w for d, w in zip(digits, payload_weights)) % 11

        # A remainder of 10 cannot be written as a single check digit, so
        # that candidate is discarded and a new one is drawn.
        if checksum < 10:
            main_part = "".join(map(str, digits))
            return main_part[:3] + str(checksum) + main_part[3:]
31
 
32
  def setup_database():
33
  """
 
41
  CREATE TABLE IF NOT EXISTS students (
42
  id INTEGER PRIMARY KEY,
43
  name TEXT NOT NULL,
44
+ email TEXT NOT NULL UNIQUE,
45
+ svnr TEXT NOT NULL UNIQUE
46
  )
47
  ''')
48
 
 
80
  # Add students
81
  for _ in range(500):
82
  try:
83
+ cursor.execute("INSERT INTO students (name, email, svnr) VALUES (?, ?, ?)",
84
+ (fake.name(), fake.email(), generate_svnr()))
85
  except sqlite3.IntegrityError:
86
  pass
87
 
guards/svnr.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
# Per-position weights for the checksum. Index 3 is the check digit's own
# position; its weight of 0 removes it from the weighted sum.
WEIGHTS = [3, 7, 9, 0, 5, 8, 4, 2, 1, 6]
SVNR_REGEX = re.compile(r"^[0-9]{10}$")


def is_valid_svnr(svnr: str) -> bool:
    """
    Validates an Austrian social security number (SVNR) based on its checksum.

    Args:
        svnr: candidate string; must be exactly ten ASCII digits with no
            surrounding whitespace or separators.

    Returns:
        True when the string is ten ASCII digits, does not start with '0',
        and the digit at index 3 equals the computed checksum; False
        otherwise (including when the checksum works out to 10, which no
        single digit can match).
    """
    # fullmatch, not match: with match(), the `$` anchor also succeeds just
    # before a trailing newline, so "1237010180\n" would pass this guard and
    # then crash calculate_checksum() on int("\n").
    if not SVNR_REGEX.fullmatch(svnr) or svnr.startswith('0'):
        return False

    checksum = calculate_checksum(svnr)
    check_digit = int(svnr[3])

    return checksum == check_digit


def calculate_checksum(svnr: str) -> int:
    """
    Calculates the checksum for a 10-digit SVNR string.

    The checksum is the modulo 11 of the sum of each digit multiplied by its
    corresponding weight in WEIGHTS. The result is in the range 0-10.

    Args:
        svnr: string of decimal digits; callers are expected to have
            validated the format already (non-digit characters raise
            ValueError from int()).

    Returns:
        The weighted digit sum modulo 11.
    """
    digits = [int(d) for d in svnr]

    weighted_sum = sum(digit * weight for digit, weight in zip(digits, WEIGHTS))

    return weighted_sum % 11
rails/output.py CHANGED
@@ -7,6 +7,7 @@ from typing import Dict, List
7
  from dataclasses import dataclass
8
  from enum import Enum
9
  import re
 
10
 
11
  @dataclass
12
  class GuardrailResult:
@@ -24,6 +25,20 @@ class GuardrailType(Enum):
24
 
25
  class OutputGuardrails:
26
  """Guardrails for LLM output validation"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  def __init__(self):
29
 
 
7
  from dataclasses import dataclass
8
  from enum import Enum
9
  import re
10
+ from guards.svnr import is_valid_svnr
11
 
12
  @dataclass
13
  class GuardrailResult:
 
25
 
26
  class OutputGuardrails:
27
  """Guardrails for LLM output validation"""
28
+
29
def redact_svnrs(self, text: str) -> str:
    """
    Finds and redacts valid Austrian social security numbers (SVNRs) in the text.

    A candidate is any run of exactly ten digits that is not directly
    preceded or followed by another digit. Each candidate is passed to
    is_valid_svnr(); those that validate are replaced with the literal
    "[REDACTED SVNR]", everything else is left untouched.

    Args:
        text: the text to scan (e.g. an LLM response).

    Returns:
        The text with every valid SVNR occurrence replaced.
    """
    def _mask(match):
        candidate = match.group(0)
        return "[REDACTED SVNR]" if is_valid_svnr(candidate) else candidate

    # Digit lookarounds instead of \b: \b treats letters and "_" as word
    # characters, so a number glued to them (e.g. "SVNR1237010180") would
    # never be detected. Substituting per match (rather than str.replace on
    # the whole text) keeps redaction tied to exactly the spans that were
    # detected and handles repeated numbers in a single pass.
    return re.sub(r'(?<!\d)\d{10}(?!\d)', _mask, text)
42
 
43
  def __init__(self):
44