Commit · 02f41b6
Parent(s): f3bad0e

Fix duplicate key error with session state

1 file changed: deid_pipeline.py (+9 -33)
deid_pipeline.py CHANGED

@@ -1,6 +1,6 @@
+# deid_pipeline.py
 import json
 import os
-from pathlib import Path
 from dataclasses import dataclass
 from typing import List, Dict, Any, Tuple
 
@@ -25,16 +25,7 @@ analyzer_config = {
 
 # NLP for optional section detection
 import spacy
-
-# If using medspacy, uncomment (preferred for clinical):
-# import medspacy
-# from medspacy.sectionizer import Sectionizer
-
-# If not using medspacy, optional lightweight section tagging:
-# We'll use regex on common headers as a fallback
 import re
-
-# Encryption
 from cryptography.fernet import Fernet
 
 @dataclass
@@ -46,7 +37,6 @@ class PHISpan:
     section: str
 
 SECTION_HEADERS = [
-    # Common clinical sections; customize as needed
     "HPI", "History of Present Illness",
     "PMH", "Past Medical History",
     "Medications", "Allergies",
@@ -73,17 +63,18 @@ class DeidPipeline:
     """
     De-identification pipeline using Microsoft Presidio
     """
-    def __init__(self,
+    def __init__(self, secure_dir="./secure_store"):
         """
         Initialize de-identification pipeline with Presidio
 
         Args:
-
+            secure_dir: Directory path to store encryption key (NOT the key file path)
         """
-
-
-
-
+        # Ensure secure_dir exists
+        os.makedirs(secure_dir, exist_ok=True)
+
+        # Build full path to key file
+        fernet_key_path = os.path.join(secure_dir, "fernet.key")
 
         # Initialize encryption
         try:
@@ -96,7 +87,6 @@ class DeidPipeline:
             key = Fernet.generate_key()
             # Try to save it (might fail on read-only filesystems)
             try:
-                os.makedirs(os.path.dirname(fernet_key_path), exist_ok=True)
                 with open(fernet_key_path, "wb") as f:
                     f.write(key)
             except (PermissionError, OSError):
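Taken together, the two hunks above move key handling from a caller-supplied key file path to a directory owned by the pipeline, created once in __init__. A minimal standalone sketch of the resulting load-or-create flow, assuming (the surrounding lines are not in this diff) that the outer try loads an existing key and falls back to generating one:

import os
from cryptography.fernet import Fernet

def load_or_create_key(secure_dir: str = "./secure_store") -> bytes:
    # Layout matching the patch: a single key file inside secure_dir.
    os.makedirs(secure_dir, exist_ok=True)
    fernet_key_path = os.path.join(secure_dir, "fernet.key")
    try:
        with open(fernet_key_path, "rb") as f:
            return f.read()
    except FileNotFoundError:
        key = Fernet.generate_key()
        try:
            with open(fernet_key_path, "wb") as f:
                f.write(key)
        except (PermissionError, OSError):
            pass  # read-only filesystem: keep the key in memory for this run
        return key

Creating the directory up front is also why the per-save os.makedirs call in the second hunk can be deleted.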
@@ -129,11 +119,9 @@
         Lightweight section finder:
         Return list of (section_title, start_idx, end_idx_of_section_block)
         """
-        # Find headers by regex, map their start positions
         headers = []
         for m in SECTION_PATTERN.finditer(text):
             headers.append((m.group("header"), m.start()))
-        # Add end sentinel
        headers.append(("[END]", len(text)))
 
         sections = []
@@ -142,7 +130,6 @@
             next_title, next_pos = headers[i+1]
             sections.append((title.strip(), start_pos, next_pos))
         if not sections:
-            # Single default section if none found
             sections = [("DOCUMENT", 0, len(text))]
         return sections
 
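SECTION_PATTERN itself is defined outside these hunks; only its named group `header` is visible above. A plausible construction from SECTION_HEADERS, offered purely as an assumption:

import re

SECTION_HEADERS = [
    "HPI", "History of Present Illness",
    "PMH", "Past Medical History",
    "Medications", "Allergies",
]

# Hypothetical pattern: a known header at the start of a line, followed by a colon.
SECTION_PATTERN = re.compile(
    r"^(?P<header>" + "|".join(re.escape(h) for h in SECTION_HEADERS) + r")\s*:",
    re.IGNORECASE | re.MULTILINE,
)

text = "HPI: 62-year-old male...\nMedications: lisinopril"
print([(m.group("header"), m.start()) for m in SECTION_PATTERN.finditer(text)])
# [('HPI', 0), ('Medications', 25)]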
@@ -153,9 +140,7 @@
         return "DOCUMENT"
 
     def analyze(self, text: str) -> List[Dict[str, Any]]:
-        # Detect entities
         results = self.analyzer.analyze(text=text, language="en")
-        # Convert to dict for consistency
         detections = []
         for r in results:
             detections.append({
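The dict literal is truncated here, but the fields consumed later (entity_type, start, end) mirror Presidio's RecognizerResult, so the conversion is presumably along these lines (analyzer configuration not shown in this diff):

from presidio_analyzer import AnalyzerEngine

analyzer = AnalyzerEngine()  # stand-in for self.analyzer; real config lives elsewhere
results = analyzer.analyze(text="Contact John Smith on 555-0100.", language="en")
detections = [
    {"entity_type": r.entity_type, "start": r.start, "end": r.end, "score": r.score}
    for r in results
]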
@@ -170,10 +155,8 @@
         """
         Replace spans with tags safely (right-to-left to maintain indices).
         """
-        # Determine sections for context
         sections = self._detect_sections(text)
 
-        # Build PHI span records
         spans: List[PHISpan] = []
         for d in detections:
             entity = d["entity_type"]
@@ -183,7 +166,6 @@
             section = self._find_section_for_span(sections, start)
             spans.append(PHISpan(entity_type=entity, start=start, end=end, text=original, section=section))
 
-        # Replace from the end to avoid index shifting
         masked = text
         for d in sorted(detections, key=lambda x: x["start"], reverse=True):
             entity = d["entity_type"]
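The replacement order is the load-bearing detail in mask: applying substitutions right-to-left means earlier offsets never shift. A self-contained sketch (the tag format is an assumption; the diff does not show it):

from typing import Any, Dict, List

def mask_spans(text: str, detections: List[Dict[str, Any]]) -> str:
    # Sort by start descending so each splice leaves earlier offsets intact.
    masked = text
    for d in sorted(detections, key=lambda x: x["start"], reverse=True):
        tag = f"<{d['entity_type']}>"  # assumed tag format
        masked = masked[:d["start"]] + tag + masked[d["end"]:]
    return masked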
@@ -207,7 +189,6 @@
         detections = self.analyze(text)
         masked, spans = self.mask(text, detections)
 
-        # Encrypt span map
         token = self.encrypt_span_map(
             spans=spans,
             meta={"note_id": note_id}
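encrypt_span_map's body is outside this diff; given the Fernet import and the binary .spanmap.enc output below, it plausibly serializes the spans to JSON and encrypts the bytes. A sketch under those assumptions:

import json
from dataclasses import asdict
from typing import Any, Dict, List

from cryptography.fernet import Fernet

def encrypt_span_map(fernet: Fernet, spans: List[Any], meta: Dict[str, Any]) -> bytes:
    # Serialize the PHISpan dataclasses plus metadata, then encrypt the JSON bytes.
    payload = json.dumps({"spans": [asdict(s) for s in spans], "meta": meta})
    return fernet.encrypt(payload.encode("utf-8"))

Recovery is symmetric: fernet.decrypt(token) returns the original JSON bytes.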
@@ -219,19 +200,16 @@
         }
 
 def _read_text_with_fallback(path: str) -> str:
-    # 1) Try UTF-8 (preferred for cross-platform)
     try:
         with open(path, "r", encoding="utf-8") as f:
             return f.read()
     except UnicodeDecodeError:
         pass
-    # 2) Try Windows-1252 (common for Notepad/docx copy-paste on Windows)
     try:
         with open(path, "r", encoding="cp1252") as f:
             return f.read()
     except UnicodeDecodeError:
         pass
-    # 3) Last resort: decode with replacement to avoid crashing; preserves structure
     with open(path, "r", encoding="utf-8", errors="replace") as f:
         return f.read()
 
@@ -242,15 +220,13 @@ def run_file(input_path: str, outputs_dir: str = "data/outputs", secure_dir: str
     note_id = os.path.splitext(os.path.basename(input_path))[0]
     text = _read_text_with_fallback(input_path)
 
-    pipeline = DeidPipeline()
+    pipeline = DeidPipeline(secure_dir)
     result = pipeline.run_on_text(text, note_id=note_id)
 
-    # Save masked text normalized to UTF-8
     out_txt = os.path.join(outputs_dir, f"{note_id}.deid.txt")
     with open(out_txt, "w", encoding="utf-8", newline="\n") as f:
         f.write(result["masked_text"])
 
-    # Save encrypted span map (binary)
     out_bin = os.path.join(secure_dir, f"{note_id}.spanmap.enc")
     with open(out_bin, "wb") as f:
         f.write(result["encrypted_span_map"])
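End to end, a call looks like this (the secure_dir default is truncated in the hunk header above, so it is passed explicitly; paths are illustrative):

run_file("data/inputs/note_001.txt",
         outputs_dir="data/outputs",
         secure_dir="./secure_store")
# writes data/outputs/note_001.deid.txt      (masked text, UTF-8)
# writes ./secure_store/note_001.spanmap.enc (encrypted span map)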