File size: 7,317 Bytes
efd6737
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
"""Query normalizer for medical terminology, typos, and abbreviations."""

import re
from dataclasses import dataclass
from typing import Dict, List, Set, Optional
from rapidfuzz import process, fuzz
import logging

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches runs of ASCII letters and hyphens.
# NOTE(review): not referenced by any code visible in this part of the file -
# confirm whether it is still needed.
WORD_RE = re.compile(r"[a-zA-Z\-]+")

@dataclass
class QueryNormalizer:
    """Normalize medical queries by expanding synonyms and fixing typos.

    The pipeline (see ``normalize``) first rewrites known synonyms and
    abbreviations to their canonical term, then fuzzy-corrects the remaining
    tokens against ``vocab``.
    """

    vocab: Set[str]                 # lowercase lexicon used for fuzzy matching
    synonyms: Dict[str, List[str]]  # canonical -> [synonyms]
    min_fuzzy: int = 85             # minimum fuzzy match score (0-100)

    def expand_synonyms(self, query: str) -> str:
        """Replace synonyms/abbreviations with their canonical term.

        The query is lowercased, every known form is replaced in a single
        left-to-right pass, and the result is stripped of surrounding
        whitespace.

        Args:
            query: Raw user query.

        Returns:
            The lowercased query with each synonym replaced by its canonical
            term.
        """
        lowered = query.lower()

        # Map every surface form to its canonical term. Canonical terms are
        # registered first (mapping to themselves) so that a canonical phrase
        # already present in the query is consumed whole and its sub-phrases
        # are not expanded a second time. When a form is listed under several
        # canonical terms, the first one in dict order wins.
        form_to_canon: Dict[str, str] = {}
        for canon in self.synonyms:
            if canon:
                form_to_canon.setdefault(canon.lower(), canon)
        for canon, syns in self.synonyms.items():
            for form in syns:
                if form:  # an empty form would match at every position
                    form_to_canon.setdefault(form.lower(), canon)

        if not form_to_canon:
            return lowered.strip()

        # One combined alternation, longest form first across ALL groups, so
        # the longest phrase wins at each position. A single pass means text
        # produced by a replacement is never scanned again, which prevents
        # cascades such as "pdt" -> "photodynamic therapy" -> "... management".
        alternation = "|".join(
            re.escape(form)
            for form in sorted(form_to_canon, key=len, reverse=True)
        )
        pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)")

        # A function replacement inserts the canonical term literally (no
        # backslash/group-reference interpretation).
        expanded = pattern.sub(lambda m: form_to_canon[m.group(0)], lowered)
        return expanded.strip()

    def fuzzy_fix_tokens(self, query: str) -> str:
        """Fix typos by fuzzy-matching each token against ``vocab``.

        Tokens are produced by whitespace splitting. A token whose lowercase
        form is in ``vocab`` is kept unchanged; otherwise it is replaced by
        the best ``fuzz.ratio`` match scoring at least ``min_fuzzy``, or kept
        unchanged when no candidate reaches that score.

        Args:
            query: Query text (typically already synonym-expanded).

        Returns:
            The tokens joined by single spaces.
        """
        words = query.split()

        # Nothing to match against: only whitespace gets normalized.
        if not self.vocab:
            return " ".join(words)

        fixed = []
        for word in words:
            word_lower = word.lower()

            # Exact hit: keep the token as written.
            if word_lower in self.vocab:
                fixed.append(word)
                continue

            # score_cutoff lets rapidfuzz discard weak candidates early;
            # extractOne returns None when nothing reaches the cutoff.
            match = process.extractOne(
                word_lower,
                self.vocab,
                scorer=fuzz.ratio,
                score_cutoff=self.min_fuzzy,
            )
            if match is None:
                fixed.append(word)
                continue

            logger.debug(
                "Fuzzy corrected: %s -> %s (score: %s)", word, match[0], match[1]
            )
            fixed.append(match[0])

        return " ".join(fixed)

    def normalize(self, query: str) -> str:
        """Run the full pipeline: synonym expansion, then typo correction.

        Args:
            query: Raw user query.

        Returns:
            The normalized query string.
        """
        # Expand first so abbreviations are not "corrected" into unrelated
        # vocabulary words by the fuzzy step.
        expanded = self.expand_synonyms(query)
        corrected = self.fuzzy_fix_tokens(expanded)

        logger.info("Query normalized: '%s' -> '%s'", query, corrected)
        return corrected


def load_medical_synonyms() -> Dict[str, List[str]]:
    """Return the built-in synonym table.

    Each key is a canonical term; its value lists the abbreviations and
    alternative phrasings that map to it. A fresh dict is built on every call.
    """
    synonym_table: Dict[str, List[str]] = {
        # --- Conditions and findings ---
        "tracheoesophageal fistula": [
            "tef",
            "te fistula",
            "tracheo-esophageal fistula",
            "tracheo oesophageal fistula",
            "tracheo esophageal fistula",
            "esophagorespiratory fistula",
            "bronchoesophageal fistula",
            "tracheoesophageal fistulae",
            "t-e fistula",
        ],
        "benign": [
            "nonmalignant",
            "non-malignant",
            "acquired non-malignant",
            "non malignant",
            "nonneoplastic",
            "non-neoplastic",
        ],
        "malignant": [
            "neoplastic",
            "cancerous",
            "tumor-related",
            "cancer-related",
        ],
        # --- Devices and procedures ---
        "stent": [
            "airway stent",
            "tracheal stent",
            "esophageal stent",
            "self-expanding metallic stent",
            "sems",
            "covered stent",
        ],
        "endobronchial ultrasound": [
            "ebus",
            "ebus-tbna",
            "linear ebus",
            "radial ebus",
            "r-ebus",
        ],
        "transbronchial needle aspiration": [
            "tbna",
            "ebus-tbna",
            "eus-fna",
            "needle aspiration",
        ],
        "electromagnetic navigation bronchoscopy": [
            "enb",
            "em navigation",
            "navigational bronchoscopy",
        ],
        "bronchoscopic lung volume reduction": [
            "blvr",
            "lung volume reduction",
            "valve therapy",
        ],
        "chronic obstructive pulmonary disease": [
            "copd",
            "emphysema",
            "chronic bronchitis",
        ],
        "photodynamic therapy": [
            "pdt",
            "phototherapy",
            "light therapy",
        ],
        "argon plasma coagulation": [
            "apc",
            "argon coagulation",
            "plasma coagulation",
        ],
        "foreign body": [
            "fb",
            "aspirated object",
            "inhaled object",
        ],
        "massive hemoptysis": [
            "life-threatening hemoptysis",
            "major hemoptysis",
            "severe hemoptysis",
            "massive bleeding",
        ],
        # --- General clinical vocabulary ---
        "closure": [
            "occlusion",
            "sealing",
            "repair",
            "obliteration",
        ],
        "management": [
            "treatment",
            "therapy",
            "intervention",
            "approach",
        ],
        "complications": [
            "adverse events",
            "adverse effects",
            "side effects",
        ],
        "contraindications": [
            "contraindication",
            "absolute contraindication",
            "relative contraindication",
            "cautions",
        ],
        "fiducial": [
            "fiducial marker",
            "fiducials",
            "marker",
            "gold marker",
        ],
        "ablation": [
            "thermal ablation",
            "microwave ablation",
            "radiofrequency ablation",
            "rfa",
            "mwa",
            "cryoablation",
            "cryo",
        ],
    }
    return synonym_table


def load_medical_vocab() -> Set[str]:
    """Load the medical vocabulary used for fuzzy matching.

    Starts from a built-in set of lowercase terms and, when the optional
    lexicon file ``data/lexicon/medical_terms.txt`` (resolved against the
    current working directory) is a regular file, adds one lowercased term
    per non-blank line of it.

    Returns:
        The set of lowercase vocabulary terms.
    """
    # Built-in terms, available even when no lexicon file is present.
    base_vocab = {
        "tracheoesophageal", "fistula", "benign", "malignant", "stent",
        "bronchoscopy", "endobronchial", "ultrasound", "transbronchial",
        "aspiration", "biopsy", "ablation", "microwave", "radiofrequency",
        "cryotherapy", "photodynamic", "therapy", "argon", "plasma",
        "coagulation", "electromagnetic", "navigation", "fiducial",
        "marker", "hemoptysis", "pneumothorax", "emphysema", "copd",
        "asthma", "bronchiectasis", "stenosis", "stricture", "obstruction",
        "tumor", "carcinoma", "adenocarcinoma", "squamous", "metastasis",
        "lymph", "node", "mediastinal", "hilar", "peripheral", "central",
        "airway", "trachea", "bronchus", "bronchi", "esophagus", "lung",
        "pleura", "pleural", "effusion", "empyema", "thoracentesis",
        "pleurodesis", "chest", "tube", "drainage", "valve", "coil",
        "management", "treatment", "intervention", "procedure", "technique",
        "complication", "contraindication", "indication", "sedation",
        "anesthesia", "fluoroscopy", "computed", "tomography", "magnetic",
        "resonance", "imaging", "positron", "emission", "radiotherapy"
    }

    import pathlib
    vocab_file = pathlib.Path("data/lexicon/medical_terms.txt")

    # is_file() (rather than exists()) so a directory at this path is ignored
    # instead of raising on read.
    if not vocab_file.is_file():
        return base_vocab

    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    file_vocab = {
        line.strip().lower()
        for line in vocab_file.read_text(encoding="utf-8").splitlines()
        if line.strip()
    }
    return base_vocab | file_vocab


# Module-level cache for the shared normalizer; populated on first request.
_normalizer: Optional[QueryNormalizer] = None

def get_normalizer() -> QueryNormalizer:
    """Return the shared QueryNormalizer, building it on the first call.

    The instance is created from ``load_medical_vocab()`` and
    ``load_medical_synonyms()`` with a fuzzy threshold of 85 and then reused
    by every later call.
    """
    global _normalizer

    # Already built: hand back the cached instance.
    if _normalizer is not None:
        return _normalizer

    lexicon = load_medical_vocab()
    synonym_table = load_medical_synonyms()
    _normalizer = QueryNormalizer(
        vocab=lexicon,
        synonyms=synonym_table,
        min_fuzzy=85,
    )
    return _normalizer