"""Query normalizer for medical terminology, typos, and abbreviations."""
import re
from dataclasses import dataclass
from typing import Dict, List, Set, Optional
from rapidfuzz import process, fuzz
import logging
logger = logging.getLogger(__name__)
WORD_RE = re.compile(r"[a-zA-Z\-]+")
@dataclass
class QueryNormalizer:
    """Normalize medical queries by fixing typos and expanding abbreviations."""

    vocab: Set[str]                  # domain lexicon
    synonyms: Dict[str, List[str]]   # canonical -> [synonyms]
    min_fuzzy: int = 85              # minimum fuzzy match score

    def expand_synonyms(self, query: str) -> str:
        """Expand abbreviations and synonyms to canonical forms."""
        q_low = " " + query.lower() + " "
        for canon, syns in self.synonyms.items():
            # Sort by length (longest first) to avoid partial replacements
            all_forms = sorted(set([canon] + syns), key=len, reverse=True)
            for form in all_forms:
                # Match with word boundaries
                pattern = rf"\b{re.escape(form)}\b"
                if re.search(pattern, q_low):
                    q_low = re.sub(pattern, canon, q_low)
        return q_low.strip()

    def fuzzy_fix_tokens(self, query: str) -> str:
        """Fix typos using fuzzy matching against medical vocabulary."""
        words = query.split()
        fixed = []
        for word in words:
            word_lower = word.lower()
            # Skip if already in vocab
            if word_lower in self.vocab:
                fixed.append(word)
                continue
            # Try fuzzy matching
            if self.vocab:  # Only if vocab is loaded
                match = process.extractOne(word_lower, self.vocab, scorer=fuzz.ratio)
                if match and match[1] >= self.min_fuzzy:
                    logger.debug(f"Fuzzy corrected: {word} -> {match[0]} (score: {match[1]})")
                    fixed.append(match[0])
                else:
                    fixed.append(word)
            else:
                fixed.append(word)
        return " ".join(fixed)

    def normalize(self, query: str) -> str:
        """Full normalization pipeline."""
        # First expand synonyms/abbreviations
        expanded = self.expand_synonyms(query)
        # Then fix typos
        corrected = self.fuzzy_fix_tokens(expanded)
        logger.info(f"Query normalized: '{query}' -> '{corrected}'")
        return corrected
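# Illustrative sketch (an assumption for documentation, not part of the
# original interface): with the default lexicons below, normalize() chains
# the two steps roughly like this; the exact output depends on the synonym
# table and min_fuzzy threshold in use.
#
#     normalizer = get_normalizer()
#     normalizer.normalize("managment of TEF")
#     # expand_synonyms:  "tef"       -> "tracheoesophageal fistula"
#     # fuzzy_fix_tokens: "managment" -> "management" (fuzz.ratio ~95 >= 85)
#     # result: "management of tracheoesophageal fistula"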
def load_medical_synonyms() -> Dict[str, List[str]]:
    """Load medical synonyms and abbreviations."""
    return {
        "tracheoesophageal fistula": [
            "tef", "te fistula", "tracheo-esophageal fistula",
            "tracheo oesophageal fistula", "tracheo esophageal fistula",
            "esophagorespiratory fistula", "bronchoesophageal fistula",
            "tracheoesophageal fistulae", "t-e fistula"
        ],
        "benign": [
            "nonmalignant", "non-malignant", "acquired non-malignant",
            "non malignant", "nonneoplastic", "non-neoplastic"
        ],
        "malignant": [
            "neoplastic", "cancerous", "tumor-related", "cancer-related"
        ],
        "stent": [
            "airway stent", "tracheal stent", "esophageal stent",
            "self-expanding metallic stent", "sems", "covered stent"
        ],
        "endobronchial ultrasound": [
            "ebus", "ebus-tbna", "linear ebus", "radial ebus", "r-ebus"
        ],
        "transbronchial needle aspiration": [
            "tbna", "ebus-tbna", "eus-fna", "needle aspiration"
        ],
        "electromagnetic navigation bronchoscopy": [
            "enb", "em navigation", "navigational bronchoscopy"
        ],
        "bronchoscopic lung volume reduction": [
            "blvr", "lung volume reduction", "valve therapy"
        ],
        "chronic obstructive pulmonary disease": [
            "copd", "emphysema", "chronic bronchitis"
        ],
        "photodynamic therapy": [
            "pdt", "phototherapy", "light therapy"
        ],
        "argon plasma coagulation": [
            "apc", "argon coagulation", "plasma coagulation"
        ],
        "foreign body": [
            "fb", "aspirated object", "inhaled object"
        ],
        "massive hemoptysis": [
            "life-threatening hemoptysis", "major hemoptysis",
            "severe hemoptysis", "massive bleeding"
        ],
        "closure": [
            "occlusion", "sealing", "repair", "obliteration"
        ],
        "management": [
            "treatment", "therapy", "intervention", "approach"
        ],
        "complications": [
            "adverse events", "adverse effects", "side effects"
        ],
        "contraindications": [
            "contraindication", "absolute contraindication",
            "relative contraindication", "cautions"
        ],
        "fiducial": [
            "fiducial marker", "fiducials", "marker", "gold marker"
        ],
        "ablation": [
            "thermal ablation", "microwave ablation", "radiofrequency ablation",
            "rfa", "mwa", "cryoablation", "cryo"
        ]
    }
def load_medical_vocab() -> Set[str]:
    """Load medical vocabulary for fuzzy matching."""
    # Start with common medical terms
    base_vocab = {
        "tracheoesophageal", "fistula", "benign", "malignant", "stent",
        "bronchoscopy", "endobronchial", "ultrasound", "transbronchial",
        "aspiration", "biopsy", "ablation", "microwave", "radiofrequency",
        "cryotherapy", "photodynamic", "therapy", "argon", "plasma",
        "coagulation", "electromagnetic", "navigation", "fiducial",
        "marker", "hemoptysis", "pneumothorax", "emphysema", "copd",
        "asthma", "bronchiectasis", "stenosis", "stricture", "obstruction",
        "tumor", "carcinoma", "adenocarcinoma", "squamous", "metastasis",
        "lymph", "node", "mediastinal", "hilar", "peripheral", "central",
        "airway", "trachea", "bronchus", "bronchi", "esophagus", "lung",
        "pleura", "pleural", "effusion", "empyema", "thoracentesis",
        "pleurodesis", "chest", "tube", "drainage", "valve", "coil",
        "management", "treatment", "intervention", "procedure", "technique",
        "complication", "contraindication", "indication", "sedation",
        "anesthesia", "fluoroscopy", "computed", "tomography", "magnetic",
        "resonance", "imaging", "positron", "emission", "radiotherapy"
    }
    # Could load from file if available
    import pathlib
    vocab_file = pathlib.Path("data/lexicon/medical_terms.txt")
    if vocab_file.exists():
        file_vocab = {line.strip().lower()
                      for line in vocab_file.read_text().splitlines()
                      if line.strip()}
        return base_vocab | file_vocab
    return base_vocab
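# Assumed format of data/lexicon/medical_terms.txt (inferred from the parsing
# above, not specified elsewhere in this module): one term per line, blank
# lines ignored, e.g.
#
#     bronchoscopy
#     cryobiopsy
#     pleuroscopy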
# Create a singleton normalizer
_normalizer: Optional[QueryNormalizer] = None


def get_normalizer() -> QueryNormalizer:
    """Get or create the singleton query normalizer."""
    global _normalizer
    if _normalizer is None:
        _normalizer = QueryNormalizer(
            vocab=load_medical_vocab(),
            synonyms=load_medical_synonyms(),
            min_fuzzy=85,
        )
    return _normalizer
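# Minimal usage sketch when run as a script (an illustrative addition; the
# sample queries and logging setup here are assumptions, not part of the
# module's original interface).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    normalizer = get_normalizer()
    for q in [
        "TEF closure with covered stent",
        "managment of copd exacerbation",
        "ebus-tbna of mediastinal nodes",
    ]:
        print(f"{q!r} -> {normalizer.normalize(q)!r}")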