IFMedTechdemo committed on
Commit
68a6157
·
verified ·
1 Parent(s): 7ef4bbb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +295 -0
app.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import tempfile
4
+ import pickle
5
+ import os
6
+ import cv2
7
+ import pandas as pd
8
+ import requests
9
+ import re
10
+ from symspellpy import SymSpell, Verbosity
11
+ from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion
12
+
13
+ # Constants
14
+ ANCHOR_PREFIXES = ["tab", "cap", "t."]
15
+
16
+ # Medical anchors (TAB/CAP/INJ/etc.)
17
+ ANCHORS = [
18
+ r"tab\.?", r"cap\.?", r"inj\.?", r"syp\.?", r"syr\.?",
19
+ r"sol\.?", r"susp\.?", r"oint\.?", r"crm\.?", r"gel\.?",
20
+ r"drops?", r"powder", r"dragees?", r"t\.?", r"c\.?"
21
+ ]
22
+ ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")", re.IGNORECASE)
23
+
24
+ # Non-medical line patterns (to drop lines early)
25
+ NON_MED_PATTERNS = [
26
+ r"emergency", r"contact", r"please",
27
+ r"nephrologist", r"cardiologist",
28
+ r"opinion", r"inform", r"kftafter", r"prescription",
29
+ r"follow[- ]up", r"dr\.", r"physician", r"clinic",
30
+ r"hospital", r"diagnosed", r"treatment", r"patient",
31
+ r"age[: ]", r"sex[: ]", r"weight[: ]", r"height[: ]",
32
+ r"bp[: ]", r"pulse[: ]", r"temperature[: ]",
33
+ r"investigation", r"advised", r"admission", r"discharge",
34
+ r"report", r"lab[: ]", r"laboratory", r"radiology",
35
+ r"address", r"phone[: ]", r"mobile[: ]", r"email[: ]",
36
+ r"signature", r"regd\.?", r"drugs? prescribed"
37
+ ]
38
+ NON_MED_REGEX = re.compile("|".join(NON_MED_PATTERNS), re.IGNORECASE)
39
+
40
+ # Rescue list for drug-like English words
41
+ rescue_list = {"d3", "b12", "k2", "iron", "zinc", "calcium", "vit", "xl"}
42
+
43
def is_potential_med_line(text: str) -> bool:
    """Return True when a raw OCR line plausibly names a medication.

    A line qualifies only if it (a) matches none of the administrative
    NON_MED_PATTERNS, (b) contains a dosage-form anchor (tab/cap/inj/...),
    and (c) contains at least one digit (dose/strength).
    """
    lowered = text.lower()
    # Administrative/clinical phrasing disqualifies the whole line.
    if NON_MED_REGEX.search(lowered):
        return False
    # Require both an anchor word and a digit somewhere on the line.
    return bool(ANCHOR_PATTERN.search(lowered)) and bool(re.search(r"\d", lowered))
55
+
56
def validate_drug_match(term: str, drug_db, drug_token_index):
    """Map a SymSpell-suggested term to a canonical database drug.

    Args:
        term: lowercase candidate token.
        drug_db: set of canonical (full) drug names.
        drug_token_index: dict mapping a single token -> set of canonical
            names that contain that token.

    Returns:
        The canonical drug name, or None if the term matches nothing
        (i.e. it is SymSpell noise).
    """
    # Exact full-name match wins outright.
    if term in drug_db:
        return term
    if term in drug_token_index:
        # Deterministically pick the lexicographically smallest canonical
        # name (min == sorted(...)[0], without sorting the whole set);
        # change the selection logic here if a different policy is needed.
        return min(drug_token_index[term])
    return None
66
+
67
def normalize_anchored_tokens(raw_text: str) -> list:
    """
    Use TAB/CAP/T. as anchors, not something to delete:
    - 'TABCLOPITAB75MG TAB' -> ['clopitab']
    - 'TAB SOBISISTAB' -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB' -> ['starpressxl']

    Returns a list of lowercase candidate drug-name tokens (length >= 3)
    with dosage amounts, numbers, and anchor prefixes stripped.
    """
    t = raw_text.lower()
    # Remove dosage and numbers but keep anchor letters
    t = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", t)
    t = re.sub(r"\d+", " ", t)
    tokens = t.split()

    normalized = []
    # When an anchor token attaches to the NEXT token (Case 2), that next
    # token has already been consumed and must be skipped on its own turn.
    skip_next = False

    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue

        base = tok

        # Case 1: token starts with anchor as prefix (no space)
        # e.g. 'tabclopitab' -> 'clopitab'. Only the first matching prefix
        # is stripped (break), and only when something remains after it.
        for pref in ANCHOR_PREFIXES:
            if base.startswith(pref) and len(base) > len(pref):
                base = base[len(pref):]
                break

        # Case 2: token is pure anchor and should attach to next token
        # e.g. 'tab sobisistab' -> take 'sobisistab' (also prefix-stripped).
        if base in ["tab", "cap", "t"]:
            if i + 1 < len(tokens):
                merged = tokens[i + 1]
                for pref in ANCHOR_PREFIXES:
                    if merged.startswith(pref) and len(merged) > len(pref):
                        merged = merged[len(pref):]
                        break
                base = merged
                skip_next = True
            else:
                # Trailing bare anchor with nothing after it: discard.
                continue

        base = base.strip()
        # Tokens shorter than 3 chars are too noisy to be drug candidates.
        if len(base) >= 3:
            normalized.append(base)

    return normalized
114
+
115
def initialize_database():
    """Build all drug-matching resources from data/Dataset.csv.

    Returns a dict containing:
        drug_db          -- set of canonical drug names (lowercased, stripped)
        sym_spell        -- SymSpell dictionary of full names plus long
                            (>3 char) sub-tokens of multi-word names
        drug_token_index -- token (>=3 chars) -> set of canonical names
                            containing that token
        english_vocab    -- common-English word set used to reject plain words
        rescue_list / NON_MED_REGEX / ANCHOR_PATTERN / ANCHOR_PREFIXES
                         -- module-level helpers bundled for cache consumers
    """
    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
    df = pd.read_csv(data_path)
    # Entries are lowercased/stripped here, so no further .lower() is needed.
    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    for drug in drug_db:
        sym_spell.create_dictionary_entry(drug, 100000)
        parts = drug.split()
        if len(parts) > 1:
            for p in parts:
                # NOTE(review): threshold here is > 3, while the token index
                # below uses >= 3 -- kept as-is; confirm whether 3-letter
                # tokens should also become SymSpell entries.
                if len(p) > 3:
                    sym_spell.create_dictionary_entry(p, 100000)

    # Inverted index: token -> every canonical name containing that token.
    drug_token_index = {}
    for full in drug_db:
        for tok in full.split():
            if len(tok) < 3:
                continue
            drug_token_index.setdefault(tok, set()).add(full)

    # English filter: word list used to discard ordinary English words.
    try:
        url = (
            "https://raw.githubusercontent.com/first20hours/"
            "google-10000-english/master/google-10000-english-no-swears.txt"
        )
        response = requests.get(url, timeout=10)
        # Fail into the fallback on HTTP errors; otherwise a 404/5xx error
        # page's text would silently become the "English vocabulary".
        response.raise_for_status()
        english_vocab = set(response.text.split())
    except Exception:
        # Offline or fetch failure: minimal stopword fallback.
        english_vocab = {"the", "and", "tab", "cap", "mg", "ml"}

    return {
        'drug_db': drug_db,
        'sym_spell': sym_spell,
        'drug_token_index': drug_token_index,
        'english_vocab': english_vocab,
        'rescue_list': rescue_list,
        'NON_MED_REGEX': NON_MED_REGEX,
        'ANCHOR_PATTERN': ANCHOR_PATTERN,
        'ANCHOR_PREFIXES': ANCHOR_PREFIXES
    }
159
+
160
def process_image_ocr(image_path):
    """Run OCR on a prescription image and extract canonical drug names.

    Args:
        image_path: path to an image file readable by cv2.

    Returns:
        Dict mapping canonical drug name -> list of the original OCR lines
        in which that drug was detected.

    Raises:
        ValueError: if cv2 cannot decode the image.
    """
    # Load the cached database; on a cache miss, rebuild it and persist the
    # result so later calls skip the expensive CSV + network initialization
    # (the original rebuilt it on every call once the pickle was missing).
    cache_dir = os.path.join(os.path.dirname(__file__), "cache")
    cache_path = os.path.join(cache_dir, "database_cache.pkl")
    try:
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    except FileNotFoundError:
        print("Error: database_cache.pkl not found. Initializing database...")
        cache = initialize_database()
        # Best-effort persistence: SymSpell pickling or disk permissions may
        # fail on some hosts, and that must not break the request.
        try:
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_path, 'wb') as f:
                pickle.dump(cache, f)
        except Exception:
            pass

    drug_db = cache['drug_db']
    sym_spell = cache['sym_spell']
    drug_token_index = cache['drug_token_index']
    english_vocab = cache['english_vocab']
    rescue = cache['rescue_list']

    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")

    # Create RapidOCR engine.
    # NOTE(review): the engine is rebuilt on every call; hoisting it to
    # module scope would speed up repeated requests -- left as-is here to
    # keep this function self-contained.
    ocr_engine = RapidOCR(
        params={
            "Global.max_side_len": 2000,
            "Det.engine_type": EngineType.ONNXRUNTIME,
            "Det.lang_type": LangDet.CH,
            "Det.model_type": ModelType.MOBILE,
            "Det.ocr_version": OCRVersion.PPOCRV4,
            "Cls.engine_type": EngineType.ONNXRUNTIME,
            "Cls.lang_type": LangCls.CH,
            "Cls.model_type": ModelType.MOBILE,
            "Cls.ocr_version": OCRVersion.PPOCRV4,
            "Rec.engine_type": EngineType.ONNXRUNTIME,
            "Rec.lang_type": LangRec.CH,
            "Rec.model_type": ModelType.MOBILE,
            "Rec.ocr_version": OCRVersion.PPOCRV4,
        }
    )

    # Run OCR (detection + orientation classification + recognition).
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.5,
        box_thresh=0.5,
        unclip_ratio=1.6,
        return_word_box=False,
    )

    ocr_data = ocr_result.txts

    found_meds_with_originals = {}

    def record(canonical, source_line):
        # Keep one de-duplicated list of source OCR lines per canonical drug.
        lines = found_meds_with_originals.setdefault(canonical, [])
        if source_line not in lines:
            lines.append(source_line)

    for item in ocr_data:
        text_lower = item.lower()

        # Strong line-level gate: must look like a medication line at all.
        if not is_potential_med_line(text_lower):
            continue

        # Skip doctor name lines
        if "dr." in text_lower or "dr " in text_lower:
            continue

        # Anchor-aware tokens (strips dosage and TAB/CAP/T. anchors).
        candidate_tokens = normalize_anchored_tokens(item)

        # Optional SymSpell segmentation to split run-together words.
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(" ".join(candidate_tokens))
            candidate_tokens = segmentation.corrected_string.split()

        for word in candidate_tokens:
            if len(word) < 3:
                continue

            # Ordinary English words are noise unless explicitly rescued.
            if word in english_vocab and word not in rescue:
                continue

            # Exact match first, to avoid false positives from SymSpell
            # "corrections" of words that are already valid drug names.
            canonical = validate_drug_match(word, drug_db, drug_token_index)
            if canonical:
                record(canonical, item)
                continue

            # Fuzzy fallback: closest dictionary term within edit distance 1.
            suggestions = sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=1
            )
            if not suggestions:
                continue

            canonical = validate_drug_match(
                suggestions[0].term, drug_db, drug_token_index
            )
            if not canonical:
                continue  # reject noise that is not truly in drug_db

            record(canonical, item)

    print("\nJSON Output:")
    print(json.dumps(found_meds_with_originals, indent=4))

    return found_meds_with_originals
276
+
277
def process_prescription(image):
    """Gradio handler: run the OCR pipeline on an uploaded PIL image.

    Args:
        image: PIL image from the Gradio input, or None.

    Returns:
        Pretty-printed JSON string of {canonical drug: [source OCR lines]},
        or a plain message when no image was uploaded.
    """
    if image is None:
        return "No image uploaded."
    # delete=False so the closed file can be re-opened by cv2 (required on
    # Windows); we remove it ourselves below -- the original leaked one temp
    # file per request.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
        tmp_path = tmp.name
        image.save(tmp_path)
    try:
        result = process_image_ocr(tmp_path)
    finally:
        # Clean up even if OCR raises.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
    return json.dumps(result, indent=4)
285
+
286
# Gradio UI: one prescription image in, extracted-drug JSON text out.
iface = gr.Interface(
    fn=process_prescription,
    inputs=gr.Image(type="pil", label="Upload Prescription Image"),
    outputs=gr.Textbox(label="Extracted Drugs", lines=20),
    title="MediBot - Drug Extraction from Prescriptions",
    description="Upload a prescription image to extract drug information."
)

# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()