IFMedTechdemo committed on
Commit
68a6157
·
verified ·
1 Parent(s): 7ef4bbb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +295 -0
app.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import tempfile
4
+ import pickle
5
+ import os
6
+ import cv2
7
+ import pandas as pd
8
+ import requests
9
+ import re
10
+ from symspellpy import SymSpell, Verbosity
11
+ from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion
12
+
13
+ # Constants
14
+ ANCHOR_PREFIXES = ["tab", "cap", "t."]
15
+
16
+ # Medical anchors (TAB/CAP/INJ/etc.)
17
+ ANCHORS = [
18
+ r"tab\.?", r"cap\.?", r"inj\.?", r"syp\.?", r"syr\.?",
19
+ r"sol\.?", r"susp\.?", r"oint\.?", r"crm\.?", r"gel\.?",
20
+ r"drops?", r"powder", r"dragees?", r"t\.?", r"c\.?"
21
+ ]
22
+ ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")", re.IGNORECASE)
23
+
24
+ # Non-medical line patterns (to drop lines early)
25
+ NON_MED_PATTERNS = [
26
+ r"emergency", r"contact", r"please",
27
+ r"nephrologist", r"cardiologist",
28
+ r"opinion", r"inform", r"kftafter", r"prescription",
29
+ r"follow[- ]up", r"dr\.", r"physician", r"clinic",
30
+ r"hospital", r"diagnosed", r"treatment", r"patient",
31
+ r"age[: ]", r"sex[: ]", r"weight[: ]", r"height[: ]",
32
+ r"bp[: ]", r"pulse[: ]", r"temperature[: ]",
33
+ r"investigation", r"advised", r"admission", r"discharge",
34
+ r"report", r"lab[: ]", r"laboratory", r"radiology",
35
+ r"address", r"phone[: ]", r"mobile[: ]", r"email[: ]",
36
+ r"signature", r"regd\.?", r"drugs? prescribed"
37
+ ]
38
+ NON_MED_REGEX = re.compile("|".join(NON_MED_PATTERNS), re.IGNORECASE)
39
+
40
+ # Rescue list for drug-like English words
41
+ rescue_list = {"d3", "b12", "k2", "iron", "zinc", "calcium", "vit", "xl"}
42
+
43
def is_potential_med_line(text: str) -> bool:
    """Return True when a raw OCR line plausibly names a medication.

    A line qualifies only if it (a) matches none of the administrative
    NON_MED_PATTERNS, (b) contains a dosage-form anchor (tab/cap/inj/...),
    and (c) contains at least one digit (dose/strength).
    """
    lowered = text.lower()
    # Administrative/clinical phrasing disqualifies the whole line.
    if NON_MED_REGEX.search(lowered):
        return False
    # Require both an anchor word and a digit somewhere on the line.
    return bool(ANCHOR_PATTERN.search(lowered)) and bool(re.search(r"\d", lowered))
55
+
56
def validate_drug_match(term: str, drug_db, drug_token_index):
    """Map a SymSpell-suggested term to a canonical database drug.

    Args:
        term: lowercase candidate token.
        drug_db: set of canonical (full) drug names.
        drug_token_index: dict mapping a single token -> set of canonical
            names that contain that token.

    Returns:
        The canonical drug name, or None if the term matches nothing
        (i.e. it is SymSpell noise).
    """
    # Exact full-name match wins outright.
    if term in drug_db:
        return term
    if term in drug_token_index:
        # Deterministically pick the lexicographically smallest canonical
        # name (min == sorted(...)[0], without sorting the whole set);
        # change the selection logic here if a different policy is needed.
        return min(drug_token_index[term])
    return None
66
+
67
def normalize_anchored_tokens(raw_text: str) -> list:
    """
    Use TAB/CAP/T. as anchors, not something to delete:
    - 'TABCLOPITAB75MG TAB' -> ['clopitab']
    - 'TAB SOBISISTAB' -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB' -> ['starpressxl']

    Returns a list of lowercase candidate drug-name tokens (length >= 3)
    with dosage amounts, numbers, and anchor prefixes stripped.
    """
    t = raw_text.lower()
    # Remove dosage and numbers but keep anchor letters
    t = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", t)
    t = re.sub(r"\d+", " ", t)
    tokens = t.split()

    normalized = []
    # When an anchor token attaches to the NEXT token (Case 2), that next
    # token has already been consumed and must be skipped on its own turn.
    skip_next = False

    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue

        base = tok

        # Case 1: token starts with anchor as prefix (no space)
        # e.g. 'tabclopitab' -> 'clopitab'. Only the first matching prefix
        # is stripped (break), and only when something remains after it.
        for pref in ANCHOR_PREFIXES:
            if base.startswith(pref) and len(base) > len(pref):
                base = base[len(pref):]
                break

        # Case 2: token is pure anchor and should attach to next token
        # e.g. 'tab sobisistab' -> take 'sobisistab' (also prefix-stripped).
        if base in ["tab", "cap", "t"]:
            if i + 1 < len(tokens):
                merged = tokens[i + 1]
                for pref in ANCHOR_PREFIXES:
                    if merged.startswith(pref) and len(merged) > len(pref):
                        merged = merged[len(pref):]
                        break
                base = merged
                skip_next = True
            else:
                # Trailing bare anchor with nothing after it: discard.
                continue

        base = base.strip()
        # Tokens shorter than 3 chars are too noisy to be drug candidates.
        if len(base) >= 3:
            normalized.append(base)

    return normalized
114
+
115
def initialize_database():
    """Build all drug-matching resources from data/Dataset.csv.

    Returns a dict containing:
        drug_db          -- set of canonical drug names (lowercased, stripped)
        sym_spell        -- SymSpell dictionary of full names plus long
                            (>3 char) sub-tokens of multi-word names
        drug_token_index -- token (>=3 chars) -> set of canonical names
                            containing that token
        english_vocab    -- common-English word set used to reject plain words
        rescue_list / NON_MED_REGEX / ANCHOR_PATTERN / ANCHOR_PREFIXES
                         -- module-level helpers bundled for cache consumers
    """
    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
    df = pd.read_csv(data_path)
    # Entries are lowercased/stripped here, so no further .lower() is needed.
    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    for drug in drug_db:
        sym_spell.create_dictionary_entry(drug, 100000)
        parts = drug.split()
        if len(parts) > 1:
            for p in parts:
                # NOTE(review): threshold here is > 3, while the token index
                # below uses >= 3 -- kept as-is; confirm whether 3-letter
                # tokens should also become SymSpell entries.
                if len(p) > 3:
                    sym_spell.create_dictionary_entry(p, 100000)

    # Inverted index: token -> every canonical name containing that token.
    drug_token_index = {}
    for full in drug_db:
        for tok in full.split():
            if len(tok) < 3:
                continue
            drug_token_index.setdefault(tok, set()).add(full)

    # English filter: word list used to discard ordinary English words.
    try:
        url = (
            "https://raw.githubusercontent.com/first20hours/"
            "google-10000-english/master/google-10000-english-no-swears.txt"
        )
        response = requests.get(url, timeout=10)
        # Fail into the fallback on HTTP errors; otherwise a 404/5xx error
        # page's text would silently become the "English vocabulary".
        response.raise_for_status()
        english_vocab = set(response.text.split())
    except Exception:
        # Offline or fetch failure: minimal stopword fallback.
        english_vocab = {"the", "and", "tab", "cap", "mg", "ml"}

    return {
        'drug_db': drug_db,
        'sym_spell': sym_spell,
        'drug_token_index': drug_token_index,
        'english_vocab': english_vocab,
        'rescue_list': rescue_list,
        'NON_MED_REGEX': NON_MED_REGEX,
        'ANCHOR_PATTERN': ANCHOR_PATTERN,
        'ANCHOR_PREFIXES': ANCHOR_PREFIXES
    }
159
+
160
def process_image_ocr(image_path):
    """Run OCR on a prescription image and extract canonical drug names.

    Args:
        image_path: path to an image file readable by cv2.

    Returns:
        Dict mapping canonical drug name -> list of the original OCR lines
        in which that drug was detected.

    Raises:
        ValueError: if cv2 cannot decode the image.
    """
    # Load the cached database; on a cache miss, rebuild it and persist the
    # result so later calls skip the expensive CSV + network initialization
    # (the original rebuilt it on every call once the pickle was missing).
    cache_dir = os.path.join(os.path.dirname(__file__), "cache")
    cache_path = os.path.join(cache_dir, "database_cache.pkl")
    try:
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    except FileNotFoundError:
        print("Error: database_cache.pkl not found. Initializing database...")
        cache = initialize_database()
        # Best-effort persistence: SymSpell pickling or disk permissions may
        # fail on some hosts, and that must not break the request.
        try:
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_path, 'wb') as f:
                pickle.dump(cache, f)
        except Exception:
            pass

    drug_db = cache['drug_db']
    sym_spell = cache['sym_spell']
    drug_token_index = cache['drug_token_index']
    english_vocab = cache['english_vocab']
    rescue = cache['rescue_list']

    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")

    # Create RapidOCR engine.
    # NOTE(review): the engine is rebuilt on every call; hoisting it to
    # module scope would speed up repeated requests -- left as-is here to
    # keep this function self-contained.
    ocr_engine = RapidOCR(
        params={
            "Global.max_side_len": 2000,
            "Det.engine_type": EngineType.ONNXRUNTIME,
            "Det.lang_type": LangDet.CH,
            "Det.model_type": ModelType.MOBILE,
            "Det.ocr_version": OCRVersion.PPOCRV4,
            "Cls.engine_type": EngineType.ONNXRUNTIME,
            "Cls.lang_type": LangCls.CH,
            "Cls.model_type": ModelType.MOBILE,
            "Cls.ocr_version": OCRVersion.PPOCRV4,
            "Rec.engine_type": EngineType.ONNXRUNTIME,
            "Rec.lang_type": LangRec.CH,
            "Rec.model_type": ModelType.MOBILE,
            "Rec.ocr_version": OCRVersion.PPOCRV4,
        }
    )

    # Run OCR (detection + orientation classification + recognition).
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.5,
        box_thresh=0.5,
        unclip_ratio=1.6,
        return_word_box=False,
    )

    ocr_data = ocr_result.txts

    found_meds_with_originals = {}

    def record(canonical, source_line):
        # Keep one de-duplicated list of source OCR lines per canonical drug.
        lines = found_meds_with_originals.setdefault(canonical, [])
        if source_line not in lines:
            lines.append(source_line)

    for item in ocr_data:
        text_lower = item.lower()

        # Strong line-level gate: must look like a medication line at all.
        if not is_potential_med_line(text_lower):
            continue

        # Skip doctor name lines
        if "dr." in text_lower or "dr " in text_lower:
            continue

        # Anchor-aware tokens (strips dosage and TAB/CAP/T. anchors).
        candidate_tokens = normalize_anchored_tokens(item)

        # Optional SymSpell segmentation to split run-together words.
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(" ".join(candidate_tokens))
            candidate_tokens = segmentation.corrected_string.split()

        for word in candidate_tokens:
            if len(word) < 3:
                continue

            # Ordinary English words are noise unless explicitly rescued.
            if word in english_vocab and word not in rescue:
                continue

            # Exact match first, to avoid false positives from SymSpell
            # "corrections" of words that are already valid drug names.
            canonical = validate_drug_match(word, drug_db, drug_token_index)
            if canonical:
                record(canonical, item)
                continue

            # Fuzzy fallback: closest dictionary term within edit distance 1.
            suggestions = sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=1
            )
            if not suggestions:
                continue

            canonical = validate_drug_match(
                suggestions[0].term, drug_db, drug_token_index
            )
            if not canonical:
                continue  # reject noise that is not truly in drug_db

            record(canonical, item)

    print("\nJSON Output:")
    print(json.dumps(found_meds_with_originals, indent=4))

    return found_meds_with_originals
276
+
277
def process_prescription(image):
    """Gradio handler: run the OCR pipeline on an uploaded PIL image.

    Args:
        image: PIL image from the Gradio input, or None.

    Returns:
        Pretty-printed JSON string of {canonical drug: [source OCR lines]},
        or a plain message when no image was uploaded.
    """
    if image is None:
        return "No image uploaded."
    # delete=False so the closed file can be re-opened by cv2 (required on
    # Windows); we remove it ourselves below -- the original leaked one temp
    # file per request.
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
        tmp_path = tmp.name
        image.save(tmp_path)
    try:
        result = process_image_ocr(tmp_path)
    finally:
        # Clean up even if OCR raises.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
    return json.dumps(result, indent=4)
285
+
286
# Gradio UI: one prescription image in, extracted-drug JSON text out.
iface = gr.Interface(
    fn=process_prescription,
    inputs=gr.Image(type="pil", label="Upload Prescription Image"),
    outputs=gr.Textbox(label="Extracted Drugs", lines=20),
    title="MediBot - Drug Extraction from Prescriptions",
    description="Upload a prescription image to extract drug information."
)

# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()