Spaces:

IFMedTechdemo
/

Digital_Prescription

Sleeping

App Files Files Community

IFMedTechdemo committed 18 days ago

Commit

c1b0ad4

verified ·

1 Parent(s): acb99cd

Update app.py

Browse files

Files changed (1) hide show

app.py +299 -180

app.py CHANGED Viewed

@@ -1,200 +1,314 @@
 import gradio as gr
-import pickle
-import os
 import cv2
-import pandas as pd
 import re
 from symspellpy import SymSpell, Verbosity
-from rapidocr import RapidOCR, EngineType, LangDet, LangRec, ModelType, OCRVersion
-import logging
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Constants - Match both separated and merged prefixes
-ANCHOR_PATTERN = re.compile(
-    r"\b(tab\.?|cap\.?|t\.?)\s*([a-zA-Z0-9]+)",
-    re.IGNORECASE
-)
 # ============================================================================
-# GLOBAL SINGLETONS
 # ============================================================================
-_ocr_engine = None
-_drug_db = None
-_sym_spell = None
-_cache_path = os.path.join(os.path.dirname(__file__), "cache","Final_Medibot_Database_Cleaned_pickleFile.pkl")#"Final_Tata_Kaggle_merged_pickleFile.pkl")# "database_cache.pkl")
-def ensure_cache_dir():
-    """Ensure cache directory exists."""
-    cache_dir = os.path.dirname(_cache_path)
-    if not os.path.exists(cache_dir):
-        os.makedirs(cache_dir, exist_ok=True)
 def initialize_database():
-    """
-    Load drug database and SymSpell once.
-    Uses cache if available to skip expensive recomputation.
-    """
-    global _drug_db, _sym_spell
-    ensure_cache_dir()
-    # Try to load from cache
-    if os.path.exists(_cache_path):
-        logger.info("Loading database from cache...")
-        try:
-            with open(_cache_path, 'rb') as f:
-                cache_data = pickle.load(f)
-                _drug_db = cache_data['drug_db']
-                _sym_spell = cache_data['sym_spell']
-            logger.info(f"Cache loaded: {len(_drug_db)} drugs")
-            return
-        except Exception as e:
-            logger.warning(f"Cache load failed: {e}. Recomputing...")
-    # Compute from scratch
-    logger.info("Initializing database...")
-    _drug_db = {}
     try:
-        df = pd.read_csv("Dataset.csv")
-        for idx, row in df.iterrows():
-            drug_name = str(row.get('drug_name', '')).strip().lower()
-            if drug_name:
-                _drug_db[drug_name] = True
-    except Exception as e:
-        logger.warning(f"Dataset loading failed: {e}. Using minimal DB.")
-        _drug_db = {"aspirin": True, "paracetamol": True}
-    # Initialize SymSpell with drug DB
-    _sym_spell = SymSpell(max_dictionary_edit_distance=1)
-    for drug in _drug_db:
-        _sym_spell.create_dictionary_entry(drug, 1000)
-    # Cache for next startup
-    try:
-        cache_data = {
-            'drug_db': _drug_db,
-            'sym_spell': _sym_spell
-        }
-        with open(_cache_path, 'wb') as f:
-            pickle.dump(cache_data, f)
-        logger.info(f"Database cached: {len(_drug_db)} drugs")
-    except Exception as e:
-        logger.warning(f"Cache save failed: {e}")
-def get_ocr_engine():
-    """Get or create the RapidOCR engine with MOBILE + ONNX optimization."""
-    global _ocr_engine
-    if _ocr_engine is None:
-        logger.info("Initializing RapidOCR engine with MOBILE models...")
-        _ocr_engine = RapidOCR(
-            params={
-                "Global.max_side_len": 1280,
-                "Det.engine_type": EngineType.ONNXRUNTIME,
-                "Det.lang_type": LangDet.CH,
-                "Det.model_type": ModelType.MOBILE,
-                "Det.ocr_version": OCRVersion.PPOCRV5,
-                "Rec.engine_type": EngineType.ONNXRUNTIME,
-                "Rec.lang_type": LangRec.CH,
-                "Rec.model_type": ModelType.MOBILE,
-                "Rec.ocr_version": OCRVersion.PPOCRV5,
-            }
-        )
-    return _ocr_engine
-def validate_drug_match(term: str) -> str:
-    """Map term to canonical database drug, or None if noise."""
-    term = term.lower()
-    if term in _drug_db:
-        return term
-    # Skip SymSpell for very short or long words
-    if len(term) < 3 or len(term) > 15:
-        return None
-    # Fuzzy match via SymSpell
-    suggestions = _sym_spell.lookup(term, Verbosity.CLOSEST, max_edit_distance=1)
-    if suggestions and suggestions[0].term in _drug_db:
-        return suggestions[0].term
     return None
-def extract_drugs_from_line(line_text: str):
     """
-    Extract drug names that follow or are merged with tab/cap/t prefixes.
     """
-    drugs = []
-    matches = ANCHOR_PATTERN.finditer(line_text)
-    for match in matches:
-        prefix = match.group(1).lower()
-        next_word = match.group(2)
-        canonical = validate_drug_match(next_word)
-        if canonical:
-            drugs.append(canonical)
-    return drugs
-def process_image_ocr(image):
-    """Fast OCR + drug extraction using MOBILE models and ONNX runtime."""
-    logger.info("Processing image...")
-    ocr_engine = get_ocr_engine()
-    # Preprocess: resize if too large
-    height, width = image.shape[:2]
-    if width > 1280:
-        scale = 1280 / width
-        image = cv2.resize(image, None, fx=scale, fy=scale)
-    # Run OCR with optimized settings
-    try:
-        ocr_result = ocr_engine(
-            image,
-            use_det=True,
-            use_cls=False,
-            use_rec=True,
-        )
-    except Exception as e:
-        logger.error(f"OCR failed: {e}")
-        return {"error": str(e)}
-    # Handle new RapidOCR return format
-    if not ocr_result or not hasattr(ocr_result, 'txts'):
-        return {"drugs": [], "raw_lines": []}
-    # Extract drugs
-    drugs_found = set()
-    raw_lines = []
-    for line_text in ocr_result.txts:
-        if not line_text:
             continue
-        raw_lines.append(line_text)
-        line_drugs = extract_drugs_from_line(line_text)
-        drugs_found.update(line_drugs)
-    return {
-        "drugs": sorted(list(drugs_found)),
-        "raw_lines": raw_lines,
-        "drugs_count": len(drugs_found),
-        "elapse": f"{ocr_result.elapse:.3f}s"
-    }
 def process_input(image_input):
     """Gradio interface handler."""
@@ -202,26 +316,39 @@ def process_input(image_input):
         return "Please upload an image.", {}
     try:
-        # Convert Gradio RGB to BGR for OpenCV
-        image = cv2.cvtColor(image_input, cv2.COLOR_RGB2BGR)
-        result = process_image_ocr(image)
-        if "error" in result:
-            return f"Error: {result['error']}", {}
         # Summary text
-        summary = f"Found {result['drugs_count']} medication(s) in {result.get('elapse', 'N/A')}"
         # JSON output with all medications
         medications_json = {
-            "total_medications": result["drugs_count"],
-            "processing_time": result.get("elapse", "N/A"),
             "medications": [
                 {
                     "id": idx + 1,
-                    "name": drug.title()
                 }
-                for idx, drug in enumerate(result["drugs"])
             ]
         }
@@ -230,15 +357,7 @@ def process_input(image_input):
         logger.error(f"Processing error: {e}")
         return f"Error: {str(e)}", {}
-# ============================================================================
-# Gradio Interface
-# ============================================================================
 logger.info("Starting Medibot...")
-initialize_database()
-logger.info("Database initialized. Ready for inference.")
 with gr.Blocks(title="Medibot - Fast OCR") as demo:
     gr.Markdown("# Medibot: Prescription OCR")

 import gradio as gr
 import cv2
+import time
+import logging
+import os
 import re
+import pickle
+import json
+import pandas as pd
 from symspellpy import SymSpell, Verbosity
+from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # ============================================================================
+# Database Initialization (from src/database/init.py)
 # ============================================================================
 def initialize_database():
+    # Assuming data/Dataset.csv is relative to the current script or fixed path
+    # Adjust path if necessary. app.py is in root, data is in ./data
+    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
     try:
+        df = pd.read_csv(data_path, encoding='utf-8')
+    except UnicodeDecodeError:
+        df = pd.read_csv(data_path, encoding='latin1')
+    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())
+    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
+    for drug in drug_db:
+        d = drug.lower()
+        sym_spell.create_dictionary_entry(d, 100000)
+        parts = d.split()
+        if len(parts) > 1:
+            for p in parts:
+                sym_spell.create_dictionary_entry(p, 100000)
+    drug_token_index = {}
+    for full in drug_db:
+        toks = full.split()
+        for tok in toks:
+            drug_token_index.setdefault(tok, set()).add(full)
+    ANCHOR_PREFIXES = ["tab", "cap"]
+    ANCHORS = [
+        r"tab\.?", r"cap\.?"
+    ]
+    ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")\b", re.IGNORECASE)
+    return {
+        'drug_db': drug_db,
+        'sym_spell': sym_spell,
+        'drug_token_index': drug_token_index,
+        'ANCHOR_PREFIXES': ANCHOR_PREFIXES,
+        'ANCHOR_PATTERN': ANCHOR_PATTERN
+    }
# Initialize Database Globally
# Load the drug database from the pickle cache when possible; otherwise
# rebuild it from the CSV and (best effort) write the cache for next startup.
logger.info("Initializing database...")
cache_path = os.path.join(os.path.dirname(__file__), "cache/database_cache.pkl")
_CACHE_KEYS = (
    'drug_db',
    'sym_spell',
    'drug_token_index',
    'ANCHOR_PREFIXES',
    'ANCHOR_PATTERN',
)
cache = None
try:
    # SECURITY: pickle.load can execute arbitrary code. The cache file must
    # only ever be one that this application wrote itself.
    with open(cache_path, 'rb') as f:
        cache = pickle.load(f)
    missing_keys = [key for key in _CACHE_KEYS if key not in cache]
    if missing_keys:
        # A cache written by an older version of the app is treated as stale.
        raise KeyError(f"cache is missing keys: {missing_keys}")
    logger.info("Database loaded from cache.")
except FileNotFoundError:
    logger.info("Cache not found. Initializing from CSV...")
    cache = None
except Exception as e:
    # Startup boundary: a corrupt, truncated or stale cache must not prevent
    # the app from starting, so fall back to rebuilding from the CSV.
    logger.warning(f"Cache load failed: {e}. Recomputing...")
    cache = None

if cache is None:
    cache = initialize_database()
    # Save cache; failing to write (e.g. read-only filesystem) is not fatal.
    try:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(cache, f)
        logger.info("Database initialized and cached.")
    except (OSError, pickle.PicklingError) as e:
        logger.warning(f"Cache save failed: {e}")

drug_db = cache['drug_db']
sym_spell = cache['sym_spell']
drug_token_index = cache['drug_token_index']
ANCHOR_PREFIXES = cache['ANCHOR_PREFIXES']
ANCHOR_PATTERN = cache['ANCHOR_PATTERN']
+# ============================================================================
+# Helper Functions (from src/utils/drug_matching.py)
+# ============================================================================
def is_potential_med_line(text: str, ANCHOR_PATTERN) -> bool:
    """Return True when *text* contains a dosage-form anchor.

    The line is lower-cased and searched with ``ANCHOR_PATTERN``; a match
    anywhere in the line qualifies it. This is deliberately the only gate
    applied at line level.

    Args:
        text: One line of OCR text.
        ANCHOR_PATTERN: Compiled regular expression that recognises an anchor.

    Returns:
        True if the pattern matches somewhere in the line, else False.
    """
    return ANCHOR_PATTERN.search(text.lower()) is not None
def validate_drug_match(term: str, drug_db, drug_token_index):
    """Map a candidate term to a canonical database drug name.

    Args:
        term: Candidate word or phrase (already normalised by the caller).
        drug_db: Container of canonical drug names, tested with ``in``.
        drug_token_index: Mapping from a single word to the collection of
            canonical names that contain that word.

    Returns:
        ``term`` itself when it is a canonical name; otherwise the
        alphabetically first canonical name indexed under ``term``; otherwise
        ``None`` (the term is treated as noise).
    """
    if term in drug_db:
        return term
    candidates = drug_token_index.get(term)
    if candidates:
        # Deterministic pick: the alphabetically first canonical name.
        # min() gives the same answer as sorted(...)[0] without a full sort,
        # and the truthiness guard avoids an IndexError on an empty entry.
        return min(candidates)
    return None
def normalize_anchored_tokens(raw_text: str, ANCHOR_PREFIXES):
    """Turn one OCR line into candidate drug tokens, using anchors as hints.

    TAB/CAP/T. are treated as anchors that introduce a drug name rather than
    as text to delete blindly:

    - 'TABCLOPITAB75MG TAB'      -> ['clopitab']
    - 'TAB SOBISISTAB'           -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB'    -> ['starpressxl']

    Steps: lower-case the line, blank out dosages (number + unit), remaining
    digits and punctuation, split on whitespace, strip one leading anchor
    prefix glued to a token, and drop tokens that are nothing but an anchor.

    Args:
        raw_text: One line of OCR text.
        ANCHOR_PREFIXES: Iterable of lower-case anchor prefixes, e.g.
            ``["tab", "cap"]``.

    Returns:
        List of normalised tokens in their original order (may be empty).
    """
    prefixes = tuple(ANCHOR_PREFIXES)
    # Stand-alone anchors. The built-in three are kept for backward
    # compatibility; any extra configured prefix is honoured as well.
    pure_anchors = {"tab", "cap", "t"}.union(prefixes)

    def strip_anchor(token):
        # Remove the first matching anchor glued to the front of a longer token.
        for pref in prefixes:
            if token.startswith(pref) and len(token) > len(pref):
                return token[len(pref):]
        return token

    text = raw_text.lower()
    # Remove dosage and numbers but keep anchor letters
    text = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", text)
    text = re.sub(r"\d+", " ", text)
    # Remove punctuation including full-width parentheses
    text = re.sub(r"[^\w\s]", " ", text)

    normalized = []
    for token in text.split():
        base = strip_anchor(token)
        # A bare anchor only announces the following token; it is never a
        # candidate itself. This also covers repeated anchors ("tab tab x")
        # and a trailing anchor at the end of the line.
        if base in pure_anchors:
            continue
        normalized.append(base)
    return normalized
+# ============================================================================
+# OCR Processor (from src/ocr/processor.py)
+# ============================================================================
# Shared OCR engine, created lazily so model loading happens only once.
_OCR_ENGINE = None


def _get_ocr_engine():
    """Return the shared RapidOCR engine, building it on first use."""
    global _OCR_ENGINE
    if _OCR_ENGINE is None:
        _OCR_ENGINE = RapidOCR(
            params={
                "Global.max_side_len": 2000,
                "Det.engine_type": EngineType.ONNXRUNTIME,
                "Det.lang_type": LangDet.EN,
                "Det.model_type": ModelType.MOBILE,
                "Det.ocr_version": OCRVersion.PPOCRV4,
                "Cls.engine_type": EngineType.ONNXRUNTIME,
                "Cls.lang_type": LangCls.CH,
                "Cls.model_type": ModelType.MOBILE,
                "Cls.ocr_version": OCRVersion.PPOCRV4,
                "Rec.engine_type": EngineType.ONNXRUNTIME,
                "Rec.lang_type": LangRec.EN,
                "Rec.model_type": ModelType.MOBILE,
                "Rec.ocr_version": OCRVersion.PPOCRV4,
            }
        )
    return _OCR_ENGINE


def _match_line_tokens(candidate_tokens, normalized_text_str):
    """Match tokens of one line against the drug database.

    Returns a list of ``(canonical, kind, detected)`` tuples where ``kind`` is
    ``"exact"`` when the detected text occurs verbatim in the pre-correction
    line text and ``"fuzzy"`` otherwise.
    """
    line_matches = []
    i = 0
    n = len(candidate_tokens)
    while i < n:
        # Greedy longest match: try phrases of length 5 down to 1
        matched_len = 0
        for length in range(min(5, n - i), 0, -1):
            phrase = " ".join(candidate_tokens[i:i + length])
            if phrase in drug_db:
                kind = "exact" if phrase in normalized_text_str else "fuzzy"
                line_matches.append((phrase, kind, phrase))
                matched_len = length
                break
        if matched_len:
            i += matched_len
            continue

        # Fallback: single-token processing (partial name, then fuzzy)
        word = candidate_tokens[i]
        i += 1

        canonical = validate_drug_match(word, drug_db, drug_token_index)
        if canonical:
            # Coverage check: the detected word must cover a significant
            # portion of the canonical name.
            if len(word) / len(canonical) >= 0.6:
                kind = "exact" if word in normalized_text_str else "fuzzy"
                line_matches.append((canonical, kind, word))
            continue

        # Very short words produce too many spurious fuzzy hits.
        if len(word) < 3:
            continue
        suggestions = sym_spell.lookup(
            word, Verbosity.CLOSEST, max_edit_distance=1
        )
        if not suggestions:
            continue
        canonical = validate_drug_match(
            suggestions[0].term, drug_db, drug_token_index
        )
        # Coverage check for fuzzy match too
        if canonical and len(word) / len(canonical) >= 0.6:
            line_matches.append((canonical, "fuzzy", word))
    return line_matches


def process_image_ocr(image_path):
    """Run OCR on an image file and extract the medications it mentions.

    Only lines that pass the anchor gate (``is_potential_med_line``) and do
    not look like a doctor-name line are considered. Each such line is
    normalised, optionally re-segmented with SymSpell, and matched against the
    drug database. When a line has any exact match, its fuzzy matches are
    discarded.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.

    Returns:
        dict mapping each canonical drug name to the list of distinct raw OCR
        lines in which it was found. Empty when nothing was recognised.

    Raises:
        ValueError: if the image cannot be loaded.
    """
    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")

    # Reuse one engine across calls; constructing it per request reloads the
    # models every time.
    ocr_engine = _get_ocr_engine()

    # Run OCR
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.3,
        box_thresh=0.3,
        unclip_ratio=2.0,
        return_word_box=False,
    )
    # The result may carry no text at all (txts missing or None) when nothing
    # is detected; treat that as "no lines" instead of crashing.
    ocr_data = getattr(ocr_result, "txts", None) or ()

    found_meds_with_originals = {}
    for item in ocr_data:
        text_lower = item.lower()
        # Simplified line-level gate: ONLY check anchors
        if not is_potential_med_line(text_lower, ANCHOR_PATTERN):
            continue
        # Skip doctor name lines
        if "dr." in text_lower or "dr " in text_lower:
            continue

        # Anchor-aware tokens
        candidate_tokens = normalize_anchored_tokens(item, ANCHOR_PREFIXES)
        # Keep the pre-correction text: a match is "exact" only if it appears
        # here, i.e. it was not produced by spelling correction.
        normalized_text_str = " ".join(candidate_tokens)

        # Optional SymSpell segmentation on normalized tokens
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(normalized_text_str)
            candidate_tokens = segmentation.corrected_string.split()

        line_matches = _match_line_tokens(candidate_tokens, normalized_text_str)

        # Prefer exact matches; fall back to everything only when none exist.
        exact_matches = [m for m in line_matches if m[1] == "exact"]
        final_matches = exact_matches if exact_matches else line_matches

        for canonical, _kind, _detected in final_matches:
            originals = found_meds_with_originals.setdefault(canonical, [])
            if item not in originals:
                originals.append(item)
    return found_meds_with_originals
+# ============================================================================
+# Gradio Interface
+# ============================================================================
 def process_input(image_input):
     """Gradio interface handler."""
         return "Please upload an image.", {}
     try:
+        temp_path = "temp_upload.jpg"
+        # Convert RGB (Gradio) to BGR (OpenCV)
+        image_bgr = cv2.cvtColor(image_input, cv2.COLOR_RGB2BGR)
+        cv2.imwrite(temp_path, image_bgr)
+        start_time = time.time()
+        # Use the robust processor
+        found_meds_dict = process_image_ocr(temp_path)
+        elapsed_time = time.time() - start_time
+        # Cleanup
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+        drugs_list = sorted(found_meds_dict.keys())
+        drugs_count = len(drugs_list)
         # Summary text
+        summary = f"Found {drugs_count} medication(s) in {elapsed_time:.3f}s"
         # JSON output with all medications
         medications_json = {
+            "total_medications": drugs_count,
+            "processing_time": f"{elapsed_time:.3f}s",
             "medications": [
                 {
                     "id": idx + 1,
+                    "name": drug.title(),
+                    "original_text": found_meds_dict[drug]
                 }
+                for idx, drug in enumerate(drugs_list)
             ]
         }
         logger.error(f"Processing error: {e}")
         return f"Error: {str(e)}", {}
 logger.info("Starting Medibot...")
 with gr.Blocks(title="Medibot - Fast OCR") as demo:
     gr.Markdown("# Medibot: Prescription OCR")