import gradio as gr
import json
import tempfile
import pickle
import os
import cv2
import pandas as pd
import requests
import re
from symspellpy import SymSpell, Verbosity
from rapidocr import RapidOCR, EngineType, LangCls, LangDet, LangRec, ModelType, OCRVersion

# Constants
ANCHOR_PREFIXES = ["tab", "cap", "t."]

# Medical anchors (TAB/CAP/INJ/etc.)
ANCHORS = [
    r"tab\.?", r"cap\.?", r"inj\.?", r"syp\.?", r"syr\.?",
    r"sol\.?", r"susp\.?", r"oint\.?", r"crm\.?", r"gel\.?",
    r"drops?", r"powder", r"dragees?", r"t\.?", r"c\.?"
]
ANCHOR_PATTERN = re.compile(r"\b(" + "|".join(ANCHORS) + r")", re.IGNORECASE)

# Non-medical line patterns (to drop lines early)
NON_MED_PATTERNS = [
    r"emergency", r"contact", r"please",
    r"nephrologist", r"cardiologist",
    r"opinion", r"inform", r"kftafter", r"prescription",
    r"follow[- ]up", r"dr\.", r"physician", r"clinic",
    r"hospital", r"diagnosed", r"treatment", r"patient",
    r"age[: ]", r"sex[: ]", r"weight[: ]", r"height[: ]",
    r"bp[: ]", r"pulse[: ]", r"temperature[: ]",
    r"investigation", r"advised", r"admission", r"discharge",
    r"report", r"lab[: ]", r"laboratory", r"radiology",
    r"address", r"phone[: ]", r"mobile[: ]", r"email[: ]",
    r"signature", r"regd\.?", r"drugs? prescribed"
]
NON_MED_REGEX = re.compile("|".join(NON_MED_PATTERNS), re.IGNORECASE)

# Rescue list for drug-like English words
rescue_list = {"d3", "b12", "k2", "iron", "zinc", "calcium", "vit", "xl"}

def is_potential_med_line(text: str) -> bool:
    t = text.lower()
    non_med_match = NON_MED_REGEX.search(t)
    if non_med_match:
        return False
    anchor_match = ANCHOR_PATTERN.search(t)
    if not anchor_match:
        return False
    digit_match = re.search(r"\d", t)
    if not digit_match:
        return False
    return True

def validate_drug_match(term: str, drug_db, drug_token_index):
    """
    Map SymSpell term -> canonical database drug, or None if noise.
    """
    if term in drug_db:
        return term
    if term in drug_token_index:
        # pick one canonical name; you can change selection logic if needed
        return sorted(drug_token_index[term])[0]
    return None

def normalize_anchored_tokens(raw_text: str):
    """
    Use TAB/CAP/T. as anchors, not something to delete:
    - 'TABCLOPITAB75MG TAB'      -> ['clopitab']
    - 'TAB SOBISISTAB'           -> ['sobisistab']
    - 'TABSTARPRESSXL25MGTAB'    -> ['starpressxl']
    """
    t = raw_text.lower()
    # Remove dosage and numbers but keep anchor letters
    t = re.sub(r"\d+\s*(mg|ml|gm|%|u|mcg)", " ", t)
    t = re.sub(r"\d+", " ", t)
    tokens = t.split()

    normalized = []
    skip_next = False

    for i, tok in enumerate(tokens):
        if skip_next:
            skip_next = False
            continue

        base = tok

        # Case 1: token starts with anchor as prefix (no space)
        for pref in ANCHOR_PREFIXES:
            if base.startswith(pref) and len(base) > len(pref):
                base = base[len(pref):]
                break

        # Case 2: token is pure anchor and should attach to next token
        if base in ["tab", "cap", "t"]:
            if i + 1 < len(tokens):
                merged = tokens[i + 1]
                for pref in ANCHOR_PREFIXES:
                    if merged.startswith(pref) and len(merged) > len(pref):
                        merged = merged[len(pref):]
                        break
                base = merged
                skip_next = True
            else:
                continue

        base = base.strip()
        if len(base) >= 3:
            normalized.append(base)

    return normalized

def initialize_database():
    data_path = os.path.join(os.path.dirname(__file__), "data/Dataset.csv")
    df = pd.read_csv(data_path)
    drug_db = set(df["Combined_Drugs"].astype(str).str.lower().str.strip())
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    for drug in drug_db:
        d = drug.lower()
        sym_spell.create_dictionary_entry(d, 100000)
        parts = d.split()
        if len(parts) > 1:
            for p in parts:
                if len(p) > 3:
                    sym_spell.create_dictionary_entry(p, 100000)

    drug_token_index = {}
    for full in drug_db:
        toks = full.split()
        for tok in toks:
            if len(tok) < 3:
                continue
            drug_token_index.setdefault(tok, set()).add(full)

    # English filter
    try:
        url = (
            "https://raw.githubusercontent.com/first20hours/"
            "google-10000-english/master/google-10000-english-no-swears.txt"
        )
        response = requests.get(url, timeout=10)
        english_vocab = set(response.text.split())
    except Exception:
        english_vocab = {"the", "and", "tab", "cap", "mg", "ml"}

    return {
        'drug_db': drug_db,
        'sym_spell': sym_spell,
        'drug_token_index': drug_token_index,
        'english_vocab': english_vocab,
        'rescue_list': rescue_list,
        'NON_MED_REGEX': NON_MED_REGEX,
        'ANCHOR_PATTERN': ANCHOR_PATTERN,
        'ANCHOR_PREFIXES': ANCHOR_PREFIXES
    }

def process_image_ocr(image_path):
    # Load cached database
    cache_path = os.path.join(os.path.dirname(__file__), "cache/database_cache.pkl")
    try:
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
        drug_db = cache['drug_db']
        sym_spell = cache['sym_spell']
        drug_token_index = cache['drug_token_index']
        english_vocab = cache['english_vocab']
        rescue_list = cache['rescue_list']
    except FileNotFoundError:
        print("Error: database_cache.pkl not found. Initializing database...")
        cache = initialize_database()
        drug_db = cache['drug_db']
        sym_spell = cache['sym_spell']
        drug_token_index = cache['drug_token_index']
        english_vocab = cache['english_vocab']
        rescue_list = cache['rescue_list']

    # Load image using cv2
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not load image from {image_path}")

    # Create RapidOCR engine with default parameters
    ocr_engine = RapidOCR(
        params={
            "Global.max_side_len": 2000,
            "Det.engine_type": EngineType.ONNXRUNTIME,
            "Det.lang_type": LangDet.CH,
            "Det.model_type": ModelType.MOBILE,
            "Det.ocr_version": OCRVersion.PPOCRV4,
            "Cls.engine_type": EngineType.ONNXRUNTIME,
            "Cls.lang_type": LangCls.CH,
            "Cls.model_type": ModelType.MOBILE,
            "Cls.ocr_version": OCRVersion.PPOCRV4,
            "Rec.engine_type": EngineType.ONNXRUNTIME,
            "Rec.lang_type": LangRec.CH,
            "Rec.model_type": ModelType.MOBILE,
            "Rec.ocr_version": OCRVersion.PPOCRV4,
        }
    )

    # Run OCR
    ocr_result = ocr_engine(
        img,
        use_det=True,
        use_cls=True,
        use_rec=True,
        text_score=0.5,
        box_thresh=0.5,
        unclip_ratio=1.6,
        return_word_box=False,
    )

    ocr_data = ocr_result.txts

    found_meds_with_originals = {}

    for item in ocr_data:
        text_lower = item.lower()

        # Strong line-level gate
        if not is_potential_med_line(text_lower):
            continue

        # Skip doctor name lines
        if "dr." in text_lower or "dr " in text_lower:
            continue

        # Anchor-aware tokens
        candidate_tokens = normalize_anchored_tokens(item)

        # Optional SymSpell segmentation on normalized tokens
        if candidate_tokens:
            segmentation = sym_spell.word_segmentation(" ".join(candidate_tokens))
            corrected_string = segmentation.corrected_string
            candidate_tokens = corrected_string.split()

        for word in candidate_tokens:
            if len(word) < 3:
                continue

            if word in english_vocab and word not in rescue_list:
                continue

            # Check for exact match first to avoid false positives from SymSpell corrections
            canonical = validate_drug_match(word, drug_db, drug_token_index)
            if canonical:
                if canonical not in found_meds_with_originals:
                    found_meds_with_originals[canonical] = []
                if item not in found_meds_with_originals[canonical]:
                    found_meds_with_originals[canonical].append(item)
                continue  # Skip SymSpell since exact match found

            suggestions = sym_spell.lookup(
                word, Verbosity.CLOSEST, max_edit_distance=1
            )
            if not suggestions:
                continue

            cand = suggestions[0].term
            canonical = validate_drug_match(cand, drug_db, drug_token_index)
            if not canonical:
                continue  # reject noise that is not truly in drug_db

            if canonical not in found_meds_with_originals:
                found_meds_with_originals[canonical] = []
            if item not in found_meds_with_originals[canonical]:
                found_meds_with_originals[canonical].append(item)

    print("\nJSON Output:")
    print(json.dumps(found_meds_with_originals, indent=4))

    return found_meds_with_originals

def process_prescription(image):
    if image is None:
        return "No image uploaded."
    # Save PIL image to temp file
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
        image.save(tmp.name)
        result = process_image_ocr(tmp.name)
    return json.dumps(result, indent=4)

iface = gr.Interface(
    fn=process_prescription,
    inputs=gr.Image(type="pil", label="Upload Prescription Image"),
    outputs=gr.Textbox(label="Extracted Drugs", lines=20),
    title="MediBot - Drug Extraction from Prescriptions",
    description="Upload a prescription image to extract drug information."
)

if __name__ == "__main__":
    iface.launch()