import spaces  # Must be first for ZeroGPU decorator
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict
import re
import networkx as nx
from pyvis.network import Network
import tempfile
import html
import os

# Hugging Face Hub model identifier passed to from_pretrained() in get_model().
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

# Prompt sent to the model. The only placeholder is {sentence}, which
# run_mistral_extract() fills in with str.format(). The prompt asks for a
# "**Significance:**" paragraph followed by a "**Scientific relationships:**"
# section of "Subject | Relation | Object" lines; parse_mistral_output()
# splits the model's answer on that second header.
PROMPT_TEMPLATE = """
You are an expert scientific information extraction assistant.

Your task is to read the given scientific abstract carefully and produce two outputs:

1. **Significance:** Write 2–4 sentences summarizing the core contribution and importance of the described work, integrating contextual understanding and implicit meaning. Focus on novel findings, methods, mechanisms, and their impact.

2. **Scientific relationships:** Extract the key scientific facts as triples in the format:
[Subject] | [Relation] | [Object]

Guidelines for triples:
- Focus on relationships that reflect the main scientific findings, methodological innovations, or mechanisms.
- Only include material or system properties if they are critical to understanding the main findings or methods. Avoid trivial or generic property listings (e.g., “X | related_to | its property”) otherwise.
- Subjects and objects should be meaningful scientific entities, processes, or phenomena.
- Relations should express clear scientific interactions (e.g., based_on, enables, captures, compares_with, improves, applies_to, relevant_for).
- You may infer relationships that are strongly implied by the text — not just explicitly stated.
- Each triple should describe one distinct factual or conceptual relationship.
- Output only one triple per line, with no explanations.

Example abstract (for formatting guidance only):
"Magnetite is an important mineral with many interesting applications related to its magnetic, electrical and thermal properties. Typically studied by electronic structure calculations, these methods are unable to capture the complex ion dynamics at relevant temperatures, time and length scales. We present a hybrid Monte Carlo/Molecular Dynamics (MC/MD) method based on iron oxidation state exchange."

Example output:
**Significance:**
This study introduces a hybrid Monte Carlo/molecular dynamics method that enables accurate atomistic modeling of magnetite and related systems by capturing complex ionic dynamics beyond the reach of traditional electronic structure methods. The approach reproduces oxidation-state patterns consistent with density functional theory and reveals how lattice distortions stabilize excess charges and induce order–disorder transitions at critical surface thicknesses. This work advances understanding of oxidation-state ordering in inverse spinel structures and has implications for battery material design.

**Scientific relationships:**
Hybrid MC/MD method | enables | accurate atomistic modeling of magnetite
Hybrid MC/MD method | captures | complex ionic dynamics
Hybrid MC/MD method | validated_by | comparison with density functional theory
Lattice distortions | lead_to | stabilization of excess charges
Oxidation states | transition_at | critical surface thickness
Hybrid MC/MD method | applicable_to | bulk magnetite, magnetite surfaces, and nanoparticles
Oxidation-state ordering | relevant_for | inverse spinel structures and battery materials

Now process the following abstract:

Abstract:
{sentence}

Output:
"""

# Module-level cache populated by get_model() on its first call; later calls
# return these objects instead of loading the tokenizer and model again.
tokenizer = None  # tokenizer instance, set by get_model()
model = None  # causal language model instance, set by get_model()
device = None  # device string used for input tensors, set by get_model()

@spaces.GPU
def get_model():
    """Load the tokenizer and model on first use and return the cached objects.

    The tokenizer, model and device string are stored in module-level
    globals, so the (expensive) load only happens when one of them is still
    ``None``.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        form of the device the model was placed on (for example ``"cuda:0"``
        or ``"cpu"``), suitable for ``tensor.to(device)``.
    """
    global tokenizer, model, device
    if tokenizer is None or model is None or device is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            dtype=torch.bfloat16,       # bfloat16 weights (original note: for H200)
            device_map="auto",          # let Accelerate place the weights
            low_cpu_mem_usage=True,
        )
        # device_map="auto" has already placed the weights. Calling
        # model.to(...) afterwards is redundant and is rejected or warned
        # about for models dispatched across several devices, so we only
        # record where the model ended up and move the inputs there.
        device = str(model.device)
    return tokenizer, model, device


@spaces.GPU
def run_mistral_extract(sentence: str, max_tokens: int = 512):
    """Run the extraction prompt on ``sentence`` and return the generated text.

    Args:
        sentence: Scientific text (e.g. an abstract) inserted into
            ``PROMPT_TEMPLATE``.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        str: The decoded continuation only; the prompt tokens are sliced off
        before decoding and special tokens are skipped.
    """
    tokenizer, model, device = get_model()
    prompt = PROMPT_TEMPLATE.format(sentence=sentence)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Autocast must target the device type actually in use; a hard-coded
    # "cuda" is wrong on the CPU fallback path of get_model().
    device_type = torch.device(device).type

    with torch.inference_mode(), torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16):
        # Greedy decoding: do_sample=False already makes the output
        # deterministic. `temperature` only applies when sampling and
        # `early_stopping` only applies to beam search, so neither is passed.
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

def parse_mistral_output(output: str) -> Dict:
    """Split raw LLM output into a significance summary and a list of triples.

    The text before the "Scientific relationships" header is treated as the
    significance paragraph (with any "Significance:" label removed). The text
    after it is scanned line by line for ``Subject | Relation | Object``
    entries.

    The header is recognised in bold anywhere in the text (any letter case,
    colon inside or outside the bold markers) or un-bolded at the start of a
    line followed by a colon. If no header is present, the whole output is
    treated as the significance text and no triples are returned.

    Args:
        output: Raw text generated by the model; ``None`` or ``""`` is
            treated as empty output.

    Returns:
        Dict: ``{"significance": str, "triples": list[list[str]]}`` where
        every triple is ``[subject, relation, object]`` with surrounding
        whitespace removed.
    """
    sections = re.split(
        r"\*\*Scientific relationships:?\*\*:?|^[ \t]*Scientific relationships:",
        output or "",
        flags=re.IGNORECASE | re.MULTILINE,
    )

    significance = (
        sections[0]
        .replace("**Significance:**", "")
        .replace("Significance:", "")
        .strip()
    )

    triples = []
    # Without a header there is no second section; previously this raised
    # IndexError and crashed the request.
    if len(sections) > 1:
        for line in sections[1].splitlines():
            # Models often prefix entries with list markers ("- ", "* ", "1. ").
            line = re.sub(r"^\s*(?:[-*•]|\d+[.)])\s+", "", line)
            parts = [part.strip() for part in line.split("|")]
            # Require exactly three non-empty fields so blank subjects or
            # objects never become graph nodes.
            if len(parts) == 3 and all(parts):
                triples.append(parts)

    return {"significance": significance, "triples": triples}

def build_kg(triples: list) -> nx.DiGraph:
    """Build a directed knowledge graph from ``[subject, relation, object]`` triples.

    Each subject and object becomes a node, and each triple becomes an edge
    from subject to object whose ``label`` attribute holds the relation.

    A ``DiGraph`` holds at most one edge per ordered node pair, so adding a
    second triple for the same pair would overwrite the earlier label. To
    avoid silently dropping relations, distinct relations for the same pair
    are merged into one comma-separated label, in order of first appearance.
    Pairs with a single relation get exactly that relation as label.

    Args:
        triples: Iterable of 3-item sequences ``(subject, relation, object)``.

    Returns:
        nx.DiGraph: Graph with one labelled edge per distinct (subject,
        object) pair.
    """
    # Group relations by ordered node pair; dict insertion order keeps nodes
    # and edges in the order they first appear in `triples`.
    relations_by_pair = {}
    for subj, rel, obj in triples:
        relations = relations_by_pair.setdefault((subj, obj), [])
        if rel not in relations:
            relations.append(rel)

    G = nx.DiGraph()
    for (subj, obj), relations in relations_by_pair.items():
        # add_edge creates both endpoint nodes when they are missing.
        G.add_edge(subj, obj, label=", ".join(relations))
    return G

def extract_and_parse(sentence: str):
    """Return the extracted relationships of ``sentence`` as plain text.

    Runs the model, parses its output, builds the graph and renders every
    edge as ``"<source> -- <relation> --> <target>"``, one per line. When the
    graph has no edges the string ``"No edges found."`` is returned instead.
    """
    triples = parse_mistral_output(run_mistral_extract(sentence))["triples"]
    graph = build_kg(triples)

    if not graph.edges:
        return "No edges found."

    rendered = []
    for source, target, attrs in graph.edges(data=True):
        rendered.append(f"{source} -- {attrs['label']} --> {target}")
    return "\n".join(rendered)

# Returns iframe HTML. Per the original author's note, the HTML/JS portions
# below (vis-network options and click-highlighting script) were generated with ChatGPT.
def extract_and_visualize(sentence: str):
    """Extract triples from ``sentence`` and return HTML showing them as a graph.

    Steps: run the model, parse the triples, build a directed graph, render
    it with PyVis, inject a script that highlights a clicked node and its
    neighbours, and wrap the resulting page in an ``<iframe srcdoc=...>``.
    Nodes and edges outside the largest (weakly) connected component are
    drawn strongly dimmed.

    Args:
        sentence: Scientific text to analyse.

    Returns:
        str: An HTML snippet. This is normally the iframe; for blank input
        or when no triples could be extracted it is a short ``<p>`` message.
    """
    # Avoid a full model generation for blank input.
    if not sentence or not sentence.strip():
        return "<p>Please provide some text to analyze.</p>"

    # 1. Extract triples and build graph
    raw_output = run_mistral_extract(sentence)
    parsed = parse_mistral_output(raw_output)
    G = build_kg(parsed["triples"])

    # An empty graph has no connected components, so the max() below would
    # raise ValueError; report the situation to the user instead.
    if G.number_of_nodes() == 0:
        return "<p>No scientific relationships could be extracted from the provided text.</p>"

    # 2. Largest connected component
    largest_cc = set(max(nx.connected_components(G.to_undirected()), key=len))

    # 3. Build PyVis network
    net = Network(height="650px", width="100%", bgcolor="#ffffff", directed=True)
    net.toggle_physics(True)
    net.set_options("""
    var options = {
      "nodes": {"shape": "dot", "size": 20, "font": {"size": 18, "face": "arial", "color": "#000000"}, "borderWidth": 2},
      "edges": {"arrows": {"to": {"enabled": true, "scaleFactor": 0.8}}, "color": {"inherit": false}, "smooth": false, "font": {"size": 14, "align": "horizontal"}},
      "physics": {"forceAtlas2Based": {"gravitationalConstant": -50, "centralGravity": 0.01}, "minVelocity": 0.75, "solver": "forceAtlas2Based", "stabilization": {"fit": false}}}
    """)

    # 4. Add nodes; `main` is a custom attribute read back by the injected
    #    script to restore the default colouring.
    for node in G.nodes:
        is_main = node in largest_cc
        base_color = "#0077b6"  # dark blue
        color = base_color if is_main else "rgba(0,119,182,0.05)"  # strongly dimmed
        font_color = "#000000" if is_main else "rgba(0,0,0,0.05)"
        net.add_node(str(node), label=str(node), title=f"Entity: {node}", color=color, font={"size": 18, "color": font_color}, main=is_main)

    # 5. Add edges
    for u, v, d in G.edges(data=True):
        is_main = u in largest_cc and v in largest_cc
        edge_color = "rgba(2,62,138,0.6)" if is_main else "rgba(2,62,138,0.05)"  # light blue
        font_color = "rgba(0,0,0,0.6)" if is_main else "rgba(0,0,0,0.05)"
        net.add_edge(str(u), str(v), label=d.get("label",""), title=f"Relation: {d.get('label','')}", color=edge_color, font={"color": font_color}, main=is_main)

    # 6. Save HTML to a temporary file and read it back. The file is only a
    #    hand-off to PyVis, so it is removed afterwards instead of piling up
    #    in the temp directory on every request.
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".html", encoding="utf-8") as tmp:
        tmp_path = tmp.name
    try:
        # Our own handle is closed before PyVis writes to the path.
        net.save_graph(tmp_path)
        with open(tmp_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not fail the request.
            pass

    # 7. Inject JS for click highlighting
    js_script = """
    <script type="text/javascript">
    network.once("stabilizationIterationsDone", () => {
        const nodesData = network.body.data.nodes.get();
        const edgesData = network.body.data.edges.get();

        const nodeBaseColor = "#0077b6";            // dark blue
        const nodeDimColor = "rgba(0,119,182,0.05)";
        const edgeBaseColor = "rgba(2,62,138,0.6)"; // light blue
        const edgeDimColor = "rgba(2,62,138,0.05)";

        function resetColors() {
            nodesData.forEach(n => {
                const main = n.main;
                network.body.data.nodes.update({
                    id: n.id,
                    color: main ? nodeBaseColor : nodeDimColor,
                    font: { color: main ? "#000000" : "rgba(0,0,0,0.05)" }
                });
            });
            edgesData.forEach(e => {
                const main = e.main;
                network.body.data.edges.update({
                    id: e.id,
                    color: main ? edgeBaseColor : edgeDimColor,
                    font: { color: main ? "rgba(0,0,0,0.6)" : "rgba(0,0,0,0.05)" }
                });
            });
        }

        network.on("click", (params) => {
            if(params.nodes.length > 0){
                const nodeId = params.nodes[0];
                const connectedNodes = network.getConnectedNodes(nodeId);
                const connectedEdges = network.getConnectedEdges(nodeId);

                // Dim everything first
                nodesData.forEach(n => network.body.data.nodes.update({id:n.id, color:nodeDimColor, font:{color:"rgba(0,0,0,0.05)"}}));
                edgesData.forEach(e => network.body.data.edges.update({id:e.id, color:edgeDimColor, font:{color:"rgba(0,0,0,0.05)"}}));

                // Highlight clicked node
                network.body.data.nodes.update({id:nodeId, color: nodeBaseColor, font:{color:"#000000"}});

                // Highlight connected nodes
                connectedNodes.forEach(n => network.body.data.nodes.update({id:n, color: nodeBaseColor, font:{color:"#000000"}}));

                // Highlight connected edges (keep light blue)
                connectedEdges.forEach(e => network.body.data.edges.update({id:e, color: edgeBaseColor, font:{color:"rgba(0,0,0,0.6)"}}));
            } else {
                resetColors();
            }
        });

        resetColors(); // initial state
    });
    </script>
    """

    html_content = html_content.replace("</body>", js_script + "\n</body>")
    # html.escape() also escapes single quotes, so the page can sit safely
    # inside the single-quoted srcdoc attribute.
    return f"<iframe srcdoc='{html.escape(html_content)}' style='width:100%; height:650px; border:none; border-radius:12px;'></iframe>"


# Example abstract: inserted into the text box by the "Use Example Abstract"
# button and used by get_preload_output() to build the graph shown on load.
abstract = (
    "Understanding the atomic structure of magnetite–carboxylic acid interfaces is crucial "
    "for tailoring nanocomposites involving this interface. We present a Monte Carlo (MC)-based "
    "method utilizing iron oxidation state exchange to model magnetite interfaces with tens of "
    "thousands of atoms, scales typically inaccessible by electronic structure calculations. "
    "Charge neutrality is ensured through the oxidation of Fe ions. The MC approach allows "
    "magnetite to adapt to its environment at interfaces without requiring interface-specific "
    "rescaling of force-field parameters, enabling a simple and versatile method. By comparing "
    "adsorption sites, layer distances, and bond lengths with results from electronic structure "
    "calculations and experiments, we validated the accuracy of our approach. We found that the "
    "oxidation state distribution, and consequently the binding site preference, depend on "
    "coverage and surface thickness, with a critical thickness signaling the transition from "
    "layered to bulk-like oxidation states. This method ensures seamless compatibility with "
    "popular biomolecular force fields, providing transferability and simplifying the study of "
    "magnetite interfaces in general."
)
# On-disk cache for the rendered example graph. The directory is created at
# import time (no error if it already exists); the HTML file itself is
# written by get_preload_output() the first time it is needed.
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)
CACHE_PATH = os.path.join(CACHE_DIR, "kg_cache.html")

def get_preload_output():
    """Return the graph HTML for the example abstract, using the disk cache.

    If ``CACHE_PATH`` exists its content is returned as-is. Otherwise the
    graph is generated with ``extract_and_visualize(abstract)``, written to
    ``CACHE_PATH`` and returned.
    """
    if not os.path.exists(CACHE_PATH):
        # Cache miss: render once and persist the result for later runs.
        rendered = extract_and_visualize(abstract)
        with open(CACHE_PATH, "w", encoding="utf-8") as cache_file:
            cache_file.write(rendered)
        return rendered

    with open(CACHE_PATH, "r", encoding="utf-8") as cache_file:
        return cache_file.read()

# Resolved once at import time; preload() hands this same HTML to every page load.
preloaded_output = get_preload_output()

def preload(preloaded):
    """Return the initial UI values for the page-load event.

    The incoming ``preloaded`` state value is not consulted.

    Returns:
        tuple: ``(graph_html, textbox_value, preloaded_flag)`` — the cached
        example graph, an empty string for the text box, and ``True``.
    """
    graph_html = preloaded_output
    textbox_value = ""
    preloaded_flag = True
    return graph_html, textbox_value, preloaded_flag

# Interface: left column holds the description, text input and buttons; the
# right column shows the rendered knowledge graph.
with gr.Blocks() as demo:
    gr.Markdown("# Scientific Knowledge Graph Generator")
    with gr.Row():
        # Gradio expects integer scale values; 3:4 keeps the original
        # 0.9:1.2 width ratio between the two columns.
        with gr.Column(scale=3):
            gr.Markdown(
                "A lightweight app that generates scientific knowledge graphs.\n\n"
                "### How it works\n"
                "1. Extracts subject–relation–object (SRO) triples from scientific texts using a [large language model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3).\n"
                "2. Visualizes SRO triples as interactive knowledge graphs.\n"
            )
            input_text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Paste the scientific text here (e.g., abstract)."
            )

            # Clickable example
            example_button = gr.Button("Use Example Abstract (https://arxiv.org/abs/2510.18061)")

            btn = gr.Button("Submit", variant="primary")
            gr.Markdown("Note: Click a node to highlight it and its connections; other nodes/edges are dimmed.")

        with gr.Column(scale=4):
            output_html = gr.HTML(label="Knowledge Graph Visualization")

    preloaded = gr.State(value=False)

    # On page load: show the cached example graph, clear the text box and
    # flip the state flag.
    demo.load(fn=preload, inputs=preloaded, outputs=[output_html, input_text, preloaded])

    # When Submit is clicked
    btn.click(fn=extract_and_visualize, inputs=input_text, outputs=output_html)

    # When Example is clicked. The handler is registered with inputs=None, so
    # Gradio calls it without arguments; the parameter therefore needs a
    # default (a required positional parameter raised TypeError on click).
    def fill_example(_=None):
        """Return the example abstract to place in the text box."""
        return abstract
    example_button.click(fn=fill_example, inputs=None, outputs=input_text)

if __name__ == "__main__":
    demo.launch(share=False)