"""Gradio app that turns scientific text into an interactive knowledge graph.

A Mistral instruct model extracts ``subject | relation | object`` triples from
the input text; the triples are assembled into a networkx graph and rendered
with PyVis inside an iframe.
"""

import spaces  # Must be first for ZeroGPU decorator
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict
import re
import networkx as nx
from pyvis.network import Network
import tempfile
import html
import os

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"

PROMPT_TEMPLATE = """
You are an expert scientific information extraction assistant.
Your task is to read the given scientific abstract carefully and produce two outputs:

1. **Significance:** Write 2–4 sentences summarizing the core contribution and importance of the described work, integrating contextual understanding and implicit meaning. Focus on novel findings, methods, mechanisms, and their impact.

2. **Scientific relationships:** Extract the key scientific facts as triples in the format:
[Subject] | [Relation] | [Object]

Guidelines for triples:
- Focus on relationships that reflect the main scientific findings, methodological innovations, or mechanisms.
- Only include material or system properties if they are critical to understanding the main findings or methods. Avoid trivial or generic property listings (e.g., “X | related_to | its property”) otherwise.
- Subjects and objects should be meaningful scientific entities, processes, or phenomena.
- Relations should express clear scientific interactions (e.g., based_on, enables, captures, compares_with, improves, applies_to, relevant_for).
- You may infer relationships that are strongly implied by the text — not just explicitly stated.
- Each triple should describe one distinct factual or conceptual relationship.
- Output only one triple per line, with no explanations.

Example abstract (for formatting guidance only):
"Magnetite is an important mineral with many interesting applications related to its magnetic, electrical and thermal properties. Typically studied by electronic structure calculations, these methods are unable to capture the complex ion dynamics at relevant temperatures, time and length scales. We present a hybrid Monte Carlo/Molecular Dynamics (MC/MD) method based on iron oxidation state exchange."

Example output:
**Significance:**
This study introduces a hybrid Monte Carlo/molecular dynamics method that enables accurate atomistic modeling of magnetite and related systems by capturing complex ionic dynamics beyond the reach of traditional electronic structure methods. The approach reproduces oxidation-state patterns consistent with density functional theory and reveals how lattice distortions stabilize excess charges and induce order–disorder transitions at critical surface thicknesses. This work advances understanding of oxidation-state ordering in inverse spinel structures and has implications for battery material design.

**Scientific relationships:**
Hybrid MC/MD method | enables | accurate atomistic modeling of magnetite
Hybrid MC/MD method | captures | complex ionic dynamics
Hybrid MC/MD method | validated_by | comparison with density functional theory
Lattice distortions | lead_to | stabilization of excess charges
Oxidation states | transition_at | critical surface thickness
Hybrid MC/MD method | applicable_to | bulk magnetite, magnetite surfaces, and nanoparticles
Oxidation-state ordering | relevant_for | inverse spinel structures and battery materials

Now process the following abstract:
Abstract: {sentence}

Output:
"""

# Header that separates the significance paragraph from the triples in the
# model output (the same marker the prompt asks the model to emit).
_RELATIONSHIPS_HEADER = "**Scientific relationships:**"

# Shown instead of a graph when there is nothing to draw.
_NO_GRAPH_HTML = "<p>No relationships could be extracted from the text.</p>"

# Colours shared by the initial rendering (largest component vs. the rest).
_NODE_COLOR = "#0077b6"  # dark blue
_NODE_COLOR_DIM = "rgba(0,119,182,0.05)"  # strongly dimmed
_EDGE_COLOR = "rgba(2,62,138,0.6)"  # light blue
_EDGE_COLOR_DIM = "rgba(2,62,138,0.05)"

_VIS_OPTIONS = """
var options = {
  "nodes": {"shape": "dot", "size": 20, "font": {"size": 18, "face": "arial", "color": "#000000"}, "borderWidth": 2},
  "edges": {"arrows": {"to": {"enabled": true, "scaleFactor": 0.8}}, "color": {"inherit": false}, "smooth": false, "font": {"size": 14, "align": "horizontal"}},
  "physics": {"forceAtlas2Based": {"gravitationalConstant": -50, "centralGravity": 0.01}, "minVelocity": 0.75, "solver": "forceAtlas2Based", "stabilization": {"fit": false}}}
"""

# Click-to-highlight behaviour injected into the PyVis page.
# NOTE(review): assumes the generated page exposes `network`, `nodes` and
# `edges` as page-level variables; the script does nothing when they are absent.
_HIGHLIGHT_JS = """
<script type="text/javascript">
(function () {
  if (typeof network === "undefined" || typeof nodes === "undefined" || typeof edges === "undefined") {
    return;
  }
  // Deep snapshots so the initial styling can be restored later.
  var baseNodes = JSON.parse(JSON.stringify(nodes.get()));
  var baseEdges = JSON.parse(JSON.stringify(edges.get()));

  network.on("click", function (params) {
    if (!params.nodes.length) {
      // Click on empty canvas (or an edge): restore the initial styling.
      nodes.update(baseNodes);
      edges.update(baseEdges);
      return;
    }
    var selected = params.nodes[0];
    var keep = {};
    keep[selected] = true;
    network.getConnectedNodes(selected).forEach(function (id) { keep[id] = true; });

    nodes.update(baseNodes.map(function (n) {
      var on = keep[n.id] === true;
      return {
        id: n.id,
        color: on ? "#0077b6" : "rgba(0,119,182,0.05)",
        font: {size: 18, color: on ? "#000000" : "rgba(0,0,0,0.05)"}
      };
    }));
    edges.update(baseEdges.map(function (e) {
      var on = e.from === selected || e.to === selected;
      return {
        id: e.id,
        color: on ? "rgba(2,62,138,0.6)" : "rgba(2,62,138,0.05)",
        font: {color: on ? "rgba(0,0,0,0.6)" : "rgba(0,0,0,0.05)"}
      };
    }));
  });
})();
</script>
"""

# Global variables to store model once
tokenizer = None
model = None
device = None


@spaces.GPU
def get_model():
    """Load the tokenizer and model on first use and return them with the device.

    Returns:
        Tuple ``(tokenizer, model, device)`` where ``device`` is ``"cuda"``
        when CUDA is available and ``"cpu"`` otherwise.
    """
    global tokenizer, model, device
    if tokenizer is None or model is None or device is None:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            dtype=torch.bfloat16,  # for H200
            device_map="auto",  # automatically map to GPU
            low_cpu_mem_usage=True,
        )
        # device_map="auto" has already placed the weights, so the model is
        # not moved again with .to(); `device` is only used for the inputs.
        device = "cuda" if torch.cuda.is_available() else "cpu"
    return tokenizer, model, device


@spaces.GPU
def run_mistral_extract(sentence: str, max_tokens: int = 512):
    """Run the extraction prompt on *sentence* and return the generated text.

    Args:
        sentence: Scientific text (e.g. an abstract) inserted into the prompt.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation only (the prompt tokens are cut off).
    """
    tokenizer, model, device = get_model()
    prompt = PROMPT_TEMPLATE.format(sentence=sentence)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Autocast targets CUDA; disable it explicitly on the CPU fallback.
    autocast = torch.amp.autocast(
        device_type="cuda", dtype=torch.bfloat16, enabled=(device == "cuda")
    )
    with torch.inference_mode(), autocast:
        # Greedy decoding: sampling-only (temperature) and beam-search-only
        # (early_stopping) flags are intentionally not passed.
        generated = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)


def parse_mistral_output(output: str) -> Dict:
    """Extract significance as string and triples as list of lists from LLM output.

    Args:
        output: Raw text produced by the model.

    Returns:
        ``{"significance": str, "triples": [[subject, relation, object], ...]}``.
        When the relationships header is missing, the whole output is treated
        as the significance text and ``triples`` is empty.
    """
    significance_raw, _, sro_raw = output.partition(_RELATIONSHIPS_HEADER)
    significance = (
        significance_raw.replace("**Significance:**", "")
        .replace("Significance:", "")
        .strip()
    )

    triples: List[List[str]] = []
    for line in sro_raw.splitlines():
        parts = [part.strip() for part in line.split("|")]
        # Keep only well-formed lines: exactly three non-empty fields.
        if len(parts) == 3 and all(parts):
            triples.append(parts)
    return {"significance": significance, "triples": triples}


def build_kg(triples: list) -> nx.DiGraph:
    """Build a directed graph with one labelled edge per triple.

    Args:
        triples: Iterable of ``(subject, relation, object)`` sequences.

    Returns:
        A ``nx.DiGraph`` whose edges carry the relation in the ``label``
        attribute. A later triple with the same subject and object replaces
        the label of the earlier one.
    """
    G = nx.DiGraph()
    for subj, rel, obj in triples:
        G.add_edge(subj, obj, label=rel)  # adds missing endpoints as nodes
    return G


def extract_and_parse(sentence: str):
    """Return the extracted edges of *sentence* as plain text, one per line."""
    raw_output = run_mistral_extract(sentence)
    parsed = parse_mistral_output(raw_output)
    G = build_kg(parsed["triples"])

    # Format edges with labels
    if not G.edges:
        return "No edges found."
    return "\n".join(f"{u} -- {d['label']} --> {v}" for u, v, d in G.edges(data=True))


def extract_and_visualize(sentence: str):
    """Extract triples from *sentence* and return the graph as iframe HTML.

    Nodes and edges outside the largest connected component are drawn
    strongly dimmed. Returns a short HTML message instead of a graph when the
    input is blank or no triples could be extracted.
    """
    if not sentence or not sentence.strip():
        return _NO_GRAPH_HTML

    # 1. Extract triples and build graph
    raw_output = run_mistral_extract(sentence)
    parsed = parse_mistral_output(raw_output)
    G = build_kg(parsed["triples"])
    if G.number_of_nodes() == 0:
        # max() over zero components would raise ValueError.
        return _NO_GRAPH_HTML

    # 2. Largest connected component
    largest_cc = set(max(nx.connected_components(G.to_undirected()), key=len))

    # 3. Build PyVis network
    net = Network(height="650px", width="100%", bgcolor="#ffffff", directed=True)
    net.toggle_physics(True)
    net.set_options(_VIS_OPTIONS)

    # 4. Add nodes
    for node in G.nodes:
        is_main = node in largest_cc
        net.add_node(
            str(node),
            label=str(node),
            title=f"Entity: {node}",
            color=_NODE_COLOR if is_main else _NODE_COLOR_DIM,
            font={"size": 18, "color": "#000000" if is_main else "rgba(0,0,0,0.05)"},
            main=is_main,
        )

    # 5. Add edges
    for u, v, d in G.edges(data=True):
        is_main = u in largest_cc and v in largest_cc
        label = d.get("label", "")
        net.add_edge(
            str(u),
            str(v),
            label=label,
            title=f"Relation: {label}",
            color=_EDGE_COLOR if is_main else _EDGE_COLOR_DIM,
            font={"color": "rgba(0,0,0,0.6)" if is_main else "rgba(0,0,0,0.05)"},
            main=is_main,
        )

    # 6. Save HTML (reserve a path, close the handle, then let PyVis write to
    #    it; the file is always removed afterwards)
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".html", encoding="utf-8"
    ) as tmp:
        tmp_path = tmp.name
    try:
        net.save_graph(tmp_path)
        with open(tmp_path, "r", encoding="utf-8") as f:
            html_content = f.read()
    finally:
        os.remove(tmp_path)

    # 7. Inject JS for click highlighting
    if "</body>" in html_content:
        html_content = html_content.replace("</body>", _HIGHLIGHT_JS + "\n</body>", 1)
    else:
        html_content += _HIGHLIGHT_JS

    # The page is embedded via srcdoc so that its scripts run inside the iframe.
    return (
        f'<iframe srcdoc="{html.escape(html_content, quote=True)}" '
        'style="width: 100%; height: 680px; border: none;"></iframe>'
    )


abstract = (
    "Understanding the atomic structure of magnetite–carboxylic acid interfaces is crucial "
    "for tailoring nanocomposites involving this interface. We present a Monte Carlo (MC)-based "
    "method utilizing iron oxidation state exchange to model magnetite interfaces with tens of "
    "thousands of atoms, scales typically inaccessible by electronic structure calculations. "
    "Charge neutrality is ensured through the oxidation of Fe ions. The MC approach allows "
    "magnetite to adapt to its environment at interfaces without requiring interface-specific "
    "rescaling of force-field parameters, enabling a simple and versatile method. By comparing "
    "adsorption sites, layer distances, and bond lengths with results from electronic structure "
    "calculations and experiments, we validated the accuracy of our approach. We found that the "
    "oxidation state distribution, and consequently the binding site preference, depend on "
    "coverage and surface thickness, with a critical thickness signaling the transition from "
    "layered to bulk-like oxidation states. This method ensures seamless compatibility with "
    "popular biomolecular force fields, providing transferability and simplifying the study of "
    "magnetite interfaces in general."
)

CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)
CACHE_PATH = os.path.join(CACHE_DIR, "kg_cache.html")


def get_preload_output():
    """Return the example graph HTML, generating and caching it on first use."""
    if os.path.exists(CACHE_PATH):
        with open(CACHE_PATH, "r", encoding="utf-8") as f:
            return f.read()

    html_output = extract_and_visualize(abstract)
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        f.write(html_output)
    return html_output


preloaded_output = get_preload_output()


def preload(preloaded):
    """Page-load handler; the incoming state value is not used."""
    # Returns HTML, empty textbox, preloaded state
    return preloaded_output, "", True


# Interface
with gr.Blocks() as demo:
    gr.Markdown("# Scientific Knowledge Graph Generator")

    with gr.Row():
        # Integer scales with the same 0.9 : 1.2 (= 3 : 4) width ratio.
        with gr.Column(scale=3):
            gr.Markdown(
                "A lightweight app that generates scientific knowledge graphs.\n\n"
                "### How it works\n"
                "1. Extracts subject–relation–object (SRO) triples from scientific texts using a [large language model](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3).\n"
                "2. Visualizes SRO triples as interactive knowledge graphs.\n"
            )
            input_text = gr.Textbox(
                label="Text",
                lines=4,
                placeholder="Paste the scientific text here (e.g., abstract).",
            )
            # Clickable example
            example_button = gr.Button("Use Example Abstract (https://arxiv.org/abs/2510.18061)")
            btn = gr.Button("Submit", variant="primary")
            gr.Markdown("Note: Click a node to highlight it and its connections; other nodes/edges are dimmed.")

        with gr.Column(scale=4):
            output_html = gr.HTML(label="Knowledge Graph Visualization")

    preloaded = gr.State(value=False)

    # Load preloaded output
    demo.load(fn=preload, inputs=preloaded, outputs=[output_html, input_text, preloaded])

    # When Submit is clicked
    btn.click(fn=extract_and_visualize, inputs=input_text, outputs=output_html)

    # When Example is clicked
    def fill_example(_=None):
        """Return the example abstract; wired with no inputs, so the argument is optional."""
        return abstract

    example_button.click(fn=fill_example, inputs=None, outputs=input_text)


if __name__ == "__main__":
    demo.launch(share=False)