Perunio committed
Commit 90bc141 · 1 Parent(s): 1a18d8f
.dockerignore ADDED
@@ -0,0 +1,13 @@
+ __pycache__
+ *.pyc
+ .git
+ node_modules
+ .env
+ README.md
+ Dockerfile
+
+ # data folder
+ data
+ predictor/data
+ model/data
+ dataset/data
.gitignore ADDED
@@ -0,0 +1,215 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ #uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+ #poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ #pdm.lock
+ #pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ #pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Cursor
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+ # refer to https://docs.cursor.com/context/ignore-files
+ .cursorignore
+ .cursorindexingignore
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # data
+ data
+ predictor/data
+
+
+ # model
+ *.pth
Dockerfile ADDED
@@ -0,0 +1,23 @@
+ FROM python:3.12.4-slim
+
+ WORKDIR /app
+
+ RUN pip install poetry
+
+ COPY pyproject.toml ./
+ RUN poetry install --no-root
+ RUN poetry run pip install torch-scatter torch-sparse torch-cluster pyg-lib -f https://data.pyg.org/whl/torch-2.3.1+cu121.html
+ RUN poetry run pip install torch-geometric
+
+ COPY galis_app.py ./
+ COPY model ./model
+ COPY dataset ./dataset
+ COPY predictor ./predictor
+ COPY llm ./llm
+
+ ENV GOOGLE_API_KEY=""
+
+ EXPOSE 7860
+
+ CMD ["poetry", "run", "streamlit", "run", "galis_app.py", "--server.port=7860", "--server.address=0.0.0.0"]
+
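For a local smoke test of this image, something like docker build -t galis . followed by docker run -p 7860:7860 -e GOOGLE_API_KEY=your_key galis should work: the container serves the Streamlit app on port 7860 and reads the Gemini key from the GOOGLE_API_KEY environment variable (the galis tag is an arbitrary name chosen for the example, not something the Dockerfile defines).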
dataset/__init__.py ADDED
File without changes
dataset/ogbn_link_pred_dataset.py ADDED
@@ -0,0 +1,101 @@
+ import os
+ import pandas as pd
+ import torch
+ from ogb.nodeproppred import PygNodePropPredDataset
+ from torch_geometric.transforms import RandomLinkSplit
+ from torch_geometric.loader import LinkNeighborLoader
+ from torch_geometric.data import Data
+
+ import requests
+ import gzip
+ import shutil
+
+
+ class OGBNLinkPredDataset:
+     def __init__(
+         self, root_dir: str = "data", val_size: float = 0.1, test_size: float = 0.2
+     ):
+         self._base_dataset = PygNodePropPredDataset(name="ogbn-arxiv", root=root_dir)
+         self.data = self._base_dataset[0]
+         self.root = self._base_dataset.root
+         self.num_features = self._base_dataset.num_features
+
+         self._download_abstracts()
+         self.corpus = self._load_corpus()
+
+         self.train_data, self.val_data, self.test_data = self._split_data(
+             val_size, test_size
+         )
+
+     def _download_abstracts(self):
+         target_dir = os.path.join(self.root, "mapping")
+         tsv_path = os.path.join(target_dir, "titleabs.tsv")
+
+         if not os.path.exists(tsv_path):
+             print("Downloading title and abstract information...")
+             gz_path = tsv_path + ".gz"
+             url = "https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz"
+             os.makedirs(target_dir, exist_ok=True)
+
+             try:
+                 print(f"Downloading from {url}...")
+                 response = requests.get(url, stream=True)
+                 response.raise_for_status()
+                 with open(gz_path, "wb") as f:
+                     for chunk in response.iter_content(chunk_size=8192):
+                         f.write(chunk)
+                 print(f"File downloaded to: {gz_path}")
+
+                 print(f"Decompressing {gz_path}...")
+                 with gzip.open(gz_path, 'rb') as f_in:
+                     with open(tsv_path, 'wb') as f_out:
+                         shutil.copyfileobj(f_in, f_out)
+                 print(f"File extracted to: {tsv_path}")
+
+                 os.remove(gz_path)
+                 print(f"Removed temporary file: {gz_path}")
+
+             except requests.exceptions.RequestException as e:
+                 print(f"Error downloading file: {e}")
+             except Exception as e:
+                 print(f"An error occurred: {e}")
+
+         else:
+             print("Title and abstract file already exists.")
+
+     def _load_corpus(self) -> list[str]:
+         tsv_path = os.path.join(self.root, "mapping", "titleabs.tsv")
+         try:
+             df_text = pd.read_csv(
+                 tsv_path,
+                 sep="\t",
+                 header=None,
+                 names=["paper_id", "title", "abstract"],
+                 lineterminator="\n",
+                 low_memory=False,
+             )
+             df_text_aligned = df_text.reset_index(drop=True)
+             corpus = (
+                 df_text_aligned["title"].fillna("")
+                 + "\n "
+                 + df_text_aligned["abstract"].fillna("")
+             ).tolist()
+             print(f"Corpus created with {len(corpus)} documents.")
+             return corpus
+         except FileNotFoundError:
+             print("Error: titleabs.tsv not found. Could not create corpus.")
+             return []
+
+     def _split_data(self, val_size: float, test_size: float) -> tuple[Data, Data, Data]:
+         transform = RandomLinkSplit(
+             num_val=val_size,
+             num_test=test_size,
+             is_undirected=False,
+             add_negative_train_samples=False,
+         )
+         train_split, val_split, test_split = transform(self.data)
+         print("Data successfully split into train, validation, and test sets.")
+         return train_split, val_split, test_split
+
+     def get_splits(self) -> tuple[Data, Data, Data]:
+         return self.train_data, self.val_data, self.test_data
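A minimal usage sketch for this class (a hedged example, assuming the default root_dir and that downloading ogbn-arxiv plus titleabs.tsv is possible in the environment):

    from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset

    # First use downloads ogbn-arxiv and the title/abstract file, then splits edges for link prediction.
    dataset = OGBNLinkPredDataset(root_dir="data", val_size=0.1, test_size=0.2)
    train_data, val_data, test_data = dataset.get_splits()
    print(train_data)           # torch_geometric.data.Data with edge_label / edge_label_index
    print(len(dataset.corpus))  # one "title\n abstract" string per paper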
galis_app.py ADDED
@@ -0,0 +1,132 @@
+ from pathlib import Path
+ import streamlit as st
+
+ from predictor.link_predictor import (
+     prepare_system,
+     get_citation_predictions,
+     abstract_to_vector,
+     format_top_k_predictions,
+ )
+ from llm.related_work_generator import generate_related_work
+
+ MODEL_PATH = Path("predictor/model.pth")
+
+
+ @st.cache_resource
+ def load_prediction_system(model_path):
+     return prepare_system(model_path)
+
+
+ def app():
+     st.set_page_config(page_title="Galis", layout="wide")
+     st.title("Galis")
+
+     if "references" not in st.session_state:
+         st.session_state.references = None
+     if "related_work" not in st.session_state:
+         st.session_state.related_work = None
+     if "abstract_title" not in st.session_state:
+         st.session_state.abstract_title = ""
+     if "abstract_text" not in st.session_state:
+         st.session_state.abstract_text = ""
+
+     gcn_model, st_model, dataset, z_all = load_prediction_system(MODEL_PATH)
+
+     col1, col2 = st.columns(2, gap="large")
+
+     with col2:
+         references_placeholder = st.empty()
+         related_work_placeholder = st.empty()
+
+     with col1:
+         st.header("Abstract Title")
+         abstract_title = st.text_input(
+             "Paste your title here",
+             st.session_state.abstract_title,
+             key="abstract_title_input",
+             label_visibility="collapsed",
+         )
+
+         st.header("Abstract Text")
+         abstract_input = st.text_area(
+             "Paste your abstract here",
+             st.session_state.abstract_text,
+             key="abstract_text_input",
+             height=100,
+             label_visibility="collapsed",
+         )
+
+         st.write("...or **upload** a .txt file (first line = title, rest = abstract)")
+         uploaded_file = st.file_uploader(
+             "Drag and drop file here", type=["txt"], help="Limit 200MB per file • TXT"
+         )
+
+         if uploaded_file is not None:
+             content = uploaded_file.getvalue().decode("utf-8").splitlines()
+             st.session_state.abstract_title = content[0] if content else ""
+             st.session_state.abstract_text = (
+                 "\n".join(content[1:]) if len(content) > 1 else ""
+             )
+             st.rerun()
+
+         st.session_state.abstract_title = abstract_title
+         st.session_state.abstract_text = abstract_input
+
+         num_citations = st.number_input(
+             "Number of suggestions",
+             min_value=1,
+             max_value=100,
+             value=10,
+             step=1,
+             help="Choose how many paper suggestions you want to see.",
+         )
+
+         if st.button("Suggest References and Related Work", type="primary"):
+             if not abstract_title.strip() or not abstract_input.strip():
+                 st.warning("Please provide both a title and an abstract.")
+             else:
+                 st.session_state.references = None
+                 st.session_state.related_work = None
+                 references_placeholder.empty()
+                 related_work_placeholder.empty()
+
+                 with st.spinner("Analyzing abstract and predicting references..."):
+                     new_vector = abstract_to_vector(
+                         abstract_title, abstract_input, st_model
+                     )
+                     probabilities = get_citation_predictions(
+                         vector=new_vector,
+                         model=gcn_model,
+                         z_all=z_all,
+                         num_nodes=dataset.data.num_nodes,
+                     )
+                     references = format_top_k_predictions(
+                         probabilities, dataset, top_k=num_citations
+                     )
+                     st.session_state.references = references
+
+                 with references_placeholder.container():
+                     st.header("Suggested References")
+                     with st.container(height=200):
+                         st.markdown(st.session_state.references)
+
+                 with related_work_placeholder.container():
+                     with st.spinner("Generating related work section..."):
+                         related_work = generate_related_work(st.session_state.references)
+                         st.session_state.related_work = related_work
+
+     if st.session_state.references:
+         with references_placeholder.container():
+             st.header("Suggested References")
+             with st.container(height=200):
+                 st.markdown(st.session_state.references)
+
+     if st.session_state.related_work:
+         with related_work_placeholder.container():
+             st.header("Suggested Related Work")
+             with st.container(height=200):
+                 st.markdown(st.session_state.related_work)
+
+
+ if __name__ == "__main__":
+     app()
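Outside the container, the app is started the same way the Dockerfile's CMD does, i.e. streamlit run galis_app.py (or poetry run streamlit run galis_app.py), and the related-work step needs GOOGLE_API_KEY to be available, either exported in the shell or placed in a .env file that llm/related_work_generator.py picks up via load_dotenv.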
llm/__init__.py ADDED
File without changes
llm/related_work_generator.py ADDED
@@ -0,0 +1,124 @@
+ from dotenv import load_dotenv
+ import os
+ import structlog
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_core.prompts import PromptTemplate
+ from langchain_core.output_parsers import StrOutputParser
+
+ structlog.configure(
+     processors=[
+         structlog.processors.TimeStamper(fmt="iso"),
+         structlog.processors.JSONRenderer(indent=4, sort_keys=True),
+     ]
+ )
+ logger = structlog.get_logger()
+
+ load_dotenv()
+
+ PROMPT_TEXT = """
+ You are a research assistant AI specializing in academic writing. Your task is to generate a "Related Work" section
+ for a research paper. You will be given a list of citations.
+
+ Your goal is to synthesize the provided citations into a coherent and well-structured "Related Work" section that
+ contextualizes the user's project within the existing academic literature.
+
+ **PROVIDED CITATIONS:**
+ {citations}
+
+ **INSTRUCTIONS:**
+
+ 1. **Thematic Organization:** Do not simply list summaries of the papers. Group the provided citations into thematic
+ categories based on shared concepts, methodologies, or research problems. For example, you could create categories like
+ "Transformer-based Language Models," "Sentiment Analysis Techniques," and "Efficient Models for NLP." Introduce each
+ theme before discussing the relevant papers.
+
+ 2. **Synthesis and Analysis:** For each thematic group, synthesize the key contributions and findings of the papers.
+ Go beyond summarization; compare and contrast the different approaches. For instance, you could discuss the evolution
+ of certain methods or the trade-offs between different models (e.g., accuracy vs. computational efficiency).
+
+ 3. **Identify Research Gaps:** Critically analyze the literature you are reviewing. Explicitly identify the
+ limitations, open questions, or research gaps that the cited works leave unresolved. This will set the stage for
+ introducing the project's contribution.
+
+ 4. **Contextualize the User's Project:** After discussing a thematic group of papers and identifying a gap, clearly
+ and explicitly state how the user's project (described above) addresses this gap or builds upon the existing work. Use
+ phrases like: "While these methods have shown great success, they struggle with...", "To address this limitation, our
+ work introduces...", or "Building upon the foundation laid by [Author, Year], we propose a novel approach that...".
+
+ 5. **Academic Tone and Flow:** Maintain a formal, objective, and academic tone throughout the text. Ensure smooth
+ transitions between paragraphs and ideas to create a coherent narrative that logically leads the reader to understand
+ the novelty and importance of the user's project.
+
+ 6. **Output Format:** Generate only the text for the "Related Work" section. Do not include headers like
+ "INSTRUCTIONS" or "PROVIDED CITATIONS" in the final output. The entire response should be the section text itself.
+ """
+
+
+ def check_api_key():
+     api_key = os.getenv("GOOGLE_API_KEY")
+     if not api_key:
+         logger.error("GOOGLE_API_KEY not set")
+         return False
+     logger.info(f"Gemini API Key is loaded: {api_key[:10]}...")
+     return True
+
+
+ def create_related_work_pipeline():
+     """Creates a ready-to-use pipeline for generating the Related Work section."""
+
+     llm = ChatGoogleGenerativeAI(
+         model="gemini-1.5-flash",
+         temperature=0.3
+     )
+
+     prompt = PromptTemplate(
+         input_variables=["citations"],
+         template=PROMPT_TEXT
+     )
+
+     parser = StrOutputParser()
+
+     chain = prompt | llm | parser
+
+     return chain
+
+
+ def generate_related_work(citations_text: str) -> str:
+     """
+     Main function - pass citations, get Related Work
+
+     Args:
+         citations_text: Text with citations (can be a list or a string)
+
+     Returns:
+         The generated Related Work section
+     """
+     pipeline = create_related_work_pipeline()
+     result = pipeline.invoke({"citations": citations_text})
+     return result
+
+
+ if __name__ == "__main__":
+
+     my_citations = """
+     Top 5 Citation Predictions:
+     - Title: 'deterministic construction of rip matrices in compressed sensing from constant weight codes'
+     - Title: 'mizar items exploring fine grained dependencies in the mizar mathematical library'
+     - Title: 'rateless lossy compression via the extremes'
+     - Title: 'towards autonomic service provisioning systems'
+     - Title: 'anonymization with worst case distribution based background knowledge'
+     """
+
+     print("Generating Related Work...")
+     print("=" * 50)
+
+     try:
+         related_work = generate_related_work(my_citations)
+         print(related_work)
+     except Exception as e:
+         print(f"Error: {e}")
+         print("\n=== SETUP INSTRUCTIONS ===")
+         print("1. Create a .env file in the same folder as this script")
+         print("2. Add the line: GOOGLE_API_KEY=your_key")
+         print("3. Get a key at: https://makersuite.google.com/app/apikey")
+         check_api_key()
model/__init__.py ADDED
File without changes
model/simple_gcn_model.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ import torch.nn.functional as F
+ from torch_geometric.nn import GCNConv
+
+
+ class EdgeDecoder(torch.nn.Module):
+     """Predict whether a citation edge exists between two node embeddings."""
+
+     def __init__(self, in_channels):
+         super().__init__()
+         self.linear = torch.nn.Linear(in_channels * 2, 1)
+
+     def forward(self, z, edge_index):
+         row, col = edge_index
+         # Concatenate the embeddings of the two nodes
+         z_cat = torch.cat([z[row], z[col]], dim=-1)
+         return self.linear(z_cat).squeeze(-1)
+
+
+ class SimpleGCN(torch.nn.Module):
+     """Encoder-decoder model: the encoder produces an embedding for each node, and the decoder predicts link existence between node embeddings."""
+
+     def __init__(self, in_channels, hidden_channels, out_channels):
+         super().__init__()
+         self.conv1 = GCNConv(in_channels, hidden_channels)
+         self.conv2 = GCNConv(hidden_channels, out_channels)
+         self.decoder = EdgeDecoder(out_channels)
+
+     def forward(self, x, edge_index):
+         x = self.conv1(x, edge_index).relu()
+         x = F.dropout(x, p=0.5, training=self.training)
+         z = self.conv2(x, edge_index)
+         return z
+
+     def decode(self, z, edge_label_index):
+         # We pass the edge_label_index to the decoder, which contains both pos and neg edges
+         return self.decoder(z, edge_label_index)
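A small sketch of the encode/decode flow with toy tensors (sizes are arbitrary here; only the 128-dimensional feature width mirrors the ogbn-arxiv setup used elsewhere in this commit):

    import torch
    from model.simple_gcn_model import SimpleGCN

    model = SimpleGCN(in_channels=128, hidden_channels=256, out_channels=128)

    x = torch.randn(4, 128)                            # 4 toy nodes with 128-dim features
    edge_index = torch.tensor([[0, 1, 2], [1, 2, 3]])  # message-passing edges
    candidates = torch.tensor([[0, 2], [3, 1]])        # (source, target) pairs to score

    z = model(x, edge_index)              # encoder: node embeddings of shape [4, 128]
    logits = model.decode(z, candidates)  # decoder: one logit per candidate pair
    probs = torch.sigmoid(logits)         # link probabilities, shape [2]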
model/train.py ADDED
@@ -0,0 +1,139 @@
+ import torch
+ import numpy as np
+ from torch_geometric.loader import LinkNeighborLoader
+ from sklearn.metrics import roc_auc_score, accuracy_score
+ from tqdm import tqdm
+ from model.simple_gcn_model import SimpleGCN
+ from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
+
+
+ BATCH_SIZE = 128
+ NUM_EPOCHS = 20
+ LR = 0.001
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # data
+ dataset = OGBNLinkPredDataset(val_size=0.1, test_size=0.2)
+ train_data, val_data, test_data = dataset.get_splits()
+
+ train_loader = LinkNeighborLoader(
+     train_data,
+     num_neighbors=[-1, -1],  # Use all neighbors
+     neg_sampling_ratio=1.0,  # 1 negative sample per positive edge
+     edge_label_index=train_data.edge_label_index,
+     edge_label=train_data.edge_label,
+     batch_size=BATCH_SIZE,
+     shuffle=True,
+     num_workers=4,
+ )
+
+ val_loader = LinkNeighborLoader(
+     val_data,
+     num_neighbors=[-1, -1],
+     neg_sampling_ratio=0.0,  # RandomLinkSplit already added negative edges
+     edge_label_index=val_data.edge_label_index,
+     edge_label=val_data.edge_label,
+     batch_size=BATCH_SIZE,
+     shuffle=False,
+     num_workers=4,
+ )
+
+ test_loader = LinkNeighborLoader(
+     test_data,
+     num_neighbors=[-1, -1],
+     neg_sampling_ratio=0.0,
+     edge_label_index=test_data.edge_label_index,
+     edge_label=test_data.edge_label,
+     batch_size=BATCH_SIZE,
+     shuffle=False,
+     num_workers=4,
+ )
+
+ # model
+ model = SimpleGCN(
+     in_channels=dataset.num_features,
+     hidden_channels=256,
+     out_channels=128,
+ ).to(DEVICE)
+
+ optimizer = torch.optim.Adam(model.parameters(), lr=LR)
+ criterion = torch.nn.BCEWithLogitsLoss()
+
+
+ # training
+ def train(train_loader, epoch):
+     model.train()
+     total_loss = 0
+     scaler = torch.cuda.amp.GradScaler()
+
+     pbar = tqdm(train_loader, desc=f"Training Epoch: {epoch}")
+     for batch in pbar:
+         batch = batch.to(DEVICE)
+         optimizer.zero_grad()
+
+         with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16, enabled=True):
+             z = model(batch.x, batch.edge_index)
+             out = model.decode(z, batch.edge_label_index)
+             labels = batch.edge_label.float()
+
+             loss = criterion(out, labels)
+
+         scaler.scale(loss).backward()
+         scaler.step(optimizer)
+         scaler.update()
+
+         total_loss += loss.item()
+         pbar.set_postfix(loss=f"{loss.item():.4f}")
+
+     return total_loss / len(train_loader)
+
+
+ @torch.no_grad()
+ def calc_metrics(loader):
+     model.eval()
+     all_scores = []
+     all_labels = []
+
+     pbar = tqdm(loader, desc="Testing")
+     for batch in pbar:
+         batch = batch.to(DEVICE)
+         with torch.autocast(device_type=DEVICE.type, dtype=torch.bfloat16):
+             z = model(batch.x, batch.edge_index)
+             out = model.decode(z, batch.edge_label_index)
+
+         scores = torch.sigmoid(out).float().cpu().numpy()
+         labels = batch.edge_label.cpu().numpy()
+
+         all_scores.append(scores)
+         all_labels.append(labels)
+
+     all_scores = np.concatenate(all_scores)
+     all_labels = np.concatenate(all_labels)
+
+     return roc_auc_score(all_labels, all_scores), accuracy_score(
+         all_labels, all_scores > 0.5
+     )
+
+
+ if __name__ == "__main__":
+     best_val_auc = 0
+     best_auc = 0
+     for epoch in range(1, NUM_EPOCHS + 1):
+         loss = train(train_loader, epoch)
+         val_auc, val_acc = calc_metrics(val_loader)
+
+
+         print(
+             f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, Val acc: {val_acc:.4f}",
+             end=" ",
+         )
+         if val_auc > best_val_auc:
+             print("New best")
+             best_val_auc = val_auc
+             best_auc = val_auc
+             torch.save(model.state_dict(), "model.pth")
+
+     test_auc, test_acc = calc_metrics(test_loader)
+
+     print("-" * 30)
+     print(f"Best validation AUC: {best_auc:.4f}, Test AUC: {test_auc:.4f}, Test acc: {test_acc:.4f}")
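A note on running this script: it imports model.simple_gcn_model and dataset.ogbn_link_pred_dataset as packages, so it is presumably meant to be launched from the repository root, e.g. poetry run python -m model.train. The best checkpoint is saved as model.pth in the working directory, while the Streamlit app loads predictor/model.pth (see MODEL_PATH in galis_app.py), so the file has to be copied or moved there before the app or the Docker image can pick it up.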
predictor/__init__.py ADDED
File without changes
predictor/link_predictor.py ADDED
@@ -0,0 +1,156 @@
+ from pathlib import Path
+ import torch
+ import structlog
+
+ from sentence_transformers import SentenceTransformer
+ from model.simple_gcn_model import SimpleGCN
+ from dataset.ogbn_link_pred_dataset import OGBNLinkPredDataset
+
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ structlog.configure(
+     processors=[
+         structlog.processors.TimeStamper(fmt="iso"),
+         structlog.processors.JSONRenderer(indent=4, sort_keys=True),
+     ]
+ )
+ logger = structlog.get_logger()
+
+
+ def abstract_to_vector(
+     title: str, abstract_text: str, st_model: SentenceTransformer
+ ) -> torch.Tensor:
+     text = title + "\n" + abstract_text
+     with torch.no_grad():
+         vector = st_model.encode(text, convert_to_tensor=True, device=DEVICE)
+     return vector.unsqueeze(0)
+
+
+ def get_citation_predictions(
+     vector: torch.Tensor, model: SimpleGCN, z_all: torch.Tensor, num_nodes: int
+ ) -> torch.Tensor:
+     model.eval()
+     with torch.no_grad():
+         empty_edge_index = torch.empty(2, 0, dtype=torch.long, device=DEVICE)
+         h1_new = model.conv1(vector, edge_index=empty_edge_index).relu()
+         z_new = model.conv2(h1_new, edge_index=empty_edge_index)
+
+     new_node_idx = num_nodes
+     row = torch.full((num_nodes,), fill_value=new_node_idx, device=DEVICE)
+     col = torch.arange(num_nodes, device=DEVICE)
+     edge_label_index_to_check = torch.stack([row, col], dim=0)
+
+     z_combined = torch.cat([z_all, z_new], dim=0)
+
+     with torch.no_grad():
+         logits = model.decode(z_combined, edge_label_index_to_check)
+
+     return torch.sigmoid(logits)
+
+
+ def format_top_k_predictions(
+     probs: torch.Tensor, dataset: OGBNLinkPredDataset, top_k=10, show_prob=False
+ ) -> str:
+     """
+     Formats the top K predictions into a single string for display.
+
+     Args:
+         probs (torch.Tensor): The tensor of probabilities for all potential links.
+         dataset (OGBNLinkPredDataset): The dataset object containing the corpus.
+         top_k (int): The number of top predictions to format.
+
+     Returns:
+         str: A formatted string with the top K predictions.
+     """
+     probs = probs.cpu()
+     top_probs, top_indices = torch.topk(probs, k=top_k)
+
+     output_lines = []
+
+     header = f"Top {top_k} Citation Predictions:"
+     output_lines.append(header)
+
+     for i in range(top_k):
+         paper_idx = top_indices[i].item()
+         prob = top_probs[i].item()
+         paper_info = dataset.corpus[paper_idx]
+         paper_title = paper_info.split("\n")[0]
+         if show_prob:
+             line = f" - Title: '{paper_title.strip()}', Probability: {prob:.4f}"
+         else:
+             line = f" - Title: '{paper_title.strip()}'"
+         output_lines.append(line)
+
+     return "\n".join(output_lines)
+
+
+ def prepare_system(model_path: Path):
+     """
+     Performs all one-time, expensive operations to prepare the system.
+     Initializes models, loads data, and pre-calculates embeddings using structured logging.
+     """
+     logger.info("system_preparation.start")
+
+     dataset = OGBNLinkPredDataset()
+     data = dataset.data.to(DEVICE)
+     logger.info("dataset.load.success")
+
+     model_name = "bongsoo/kpf-sbert-128d-v1"
+     logger.info(
+         "model.load.start", model_type="SentenceTransformer", model_name=model_name
+     )
+     st_model = SentenceTransformer(model_name, device=DEVICE)
+     logger.info("model.load.success", model_type="SentenceTransformer")
+
+     gcn_model = SimpleGCN(
+         in_channels=dataset.num_features, hidden_channels=256, out_channels=128
+     ).to(DEVICE)
+
+     if model_path.exists():
+         gcn_model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+         logger.info("model.load.success", model_type="GCN", path=str(model_path))
+     else:
+         logger.warning(
+             "model.load.failure",
+             model_type="GCN",
+             path=str(model_path),
+             reason="File not found, using random weights.",
+         )
+     gcn_model.eval()
+
+     logger.info("embeddings.calculation.start", embedding_name="z_all")
+     with torch.no_grad():
+         z_all = gcn_model(data.x, data.edge_index)
+
+     logger.info(
+         "embeddings.calculation.success",
+         embedding_name="z_all",
+         shape=list(z_all.shape),
+     )
+
+     logger.info("system_preparation.finish", status="ready_for_predictions")
+     return gcn_model, st_model, dataset, z_all
+
+
+ if __name__ == "__main__":
+     MODEL_PATH = Path("model.pth")
+
+     gcn_model, st_model, dataset, z_all = prepare_system(MODEL_PATH)
+
+     my_title = "A Survey of Graph Neural Networks for Link Prediction"
+     my_abstract = """Link prediction is a critical task in graph analysis.
+     In this paper, we review various GNN architectures like GCN and GraphSAGE for predicting edges.
+     """
+
+     new_vector = abstract_to_vector(my_title, my_abstract, st_model)
+
+     probabilities = get_citation_predictions(
+         vector=new_vector,
+         model=gcn_model,
+         z_all=z_all,
+         num_nodes=dataset.data.num_nodes,
+     )
+
+     references = format_top_k_predictions(probabilities, dataset, top_k=5)
+     print(references)
pyproject.toml ADDED
@@ -0,0 +1,49 @@
+ [tool.poetry]
+ name = "galis"
+ version = "0.3.0"
+ description = ""
+ authors = ["Perunio <[email protected]>"]
+ readme = "README.md"
+ packages = [{include = "galis"}]
+
+ [tool.poetry.dependencies]
+ python = ">=3.12, <3.13"
+
+ torch = [
+     {version = "2.3.1+cu121", source = "pytorch-cuda", markers = "sys_platform == 'linux' or sys_platform == 'win32'"},
+     {version = "^2.3.1", source = "pytorch-cpu", markers = "sys_platform == 'darwin'"}
+ ]
+
+ ogb = "^1.3.6"
+ torch-geometric = "^2.6.1"
+ pandas = "^2.3.1"
+ streamlit = "^1.46.1"
+ numpy = "1.26.4"
+ streamlit-extras = "^0.7.5"
+ sentence-transformers = "2.7.0"
+ transformers = "4.39.3"
+ ruff = "^0.12.7"
+ structlog = "^25.4.0"
+ langchain-google-genai = "^2.1.9"
+ langchain-core = "^0.3.72"
+ langchain = "^0.3.27"
+ python-dotenv = "^1.1.1"
+ hf-transfer = "^0.1.9"
+
+ [[tool.poetry.source]]
+ name = "pytorch-cuda"
+ url = "https://download.pytorch.org/whl/cu121"
+ priority = "explicit"
+
+ [[tool.poetry.source]]
+ name = "pytorch-cpu"
+ url = "https://download.pytorch.org/whl/cpu"
+ priority = "explicit"
+
+ [[tool.poetry.source]]
+ name = "PyPI"
+ priority = "primary"
+
+ [build-system]
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
+ build-backend = "poetry.core.masonry.api"
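For a local, non-Docker environment, the Dockerfile above sketches the intended install sequence: poetry install --no-root, then poetry run pip install torch-scatter torch-sparse torch-cluster pyg-lib -f https://data.pyg.org/whl/torch-2.3.1+cu121.html followed by poetry run pip install torch-geometric for the PyG companion wheels matching the pinned torch 2.3.1+cu121 build; on macOS the pytorch-cpu source declared above applies instead, so the CUDA wheel index would not be used.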