# Spaces: cstr / conceptnet_normalized (Running)
# File size: 23,694 Bytes

import json
import os
import sqlite3
import time
from collections import defaultdict
from contextlib import closing
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
import pandas as pd
from huggingface_hub import hf_hub_download

# ===== CONFIGURATION =====
# Language codes accepted by the explorer, followed by the Hugging Face
# dataset repo and file name of the normalized SQLite database.
TARGET_LANGUAGES = [
    'en', 'fr', 'it', 'de', 'es', 'ar', 'fa', 'grc', 'he', 'la', 'hbo',
]
NORMALIZED_REPO_ID = "cstr/conceptnet-normalized-multi"
NORMALIZED_DB_FILE = "conceptnet_normalized.db"

CONCEPTNET_BASE = "http://conceptnet.io"
# =========================

# --- All relations MUST be full URLs ---
# Short relation name -> full relation URL under CONCEPTNET_BASE. The order of
# the names below is the order in which the relations are listed in the UI.
CONCEPTNET_RELATIONS: Dict[str, str] = {
    relation: f"{CONCEPTNET_BASE}/r/{relation}"
    for relation in (
        "RelatedTo",
        "IsA",
        "PartOf",
        "HasA",
        "UsedFor",
        "CapableOf",
        "AtLocation",
        "Causes",
        "HasSubevent",
        "HasFirstSubevent",
        "HasLastSubevent",
        "HasPrerequisite",
        "HasProperty",
        "MotivatedByGoal",
        "ObstructedBy",
        "Desires",
        "CreatedBy",
        "Synonym",
        "Antonym",
        "DistinctFrom",
        "DerivedFrom",
        "SymbolOf",
        "DefinedAs",
        "MannerOf",
        "LocatedNear",
        "HasContext",
        "SimilarTo",
        "EtymologicallyRelatedTo",
        "EtymologicallyDerivedFrom",
        "CausesDesire",
        "MadeOf",
        "ReceivesAction",
        "ExternalURL",
        "NotDesires",
        "NotUsedFor",
        "NotCapableOf",
        "NotHasProperty",
    )
}
# =========================

# Startup banner: configured languages and number of known relations.
print(f"🌍 Languages: {', '.join(map(str.upper, TARGET_LANGUAGES))}")
print(f"📚 Relations: {len(CONCEPTNET_RELATIONS)} relations loaded")

def log_progress(message, level="INFO"):
    """Print *message* to stdout with an HH:MM:SS timestamp and a level emoji.

    Levels that have no emoji registered below are printed with an empty
    prefix.
    """
    level_icons = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "DEBUG": "🔍",
    }
    icon = level_icons.get(level, "")
    now = time.strftime("%H:%M:%S")
    print(f"[{now}] {icon} {message}")

def download_normalized_database():
    """Fetch the normalized database file from the Hugging Face Hub.

    Returns:
        The local path returned by ``hf_hub_download``, or ``None`` when the
        call raises. Failures are logged, not propagated.
    """
    log_progress(f"Downloading/Verifying {NORMALIZED_DB_FILE}...", "INFO")
    try:
        # Either downloads the file or reuses the locally cached copy.
        local_path = hf_hub_download(
            repo_id=NORMALIZED_REPO_ID,
            filename=NORMALIZED_DB_FILE,
            repo_type="dataset",
        )
    except Exception as exc:
        log_progress(f"Failed to download DB: {exc}", "ERROR")
        return None
    return local_path

# Resolve the database location once at import time; a falsy DB_PATH means
# the download failed and the query functions will report an error.
DB_PATH = download_normalized_database()

if DB_PATH:
    log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS")
else:
    log_progress("DATABASE NOT FOUND. App will not function.", "ERROR")

def get_db_connection():
    """Open a new read-only connection to the SQLite database at ``DB_PATH``.

    The caller owns the returned connection and is responsible for closing
    it. Using a ``sqlite3.Connection`` directly as a context manager only
    commits or rolls back the transaction; it does NOT close the connection,
    so wrap the result in ``contextlib.closing`` when using ``with``.

    Returns:
        sqlite3.Connection: a read-only connection configured with a page
        cache of about 256 MB and in-memory temp storage.
        ``check_same_thread=False`` only disables sqlite3's same-thread
        check so the connection can be used from a thread other than the one
        that created it; it adds no locking of its own.

    Raises:
        RuntimeError: if ``DB_PATH`` is not set.
        sqlite3.Error: if the database cannot be opened or configured.
    """
    if not DB_PATH:
        raise RuntimeError("Database path is not set. Cannot create connection.")
    # Build the URI with pathlib so characters that are special inside a URI
    # ('?', '#', '%', spaces, Windows drive letters) are percent-encoded
    # instead of silently truncating or corrupting the file name.
    db_uri = f"{Path(DB_PATH).absolute().as_uri()}?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False)
    try:
        # A negative cache_size is expressed in KiB, i.e. roughly 256 MB.
        conn.execute("PRAGMA cache_size = -256000")
        conn.execute("PRAGMA temp_store = MEMORY")
    except sqlite3.Error:
        # Do not leak the handle when configuration fails.
        conn.close()
        raise
    return conn

def node_url_to_label(url: str) -> str:
    """Return the human-readable term of a ConceptNet node URL.

    Node URLs have the form ``http://conceptnet.io/c/{lang}/{term}/...``.
    Split on ``/``, the marker ``c`` sits at index 3 and the term at index 5.

    Args:
        url: The node URL to convert.

    Returns:
        The term with underscores replaced by spaces. ``url`` is returned
        unchanged when it is not a string or does not look like a ``/c/``
        node URL.
    """
    # Explicit guard instead of a bare ``except``: non-string values (e.g.
    # None) are passed through without masking unrelated exceptions.
    if not isinstance(url, str):
        return url
    parts = url.split('/')
    if len(parts) >= 6 and parts[3] == 'c':
        return parts[5].replace('_', ' ')
    return url  # Not a node URL: fall back to the input itself

def _collect_edges(cursor, node_pks, rel_pk_to_name, match_column, label_column,
                   per_relation=7, row_limit=200):
    """Group the highest-weight edges touching ``node_pks`` by relation name.

    Runs a single query over ``edge_norm`` joined to ``node_norm``.

    Args:
        cursor: Open sqlite3 cursor on the normalized database.
        node_pks: Primary keys of the nodes being profiled.
        rel_pk_to_name: Maps relation primary key -> short relation name; its
            keys restrict which relations are queried.
        match_column: ``edge_norm`` column compared against ``node_pks``
            (``"start_fk"`` for outgoing edges, ``"end_fk"`` for incoming).
        label_column: ``edge_norm`` column joined to ``node_norm`` to obtain
            the URL of the node at the other end of the edge.
        per_relation: Maximum number of edges kept per relation.
        row_limit: SQL ``LIMIT`` applied across all relations before grouping.

    Returns:
        A ``defaultdict(list)`` mapping relation name to ``(label, weight)``
        tuples in descending weight order.
    """
    grouped = defaultdict(list)
    if not node_pks or not rel_pk_to_name:
        return grouped  # Nothing to match; skip the query entirely

    node_marks = ','.join('?' * len(node_pks))
    rel_marks = ','.join('?' * len(rel_pk_to_name))
    # Column names are interpolated, but they only ever come from the fixed
    # literals passed by get_semantic_profile, never from user input.
    # NOTE: the LIMIT is global, so a relation with many high-weight edges
    # can crowd lower-weight relations out of the result.
    sql = f"""
        SELECT
            e.rel_fk, n.node_url, e.weight
        FROM edge_norm e
        JOIN node_norm n ON e.{label_column} = n.node_pk
        WHERE
            e.{match_column} IN ({node_marks})
            AND e.rel_fk IN ({rel_marks})
        ORDER BY e.weight DESC
        LIMIT ?
    """
    cursor.execute(sql, (*node_pks, *rel_pk_to_name, int(row_limit)))

    for rel_pk, node_url, weight in cursor.fetchall():
        rel_name = rel_pk_to_name.get(rel_pk)
        if rel_name and len(grouped[rel_name]) < per_relation:
            grouped[rel_name].append((node_url_to_label(node_url), weight))
    return grouped


def get_semantic_profile(word: str, lang: str = 'en', selected_relations: Optional[List[str]] = None, progress=gr.Progress()):
    """Stream a Markdown semantic profile for ``word`` in language ``lang``.

    This is a generator: it yields the Markdown document repeatedly as it
    grows (header, matched nodes, then one section per relation), and finally
    the complete document with a total count.

    The lookup uses four queries in total: one (or two, with the LIKE
    fallback) to find the node(s), one to resolve relation keys, and one each
    for outgoing and incoming edges.

    Args:
        word: Term to look up. It is stripped, lower-cased and spaces are
            replaced by underscores before building the node URL.
        lang: Language code; must be one of ``TARGET_LANGUAGES``.
        selected_relations: Short relation names (keys of
            ``CONCEPTNET_RELATIONS``). Falls back to a built-in default set
            when empty or ``None``.
        progress: Gradio progress tracker.

    Yields:
        Markdown strings. Invalid input, a missing database, a missing node
        or any exception each produce a single explanatory message.
    """
    log_progress(f"Profile: {word} ({lang})", "INFO")

    # Normalize BEFORE validating: whitespace-only input would otherwise pass
    # the check, collapse to an empty term and make the LIKE fallback below
    # match arbitrary nodes of the language.
    word = word.strip().lower().replace(' ', '_') if isinstance(word, str) else ""
    if not word or lang not in TARGET_LANGUAGES:
        yield "⚠️ Invalid input"
        return

    if not DB_PATH:
        yield "❌ **Error:** Database file not found."
        return

    # Set default relations if none are selected
    if not selected_relations:
        selected_relations = [
            "IsA", "RelatedTo", "PartOf", "HasA", "UsedFor",
            "CapableOf", "Synonym", "Antonym"
        ]

    exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}"

    output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"

    try:
        # closing() guarantees the connection is closed when the generator
        # finishes or is discarded; a bare ``with connection`` would only
        # commit/rollback and leak the handle (and its page cache).
        with closing(get_db_connection()) as conn:
            cursor = conn.cursor()
            progress(0, desc="Starting...")
            yield output_md

            # === STEP 1: Find Node PKs ===
            progress(0.05, desc="Finding nodes...")

            cursor.execute("SELECT node_pk, node_url FROM node_norm WHERE node_url = ?", (exact_path,))
            exact_node = cursor.fetchone()

            if exact_node:
                log_progress(f"Found exact node: {exact_node[1]}", "SUCCESS")
                nodes = [exact_node]
            else:
                log_progress("No exact node, falling back to LIKE...", "WARN")
                # Prefix match; note that '_' in the term also acts as a
                # single-character LIKE wildcard.
                like_path = f"{exact_path}%"
                cursor.execute("SELECT node_pk, node_url FROM node_norm WHERE node_url LIKE ? LIMIT 5", (like_path,))
                nodes = cursor.fetchall()
                if not nodes:
                    yield f"# 🧠 '{word}'\n\n⚠️ Not found"
                    return

            node_pks = [pk for pk, _ in nodes]
            nodes_found = [(url, node_url_to_label(url)) for _, url in nodes]

            for node_url, label in nodes_found[:3]:
                output_md += f"**Node:** `{node_url}` → **{label}**\n"
            output_md += "\n"
            yield output_md

            # === STEP 2: Find Relation PKs ===
            progress(0.15, desc="Finding relations...")

            rel_urls_to_query = tuple(CONCEPTNET_RELATIONS[name] for name in selected_relations if name in CONCEPTNET_RELATIONS)
            if not rel_urls_to_query:
                output_md += "⚠️ No valid relations selected."
                yield output_md
                return

            rel_placeholders = ','.join(['?'] * len(rel_urls_to_query))
            cursor.execute(f"SELECT rel_pk, rel_url FROM rel_norm WHERE rel_url IN ({rel_placeholders})", rel_urls_to_query)

            # Reverse map built once (URL -> short name) instead of scanning
            # CONCEPTNET_RELATIONS for every returned row.
            url_to_name = {url: name for name, url in CONCEPTNET_RELATIONS.items()}
            rel_pk_to_name = {
                pk: url_to_name[url]
                for pk, url in cursor.fetchall()
                if url in url_to_name
            }
            relations_in_db = set(rel_pk_to_name.values())

            # === STEP 3: Run ONE query for ALL outgoing edges ===
            progress(0.4, desc="Querying outgoing edges...")
            outgoing_results = _collect_edges(cursor, node_pks, rel_pk_to_name, "start_fk", "end_fk")

            # === STEP 4: Run ONE query for ALL incoming edges ===
            progress(0.7, desc="Querying incoming edges...")
            incoming_results = _collect_edges(cursor, node_pks, rel_pk_to_name, "end_fk", "start_fk")

            # === STEP 5: Format results as Markdown ===
            progress(0.9, desc="Formatting results...")
            total = 0
            for rel_name in selected_relations:
                if rel_name not in relations_in_db:
                    continue  # Skip if this relation wasn't in the DB

                output_md += f"## {rel_name}\n\n"
                found = False

                for label, weight in outgoing_results.get(rel_name, []):
                    output_md += f"- **{word}** {rel_name} → *{label}* `[{weight:.3f}]`\n"
                    found = True
                    total += 1

                for label, weight in incoming_results.get(rel_name, []):
                    output_md += f"- *{label}* {rel_name} → **{word}** `[{weight:.3f}]`\n"
                    found = True
                    total += 1

                if not found:
                    output_md += "*No results*\n"

                output_md += "\n"
                yield output_md  # Yield after each relation is formatted

            output_md += f"---\n**Total relations:** {total}\n"
            log_progress(f"Profile complete: {total} relations", "SUCCESS")
            progress(1.0, desc="✅ Complete!")
            yield output_md

    except Exception as e:
        # Top-level boundary for the UI: log, print the traceback, and show
        # the error to the user instead of crashing the request.
        log_progress(f"Error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        yield f"**❌ Error:** {e}"

def _node_like_pattern(node, lang):
    """Return the LIKE prefix pattern for a node filter, or None if it is blank.

    A value starting with ``http://`` is used as a URL prefix as-is;
    anything else is treated as a word in language ``lang``.
    """
    if not node or not node.strip():
        return None
    node = node.strip()  # Strip first so padded URLs are still recognized
    if node.startswith('http://'):
        return f"{node}%"
    return f"{CONCEPTNET_BASE}/c/{lang}/{node.lower().replace(' ', '_')}%"


def run_query(start_node, start_lang, relation, end_node, end_lang, limit, progress=gr.Progress()):
    """Run a filtered edge query against the normalized database.

    Args:
        start_node: Optional word or ``http://`` URL prefix for the start node.
        start_lang: Language code used when ``start_node`` is a plain word.
        relation: Optional short relation name (key of
            ``CONCEPTNET_RELATIONS``); blank means all relations.
        end_node: Optional word or ``http://`` URL prefix for the end node.
        end_lang: Language code used when ``end_node`` is a plain word.
        limit: Maximum number of rows to return.
        progress: Gradio progress tracker.

    Returns:
        A ``(DataFrame, status_markdown)`` tuple. The DataFrame is empty when
        there are no results, the relation name is unknown, or an error
        occurred; the status string explains which.
    """
    log_progress(f"Query: start={start_node} ({start_lang}), rel={relation}, end={end_node} ({end_lang})", "INFO")
    progress(0, desc="Building...")

    if not DB_PATH:
        return pd.DataFrame(), "❌ **Error:** Database file not found."

    query = """
        SELECT
            n_start.node_url AS start_url,
            r.rel_url AS relation_url,
            n_end.node_url AS end_url,
            e.weight
        FROM edge_norm e
        JOIN node_norm n_start ON e.start_fk = n_start.node_pk
        JOIN node_norm n_end ON e.end_fk = n_end.node_pk
        JOIN rel_norm r ON e.rel_fk = r.rel_pk
    """

    params = []
    where_clauses = []

    try:
        progress(0.3, desc="Adding filters...")

        # Start node - uses start_lang
        start_pattern = _node_like_pattern(start_node, start_lang)
        if start_pattern is not None:
            where_clauses.append("n_start.node_url LIKE ?")
            params.append(start_pattern)

        # Relation
        if relation and relation.strip():
            rel_name = relation.strip()
            rel_value = CONCEPTNET_RELATIONS.get(rel_name)
            if rel_value is None:
                # Dropping the filter silently would return edges of every
                # relation, which is misleading; report the problem instead.
                log_progress(f"Unknown relation: {rel_name}", "WARN")
                return pd.DataFrame(), f"⚠️ Unknown relation: `{rel_name}`"
            where_clauses.append("r.rel_url = ?")
            params.append(rel_value)

        # End node - uses end_lang
        end_pattern = _node_like_pattern(end_node, end_lang)
        if end_pattern is not None:
            where_clauses.append("n_end.node_url LIKE ?")
            params.append(end_pattern)

        if where_clauses:
            query += " WHERE " + " AND ".join(where_clauses)

        query += " ORDER BY e.weight DESC LIMIT ?"
        # Coerce so a value that arrives as a float (e.g. 50.0) is always
        # bound to LIMIT as an integer.
        params.append(int(limit))

        progress(0.6, desc="Executing...")

        # Open the connection only for the execution, and close it afterwards;
        # a bare ``with connection`` would not close it.
        with closing(get_db_connection()) as conn:
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time

        log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS")
        progress(1.0, desc="Done!")

        if df.empty:
            return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"

        # Add user-friendly labels from the URLs
        df['start_label'] = df['start_url'].apply(node_url_to_label)
        df['end_label'] = df['end_url'].apply(node_url_to_label)
        df['relation'] = df['relation_url'].apply(lambda x: x.split('/')[-1])

        # Reorder columns: readable labels first, raw URLs last
        df = df[['start_label', 'relation', 'end_label', 'weight', 'start_url', 'end_url', 'relation_url']]

        return df, f"✅ {len(df)} results in {elapsed:.2f}s"

    except Exception as e:
        # Top-level boundary for the UI: log, print the traceback, and
        # return the error as the status message.
        log_progress(f"Error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"❌ {e}"

def run_raw_query(sql_query):
    """Execute a raw ``SELECT`` statement against the normalized database.

    Args:
        sql_query: SQL text. Anything that is not a string starting with
            ``SELECT`` (ignoring leading whitespace and case) is rejected.

    Returns:
        A ``(DataFrame, status_markdown)`` tuple; the DataFrame is empty when
        the query is rejected, the database is missing, or execution fails.
    """
    # The prefix check is only a first filter; get_db_connection opens the
    # database with mode=ro, which is what actually prevents writes.
    # The isinstance guard keeps a None/non-string input from raising here.
    if not isinstance(sql_query, str) or not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT queries are allowed."

    if not DB_PATH:
        return pd.DataFrame(), "❌ **Error:** Database file not found."

    try:
        # closing() releases the connection; a bare ``with connection`` only
        # commits/rolls back and would leak it.
        with closing(get_db_connection()) as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
        return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
    except Exception as e:
        return pd.DataFrame(), f"❌ {e}"

def get_schema_info():
    """Describe the normalized database as a Markdown document.

    The report lists row counts for ``node_norm``, ``rel_norm`` and
    ``edge_norm``, the indices recorded in ``sqlite_master``, and the first
    20 relation URLs from ``rel_norm`` in alphabetical order.

    Returns:
        The Markdown text. If a query fails, the sections produced so far are
        kept and an error line is appended. When ``DB_PATH`` is unset, only an
        error message is returned.
    """
    if not DB_PATH:
        return "❌ **Error:** Database file not found."

    # Collect fragments and join once at the end.
    parts = [
        "# 📚 Schema (Normalized)\n\n",
        f"**Repo:** [{NORMALIZED_REPO_ID}](https://huggingface.co/datasets/{NORMALIZED_REPO_ID})\n\n",
        "**Schema:** Text URLs (`node_norm`, `rel_norm`) are stored once. The `edge_norm` table uses fast integer keys (`_fk`) for joins.\n\n",
    ]

    try:
        # closing() releases the connection; a bare ``with connection`` only
        # commits/rolls back and would leak it.
        with closing(get_db_connection()) as conn:
            cursor = conn.cursor()

            parts.append("## Tables & Row Counts\n\n")
            # Table names are fixed literals (identifiers cannot be bound as
            # SQL parameters), so interpolating them here is safe.
            for table in ("node_norm", "rel_norm", "edge_norm"):
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                parts.append(f"- **{table}:** {cursor.fetchone()[0]:,} rows\n")

            parts.append("\n## Indices\n\n")
            cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index' AND sql IS NOT NULL")
            parts.extend(f"- **{name}:** `{sql}`\n" for name, sql in cursor.fetchall())

            parts.append("\n## Common Relations (from `rel_norm`)\n\n")
            cursor.execute("SELECT rel_url FROM rel_norm ORDER BY rel_url LIMIT 20")
            for (rel_url,) in cursor.fetchall():
                label = rel_url.split('/')[-1]
                parts.append(f"- **{label}:** `{rel_url}`\n")

    except Exception as e:
        parts.append(f"\n**❌ Error:** {e}\n")

    return "".join(parts)

# ===== Build Gradio UI (Mostly Unchanged) =====
# Builds the Blocks app bound to `demo`: a header, then either an error notice
# (no database) or four tabs (Semantic Profile, Query Builder, Raw SQL, Schema)
# together with their event handlers.
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer (Normalized v2)")
    gr.Markdown(f"**Repo:** `{NORMALIZED_REPO_ID}` | **Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

    # Without a database only the error notice is rendered; no tabs and no
    # event handlers are created.
    if not DB_PATH:
        gr.Markdown("## ❌ ERROR: DATABASE FILE NOT FOUND")
        gr.Markdown(f"This app cannot start because `{NORMALIZED_DB_FILE}` could not be downloaded from `{NORMALIZED_REPO_ID}`. Please check the logs.")

    else:
        with gr.Tabs():
            # --- Tab 1: per-word semantic profile ---
            with gr.TabItem("🔍 Semantic Profile"):
                gr.Markdown("**Explore semantic relations for any word. Runs on the fast normalized DB.**")

                with gr.Row():
                    word_input = gr.Textbox(label="Word", placeholder="e.g., dog, hund, perro", value="dog", scale=3)
                    lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Language", scale=1)

                # Pre-selects ten relations; get_semantic_profile applies its
                # own (shorter) default list only when nothing is selected.
                with gr.Accordion("Select Relations (fewer = faster)", open=False):
                    relation_input = gr.CheckboxGroup(
                        choices=list(CONCEPTNET_RELATIONS.keys()), 
                        label="Relations to Query", 
                        value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf", "Synonym", "Antonym", "AtLocation", "HasProperty"]
                    )

                semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
                semantic_output = gr.Markdown(value="Click the button to get the semantic profile.")

                # Each example is a [word, language] pair for the two inputs.
                gr.Examples(
                    examples=[["dog", "en"], ["hund", "de"], ["perro", "es"], ["chat", "fr"], ["knowledge", "en"]],
                    inputs=[word_input, lang_input],
                    label="Examples"
                )

            # --- Tab 2: start / relation / end filter form for run_query ---
            with gr.TabItem("⚡ Query Builder"):
                gr.Markdown("**Build custom relationship queries (now using fast integer joins).**")

                with gr.Row():
                    start_input = gr.Textbox(label="Start Node (word)", placeholder="dog (optional)")
                    start_lang = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Start Lang", scale=1)
                    # The leading "" choice lets the user clear the relation filter.
                    rel_input = gr.Dropdown(
                        choices=[""] + list(CONCEPTNET_RELATIONS.keys()), 
                        label="Relation (name)", 
                        value="IsA",
                        info="Leave blank to query all relations"
                    )
                    end_input = gr.Textbox(label="End Node (word)", placeholder="(optional)")
                    end_lang = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="End Lang", scale=1)

                limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1)
                query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")

                status_output = gr.Markdown()
                results_output = gr.DataFrame(wrap=True) # Height bug is still fixed

            # --- Tab 3: free-form SELECT statements for run_raw_query ---
            with gr.TabItem("💻 Raw SQL"):
                gr.Markdown("**Execute custom `SELECT` SQL queries against the *new normalized schema*.**")

                # --- UPDATED Example Query ---
                # Pre-filled example: the ten highest-weight IsA edges that
                # start at the English node "dog".
                new_example_sql = f"""SELECT
    n_start.node_url,
    r.rel_url,
    n_end.node_url,
    e.weight
FROM edge_norm e
JOIN node_norm n_start ON e.start_fk = n_start.node_pk
JOIN node_norm n_end ON e.end_fk = n_end.node_pk
JOIN rel_norm r ON e.rel_fk = r.rel_pk
WHERE n_start.node_url = '{CONCEPTNET_BASE}/c/en/dog'
  AND r.rel_url = '{CONCEPTNET_BASE}/r/IsA'
ORDER BY e.weight DESC
LIMIT 10
"""
                raw_sql_input = gr.Textbox(
                    label="SQL Query",
                    value=new_example_sql,
                    lines=13,
                    elem_classes=["font-mono"]
                )

                raw_btn = gr.Button("▶️ Execute")
                raw_status = gr.Markdown()
                raw_results = gr.DataFrame() # Height bug is still fixed

            # --- Tab 4: schema report produced by get_schema_info ---
            with gr.TabItem("📊 Schema"):
                gr.Markdown("**View database schema, tables, and indices for the *new normalized DB*.**")
                schema_btn = gr.Button("📊 Load Schema Info")
                schema_output = gr.Markdown()

        # --- Button Click Handlers (All API names preserved) ---
        # These sit inside the `else` branch because they reference
        # components that only exist when the database is available.
        # get_semantic_profile is a generator that yields the Markdown
        # document as it grows.
        semantic_btn.click(
            get_semantic_profile, 
            inputs=[word_input, lang_input, relation_input], 
            outputs=semantic_output,
            api_name="get_semantic_profile"
        )

        # run_query returns (DataFrame, status), matching the outputs order.
        query_btn.click(
            run_query, 
            inputs=[start_input, start_lang, rel_input, end_input, end_lang, limit_slider], 
            outputs=[results_output, status_output],
            api_name="run_query"
        )

        # run_raw_query returns (DataFrame, status), matching the outputs order.
        raw_btn.click(
            run_raw_query, 
            inputs=raw_sql_input, 
            outputs=[raw_results, raw_status],
            api_name="run_raw_query"
        )

        # The schema report is produced on page load and again on demand.
        # NOTE(review): both registrations request api_name="get_schema" -
        # confirm how Gradio resolves the duplicate endpoint name.
        demo.load(
            get_schema_info, 
            None, 
            schema_output,
            api_name="get_schema"
        )
        schema_btn.click(
            get_schema_info, 
            None, 
            schema_output,
            api_name="get_schema"
        )

if __name__ == "__main__":
    # Report startup status; the UI is launched in both cases so that a
    # missing database is shown as an error page rather than a dead app.
    if not DB_PATH:
        log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR")
    else:
        log_progress("APP READY! (Normalized DB)", "SUCCESS")
    demo.launch(ssr_mode=False)