import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download
import os
import time
import json
from typing import Dict, List, Optional
from collections import defaultdict

# ===== CONFIGURATION =====
# 1. Point to the NEW normalized database (fixed)
TARGET_LANGUAGES = ['en', 'fr', 'it', 'de', 'es', 'ar', 'fa', 'grc', 'he', 'la', 'hbo']
NORMALIZED_REPO_ID = "cstr/conceptnet-normalized-multi"
NORMALIZED_DB_FILE = "conceptnet_normalized.db"
CONCEPTNET_BASE = "http://conceptnet.io"
# =========================

# --- All relations MUST be full URLs ---
# This dictionary is now our primary way to map names to relation IDs
CONCEPTNET_RELATIONS: Dict[str, str] = {
    "RelatedTo": f"{CONCEPTNET_BASE}/r/RelatedTo",
    "IsA": f"{CONCEPTNET_BASE}/r/IsA",
    "PartOf": f"{CONCEPTNET_BASE}/r/PartOf",
    "HasA": f"{CONCEPTNET_BASE}/r/HasA",
    "UsedFor": f"{CONCEPTNET_BASE}/r/UsedFor",
    "CapableOf": f"{CONCEPTNET_BASE}/r/CapableOf",
    "AtLocation": f"{CONCEPTNET_BASE}/r/AtLocation",
    "Causes": f"{CONCEPTNET_BASE}/r/Causes",
    "HasSubevent": f"{CONCEPTNET_BASE}/r/HasSubevent",
    "HasFirstSubevent": f"{CONCEPTNET_BASE}/r/HasFirstSubevent",
    "HasLastSubevent": f"{CONCEPTNET_BASE}/r/HasLastSubevent",
    "HasPrerequisite": f"{CONCEPTNET_BASE}/r/HasPrerequisite",
    "HasProperty": f"{CONCEPTNET_BASE}/r/HasProperty",
    "MotivatedByGoal": f"{CONCEPTNET_BASE}/r/MotivatedByGoal",
    "ObstructedBy": f"{CONCEPTNET_BASE}/r/ObstructedBy",
    "Desires": f"{CONCEPTNET_BASE}/r/Desires",
    "CreatedBy": f"{CONCEPTNET_BASE}/r/CreatedBy",
    "Synonym": f"{CONCEPTNET_BASE}/r/Synonym",
    "Antonym": f"{CONCEPTNET_BASE}/r/Antonym",
    "DistinctFrom": f"{CONCEPTNET_BASE}/r/DistinctFrom",
    "DerivedFrom": f"{CONCEPTNET_BASE}/r/DerivedFrom",
    "SymbolOf": f"{CONCEPTNET_BASE}/r/SymbolOf",
    "DefinedAs": f"{CONCEPTNET_BASE}/r/DefinedAs",
    "MannerOf": f"{CONCEPTNET_BASE}/r/MannerOf",
    "LocatedNear": f"{CONCEPTNET_BASE}/r/LocatedNear",
    "HasContext": f"{CONCEPTNET_BASE}/r/HasContext",
    "SimilarTo": f"{CONCEPTNET_BASE}/r/SimilarTo",
    "EtymologicallyRelatedTo": f"{CONCEPTNET_BASE}/r/EtymologicallyRelatedTo",
    "EtymologicallyDerivedFrom": f"{CONCEPTNET_BASE}/r/EtymologicallyDerivedFrom",
    "CausesDesire": f"{CONCEPTNET_BASE}/r/CausesDesire",
    "MadeOf": f"{CONCEPTNET_BASE}/r/MadeOf",
    "ReceivesAction": f"{CONCEPTNET_BASE}/r/ReceivesAction",
    "ExternalURL": f"{CONCEPTNET_BASE}/r/ExternalURL",
    "NotDesires": f"{CONCEPTNET_BASE}/r/NotDesires",
    "NotUsedFor": f"{CONCEPTNET_BASE}/r/NotUsedFor",
    "NotCapableOf": f"{CONCEPTNET_BASE}/r/NotCapableOf",
    "NotHasProperty": f"{CONCEPTNET_BASE}/r/NotHasProperty",
}
# =========================

print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
print(f"📚 Relations: {len(CONCEPTNET_RELATIONS)} relations loaded")


def log_progress(message, level="INFO"):
    """Simple logger with timestamp and emoji prefix."""
    timestamp = time.strftime("%H:%M:%S")
    prefix = {"INFO": "ℹ️ ", "SUCCESS": "✅", "ERROR": "❌", "WARN": "⚠️ ", "DEBUG": "🔍"}.get(level, "")
    print(f"[{timestamp}] {prefix} {message}")
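
# The three-table layout this app assumes, inferred from the queries below
# (the authoritative DDL lives in the dataset repo and may differ in detail,
# e.g. the UNIQUE constraints here are an assumption):
#
#   node_norm(node_pk INTEGER PRIMARY KEY, node_url TEXT UNIQUE)  -- each URL stored once
#   rel_norm(rel_pk INTEGER PRIMARY KEY, rel_url TEXT UNIQUE)
#   edge_norm(start_fk INTEGER, rel_fk INTEGER, end_fk INTEGER, weight REAL)
#
# start_fk/end_fk reference node_norm.node_pk and rel_fk references
# rel_norm.rel_pk, so every edge is just three integers plus a weight.
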
def download_normalized_database():
    """Download the NEW normalized database from HF Hub."""
    log_progress(f"Downloading/Verifying {NORMALIZED_DB_FILE}...", "INFO")
    try:
        # This will download or use cache
        return hf_hub_download(
            repo_id=NORMALIZED_REPO_ID,
            filename=NORMALIZED_DB_FILE,
            repo_type="dataset"
        )
    except Exception as e:
        log_progress(f"Failed to download DB: {e}", "ERROR")
        return None


DB_PATH = download_normalized_database()

if not DB_PATH:
    log_progress("DATABASE NOT FOUND. App will not function.", "ERROR")
else:
    log_progress(f"Database loaded from: {DB_PATH}", "SUCCESS")


def get_db_connection():
    """Get a thread-safe, read-only connection to the SQLite database."""
    if not DB_PATH:
        raise Exception("Database path is not set. Cannot create connection.")
    # Connect in read-only mode
    db_uri = f"file:{DB_PATH}?mode=ro"
    conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False)
    conn.execute("PRAGMA cache_size = -256000")  # 256MB cache
    conn.execute("PRAGMA temp_store = MEMORY")
    return conn


def node_url_to_label(url: str) -> str:
    """Extract the term from ConceptNet URL: http://conceptnet.io/c/{lang}/{term}/..."""
    try:
        parts = url.split('/')
        # Term is ALWAYS at index 5
        if len(parts) >= 6 and parts[3] == 'c':
            return parts[5].replace('_', ' ')
    except Exception:
        pass
    return url  # Fallback to full URL if parsing fails
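
# For example, node_url_to_label("http://conceptnet.io/c/en/hot_dog/n")
# returns "hot dog": after split('/'), parts[3] == 'c' and the term sits at
# parts[5]. URLs that do not match the /c/{lang}/{term} shape (or anything
# unparseable) fall through and come back unchanged.
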
def get_semantic_profile(word: str, lang: str = 'en', selected_relations: Optional[List[str]] = None, progress=gr.Progress()):
    """
    --- REWRITTEN FOR NORMALIZED DB ---
    Get semantic profile for a word. This function is now extremely fast,
    running 4 queries total instead of 2N.
    """
    log_progress(f"Profile: {word} ({lang})", "INFO")

    if not word or lang not in TARGET_LANGUAGES:
        yield "⚠️ Invalid input"
        return
    if not DB_PATH:
        yield "❌ **Error:** Database file not found."
        return

    # Set default relations if none are selected
    if not selected_relations:
        selected_relations = [
            "IsA", "RelatedTo", "PartOf", "HasA",
            "UsedFor", "CapableOf", "Synonym", "Antonym"
        ]

    word = word.strip().lower().replace(' ', '_')
    exact_path = f"{CONCEPTNET_BASE}/c/{lang}/{word}"
    output_md = f"# 🧠 Semantic Profile: '{word}' ({lang.upper()})\n\n"

    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()
            progress(0, desc="Starting...")
            yield output_md

            # === STEP 1: Find Node PKs ===
            progress(0.05, desc="Finding nodes...")
            cursor.execute("SELECT node_pk, node_url FROM node_norm WHERE node_url = ?", (exact_path,))
            exact_node = cursor.fetchone()

            node_pks = []
            nodes_found = []
            if exact_node:
                log_progress(f"Found exact node: {exact_node[1]}", "SUCCESS")
                node_pks = [exact_node[0]]
                nodes_found = [(exact_node[1], node_url_to_label(exact_node[1]))]
            else:
                log_progress("No exact node, falling back to LIKE...", "WARN")
                like_path = f"{exact_path}%"
                cursor.execute("SELECT node_pk, node_url FROM node_norm WHERE node_url LIKE ? LIMIT 5", (like_path,))
                nodes = cursor.fetchall()
                if not nodes:
                    yield f"# 🧠 '{word}'\n\n⚠️ Not found"
                    return
                node_pks = [n[0] for n in nodes]
                nodes_found = [(n[1], node_url_to_label(n[1])) for n in nodes]

            for node_url, label in nodes_found[:3]:
                output_md += f"**Node:** `{node_url}` → **{label}**\n"
            output_md += "\n"
            yield output_md

            # === STEP 2: Find Relation PKs ===
            progress(0.15, desc="Finding relations...")
            rel_urls_to_query = tuple(
                CONCEPTNET_RELATIONS[name] for name in selected_relations
                if name in CONCEPTNET_RELATIONS
            )
            if not rel_urls_to_query:
                output_md += "⚠️ No valid relations selected."
                yield output_md
                return

            rel_placeholders = ','.join(['?'] * len(rel_urls_to_query))
            cursor.execute(f"SELECT rel_pk, rel_url FROM rel_norm WHERE rel_url IN ({rel_placeholders})", rel_urls_to_query)

            # Create lookup maps
            rel_pk_to_name = {}
            rel_name_to_pk = {}
            rel_name_to_url = {}
            for pk, url in cursor.fetchall():
                # Find the 'short name' (e.g., 'IsA') from the full URL
                for name, url_val in CONCEPTNET_RELATIONS.items():
                    if url_val == url:
                        rel_pk_to_name[pk] = name
                        rel_name_to_pk[name] = pk
                        rel_name_to_url[name] = url
                        break

            rel_pks_to_query = tuple(rel_pk_to_name.keys())
            if not rel_pks_to_query:
                # Guard against an empty IN () clause, which is a SQL syntax error
                output_md += "⚠️ None of the selected relations exist in the database."
                yield output_md
                return

            node_pk_placeholders = ','.join(['?'] * len(node_pks))
            rel_pk_placeholders = ','.join(['?'] * len(rel_pks_to_query))

            # Buckets for results
            outgoing_results = defaultdict(list)
            incoming_results = defaultdict(list)

            # === STEP 3: Run ONE query for ALL outgoing edges ===
            progress(0.4, desc="Querying outgoing edges...")
            sql_out = f"""
                SELECT e.rel_fk, n_end.node_url, e.weight
                FROM edge_norm e
                JOIN node_norm n_end ON e.end_fk = n_end.node_pk
                WHERE e.start_fk IN ({node_pk_placeholders})
                  AND e.rel_fk IN ({rel_pk_placeholders})
                ORDER BY e.weight DESC
                LIMIT 200
            """
            cursor.execute(sql_out, (*node_pks, *rel_pks_to_query))
            for rel_pk, node_url, weight in cursor.fetchall():
                rel_name = rel_pk_to_name.get(rel_pk)
                if rel_name and len(outgoing_results[rel_name]) < 7:
                    outgoing_results[rel_name].append((node_url_to_label(node_url), weight))

            # === STEP 4: Run ONE query for ALL incoming edges ===
            progress(0.7, desc="Querying incoming edges...")
            sql_in = f"""
                SELECT e.rel_fk, n_start.node_url, e.weight
                FROM edge_norm e
                JOIN node_norm n_start ON e.start_fk = n_start.node_pk
                WHERE e.end_fk IN ({node_pk_placeholders})
                  AND e.rel_fk IN ({rel_pk_placeholders})
                ORDER BY e.weight DESC
                LIMIT 200
            """
            cursor.execute(sql_in, (*node_pks, *rel_pks_to_query))
            for rel_pk, node_url, weight in cursor.fetchall():
                rel_name = rel_pk_to_name.get(rel_pk)
                if rel_name and len(incoming_results[rel_name]) < 7:
                    incoming_results[rel_name].append((node_url_to_label(node_url), weight))

            # === STEP 5: Format results as Markdown ===
            progress(0.9, desc="Formatting results...")
            total = 0
            for rel_name in selected_relations:
                if rel_name not in rel_name_to_pk:
                    continue  # Skip if this relation wasn't in the DB

                output_md += f"## {rel_name}\n\n"
                found = False

                out_edges = outgoing_results.get(rel_name, [])
                for label, weight in out_edges:
                    output_md += f"- **{word}** {rel_name} → *{label}* `[{weight:.3f}]`\n"
                    found = True
                    total += 1

                in_edges = incoming_results.get(rel_name, [])
                for label, weight in in_edges:
                    output_md += f"- *{label}* {rel_name} → **{word}** `[{weight:.3f}]`\n"
                    found = True
                    total += 1

                if not found:
                    output_md += "*No results*\n"
                output_md += "\n"
                yield output_md  # Yield after each relation is formatted

            output_md += f"---\n**Total relations:** {total}\n"
            log_progress(f"Profile complete: {total} relations", "SUCCESS")
            progress(1.0, desc="✅ Complete!")
            yield output_md

    except Exception as e:
        log_progress(f"Error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        yield f"**❌ Error:** {e}"
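
# A quick way to check that the two edge queries above hit integer-key
# indices instead of scanning edge_norm. This is an ad-hoc sketch for a
# local session, not part of the app; the literal PKs (1, 2, 3) are
# placeholders:
#
#   conn = get_db_connection()
#   plan = conn.execute(
#       "EXPLAIN QUERY PLAN SELECT e.weight FROM edge_norm e "
#       "WHERE e.start_fk IN (1) AND e.rel_fk IN (2, 3)"
#   ).fetchall()
#   print(plan)  # expect 'SEARCH edge_norm USING INDEX ...' when indexed
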
def run_query(start_node, start_lang, relation, end_node, end_lang, limit, progress=gr.Progress()):
    """
    Query builder using fast integer joins.
    """
    log_progress(f"Query: start={start_node} ({start_lang}), rel={relation}, end={end_node} ({end_lang})", "INFO")
    progress(0, desc="Building...")

    if not DB_PATH:
        return pd.DataFrame(), "❌ **Error:** Database file not found."

    # This is the new, fast query
    query = """
        SELECT
            n_start.node_url AS start_url,
            r.rel_url AS relation_url,
            n_end.node_url AS end_url,
            e.weight
        FROM edge_norm e
        JOIN node_norm n_start ON e.start_fk = n_start.node_pk
        JOIN node_norm n_end ON e.end_fk = n_end.node_pk
        JOIN rel_norm r ON e.rel_fk = r.rel_pk
    """
    params = []
    where_clauses = []

    try:
        with get_db_connection() as conn:
            progress(0.3, desc="Adding filters...")

            # Start node - USE start_lang
            if start_node and start_node.strip():
                if start_node.startswith('http://'):
                    pattern = f"{start_node}%"
                else:
                    pattern = f"{CONCEPTNET_BASE}/c/{start_lang}/{start_node.strip().lower().replace(' ', '_')}%"
                where_clauses.append("n_start.node_url LIKE ?")
                params.append(pattern)

            # Relation
            if relation and relation.strip():
                rel_value = CONCEPTNET_RELATIONS.get(relation.strip())
                if rel_value:
                    where_clauses.append("r.rel_url = ?")
                    params.append(rel_value)

            # End node - USE end_lang
            if end_node and end_node.strip():
                if end_node.startswith('http://'):
                    pattern = f"{end_node}%"
                else:
                    pattern = f"{CONCEPTNET_BASE}/c/{end_lang}/{end_node.strip().lower().replace(' ', '_')}%"
                where_clauses.append("n_end.node_url LIKE ?")
                params.append(pattern)

            if where_clauses:
                query += " WHERE " + " AND ".join(where_clauses)
            query += " ORDER BY e.weight DESC LIMIT ?"
            params.append(int(limit))  # The slider may deliver a float; LIMIT needs an integer

            progress(0.6, desc="Executing...")
            start_time = time.time()
            df = pd.read_sql_query(query, conn, params=params)
            elapsed = time.time() - start_time
            log_progress(f"Query done: {len(df)} rows in {elapsed:.2f}s", "SUCCESS")
            progress(1.0, desc="Done!")

            if df.empty:
                return pd.DataFrame(), f"⚠️ No results ({elapsed:.2f}s)"

            # Add user-friendly labels from the URLs
            df['start_label'] = df['start_url'].apply(node_url_to_label)
            df['end_label'] = df['end_url'].apply(node_url_to_label)
            df['relation'] = df['relation_url'].apply(lambda x: x.split('/')[-1])

            # Reorder columns
            df = df[['start_label', 'relation', 'end_label', 'weight', 'start_url', 'end_url', 'relation_url']]
            return df, f"✅ {len(df)} results in {elapsed:.2f}s"

    except Exception as e:
        log_progress(f"Error: {e}", "ERROR")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"❌ {e}"


def run_raw_query(sql_query):
    """Execute a raw SELECT SQL query against the normalized DB."""
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "❌ Only SELECT queries are allowed."
    if not DB_PATH:
        return pd.DataFrame(), "❌ **Error:** Database file not found."
    try:
        with get_db_connection() as conn:
            start = time.time()
            df = pd.read_sql_query(sql_query, conn)
            elapsed = time.time() - start
            return df, f"✅ {len(df)} rows in {elapsed:.3f}s"
    except Exception as e:
        return pd.DataFrame(), f"❌ {e}"
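
# Example of the kind of ad-hoc SELECT run_raw_query is meant for: edge
# counts per relation (column and table names per the normalized schema
# sketched at the top of this file):
#
#   SELECT r.rel_url, COUNT(*) AS n_edges
#   FROM edge_norm e
#   JOIN rel_norm r ON e.rel_fk = r.rel_pk
#   GROUP BY r.rel_url
#   ORDER BY n_edges DESC
#   LIMIT 10
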
def get_schema_info():
    """
    --- REWRITTEN FOR NORMALIZED DB ---
    Get schema information for the new database.
    """
    if not DB_PATH:
        return "❌ **Error:** Database file not found."

    md = "# 📚 Schema (Normalized)\n\n"
    md += f"**Repo:** [{NORMALIZED_REPO_ID}](https://huggingface.co/datasets/{NORMALIZED_REPO_ID})\n\n"
    md += "**Schema:** Text URLs (`node_norm`, `rel_norm`) are stored once. The `edge_norm` table uses fast integer keys (`_fk`) for joins.\n\n"

    try:
        with get_db_connection() as conn:
            cursor = conn.cursor()

            md += "## Tables & Row Counts\n\n"
            # Use the new table names
            for table in ["node_norm", "rel_norm", "edge_norm"]:
                cursor.execute(f"SELECT COUNT(*) FROM {table}")
                md += f"- **{table}:** {cursor.fetchone()[0]:,} rows\n"

            md += "\n## Indices\n\n"
            cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='index' AND sql IS NOT NULL")
            for name, sql in cursor.fetchall():
                md += f"- **{name}:** `{sql}`\n"

            md += "\n## Common Relations (from `rel_norm`)\n\n"
            # Query the new relation table
            cursor.execute("SELECT rel_url FROM rel_norm ORDER BY rel_url LIMIT 20")
            for (rel_url,) in cursor.fetchall():
                label = rel_url.split('/')[-1]
                md += f"- **{label}:** `{rel_url}`\n"

    except Exception as e:
        md += f"\n**❌ Error:** {e}\n"

    return md


# ===== Build Gradio UI (Mostly Unchanged) =====
with gr.Blocks(title="ConceptNet Explorer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer (Normalized v2)")
    gr.Markdown(f"**Repo:** `{NORMALIZED_REPO_ID}` | **Languages:** {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

    if not DB_PATH:
        gr.Markdown("## ❌ ERROR: DATABASE FILE NOT FOUND")
        gr.Markdown(f"This app cannot start because `{NORMALIZED_DB_FILE}` could not be downloaded from `{NORMALIZED_REPO_ID}`. Please check the logs.")
    else:
        with gr.Tabs():
            with gr.TabItem("🔍 Semantic Profile"):
                gr.Markdown("**Explore semantic relations for any word. Runs on the fast normalized DB.**")
                with gr.Row():
                    word_input = gr.Textbox(label="Word", placeholder="e.g., dog, hund, perro", value="dog", scale=3)
                    lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Language", scale=1)
                with gr.Accordion("Select Relations (fewer = faster)", open=False):
                    relation_input = gr.CheckboxGroup(
                        choices=list(CONCEPTNET_RELATIONS.keys()),
                        label="Relations to Query",
                        value=["IsA", "RelatedTo", "PartOf", "HasA", "UsedFor", "CapableOf",
                               "Synonym", "Antonym", "AtLocation", "HasProperty"]
                    )
                semantic_btn = gr.Button("🔍 Get Semantic Profile", variant="primary", size="lg")
                semantic_output = gr.Markdown(value="Click the button to get the semantic profile.")
                gr.Examples(
                    examples=[["dog", "en"], ["hund", "de"], ["perro", "es"], ["chat", "fr"], ["knowledge", "en"]],
                    inputs=[word_input, lang_input],
                    label="Examples"
                )

            with gr.TabItem("⚡ Query Builder"):
                gr.Markdown("**Build custom relationship queries (now using fast integer joins).**")
                with gr.Row():
                    start_input = gr.Textbox(label="Start Node (word)", placeholder="dog (optional)")
                    start_lang = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Start Lang", scale=1)
                    rel_input = gr.Dropdown(
                        choices=[""] + list(CONCEPTNET_RELATIONS.keys()),
                        label="Relation (name)",
                        value="IsA",
                        info="Leave blank to query all relations"
                    )
                    end_input = gr.Textbox(label="End Node (word)", placeholder="(optional)")
                    end_lang = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="End Lang", scale=1)
                limit_slider = gr.Slider(label="Limit", minimum=1, maximum=500, value=50, step=1)
                query_btn = gr.Button("▶️ Run Query", variant="primary", size="lg")
                status_output = gr.Markdown()
                results_output = gr.DataFrame(wrap=True)  # Height bug is still fixed

            with gr.TabItem("💻 Raw SQL"):
                gr.Markdown("**Execute custom `SELECT` SQL queries against the *new normalized schema*.**")
                # --- UPDATED Example Query ---
                new_example_sql = f"""SELECT
    n_start.node_url,
    r.rel_url,
    n_end.node_url,
    e.weight
FROM edge_norm e
JOIN node_norm n_start ON e.start_fk = n_start.node_pk
JOIN node_norm n_end ON e.end_fk = n_end.node_pk
JOIN rel_norm r ON e.rel_fk = r.rel_pk
WHERE n_start.node_url = '{CONCEPTNET_BASE}/c/en/dog'
  AND r.rel_url = '{CONCEPTNET_BASE}/r/IsA'
ORDER BY e.weight DESC
LIMIT 10
"""
                raw_sql_input = gr.Textbox(
                    label="SQL Query",
                    value=new_example_sql,
                    lines=13,
                    elem_classes=["font-mono"]
                )
                raw_btn = gr.Button("▶️ Execute")
                raw_status = gr.Markdown()
                raw_results = gr.DataFrame()  # Height bug is still fixed

            with gr.TabItem("📊 Schema"):
                gr.Markdown("**View database schema, tables, and indices for the *new normalized DB*.**")
                schema_btn = gr.Button("📊 Load Schema Info")
                schema_output = gr.Markdown()

        # --- Button Click Handlers (All API names preserved) ---
        semantic_btn.click(
            get_semantic_profile,
            inputs=[word_input, lang_input, relation_input],
            outputs=semantic_output,
            api_name="get_semantic_profile"
        )
        query_btn.click(
            run_query,
            inputs=[start_input, start_lang, rel_input, end_input, end_lang, limit_slider],
            outputs=[results_output, status_output],
            api_name="run_query"
        )
        raw_btn.click(
            run_raw_query,
            inputs=raw_sql_input,
            outputs=[raw_results, raw_status],
            api_name="run_raw_query"
        )
        demo.load(
            get_schema_info,
            None,
            schema_output,
            api_name="get_schema"
        )
        schema_btn.click(
            get_schema_info,
            None,
            schema_output,
            api_name="get_schema"
        )

if __name__ == "__main__":
    if DB_PATH:
        log_progress("APP READY! (Normalized DB)", "SUCCESS")
    else:
        log_progress("APP LAUNCHING WITH ERRORS (DB NOT FOUND)", "ERROR")
    demo.launch(ssr_mode=False)
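
# The api_name= arguments above also expose the handlers as API endpoints.
# A minimal client-side sketch (assumes this Space is reachable as
# "cstr/conceptnet_normalized", that gradio_client is installed, and that the
# endpoint names follow the api_name= values; the exact return shape of the
# DataFrame output depends on the gradio_client version):
#
#   from gradio_client import Client
#   client = Client("cstr/conceptnet_normalized")
#   result = client.predict(
#       "dog", "en", "IsA", "", "en", 50,
#       api_name="/run_query"
#   )
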