Spaces:
Running
Running
| DEFAULT_MODEL_ID = "Meta-Llama-3-70B-Instruct" | |
| DEFAULT_INTERFACE_MODEL_ID = "NumbersStation/nsql-llama-2-7B" | |
| DEFAULT_KIND = "json" | |
| DEFAULT_TEMPERATURE = 0.6 | |
| DEFAULT_TOP_P = 0.95 | |
| DEFAULT_FEW_SHOT_NUM = 3 | |
| DEFAULT_FEW_SHOT_SELECTION = "random" | |
| DEFAULT_SAVE_INTERVAL = 3 | |
| DEFAULT_RES_DIR = "data/results" | |
| DEFAULT_LOG_DIR = "logs" | |
| DEFAULT_TABLES_DIR = "data/databases" | |
| COOCCURRENCE_QUERY = """ | |
| WITH concept_pairs AS ( | |
| SELECT p1.concept AS concept1, p2.concept AS concept2, p1.paper_id, p1.tag_type | |
| FROM predictions p1 | |
| JOIN predictions p2 ON p1.paper_id = p2.paper_id AND p1.concept < p2.concept | |
| WHERE p1.tag_type = p2.tag_type | |
| ) | |
| SELECT concept1, concept2, tag_type, COUNT(DISTINCT paper_id) AS co_occurrences | |
| FROM concept_pairs | |
| GROUP BY concept1, concept2, tag_type | |
| HAVING co_occurrences > 5 | |
| ORDER BY co_occurrences DESC; | |
| """ | |
| canned_queries = [ | |
| ( | |
| "Modalities in Physics and Astronomy papers", | |
| """ | |
| SELECT DISTINCT LOWER(concept) AS concept | |
| FROM predictions | |
| JOIN ( | |
| SELECT paper_id, url | |
| FROM papers | |
| WHERE primary_category LIKE '%physics.space-ph%' | |
| OR primary_category LIKE '%astro-ph.%' | |
| ) AS paper_ids | |
| ON predictions.paper_id = paper_ids.paper_id | |
| WHERE predictions.tag_type = 'modality' | |
| """, | |
| ), | |
| ( | |
| "Datasets in Evolutionary Biology that use PDEs", | |
| """ | |
| WITH pde_predictions AS ( | |
| SELECT paper_id, concept AS pde_concept, tag_type AS pde_tag_type | |
| FROM predictions | |
| WHERE tag_type IN ('method', 'model') | |
| AND ( | |
| LOWER(concept) LIKE '%pde%' | |
| OR LOWER(concept) LIKE '%partial differential equation%' | |
| ) | |
| ) | |
| SELECT DISTINCT | |
| papers.paper_id, | |
| papers.url, | |
| LOWER(p_dataset.concept) AS dataset, | |
| pde_predictions.pde_concept AS pde_related_concept, | |
| pde_predictions.pde_tag_type AS pde_related_type | |
| FROM papers | |
| JOIN pde_predictions ON papers.paper_id = pde_predictions.paper_id | |
| LEFT JOIN predictions p_dataset ON papers.paper_id = p_dataset.paper_id | |
| WHERE papers.primary_category LIKE '%q-bio.PE%' | |
| AND (p_dataset.tag_type = 'dataset' OR p_dataset.tag_type IS NULL) | |
| ORDER BY papers.paper_id, dataset, pde_related_concept; | |
| """, | |
| ), | |
| ( | |
| "Trends in objects of study in Cosmology since 2019", | |
| """ | |
| SELECT | |
| substr(papers.updated_on, 2, 4) as year, | |
| predictions.concept as object, | |
| COUNT(DISTINCT papers.paper_id) as paper_count | |
| FROM | |
| papers | |
| JOIN | |
| predictions ON papers.paper_id = predictions.paper_id | |
| WHERE | |
| predictions.tag_type = 'object' | |
| AND CAST(SUBSTR(papers.updated_on, 2, 4) AS INTEGER) >= 2019 | |
| GROUP BY | |
| year, object | |
| ORDER BY | |
| year DESC, paper_count DESC; | |
| """, | |
| ), | |
| ( | |
| "New datasets in fluid dynamics since 2020", | |
| """ | |
| WITH ranked_datasets AS ( | |
| SELECT | |
| p.paper_id, | |
| p.url, | |
| pred.concept AS dataset, | |
| p.updated_on, | |
| ROW_NUMBER() OVER (PARTITION BY pred.concept ORDER BY p.updated_on ASC) AS rn | |
| FROM | |
| papers p | |
| JOIN | |
| predictions pred ON p.paper_id = pred.paper_id | |
| WHERE | |
| pred.tag_type = 'dataset' | |
| AND p.primary_category LIKE '%physics.flu-dyn%' | |
| AND CAST(SUBSTR(p.updated_on, 2, 4) AS INTEGER) >= 2020 | |
| ) | |
| SELECT | |
| paper_id, | |
| url, | |
| dataset, | |
| updated_on | |
| FROM | |
| ranked_datasets | |
| WHERE | |
| rn = 1 | |
| ORDER BY | |
| updated_on ASC | |
| """, | |
| ), | |
| ( | |
| "Evolutionary biology datasets that use spatiotemporal dynamics", | |
| """ | |
| WITH evo_bio_papers AS ( | |
| SELECT paper_id | |
| FROM papers | |
| WHERE primary_category LIKE '%q-bio.PE%' | |
| ), | |
| spatiotemporal_keywords AS ( | |
| SELECT 'spatio-temporal' AS keyword | |
| UNION SELECT 'spatiotemporal' | |
| UNION SELECT 'spatio-temporal' | |
| UNION SELECT 'spatial and temporal' | |
| UNION SELECT 'space-time' | |
| UNION SELECT 'geographic distribution' | |
| UNION SELECT 'phylogeograph' | |
| UNION SELECT 'biogeograph' | |
| UNION SELECT 'dispersal' | |
| UNION SELECT 'migration' | |
| UNION SELECT 'range expansion' | |
| UNION SELECT 'population dynamics' | |
| ) | |
| SELECT DISTINCT | |
| p.paper_id, | |
| p.updated_on, | |
| p.abstract, | |
| d.concept AS dataset, | |
| GROUP_CONCAT(DISTINCT stk.keyword) AS spatiotemporal_keywords_found | |
| FROM | |
| evo_bio_papers ebp | |
| JOIN | |
| papers p ON ebp.paper_id = p.paper_id | |
| JOIN | |
| predictions d ON p.paper_id = d.paper_id | |
| JOIN | |
| predictions st ON p.paper_id = st.paper_id | |
| JOIN | |
| spatiotemporal_keywords stk | |
| WHERE | |
| d.tag_type = 'dataset' | |
| AND st.tag_type = 'modality' | |
| AND LOWER(st.concept) LIKE '%' || stk.keyword || '%' | |
| GROUP BY | |
| p.paper_id, p.updated_on, p.abstract, d.concept | |
| ORDER BY | |
| p.updated_on DESC | |
| """, | |
| ), | |
| ( | |
| "What percentage of papers use only galaxy or spectra, or both or neither?", | |
| """ | |
| WITH paper_modalities AS ( | |
| SELECT | |
| p.paper_id, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra | |
| FROM | |
| papers p | |
| LEFT JOIN | |
| predictions pred ON p.paper_id = pred.paper_id | |
| WHERE | |
| p.primary_category LIKE '%astro-ph%' | |
| AND pred.tag_type = 'modality' | |
| GROUP BY | |
| p.paper_id | |
| ), | |
| categorized_papers AS ( | |
| SELECT | |
| CASE | |
| WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both' | |
| WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images' | |
| WHEN uses_spectra = 1 THEN 'Only Spectra' | |
| ELSE 'Neither' | |
| END AS category, | |
| COUNT(*) AS paper_count | |
| FROM | |
| paper_modalities | |
| GROUP BY | |
| CASE | |
| WHEN uses_galaxy_images = 1 AND uses_spectra = 1 THEN 'Both' | |
| WHEN uses_galaxy_images = 1 THEN 'Only Galaxy Images' | |
| WHEN uses_spectra = 1 THEN 'Only Spectra' | |
| ELSE 'Neither' | |
| END | |
| ) | |
| SELECT | |
| category, | |
| paper_count, | |
| ROUND(CAST(paper_count AS FLOAT) / (SELECT SUM(paper_count) FROM categorized_papers) * 100, 2) AS percentage | |
| FROM | |
| categorized_papers | |
| ORDER BY | |
| paper_count DESC | |
| """, | |
| ), | |
| ( | |
| "What are all the next highest data modalities after images and spectra?", | |
| """ | |
| SELECT | |
| LOWER(concept) AS modality, | |
| COUNT(DISTINCT paper_id) AS usage_count | |
| FROM | |
| predictions | |
| WHERE | |
| tag_type = 'modality' | |
| AND LOWER(concept) NOT LIKE '%imag%' | |
| AND LOWER(concept) NOT LIKE '%spectr%' | |
| GROUP BY | |
| LOWER(concept) | |
| ORDER BY | |
| usage_count DESC | |
| """, | |
| ), | |
| ( | |
| "If we include the next biggest data modality, how much does coverage change?", | |
| """ | |
| WITH modality_counts AS ( | |
| SELECT | |
| LOWER(concept) AS modality, | |
| COUNT(DISTINCT paper_id) AS usage_count | |
| FROM | |
| predictions | |
| WHERE | |
| tag_type = 'modality' | |
| AND LOWER(concept) NOT LIKE '%imag%' | |
| AND LOWER(concept) NOT LIKE '%spectr%' | |
| GROUP BY | |
| LOWER(concept) | |
| ORDER BY | |
| usage_count DESC | |
| LIMIT 1 | |
| ), | |
| paper_modalities AS ( | |
| SELECT | |
| p.paper_id, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_galaxy_images, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE (SELECT '%' || modality || '%' FROM modality_counts) THEN 1 ELSE 0 END) AS uses_third_modality | |
| FROM | |
| papers p | |
| LEFT JOIN | |
| predictions pred ON p.paper_id = pred.paper_id | |
| WHERE | |
| p.primary_category LIKE '%astro-ph%' | |
| AND pred.tag_type = 'modality' | |
| GROUP BY | |
| p.paper_id | |
| ), | |
| coverage_before AS ( | |
| SELECT | |
| SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| ), | |
| coverage_after AS ( | |
| SELECT | |
| SUM(CASE WHEN uses_galaxy_images = 1 OR uses_spectra = 1 OR uses_third_modality = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| ) | |
| SELECT | |
| (SELECT modality FROM modality_counts) AS third_modality, | |
| ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_before_percent, | |
| ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) AS coverage_after_percent, | |
| ROUND(CAST((SELECT covered_papers FROM coverage_after) AS FLOAT) / total_papers * 100, 2) - | |
| ROUND(CAST(covered_papers AS FLOAT) / total_papers * 100, 2) AS coverage_increase_percent | |
| FROM | |
| coverage_before | |
| """, | |
| ), | |
| ( | |
| "Coverage if we select the next 5 highest modalities?", | |
| """ | |
| WITH ranked_modalities AS ( | |
| SELECT | |
| LOWER(concept) AS modality, | |
| COUNT(DISTINCT paper_id) AS usage_count, | |
| ROW_NUMBER() OVER (ORDER BY COUNT(DISTINCT paper_id) DESC) AS rank | |
| FROM | |
| predictions | |
| WHERE | |
| tag_type = 'modality' | |
| AND LOWER(concept) NOT LIKE '%imag%' | |
| AND LOWER(concept) NOT LIKE '%spectr%' | |
| GROUP BY | |
| LOWER(concept) | |
| ), | |
| paper_modalities AS ( | |
| SELECT | |
| p.paper_id, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%imag%' THEN 1 ELSE 0 END) AS uses_images, | |
| MAX(CASE WHEN LOWER(pred.concept) LIKE '%spectr%' THEN 1 ELSE 0 END) AS uses_spectra, | |
| MAX(CASE WHEN rm.rank = 1 THEN 1 ELSE 0 END) AS uses_modality_1, | |
| MAX(CASE WHEN rm.rank = 2 THEN 1 ELSE 0 END) AS uses_modality_2, | |
| MAX(CASE WHEN rm.rank = 3 THEN 1 ELSE 0 END) AS uses_modality_3, | |
| MAX(CASE WHEN rm.rank = 4 THEN 1 ELSE 0 END) AS uses_modality_4, | |
| MAX(CASE WHEN rm.rank = 5 THEN 1 ELSE 0 END) AS uses_modality_5 | |
| FROM | |
| papers p | |
| LEFT JOIN | |
| predictions pred ON p.paper_id = pred.paper_id | |
| LEFT JOIN | |
| ranked_modalities rm ON LOWER(pred.concept) = rm.modality | |
| WHERE | |
| p.primary_category LIKE '%astro-ph%' | |
| AND pred.tag_type = 'modality' | |
| GROUP BY | |
| p.paper_id | |
| ), | |
| cumulative_coverage AS ( | |
| SELECT | |
| 'Images and Spectra' AS modalities, | |
| 0 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| UNION ALL | |
| SELECT | |
| 'Images, Spectra, and Modality 1' AS modalities, | |
| 1 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| UNION ALL | |
| SELECT | |
| 'Images, Spectra, Modality 1, and 2' AS modalities, | |
| 2 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| UNION ALL | |
| SELECT | |
| 'Images, Spectra, Modality 1, 2, and 3' AS modalities, | |
| 3 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| UNION ALL | |
| SELECT | |
| 'Images, Spectra, Modality 1, 2, 3, and 4' AS modalities, | |
| 4 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| UNION ALL | |
| SELECT | |
| 'Images, Spectra, Modality 1, 2, 3, 4, and 5' AS modalities, | |
| 5 AS added_modality_rank, | |
| SUM(CASE WHEN uses_images = 1 OR uses_spectra = 1 OR uses_modality_1 = 1 OR uses_modality_2 = 1 OR uses_modality_3 = 1 OR uses_modality_4 = 1 OR uses_modality_5 = 1 THEN 1 ELSE 0 END) AS covered_papers, | |
| COUNT(*) AS total_papers | |
| FROM | |
| paper_modalities | |
| ) | |
| SELECT | |
| cc.modalities, | |
| COALESCE(rm.modality, 'N/A') AS added_modality, | |
| rm.usage_count AS added_modality_usage, | |
| ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) AS coverage_percent, | |
| ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2) - | |
| LAG(ROUND(CAST(cc.covered_papers AS FLOAT) / cc.total_papers * 100, 2), 1, 0) OVER (ORDER BY cc.added_modality_rank) AS coverage_increase_percent | |
| FROM | |
| cumulative_coverage cc | |
| LEFT JOIN | |
| ranked_modalities rm ON cc.added_modality_rank = rm.rank | |
| ORDER BY | |
| cc.added_modality_rank | |
| """, | |
| ), | |
| ( | |
| "List all papers", | |
| "SELECT paper_id, abstract AS abstract_preview, authors, primary_category FROM papers", | |
| ), | |
| ( | |
| "Count papers by category", | |
| "SELECT primary_category, COUNT(*) as paper_count FROM papers GROUP BY primary_category ORDER BY paper_count DESC", | |
| ), | |
| ( | |
| "Top authors with most papers", | |
| """ | |
| WITH author_papers AS ( | |
| SELECT json_each.value AS author | |
| FROM papers, json_each(papers.authors) | |
| ) | |
| SELECT author, COUNT(*) as paper_count | |
| FROM author_papers | |
| GROUP BY author | |
| ORDER BY paper_count DESC | |
| """, | |
| ), | |
| ( | |
| "Papers with 'quantum' in abstract", | |
| "SELECT paper_id, abstract AS abstract_preview FROM papers WHERE abstract LIKE '%quantum%'", | |
| ), | |
| ( | |
| "Most common concepts", | |
| "SELECT concept, COUNT(*) as concept_count FROM predictions GROUP BY concept ORDER BY concept_count DESC", | |
| ), | |
| ( | |
| "Papers with multiple authors", | |
| """ | |
| SELECT paper_id, json_array_length(authors) as author_count, authors | |
| FROM papers | |
| WHERE json_array_length(authors) > 1 | |
| ORDER BY author_count DESC | |
| """, | |
| ), | |
| ] | |