Commit c80b461 · Xianbao QIAN committed
Parent: 2ea1dfc

use client side rendering for the homepage.
Files changed:
- @types/parquetjs-lite.d.ts  +7 -0
- Dockerfile  +3 -0
- bun.lockb  +0 -0
- package.json  +1 -1
- {tables → public}/ancestor_children.example.yaml  +0 -0
- {tables → public}/ancestor_children.parquet  +0 -0
- {tables → public}/datasets.example.yaml  +0 -0
- {tables → public}/datasets.parquet  +0 -0
- {tables → public}/models.example.yaml  +0 -0
- {tables → public}/models.parquet  +0 -0
- {tables → public}/parents.example.yaml  +0 -0
- {tables → public}/parents.parquet  +0 -0
- {tables → public}/spaces.example.yaml  +0 -0
- {tables → public}/spaces.parquet  +0 -0
- python/0_download_files.py  +6 -6
- python/1_parents.py  +3 -3
- python/2_ancestors.py  +3 -3
- src/app/page.tsx  +103 -48

@types/parquetjs-lite.d.ts
ADDED
@@ -0,0 +1,7 @@
+declare module 'parquetjs-lite' {
+  export class ParquetReader {
+    static openFile(filePath: string): Promise<ParquetReader>;
+    getCursor(): any;
+    close(): Promise<void>;
+  }
+}
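For reference, a minimal usage sketch of these ambient declarations (not part of the commit). The cursor's next() method and the example file path are assumptions taken from parquetjs-lite's documented behaviour, not guaranteed by the typings above:

// Hypothetical usage of the declarations above; the cursor API (next() resolving to a
// falsy value when exhausted) is assumed, not declared in the .d.ts file.
import { ParquetReader } from 'parquetjs-lite';

async function printRows(path: string): Promise<void> {
  const reader = await ParquetReader.openFile(path); // declared: static openFile(): Promise<ParquetReader>
  const cursor = reader.getCursor();                 // declared: getCursor(): any
  let row = await cursor.next();
  while (row) {
    console.log(row);
    row = await cursor.next();
  }
  await reader.close();                              // declared: close(): Promise<void>
}

printRows('public/models.parquet').catch(console.error);

Because getCursor() is typed as any, the rows come back untyped; callers would need to define their own row interface if they want type checking.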
Dockerfile
CHANGED
@@ -34,6 +34,9 @@ RUN \
   addgroup --system --gid 1001 nodejs; \
   adduser --system --uid 1001 nextjs
 
+# Public files are served by nextjs.
+COPY --from=builder --link /app/public ./public
+
 # Automatically leverage output traces to reduce image size
 COPY --from=builder --link --chown=1001:1001 /app/.next/standalone ./
 COPY --from=builder --link --chown=1001:1001 /app/.next/static ./.next/static
bun.lockb
CHANGED
Binary files a/bun.lockb and b/bun.lockb differ
package.json
CHANGED
@@ -9,8 +9,8 @@
     "lint": "next lint"
   },
   "dependencies": {
+    "@duckdb/duckdb-wasm": "^1.29.0",
     "next": "14.2.15",
-    "parquetjs-lite": "^0.8.7",
     "react": "^18",
     "react-dom": "^18"
   },
{tables → public}/ancestor_children.example.yaml
RENAMED
File without changes

{tables → public}/ancestor_children.parquet
RENAMED
File without changes

{tables → public}/datasets.example.yaml
RENAMED
File without changes

{tables → public}/datasets.parquet
RENAMED
File without changes

{tables → public}/models.example.yaml
RENAMED
File without changes

{tables → public}/models.parquet
RENAMED
File without changes

{tables → public}/parents.example.yaml
RENAMED
File without changes

{tables → public}/parents.parquet
RENAMED
File without changes

{tables → public}/spaces.example.yaml
RENAMED
File without changes

{tables → public}/spaces.parquet
RENAMED
File without changes
python/0_download_files.py
CHANGED
@@ -7,8 +7,8 @@ import random
 import argparse
 import yaml
 
-# Create the "tables" folders if they don't exist
-os.makedirs("tables", exist_ok=True)
+# Create the "public" folders if they don't exist
+os.makedirs("public", exist_ok=True)
 
 # URLs of the files to download
 urls = [
@@ -18,7 +18,7 @@ urls = [
 ]
 
 def download_file(url, overwrite=True):
-    filename = os.path.join("tables", url.split("/")[-1].split("?")[0])
+    filename = os.path.join("public", url.split("/")[-1].split("?")[0])
 
     if not overwrite and os.path.exists(filename):
         print(f"File already exists: {filename}. Skipping download.")
@@ -56,7 +56,7 @@ def main(overwrite):
 
     # Process each downloaded Parquet file
     for url in urls:
-        filename = os.path.join("tables", url.split("/")[-1].split("?")[0])
+        filename = os.path.join("public", url.split("/")[-1].split("?")[0])
         table_name = os.path.splitext(os.path.basename(filename))[0]
 
         # Connect to the Parquet file using DuckDB
@@ -86,8 +86,8 @@ def main(overwrite):
         yaml_content = yaml_content.rstrip()  # Remove trailing spaces
         yaml_content += "\n"
 
-        # Save the YAML content to a file in the "tables" folder
-        yaml_file = os.path.join("tables", f"{table_name}.example.yaml")
+        # Save the YAML content to a file in the "public" folder
+        yaml_file = os.path.join("public", f"{table_name}.example.yaml")
         with open(yaml_file, "w") as file:
             file.write(yaml_content)
 
python/1_parents.py
CHANGED
@@ -23,7 +23,7 @@ query = """
     _id,
     id,
     extract_base_models(tags) AS base_models
-FROM parquet_scan('tables/models.parquet')
+FROM parquet_scan('public/models.parquet')
 """
 
 start_time = time.time()
@@ -32,7 +32,7 @@ start_time = time.time()
 con.execute(f"CREATE VIEW parent_models AS {query}")
 
 # Write the view to a parquet file
-con.execute("COPY parent_models TO 'tables/parents.parquet' (FORMAT 'parquet')")
+con.execute("COPY parent_models TO 'public/parents.parquet' (FORMAT 'parquet')")
 
 end_time = time.time()
 execution_time = end_time - start_time
@@ -55,5 +55,5 @@ result = con.execute("""
     LIMIT 10
 """).fetchall()
 
-with open("tables/parents.example.yaml", "w") as f:
+with open("public/parents.example.yaml", "w") as f:
     yaml.safe_dump(result, f, default_flow_style=False)
python/2_ancestors.py
CHANGED
@@ -22,7 +22,7 @@ total_start_time = time.perf_counter()
 # Load parents.parquet into an in-memory table
 load_parents_query = """
 CREATE TABLE parents_in_memory AS
-SELECT * FROM parquet_scan('tables/parents.parquet')
+SELECT * FROM parquet_scan('public/parents.parquet')
 """
 execute_with_timing(load_parents_query, "Loaded parents.parquet into RAM")
 
@@ -115,7 +115,7 @@ final_output_query = """
     FROM ancestor_children ac
     LEFT JOIN direct_children_mapping dcm ON ac.ancestor = dcm.parent
     ORDER BY all_children_count DESC
-) TO 'tables/ancestor_children.parquet' (FORMAT 'parquet')
+) TO 'public/ancestor_children.parquet' (FORMAT 'parquet')
 """
 con.execute(final_output_query)
 end_time = time.perf_counter()
@@ -131,7 +131,7 @@ sample_query = """
     LIMIT 10
 """
 sample_data = con.execute(sample_query).fetchall()
-with open("tables/ancestor_children.example.yaml", "w") as f:
+with open("public/ancestor_children.example.yaml", "w") as f:
     yaml.safe_dump(sample_data, f, default_flow_style=False)
 end_time = time.perf_counter()
 logging.info(f"Written sample data to YAML file in {end_time - start_time:.6f} seconds.")
src/app/page.tsx
CHANGED
@@ -1,6 +1,7 @@
 [3 removed lines are not preserved in this view, apart from a truncated "import {"]
+'use client';
+
+import { useState, useEffect } from 'react';
+import * as duckdb from '@duckdb/duckdb-wasm';
 
 type ModelData = {
   ancestor: string;
@@ -10,41 +11,87 @@ type ModelData = {
   direct_children_count: number | null;
 };
 
-export default async function Home() {
 [18 further removed lines of the previous server-rendered implementation are not preserved in this view]
+export default function Home() {
+  const [allModels, setAllModels] = useState<ModelData[]>([]);
+  const [currentPage, setCurrentPage] = useState(1);
+  const [pageSize, setPageSize] = useState(100);
+  const [filterText, setFilterText] = useState('');
+
+  useEffect(() => {
+    async function fetchData() {
+      const JSDELIVR_BUNDLES = duckdb.getJsDelivrBundles();
+
+      // Select a bundle based on browser checks
+      const bundle = await duckdb.selectBundle(JSDELIVR_BUNDLES);
+
+      const worker_url = URL.createObjectURL(
+        new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' })
+      );
+
+      // Instantiate the asynchronous version of DuckDB-Wasm
+      const worker = new Worker(worker_url);
+      const logger = new duckdb.ConsoleLogger();
+      const db = new duckdb.AsyncDuckDB(logger, worker);
+      await db.instantiate(bundle.mainModule, bundle.pthreadWorker);
+
+      // Register the Parquet file using the URL
+      await db.registerFileURL(
+        'ancestor_children.parquet',
+        `${window.location.origin}/ancestor_children.parquet`,
+        duckdb.DuckDBDataProtocol.HTTP,
+        false
+      );
+
+      // Execute the SQL query using the registered Parquet file
+      const query = `
+        SELECT
+          ancestor,
+          direct_children,
+          all_children,
+          CAST(all_children_count AS INTEGER) AS all_children_count,
+          CAST(direct_children_count AS INTEGER) AS direct_children_count
+        FROM 'ancestor_children.parquet'
+      `;
+      const conn = await db.connect();
+      const result = await conn.query(query);
+
+      // Convert the result to a JavaScript array
+      const data: ModelData[] = result.toArray();
+
+      // Close the connection and terminate the worker
+      await conn.close();
+      await db.terminate();
+
+      setAllModels(data);
     }
 [1 removed line is not preserved in this view]
+    fetchData();
+  }, []);
 
 [1 removed line is not preserved in this view]
+  const filteredModels = allModels.filter((model) =>
+    model.ancestor.toLowerCase().includes(filterText.toLowerCase())
+  );
 
 [1 removed line is not preserved in this view]
-  const top10Models = data
-    .sort((a, b) => b.all_children.length - a.all_children.length)
-    .slice(0, 10);
+  const totalPages = Math.ceil(filteredModels.length / pageSize);
 
 [1 removed line is not preserved in this view]
+  const paginatedModels = filteredModels.slice(
+    (currentPage - 1) * pageSize,
+    currentPage * pageSize
+  );
 
 [4 removed lines are not preserved in this view]
+  return (
+    <main className="container mx-auto py-8 text-gray-900 dark:text-white">
+      <h1 className="text-4xl font-bold mb-4">All Models</h1>
+      <div className="mb-4">
+        <input
+          type="text"
+          placeholder="Filter by model name"
+          value={filterText}
+          onChange={(e) => setFilterText(e.target.value)}
+          className="px-4 py-2 border border-gray-300 rounded-md"
+        />
+      </div>
+      {paginatedModels.length > 0 ? (
+        <>
           <table className="table-auto border-collapse w-full">
             <thead>
               <tr>
@@ -54,7 +101,7 @@ export default async function Home() {
              </tr>
            </thead>
            <tbody>
-              {top10Models.map((model, index) => (
+              {paginatedModels.map((model, index) => (
              <tr key={index} className="border-t border-gray-200 dark:border-gray-700">
                <td className="px-4 py-2">{model.ancestor}</td>
                <td className="px-4 py-2 text-right">{model.direct_children_count ?? 0}</td>
@@ -63,18 +110,26 @@
              ))}
            </tbody>
          </table>
+          <div className="mt-4">
+            <button
+              onClick={() => setCurrentPage((prev) => Math.max(prev - 1, 1))}
+              disabled={currentPage === 1}
+              className="px-4 py-2 bg-blue-500 text-white rounded-md mr-2"
+            >
+              Previous
+            </button>
+            <button
+              onClick={() => setCurrentPage((prev) => Math.min(prev + 1, totalPages))}
+              disabled={currentPage === totalPages}
+              className="px-4 py-2 bg-blue-500 text-white rounded-md"
+            >
+              Next
+            </button>
+          </div>
+        </>
+      ) : (
+        <p>No data found.</p>
+      )}
+    </main>
+  );
 }
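As a possible follow-up (not part of this commit), the DuckDB-Wasm bootstrap inside fetchData could be factored into a small helper. A sketch under that assumption, reusing only the duckdb-wasm calls already present in the diff and adding try/finally cleanup plus revocation of the temporary worker blob URL; queryParquet is a hypothetical name:

// Hypothetical helper (not part of this commit): wraps the DuckDB-Wasm setup used in
// page.tsx so a component only supplies the Parquet file name and the SQL to run.
import * as duckdb from '@duckdb/duckdb-wasm';

export async function queryParquet<T>(fileName: string, sql: string): Promise<T[]> {
  const bundle = await duckdb.selectBundle(duckdb.getJsDelivrBundles());

  // Same worker bootstrap as in page.tsx, plus revocation of the temporary blob URL.
  const workerUrl = URL.createObjectURL(
    new Blob([`importScripts("${bundle.mainWorker!}");`], { type: 'text/javascript' })
  );
  const db = new duckdb.AsyncDuckDB(new duckdb.ConsoleLogger(), new Worker(workerUrl));

  try {
    await db.instantiate(bundle.mainModule, bundle.pthreadWorker);

    // Serve the Parquet file over HTTP from the app's own /public directory.
    await db.registerFileURL(
      fileName,
      `${window.location.origin}/${fileName}`,
      duckdb.DuckDBDataProtocol.HTTP,
      false
    );

    const conn = await db.connect();
    try {
      const result = await conn.query(sql);
      return result.toArray() as T[];
    } finally {
      await conn.close();
    }
  } finally {
    await db.terminate();
    URL.revokeObjectURL(workerUrl);
  }
}

Inside the effect, the component would then reduce to roughly const data = await queryParquet<ModelData>('ancestor_children.parquet', query); followed by setAllModels(data).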