pangjh3 committed · Commit 64f7b86 · Parent(s): 931449f
modified: .gitignore
modified: .pre-commit-config.yaml
modified: Makefile
modified: app.py
modified: pyproject.toml
modified: src/about.py
- .gitignore +1 -0
- .pre-commit-config.yaml +1 -0
- Makefile +1 -0
- app.py +25 -25
- pyproject.toml +1 -0
- src/about.py +2 -2
.gitignore CHANGED
@@ -12,3 +12,4 @@ eval-queue-bk/
 eval-results-bk/
 logs/
 
+
.pre-commit-config.yaml CHANGED
@@ -52,3 +52,4 @@ repos:
     hooks:
       - id: ruff
 
+
Makefile CHANGED
@@ -12,3 +12,4 @@ quality:
 	python -m isort --check-only .
 	ruff check .
 
+
app.py CHANGED
@@ -224,37 +224,37 @@ with demo:
         interactive=False
     )
 
-    # Main leaderboard table
-    gr.Markdown("## π ATLAS Benchmark Results", elem_classes="markdown-text")
+    # Main leaderboard table - COMMENTED OUT
+    # gr.Markdown("## π ATLAS Benchmark Results", elem_classes="markdown-text")
 
-    # Debug information - dynamic component
-    results_count = gr.Markdown(f"π **Showing {len(leaderboard_df)} results**")
+    # # Debug information - dynamic component
+    # results_count = gr.Markdown(f"π **Showing {len(leaderboard_df)} results**")
 
-    leaderboard_table = gr.Dataframe(
-        value=leaderboard_df,
-        datatype=COLUMN_TYPES,
-        interactive=False,
-        wrap=True,
-        column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
-    )
-
-    # Refresh button
-    refresh_button = gr.Button("π Refresh Leaderboard")
+    # leaderboard_table = gr.Dataframe(
+    #     value=leaderboard_df,
+    #     datatype=COLUMN_TYPES,
+    #     interactive=False,
+    #     wrap=True,
+    #     column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
+    # )
+
+    # # Refresh button
+    # refresh_button = gr.Button("π Refresh Leaderboard")
 
-    def refresh_leaderboard_with_count():
-        """Refresh leaderboard and update count display"""
-        df = refresh_leaderboard()
-        count_text = f"π **Showing {len(df)} results**"
-        return df, count_text
+    # def refresh_leaderboard_with_count():
+    #     """Refresh leaderboard and update count display"""
+    #     df = refresh_leaderboard()
+    #     count_text = f"π **Showing {len(df)} results**"
+    #     return df, count_text
 
-    refresh_button.click(
-        refresh_leaderboard_with_count,
-        inputs=[],
-        outputs=[leaderboard_table, results_count]
-    )
+    # refresh_button.click(
+    #     refresh_leaderboard_with_count,
+    #     inputs=[],
+    #     outputs=[leaderboard_table, results_count]
+    # )
 
     # Submission section
-    with gr.Accordion("
+    with gr.Accordion("π― Submit Your ATLAS Results", open=False):
         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
         gr.Markdown("""
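For reference, the block commented out above followed a common Gradio refresh pattern: a gr.Dataframe shows the leaderboard, and a button's click handler returns a fresh DataFrame plus an updated count string. The sketch below reproduces that pattern in a self-contained form; load_results() is a hypothetical stand-in for the app's refresh_leaderboard(), whose implementation is not shown in this diff.

```python
# Minimal sketch of the refresh wiring commented out in app.py.
# load_results() is a hypothetical placeholder for the app's refresh_leaderboard().
import gradio as gr
import pandas as pd


def load_results() -> pd.DataFrame:
    # Placeholder data; the real app reloads evaluation results from storage.
    return pd.DataFrame(
        {"Model": ["model-a", "model-b"], "Accuracy (%)": [41.2, 37.8]}
    )


def refresh_with_count():
    """Reload the table and update the results counter in one handler."""
    df = load_results()
    return df, f"**Showing {len(df)} results**"


with gr.Blocks() as demo:
    results_count = gr.Markdown("**Showing 0 results**")
    table = gr.Dataframe(value=load_results(), interactive=False, wrap=True)
    refresh_button = gr.Button("Refresh Leaderboard")
    # A single click event returns both outputs, keeping table and counter in sync.
    refresh_button.click(refresh_with_count, inputs=[], outputs=[table, results_count])

if __name__ == "__main__":
    demo.launch()
```

Returning both outputs from one handler is what keeps the count text consistent with the displayed table after each refresh.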
pyproject.toml CHANGED
@@ -12,3 +12,4 @@ line_length = 119
 [tool.black]
 line-length = 119
 
+
src/about.py CHANGED
@@ -32,7 +32,7 @@ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidi
 INTRODUCTION_TEXT = """
 **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
 
-## Benchmark Overview
+## π Benchmark Overview
 **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
 - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
 - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
@@ -42,7 +42,7 @@ INTRODUCTION_TEXT = """
 - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
 - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
 
-## Evaluation Metrics
+## π Evaluation Metrics
 - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
 - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
 - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
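The metrics text above cites mG-Pass@2 and mG-Pass@4, but their exact formula is not part of this diff. As a rough illustration of multi-generation pass-rate metrics in general, the sketch below computes the standard unbiased pass@k estimator from n generations with c judged correct; it should not be read as the official mG-Pass@k definition used by ATLAS.

```python
# Illustrative only: standard unbiased pass@k estimator over n generations.
# ATLAS's mG-Pass@k is a related but distinct metric whose formula is not shown here.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k samples drawn without replacement
    from n generations is correct, given c correct generations."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


if __name__ == "__main__":
    # Example: 4 generations per problem, 2 judged correct by the LLM judge.
    print(f"pass@2 = {pass_at_k(n=4, c=2, k=2):.3f}")  # 0.833
```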