pangjh3 committed · Commit 64f7b86 · Parent(s): 931449f
modified: .gitignore
modified: .pre-commit-config.yaml
modified: Makefile
modified: app.py
modified: pyproject.toml
modified: src/about.py
- .gitignore +1 -0
- .pre-commit-config.yaml +1 -0
- Makefile +1 -0
- app.py +25 -25
- pyproject.toml +1 -0
- src/about.py +2 -2
.gitignore CHANGED
@@ -12,3 +12,4 @@ eval-queue-bk/
 eval-results-bk/
 logs/
 
+
.pre-commit-config.yaml CHANGED
@@ -52,3 +52,4 @@ repos:
     hooks:
       - id: ruff
 
+
Makefile CHANGED
@@ -12,3 +12,4 @@ quality:
 	python -m isort --check-only .
 	ruff check .
 
+
app.py CHANGED
@@ -224,37 +224,37 @@ with demo:
         interactive=False
     )
 
-    # Main leaderboard table
-    gr.Markdown("## π ATLAS Benchmark Results", elem_classes="markdown-text")
+    # Main leaderboard table - COMMENTED OUT
+    # gr.Markdown("## π ATLAS Benchmark Results", elem_classes="markdown-text")
 
-    # Debug information - dynamic component
-    results_count = gr.Markdown(f"π **Showing {len(leaderboard_df)} results**")
+    # # Debug information - dynamic component
+    # results_count = gr.Markdown(f"π **Showing {len(leaderboard_df)} results**")
 
-    leaderboard_table = gr.Dataframe(
-        value=leaderboard_df,
-        datatype=COLUMN_TYPES,
-        interactive=False,
-        wrap=True,
-        column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
-    )
-
-    # Refresh button
-    refresh_button = gr.Button("π Refresh Leaderboard")
+    # leaderboard_table = gr.Dataframe(
+    #     value=leaderboard_df,
+    #     datatype=COLUMN_TYPES,
+    #     interactive=False,
+    #     wrap=True,
+    #     column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
+    # )
+
+    # # Refresh button
+    # refresh_button = gr.Button("π Refresh Leaderboard")
 
-    def refresh_leaderboard_with_count():
-        """Refresh leaderboard and update count display"""
-        df = refresh_leaderboard()
-        count_text = f"π **Showing {len(df)} results**"
-        return df, count_text
+    # def refresh_leaderboard_with_count():
+    #     """Refresh leaderboard and update count display"""
+    #     df = refresh_leaderboard()
+    #     count_text = f"π **Showing {len(df)} results**"
+    #     return df, count_text
 
-    refresh_button.click(
-        refresh_leaderboard_with_count,
-        inputs=[],
-        outputs=[leaderboard_table, results_count]
-    )
+    # refresh_button.click(
+    #     refresh_leaderboard_with_count,
+    #     inputs=[],
+    #     outputs=[leaderboard_table, results_count]
+    # )
 
     # Submission section
-    with gr.Accordion("
+    with gr.Accordion("π― Submit Your ATLAS Results", open=False):
         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
         gr.Markdown("""
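For reference, the block commented out above followed a common Gradio refresh pattern: a gr.Dataframe shows the leaderboard, and a button's click handler returns a fresh DataFrame plus an updated count string. The sketch below reproduces that pattern in a self-contained form; load_results() is a hypothetical stand-in for the app's refresh_leaderboard(), whose implementation is not shown in this diff.

```python
# Minimal sketch of the refresh wiring commented out in app.py.
# load_results() is a hypothetical placeholder for the app's refresh_leaderboard().
import gradio as gr
import pandas as pd


def load_results() -> pd.DataFrame:
    # Placeholder data; the real app reloads evaluation results from storage.
    return pd.DataFrame(
        {"Model": ["model-a", "model-b"], "Accuracy (%)": [41.2, 37.8]}
    )


def refresh_with_count():
    """Reload the table and update the results counter in one handler."""
    df = load_results()
    return df, f"**Showing {len(df)} results**"


with gr.Blocks() as demo:
    results_count = gr.Markdown("**Showing 0 results**")
    table = gr.Dataframe(value=load_results(), interactive=False, wrap=True)
    refresh_button = gr.Button("Refresh Leaderboard")
    # A single click event returns both outputs, keeping table and counter in sync.
    refresh_button.click(refresh_with_count, inputs=[], outputs=[table, results_count])

if __name__ == "__main__":
    demo.launch()
```

Returning both outputs from one handler is what keeps the count text consistent with the displayed table after each refresh.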
pyproject.toml CHANGED
@@ -12,3 +12,4 @@ line_length = 119
 [tool.black]
 line-length = 119
 
+
src/about.py CHANGED
@@ -32,7 +32,7 @@ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidi
 INTRODUCTION_TEXT = """
 **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
 
-## Benchmark Overview
+## π Benchmark Overview
 **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
 - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
 - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
@@ -42,7 +42,7 @@ INTRODUCTION_TEXT = """
 - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
 - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
 
-## Evaluation Metrics
+## π Evaluation Metrics
 - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
 - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
 - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
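The metrics text above cites mG-Pass@2 and mG-Pass@4, but their exact formula is not part of this diff. As a rough illustration of multi-generation pass-rate metrics in general, the sketch below computes the standard unbiased pass@k estimator from n generations with c judged correct; it should not be read as the official mG-Pass@k definition used by ATLAS.

```python
# Illustrative only: standard unbiased pass@k estimator over n generations.
# ATLAS's mG-Pass@k is a related but distinct metric whose formula is not shown here.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k samples drawn without replacement
    from n generations is correct, given c correct generations."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


if __name__ == "__main__":
    # Example: 4 generations per problem, 2 judged correct by the LLM judge.
    print(f"pass@2 = {pass_at_k(n=4, c=2, k=2):.3f}")  # 0.833
```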