β€œpangjh3” commited on
Commit
64f7b86
Β·
1 Parent(s): 931449f

modified: .gitignore
modified: .pre-commit-config.yaml
modified: Makefile
modified: app.py
modified: pyproject.toml
modified: src/about.py

Files changed (6):
  1. .gitignore +1 -0
  2. .pre-commit-config.yaml +1 -0
  3. Makefile +1 -0
  4. app.py +25 -25
  5. pyproject.toml +1 -0
  6. src/about.py +2 -2
.gitignore CHANGED
@@ -12,3 +12,4 @@ eval-queue-bk/
 eval-results-bk/
 logs/
 
+
.pre-commit-config.yaml CHANGED
@@ -52,3 +52,4 @@ repos:
   hooks:
   - id: ruff
 
+
Makefile CHANGED
@@ -12,3 +12,4 @@ quality:
 	python -m isort --check-only .
 	ruff check .
 
+
app.py CHANGED
@@ -224,37 +224,37 @@ with demo:
         interactive=False
     )
 
-    # Main leaderboard table
-    gr.Markdown("## 🏆 ATLAS Benchmark Results", elem_classes="markdown-text")
+    # Main leaderboard table - COMMENTED OUT
+    # gr.Markdown("## 🏆 ATLAS Benchmark Results", elem_classes="markdown-text")
 
-    # Debug information - dynamic component
-    results_count = gr.Markdown(f"📊 **Showing {len(leaderboard_df)} results**")
+    # # Debug information - dynamic component
+    # results_count = gr.Markdown(f"📊 **Showing {len(leaderboard_df)} results**")
 
-    leaderboard_table = gr.Dataframe(
-        value=leaderboard_df,
-        datatype=COLUMN_TYPES,
-        interactive=False,
-        wrap=True,
-        column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
-    )
-
-    # Refresh button
-    refresh_button = gr.Button("🔄 Refresh Leaderboard")
+    # leaderboard_table = gr.Dataframe(
+    #     value=leaderboard_df,
+    #     datatype=COLUMN_TYPES,
+    #     interactive=False,
+    #     wrap=True,
+    #     column_widths=["30%", "20%", "12%", "12%", "12%", "14%"]
+    # )
+
+    # # Refresh button
+    # refresh_button = gr.Button("🔄 Refresh Leaderboard")
 
-    def refresh_leaderboard_with_count():
-        """Refresh leaderboard and update count display"""
-        df = refresh_leaderboard()
-        count_text = f"📊 **Showing {len(df)} results**"
-        return df, count_text
+    # def refresh_leaderboard_with_count():
+    #     """Refresh leaderboard and update count display"""
+    #     df = refresh_leaderboard()
+    #     count_text = f"📊 **Showing {len(df)} results**"
+    #     return df, count_text
 
-    refresh_button.click(
-        refresh_leaderboard_with_count,
-        inputs=[],
-        outputs=[leaderboard_table, results_count]
-    )
+    # refresh_button.click(
+    #     refresh_leaderboard_with_count,
+    #     inputs=[],
+    #     outputs=[leaderboard_table, results_count]
+    # )
 
     # Submission section
-    with gr.Accordion("📊 Submit Your ATLAS Results", open=False):
+    with gr.Accordion("🎯 Submit Your ATLAS Results", open=False):
        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
        gr.Markdown("""
pyproject.toml CHANGED
@@ -12,3 +12,4 @@ line_length = 119
 [tool.black]
 line-length = 119
 
+
src/about.py CHANGED
@@ -32,7 +32,7 @@ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidi
32
  INTRODUCTION_TEXT = """
33
  **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
34
 
35
- ## Benchmark Overview
36
  **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
37
  - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
38
  - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
@@ -42,7 +42,7 @@ INTRODUCTION_TEXT = """
42
  - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
43
  - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
44
 
45
- ## Evaluation Metrics
46
  - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
47
  - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
48
  - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
 
32
  INTRODUCTION_TEXT = """
33
  **ATLAS (AGI-Oriented Testbed for Logical Application in Science)** is a large-scale, high-difficulty, cross-disciplinary evaluation suite for assessing the frontier scientific reasoning capabilities of LLMs. Designed to address the challenges of benchmark saturation, narrow disciplinary focus, oversimplified answer formats, and data contamination in existing evaluations, ATLAS serves as a reliable **ruler** for measuring progress toward AGI in the **AI for Science** domain.
34
 
35
+ ## πŸš€ Benchmark Overview
36
  **ATLAS** evaluates models across seven core scientific fields that are central to AI for Science, encompassing 57 corresponding sub-fields to ensure comprehensive coverage of scientific reasoning requirements:
37
  - **Mathematics** - Abstract algebra, analysis, differential equations, and computational mathematics
38
  - **Physics** - Classical mechanics, electrodynamics, quantum mechanics, thermodynamics, and astrophysics
 
42
  - **Earth Science** - Geography, geodesy, atmospheric chemistry, marine science, and geology
43
  - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
44
 
45
+ ## πŸ“Š Evaluation Metrics
46
  - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
47
  - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
48
  - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
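The page does not spell out how mG-Pass@k is computed. As an illustration only, a sketch of the G-Pass@k family as commonly defined in the literature: with n generations per question of which c are correct, G-Pass@k_τ is the probability that at least ⌈τ·k⌉ of k generations drawn without replacement are correct, and mG-Pass@k averages G-Pass@k_τ over thresholds τ = i/k for i in (k/2, k]. The exact ATLAS definition may differ:

```python
# Illustrative only: one common G-Pass@k / mG-Pass@k formulation;
# the exact ATLAS definition is not stated on this page and may differ.
from math import ceil, comb

def g_pass_at_k(n: int, c: int, k: int, tau: float) -> float:
    """P(at least ceil(tau*k) of k generations drawn without replacement
    are correct), given n total generations of which c are correct."""
    m = ceil(tau * k)
    # Hypergeometric tail: comb() returns 0 when j > c, so the sum is safe.
    return sum(comb(c, j) * comb(n - c, k - j) for j in range(m, k + 1)) / comb(n, k)

def mg_pass_at_k(n: int, c: int, k: int) -> float:
    """Average of G-Pass@k_tau over thresholds tau = i/k with i in (k/2, k]."""
    lo = k // 2 + 1
    return sum(g_pass_at_k(n, c, k, i / k) for i in range(lo, k + 1)) / (k - lo + 1)

# Example: 4 total generations, 3 correct, evaluated at k = 2.
print(mg_pass_at_k(n=4, c=3, k=2))  # 0.5
```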