“pangjh3” committed on
Commit
9b1c42f
·
1 Parent(s): a0e98c9

modified: .gitignore

Browse files

modified: .pre-commit-config.yaml
modified: Makefile
modified: pyproject.toml
modified: src/about.py

Files changed (5) hide show
  1. .gitignore +3 -0
  2. .pre-commit-config.yaml +3 -0
  3. Makefile +3 -0
  4. pyproject.toml +3 -0
  5. src/about.py +17 -3
.gitignore CHANGED
@@ -13,3 +13,6 @@ eval-results-bk/
13
  logs/
14
 
15
 
 
 
 
 
13
  logs/
14
 
15
 
16
+
17
+
18
+
.pre-commit-config.yaml CHANGED
@@ -53,3 +53,6 @@ repos:
53
  - id: ruff
54
 
55
 
 
 
 
 
53
  - id: ruff
54
 
55
 
56
+
57
+
58
+
Makefile CHANGED
@@ -13,3 +13,6 @@ quality:
13
  ruff check .
14
 
15
 
 
 
 
 
13
  ruff check .
14
 
15
 
16
+
17
+
18
+
pyproject.toml CHANGED
@@ -13,3 +13,6 @@ line_length = 119
13
  line-length = 119
14
 
15
 
 
 
 
 
13
  line-length = 119
14
 
15
 
16
+
17
+
18
+
src/about.py CHANGED
@@ -26,7 +26,19 @@ NUM_FEWSHOT = 0 # Change with your few shot
26
 
27
 
28
  # Your leaderboard name
29
- TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  # What does your leaderboard evaluate?
32
  INTRODUCTION_TEXT = """
@@ -43,10 +55,12 @@ INTRODUCTION_TEXT = """
43
  - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
44
 
45
  ## 📊 Evaluation Metrics
46
- - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / Qwen3-235B-A22B)
47
  - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
48
  - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
49
  The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the ATLAS validation/test set (≈800 expert-created original problems).
 
 
50
  """
51
 
52
  # Which evaluations are you running? how can people reproduce what you have?
@@ -86,7 +100,7 @@ To reproduce our evaluation results:
86
  """
87
 
88
  EVALUATION_QUEUE_TEXT = """
89
- ## Submit Your ATLAS Results
90
 
91
  Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.
92
 
 
26
 
27
 
28
  # Your leaderboard name
29
+ # TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>"""
30
+ TITLE = """<h1 align="center" id="space-title">ATLAS: A High-Difficulty, Multidisciplinary Benchmark for Frontier Scientific Reasoning</h1>
31
+ <div align="center">
32
+ <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/">
33
+ <img src="https://img.shields.io/badge/Dataset%20License-CC%20BY--NC--SA%204.0-blue.svg" alt="Dataset License: CC BY-NC-SA 4.0">
34
+ </a>
35
+ <a href="https://arxiv.org/abs/2511.14366">
36
+ <img src="https://img.shields.io/badge/Paper-arXiv-red.svg" alt="Paper">
37
+ </a>
38
+ <a href="https://huggingface.co/datasets/opencompass/ATLAS">
39
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Dataset-orange" alt="Hugging Face Dataset">
40
+ </a>
41
+ </div>"""
42
 
43
  # What does your leaderboard evaluate?
44
  INTRODUCTION_TEXT = """
 
55
  - **Materials Science** - Composite materials, metal materials, organic polymer materials, and material synthesis
56
 
57
  ## 📊 Evaluation Metrics
58
+ - **Accuracy (%)**: Overall correctness of predictions across all domains, judged by LLM-as-Judge (OpenAI o4-mini / GPT-OSS-120B)
59
  - **mG-Pass@2**: Multi-generation Pass rate for 2 predictions (measures consistency of model outputs)
60
  - **mG-Pass@4**: Multi-generation Pass rate for 4 predictions (measures stability of reasoning capabilities)
61
  The leaderboard displays model performance sorted by average accuracy, with domain-specific scores reflecting strengths in different scientific fields. All metrics are derived from the ATLAS validation/test set (≈800 expert-created original problems).
62
+
63
+ #### 📧 If you have any questions about submissions or leaderboards, please contact: [email protected]
64
  """
65
 
66
  # Which evaluations are you running? how can people reproduce what you have?
 
100
  """
101
 
102
  EVALUATION_QUEUE_TEXT = """
103
+ ## Submit Your ATLAS Test Set Results
104
 
105
  Results can be submitted as evaluation outputs in JSON format. Each submission should include predictions and reasoning content for all test questions.
106