Spaces:
Running
Running
Commit
·
472c111
1
Parent(s):
4a49ee3
Added asterisk for closed models
Browse files
- leaderboard/md.py +1 -1
- leaderboard/utils.py +7 -0
leaderboard/md.py
CHANGED
|
@@ -178,7 +178,7 @@ TOP_TEXT = """# RewardBench: Evaluating Reward Models"""
|
|
| 178 |
|
| 179 |
CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
|
| 180 |
|
| 181 |
-
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} |
|
| 182 |
|
| 183 |
CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
|
| 184 |
|
|
|
|
| 178 |
|
| 179 |
CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human data and designed to be substantially more difficult!
|
| 180 |
|
| 181 |
+
[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset v2](https://huggingface.co/datasets/allenai/reward-bench-2) | [Results v2](https://huggingface.co/datasets/allenai/reward-bench-2-results) | [Paper](https://arxiv.org/abs/2506.01937) | Total models: {{}} | | * Closed models not run on Ai2 infrastructure | Last restart (PST): {current_time}"""
|
| 182 |
|
| 183 |
CAPTION_V1 = f"""The original RewardBench -- the first reward model evaluation.
|
| 184 |
|
leaderboard/utils.py
CHANGED
|
@@ -43,6 +43,13 @@ CONTAMINATED_MODELS_V1 = [
|
|
| 43 |
"Ray2333/GRM-Gemma-2B-rewardmodel-ft",
|
| 44 |
]
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
# From Open LLM Leaderboard
|
| 48 |
def model_hyperlink(link, model_name):
|
|
|
|
| 43 |
"Ray2333/GRM-Gemma-2B-rewardmodel-ft",
|
| 44 |
]
|
| 45 |
|
| 46 |
+
UNVERIFIED_MODELS_V2 = [
|
| 47 |
+
"ContextualAI/LMUnit-llama3.1-70b",
|
| 48 |
+
"ContextualAI/LMUnit-qwen2.5-72b",
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
UNVERIFIED_MODELS = UNVERIFIED_MODELS_V2
|
| 52 |
+
|
| 53 |
|
| 54 |
# From Open LLM Leaderboard
|
| 55 |
def model_hyperlink(link, model_name):
|