Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
91c6e89
1
Parent(s):
c8b2c09
change 'proprietary' models to 'external' models and added new models
Browse files
proprietary_models_results.json → external_models_results.json
RENAMED
|
@@ -6,6 +6,7 @@
|
|
| 6 |
"date": "2024-04-12",
|
| 7 |
"status": "full",
|
| 8 |
"main_language": "Portuguese",
|
|
|
|
| 9 |
"result_metrics": {
|
| 10 |
"enem_challenge": 0.7172848145556333,
|
| 11 |
"bluex": 0.5549374130737135,
|
|
@@ -27,6 +28,7 @@
|
|
| 27 |
"date": "2024-04-13",
|
| 28 |
"status": "full",
|
| 29 |
"main_language": "Portuguese",
|
|
|
|
| 30 |
"result_metrics": {
|
| 31 |
"enem_challenge": 0.8180545836249126,
|
| 32 |
"bluex": 0.717663421418637,
|
|
@@ -48,6 +50,7 @@
|
|
| 48 |
"date": "2024-03-08",
|
| 49 |
"status": "full",
|
| 50 |
"main_language": "English",
|
|
|
|
| 51 |
"result_metrics": {
|
| 52 |
"enem_challenge": 0.7214835549335199,
|
| 53 |
"bluex": 0.6244784422809457,
|
|
@@ -69,6 +72,7 @@
|
|
| 69 |
"date": "2024-04-13",
|
| 70 |
"status": "full",
|
| 71 |
"main_language": "English",
|
|
|
|
| 72 |
"result_metrics": {
|
| 73 |
"enem_challenge": 0.7718684394681595,
|
| 74 |
"bluex": 0.6662030598052852,
|
|
@@ -90,6 +94,7 @@
|
|
| 90 |
"date": "2024-03-08",
|
| 91 |
"status": "full",
|
| 92 |
"main_language": "English",
|
|
|
|
| 93 |
"result_metrics": {
|
| 94 |
"enem_challenge": 0.7130860741777467,
|
| 95 |
"bluex": 0.5869262865090403,
|
|
@@ -111,6 +116,7 @@
|
|
| 111 |
"date": "2024-04-15",
|
| 112 |
"status": "full",
|
| 113 |
"main_language": "English",
|
|
|
|
| 114 |
"result_metrics": {
|
| 115 |
"enem_challenge": 0.8509447165850245,
|
| 116 |
"bluex": 0.7719054242002782,
|
|
@@ -132,6 +138,7 @@
|
|
| 132 |
"date": "2024-05-18",
|
| 133 |
"status": "full",
|
| 134 |
"main_language": "English",
|
|
|
|
| 135 |
"result_metrics": {
|
| 136 |
"enem_challenge": 0.7844646606018194,
|
| 137 |
"bluex": 0.6954102920723226,
|
|
@@ -153,6 +160,7 @@
|
|
| 153 |
"date": "2024-05-18",
|
| 154 |
"status": "full",
|
| 155 |
"main_language": "English",
|
|
|
|
| 156 |
"result_metrics": {
|
| 157 |
"enem_challenge": 0.8264520643806857,
|
| 158 |
"bluex": 0.7482614742698191,
|
|
@@ -166,5 +174,72 @@
|
|
| 166 |
},
|
| 167 |
"result_metrics_average": 0.7914657682594597,
|
| 168 |
"result_metrics_npm": 0.6834036936130392
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
}
|
| 170 |
]
|
|
|
|
| 6 |
"date": "2024-04-12",
|
| 7 |
"status": "full",
|
| 8 |
"main_language": "Portuguese",
|
| 9 |
+
"model_type": "proprietary",
|
| 10 |
"result_metrics": {
|
| 11 |
"enem_challenge": 0.7172848145556333,
|
| 12 |
"bluex": 0.5549374130737135,
|
|
|
|
| 28 |
"date": "2024-04-13",
|
| 29 |
"status": "full",
|
| 30 |
"main_language": "Portuguese",
|
| 31 |
+
"model_type": "proprietary",
|
| 32 |
"result_metrics": {
|
| 33 |
"enem_challenge": 0.8180545836249126,
|
| 34 |
"bluex": 0.717663421418637,
|
|
|
|
| 50 |
"date": "2024-03-08",
|
| 51 |
"status": "full",
|
| 52 |
"main_language": "English",
|
| 53 |
+
"model_type": "proprietary",
|
| 54 |
"result_metrics": {
|
| 55 |
"enem_challenge": 0.7214835549335199,
|
| 56 |
"bluex": 0.6244784422809457,
|
|
|
|
| 72 |
"date": "2024-04-13",
|
| 73 |
"status": "full",
|
| 74 |
"main_language": "English",
|
| 75 |
+
"model_type": "proprietary",
|
| 76 |
"result_metrics": {
|
| 77 |
"enem_challenge": 0.7718684394681595,
|
| 78 |
"bluex": 0.6662030598052852,
|
|
|
|
| 94 |
"date": "2024-03-08",
|
| 95 |
"status": "full",
|
| 96 |
"main_language": "English",
|
| 97 |
+
"model_type": "proprietary",
|
| 98 |
"result_metrics": {
|
| 99 |
"enem_challenge": 0.7130860741777467,
|
| 100 |
"bluex": 0.5869262865090403,
|
|
|
|
| 116 |
"date": "2024-04-15",
|
| 117 |
"status": "full",
|
| 118 |
"main_language": "English",
|
| 119 |
+
"model_type": "proprietary",
|
| 120 |
"result_metrics": {
|
| 121 |
"enem_challenge": 0.8509447165850245,
|
| 122 |
"bluex": 0.7719054242002782,
|
|
|
|
| 138 |
"date": "2024-05-18",
|
| 139 |
"status": "full",
|
| 140 |
"main_language": "English",
|
| 141 |
+
"model_type": "proprietary",
|
| 142 |
"result_metrics": {
|
| 143 |
"enem_challenge": 0.7844646606018194,
|
| 144 |
"bluex": 0.6954102920723226,
|
|
|
|
| 160 |
"date": "2024-05-18",
|
| 161 |
"status": "full",
|
| 162 |
"main_language": "English",
|
| 163 |
+
"model_type": "proprietary",
|
| 164 |
"result_metrics": {
|
| 165 |
"enem_challenge": 0.8264520643806857,
|
| 166 |
"bluex": 0.7482614742698191,
|
|
|
|
| 174 |
},
|
| 175 |
"result_metrics_average": 0.7914657682594597,
|
| 176 |
"result_metrics_npm": 0.6834036936130392
|
| 177 |
+
},
|
| 178 |
+
{
|
| 179 |
+
"model": "gemini-1.5-flash",
|
| 180 |
+
"name": "Gemini 1.5 Flash",
|
| 181 |
+
"link": "https://cloud.google.com/vertex-ai",
|
| 182 |
+
"date": "2024-08-09",
|
| 183 |
+
"status": "full",
|
| 184 |
+
"main_language": "English",
|
| 185 |
+
"model_type": "proprietary",
|
| 186 |
+
"result_metrics": {
|
| 187 |
+
"enem_challenge": 0.8306508047585724,
|
| 188 |
+
"bluex": 0.7579972183588317,
|
| 189 |
+
"oab_exams": 0.6446469248291572,
|
| 190 |
+
"assin2_sts": 0.838806085610371,
|
| 191 |
+
"assin2_rte": 0.9366169973822607,
|
| 192 |
+
"faquad_nli": 0.7963910785668922,
|
| 193 |
+
"hatebr_offensive": 0.9092078461170015,
|
| 194 |
+
"portuguese_hate_speech": 0.6932563987219857,
|
| 195 |
+
"tweetsentbr": 0.7312948963367732
|
| 196 |
+
},
|
| 197 |
+
"result_metrics_average": 0.7932075834090939,
|
| 198 |
+
"result_metrics_npm": 0.6855338135928848
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"model": "gpt-4o-mini-2024-07-18",
|
| 202 |
+
"name": "GPT 4o Mini (2024-07-18)",
|
| 203 |
+
"link": "https://www.openai.com/",
|
| 204 |
+
"date": "2024-07-25",
|
| 205 |
+
"status": "full",
|
| 206 |
+
"main_language": "English",
|
| 207 |
+
"model_type": "proprietary",
|
| 208 |
+
"result_metrics": {
|
| 209 |
+
"enem_challenge": 0.7669699090272918,
|
| 210 |
+
"bluex": 0.6842837273991655,
|
| 211 |
+
"oab_exams": 0.6013667425968109,
|
| 212 |
+
"assin2_sts": 0.7259038954527597,
|
| 213 |
+
"assin2_rte": 0.942809846745341,
|
| 214 |
+
"faquad_nli": 0.819807735300693,
|
| 215 |
+
"hatebr_offensive": 0.8682357029532165,
|
| 216 |
+
"portuguese_hate_speech": 0.7501413502853012,
|
| 217 |
+
"tweetsentbr": 0.7509303825869922
|
| 218 |
+
},
|
| 219 |
+
"result_metrics_average": 0.7678276991497301,
|
| 220 |
+
"result_metrics_npm": 0.6595966999910003
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"model": "nemotron-4-340b-instruct",
|
| 224 |
+
"name": "nvidia/Nemotron-4-340B-Instruct (Nvidia API)",
|
| 225 |
+
"link": "https://build.nvidia.com/nvidia/nemotron-4-340b-instruct",
|
| 226 |
+
"date": "2024-06-30",
|
| 227 |
+
"status": "full",
|
| 228 |
+
"main_language": "English",
|
| 229 |
+
"model_type": "chat",
|
| 230 |
+
"params": 340.0,
|
| 231 |
+
"result_metrics": {
|
| 232 |
+
"enem_challenge": 0.6648005598320503,
|
| 233 |
+
"bluex": 0.6578581363004172,
|
| 234 |
+
"oab_exams": 0.7020501138952164,
|
| 235 |
+
"assin2_sts": 0.7857731021403329,
|
| 236 |
+
"assin2_rte": 0.9489354458928496,
|
| 237 |
+
"faquad_nli": 0.8194444444444444,
|
| 238 |
+
"hatebr_offensive": 0.8641580001234928,
|
| 239 |
+
"portuguese_hate_speech": 0.7761835184102864,
|
| 240 |
+
"tweetsentbr": 0.780880021326841
|
| 241 |
+
},
|
| 242 |
+
"result_metrics_average": 0.7777870380406591,
|
| 243 |
+
"result_metrics_npm": 0.6740728488043128
|
| 244 |
}
|
| 245 |
]
|
src/display/utils.py
CHANGED
|
@@ -166,24 +166,30 @@ human_baseline_row[AutoEvalColumn.npm.name] = round(sum(npm) / len(npm), 2)
|
|
| 166 |
if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
|
| 167 |
human_baseline_row["π€ Leaderboard Average"] = None
|
| 168 |
|
| 169 |
-
#
|
| 170 |
-
|
| 171 |
-
if os.path.exists('
|
| 172 |
-
with open('
|
| 173 |
all_models = json.load(f)
|
| 174 |
for model_data in all_models:
|
| 175 |
model_row = deepcopy(baseline_row)
|
| 176 |
model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
|
| 177 |
model_row[AutoEvalColumn.dummy.name] = model_data['model']
|
| 178 |
-
model_row[AutoEvalColumn.license.name] = "Proprietary"
|
| 179 |
for task in Tasks:
|
| 180 |
model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
|
| 181 |
model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
|
| 182 |
model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
|
| 183 |
-
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
|
| 186 |
-
|
| 187 |
|
| 188 |
@dataclass
|
| 189 |
class ModelDetails:
|
|
|
|
| 166 |
if GET_ORIGINAL_HF_LEADERBOARD_EVAL_RESULTS:
|
| 167 |
human_baseline_row["π€ Leaderboard Average"] = None
|
| 168 |
|
| 169 |
+
#External models
|
| 170 |
+
external_rows = []
|
| 171 |
+
if os.path.exists('external_models_results.json'):
|
| 172 |
+
with open('external_models_results.json', 'r', encoding='utf8') as f:
|
| 173 |
all_models = json.load(f)
|
| 174 |
for model_data in all_models:
|
| 175 |
model_row = deepcopy(baseline_row)
|
| 176 |
model_row[AutoEvalColumn.model.name] = f'<a target="_blank" href="{model_data["link"]}" style="color: var(--text-color); text-decoration: underline;text-decoration-style: dotted;">{model_data["name"]} [{model_data["date"]}]</a>'
|
| 177 |
model_row[AutoEvalColumn.dummy.name] = model_data['model']
|
|
|
|
| 178 |
for task in Tasks:
|
| 179 |
model_row[task.value.col_name] = round(model_data['result_metrics'][task.value.benchmark]*100, 2)
|
| 180 |
model_row[AutoEvalColumn.average.name] = round(model_data['result_metrics_average']*100, 2)
|
| 181 |
model_row[AutoEvalColumn.npm.name] = round(model_data['result_metrics_npm']*100, 2)
|
| 182 |
+
|
| 183 |
+
model_type = ModelType.from_str(model_data['model_type'])
|
| 184 |
+
model_row[AutoEvalColumn.model_type.name] = model_type.name
|
| 185 |
+
model_row[AutoEvalColumn.model_type_symbol.name] = model_type.symbol
|
| 186 |
+
if model_type == ModelType.proprietary:
|
| 187 |
+
model_row[AutoEvalColumn.license.name] = "Proprietary"
|
| 188 |
+
if 'params' in model_data:
|
| 189 |
+
model_row[AutoEvalColumn.params.name] = model_data['params']
|
| 190 |
+
|
| 191 |
model_row[AutoEvalColumn.main_language.name] = model_data['main_language']
|
| 192 |
+
external_rows.append(model_row)
|
| 193 |
|
| 194 |
@dataclass
|
| 195 |
class ModelDetails:
|
src/populate.py
CHANGED
|
@@ -5,7 +5,7 @@ import copy
|
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
from src.display.formatting import has_no_nan_values, make_requests_clickable_model
|
| 8 |
-
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row,
|
| 9 |
from src.leaderboard.filter_models import filter_models_flags
|
| 10 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 11 |
|
|
@@ -14,8 +14,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, dynamic_path: str,
|
|
| 14 |
raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
|
| 15 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 16 |
all_data_json.append(baseline_row)
|
| 17 |
-
for
|
| 18 |
-
all_data_json.append(
|
| 19 |
filter_models_flags(all_data_json)
|
| 20 |
|
| 21 |
df = pd.DataFrame.from_records(all_data_json)
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
from src.display.formatting import has_no_nan_values, make_requests_clickable_model
|
| 8 |
+
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row, external_rows
|
| 9 |
from src.leaderboard.filter_models import filter_models_flags
|
| 10 |
from src.leaderboard.read_evals import get_raw_eval_results
|
| 11 |
|
|
|
|
| 14 |
raw_data = get_raw_eval_results(results_path=results_path, requests_path=requests_path, dynamic_path=dynamic_path)
|
| 15 |
all_data_json = [v.to_dict() for v in raw_data]
|
| 16 |
all_data_json.append(baseline_row)
|
| 17 |
+
for external_row in external_rows:
|
| 18 |
+
all_data_json.append(external_row)
|
| 19 |
filter_models_flags(all_data_json)
|
| 20 |
|
| 21 |
df = pd.DataFrame.from_records(all_data_json)
|