add perplexity
- app.py +1 -1
- src/about.py +3 -0
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +27 -4
app.py
CHANGED

@@ -246,7 +246,7 @@ with demo:
         interactive=False,
         visible=True,
         # column_widths=["2%", "33%"]
-        height=
+        height=800
     )

     # Dummy leaderboard for handling the case when the user uses backspace key
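For reference, the edited call configures the Gradio Dataframe that renders the leaderboard; a fixed height gives the table its own scrollable viewport instead of letting it grow with the row count. A minimal sketch of the same idea, assuming a Gradio version in which gr.Dataframe accepts height (the variable names and the two-row sample frame are illustrative, not taken from app.py):

import gradio as gr
import pandas as pd

# Hypothetical stand-in for the real leaderboard dataframe.
df = pd.DataFrame({"Model": ["model-a", "model-b"], "Average ⬆️": [51.2, 48.7]})

with gr.Blocks() as demo:
    # height=800 pins the table to an 800 px viewport; rows beyond that
    # scroll inside the component rather than stretching the page.
    table = gr.Dataframe(
        value=df,
        interactive=False,
        visible=True,
        height=800,
    )

if __name__ == "__main__":
    demo.launch()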
src/about.py
CHANGED

@@ -20,6 +20,7 @@ class Tasks(Enum):
     task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice")
     task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice")
     task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until")
+    #task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice")
     task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until")
     task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice")
     task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until")
@@ -31,6 +32,7 @@ class Tasks(Enum):
     task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until")
     task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice")
     task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until")
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "polish_poleval2018_task3_test_10k", "other")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -72,6 +74,7 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
 * add metadata for models (e.g. #Params)
 * add more tasks
 * use model templates
+* fix scrolling on Firefox

 ## Tasks

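The Task container itself is not part of this diff, but its shape can be read off the four positional arguments above and the task.metric / task.value.type / task.value.col_name accesses in read_evals.py. A minimal sketch under that assumption (field names inferred, not copied from the repo):

from dataclasses import dataclass
from enum import Enum

@dataclass(frozen=True)
class Task:
    benchmark: str   # lm-eval task name, e.g. "polish_poleval2018_task3_test_10k"
    metric: str      # metric key in the results file, e.g. "word_perplexity,none"
    col_name: str    # column header shown on the leaderboard
    type: str        # "multiple_choice", "generate_until", or "other"

class Tasks(Enum):
    # The new entry from this commit: a word-perplexity task whose type "other"
    # keeps it out of the generate_until / multiple_choice averages in to_dict().
    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none",
                  "polish_poleval2018_task3_test_10k", "other")

Because the metric string contains "perplexity", read_evals.py also skips the ×100 scaling for this task (see the read_evals.py hunk below).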
src/display/utils.py
CHANGED

@@ -26,6 +26,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
 auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
@@ -39,7 +40,6 @@ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Arch
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
 auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
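In the stock Hugging Face leaderboard template that this Space appears to follow, auto_eval_column_dict is later turned into a frozen dataclass, so the order of the append calls is the column order in the UI; that is why moving the "Lang" entry up places the column right after "Model". A sketch of that pattern, assuming a template-style ColumnContent and AutoEvalColumn (not copied from this repo):

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
# After this commit, "Lang" sits directly after "Model" in the list, so it is
# rendered directly after the model name in the leaderboard table.
auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str", True)])
auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])

# Each [attr_name, type, default] triple becomes a field of the generated class.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.lang.name)  # -> Lang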
src/leaderboard/read_evals.py
CHANGED

@@ -33,6 +33,7 @@ class EvalResult:
     date: str = "" # submission date of request file
     still_on_hub: bool = False
     n_shot: NShotType = NShotType.n0
+    org_and_model: str = ""

     @classmethod
     def init_from_json_file(self, json_filepath, n_shot_num):
@@ -48,6 +49,7 @@ class EvalResult:

         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
+        orig_org_and_model = org_and_model
         SPICHLERZ_ORG = "speakleash/"

         if re.match(r"^pretrained=(.*/(plgkwrobel|plggspkl)/)(models/)?", org_and_model):
@@ -91,7 +93,10 @@ class EvalResult:
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue

-            mean_acc = np.mean(accs) * 100.0
+            if 'perplexity' in task.metric:
+                mean_acc = np.mean(accs)
+            else:
+                mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

         return self(
@@ -104,7 +109,8 @@ class EvalResult:
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            n_shot=NShotType.from_str(n_shot_num)
+            n_shot=NShotType.from_str(n_shot_num),
+            org_and_model=orig_org_and_model
         )

     def update_with_metadata(self, metadata):
@@ -139,10 +145,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-
+        all_tasks = g_tasks + mc_tasks
+        average = sum([v for task,v in self.results.items() if v is not None and task in all_tasks]) / len(all_tasks)
         average_g = sum([v for task,v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
         average_mc = sum([v for task,v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)

@@ -352,4 +358,21 @@ def get_raw_eval_results(results_path: str, requests_path: str, metadata) -> lis
             print(f"not all eval values present {v.eval_name} {v.full_model}")
             continue

+    missing_results_for_task = {}
+    for v in eval_results.values():
+        r = v.to_dict()
+        for task in Tasks:
+            if r[task.value.col_name] is None:
+                task_name = f"{r['n_shot']}|{task.value.benchmark}"
+                if task_name in missing_results_for_task:
+                    missing_results_for_task[task_name].append(f"{v.full_model}|{v.org_and_model}")
+                else:
+                    missing_results_for_task[task_name] = [f"{v.full_model}|{v.org_and_model}"]
+
+    # print('missing_results_for_task', missing_results_for_task)
+    for task, models in missing_results_for_task.items():
+        print(f"Missing results for {task} for {len(models)} models")
+        print(" ".join(models))
+
+
     return results
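The core of the commit is the metric-aware normalization in init_from_json_file: accuracy- and F1-style metrics are still reported as percentages, while word_perplexity is kept on its natural scale, and because the perplexity task's type is "other" it is also left out of the all_tasks / g_tasks / mc_tasks averages in to_dict. A small self-contained sketch of that branch, with a hypothetical results mapping standing in for a real results JSON:

import numpy as np

# Hypothetical raw scores keyed by metric name; illustrative values only.
raw_scores = {
    "acc,none": [0.61, 0.64],
    "word_perplexity,none": [812.4],
}

results = {}
for metric, values in raw_scores.items():
    accs = np.array(values)
    if accs.size == 0 or any(acc is None for acc in accs):
        continue
    # Mirrors the new branch: perplexity is stored as-is, everything else
    # is converted to a percentage.
    if "perplexity" in metric:
        mean_acc = np.mean(accs)
    else:
        mean_acc = np.mean(accs) * 100.0
    results[metric] = mean_acc

print(results["acc,none"])              # 62.5
print(results["word_perplexity,none"])  # 812.4

Since the headline average is now computed over g_tasks + mc_tasks only, the raw perplexity values never skew the Average ⬆️ column. The new loop at the end of get_raw_eval_results additionally reports, per n_shot/benchmark pair, how many models have no score yet, using the newly stored org_and_model string to identify them.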