Update app.py with new features from arena
app.py (CHANGED)
@@ -4,17 +4,12 @@ import glob
 import pickle
 import traceback
 import numpy as np
-from datetime import datetime

 import pandas as pd
 import gradio as gr
 import numpy as np


-basic_component_values = [None] * 6
-leader_component_values = [None] * 5
-
-
 promo_banner = """
 <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
 USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
@@ -23,18 +18,30 @@ promo_banner = """

 deprecated_model_name = [
     "GigaChat 3.1.25.3",
-    "GigaChat-Pro 2.2.25.3",
     "saiga_llama3_8b_v6",
     "saiga_phi3_medium",
     "GigaChat-Plus 3.1.25.3",
     "GigaChat-Pro 4.0.26.8",
     "GigaChat 4.0.26.8",
-    "xAI: Grok 2",
     "GigaChat-Pro 4.0.26.15",
     "GigaChat 4.0.26.15",
-    "YandexGPT Experimental", "yandex-gpt-arena"
 ]

 def make_default_md_1():
     leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
@@ -44,24 +51,24 @@ def make_default_md_1():
 """
     return leaderboard_md

-
 def make_default_md_2():
     leaderboard_md = f"""
-
 The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
 Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
 - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
 - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
 - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
 """
-
     return leaderboard_md


 def make_arena_leaderboard_md(arena_df, last_updated_time):
-    total_votes = sum(arena_df["num_battles"])
     total_models = len(arena_df)
-    space = "&nbsp;&nbsp;&nbsp;"

     leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
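The description above says ratings come from fitting a Bradley-Terry model to pairwise votes and reporting strengths on the Elo scale. A minimal sketch of that idea, with hypothetical battle data and a plain MM iteration (not the arena's actual fitting code):

```python
import numpy as np

# Toy pairwise results, (winner, loser). Model names are hypothetical.
battles = [("A", "B"), ("A", "B"), ("B", "A"), ("A", "C"), ("C", "B")]
models = sorted({m for pair in battles for m in pair})
idx = {m: i for i, m in enumerate(models)}

# Bradley-Terry: P(i beats j) = p_i / (p_i + p_j).
wins = np.zeros((len(models), len(models)))
for w, l in battles:
    wins[idx[w], idx[l]] += 1

# Simple minorization-maximization iteration for the strengths p.
p = np.ones(len(models))
for _ in range(100):
    for i in range(len(models)):
        num = wins[i].sum()  # total wins of model i
        den = sum((wins[i, j] + wins[j, i]) / (p[i] + p[j])
                  for j in range(len(models)) if j != i)
        p[i] = num / den if den > 0 else p[i]
    p /= p.sum()

# Report on the Elo scale: 400 * log10(p), shifted to a common anchor.
elo = 400 * np.log10(p)
elo += 1000 - elo.mean()
print(dict(zip(models, elo.round(1))))
```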
@@ -74,57 +81,166 @@ See Figure 1 below for a visualization of the confidence intervals of model ratings.


 def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
-    total_votes = sum(arena_df["num_battles"])
     total_models = len(arena_df)
-    space = "&nbsp;&nbsp;&nbsp;"
-    total_subset_votes = sum(arena_subset_df["num_battles"])
     total_subset_models = len(arena_subset_df)
-
 """
     return leaderboard_md

-
 def model_hyperlink(model_name, link):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


-def filter_deprecated_models_plots(fig, hidden_models=None):
     """

     Args:
         fig: The Plotly figure object.
-        hidden_models: A list of model names to remove.
     """
     if fig is None:
-        return
-
     return fig

 def load_leaderboard_table_csv(filename, add_hyperlink=True):
     lines = open(filename).readlines()

@@ -132,688 +248,842 @@ def load_leaderboard_table_csv(filename, add_hyperlink=True):
     rows = []
     for i in range(1, len(lines)):
         row = [v.strip() for v in lines[i].split(",")]
-        item = {}
-        for h, v in zip(heads, row):
-            if h == "Arena Elo rating":
-                if v != "-":
                     v = int(ast.literal_eval(v))
-                else:
-                    v = np.nan
-            elif h == "MT-bench (win rate %)":
-                if v != "-":
-                    v = round(ast.literal_eval(v[:-1]), 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (score)":
-                if v != "-":
-                    v = round(ast.literal_eval(v), 2)
-                else:
-                    v = np.nan
-            item[h] = v
-        if add_hyperlink:
             item["Model"] = model_hyperlink(item["Model"], item["Link"])
         rows.append(item)
-
     return rows

 def create_ranking_str(ranking, ranking_difference):
-    if ranking_difference > 0:
-        return f"{ranking} \u2191"
-    elif ranking_difference < 0:
-        return f"{ranking} \u2193"
-    else:
-        return f"{ranking}"


 def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
     ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
                 continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
     return list(ranking.values())


 def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
-    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
     if hidden_models:
-        arena_df = arena_df[~arena_df.index.isin(hidden_models)]

-    arena_df = arena_df.sort_values(
-        by=["final_ranking", "rating"], ascending=[True, False]
-    )

-    if arena_subset_df is not None:
-        # filter out models not in the arena_df
-        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
-        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
-        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
-        # keep only the models in the subset in arena_df and recompute final_ranking
-        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
-        # recompute final ranking
-        arena_df["final_ranking"] = recompute_final_ranking(arena_df)

-        # assign ranking by the order
-        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
-        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
-        # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(
-            arena_df["final_ranking"], rsuffix="_global", how="inner"
-        )
-        arena_df["ranking_difference"] = (
-            arena_df["final_ranking_global"] - arena_df["final_ranking"]
-        )

-        arena_df["final_ranking"] = arena_df.apply(
-            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
-            axis=1,
-        )

-    values = []
-    for i in range(len(arena_df)):
-        row = []
-        model_key = arena_df.index[i]
-        try:
-            model_name = model_table_df[model_table_df["key"] == model_key][
-                "Model"
-            ].values[0]
-            ranking = arena_df.iloc[i].get("final_ranking") or i + 1
-            row.append(ranking)
-            if arena_subset_df is not None:
-                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
-            row.append(model_name)
-            row.append(round(arena_df.iloc[i]["rating"]))
-            upper_diff = round(
-                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
-            )
-            lower_diff = round(
-                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
             )
-            row.append(f"+{upper_diff}/-{lower_diff}")
     return values

 key_to_category_name = {
-    "full": "Overall",
     "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts": "site_visitors/medium_prompts",
-    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
 }
 cat_name_to_explanation = {
     "Overall": "All queries",
-    "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
     "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
-    "site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts.",
 }
 cat_name_to_baseline = {
-    "Hard Prompts (English)": "English",
 }

 actual_categories = [
-    # "Overall",
-    # "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts",
     "site_visitors/medium_prompts:style control"
 ]


 def read_elo_file(elo_results_file, leaderboard_table_file):
     arena_dfs = {}
     category_elo_results = {}
     return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

 def build_leaderboard_tab(
     elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
 ):
     with gr.Row():
         with gr.Column(scale=4):
             md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
         with gr.Column(scale=1):
             vote_button = gr.Button("Vote!", link="https://llmarena.ru")
     md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
-    if leaderboard_table_file:
-        data = load_leaderboard_table_csv(leaderboard_table_file)

-                    value=[],
-                    info="",
-                )
-        default_category_details = make_category_arena_leaderboard_md(
-            arena_df, arena_df, name=selected_category
-        )
-        with gr.Column(scale=4, variant="panel"):
-            category_deets = gr.Markdown(
-                default_category_details, elem_id="category_deets"
-            )

-        arena_vals = pd.DataFrame(
-            arena_table_vals,
-            columns=[
-                "Rank* (UB)",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-        )
-        elo_display_df = gr.Dataframe(
-            headers=[
-                "Rank* (UB)",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-            datatype=[
-                "str",
-                "markdown",
-                "number",
-                "str",
-                "number",
-                "str",
-                "str",
-                "str",
-            ],
-            value=arena_vals.style,
-            elem_id="arena_leaderboard_dataframe",
-            height=700,
-            column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
-            wrap=True,
-        )

-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
-                    elem_id="plot-title",
-                )
-                plot_3 = gr.Plot(p3, show_label=False)
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
-                    elem_id="plot-title",
-                )
-                plot_4 = gr.Plot(p4, show_label=False)
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
-                    elem_id="plot-title",
-                )
-                plot_1 = gr.Plot(
-                    p1, show_label=False, elem_id="plot-container"
-                )
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
-                    elem_id="plot-title",
-                )
-                plot_2 = gr.Plot(p2, show_label=False)

-    if not show_plot:
-        gr.Markdown(
-            """
-            """,
-            elem_id="leaderboard_markdown",
-        )
-    else:
-        pass

-    def update_leaderboard_df(arena_table_vals):
-        elo_datarame = pd.DataFrame(
-            arena_table_vals,
            columns=[
-                "Rank* (UB)",
-                "Delta",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-        )

-        def highlight_max(s):
-            return [
-                "color: green; font-weight: bold"
-                if "\u2191" in v
-                else "color: red; font-weight: bold"
-                if "\u2193" in v
-                else ""
-                for v in s
            ]

-        def highlight_rank_max(s):
-            return [
-                "color: green; font-weight: bold"
-                if v > 0
-                else "color: red; font-weight: bold"
-                if v < 0
-                else ""
-                for v in s
-            ]

-    _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)

-        p1 = filter_deprecated_models_plots(
-            elo_subset_results["win_fraction_heatmap"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p2 = filter_deprecated_models_plots(
-            elo_subset_results["battle_count_heatmap"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p3 = filter_deprecated_models_plots(
-            elo_subset_results["bootstrap_elo_rating"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p4 = filter_deprecated_models_plots(
-            elo_subset_results["average_win_rate_bar"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        if category != "Overall":
-            arena_values = update_leaderboard_df(arena_values)
-            arena_values = gr.Dataframe(
-                headers=[
-                    "Rank* (UB)",
-                    "Delta",
-                    "Model",
-                    "Arena Elo",
-                    "95% CI",
-                    "Votes",
-                    "Organization",
-                    "License",
-                    "Knowledge Cutoff",
-                ],
-                datatype=[
-                    "str",
-                    "number",
-                    "markdown",
-                    "number",
-                    "str",
-                    "number",
-                    "str",
-                    "str",
-                    "str",
                 ],
-                value=arena_values,
-                elem_id="arena_leaderboard_dataframe",
-                height=700,
-                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
-                wrap=True,
             )
-        else:
-            arena_values = gr.Dataframe(
-                    "str",
-                    "str",
-                ],
-                value=arena_values,
-                elem_id="arena_leaderboard_dataframe",
-                height=700,
-                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
-                wrap=True,
             )

-    if show_plot and leaderboard_table_file:
-        return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
-    return [md_1]

-def build_demo(elo_results_file, leaderboard_table_file):
-    text_size = gr.themes.sizes.text_lg
-    theme = gr.themes.Base()
-    theme.text_size = text_size
-    theme.set(
-        button_large_text_size="40px",
-        button_small_text_size="40px",
-        button_large_text_weight="1000",
-        button_small_text_weight="1000",
-        button_shadow="*shadow_drop_lg",
-        button_shadow_hover="*shadow_drop_lg",
-        checkbox_label_shadow="*shadow_drop_lg",
-        button_shadow_active="*shadow_inset",
-        button_secondary_background_fill="*primary_300",
-        button_secondary_background_fill_dark="*primary_700",
-        button_secondary_background_fill_hover="*primary_200",
-        button_secondary_background_fill_hover_dark="*primary_500",
-        button_secondary_text_color="*primary_800",
-        button_secondary_text_color_dark="white",
-    )

-    with gr.Blocks(
-        title="LLM arena: leaderboard",
-        theme=theme,
-        css=block_css,
-    ) as demo:
-        build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
-        )
-    return demo

-block_css = """
-#leaderboard_markdown td {
-    padding-top: 6px;
-    padding-bottom: 6px;
-}
-#leaderboard_dataframe td {
-    line-height: 0.1em;
-}
-#about_markdown .prose {
-    font-size: 110% !important;
-}
-#ack_markdown .prose {
-    font-size: 110% !important;
-}
-#chatbot .prose {
-    font-size: 105% !important;
-}
-.sponsor-image-about img {
-    margin: 0 20px;
-    margin-top: 20px;
-    height: 40px;
-    max-height: 100%;
-    width: auto;
-    float: left;
-}

-.chatbot h1, h2, h3 {
-    margin-top: 8px; /* Adjust the value as needed */
-    margin-bottom: 0px; /* Adjust the value as needed */
-    padding-bottom: 0px;
-}

-.chatbot h3 {
-    font-size: 110%;
-}
-.chatbot p:not(:first-child) {
-    margin-top: 8px;
-}

-.typing {
-    display: inline-block;
-}

-.cursor {
-    display: inline-block;
-    width: 7px;
-    height: 1em;
-    background-color: black;
-    vertical-align: middle;
-    animation: blink 1s infinite;
-}

-.dark .cursor {
-    display: inline-block;
-    width: 7px;
-    height: 1em;
-    background-color: white;
-    vertical-align: middle;
-    animation: blink 1s infinite;
-}
-"""


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--share", action="store_true")
     parser.add_argument("--host", default="0.0.0.0")
     parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()

-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]

-    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
-    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
-    leaderboard_table_file = leaderboard_table_files[-1]

     demo = build_demo(elo_result_file, leaderboard_table_file)
app.py (updated)

 import pickle
 import traceback
 import numpy as np

 import pandas as pd
 import gradio as gr
 import numpy as np


 promo_banner = """
 <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
 USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE

 deprecated_model_name = [
     "GigaChat 3.1.25.3",
+    "GigaChat-Pro 2.2.25.3",
     "saiga_llama3_8b_v6",
     "saiga_phi3_medium",
     "GigaChat-Plus 3.1.25.3",
     "GigaChat-Pro 4.0.26.8",
     "GigaChat 4.0.26.8",
+    "xAI: Grok 2",
     "GigaChat-Pro 4.0.26.15",
     "GigaChat 4.0.26.15",
+    "YandexGPT Experimental", "yandex-gpt-arena",
+    "RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
+]
+
+models_10b = [
+    "saiga_llama3_8b_v7",
+    "Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
+    "T-lite-instruct-0.1",
+    "t-tech/T-lite-it-1.0",
+    "LLaMA-3 Chat (8B)",
+    "Llama 3.1 8B Instruct Turbo",
+    "MTSAIR/Cotype-Nano"
 ]

+
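A sketch of how these two lists can drive the leaderboard filters; the `all_models` list here is hypothetical (the real one comes from the Elo results):

```python
# Hypothetical current model list; the real one comes from the Elo results file.
all_models = ["GigaChat 4.0.26.8", "t-tech/T-lite-it-1.0", "some-new-model"]

# Hide deprecated models unless the "Show Deprecated" filter is on.
visible = [m for m in all_models if m not in deprecated_model_name]

# Keep only small models when the "<10B" filter is on.
small_only = [m for m in visible if m in models_10b]
print(visible, small_only)
```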
 def make_default_md_1():
     leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
"""
     return leaderboard_md

 def make_default_md_2():
     leaderboard_md = f"""
+
 The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
 Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
+
 - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
 - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
 - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
 """
     return leaderboard_md


 def make_arena_leaderboard_md(arena_df, last_updated_time):
+    # Using version from monitor.py (translated)
+    total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
     total_models = len(arena_df)
+    space = "&nbsp;&nbsp;&nbsp;"  # Using HTML space

     leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.


 def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
+    total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
     total_models = len(arena_df)
+    space = "&nbsp;&nbsp;&nbsp;"
+    total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
     total_subset_models = len(arena_subset_df)
+
+    perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
+    perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
+
+    leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
+#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({perc_votes}%)**{space}
 """
     return leaderboard_md
 def model_hyperlink(model_name, link):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


+def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
     """
+    Filters Plotly plots to show only top N models and optionally removes specific models.

     Args:
         fig: The Plotly figure object.
+        hidden_models (list, optional): A list of model names to remove. Defaults to None.
+        limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
+
+    Returns:
+        Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
     """
     if fig is None:
+        return None
+
+    # Check if the figure has data
+    if not hasattr(fig, 'data') or len(fig.data) == 0:
+        return fig
+
+    # Check if data has a type attribute
+    if not hasattr(fig.data[0], 'type'):
         return fig

+    # Check minimum number of models after initial hidden_models filtering
+    models_to_check = []
+    if hasattr(fig.data[0], 'x'):
+        models_to_check = fig.data[0].x
+    elif hasattr(fig.data[0], 'y'):  # For some types like bar, X axis might be numeric
+        models_to_check = fig.data[0].y
+
+    if hidden_models is not None and models_to_check.any():
+        available_models = [x for x in models_to_check if x not in hidden_models]
+        # print(f"Available models before top N: {len(available_models)}")  # Debug
+        if len(available_models) <= 2:  # If less than 3 models remain before top_n
+            # print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
+            return fig  # Return the original plot if too few models
+
+    if limit_to_top is not None and limit_to_top <= 0:
+        limit_to_top = None
+
+    try:
+        # Work on a deep copy to avoid modifying the original figure object
+        fig_copy = pickle.loads(pickle.dumps(fig))
+        data = fig_copy.data[0]
+
+        if data.type == 'heatmap':
+            # Apply hidden models filter
+            mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
+            mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
+
+            # Get initially filtered X and Y arrays
+            filtered_x = np.array(data.x)[mask_x]
+            filtered_y = np.array(data.y)[mask_y]
+
+            # Apply top N limit (assuming the order is already by rank/rating)
+            if limit_to_top is not None and len(filtered_x) > limit_to_top:
+                top_models = filtered_x[:limit_to_top]
+                # Create new masks based on the top models relative to the *original* data axes
+                mask_x = np.isin(data.x, top_models)
+                mask_y = np.isin(data.y, top_models)
+                # Get final filtered axes
+                filtered_x = np.array(data.x)[mask_x]
+                filtered_y = np.array(data.y)[mask_y]
+            elif len(filtered_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the heatmap data
+            data.x = filtered_x
+            data.y = filtered_y
+            # Important: Indexing 'z' must use masks derived from the *original* data order
+            z_original = np.array(fig.data[0].z)
+            data.z = z_original[np.ix_(mask_y, mask_x)]
+
+        elif data.type == 'scatter':
+            trace = data
+            # Apply hidden models filter
+            mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
+
+            # Get initially filtered arrays
+            current_x = np.array(trace.x)[mask]
+            current_y = np.array(trace.y)[mask]
+            current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
+            # Handle error bars safely
+            current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
+            current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
+
+            # Apply top N limit
+            if limit_to_top is not None and len(current_x) > limit_to_top:
+                # Sort by y-value (rating) descending to find the top N
+                sort_indices = np.argsort(-current_y)[:limit_to_top]
+                current_x = current_x[sort_indices]
+                current_y = current_y[sort_indices]
+                if current_text is not None:
+                    current_text = current_text[sort_indices]
+                if current_error_y_array is not None:
+                    current_error_y_array = current_error_y_array[sort_indices]
+                if current_error_y_arrayminus is not None:
+                    current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
+            elif len(current_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the scatter trace data
+            trace.x, trace.y = current_x, current_y
+            if current_text is not None:
+                trace.text = current_text
+            # Update error bars if they exist
+            if current_error_y_array is not None:
+                # Ensure error_y exists before assigning
+                if 'error_y' not in trace: trace.error_y = {}
+                trace.error_y['array'] = current_error_y_array
+            if current_error_y_arrayminus is not None:
+                if 'error_y' not in trace: trace.error_y = {}
+                trace.error_y['arrayminus'] = current_error_y_arrayminus
+
+        elif data.type == 'bar':
+            trace = data
+            # Apply hidden models filter
+            mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
+
+            # Get initially filtered arrays
+            current_x = np.array(trace.x)[mask]
+            current_y = np.array(trace.y)[mask]
+
+            # Apply top N limit
+            if limit_to_top is not None and len(current_x) > limit_to_top:
+                # Sort by y-value (rating) descending
+                sort_indices = np.argsort(-current_y)[:limit_to_top]
+                current_x = current_x[sort_indices]
+                current_y = current_y[sort_indices]
+            elif len(current_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the bar trace data
+            trace.x, trace.y = current_x, current_y
+
+        return fig_copy
+
+    except Exception as e:
+        print(f"Error filtering plot: {e}")
+        traceback.print_exc()
+        return fig  # Return original figure on error
+

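A usage sketch for `filter_deprecated_models_plots`, assuming plotly is installed; the toy figure stands in for the precomputed arena plots. Numpy arrays are used for the axes because the function calls `.any()` and `np.isin` on them:

```python
import numpy as np
import plotly.graph_objects as go

# Toy stand-in for a precomputed arena plot such as "average_win_rate_bar".
fig = go.Figure(go.Bar(
    x=np.array(["model-a", "model-b", "model-c", "model-d"]),
    y=np.array([0.7, 0.6, 0.5, 0.4]),
))

# Drop one model; at most the top 25 (the default) of the rest are kept.
filtered = filter_deprecated_models_plots(fig, hidden_models=["model-b"])
print(filtered.data[0].x)  # "model-b" is gone; the original fig is untouched
```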
 def load_leaderboard_table_csv(filename, add_hyperlink=True):
     lines = open(filename).readlines()
     rows = []
     for i in range(1, len(lines)):
         row = [v.strip() for v in lines[i].split(",")]
+        item = {}  # Create dictionary once per row
+        for h, v in zip(heads, row):
+            if h == "Arena Elo rating":
+                if v != "-":
+                    try:
                         v = int(ast.literal_eval(v))
+                    except:
+                        v = np.nan  # Handle parsing errors
+                else:
+                    v = np.nan
+            item[h] = v
+        if add_hyperlink and "Model" in item and "Link" in item:  # Check keys exist
+            # Check for empty/missing link
+            if item["Link"] and item["Link"] != "-":
                 item["Model"] = model_hyperlink(item["Model"], item["Link"])
+            # Otherwise, keep the model name as is
         rows.append(item)
     return rows

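For reference, a sketch of the CSV shape this loader appears to expect; `heads` is parsed from the header row on a line elided above, and the column set here is illustrative:

```python
csv_text = (
    "key,Model,Arena Elo rating,Link,Organization,License,Knowledge cutoff date\n"
    "gigachat,GigaChat,1005,https://example.com,SberDevices,Proprietary,-\n"
)
with open("leaderboard_table_demo.csv", "w") as f:
    f.write(csv_text)

rows = load_leaderboard_table_csv("leaderboard_table_demo.csv")
print(rows[0]["Model"])  # rendered as an <a> hyperlink
```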
 def create_ranking_str(ranking, ranking_difference):
+    # Convert rank to int before comparison
+    try:
+        # Ensure rank and difference are treated as numbers
+        ranking_val = int(float(ranking))  # Handle potential float input
+        ranking_difference_val = int(float(ranking_difference))
+        if ranking_difference_val > 0:
+            return f"{ranking_val} \u2191"
+        elif ranking_difference_val < 0:
+            return f"{ranking_val} \u2193"
+        else:
+            return f"{ranking_val}"
+    except (ValueError, TypeError):  # Handle cases where rank is not numeric
+        return str(ranking)

 def recompute_final_ranking(arena_df):
     ranking = {}
+    if arena_df.empty:
+        return []
+
+    model_indices = arena_df.index
+    # Ensure CI columns exist before trying to access them
+    if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
+        print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
+        # Return NaN or simple rank based on order
+        return [np.nan] * len(model_indices)  # Or range(1, len(model_indices) + 1)
+
+    ratings_q025 = arena_df["rating_q025"].to_dict()
+    ratings_q975 = arena_df["rating_q975"].to_dict()
+
+    for model_a in model_indices:
+        rank = 1
+        rating_a_q975 = ratings_q975.get(model_a)
+        # Skip if model A has no CI data
+        if pd.isna(rating_a_q975):
+            ranking[model_a] = np.nan  # Or assign max rank + 1
+            continue
+
+        for model_b in model_indices:
+            if model_a == model_b:
                 continue
+
+            rating_b_q025 = ratings_q025.get(model_b)
+            # Skip comparison if model B has no CI data
+            if pd.isna(rating_b_q025):
+                continue
+
+            # Check if B is statistically better than A
+            if rating_b_q025 > rating_a_q975:
+                rank += 1
+        ranking[model_a] = rank
     return list(ranking.values())

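A toy run of the upper-bound ranking rule implemented above: a model's rank is 1 plus the number of models whose lower confidence bound exceeds its upper bound, so models with overlapping intervals tie:

```python
import pandas as pd

toy = pd.DataFrame(
    {
        "rating":      [1100, 1080, 1000],
        "rating_q025": [1070, 1040,  980],
        "rating_q975": [1130, 1120, 1020],
    },
    index=["model-a", "model-b", "model-c"],
)
print(recompute_final_ranking(toy))  # [1, 1, 3]: a and b overlap, both beat c
```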
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
+    """
+    Generates the leaderboard table data.
+    'use_cache' parameter removed.
+    """
+    # print(f'Calculating get_arena_table')  # Debug
+
+    # Create copies to avoid modifying original DataFrames
+    arena_df_processed = arena_df.copy()
+    if arena_subset_df is not None:
+        arena_subset_df_processed = arena_subset_df.copy()
+    else:
+        arena_subset_df_processed = None
+
+    # Sort by rating initially to have a stable order before ranking
+    arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
+    # Compute 'final_ranking' based on CIs if possible
+    if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
+        arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
+        arena_df_processed = arena_df_processed.sort_values(
+            by=["final_ranking", "rating"], ascending=[True, False]
+        )
+    else:
+        # Fallback to simple ordering if CI columns are missing
+        arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
+
     if hidden_models:
+        arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
+        # Recompute ranks for the filtered view
+        if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
+            arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
+            # Re-sort based on new ranks
+            arena_df_processed = arena_df_processed.sort_values(
+                by=["final_ranking", "rating"], ascending=[True, False]
+            )
+        else:
+            arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)

+    if arena_subset_df_processed is not None:
+        # Filter subset by hidden_models first
+        if hidden_models:
+            arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()

+        # Ensure models in the subset are also present in the (filtered) main view
+        arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]

+        # Proceed only if subset is not empty and has CI columns
+        if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
+            # Rank within the subset
+            arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
+            arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed)  # Rank within category

+            # Filter the main processed DF to only include models from the subset
+            # 'final_ranking' here represents the rank *among these models* in the baseline category view
+            arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
+            arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)

+            # Join the subset ranks and baseline ranks
+            arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
+                arena_df_for_join["final_ranking_baseline"], how="inner"
             )
+
+            # Calculate rank difference
+            arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
+
+            # Sort by subset rank and rating
+            arena_df_combined = arena_df_combined.sort_values(
+                by=["final_ranking_subset", "rating"], ascending=[True, False]
             )
+
+            # Format the rank string with delta for display
+            arena_df_combined["display_ranking"] = arena_df_combined.apply(
+                lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
+                axis=1,
             )
+            arena_df_processed = arena_df_processed.loc[arena_df_combined.index]  # Reorder arena_df_processed
+
+            columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
+            columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
+            arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
+
+            # Now sorting should work as the column exists
+            # Use the subset rank for final sorting if subset is active
+            # Check if 'final_ranking_subset' was successfully joined before sorting
+            if "final_ranking_subset" in arena_df_processed.columns:
+                arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
             else:
+                # Fallback sort if join failed for some reason
+                arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
+
+        else:
+            # If subset is empty or lacks CI, disable subset logic
+            arena_subset_df_processed = None
+            # Use the baseline ranking as the display ranking
+            arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
+            arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
+
+    else:
+        # If no subset is used, display ranking is just the final rank from the main DF
+        arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
+        # Ensure it's sorted correctly
+        arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
+
+    values = []
+    # Iterate using the final sorted index of arena_df_processed
+    for model_key in arena_df_processed.index:
+        row_data = arena_df_processed.loc[model_key]
+        # Find model metadata
+        model_info = model_table_df[model_table_df["key"] == model_key]
+        if model_info.empty:
+            # print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
+            continue  # Skip if no metadata
+
+        row = []
+        # Rank (Display)
+        row.append(row_data.get("display_ranking", ""))  # Use the calculated display rank
+
+        # Delta (only if subset was processed successfully)
+        if arena_subset_df_processed is not None:
+            row.append(row_data.get("ranking_difference", 0))
+
+        # Model Name (hyperlink applied during loading)
+        row.append(model_info["Model"].values[0])
+
+        # Arena Elo
+        row.append(round(row_data["rating"]))
+
+        # 95% CI
+        # Check for NaN before calculation
+        upper_rating = row_data.get("rating_q975")
+        lower_rating = row_data.get("rating_q025")
+        current_rating = row_data.get("rating")
+        upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
+        lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
+        row.append(f"+{upper_diff}/-{lower_diff}")
+
+        # Votes
+        row.append(round(row_data["num_battles"]))
+
+        # Organization
+        row.append(model_info["Organization"].values[0])
+
+        # License
+        row.append(model_info["License"].values[0])
+
+        # Knowledge Cutoff
+        cutoff_date = model_info["Knowledge cutoff date"].values[0]
+        row.append("Unknown" if cutoff_date == "-" else cutoff_date)
+
+        values.append(row)
+
     return values

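A usage sketch for `get_arena_table` with hypothetical toy frames; the column order of each returned row mirrors the headers used by the UI below:

```python
import pandas as pd

arena_toy = pd.DataFrame(
    {
        "rating": [1100.0, 1000.0],
        "rating_q025": [1080.0, 980.0],
        "rating_q975": [1120.0, 1020.0],
        "num_battles": [400, 300],
    },
    index=["model-a", "model-b"],
)
model_table_toy = pd.DataFrame(
    {
        "key": ["model-a", "model-b"],
        "Model": ["Model A", "Model B"],
        "Organization": ["Org A", "Org B"],
        "License": ["MIT", "Proprietary"],
        "Knowledge cutoff date": ["2024-01", "-"],
    }
)
table = get_arena_table(arena_toy, model_table_toy)
# Each row: [rank, model, elo, 95% CI, votes, org, license, cutoff]
print(table[0])  # ['1', 'Model A', 1100, '+20/-20', 400, 'Org A', 'MIT', '2024-01']
```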
 key_to_category_name = {
+    # Mapping from internal key to display name (kept English for consistency)
+    "full": "Overall",  # Might not be used if filtered out later
     "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts": "site_visitors/medium_prompts",
+    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control"  # Use underscore for display consistency if needed
 }
 cat_name_to_explanation = {
+    # Translated explanations for display
     "Overall": "All queries",
+    "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
     "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
+    "site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
 }
 cat_name_to_baseline = {
+    # Baseline category for comparison (if needed, seems unused now but kept)
+    # "Hard Prompts (English)": "English",
 }

 actual_categories = [
+    # Categories available in the dropdown (use the *keys* from key_to_category_name)
+    # "Overall",  # Removed
+    # "crowdsourcing/simple_prompts",  # Removed
     "site_visitors/medium_prompts",
     "site_visitors/medium_prompts:style control"
 ]
+# Default selected category key
+req_cat_key = "site_visitors/medium_prompts:style control"
+selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
+# Get the display name for the selected category
+selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key)  # Fallback to key if not found

 def read_elo_file(elo_results_file, leaderboard_table_file):
+    # Version from monitor.py, but no lazy_load or caching
+    print('Reading Elo file...')
     arena_dfs = {}
     category_elo_results = {}
+    last_updated_time = "N/A"  # Default value
+    elo_results = {}  # Default value
+    model_table_df = pd.DataFrame()  # Default value
+
+    try:
+        # Use context manager for file operations
+        with open(elo_results_file, "rb") as fin:
+            elo_results = pickle.load(fin)
+
+        # Try to get last updated time from primary or fallback categories
+        main_cat_key = "site_visitors/medium_prompts:style control"
+        fallback_cat_key_1 = "site_visitors/medium_prompts"
+        fallback_cat_key_2 = "full"  # Another fallback
+
+        if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
+            last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
+        elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
+            last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
+        elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
+            last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
+
+        # Iterate through defined category keys
+        for key in key_to_category_name.keys():
+            display_name = key_to_category_name[key]  # Get the display name
+            if key in elo_results:
+                # Check for required data within the category result
+                if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
+                    df = elo_results[key]["leaderboard_table_df"]
+                    # Filter by number of battles > 200
+                    # Store using the *display_name* as the key for consistency with dropdown/UI
+                    arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
+                    category_elo_results[display_name] = elo_results[key]
+                # else:
+                #     print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
+            # else:
+            #     print(f"Warning: Key '{key}' not found in elo_results")
+
+        # Load model metadata CSV
+        data = load_leaderboard_table_csv(leaderboard_table_file)
+        model_table_df = pd.DataFrame(data)

+    except FileNotFoundError:
+        print(f"Error: Elo results file not found at {elo_results_file}")
+        # Return empty structures
+    except Exception as e:
+        print(f"Error reading elo file: {e}")
+        traceback.print_exc()
+        # Return empty structures

+    # Ensure correct data types are returned even on error
     return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

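Inferred from the reader above, `elo_results_*.pkl` is a dict keyed by category, each entry carrying `last_updated_datetime`, a `leaderboard_table_df`, and precomputed plot objects. A minimal hypothetical fixture for local testing:

```python
import pickle
import pandas as pd

fixture = {
    "site_visitors/medium_prompts:style control": {
        "last_updated_datetime": "2025-01-01 00:00:00",
        "leaderboard_table_df": pd.DataFrame(
            {
                "rating": [1100.0],
                "rating_q025": [1080.0],
                "rating_q975": [1120.0],
                "num_battles": [500],  # > 200, so it passes the filter
            },
            index=["model-a"],
        ),
        # Plot entries such as "win_fraction_heatmap" would sit here too.
    }
}
with open("elo_results_demo.pkl", "wb") as fout:
    pickle.dump(fixture, fout)

# read_elo_file("elo_results_demo.pkl", "leaderboard_table_demo.csv")
```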
| 578 |
def build_leaderboard_tab(
|
| 579 |
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
|
| 580 |
):
|
| 581 |
+
# Load data once during build time
|
| 582 |
+
try:
|
| 583 |
+
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
| 584 |
+
except Exception as e:
|
| 585 |
+
print(f"Failed to load initial data: {e}")
|
| 586 |
+
# Set empty defaults to prevent app crash
|
| 587 |
+
last_updated_time = "Error"
|
| 588 |
+
arena_dfs = {}
|
| 589 |
+
category_elo_results = {}
|
| 590 |
+
elo_results = {}
|
| 591 |
+
model_table_df = pd.DataFrame()
|
| 592 |
+
|
| 593 |
+
# Get data for the default selected category
|
| 594 |
+
# Use the *display name* derived from the selected key
|
| 595 |
+
if selected_category_display_name in arena_dfs:
|
| 596 |
+
arena_df = arena_dfs[selected_category_display_name]
|
| 597 |
+
elo_subset_results_init = category_elo_results[selected_category_display_name]
|
| 598 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
| 599 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
| 600 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
| 601 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
| 602 |
+
else:
|
| 603 |
+
# Fallback if default category is missing
|
| 604 |
+
fallback_cat_display_name = None
|
| 605 |
+
if actual_categories:
|
| 606 |
+
# Try the first actual category's display name
|
| 607 |
+
first_cat_key = actual_categories[0]
|
| 608 |
+
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
|
| 609 |
+
|
| 610 |
+
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
|
| 611 |
+
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
|
| 612 |
+
arena_df = arena_dfs[fallback_cat_display_name]
|
| 613 |
+
elo_subset_results_init = category_elo_results[fallback_cat_display_name]
|
| 614 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
| 615 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
| 616 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
| 617 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
| 618 |
+
else:
|
| 619 |
+
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
|
| 620 |
+
arena_df = pd.DataFrame() # Empty DataFrame
|
| 621 |
+
p1_init, p2_init, p3_init, p4_init = None, None, None, None
|
| 622 |
+
|
| 623 |
+
# Apply initial filtering to plots
|
| 624 |
+
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
|
| 625 |
+
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
|
| 626 |
+
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
|
| 627 |
+
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
|
| 628 |
+
|
| 629 |
+
default_md = make_default_md_1() # Parameters removed
|
| 630 |
+
default_md_2 = make_default_md_2() # Parameters removed
|
| 631 |
+
|
| 632 |
with gr.Row():
|
| 633 |
with gr.Column(scale=4):
|
| 634 |
+
# Removed Vote button
|
| 635 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
| 636 |
with gr.Column(scale=1):
|
| 637 |
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
|
| 638 |
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
|
| 639 |
|
| 640 |
+
# Generate initial table data
|
| 641 |
+
if not arena_df.empty and not model_table_df.empty:
|
| 642 |
+
# Pass the baseline DF and the model table; initially no subset difference is shown
|
| 643 |
+
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
|
| 644 |
+
else:
|
| 645 |
+
arena_table_vals_init = []
|
| 646 |
+
|
| 647 |
+
# Single "Arena" tab
|
| 648 |
+
with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
|
| 649 |
+
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
|
| 650 |
+
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
|
| 651 |
+
|
| 652 |
+
with gr.Row():
|
| 653 |
+
with gr.Column(scale=2):
|
| 654 |
+
# Use *display names* for choices if they differ significantly from keys,
|
| 655 |
+
# but here keys are descriptive enough. Callback receives the *key*.
|
| 656 |
+
category_dropdown = gr.Dropdown(
|
| 657 |
+
# Choices should be the *keys* corresponding to display names
|
| 658 |
+
choices=actual_categories,
|
| 659 |
+
value=selected_category_key, # Use the key for the default value
|
| 660 |
+
label="Category", # Translated
|
| 661 |
)
|
| 662 |
|
| 663 |
+
with gr.Column(scale=2):
|
| 664 |
+
category_checkbox = gr.CheckboxGroup(
|
| 665 |
+
# Use user-friendly translated labels
|
| 666 |
+
["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
|
| 667 |
+
label="Apply Filter",
|
| 668 |
+
info="",
|
| 669 |
+
value=[], # Filters off by default
|
| 670 |
)
|
| 671 |
|
| 672 |
+
# Category details
|
| 673 |
+
default_category_details = make_category_arena_leaderboard_md(
|
| 674 |
+
arena_df, arena_df, name=selected_category_display_name # Pass arena_df twice for initial display
|
| 675 |
+
) if not arena_df.empty else "No data for category"
|
| 676 |
|
| 677 |
+
with gr.Column(scale=4, variant="panel"):
|
| 678 |
+
category_deets = gr.Markdown(
|
| 679 |
+
default_category_details, elem_id="category_deets"
|
| 680 |
+
)
|
| 681 |
|
| 682 |
+
# DataFrame for displaying the table
|
| 683 |
+
# Initial view doesn't have 'Delta' column
|
| 684 |
+
arena_vals = pd.DataFrame(
|
| 685 |
+
arena_table_vals_init,
|
| 686 |
columns=[
|
| 687 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 688 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 689 |
]
|
| 690 |
+
) if arena_table_vals_init else pd.DataFrame(columns=[ # Handle empty initial data
|
| 691 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 692 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 693 |
+
])
|
| 694 |
+
|
| 695 |
+
# Sort by Elo for initial display
|
| 696 |
+
if "Arena Elo" in arena_vals.columns:
|
| 697 |
+
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
|
| 698 |
| 699 |
|
| 700 |
+
elo_display_df = gr.Dataframe(
|
| 701 |
+
headers=[ # Translated headers
|
| 702 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 703 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 704 |
+
],
|
| 705 |
+
datatype=[
|
| 706 |
+
"str", "markdown", "number", "str",
|
| 707 |
+
"number", "str", "str", "str"
|
| 708 |
+
],
|
| 709 |
+
value=arena_vals.style, # Apply Pandas styling if needed
|
| 710 |
+
elem_id="arena_leaderboard_dataframe",
|
| 711 |
+
height=700,
|
| 712 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths from monitor.py
|
| 713 |
+
wrap=True,
|
| 714 |
)
|
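+
# Note: gr.Dataframe accepts a pandas Styler as its value, which is how per-cell styling survives into the rendered table
|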
| 715 |
|
| 716 |
+
gr.Markdown(elem_id="leaderboard_markdown") # Empty markdown for spacing
|
| 717 |
|
| 718 |
+
plot_1, plot_2, plot_3, plot_4 = None, None, None, None # Initialize plot variables
|
| 719 |
+
more_stats_md = None # Initialize markdown variable
|
| 720 |
+
if show_plot:
|
| 721 |
+
more_stats_md = gr.Markdown(
|
| 722 |
+
f"""## More Statistics for Chatbot Arena""", # Translated
|
| 723 |
+
elem_id="leaderboard_header_markdown",
|
| 724 |
+
)
|
| 725 |
+
with gr.Row(elem_id="leaderboard_bars"): # Use ID from monitor.py
|
| 726 |
+
with gr.Column():
|
| 727 |
+
gr.Markdown( # Translated title
|
| 728 |
+
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
|
| 729 |
+
elem_id="plot-title",
|
| 730 |
+
)
|
| 731 |
+
plot_3 = gr.Plot(p3_init, show_label=False) # Use initial data
|
| 732 |
+
with gr.Column():
|
| 733 |
+
gr.Markdown( # Translated title
|
| 734 |
+
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
|
| 735 |
+
elem_id="plot-title",
|
| 736 |
+
)
|
| 737 |
+
plot_4 = gr.Plot(p4_init, show_label=False) # Use initial data
|
| 738 |
+
with gr.Row(elem_id="leaderboard_plots"): # Use ID from monitor.py
|
| 739 |
+
with gr.Column():
|
| 740 |
+
gr.Markdown( # Translated title
|
| 741 |
+
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
|
| 742 |
+
elem_id="plot-title",
|
| 743 |
+
)
|
| 744 |
+
plot_1 = gr.Plot(
|
| 745 |
+
p1_init, show_label=False, elem_id="plot-container" # Use initial data
|
| 746 |
+
)
|
| 747 |
+
with gr.Column():
|
| 748 |
+
gr.Markdown( # Translated title
|
| 749 |
+
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
|
| 750 |
+
elem_id="plot-title",
|
| 751 |
+
)
|
| 752 |
+
plot_2 = gr.Plot(p2_init, show_label=False) # Use initial data
|
| 753 |
|
| 754 |
+
def update_leaderboard_df(arena_table_vals):
|
| 755 |
+
# Add error handling for empty or incorrect data
|
| 756 |
+
# Expects 9 columns when Delta is present
|
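+
# Illustrative row shape (hypothetical values): ["1", 2, "<a ...>model</a>", 1500, "+5/-5", 12345, "Org", "MIT", "2024-01"]
|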
| 757 |
+
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
|
| 758 |
+
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
|
| 759 |
+
# Return an empty styled DataFrame to avoid Gradio errors
|
| 760 |
+
empty_styled = pd.DataFrame(columns=[
|
| 761 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 762 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 763 |
+
]).style
|
| 764 |
+
return empty_styled
|
| 765 |
|
| 766 |
+
try:
|
| 767 |
+
elo_dataframe = pd.DataFrame(
|
| 768 |
+
arena_table_vals,
|
| 769 |
+
columns=[
|
| 770 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 771 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 772 |
],
|
| 773 |
)
|
| 774 |
+
|
| 775 |
+
def highlight_max(s):
|
| 776 |
+
# Check rank string for arrows
|
| 777 |
+
return [
|
| 778 |
+
"color: green; font-weight: bold" if "β" in str(v) else
|
| 779 |
+
"color: red; font-weight: bold" if "β" in str(v) else ""
|
| 780 |
+
for v in s
|
| 781 |
+
]
|
| 782 |
+
|
| 783 |
+
def highlight_rank_max(s):
|
| 784 |
+
# Check Delta value (ensure it's numeric)
|
| 785 |
+
return [
|
| 786 |
+
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
|
| 787 |
+
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
|
| 788 |
+
for v in s
|
| 789 |
+
]
|
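+
# Styler.apply works column-wise by default: each subset column is passed as a Series and the helpers return one CSS string per cell
|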
| 790 |
+
# Apply styles
|
| 791 |
+
styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
|
| 792 |
+
highlight_rank_max, subset=["Delta"]
|
| 793 |
)
|
| 794 |
+
return styled_df
|
| 795 |
|
| 796 |
+
except Exception as e:
|
| 797 |
+
print(f"Error applying styles in update_leaderboard_df: {e}")
|
| 798 |
+
traceback.print_exc()
|
| 799 |
+
# Return unstyled DataFrame on error
|
| 800 |
+
return pd.DataFrame(arena_table_vals, columns=[
|
| 801 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 802 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 803 |
+
]).style
|
| 804 |
+
|
| 805 |
+
def update_leaderboard_and_plots(category_key, filters): # Receives category *key* from dropdown
|
| 806 |
+
# No caching
|
| 807 |
+
# Reload data on each call
|
| 808 |
+
try:
|
| 809 |
+
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
| 810 |
+
except Exception as e:
|
| 811 |
+
print(f"Error reloading data in callback: {e}")
|
| 812 |
+
# Return empty updates to prevent UI crash
|
| 813 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) # Empty DataFrame
|
| 814 |
+
empty_plot_update = gr.Plot(value=None) # Empty Plot
|
| 815 |
+
empty_md_update = gr.Markdown(value="Error loading data.") # Error Markdown
|
| 816 |
+
# Match the number of outputs expected by the .change() call
|
| 817 |
+
num_plots = 4  # outputs_list always registers four plot slots (hidden placeholders when show_plot is False)
|
| 818 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
| 819 |
+
|
| 820 |
+
|
| 821 |
+
# Use the display name corresponding to the selected key
|
| 822 |
+
category_display_name = key_to_category_name.get(category_key, category_key)
|
| 823 |
+
|
| 824 |
+
# Check if data exists for the selected category (using display name as key now)
|
| 825 |
+
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
|
| 826 |
+
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
|
| 827 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
|
| 828 |
+
empty_plot_update = gr.Plot(value=None)
|
| 829 |
+
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
|
| 830 |
+
num_plots = 4  # outputs_list always registers four plot slots (hidden placeholders when show_plot is False)
|
| 831 |
+
# Match the number of outputs
|
| 832 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
| 833 |
+
|
| 834 |
+
# Get the specific data slices using the display name
|
| 835 |
+
arena_subset_df = current_arena_dfs[category_display_name]
|
| 836 |
+
elo_subset_results = current_category_elo_results[category_display_name]
|
| 837 |
+
|
| 838 |
+
# Use the hardcoded baseline key, get its display name
|
| 839 |
+
baseline_key = "site_visitors/medium_prompts:style control"
|
| 840 |
+
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
|
| 841 |
+
|
| 842 |
+
# Fallback if baseline is missing
|
| 843 |
+
if baseline_display_name not in current_arena_dfs:
|
| 844 |
+
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
|
| 845 |
+
baseline_display_name = category_display_name # Fallback to the selected category itself
|
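+
# With baseline == selected category, the Delta column is skipped further down, so the fallback stays consistent
|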
| 846 |
+
|
| 847 |
+
arena_df_baseline = current_arena_dfs[baseline_display_name]
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
hidden_models_list = None # Default: show all
|
| 851 |
+
# Check filter labels (must match the translated CheckboxGroup choices)
|
| 852 |
+
if "Show Deprecated" not in filters:
|
| 853 |
+
hidden_models_list = deprecated_model_name.copy() # Hide deprecated
|
| 854 |
+
|
| 855 |
+
if "Only <10B Models" in filters:
|
| 856 |
+
# Get all models currently in the baseline view
|
| 857 |
+
all_models_in_view = arena_df_baseline.index.tolist()
|
| 858 |
+
# Find models *not* in the allowed list
|
| 859 |
+
models_to_hide = [model for model in all_models_in_view if model not in models_10b]
|
| 860 |
+
|
| 861 |
+
if hidden_models_list is None: # If deprecated are not hidden
|
| 862 |
+
hidden_models_list = models_to_hide
|
| 863 |
+
else: # If deprecated are already hidden, add the non-<10B ones
|
| 864 |
+
# Use set to avoid duplicates
|
| 865 |
+
hidden_models_list = list(set(hidden_models_list + models_to_hide))
|
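+
# e.g. with both filters active this hides the union of deprecated models and models not in models_10b
|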
| 866 |
+
|
| 867 |
+
arena_table_values = get_arena_table(
|
| 868 |
+
arena_df_baseline, # Use the determined baseline DataFrame
|
| 869 |
+
current_model_table_df,
|
| 870 |
+
# Pass subset only if it's different from the baseline
|
| 871 |
+
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
|
| 872 |
+
hidden_models=hidden_models_list
|
| 873 |
)
|
| 874 |
|
| 875 |
+
dataframe_update = None
|
| 876 |
+
# Show Delta column only if category is not the baseline and data exists
|
| 877 |
+
if category_display_name != baseline_display_name and arena_table_values:
|
| 878 |
+
styled_arena_values = update_leaderboard_df(arena_table_values) # Apply styling with Delta
|
| 879 |
+
# Check if styling was successful
|
| 880 |
+
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
|
| 881 |
+
dataframe_update = gr.Dataframe(
|
| 882 |
+
headers=[ # Headers including Delta
|
| 883 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 884 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 885 |
+
],
|
| 886 |
+
datatype=[
|
| 887 |
+
"str", "number", "markdown", "number", "str",
|
| 888 |
+
"number", "str", "str", "str"
|
| 889 |
+
],
|
| 890 |
+
value=styled_arena_values, # Pass the Styler object
|
| 891 |
+
elem_id="arena_leaderboard_dataframe",
|
| 892 |
+
height=700,
|
| 893 |
+
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], # Widths with Delta
|
| 894 |
+
wrap=True,
|
| 895 |
+
)
|
| 896 |
+
else: # Handle styling failure
|
| 897 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
| 898 |
+
|
| 899 |
+
else: # Baseline category or no data for Delta
|
| 900 |
+
# Ensure data exists before creating DataFrame
|
| 901 |
+
if arena_table_values:
|
| 902 |
+
# Create DataFrame without Delta column from the raw values
|
| 903 |
+
df_no_delta = pd.DataFrame(arena_table_values, columns=[
|
| 904 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 905 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 906 |
+
])
|
| 907 |
+
dataframe_update = gr.Dataframe(
|
| 908 |
+
headers=[ # Headers without Delta
|
| 909 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 910 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 911 |
+
],
|
| 912 |
+
datatype=[
|
| 913 |
+
"str", "markdown", "number", "str", "number",
|
| 914 |
+
"str", "str", "str"
|
| 915 |
+
],
|
| 916 |
+
value=df_no_delta.style, # Apply basic Pandas styling
|
| 917 |
+
elem_id="arena_leaderboard_dataframe",
|
| 918 |
+
height=700,
|
| 919 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths without Delta
|
| 920 |
+
wrap=True,
|
| 921 |
+
)
|
| 922 |
+
else:
|
| 923 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
| 924 |
+
|
| 925 |
+
plot_updates = [gr.Plot(value=None)] * 4 # Default empty plot updates
|
| 926 |
+
if show_plot:
|
| 927 |
+
p1_updated = elo_subset_results.get("win_fraction_heatmap")
|
| 928 |
+
p2_updated = elo_subset_results.get("battle_count_heatmap")
|
| 929 |
+
p3_updated = elo_subset_results.get("bootstrap_elo_rating")
|
| 930 |
+
p4_updated = elo_subset_results.get("average_win_rate_bar")
|
| 931 |
+
|
| 932 |
+
# Filter plots
|
| 933 |
+
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
|
| 934 |
+
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
|
| 935 |
+
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
|
| 936 |
+
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
|
| 937 |
+
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
|
| 941 |
+
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
|
| 942 |
+
|
| 943 |
+
# Use baseline DF for total counts, subset DF for category-specific counts
|
| 944 |
+
category_details_md_updated_text = make_category_arena_leaderboard_md(
|
| 945 |
+
arena_df_baseline, arena_subset_df, name=category_display_name # Pass display name
|
| 946 |
+
)
|
| 947 |
+
category_deets_update = gr.Markdown(value=category_details_md_updated_text)
|
| 948 |
|
| 949 |
+
# Return updates in the correct order matching outputs list
|
| 950 |
+
# Order: df, p1, p2, p3, p4, more_stats_md, category_deets
|
| 951 |
+
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
|
| 952 |
|
| 953 |
|
| 954 |
+
# Define output components (must exist in the UI build)
|
| 955 |
+
outputs_list = [elo_display_df]
|
| 956 |
+
if show_plot:
|
| 957 |
+
# Add plot components if they exist
|
| 958 |
+
outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
|
| 959 |
+
# Add markdown component if it exists
|
| 960 |
+
if more_stats_md: outputs_list.append(more_stats_md)
|
| 961 |
+
else: outputs_list.append(gr.Markdown(visible=False)) # Placeholder if MD wasn't created
|
| 962 |
+
else:
|
| 963 |
+
# Add placeholders if plots/MD are not shown
|
| 964 |
+
outputs_list.extend([gr.Plot(visible=False)] * 4)
|
| 965 |
+
outputs_list.append(gr.Markdown(visible=False))
|
| 966 |
+
outputs_list.append(category_deets) # Always update category details
|
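+
# Resulting order: [elo_display_df, plot_1..plot_4 (or placeholders), more_stats_md (or placeholder), category_deets], seven outputs in total
|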
| 967 |
+
|
| 968 |
+
# Attach change listeners
|
| 969 |
+
category_dropdown.change(
|
| 970 |
+
fn=update_leaderboard_and_plots,
|
| 971 |
+
inputs=[category_dropdown, category_checkbox],
|
| 972 |
+
outputs=outputs_list
|
| 973 |
+
)
|
| 974 |
+
category_checkbox.change(
|
| 975 |
+
fn=update_leaderboard_and_plots, # Use the same function
|
| 976 |
+
inputs=[category_dropdown, category_checkbox],
|
| 977 |
+
outputs=outputs_list
|
| 978 |
+
)
|
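+
# Both listeners reuse the same callback, so a category switch and a filter toggle always re-read the data identically
|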
| 979 |
|
| 980 |
|
| 981 |
+
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
|
| 982 |
+
if show_plot:
|
| 983 |
+
# Add plots if they were created
|
| 984 |
+
return_components.extend([plot_1, plot_2, plot_3, plot_4])
|
| 985 |
+
# Add the extra stats markdown if it was created
|
| 986 |
+
if more_stats_md: return_components.append(more_stats_md)
|
| 987 |
|
| 988 |
|
| 989 |
+
return return_components
|
| 990 |
|
| 991 |
|
| 992 |
+
def build_demo(elo_results_file, leaderboard_table_file):
|
| 993 |
+
# Assumes block_css is available or defined elsewhere
|
| 994 |
+
try:
|
| 995 |
+
from fastchat.serve.gradio_web_server import block_css
|
| 996 |
+
except ImportError:
|
| 997 |
+
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
|
| 998 |
+
# Define a minimal fallback CSS or copy the content here
|
| 999 |
+
block_css = """
|
| 1000 |
+
/* Add minimal CSS rules here if needed */
|
| 1001 |
+
#arena_leaderboard_dataframe table { font-size: 105%; }
|
| 1002 |
+
#leaderboard_markdown .prose { font-size: 110% !important; }
|
| 1003 |
+
.app { max-width: 100% !important; padding: 20px !important; }
|
| 1004 |
+
a { color: #1976D2; text-decoration: none; }
|
| 1005 |
+
a:hover { color: #63A4FF; text-decoration: underline; }
|
| 1006 |
+
"""
|
| 1007 |
|
| 1008 |
+
text_size = gr.themes.sizes.text_lg
|
| 1009 |
+
# Assumes theme.json is present
|
| 1010 |
+
try:
|
| 1011 |
+
theme = gr.themes.Default.load("theme.json")
|
| 1012 |
+
except Exception:
|
| 1013 |
+
print("Warning: theme.json not found. Using default Gradio theme.")
|
| 1014 |
+
theme = gr.themes.Default(text_size=text_size) # Fallback theme
|
| 1015 |
+
|
| 1016 |
+
if hasattr(theme, 'text_size'): theme.text_size = text_size
|
| 1017 |
+
# Apply custom settings if theme object supports it
|
| 1018 |
+
if hasattr(theme, 'set'):
|
| 1019 |
+
theme.set(
|
| 1020 |
+
button_large_text_size="40px",
|
| 1021 |
+
button_small_text_size="40px",
|
| 1022 |
+
button_large_text_weight="1000",
|
| 1023 |
+
button_small_text_weight="1000",
|
| 1024 |
+
button_shadow="*shadow_drop_lg",
|
| 1025 |
+
button_shadow_hover="*shadow_drop_lg",
|
| 1026 |
+
checkbox_label_shadow="*shadow_drop_lg",
|
| 1027 |
+
button_shadow_active="*shadow_inset",
|
| 1028 |
+
button_secondary_background_fill="*primary_300",
|
| 1029 |
+
button_secondary_background_fill_dark="*primary_700",
|
| 1030 |
+
button_secondary_background_fill_hover="*primary_200",
|
| 1031 |
+
button_secondary_background_fill_hover_dark="*primary_500",
|
| 1032 |
+
button_secondary_text_color="*primary_800",
|
| 1033 |
+
button_secondary_text_color_dark="white",
|
| 1034 |
+
)
|
| 1035 |
|
| 1036 |
+
with gr.Blocks(
|
| 1037 |
+
title="LLM Arena: Leaderboard", # Translated title
|
| 1038 |
+
theme=theme,
|
| 1039 |
+
css=block_css, # Use loaded or fallback CSS
|
| 1040 |
+
) as demo:
|
| 1041 |
+
# Build only the leaderboard tab content
|
| 1042 |
+
# show_plot=True to display plots
|
| 1043 |
+
leader_components = build_leaderboard_tab(
|
| 1044 |
+
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
|
| 1045 |
+
)
|
| 1046 |
+
return demo
|
| 1047 |
|
| 1048 |
|
| 1049 |
if __name__ == "__main__":
|
| 1050 |
parser = argparse.ArgumentParser()
|
| 1051 |
+
parser.add_argument("--share", action="store_true", default=False) # Default False for HF
|
| 1052 |
parser.add_argument("--host", default="0.0.0.0")
|
| 1053 |
parser.add_argument("--port", type=int, default=7860)
|
| 1054 |
+
# Removed args specific to monitor.py
|
| 1055 |
args = parser.parse_args()
|
| 1056 |
+
try:
|
| 1057 |
+
elo_result_files = glob.glob("elo_results_*.pkl")
|
| 1058 |
+
if not elo_result_files:
|
| 1059 |
+
raise FileNotFoundError("No elo_results_*.pkl files found.")
|
| 1060 |
+
# More robust sorting extracting the number
|
| 1061 |
+
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
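+
# e.g. a hypothetical "elo_results_20240601.pkl" sorts by 20240601; a non-numeric suffix would raise ValueError here
|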
| 1062 |
+
elo_result_file = elo_result_files[-1]
|
| 1063 |
+
print(f"Using Elo results file: {elo_result_file}")
|
| 1064 |
+
except Exception as e:
|
| 1065 |
+
print(f"Error finding Elo results file: {e}")
|
| 1066 |
+
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
|
| 1067 |
+
exit(1) # Exit if file not found
|
| 1068 |
+
|
| 1069 |
+
try:
|
| 1070 |
+
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
|
| 1071 |
+
if not leaderboard_table_files:
|
| 1072 |
+
raise FileNotFoundError("No leaderboard_table_*.csv files found.")
|
| 1073 |
+
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
| 1074 |
+
leaderboard_table_file = leaderboard_table_files[-1]
|
| 1075 |
+
print(f"Using leaderboard table file: {leaderboard_table_file}")
|
| 1076 |
+
except Exception as e:
|
| 1077 |
+
print(f"Error finding leaderboard table file: {e}")
|
| 1078 |
+
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
|
| 1079 |
+
exit(1) # Exit if file not found
|
| 1080 |
|
| 1081 |
|
| 1082 |
demo = build_demo(elo_result_file, leaderboard_table_file)
|
| 1083 |
+
# Launch with args
|
| 1084 |
+
demo.launch(
|
| 1085 |
+
server_name=args.host,
|
| 1086 |
+
server_port=args.port,
|
| 1087 |
+
share=args.share,
|
| 1088 |
+
show_api=False
|
| 1089 |
+
)
|