Spaces:
Paused
Paused
Interactive sliders
Browse files
app.py
CHANGED
|
@@ -18,7 +18,8 @@ from utils import (
|
|
| 18 |
format_data,
|
| 19 |
get_trendlines,
|
| 20 |
find_crossover_point,
|
| 21 |
-
sigmoid_transition
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
###################
|
|
@@ -105,21 +106,14 @@ merged_dfs = {k: format_data(v) for k, v in merged_dfs.items()}
|
|
| 105 |
# get constants
|
| 106 |
min_elo_score, max_elo_score, _ = get_constants(merged_dfs)
|
| 107 |
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
|
| 108 |
-
orgs = merged_dfs["Overall"].Organization.unique().tolist()
|
| 109 |
|
|
|
|
|
|
|
| 110 |
###################
|
| 111 |
### Build and Plot Data
|
| 112 |
###################
|
| 113 |
|
| 114 |
|
| 115 |
-
df = merged_dfs["Overall"]
|
| 116 |
-
top_orgs = df.groupby("Organization")["rating"].max().nlargest(11).index.tolist()
|
| 117 |
-
|
| 118 |
-
df = df.loc[(df["Organization"].isin(top_orgs)) & (df["rating"] > 1000)]
|
| 119 |
-
print(df)
|
| 120 |
-
|
| 121 |
-
df = df.loc[~df["Release Date"].isna()]
|
| 122 |
-
|
| 123 |
def get_data_split(dfs, set_name):
    """Return an independent copy of one data split with a fresh index.

    Args:
        dfs: Mapping from split name to a pandas DataFrame.
        set_name: Key of the split to extract from ``dfs``.

    Returns:
        A deep copy of ``dfs[set_name]`` whose index is renumbered from 0.
        The DataFrame stored in ``dfs`` is not modified.
    """
    df = dfs[set_name].copy(deep=True)
    return df.reset_index(drop=True)
|
|
@@ -272,45 +266,32 @@ def make_figure(df):
|
|
| 272 |
speak_french = False
|
| 273 |
if speak_french:
|
| 274 |
fig.update_layout(
|
| 275 |
-
xaxis_title="Date",
|
| 276 |
title="La course au classement",
|
| 277 |
yaxis_title="Score ELO",
|
| 278 |
legend_title="Classement en Novembre 2024",
|
| 279 |
-
xaxis_range=[pd.Timestamp("2024-01-01"), current_date], # Extend x-axis for labels
|
| 280 |
-
yaxis_range=[1103, 1350],
|
| 281 |
)
|
| 282 |
else:
|
| 283 |
fig.update_layout(
|
| 284 |
-
xaxis_title="Date",
|
| 285 |
yaxis_title="ELO score on Chatbot Arena",
|
| 286 |
legend_title="Ranking as of November 2024",
|
| 287 |
title="The race for the best LLM",
|
| 288 |
-
hovermode="closest",
|
| 289 |
-
xaxis_range=[pd.Timestamp("2024-01-01"), current_date], # Extend x-axis for labels
|
| 290 |
-
yaxis_range=[1103, 1350],
|
| 291 |
)
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
fig.update_xaxes(
|
| 295 |
tickformat="%m-%Y",
|
| 296 |
)
|
| 297 |
-
print(fig)
|
| 298 |
return fig, df
|
| 299 |
|
| 300 |
-
def filter_df():
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
set_dark_mode = """
|
| 305 |
-
function refresh() {
|
| 306 |
-
const url = new URL(window.location);
|
| 307 |
-
|
| 308 |
-
if (url.searchParams.get('__theme') !== 'dark') {
|
| 309 |
-
url.searchParams.set('__theme', 'dark');
|
| 310 |
-
window.location.href = url.href;
|
| 311 |
-
}
|
| 312 |
-
}
|
| 313 |
-
"""
|
| 314 |
|
| 315 |
with gr.Blocks(
|
| 316 |
theme=gr.themes.Soft(
|
|
@@ -320,45 +301,49 @@ with gr.Blocks(
|
|
| 320 |
text_size=gr.themes.sizes.text_sm,
|
| 321 |
font=[
|
| 322 |
gr.themes.GoogleFont("Open Sans"),
|
| 323 |
-
"ui-
|
| 324 |
"system-ui",
|
| 325 |
-
"
|
| 326 |
],
|
| 327 |
),
|
| 328 |
-
js=set_dark_mode,
|
| 329 |
) as demo:
|
| 330 |
-
|
| 331 |
-
"""
|
| 332 |
-
<div style="text-align: center; max-width: 650px; margin: auto;">
|
| 333 |
-
<h1 style="font-weight: 900; margin-top: 5px;">π The race for the best LLM π</h1>
|
| 334 |
-
<p style="text-align: left; margin-top: 30px; margin-bottom: 30px; line-height: 20px;">
|
| 335 |
-
This app visualizes the progress of LLMs over time as scored by the <a href="https://leaderboard.lmsys.org/">LMSYS Chatbot Arena</a>.
|
| 336 |
-
The app is adapted from <a href="https://huggingface.co/spaces/andrewrreed/closed-vs-open-arena-elo"> this app</a> by Andrew Reed,
|
| 337 |
-
and is intended to stay up-to-date as new models are released and evaluated.
|
| 338 |
-
<div style="text-align: left;">
|
| 339 |
-
<strong>Plot info:</strong>
|
| 340 |
-
<br>
|
| 341 |
-
<ul style="padding-left: 20px;">
|
| 342 |
-
<li> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena. </li>
|
| 343 |
-
<li> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates). </li>
|
| 344 |
-
<li> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria. </li>
|
| 345 |
-
<ul>
|
| 346 |
-
</div>
|
| 347 |
-
</p>
|
| 348 |
-
</div>
|
| 349 |
-
"""
|
| 350 |
-
)
|
| 351 |
filtered_df = gr.State()
|
|
|
|
|
|
|
|
|
|
| 352 |
with gr.Group():
|
| 353 |
with gr.Tab("Plot"):
|
| 354 |
plot = gr.Plot(show_label=False)
|
| 355 |
with gr.Tab("Raw Data"):
|
| 356 |
display_df = gr.DataFrame()
|
| 357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
|
| 359 |
demo.load(
|
| 360 |
fn=filter_df,
|
| 361 |
-
inputs=[],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 362 |
outputs=filtered_df,
|
| 363 |
).then(
|
| 364 |
fn=make_figure,
|
|
@@ -366,4 +351,14 @@ with gr.Blocks(
|
|
| 366 |
outputs=[plot, display_df],
|
| 367 |
)
|
| 368 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
demo.launch()
|
|
|
|
| 18 |
format_data,
|
| 19 |
get_trendlines,
|
| 20 |
find_crossover_point,
|
| 21 |
+
sigmoid_transition,
|
| 22 |
+
apply_template,
|
| 23 |
)
|
| 24 |
|
| 25 |
###################
|
|
|
|
| 106 |
# get constants
|
| 107 |
min_elo_score, max_elo_score, _ = get_constants(merged_dfs)
|
| 108 |
date_updated = elo_results["full"]["last_updated_datetime"].split(" ")[0]
|
|
|
|
| 109 |
|
| 110 |
+
ratings_df = merged_dfs["Overall"]
|
| 111 |
+
ratings_df = ratings_df.loc[~ratings_df["Release Date"].isna()]
|
| 112 |
###################
|
| 113 |
### Build and Plot Data
|
| 114 |
###################
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
def get_data_split(dfs, set_name):
    """Return an independent copy of one data split with a fresh index.

    Args:
        dfs: Mapping from split name to a pandas DataFrame.
        set_name: Key of the split to extract from ``dfs``.

    Returns:
        A deep copy of ``dfs[set_name]`` whose index is renumbered from 0.
        The DataFrame stored in ``dfs`` is not modified.
    """
    df = dfs[set_name].copy(deep=True)
    return df.reset_index(drop=True)
|
|
|
|
| 266 |
speak_french = False
|
| 267 |
if speak_french:
|
| 268 |
fig.update_layout(
|
|
|
|
| 269 |
title="La course au classement",
|
| 270 |
yaxis_title="Score ELO",
|
| 271 |
legend_title="Classement en Novembre 2024",
|
|
|
|
|
|
|
| 272 |
)
|
| 273 |
else:
|
| 274 |
fig.update_layout(
|
|
|
|
| 275 |
yaxis_title="ELO score on Chatbot Arena",
|
| 276 |
legend_title="Ranking as of November 2024",
|
| 277 |
title="The race for the best LLM",
|
|
|
|
|
|
|
|
|
|
| 278 |
)
|
| 279 |
+
fig.update_layout(
|
| 280 |
+
xaxis_title="Date",
|
| 281 |
+
hovermode="closest",
|
| 282 |
+
xaxis_range=[pd.Timestamp("2024-01-01"), current_date], # Extend x-axis for labels
|
| 283 |
+
yaxis_range=[best_models_df["rating"].min() - 10, df["rating"].max() + 30],
|
| 284 |
+
)
|
| 285 |
+
apply_template(fig, annotation_text="Aymeric Roucher")
|
| 286 |
|
| 287 |
fig.update_xaxes(
|
| 288 |
tickformat="%m-%Y",
|
| 289 |
)
|
|
|
|
| 290 |
return fig, df
|
| 291 |
|
| 292 |
+
def filter_df(top_n_orgs=11, minimum_rating=1000):
    """Select the rows of the module-level ``ratings_df`` to plot.

    Organizations are ranked by the highest ``rating`` found among their
    rows. A row is kept when its organization is one of the ``top_n_orgs``
    best-ranked ones and its own rating is strictly above ``minimum_rating``.

    Args:
        top_n_orgs: Number of top organizations to keep. Fractional values
            are truncated toward zero.
        minimum_rating: Exclusive lower bound on the ``rating`` column.

    Returns:
        The matching subset of ``ratings_df``.
    """
    # The value comes from a gr.Slider declared without ``step=``, which may
    # deliver a float; ``Series.nlargest`` needs an integer count.
    n_orgs = int(top_n_orgs)
    best_rating_per_org = ratings_df.groupby("Organization")["rating"].max()
    top_orgs = best_rating_per_org.nlargest(n_orgs).index.tolist()
    in_top_orgs = ratings_df["Organization"].isin(top_orgs)
    above_minimum = ratings_df["rating"] > minimum_rating
    return ratings_df.loc[in_top_orgs & above_minimum]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
with gr.Blocks(
|
| 297 |
theme=gr.themes.Soft(
|
|
|
|
| 301 |
text_size=gr.themes.sizes.text_sm,
|
| 302 |
font=[
|
| 303 |
gr.themes.GoogleFont("Open Sans"),
|
| 304 |
+
"ui-serif",
|
| 305 |
"system-ui",
|
| 306 |
+
"serif",
|
| 307 |
],
|
| 308 |
),
|
|
|
|
| 309 |
) as demo:
|
| 310 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
filtered_df = gr.State()
|
| 312 |
+
with gr.Row():
|
| 313 |
+
top_n_orgs = gr.Slider(minimum=1, maximum=30, value=10, label="View top N companies")
|
| 314 |
+
minimum_rating = gr.Slider(minimum=800, maximum=1300, value=1000, label="Restrict to ELO scores above N")
|
| 315 |
with gr.Group():
|
| 316 |
with gr.Tab("Plot"):
|
| 317 |
plot = gr.Plot(show_label=False)
|
| 318 |
with gr.Tab("Raw Data"):
|
| 319 |
display_df = gr.DataFrame()
|
| 320 |
|
| 321 |
+
gr.Markdown(
|
| 322 |
+
"""
|
| 323 |
+
This app visualizes the progress of LLMs over time as scored by the [LMSYS Chatbot Arena](https://leaderboard.lmsys.org/).
|
| 324 |
+
The app is adapted from [this app](https://huggingface.co/spaces/andrewrreed/closed-vs-open-arena-elo) by Andrew Reed,
|
| 325 |
+
and is intended to stay up-to-date as new models are released and evaluated.
|
| 326 |
+
|
| 327 |
+
> ### Plot info
|
| 328 |
+
> The ELO score (y-axis) is a measure of the relative strength of a model based on its performance against other models in the arena.
|
| 329 |
+
> The Release Date (x-axis) corresponds to when the model was first publicly released or when its ELO results were first reported (for ease of automated updates).
|
| 330 |
+
> Trend lines are based on Ordinary Least Squares (OLS) regression and adjust based on the filter criteria.
|
| 331 |
+
"""
|
| 332 |
+
)
|
| 333 |
|
| 334 |
demo.load(
|
| 335 |
fn=filter_df,
|
| 336 |
+
inputs=[top_n_orgs, minimum_rating],
|
| 337 |
+
outputs=filtered_df,
|
| 338 |
+
).then(
|
| 339 |
+
fn=make_figure,
|
| 340 |
+
inputs=[filtered_df],
|
| 341 |
+
outputs=[plot, display_df],
|
| 342 |
+
)
|
| 343 |
+
|
| 344 |
+
minimum_rating.change(
|
| 345 |
+
fn=filter_df,
|
| 346 |
+
inputs=[top_n_orgs, minimum_rating],
|
| 347 |
outputs=filtered_df,
|
| 348 |
).then(
|
| 349 |
fn=make_figure,
|
|
|
|
| 351 |
outputs=[plot, display_df],
|
| 352 |
)
|
| 353 |
|
| 354 |
+
top_n_orgs.change(
|
| 355 |
+
fn=filter_df,
|
| 356 |
+
inputs=[top_n_orgs, minimum_rating],
|
| 357 |
+
outputs=filtered_df,
|
| 358 |
+
).then(
|
| 359 |
+
fn=make_figure,
|
| 360 |
+
inputs=[filtered_df],
|
| 361 |
+
outputs=[plot, display_df],
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
demo.launch()
|
utils.py
CHANGED
|
@@ -233,3 +233,48 @@ def find_crossover_point(b1, m1, b2, m2):
|
|
| 233 |
# Function to create sigmoid transition
|
| 234 |
def sigmoid_transition(x, x0, k=0.1):
    """Evaluate a sigmoid transition centred on ``x0``.

    Args:
        x: Value(s) at which to evaluate; forwarded to ``expit``.
        x0: Centre of the transition; the argument to ``expit`` is 0 there.
        k: Scale applied to ``x - x0`` (larger means a sharper transition).

    Returns:
        ``expit(k * (x - x0))``.
    """
    # NOTE(review): ``expit`` is presumably scipy.special.expit; its import is
    # outside the visible part of this file.
    return expit(k * (x - x0))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
# Function to create sigmoid transition
|
| 234 |
def sigmoid_transition(x, x0, k=0.1):
    """Evaluate a sigmoid transition centred on ``x0``.

    Args:
        x: Value(s) at which to evaluate; forwarded to ``expit``.
        x0: Centre of the transition; the argument to ``expit`` is 0 there.
        k: Scale applied to ``x - x0`` (larger means a sharper transition).

    Returns:
        ``expit(k * (x - x0))``.
    """
    # NOTE(review): ``expit`` is presumably scipy.special.expit; its import is
    # outside the visible part of this file.
    return expit(k * (x - x0))
|
| 236 |
+
|
| 237 |
+
def apply_template(
    fig,
    template="none",
    annotation_text="",
    title=None,
    width=1200,
    height=600,
):
    """Apply the shared Garamond styling to ``fig`` in place.

    Args:
        fig: Figure to restyle. It must provide ``update_layout``,
            ``add_annotation``, ``update_xaxes`` and ``update_yaxes``
            (presumably a Plotly figure).
        template: Value written to the layout's ``template`` property.
        annotation_text: Optional credit rendered in italics just outside the
            bottom-right corner of the plot area. Empty or ``None`` adds
            nothing.
        title: Optional title; when ``None`` the existing title is untouched.
        width: Figure width written to the layout.
        height: Figure height written to the layout.

    Returns:
        None. ``fig`` is modified in place.
    """
    layout_updates = {
        "template": template,
        "width": width,
        "height": height,
        "font": dict(family="Garamond", size=14),
        "title_font_family": "Garamond",
        "title_font_size": 24,
        "title_xanchor": "center",
        "legend": dict(
            itemsizing="constant",
            title_font_family="Garamond",
            font=dict(family="Garamond", size=14),
            itemwidth=30,
        ),
    }
    if title is not None:
        layout_updates["title"] = title
    fig.update_layout(layout_updates)

    if annotation_text:
        # add_annotation appends a new entry. Passing ``annotations=[...]`` to
        # update_layout would instead be merged into annotations already on
        # the figure and could overwrite them.
        fig.add_annotation(
            text=f"<i>{annotation_text}</i>",
            xref="paper",
            yref="paper",
            x=1.05,
            y=-0.05,
            xanchor="left",
            yanchor="top",
            showarrow=False,
            font=dict(size=14),
        )

    fig.update_xaxes(title_font_family="Garamond", tickfont_family="Garamond")
    fig.update_yaxes(title_font_family="Garamond", tickfont_family="Garamond")
|