Update app.py with new features from arena
app.py (CHANGED)
@@ -4,17 +4,12 @@ import glob
 import pickle
 import traceback
 import numpy as np
-from datetime import datetime

 import pandas as pd
 import gradio as gr
 import numpy as np


-basic_component_values = [None] * 6
-leader_component_values = [None] * 5
-
-
 promo_banner = """
 <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
 USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
@@ -23,18 +18,30 @@ promo_banner = """

 deprecated_model_name = [
     "GigaChat 3.1.25.3",
-    "GigaChat-Pro 2.2.25.3",
     "saiga_llama3_8b_v6",
     "saiga_phi3_medium",
     "GigaChat-Plus 3.1.25.3",
     "GigaChat-Pro 4.0.26.8",
     "GigaChat 4.0.26.8",
-    "xAI: Grok 2",
     "GigaChat-Pro 4.0.26.15",
     "GigaChat 4.0.26.15",
-    "YandexGPT Experimental", "yandex-gpt-arena"
 ]

 def make_default_md_1():
     leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
@@ -44,24 +51,24 @@ def make_default_md_1():
 """
     return leaderboard_md

-
 def make_default_md_2():
     leaderboard_md = f"""
-
 The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
 Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
 - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
 - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
 - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
 """
-
     return leaderboard_md


 def make_arena_leaderboard_md(arena_df, last_updated_time):
-    total_votes = sum(arena_df["num_battles"])
     total_models = len(arena_df)
-    space = "&nbsp;&nbsp;&nbsp;"

     leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
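The description above says ratings come from fitting a Bradley-Terry model to pairwise votes and reporting strengths on the Elo scale. A minimal sketch of that idea, with hypothetical battle data and a plain MM iteration (not the arena's actual fitting code):

```python
import numpy as np

# Toy pairwise results, (winner, loser). Model names are hypothetical.
battles = [("A", "B"), ("A", "B"), ("B", "A"), ("A", "C"), ("C", "B")]
models = sorted({m for pair in battles for m in pair})
idx = {m: i for i, m in enumerate(models)}

# Bradley-Terry: P(i beats j) = p_i / (p_i + p_j).
wins = np.zeros((len(models), len(models)))
for w, l in battles:
    wins[idx[w], idx[l]] += 1

# Simple minorization-maximization iteration for the strengths p.
p = np.ones(len(models))
for _ in range(100):
    for i in range(len(models)):
        num = wins[i].sum()  # total wins of model i
        den = sum((wins[i, j] + wins[j, i]) / (p[i] + p[j])
                  for j in range(len(models)) if j != i)
        p[i] = num / den if den > 0 else p[i]
    p /= p.sum()

# Report on the Elo scale: 400 * log10(p), shifted to a common anchor.
elo = 400 * np.log10(p)
elo += 1000 - elo.mean()
print(dict(zip(models, elo.round(1))))
```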
@@ -74,57 +81,166 @@ See Figure 1 below for a visualization of the confidence intervals of model ratings.


 def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
-    total_votes = sum(arena_df["num_battles"])
     total_models = len(arena_df)
-    space = "&nbsp;&nbsp;&nbsp;"
-    total_subset_votes = sum(arena_subset_df["num_battles"])
     total_subset_models = len(arena_subset_df)
-
 """
     return leaderboard_md

-
 def model_hyperlink(model_name, link):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


-def filter_deprecated_models_plots(fig, hidden_models=None):
     """

     Args:
         fig: The Plotly figure object.
-        hidden_models: A list of model names to remove.
     """
     if fig is None:
-        return
-
     return fig

 def load_leaderboard_table_csv(filename, add_hyperlink=True):
     lines = open(filename).readlines()

@@ -132,688 +248,842 @@ def load_leaderboard_table_csv(filename, add_hyperlink=True):
     rows = []
     for i in range(1, len(lines)):
         row = [v.strip() for v in lines[i].split(",")]
-        item = {}
-        for h, v in zip(heads, row):
-            if h == "Arena Elo rating":
-                if v != "-":
                     v = int(ast.literal_eval(v))
-                else:
-                    v = np.nan
-            elif h == "MT-bench (win rate %)":
-                if v != "-":
-                    v = round(ast.literal_eval(v[:-1]), 1)
-                else:
-                    v = np.nan
-            elif h == "MT-bench (score)":
-                if v != "-":
-                    v = round(ast.literal_eval(v), 2)
-                else:
-                    v = np.nan
-            item[h] = v
-        if add_hyperlink:
             item["Model"] = model_hyperlink(item["Model"], item["Link"])
         rows.append(item)
-
     return rows

 def create_ranking_str(ranking, ranking_difference):
-    if ranking_difference > 0:
-        return f"{ranking} \u2191"
-    elif ranking_difference < 0:
-        return f"{ranking} \u2193"
-    else:
-        return f"{ranking}"


 def recompute_final_ranking(arena_df):
-    # compute ranking based on CI
     ranking = {}
-    for i, model_a in enumerate(arena_df.index):
-        ranking[model_a] = 1
-        for j, model_b in enumerate(arena_df.index):
-            if i == j:
                 continue
-            if (
-                arena_df.loc[model_b]["rating_q025"]
-                > arena_df.loc[model_a]["rating_q975"]
-            ):
-                ranking[model_a] += 1
     return list(ranking.values())


 def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
-    arena_df = arena_df.sort_values(by=["rating"], ascending=False)
     if hidden_models:
-        arena_df = arena_df[~arena_df.index.isin(hidden_models)]

-    arena_df = arena_df.sort_values(
-        by=["final_ranking", "rating"], ascending=[True, False]
-    )

-    if arena_subset_df is not None:
-        # filter out models not in the arena_df
-        arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
-        arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
-        arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
-        # keep only the models in the subset in arena_df and recompute final_ranking
-        arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
-        # recompute final ranking
-        arena_df["final_ranking"] = recompute_final_ranking(arena_df)

-        # assign ranking by the order
-        arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
-        arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
-        # join arena_df and arena_subset_df on index
-        arena_df = arena_subset_df.join(
-            arena_df["final_ranking"], rsuffix="_global", how="inner"
-        )
-        arena_df["ranking_difference"] = (
-            arena_df["final_ranking_global"] - arena_df["final_ranking"]
-        )

-        arena_df["final_ranking"] = arena_df.apply(
-            lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
-            axis=1,
-        )

-    values = []
-    for i in range(len(arena_df)):
-        row = []
-        model_key = arena_df.index[i]
-        try:
-            model_name = model_table_df[model_table_df["key"] == model_key][
-                "Model"
-            ].values[0]
-            ranking = arena_df.iloc[i].get("final_ranking") or i + 1
-            row.append(ranking)
-            if arena_subset_df is not None:
-                row.append(arena_df.iloc[i].get("ranking_difference") or 0)
-            row.append(model_name)
-            row.append(round(arena_df.iloc[i]["rating"]))
-            upper_diff = round(
-                arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
-            )
-            lower_diff = round(
-                arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
             )
-            row.append(f"+{upper_diff}/-{lower_diff}")
     return values

 key_to_category_name = {
-    "full": "Overall",
     "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts": "site_visitors/medium_prompts",
-    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control",
 }
 cat_name_to_explanation = {
     "Overall": "All queries",
-    "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
     "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
-    "site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts.",
 }
 cat_name_to_baseline = {
-    "Hard Prompts (English)": "English",
 }

 actual_categories = [
-    # "Overall",
-    # "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts",
     "site_visitors/medium_prompts:style control"
 ]


 def read_elo_file(elo_results_file, leaderboard_table_file):
     arena_dfs = {}
     category_elo_results = {}
     return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

 def build_leaderboard_tab(
     elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
 ):
     with gr.Row():
         with gr.Column(scale=4):
             md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
         with gr.Column(scale=1):
             vote_button = gr.Button("Vote!", link="https://llmarena.ru")
     md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
-    if leaderboard_table_file:
-        data = load_leaderboard_table_csv(leaderboard_table_file)

-                    value=[],
-                    info="",
-                )
-        default_category_details = make_category_arena_leaderboard_md(
-            arena_df, arena_df, name=selected_category
-        )
-        with gr.Column(scale=4, variant="panel"):
-            category_deets = gr.Markdown(
-                default_category_details, elem_id="category_deets"
-            )

-        arena_vals = pd.DataFrame(
-            arena_table_vals,
-            columns=[
-                "Rank* (UB)",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-        )
-        elo_display_df = gr.Dataframe(
-            headers=[
-                "Rank* (UB)",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-            datatype=[
-                "str",
-                "markdown",
-                "number",
-                "str",
-                "number",
-                "str",
-                "str",
-                "str",
-            ],
-            value=arena_vals.style,
-            elem_id="arena_leaderboard_dataframe",
-            height=700,
-            column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
-            wrap=True,
-        )

-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
-                    elem_id="plot-title",
-                )
-                plot_3 = gr.Plot(p3, show_label=False)
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
-                    elem_id="plot-title",
-                )
-                plot_4 = gr.Plot(p4, show_label=False)
-        with gr.Row():
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
-                    elem_id="plot-title",
-                )
-                plot_1 = gr.Plot(
-                    p1, show_label=False, elem_id="plot-container"
-                )
-            with gr.Column():
-                gr.Markdown(
-                    "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
-                    elem_id="plot-title",
-                )
-                plot_2 = gr.Plot(p2, show_label=False)

-    if not show_plot:
-        gr.Markdown(
-            """
-            """,
-            elem_id="leaderboard_markdown",
-        )
-    else:
-        pass

-    def update_leaderboard_df(arena_table_vals):
-        elo_datarame = pd.DataFrame(
-            arena_table_vals,
            columns=[
-                "Rank* (UB)",
-                "Delta",
-                "Model",
-                "Arena Elo",
-                "95% CI",
-                "Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
-            ],
-        )

-        def highlight_max(s):
-            return [
-                "color: green; font-weight: bold"
-                if "\u2191" in v
-                else "color: red; font-weight: bold"
-                if "\u2193" in v
-                else ""
-                for v in s
            ]

-        def highlight_rank_max(s):
-            return [
-                "color: green; font-weight: bold"
-                if v > 0
-                else "color: red; font-weight: bold"
-                if v < 0
-                else ""
-                for v in s
-            ]

-    _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)

-        p1 = filter_deprecated_models_plots(
-            elo_subset_results["win_fraction_heatmap"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p2 = filter_deprecated_models_plots(
-            elo_subset_results["battle_count_heatmap"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p3 = filter_deprecated_models_plots(
-            elo_subset_results["bootstrap_elo_rating"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        p4 = filter_deprecated_models_plots(
-            elo_subset_results["average_win_rate_bar"],
-            hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
-        )
-        if category != "Overall":
-            arena_values = update_leaderboard_df(arena_values)
-            arena_values = gr.Dataframe(
-                headers=[
-                    "Rank* (UB)",
-                    "Delta",
-                    "Model",
-                    "Arena Elo",
-                    "95% CI",
-                    "Votes",
-                    "Organization",
-                    "License",
-                    "Knowledge Cutoff",
-                ],
-                datatype=[
-                    "str",
-                    "number",
-                    "markdown",
-                    "number",
-                    "str",
-                    "number",
-                    "str",
-                    "str",
-                    "str",
                 ],
-                value=arena_values,
-                elem_id="arena_leaderboard_dataframe",
-                height=700,
-                column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
-                wrap=True,
             )
-        else:
-            arena_values = gr.Dataframe(
-                    "str",
-                    "str",
-                ],
-                value=arena_values,
-                elem_id="arena_leaderboard_dataframe",
-                height=700,
-                column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
-                wrap=True,
             )

-    if show_plot and leaderboard_table_file:
-        return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
-    return [md_1]

-def build_demo(elo_results_file, leaderboard_table_file):
-    text_size = gr.themes.sizes.text_lg
-    theme = gr.themes.Base()
-    theme.text_size = text_size
-    theme.set(
-        button_large_text_size="40px",
-        button_small_text_size="40px",
-        button_large_text_weight="1000",
-        button_small_text_weight="1000",
-        button_shadow="*shadow_drop_lg",
-        button_shadow_hover="*shadow_drop_lg",
-        checkbox_label_shadow="*shadow_drop_lg",
-        button_shadow_active="*shadow_inset",
-        button_secondary_background_fill="*primary_300",
-        button_secondary_background_fill_dark="*primary_700",
-        button_secondary_background_fill_hover="*primary_200",
-        button_secondary_background_fill_hover_dark="*primary_500",
-        button_secondary_text_color="*primary_800",
-        button_secondary_text_color_dark="white",
-    )

-    with gr.Blocks(
-        title="LLM arena: leaderboard",
-        theme=theme,
-        css=block_css,
-    ) as demo:
-        build_leaderboard_tab(
-            elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
-        )
-    return demo

-block_css = """
-#leaderboard_markdown td {
-    padding-top: 6px;
-    padding-bottom: 6px;
-}
-#leaderboard_dataframe td {
-    line-height: 0.1em;
-}
-#about_markdown .prose {
-    font-size: 110% !important;
-}
-#ack_markdown .prose {
-    font-size: 110% !important;
-}
-#chatbot .prose {
-    font-size: 105% !important;
-}
-.sponsor-image-about img {
-    margin: 0 20px;
-    margin-top: 20px;
-    height: 40px;
-    max-height: 100%;
-    width: auto;
-    float: left;
-}

-.chatbot h1, h2, h3 {
-    margin-top: 8px; /* Adjust the value as needed */
-    margin-bottom: 0px; /* Adjust the value as needed */
-    padding-bottom: 0px;
-}

-.chatbot h3 {
-    font-size: 110%;
-}
-.chatbot p:not(:first-child) {
-    margin-top: 8px;
-}

-.typing {
-    display: inline-block;
-}

-.cursor {
-    display: inline-block;
-    width: 7px;
-    height: 1em;
-    background-color: black;
-    vertical-align: middle;
-    animation: blink 1s infinite;
-}

-.dark .cursor {
-    display: inline-block;
-    width: 7px;
-    height: 1em;
-    background-color: white;
-    vertical-align: middle;
-    animation: blink 1s infinite;
-}
-"""


 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--share", action="store_true")
     parser.add_argument("--host", default="0.0.0.0")
     parser.add_argument("--port", type=int, default=7860)
     args = parser.parse_args()

-    elo_result_files = glob.glob("elo_results_*.pkl")
-    elo_result_files.sort(key=lambda x: int(x[12:-4]))
-    elo_result_file = elo_result_files[-1]

-    leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
-    leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
-    leaderboard_table_file = leaderboard_table_files[-1]

     demo = build_demo(elo_result_file, leaderboard_table_file)
app.py (updated)

 import pickle
 import traceback
 import numpy as np

 import pandas as pd
 import gradio as gr
 import numpy as np


 promo_banner = """
 <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
 USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE

 deprecated_model_name = [
     "GigaChat 3.1.25.3",
+    "GigaChat-Pro 2.2.25.3",
     "saiga_llama3_8b_v6",
     "saiga_phi3_medium",
     "GigaChat-Plus 3.1.25.3",
     "GigaChat-Pro 4.0.26.8",
     "GigaChat 4.0.26.8",
+    "xAI: Grok 2",
     "GigaChat-Pro 4.0.26.15",
     "GigaChat 4.0.26.15",
+    "YandexGPT Experimental", "yandex-gpt-arena",
+    "RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
+]
+
+models_10b = [
+    "saiga_llama3_8b_v7",
+    "Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
+    "T-lite-instruct-0.1",
+    "t-tech/T-lite-it-1.0",
+    "LLaMA-3 Chat (8B)",
+    "Llama 3.1 8B Instruct Turbo",
+    "MTSAIR/Cotype-Nano"
 ]

+
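A sketch of how these two lists can drive the leaderboard filters; the `all_models` list here is hypothetical (the real one comes from the Elo results):

```python
# Hypothetical current model list; the real one comes from the Elo results file.
all_models = ["GigaChat 4.0.26.8", "t-tech/T-lite-it-1.0", "some-new-model"]

# Hide deprecated models unless the "Show Deprecated" filter is on.
visible = [m for m in all_models if m not in deprecated_model_name]

# Keep only small models when the "<10B" filter is on.
small_only = [m for m in visible if m in models_10b]
print(visible, small_only)
```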
 def make_default_md_1():
     leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
"""
     return leaderboard_md

 def make_default_md_2():
     leaderboard_md = f"""
+
 The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
 Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
+
 - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
 - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
 - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
 """
     return leaderboard_md


 def make_arena_leaderboard_md(arena_df, last_updated_time):
+    # Using version from monitor.py (translated)
+    total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
     total_models = len(arena_df)
+    space = "&nbsp;&nbsp;&nbsp;"  # Using HTML space

     leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.


 def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
+    total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
     total_models = len(arena_df)
+    space = "&nbsp;&nbsp;&nbsp;"
+    total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
     total_subset_models = len(arena_subset_df)
+
+    perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
+    perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
+
+    leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
+#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({perc_votes}%)**{space}
 """
     return leaderboard_md
 def model_hyperlink(model_name, link):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


+def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
     """
+    Filters Plotly plots to show only top N models and optionally removes specific models.

     Args:
         fig: The Plotly figure object.
+        hidden_models (list, optional): A list of model names to remove. Defaults to None.
+        limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
+
+    Returns:
+        Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
     """
     if fig is None:
+        return None
+
+    # Check if the figure has data
+    if not hasattr(fig, 'data') or len(fig.data) == 0:
+        return fig
+
+    # Check if data has a type attribute
+    if not hasattr(fig.data[0], 'type'):
         return fig

+    # Check minimum number of models after initial hidden_models filtering
+    models_to_check = []
+    if hasattr(fig.data[0], 'x'):
+        models_to_check = fig.data[0].x
+    elif hasattr(fig.data[0], 'y'):  # For some types like bar, X axis might be numeric
+        models_to_check = fig.data[0].y
+
+    if hidden_models is not None and models_to_check.any():
+        available_models = [x for x in models_to_check if x not in hidden_models]
+        # print(f"Available models before top N: {len(available_models)}")  # Debug
+        if len(available_models) <= 2:  # If less than 3 models remain before top_n
+            # print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
+            return fig  # Return the original plot if too few models
+
+    if limit_to_top is not None and limit_to_top <= 0:
+        limit_to_top = None
+
+    try:
+        # Work on a deep copy to avoid modifying the original figure object
+        fig_copy = pickle.loads(pickle.dumps(fig))
+        data = fig_copy.data[0]
+
+        if data.type == 'heatmap':
+            # Apply hidden models filter
+            mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
+            mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
+
+            # Get initially filtered X and Y arrays
+            filtered_x = np.array(data.x)[mask_x]
+            filtered_y = np.array(data.y)[mask_y]
+
+            # Apply top N limit (assuming the order is already by rank/rating)
+            if limit_to_top is not None and len(filtered_x) > limit_to_top:
+                top_models = filtered_x[:limit_to_top]
+                # Create new masks based on the top models relative to the *original* data axes
+                mask_x = np.isin(data.x, top_models)
+                mask_y = np.isin(data.y, top_models)
+                # Get final filtered axes
+                filtered_x = np.array(data.x)[mask_x]
+                filtered_y = np.array(data.y)[mask_y]
+            elif len(filtered_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the heatmap data
+            data.x = filtered_x
+            data.y = filtered_y
+            # Important: Indexing 'z' must use masks derived from the *original* data order
+            z_original = np.array(fig.data[0].z)
+            data.z = z_original[np.ix_(mask_y, mask_x)]
+
+        elif data.type == 'scatter':
+            trace = data
+            # Apply hidden models filter
+            mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
+
+            # Get initially filtered arrays
+            current_x = np.array(trace.x)[mask]
+            current_y = np.array(trace.y)[mask]
+            current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
+            # Handle error bars safely
+            current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
+            current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
+
+            # Apply top N limit
+            if limit_to_top is not None and len(current_x) > limit_to_top:
+                # Sort by y-value (rating) descending to find the top N
+                sort_indices = np.argsort(-current_y)[:limit_to_top]
+                current_x = current_x[sort_indices]
+                current_y = current_y[sort_indices]
+                if current_text is not None:
+                    current_text = current_text[sort_indices]
+                if current_error_y_array is not None:
+                    current_error_y_array = current_error_y_array[sort_indices]
+                if current_error_y_arrayminus is not None:
+                    current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
+            elif len(current_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the scatter trace data
+            trace.x, trace.y = current_x, current_y
+            if current_text is not None:
+                trace.text = current_text
+            # Update error bars if they exist
+            if current_error_y_array is not None:
+                # Ensure error_y exists before assigning
+                if 'error_y' not in trace: trace.error_y = {}
+                trace.error_y['array'] = current_error_y_array
+            if current_error_y_arrayminus is not None:
+                if 'error_y' not in trace: trace.error_y = {}
+                trace.error_y['arrayminus'] = current_error_y_arrayminus
+
+        elif data.type == 'bar':
+            trace = data
+            # Apply hidden models filter
+            mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
+
+            # Get initially filtered arrays
+            current_x = np.array(trace.x)[mask]
+            current_y = np.array(trace.y)[mask]
+
+            # Apply top N limit
+            if limit_to_top is not None and len(current_x) > limit_to_top:
+                # Sort by y-value (rating) descending
+                sort_indices = np.argsort(-current_y)[:limit_to_top]
+                current_x = current_x[sort_indices]
+                current_y = current_y[sort_indices]
+            elif len(current_x) <= 2:  # If <=2 models remain after filtering
+                return fig  # Return original
+
+            # Update the bar trace data
+            trace.x, trace.y = current_x, current_y
+
+        return fig_copy
+
+    except Exception as e:
+        print(f"Error filtering plot: {e}")
+        traceback.print_exc()
+        return fig  # Return original figure on error
+

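A usage sketch for `filter_deprecated_models_plots`, assuming plotly is installed; the toy figure stands in for the precomputed arena plots. Numpy arrays are used for the axes because the function calls `.any()` and `np.isin` on them:

```python
import numpy as np
import plotly.graph_objects as go

# Toy stand-in for a precomputed arena plot such as "average_win_rate_bar".
fig = go.Figure(go.Bar(
    x=np.array(["model-a", "model-b", "model-c", "model-d"]),
    y=np.array([0.7, 0.6, 0.5, 0.4]),
))

# Drop one model; at most the top 25 (the default) of the rest are kept.
filtered = filter_deprecated_models_plots(fig, hidden_models=["model-b"])
print(filtered.data[0].x)  # "model-b" is gone; the original fig is untouched
```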
 def load_leaderboard_table_csv(filename, add_hyperlink=True):
     lines = open(filename).readlines()
     rows = []
     for i in range(1, len(lines)):
         row = [v.strip() for v in lines[i].split(",")]
+        item = {}  # Create dictionary once per row
+        for h, v in zip(heads, row):
+            if h == "Arena Elo rating":
+                if v != "-":
+                    try:
                         v = int(ast.literal_eval(v))
+                    except:
+                        v = np.nan  # Handle parsing errors
+                else:
+                    v = np.nan
+            item[h] = v
+        if add_hyperlink and "Model" in item and "Link" in item:  # Check keys exist
+            # Check for empty/missing link
+            if item["Link"] and item["Link"] != "-":
                 item["Model"] = model_hyperlink(item["Model"], item["Link"])
+            # Otherwise, keep the model name as is
         rows.append(item)
     return rows

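For reference, a sketch of the CSV shape this loader appears to expect; `heads` is parsed from the header row on a line elided above, and the column set here is illustrative:

```python
csv_text = (
    "key,Model,Arena Elo rating,Link,Organization,License,Knowledge cutoff date\n"
    "gigachat,GigaChat,1005,https://example.com,SberDevices,Proprietary,-\n"
)
with open("leaderboard_table_demo.csv", "w") as f:
    f.write(csv_text)

rows = load_leaderboard_table_csv("leaderboard_table_demo.csv")
print(rows[0]["Model"])  # rendered as an <a> hyperlink
```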
 def create_ranking_str(ranking, ranking_difference):
+    # Convert rank to int before comparison
+    try:
+        # Ensure rank and difference are treated as numbers
+        ranking_val = int(float(ranking))  # Handle potential float input
+        ranking_difference_val = int(float(ranking_difference))
+        if ranking_difference_val > 0:
+            return f"{ranking_val} \u2191"
+        elif ranking_difference_val < 0:
+            return f"{ranking_val} \u2193"
+        else:
+            return f"{ranking_val}"
+    except (ValueError, TypeError):  # Handle cases where rank is not numeric
+        return str(ranking)

 def recompute_final_ranking(arena_df):
     ranking = {}
+    if arena_df.empty:
+        return []
+
+    model_indices = arena_df.index
+    # Ensure CI columns exist before trying to access them
+    if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
+        print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
+        # Return NaN or simple rank based on order
+        return [np.nan] * len(model_indices)  # Or range(1, len(model_indices) + 1)
+
+    ratings_q025 = arena_df["rating_q025"].to_dict()
+    ratings_q975 = arena_df["rating_q975"].to_dict()
+
+    for model_a in model_indices:
+        rank = 1
+        rating_a_q975 = ratings_q975.get(model_a)
+        # Skip if model A has no CI data
+        if pd.isna(rating_a_q975):
+            ranking[model_a] = np.nan  # Or assign max rank + 1
+            continue
+
+        for model_b in model_indices:
+            if model_a == model_b:
                 continue
+
+            rating_b_q025 = ratings_q025.get(model_b)
+            # Skip comparison if model B has no CI data
+            if pd.isna(rating_b_q025):
+                continue
+
+            # Check if B is statistically better than A
+            if rating_b_q025 > rating_a_q975:
+                rank += 1
+        ranking[model_a] = rank
     return list(ranking.values())

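A toy run of the upper-bound ranking rule implemented above: a model's rank is 1 plus the number of models whose lower confidence bound exceeds its upper bound, so models with overlapping intervals tie:

```python
import pandas as pd

toy = pd.DataFrame(
    {
        "rating":      [1100, 1080, 1000],
        "rating_q025": [1070, 1040,  980],
        "rating_q975": [1130, 1120, 1020],
    },
    index=["model-a", "model-b", "model-c"],
)
print(recompute_final_ranking(toy))  # [1, 1, 3]: a and b overlap, both beat c
```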
 def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
+    """
+    Generates the leaderboard table data.
+    'use_cache' parameter removed.
+    """
+    # print(f'Calculating get_arena_table')  # Debug
+
+    # Create copies to avoid modifying original DataFrames
+    arena_df_processed = arena_df.copy()
+    if arena_subset_df is not None:
+        arena_subset_df_processed = arena_subset_df.copy()
+    else:
+        arena_subset_df_processed = None
+
+    # Sort by rating initially to have a stable order before ranking
+    arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
+    # Compute 'final_ranking' based on CIs if possible
+    if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
+        arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
+        arena_df_processed = arena_df_processed.sort_values(
+            by=["final_ranking", "rating"], ascending=[True, False]
+        )
+    else:
+        # Fallback to simple ordering if CI columns are missing
+        arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
+
     if hidden_models:
+        arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
+        # Recompute ranks for the filtered view
+        if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
+            arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
+            # Re-sort based on new ranks
+            arena_df_processed = arena_df_processed.sort_values(
+                by=["final_ranking", "rating"], ascending=[True, False]
+            )
+        else:
+            arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)

+    if arena_subset_df_processed is not None:
+        # Filter subset by hidden_models first
+        if hidden_models:
+            arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()

+        # Ensure models in the subset are also present in the (filtered) main view
+        arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]

+        # Proceed only if subset is not empty and has CI columns
+        if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
+            # Rank within the subset
+            arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
+            arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed)  # Rank within category

+            # Filter the main processed DF to only include models from the subset
+            # 'final_ranking' here represents the rank *among these models* in the baseline category view
+            arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
+            arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)

+            # Join the subset ranks and baseline ranks
+            arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
+                arena_df_for_join["final_ranking_baseline"], how="inner"
             )
+
+            # Calculate rank difference
+            arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
+
+            # Sort by subset rank and rating
+            arena_df_combined = arena_df_combined.sort_values(
+                by=["final_ranking_subset", "rating"], ascending=[True, False]
             )
+
+            # Format the rank string with delta for display
+            arena_df_combined["display_ranking"] = arena_df_combined.apply(
+                lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
+                axis=1,
             )
+            arena_df_processed = arena_df_processed.loc[arena_df_combined.index]  # Reorder arena_df_processed
+
+            columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
+            columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
+            arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
+
+            # Now sorting should work as the column exists
+            # Use the subset rank for final sorting if subset is active
+            # Check if 'final_ranking_subset' was successfully joined before sorting
+            if "final_ranking_subset" in arena_df_processed.columns:
+                arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
             else:
+                # Fallback sort if join failed for some reason
+                arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
+
+        else:
+            # If subset is empty or lacks CI, disable subset logic
+            arena_subset_df_processed = None
+            # Use the baseline ranking as the display ranking
+            arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
+            arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
+
+    else:
+        # If no subset is used, display ranking is just the final rank from the main DF
+        arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
+        # Ensure it's sorted correctly
+        arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
+
+    values = []
+    # Iterate using the final sorted index of arena_df_processed
+    for model_key in arena_df_processed.index:
+        row_data = arena_df_processed.loc[model_key]
+        # Find model metadata
+        model_info = model_table_df[model_table_df["key"] == model_key]
+        if model_info.empty:
+            # print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
+            continue  # Skip if no metadata
+
+        row = []
+        # Rank (Display)
+        row.append(row_data.get("display_ranking", ""))  # Use the calculated display rank
+
+        # Delta (only if subset was processed successfully)
+        if arena_subset_df_processed is not None:
+            row.append(row_data.get("ranking_difference", 0))
+
+        # Model Name (hyperlink applied during loading)
+        row.append(model_info["Model"].values[0])
+
+        # Arena Elo
+        row.append(round(row_data["rating"]))
+
+        # 95% CI
+        # Check for NaN before calculation
+        upper_rating = row_data.get("rating_q975")
+        lower_rating = row_data.get("rating_q025")
+        current_rating = row_data.get("rating")
+        upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
+        lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
+        row.append(f"+{upper_diff}/-{lower_diff}")
+
+        # Votes
+        row.append(round(row_data["num_battles"]))
+
+        # Organization
+        row.append(model_info["Organization"].values[0])
+
+        # License
+        row.append(model_info["License"].values[0])
+
+        # Knowledge Cutoff
+        cutoff_date = model_info["Knowledge cutoff date"].values[0]
+        row.append("Unknown" if cutoff_date == "-" else cutoff_date)
+
+        values.append(row)
+
     return values

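A usage sketch for `get_arena_table` with hypothetical toy frames; the column order of each returned row mirrors the headers used by the UI below:

```python
import pandas as pd

arena_toy = pd.DataFrame(
    {
        "rating": [1100.0, 1000.0],
        "rating_q025": [1080.0, 980.0],
        "rating_q975": [1120.0, 1020.0],
        "num_battles": [400, 300],
    },
    index=["model-a", "model-b"],
)
model_table_toy = pd.DataFrame(
    {
        "key": ["model-a", "model-b"],
        "Model": ["Model A", "Model B"],
        "Organization": ["Org A", "Org B"],
        "License": ["MIT", "Proprietary"],
        "Knowledge cutoff date": ["2024-01", "-"],
    }
)
table = get_arena_table(arena_toy, model_table_toy)
# Each row: [rank, model, elo, 95% CI, votes, org, license, cutoff]
print(table[0])  # ['1', 'Model A', 1100, '+20/-20', 400, 'Org A', 'MIT', '2024-01']
```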
 key_to_category_name = {
+    # Mapping from internal key to display name (kept English for consistency)
+    "full": "Overall",  # Might not be used if filtered out later
     "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
     "site_visitors/medium_prompts": "site_visitors/medium_prompts",
+    "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control"  # Use underscore for display consistency if needed
 }
 cat_name_to_explanation = {
+    # Translated explanations for display
     "Overall": "All queries",
+    "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
     "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
+    "site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
 }
 cat_name_to_baseline = {
+    # Baseline category for comparison (if needed, seems unused now but kept)
+    # "Hard Prompts (English)": "English",
 }

 actual_categories = [
+    # Categories available in the dropdown (use the *keys* from key_to_category_name)
+    # "Overall",  # Removed
+    # "crowdsourcing/simple_prompts",  # Removed
     "site_visitors/medium_prompts",
     "site_visitors/medium_prompts:style control"
 ]
+# Default selected category key
+req_cat_key = "site_visitors/medium_prompts:style control"
+selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
+# Get the display name for the selected category
+selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key)  # Fallback to key if not found

 def read_elo_file(elo_results_file, leaderboard_table_file):
+    # Version from monitor.py, but no lazy_load or caching
+    print('Reading Elo file...')
     arena_dfs = {}
     category_elo_results = {}
+    last_updated_time = "N/A"  # Default value
+    elo_results = {}  # Default value
+    model_table_df = pd.DataFrame()  # Default value
+
+    try:
+        # Use context manager for file operations
+        with open(elo_results_file, "rb") as fin:
+            elo_results = pickle.load(fin)
+
+        # Try to get last updated time from primary or fallback categories
+        main_cat_key = "site_visitors/medium_prompts:style control"
+        fallback_cat_key_1 = "site_visitors/medium_prompts"
+        fallback_cat_key_2 = "full"  # Another fallback
+
+        if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
+            last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
+        elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
+            last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
+        elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
+            last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
+
+        # Iterate through defined category keys
+        for key in key_to_category_name.keys():
+            display_name = key_to_category_name[key]  # Get the display name
+            if key in elo_results:
+                # Check for required data within the category result
+                if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
+                    df = elo_results[key]["leaderboard_table_df"]
+                    # Filter by number of battles > 200
+                    # Store using the *display_name* as the key for consistency with dropdown/UI
+                    arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
+                    category_elo_results[display_name] = elo_results[key]
+                # else:
+                #     print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
+            # else:
+            #     print(f"Warning: Key '{key}' not found in elo_results")
+
+        # Load model metadata CSV
+        data = load_leaderboard_table_csv(leaderboard_table_file)
+        model_table_df = pd.DataFrame(data)

+    except FileNotFoundError:
+        print(f"Error: Elo results file not found at {elo_results_file}")
+        # Return empty structures
+    except Exception as e:
+        print(f"Error reading elo file: {e}")
+        traceback.print_exc()
+        # Return empty structures

+    # Ensure correct data types are returned even on error
     return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df

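Inferred from the reader above, `elo_results_*.pkl` is a dict keyed by category, each entry carrying `last_updated_datetime`, a `leaderboard_table_df`, and precomputed plot objects. A minimal hypothetical fixture for local testing:

```python
import pickle
import pandas as pd

fixture = {
    "site_visitors/medium_prompts:style control": {
        "last_updated_datetime": "2025-01-01 00:00:00",
        "leaderboard_table_df": pd.DataFrame(
            {
                "rating": [1100.0],
                "rating_q025": [1080.0],
                "rating_q975": [1120.0],
                "num_battles": [500],  # > 200, so it passes the filter
            },
            index=["model-a"],
        ),
        # Plot entries such as "win_fraction_heatmap" would sit here too.
    }
}
with open("elo_results_demo.pkl", "wb") as fout:
    pickle.dump(fixture, fout)

# read_elo_file("elo_results_demo.pkl", "leaderboard_table_demo.csv")
```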
| 578 |
def build_leaderboard_tab(
|
| 579 |
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
|
| 580 |
):
|
| 581 |
+
# Load data once during build time
|
| 582 |
+
try:
|
| 583 |
+
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
| 584 |
+
except Exception as e:
|
| 585 |
+
print(f"Failed to load initial data: {e}")
|
| 586 |
+
# Set empty defaults to prevent app crash
|
| 587 |
+
last_updated_time = "Error"
|
| 588 |
+
arena_dfs = {}
|
| 589 |
+
category_elo_results = {}
|
| 590 |
+
elo_results = {}
|
| 591 |
+
model_table_df = pd.DataFrame()
|
| 592 |
+
|
| 593 |
+
# Get data for the default selected category
|
| 594 |
+
# Use the *display name* derived from the selected key
|
| 595 |
+
if selected_category_display_name in arena_dfs:
|
| 596 |
+
arena_df = arena_dfs[selected_category_display_name]
|
| 597 |
+
elo_subset_results_init = category_elo_results[selected_category_display_name]
|
| 598 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
| 599 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
| 600 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
| 601 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
| 602 |
+
else:
|
| 603 |
+
# Fallback if default category is missing
|
| 604 |
+
fallback_cat_display_name = None
|
| 605 |
+
if actual_categories:
|
| 606 |
+
# Try the first actual category's display name
|
| 607 |
+
first_cat_key = actual_categories[0]
|
| 608 |
+
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
|
| 609 |
+
|
| 610 |
+
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
|
| 611 |
+
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
|
| 612 |
+
arena_df = arena_dfs[fallback_cat_display_name]
|
| 613 |
+
elo_subset_results_init = category_elo_results[fallback_cat_display_name]
|
| 614 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
| 615 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
| 616 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
| 617 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
| 618 |
+
else:
|
| 619 |
+
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
|
| 620 |
+
arena_df = pd.DataFrame() # Empty DataFrame
|
| 621 |
+
p1_init, p2_init, p3_init, p4_init = None, None, None, None
|
| 622 |
+
|
| 623 |
+
# Apply initial filtering to plots
|
| 624 |
+
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
|
| 625 |
+
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
|
| 626 |
+
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
|
| 627 |
+
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
|
| 628 |
+
|
| 629 |
+
default_md = make_default_md_1() # Parameters removed
|
| 630 |
+
default_md_2 = make_default_md_2() # Parameters removed
|
| 631 |
+
|
| 632 |
with gr.Row():
|
| 633 |
with gr.Column(scale=4):
|
| 634 |
+
# Removed Vote button
|
| 635 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
| 636 |
with gr.Column(scale=1):
|
| 637 |
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
|
| 638 |
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
|
| 639 |
|
| 640 |
+
# Generate initial table data
|
| 641 |
+
if not arena_df.empty and not model_table_df.empty:
|
| 642 |
+
# Pass the baseline DF and the model table; initially no subset difference is shown
|
| 643 |
+
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
|
| 644 |
+
else:
|
| 645 |
+
arena_table_vals_init = []
|
| 646 |
+
|
| 647 |
+
# Single "Arena" tab
|
| 648 |
+
with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
|
| 649 |
+
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
|
| 650 |
+
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
|
| 651 |
+
|
| 652 |
+
with gr.Row():
|
| 653 |
+
with gr.Column(scale=2):
|
| 654 |
+
# Use *display names* for choices if they differ significantly from keys,
|
| 655 |
+
# but here keys are descriptive enough. Callback receives the *key*.
|
| 656 |
+
category_dropdown = gr.Dropdown(
|
| 657 |
+
# Choices should be the *keys* corresponding to display names
|
| 658 |
+
choices=actual_categories,
|
| 659 |
+
value=selected_category_key, # Use the key for the default value
|
| 660 |
+
label="Category", # Translated
|
| 661 |
)
|
| 662 |
|
| 663 |
+
with gr.Column(scale=2):
|
| 664 |
+
category_checkbox = gr.CheckboxGroup(
|
| 665 |
+
# Use user-friendly translated labels
|
| 666 |
+
["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
|
| 667 |
+
label="Apply Filter",
|
| 668 |
+
info="",
|
| 669 |
+
value=[], # Filters off by default
|
| 670 |
)
|
| 671 |
|
| 672 |
+
# Category details
|
| 673 |
+
default_category_details = make_category_arena_leaderboard_md(
|
| 674 |
+
arena_df, arena_df, name=selected_category_display_name # Pass arena_df twice for initial display
|
| 675 |
+
) if not arena_df.empty else "No data for category"
|
| 676 |
|
| 677 |
+
with gr.Column(scale=4, variant="panel"):
|
| 678 |
+
category_deets = gr.Markdown(
|
| 679 |
+
default_category_details, elem_id="category_deets"
|
| 680 |
+
)
|
| 681 |
|
| 682 |
+
# DataFrame for displaying the table
|
| 683 |
+
# Initial view doesn't have 'Delta' column
|
| 684 |
+
arena_vals = pd.DataFrame(
|
| 685 |
+
arena_table_vals_init,
|
| 686 |
columns=[
|
| 687 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 688 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 689 |
]
|
| 690 |
+
) if arena_table_vals_init else pd.DataFrame(columns=[ # Handle empty initial data
|
| 691 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 692 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 693 |
+
])
|
| 694 |
+
|
| 695 |
+
# Sort by Elo for initial display
|
| 696 |
+
if "Arena Elo" in arena_vals.columns:
|
| 697 |
+
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
|
| 698 |
| 699 |
|
| 700 |
+
elo_display_df = gr.Dataframe(
|
| 701 |
+
headers=[ # Translated headers
|
| 702 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 703 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 704 |
+
],
|
| 705 |
+
datatype=[
|
| 706 |
+
"str", "markdown", "number", "str",
|
| 707 |
+
"number", "str", "str", "str"
|
| 708 |
+
],
|
| 709 |
+
value=arena_vals.style, # Apply Pandas styling if needed
|
| 710 |
+
elem_id="arena_leaderboard_dataframe",
|
| 711 |
+
height=700,
|
| 712 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths from monitor.py
|
| 713 |
+
wrap=True,
|
| 714 |
)
|
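+
# Note: gr.Dataframe accepts a pandas Styler as its value, which is how per-cell styling survives into the rendered table
|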
| 715 |
|
| 716 |
+
gr.Markdown(elem_id="leaderboard_markdown") # Empty markdown for spacing
|
| 717 |
|
| 718 |
+
plot_1, plot_2, plot_3, plot_4 = None, None, None, None # Initialize plot variables
|
| 719 |
+
more_stats_md = None # Initialize markdown variable
|
| 720 |
+
if show_plot:
|
| 721 |
+
more_stats_md = gr.Markdown(
|
| 722 |
+
f"""## More Statistics for Chatbot Arena""", # Translated
|
| 723 |
+
elem_id="leaderboard_header_markdown",
|
| 724 |
+
)
|
| 725 |
+
with gr.Row(elem_id="leaderboard_bars"): # Use ID from monitor.py
|
| 726 |
+
with gr.Column():
|
| 727 |
+
gr.Markdown( # Translated title
|
| 728 |
+
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
|
| 729 |
+
elem_id="plot-title",
|
| 730 |
+
)
|
| 731 |
+
plot_3 = gr.Plot(p3_init, show_label=False) # Use initial data
|
| 732 |
+
with gr.Column():
|
| 733 |
+
gr.Markdown( # Translated title
|
| 734 |
+
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
|
| 735 |
+
elem_id="plot-title",
|
| 736 |
+
)
|
| 737 |
+
plot_4 = gr.Plot(p4_init, show_label=False) # Use initial data
|
| 738 |
+
with gr.Row(elem_id="leaderboard_plots"): # Use ID from monitor.py
|
| 739 |
+
with gr.Column():
|
| 740 |
+
gr.Markdown( # Translated title
|
| 741 |
+
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
|
| 742 |
+
elem_id="plot-title",
|
| 743 |
+
)
|
| 744 |
+
plot_1 = gr.Plot(
|
| 745 |
+
p1_init, show_label=False, elem_id="plot-container" # Use initial data
|
| 746 |
+
)
|
| 747 |
+
with gr.Column():
|
| 748 |
+
gr.Markdown( # Translated title
|
| 749 |
+
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
|
| 750 |
+
elem_id="plot-title",
|
| 751 |
+
)
|
| 752 |
+
plot_2 = gr.Plot(p2_init, show_label=False) # Use initial data
|
| 753 |
|
| 754 |
+
def update_leaderboard_df(arena_table_vals):
|
| 755 |
+
# Add error handling for empty or incorrect data
|
| 756 |
+
# Expects 9 columns when Delta is present
|
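+
# Illustrative row shape (hypothetical values): ["1", 2, "<a ...>model</a>", 1500, "+5/-5", 12345, "Org", "MIT", "2024-01"]
|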
| 757 |
+
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
|
| 758 |
+
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
|
| 759 |
+
# Return an empty styled DataFrame to avoid Gradio errors
|
| 760 |
+
empty_styled = pd.DataFrame(columns=[
|
| 761 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 762 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 763 |
+
]).style
|
| 764 |
+
return empty_styled
|
| 765 |
|
| 766 |
+
try:
|
| 767 |
+
elo_dataframe = pd.DataFrame(
|
| 768 |
+
arena_table_vals,
|
| 769 |
+
columns=[
|
| 770 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 771 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 772 |
],
|
| 773 |
)
|
| 774 |
+
|
| 775 |
+
def highlight_max(s):
|
| 776 |
+
# Check rank string for arrows
|
| 777 |
+
return [
|
| 778 |
+
"color: green; font-weight: bold" if "β" in str(v) else
|
| 779 |
+
"color: red; font-weight: bold" if "β" in str(v) else ""
|
| 780 |
+
for v in s
|
| 781 |
+
]
|
| 782 |
+
|
| 783 |
+
def highlight_rank_max(s):
|
| 784 |
+
# Check Delta value (ensure it's numeric)
|
| 785 |
+
return [
|
| 786 |
+
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
|
| 787 |
+
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
|
| 788 |
+
for v in s
|
| 789 |
+
]
|
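+
# Styler.apply works column-wise by default: each subset column is passed as a Series and the helpers return one CSS string per cell
|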
| 790 |
+
# Apply styles
|
| 791 |
+
styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
|
| 792 |
+
highlight_rank_max, subset=["Delta"]
|
| 793 |
)
|
| 794 |
+
return styled_df
|
| 795 |
|
| 796 |
+
except Exception as e:
|
| 797 |
+
print(f"Error applying styles in update_leaderboard_df: {e}")
|
| 798 |
+
traceback.print_exc()
|
| 799 |
+
# Return unstyled DataFrame on error
|
| 800 |
+
return pd.DataFrame(arena_table_vals, columns=[
|
| 801 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 802 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 803 |
+
]).style
|
| 804 |
+
|
| 805 |
+
def update_leaderboard_and_plots(category_key, filters): # Receives category *key* from dropdown
|
| 806 |
+
# No caching
|
| 807 |
+
# Reload data on each call
|
| 808 |
+
try:
|
| 809 |
+
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
| 810 |
+
except Exception as e:
|
| 811 |
+
print(f"Error reloading data in callback: {e}")
|
| 812 |
+
# Return empty updates to prevent UI crash
|
| 813 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) # Empty DataFrame
|
| 814 |
+
empty_plot_update = gr.Plot(value=None) # Empty Plot
|
| 815 |
+
empty_md_update = gr.Markdown(value="Error loading data.") # Error Markdown
|
| 816 |
+
# Match the number of outputs expected by the .change() call
|
| 817 |
+
num_plots = 4  # outputs_list always registers four plot slots (hidden placeholders when show_plot is False)
|
| 818 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
| 819 |
+
|
| 820 |
+
|
| 821 |
+
# Use the display name corresponding to the selected key
|
| 822 |
+
category_display_name = key_to_category_name.get(category_key, category_key)
|
| 823 |
+
|
| 824 |
+
# Check if data exists for the selected category (using display name as key now)
|
| 825 |
+
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
|
| 826 |
+
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
|
| 827 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
|
| 828 |
+
empty_plot_update = gr.Plot(value=None)
|
| 829 |
+
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
|
| 830 |
+
num_plots = 4  # outputs_list always registers four plot slots (hidden placeholders when show_plot is False)
|
| 831 |
+
# Match the number of outputs
|
| 832 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
| 833 |
+
|
| 834 |
+
# Get the specific data slices using the display name
|
| 835 |
+
arena_subset_df = current_arena_dfs[category_display_name]
|
| 836 |
+
elo_subset_results = current_category_elo_results[category_display_name]
|
| 837 |
+
|
| 838 |
+
# Use the hardcoded baseline key, get its display name
|
| 839 |
+
baseline_key = "site_visitors/medium_prompts:style control"
|
| 840 |
+
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
|
| 841 |
+
|
| 842 |
+
# Fallback if baseline is missing
|
| 843 |
+
if baseline_display_name not in current_arena_dfs:
|
| 844 |
+
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
|
| 845 |
+
baseline_display_name = category_display_name # Fallback to the selected category itself
|
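+
# With baseline == selected category, the Delta column is skipped further down, so the fallback stays consistent
|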
| 846 |
+
|
| 847 |
+
arena_df_baseline = current_arena_dfs[baseline_display_name]
|
| 848 |
+
|
| 849 |
+
|
| 850 |
+
hidden_models_list = None # Default: show all
|
| 851 |
+
# Check filter labels (must match the translated CheckboxGroup choices)
|
| 852 |
+
if "Show Deprecated" not in filters:
|
| 853 |
+
hidden_models_list = deprecated_model_name.copy() # Hide deprecated
|
| 854 |
+
|
| 855 |
+
if "Only <10B Models" in filters:
|
| 856 |
+
# Get all models currently in the baseline view
|
| 857 |
+
all_models_in_view = arena_df_baseline.index.tolist()
|
| 858 |
+
# Find models *not* in the allowed list
|
| 859 |
+
models_to_hide = [model for model in all_models_in_view if model not in models_10b]
|
| 860 |
+
|
| 861 |
+
if hidden_models_list is None: # If deprecated are not hidden
|
| 862 |
+
hidden_models_list = models_to_hide
|
| 863 |
+
else: # If deprecated are already hidden, add the non-<10B ones
|
| 864 |
+
# Use set to avoid duplicates
|
| 865 |
+
hidden_models_list = list(set(hidden_models_list + models_to_hide))
|
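+
# e.g. with both filters active this hides the union of deprecated models and models not in models_10b
|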
| 866 |
+
|
| 867 |
+
arena_table_values = get_arena_table(
|
| 868 |
+
arena_df_baseline, # Use the determined baseline DataFrame
|
| 869 |
+
current_model_table_df,
|
| 870 |
+
# Pass subset only if it's different from the baseline
|
| 871 |
+
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
|
| 872 |
+
hidden_models=hidden_models_list
|
| 873 |
)
|
| 874 |
|
| 875 |
+
dataframe_update = None
|
| 876 |
+
# Show Delta column only if category is not the baseline and data exists
|
| 877 |
+
if category_display_name != baseline_display_name and arena_table_values:
|
| 878 |
+
styled_arena_values = update_leaderboard_df(arena_table_values) # Apply styling with Delta
|
| 879 |
+
# Check if styling was successful
|
| 880 |
+
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
|
| 881 |
+
dataframe_update = gr.Dataframe(
|
| 882 |
+
headers=[ # Headers including Delta
|
| 883 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
| 884 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 885 |
+
],
|
| 886 |
+
datatype=[
|
| 887 |
+
"str", "number", "markdown", "number", "str",
|
| 888 |
+
"number", "str", "str", "str"
|
| 889 |
+
],
|
| 890 |
+
value=styled_arena_values, # Pass the Styler object
|
| 891 |
+
elem_id="arena_leaderboard_dataframe",
|
| 892 |
+
height=700,
|
| 893 |
+
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], # Widths with Delta
|
| 894 |
+
wrap=True,
|
| 895 |
+
)
|
| 896 |
+
else: # Handle styling failure
|
| 897 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
| 898 |
+
|
| 899 |
+
else: # Baseline category or no data for Delta
|
| 900 |
+
# Ensure data exists before creating DataFrame
|
| 901 |
+
if arena_table_values:
|
| 902 |
+
# Create DataFrame without Delta column from the raw values
|
| 903 |
+
df_no_delta = pd.DataFrame(arena_table_values, columns=[
|
| 904 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 905 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 906 |
+
])
|
| 907 |
+
dataframe_update = gr.Dataframe(
|
| 908 |
+
headers=[ # Headers without Delta
|
| 909 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
| 910 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
| 911 |
+
],
|
| 912 |
+
datatype=[
|
| 913 |
+
"str", "markdown", "number", "str", "number",
|
| 914 |
+
"str", "str", "str"
|
| 915 |
+
],
|
| 916 |
+
value=df_no_delta.style, # Apply basic Pandas styling
|
| 917 |
+
elem_id="arena_leaderboard_dataframe",
|
| 918 |
+
height=700,
|
| 919 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths without Delta
|
| 920 |
+
wrap=True,
|
| 921 |
+
)
|
| 922 |
+
else:
|
| 923 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
| 924 |
+
|
| 925 |
+
plot_updates = [gr.Plot(value=None)] * 4 # Default empty plot updates
|
| 926 |
+
if show_plot:
|
| 927 |
+
p1_updated = elo_subset_results.get("win_fraction_heatmap")
|
| 928 |
+
p2_updated = elo_subset_results.get("battle_count_heatmap")
|
| 929 |
+
p3_updated = elo_subset_results.get("bootstrap_elo_rating")
|
| 930 |
+
p4_updated = elo_subset_results.get("average_win_rate_bar")
|
| 931 |
+
|
| 932 |
+
# Filter plots
|
| 933 |
+
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
|
| 934 |
+
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
|
| 935 |
+
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
|
| 936 |
+
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
|
| 937 |
+
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
|
| 938 |
+
|
| 939 |
+
|
| 940 |
+
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
|
| 941 |
+
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
|
| 942 |
+
|
| 943 |
+
# Use baseline DF for total counts, subset DF for category-specific counts
|
| 944 |
+
category_details_md_updated_text = make_category_arena_leaderboard_md(
|
| 945 |
+
arena_df_baseline, arena_subset_df, name=category_display_name # Pass display name
|
| 946 |
+
)
|
| 947 |
+
category_deets_update = gr.Markdown(value=category_details_md_updated_text)
|
| 948 |
|
| 949 |
+
# Return updates in the correct order matching outputs list
|
| 950 |
+
# Order: df, p1, p2, p3, p4, more_stats_md, category_deets
|
| 951 |
+
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
|
| 952 |
|
| 953 |
|
| 954 |
+
# Define output components (must exist in the UI build)
|
| 955 |
+
outputs_list = [elo_display_df]
|
| 956 |
+
if show_plot:
|
| 957 |
+
# Add plot components if they exist
|
| 958 |
+
outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
|
| 959 |
+
# Add markdown component if it exists
|
| 960 |
+
if more_stats_md: outputs_list.append(more_stats_md)
|
| 961 |
+
else: outputs_list.append(gr.Markdown(visible=False)) # Placeholder if MD wasn't created
|
| 962 |
+
else:
|
| 963 |
+
# Add placeholders if plots/MD are not shown
|
| 964 |
+
outputs_list.extend([gr.Plot(visible=False)] * 4)
|
| 965 |
+
outputs_list.append(gr.Markdown(visible=False))
|
| 966 |
+
outputs_list.append(category_deets) # Always update category details
|
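+
# Resulting order: [elo_display_df, plot_1..plot_4 (or placeholders), more_stats_md (or placeholder), category_deets], seven outputs in total
|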
| 967 |
+
|
| 968 |
+
# Attach change listeners
|
| 969 |
+
category_dropdown.change(
|
| 970 |
+
fn=update_leaderboard_and_plots,
|
| 971 |
+
inputs=[category_dropdown, category_checkbox],
|
| 972 |
+
outputs=outputs_list
|
| 973 |
+
)
|
| 974 |
+
category_checkbox.change(
|
| 975 |
+
fn=update_leaderboard_and_plots, # Use the same function
|
| 976 |
+
inputs=[category_dropdown, category_checkbox],
|
| 977 |
+
outputs=outputs_list
|
| 978 |
+
)
|
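+
# Both listeners reuse the same callback, so a category switch and a filter toggle always re-read the data identically
|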
| 979 |
|
| 980 |
|
| 981 |
+
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
|
| 982 |
+
if show_plot:
|
| 983 |
+
# Add plots if they were created
|
| 984 |
+
return_components.extend([plot_1, plot_2, plot_3, plot_4])
|
| 985 |
+
# Add the extra stats markdown if it was created
|
| 986 |
+
if more_stats_md: return_components.append(more_stats_md)
|
| 987 |
|
| 988 |
|
| 989 |
+
return return_components
|
| 990 |
|
| 991 |
|
| 992 |
+
def build_demo(elo_results_file, leaderboard_table_file):
|
| 993 |
+
# Assumes block_css is available or defined elsewhere
|
| 994 |
+
try:
|
| 995 |
+
from fastchat.serve.gradio_web_server import block_css
|
| 996 |
+
except ImportError:
|
| 997 |
+
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
|
| 998 |
+
# Define a minimal fallback CSS or copy the content here
|
| 999 |
+
block_css = """
|
| 1000 |
+
/* Add minimal CSS rules here if needed */
|
| 1001 |
+
#arena_leaderboard_dataframe table { font-size: 105%; }
|
| 1002 |
+
#leaderboard_markdown .prose { font-size: 110% !important; }
|
| 1003 |
+
.app { max-width: 100% !important; padding: 20px !important; }
|
| 1004 |
+
a { color: #1976D2; text-decoration: none; }
|
| 1005 |
+
a:hover { color: #63A4FF; text-decoration: underline; }
|
| 1006 |
+
"""
|
| 1007 |
|
| 1008 |
+
text_size = gr.themes.sizes.text_lg
|
| 1009 |
+
# Assumes theme.json is present
|
| 1010 |
+
try:
|
| 1011 |
+
theme = gr.themes.Default.load("theme.json")
|
| 1012 |
+
except Exception:
|
| 1013 |
+
print("Warning: theme.json not found. Using default Gradio theme.")
|
| 1014 |
+
theme = gr.themes.Default(text_size=text_size) # Fallback theme
|
| 1015 |
+
|
| 1016 |
+
if hasattr(theme, 'text_size'): theme.text_size = text_size
|
| 1017 |
+
# Apply custom settings if theme object supports it
|
| 1018 |
+
if hasattr(theme, 'set'):
|
| 1019 |
+
theme.set(
|
| 1020 |
+
button_large_text_size="40px",
|
| 1021 |
+
button_small_text_size="40px",
|
| 1022 |
+
button_large_text_weight="1000",
|
| 1023 |
+
button_small_text_weight="1000",
|
| 1024 |
+
button_shadow="*shadow_drop_lg",
|
| 1025 |
+
button_shadow_hover="*shadow_drop_lg",
|
| 1026 |
+
checkbox_label_shadow="*shadow_drop_lg",
|
| 1027 |
+
button_shadow_active="*shadow_inset",
|
| 1028 |
+
button_secondary_background_fill="*primary_300",
|
| 1029 |
+
button_secondary_background_fill_dark="*primary_700",
|
| 1030 |
+
button_secondary_background_fill_hover="*primary_200",
|
| 1031 |
+
button_secondary_background_fill_hover_dark="*primary_500",
|
| 1032 |
+
button_secondary_text_color="*primary_800",
|
| 1033 |
+
button_secondary_text_color_dark="white",
|
| 1034 |
+
)
|
| 1035 |
|
| 1036 |
+
with gr.Blocks(
|
| 1037 |
+
title="LLM Arena: Leaderboard", # Translated title
|
| 1038 |
+
theme=theme,
|
| 1039 |
+
css=block_css, # Use loaded or fallback CSS
|
| 1040 |
+
) as demo:
|
| 1041 |
+
# Build only the leaderboard tab content
|
| 1042 |
+
# show_plot=True to display plots
|
| 1043 |
+
leader_components = build_leaderboard_tab(
|
| 1044 |
+
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
|
| 1045 |
+
)
|
| 1046 |
+
return demo
|
| 1047 |
|
| 1048 |
|
| 1049 |
if __name__ == "__main__":
|
| 1050 |
parser = argparse.ArgumentParser()
|
| 1051 |
+
parser.add_argument("--share", action="store_true", default=False) # Default False for HF
|
| 1052 |
parser.add_argument("--host", default="0.0.0.0")
|
| 1053 |
parser.add_argument("--port", type=int, default=7860)
|
| 1054 |
+
# Removed args specific to monitor.py
|
| 1055 |
args = parser.parse_args()
|
| 1056 |
+
try:
|
| 1057 |
+
elo_result_files = glob.glob("elo_results_*.pkl")
|
| 1058 |
+
if not elo_result_files:
|
| 1059 |
+
raise FileNotFoundError("No elo_results_*.pkl files found.")
|
| 1060 |
+
# More robust sorting extracting the number
|
| 1061 |
+
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
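+
# e.g. a hypothetical "elo_results_20240601.pkl" sorts by 20240601; a non-numeric suffix would raise ValueError here
|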
| 1062 |
+
elo_result_file = elo_result_files[-1]
|
| 1063 |
+
print(f"Using Elo results file: {elo_result_file}")
|
| 1064 |
+
except Exception as e:
|
| 1065 |
+
print(f"Error finding Elo results file: {e}")
|
| 1066 |
+
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
|
| 1067 |
+
exit(1) # Exit if file not found
|
| 1068 |
+
|
| 1069 |
+
try:
|
| 1070 |
+
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
|
| 1071 |
+
if not leaderboard_table_files:
|
| 1072 |
+
raise FileNotFoundError("No leaderboard_table_*.csv files found.")
|
| 1073 |
+
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
| 1074 |
+
leaderboard_table_file = leaderboard_table_files[-1]
|
| 1075 |
+
print(f"Using leaderboard table file: {leaderboard_table_file}")
|
| 1076 |
+
except Exception as e:
|
| 1077 |
+
print(f"Error finding leaderboard table file: {e}")
|
| 1078 |
+
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
|
| 1079 |
+
exit(1) # Exit if file not found
|
| 1080 |
|
| 1081 |
|
| 1082 |
demo = build_demo(elo_result_file, leaderboard_table_file)
|
| 1083 |
+
# Launch with args
|
| 1084 |
+
demo.launch(
|
| 1085 |
+
server_name=args.host,
|
| 1086 |
+
server_port=args.port,
|
| 1087 |
+
share=args.share,
|
| 1088 |
+
show_api=False
|
| 1089 |
+
)
|