| import gradio as gr | |
| from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns | |
| import pandas as pd | |
| from apscheduler.schedulers.background import BackgroundScheduler | |
| from huggingface_hub import snapshot_download | |
| from src.about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE | |
| from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION | |
| from src.display.css_html_js import custom_css | |
| from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoEvalColumn, ModelType, fields, WeightType, Precision | |
| from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN | |
| from src.populate import get_evaluation_queue_df, get_leaderboard_df | |
| from src.submission.submit import add_new_eval | |
| import random | |
| import matplotlib.pyplot as plt | |
| import re | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import numpy as np | |
| # === NEW: helper for prompt sensitivity (distribution of 'Best Prompt Id' across tasks and prompts) === | |
| def calculate_prompt_sensitivity(dataframe, tasks, prompt_ids): | |
| """ | |
| Computes a simple Prompt Sensitivity Index (PSI) over the tasks | |
| using the distribution of 'Best Prompt Id' across the provided prompt_ids. | |
| """ | |
| cv_per_task = [] | |
| for task in tasks: | |
| prompt_col = f"{task} Best Prompt Id" | |
| task_accuracies = [] | |
| for pid in prompt_ids: | |
| total = len(dataframe[prompt_col].dropna()) if prompt_col in dataframe.columns else 0 | |
| count = (dataframe[prompt_col] == pid).sum() if prompt_col in dataframe.columns else 0 | |
| acc = (count / total * 100) if total > 0 else 0 | |
| task_accuracies.append(acc) | |
| if task_accuracies: | |
| mean_acc = np.mean(task_accuracies) | |
| std_acc = np.std(task_accuracies) | |
| cv_per_task.append((std_acc / mean_acc) if mean_acc > 0 else 0) | |
| else: | |
| cv_per_task.append(0) | |
| mean_cv = np.mean(cv_per_task) if cv_per_task else 0 | |
| psi = 1.0 if mean_cv >= 0.5 else (mean_cv / 0.5) | |
| return psi, mean_cv, cv_per_task | |
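| # Usage sketch (illustrative): given a results frame with "<task> Best Prompt Id" columns holding ids 1..3, e.g. | |
| #   psi, mean_cv, cv_per_task = calculate_prompt_sensitivity(df, ["NER", "REL"], [1, 2, 3]) | |
| # mean_cv is the mean coefficient of variation of the best-prompt shares per task, | |
| # and psi saturates at 1.0 once mean_cv reaches 0.5. | |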
| def create_best_model_comparison_table(dataframe, lang: str | None = None, shot: str | None = None): | |
| """ | |
| Table with best overall model per task and the model with the best prompt score. | |
| Applies optional filters: | |
| - lang in {EN, IT, SL, SK, GR, PL} or None/"All" | |
| - shot in {"0","10"} or None/"All" (mapped to IS_FS False/True) | |
| """ | |
| tasks = ["NER", "REL", "RML", "HIS", "DIA"] | |
| df = dataframe.copy() | |
| if lang and lang != "All" and "LANG" in df.columns: | |
| df = df[df["LANG"] == lang] | |
| if shot and shot != "All" and "IS_FS" in df.columns: | |
| df = df[df["IS_FS"] == (shot == "10")] | |
| table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []} | |
| for task in tasks: | |
| if task not in df.columns or df.empty: | |
| continue | |
| # Best overall on task | |
| #max_idx = df[task].idxmax() | |
| max_idx = pd.to_numeric(df[task], errors='coerce').idxmax() | |
| try: | |
| model_raw = df.loc[max_idx, 'Model'] | |
| except Exception as e: | |
| break | |
| if isinstance(model_raw, str) and '<' in model_raw: | |
| match = re.search(r'>([^<]+)<', model_raw) | |
| model_name = match.group(1) if match else model_raw | |
| else: | |
| model_name = str(model_raw) | |
| comb_perf_value = df.loc[max_idx, task] | |
| # Best prompt row for task | |
| best_prompt_column = f"{task} Best Prompt" | |
| if best_prompt_column in df.columns and df[best_prompt_column].notna().any(): | |
| best_prompt_idx = pd.to_numeric(df[best_prompt_column], errors='coerce').idxmax() | |
| try: | |
| best_prompt_model_raw = df.loc[best_prompt_idx, 'Model'] | |
| except Exception as e: | |
| break | |
| if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw: | |
| match = re.search(r'>([^<]+)<', best_prompt_model_raw) | |
| best_prompt_model = match.group(1) if match else best_prompt_model_raw | |
| else: | |
| best_prompt_model = str(best_prompt_model_raw) | |
| best_prompt_accuracy = df.loc[best_prompt_idx, best_prompt_column] | |
| else: | |
| best_prompt_model = "n/a" | |
| best_prompt_accuracy = float('nan') | |
| table_data['Task'].append(task) | |
| table_data['Best Overall Model'].append(model_name) | |
| table_data['CPS'].append(f"{comb_perf_value:.2f}") | |
| table_data['Best Prompt Model'].append(best_prompt_model) | |
| table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a") | |
| fig = go.Figure(data=[go.Table( | |
| columnwidth=[60, 220, 60, 220, 60], | |
| header=dict( | |
| values=[f'<b>{col}</b>' for col in table_data.keys()], | |
| fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'], | |
| font=dict(color='white', size=12, family='Arial'), | |
| align='center', height=30 | |
| ), | |
| cells=dict( | |
| values=list(table_data.values()), | |
| fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]], | |
| font=dict(color='#2c3e50', size=11, family='Arial'), | |
| align=['center', 'left', 'center', 'left', 'center'], | |
| height=30 | |
| ) | |
| )]) | |
| subtitle = [] | |
| subtitle.append(lang if (lang and lang != "All") else "All languages") | |
| subtitle.append(f"{shot}-shot" if (shot and shot != "All") else "All shots") | |
| fig.update_layout( | |
| title={'text': f"Top Model per Task: CPS & Best Prompt β {', '.join(subtitle)}", | |
| 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, | |
| font=dict(family="Arial", size=11), | |
| height=420, margin=dict(l=20, r=20, t=50, b=80) | |
| ) | |
| return fig | |
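| # Usage sketch (illustrative): "All"/None means no filtering; shot="10" keeps IS_FS == True rows: | |
| #   fig = create_best_model_comparison_table(LEADERBOARD_DF, lang="IT", shot="10") | |
| #   fig = create_best_model_comparison_table(LEADERBOARD_DF, None, None)  # all languages, all shots | |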
| # === NEW: Best-model comparison table (no language/shot filters) === | |
| def create_best_model_comparison_table_without_lang(dataframe): | |
| """ | |
| Table with the best overall model per task (NER, REL, RML, HIS, DIA) and the model that | |
| achieves the best score with its own best prompt. | |
| """ | |
| tasks = ["NER", "REL", "RML", "HIS", "DIA"] | |
| table_data = {'Task': [], 'Best Overall Model': [], 'CPS': [], 'Best Prompt Model': [], 'Acc.': []} | |
| for task in tasks: | |
| if task not in dataframe.columns: | |
| continue | |
| # Best overall on the task's combined performance | |
| max_idx = dataframe[task].idxmax() | |
| model_raw = dataframe.loc[max_idx, 'Model'] | |
| if isinstance(model_raw, str) and '<' in model_raw: | |
| match = re.search(r'>([^<]+)<', model_raw) | |
| model_name = match.group(1) if match else model_raw | |
| else: | |
| model_name = str(model_raw) | |
| comb_perf_value = dataframe.loc[max_idx, task] | |
| # Model with the best prompt for this task | |
| best_prompt_column = f"{task} Best Prompt" | |
| if best_prompt_column in dataframe.columns: | |
| best_prompt_idx = dataframe[best_prompt_column].idxmax() | |
| best_prompt_model_raw = dataframe.loc[best_prompt_idx, 'Model'] | |
| if isinstance(best_prompt_model_raw, str) and '<' in best_prompt_model_raw: | |
| match = re.search(r'>([^<]+)<', best_prompt_model_raw) | |
| best_prompt_model = match.group(1) if match else best_prompt_model_raw | |
| else: | |
| best_prompt_model = str(best_prompt_model_raw) | |
| best_prompt_accuracy = dataframe.loc[best_prompt_idx, best_prompt_column] | |
| else: | |
| best_prompt_model = "n/a" | |
| best_prompt_accuracy = float('nan') | |
| table_data['Task'].append(task) | |
| table_data['Best Overall Model'].append(model_name) | |
| table_data['CPS'].append(f"{comb_perf_value:.2f}") | |
| table_data['Best Prompt Model'].append(best_prompt_model) | |
| table_data['Acc.'].append(f"{best_prompt_accuracy:.2f}" if isinstance(best_prompt_accuracy, (int, float)) else "n/a") | |
| fig = go.Figure(data=[go.Table( | |
| columnwidth=[60, 220, 60, 220, 60], | |
| header=dict( | |
| values=[f'<b>{col}</b>' for col in table_data.keys()], | |
| fill_color=['#2171b5', '#2171b5', '#2171b5', '#4292c6', '#4292c6'], | |
| font=dict(color='white', size=12, family='Arial'), | |
| align='center', height=30 | |
| ), | |
| cells=dict( | |
| values=list(table_data.values()), | |
| fill_color=[['#f0f0f0' if i % 2 == 0 else 'white' for i in range(len(table_data['Task']))]], | |
| font=dict(color='#2c3e50', size=11, family='Arial'), | |
| align=['center', 'left', 'center', 'left', 'center'], | |
| height=30 | |
| ) | |
| )]) | |
| fig.update_layout( | |
| title={'text': "Top Model per Task: CPS & Best Prompt (NER/REL)", | |
| 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, | |
| font=dict(family="Arial", size=11), | |
| height=420, margin=dict(l=20, r=20, t=50, b=80) | |
| ) | |
| fig.add_annotation( | |
| text=("Best Overall Model uses the task's primary metric (CPS). " | |
| "Best Prompt Model is the one whose own best prompt yields the highest score."), | |
| xref="paper", yref="paper", x=0.5, y=-0.20, showarrow=False, | |
| font=dict(size=11, color="gray", family="Arial"), align="center", xanchor="center" | |
| ) | |
| return fig | |
| def create_prompt_heatmap(dataframe, lang: str | None = None, shot: str | None = None): | |
| """ | |
| Heatmap of the share (%) of models whose BEST prompt for each task is each prompt id (p1..p3). | |
| Optional filters: | |
| - lang: None or one of EN/IT/SL/SK/GR/PL (None means All) | |
| - shot: None or "0"/"10" (None means All) mapped to IS_FS False/True | |
| """ | |
| tasks = ["NER", "REL", "RML", "HIS", "DIA"] | |
| df = dataframe.copy() | |
| # Language filter | |
| if lang and lang != "All" and "LANG" in df.columns: | |
| df = df[df["LANG"] == lang] | |
| # Shot filter -> IS_FS (10-shot=True, 0-shot=False) | |
| if shot and shot != "All" and "IS_FS" in df.columns: | |
| df = df[df["IS_FS"] == (shot == "10")] | |
| # Collect prompt ids present, normalize labels to p1..p3 | |
| def label_for(pid): | |
| if isinstance(pid, str): return pid | |
| try: return f"p{int(pid)}" | |
| except Exception: return str(pid) | |
| all_ids = set() | |
| for task in tasks: | |
| col = f"{task} Best Prompt Id" | |
| if col in df.columns: | |
| all_ids.update(df[col].dropna().unique()) | |
| prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0)) | |
| prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] or [1, 2, 3] | |
| y_tick_labels = [label_for(pid) for pid in prompt_ids_raw] | |
| matrix, hovers = [], [] | |
| for pid in prompt_ids_raw: | |
| row, hover_row = [], [] | |
| for task in tasks: | |
| col = f"{task} Best Prompt Id" | |
| if col in df.columns and len(df[col].dropna()) > 0: | |
| series = df[col].dropna() | |
| def same_pid(v): | |
| a = re.sub(r'[^0-9]', '', str(v)) | |
| b = re.sub(r'[^0-9]', '', str(pid)) | |
| return a == b and a != "" | |
| total = len(series) | |
| count = sum(same_pid(v) for v in series) | |
| pct = (count / total * 100) if total > 0 else 0 | |
| row.append(pct) | |
| hover_row.append(f"<b>{task} β {label_for(pid)}</b><br>Models: {count}/{total}<br>Percentage: {pct:.1f}%") | |
| else: | |
| row.append(0); hover_row.append(f"<b>{task} β {label_for(pid)}</b><br>No data") | |
| matrix.append(row); hovers.append(hover_row) | |
| fig = go.Figure(data=go.Heatmap( | |
| z=matrix, x=tasks, y=y_tick_labels, | |
| colorscale=[[0,'#f7fbff'],[0.2,'#deebf7'],[0.4,'#9ecae1'],[0.6,'#4292c6'],[0.8,'#2171b5'],[1,'#08519c']], | |
| text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix], | |
| texttemplate="%{text}", textfont={"size": 11, "family": "Arial"}, | |
| hovertemplate='%{customdata}<extra></extra>', customdata=hovers, | |
| colorbar=dict(title="% Models", ticksuffix="%"), | |
| zmin=0, zmax=100 | |
| )) | |
| title_parts = [] | |
| title_parts.append(lang if (lang and lang != "All") else "All languages") | |
| title_parts.append(f"{shot}-shot" if (shot and shot != "All") else "All shots") | |
| fig.update_layout( | |
| title={'text': f"Most Effective Prompts β {', '.join(title_parts)}", | |
| 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, | |
| xaxis_title="Task", yaxis_title="Prompt", | |
| font=dict(family="Arial", size=11), margin=dict(b=100), | |
| template="plotly_white", dragmode=False, height=420 | |
| ) | |
| fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True) | |
| return fig | |
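| # Usage sketch (illustrative): each cell is the share of models whose best prompt for the | |
| # task is that id, under the same lang/shot filters as the comparison table: | |
| #   fig = create_prompt_heatmap(LEADERBOARD_DF, lang=None, shot="0")  # zero-shot rows only | |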
| # === NEW: Prompt heatmap without language/shot filters (3 prompts: p1, p2, p3) === | |
| def create_prompt_heatmap_without_lang(dataframe): | |
| """ | |
| Heatmap of the share of models (in %) whose BEST prompt for the task is each prompt id, | |
| for each task, with exactly 3 prompts (p1, p2, p3). It supports columns storing | |
| ids as integers (1/2/3) or strings ('p1'/'p2'/'p3'). | |
| """ | |
| tasks = ["NER", "REL", "RML", "HIS", "DIA"] | |
| # Collect unique prompt ids as they appear (int or 'pX'); restrict to 3 prompts | |
| all_ids = set() | |
| for task in tasks: | |
| col = f"{task} Best Prompt Id" | |
| if col in dataframe.columns: | |
| all_ids.update(dataframe[col].dropna().unique()) | |
| # Normalize to display labels and preserve the original values as keys | |
| def label_for(pid): | |
| if isinstance(pid, str): | |
| return pid # e.g., 'p1' | |
| try: | |
| return f"p{int(pid)}" | |
| except Exception: | |
| return str(pid) | |
| prompt_ids_raw = sorted(list(all_ids), key=lambda x: int(re.sub(r'[^0-9]', '', str(x)) or 0)) | |
| # Optional: hard-limit to p1/p2/p3 if extra noise exists | |
| prompt_ids_raw = [pid for pid in prompt_ids_raw if label_for(pid) in {"p1", "p2", "p3"}] | |
| if not prompt_ids_raw: | |
| # Fallback to p1..p3 | |
| prompt_ids_raw = [1, 2, 3] | |
| y_tick_labels = [label_for(pid) for pid in prompt_ids_raw] | |
| matrix, hovers = [], [] | |
| for pid in prompt_ids_raw: | |
| row, hover_row = [], [] | |
| for task in tasks: | |
| col = f"{task} Best Prompt Id" | |
| if col in dataframe.columns: | |
| series = dataframe[col].dropna() | |
| # match values regardless of 'p1' vs 1 vs '1' | |
| def same_pid(v): | |
| a = re.sub(r'[^0-9]', '', str(v)) | |
| b = re.sub(r'[^0-9]', '', str(pid)) | |
| return a == b and a != "" | |
| total = len(series) | |
| count = sum(same_pid(v) for v in series) | |
| pct = (count / total * 100) if total > 0 else 0 | |
| row.append(pct) | |
| hover_row.append( | |
| f"<b>{task} β {label_for(pid)}</b><br>Models: {count}/{total}<br>Percentage: {pct:.1f}%" | |
| ) | |
| else: | |
| row.append(0); hover_row.append(f"<b>{task} β {label_for(pid)}</b><br>No data") | |
| matrix.append(row) | |
| hovers.append(hover_row) | |
| fig = go.Figure(data=go.Heatmap( | |
| z=matrix, x=tasks, y=y_tick_labels, | |
| colorscale=[[0,'#f7fbff'],[0.2,'#deebf7'],[0.4,'#9ecae1'],[0.6,'#4292c6'],[0.8,'#2171b5'],[1,'#08519c']], | |
| text=[[f"{val:.0f}%" if val is not None else "" for val in row] for row in matrix], | |
| texttemplate="%{text}", | |
| textfont={"size": 11, "family": "Arial"}, | |
| hovertemplate='%{customdata}<extra></extra>', | |
| customdata=hovers, | |
| colorbar=dict(title="% Models", ticksuffix="%"), | |
| zmin=0, zmax=100 | |
| )) | |
| fig.update_layout( | |
| title={'text': "Most Effective Prompts Across Models (NER/REL)", | |
| 'font': {'family': 'Arial', 'size': 14, 'color': '#2c3e50'}}, | |
| xaxis_title="Task", yaxis_title="Prompt", | |
| font=dict(family="Arial", size=11), | |
| margin=dict(b=120), template="plotly_white", dragmode=False, height=420 | |
| ) | |
| # PSI (optional info line) | |
| psi, mean_cv, _ = calculate_prompt_sensitivity( | |
| dataframe, tasks, prompt_ids_raw | |
| ) | |
| fig.add_annotation( | |
| text=f"Prompt Sensitivity (mean CV): {mean_cv:.2f}", | |
| xref="paper", yref="paper", x=0.3, y=1.12, showarrow=False, | |
| font=dict(size=11, color="#2c3e50", family="Arial") | |
| ) | |
| fig.update_xaxes(fixedrange=True); fig.update_yaxes(fixedrange=True) | |
| return fig | |
| def mean_of_max_per_field(df): | |
| """ | |
| Compute the maximum for each field and then the mean of those maxima. | |
| Args: | |
| df (pd.DataFrame): DataFrame with columns TE, SA, HS, AT, WIC, FAQ, LS, SU, NER, REL, RML, DIA, HIS | |
| Returns: | |
| float: mean of the per-field maximum values | |
| """ | |
| #fields = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] | |
| fields = ["NER", "REL", "RML", "DIA", "HIS"] | |
| #print(df.columns) | |
| # Check that all required columns exist in the DataFrame | |
| missing = [f for f in fields if f not in df.columns] | |
| if missing: | |
| raise ValueError(f"The following columns are missing from the DataFrame: {missing}") | |
| # Compute the maximum for each field | |
| max_values = df[fields].apply(pd.to_numeric, errors='coerce').max(skipna=True) | |
| # Compute the mean of the maxima | |
| mean_max = max_values.mean() | |
| return mean_max | |
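| # Usage sketch (illustrative): the returned value is used below as the theoretical ceiling | |
| # of the average combined performance: | |
| #   theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF) | |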
| def barplot_mean_few_minus_zero_shot(dataframe, tasks=None): | |
| if tasks is None: | |
| tasks = [ "NER", "REL", "RML", "DIA", "HIS"] | |
| task_means = {} | |
| for task in tasks: | |
| if task not in dataframe.columns: | |
| continue | |
| # Split few-shot and zero-shot rows | |
| few_shot = dataframe[dataframe['IS_FS'] == True][["Model", task]] | |
| zero_shot = dataframe[dataframe['IS_FS'] == False][["Model", task]] | |
| # Align the models across the two settings | |
| merged = pd.merge(few_shot, zero_shot, on="Model", suffixes=("_few", "_zero")) | |
| # Drop rows with missing values | |
| merged = merged.dropna(subset=[f"{task}_few", f"{task}_zero"]) | |
| if merged.empty: | |
| continue | |
| # Compute the few-shot minus zero-shot difference | |
| diff = merged[f"{task}_few"] - merged[f"{task}_zero"] | |
| # Compute the mean | |
| task_means[task] = diff.mean() | |
| # Build the bar plot | |
| fig = go.Figure([go.Bar( | |
| x=list(task_means.keys()), | |
| y=list(task_means.values()), | |
| marker_color="#ff7f0e", | |
| text=[f"{v:.2f}" for v in task_means.values()], | |
| textposition="outside", | |
| hovertemplate="<b>%{x}</b><br>Mean Delta Accuracy: %{y:.2f}%<extra></extra>" | |
| )]) | |
| # Reference line at 0 | |
| ''' | |
| fig.add_shape( | |
| type="line", | |
| x0=-0.5, x1=len(task_means) - 0.5, | |
| y0=0, y1=0, | |
| line=dict(color="black", width=2, dash="dash"), | |
| xref="x", yref="y" | |
| ) | |
| ''' | |
| fig.update_layout( | |
| title="Mean Accuracy Difference (Few-shot β Zero-shot) per Task", | |
| xaxis_title="", | |
| yaxis_title="Mean Delta Combined Performance", | |
| template="plotly_white", | |
| font=dict(family="Arial", size=13), | |
| #margin=dict(b=100) | |
| ) | |
| fig.add_annotation( | |
| text="10-shot learning generally outperforms zero-shot. <br>" | |
| "", | |
| xref="paper", yref="paper", | |
| x=0, y=-0.2, | |
| showarrow=False, | |
| font=dict(size=11, color="gray"), | |
| align="left" | |
| ) | |
| return fig | |
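| # Usage sketch (illustrative): each bar is the mean of (few-shot − zero-shot) Combined | |
| # Performance over the models that have both runs for that task: | |
| #   fig = barplot_mean_few_minus_zero_shot(LEADERBOARD_DF) | |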
| def boxplot_per_task(dataframe=None, baselines=None, references=None): | |
| #print(dataframe.columns) | |
| #tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] | |
| tasks =["NER", "REL", "RML", "HIS", "DIA"] | |
| if dataframe is None: | |
| np.random.seed(42) | |
| dataframe = pd.DataFrame({ | |
| task: np.random.uniform(0.4, 0.9, 20) * 100 | |
| for task in tasks | |
| }) | |
| if baselines is None: | |
| baselines = {task: np.random.randint(50, 70) for task in tasks} | |
| colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", | |
| "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"] | |
| fig = go.Figure() | |
| for i, task in enumerate(tasks): | |
| if task in dataframe.columns: | |
| y_data = dataframe[task].dropna().tolist() | |
| # boxplot | |
| fig.add_trace(go.Box( | |
| y=y_data, | |
| name=task, | |
| marker=dict(color=colors[i]), | |
| line=dict(color="black", width=2), | |
| fillcolor=colors[i], | |
| opacity=0.7, | |
| hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>", | |
| width=0.6, | |
| whiskerwidth=0.2, | |
| quartilemethod="linear" | |
| )) | |
| # baseline | |
| #if task in baselines and baselines[task] is not None: | |
| #fig.add_shape( | |
| # type="line", | |
| # x0=i - 0.3, x1=i + 0.3, | |
| # y0=baselines[task], y1=baselines[task], | |
| # line=dict(color="black", width=2, dash="dot"), # piΓΉ visibile | |
| # xref="x", yref="y" | |
| #) | |
| #''' | |
| #fig.add_annotation( | |
| #x=i, y=baselines[task], | |
| #text=f"{baselines[task]}%", | |
| #showarrow=False, | |
| #yshift=10, | |
| #font=dict(size=10, color="black") | |
| #) | |
| #''' | |
| # reference GPT-4o | |
| # if task in references and references[task] is not None: | |
| # fig.add_shape( | |
| # type="line", | |
| # x0=i - 0.3, x1=i + 0.3, | |
| # y0=references[task], y1=references[task], | |
| # line=dict(color="red", width=2, dash="dashdot"), | |
| # xref="x", yref="y" | |
| # ) | |
| fig.update_layout( | |
| title="Distribution of Model Accuracy by Task", | |
| xaxis_title="Task", | |
| yaxis_title="Combined Performance", | |
| template="plotly_white", | |
| boxmode="group", | |
| dragmode=False, | |
| font=dict(family="Arial", size=10), | |
| margin=dict(b=80), | |
| ) | |
| fig.add_annotation( | |
| text=("" | |
| #"In tasks like TE and SA, models approach the accuracy of supervised <br>" | |
| #"models at EVALITA (dashed black line); in NER and REL they remain lower. <br>" | |
| # "Dashed red lines show GPT-4o reference results for generative tasks." | |
| ), | |
| xref="paper", yref="paper", | |
| x=0.5, y=-0.30, | |
| showarrow=False, | |
| font=dict(size=11, color="gray"), | |
| align="left" | |
| ) | |
| fig.update_yaxes(range=[0, 100], fixedrange=True) | |
| return fig | |
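| # Usage sketch (illustrative): without arguments the function generates random demo data; | |
| # in this app it is called on the real leaderboard frame: | |
| #   fig = boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES) | |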
| # EVALITA results | |
| BASELINES = { | |
| "TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00, | |
| "LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99 | |
| } | |
| # GPT-4o | |
| REFERENCES = { | |
| "NER": 79.11, | |
| "REL": 63.32, | |
| "LS": 59.25, | |
| "SU": 33.04 | |
| } | |
| def boxplot_prompts_per_task(dataframe, tasks=None): | |
| if tasks is None: | |
| tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"] | |
| # List of columns to update | |
| cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"] | |
| # Apply the transformation | |
| for col in cols_to_update: | |
| dataframe[col] = dataframe[col].replace({1: 7, 2: 8}) | |
| fig = go.Figure() | |
| # Lists used to create a single legend entry for Average and Best | |
| avg_x, avg_y = [], [] | |
| best_x, best_y, best_text = [], [], [] | |
| for task in tasks: | |
| avg_col = f"{task} Prompt Average" | |
| best_col = f"{task} Best Prompt" | |
| best_id_col = f"{task} Best Prompt Id" | |
| if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]): | |
| avg_value = dataframe[avg_col].mean() | |
| avg_x.append(task) | |
| avg_y.append(avg_value) | |
| best_value = dataframe[best_col].mean() | |
| best_x.append(task) | |
| best_y.append(best_value) | |
| best_id = dataframe[best_id_col].mode()[0] # Most frequent best prompt id | |
| best_text.append(f"P:{best_id}") | |
| # Average Accuracy bars (blue) | |
| fig.add_trace(go.Bar( | |
| x=avg_x, | |
| y=avg_y, | |
| name="Avg. Accuracy", | |
| marker_color="#1f77b4", | |
| )) | |
| # Best Prompt bars (red) | |
| fig.add_trace(go.Bar( | |
| x=best_x, | |
| y=best_y, | |
| name="Best Prompt", | |
| marker_color="#d62728", | |
| )) | |
| # Text above the Best Prompt bars showing the prompt ID | |
| for x, y, text in zip(best_x, best_y, best_text): | |
| fig.add_annotation( | |
| x=x, | |
| y=y + 3, # slightly above the bar | |
| text=text, | |
| showarrow=False, | |
| font=dict(size=12, color="black") | |
| ) | |
| fig.update_layout( | |
| title= "Prompt Accuracy: Avg vs Best", | |
| xaxis_title="Task", | |
| yaxis_title="Combined Performance", | |
| barmode='group', | |
| template="plotly_white", | |
| font=dict(family="Arial", size=10), | |
| yaxis=dict(range=[0, 100], fixedrange=True) | |
| ) | |
| # caption as a separate annotation | |
| fig.add_annotation( | |
| text="There is no single prompt that performs best across all tasks.<br>" | |
| "Different prompts achieve the highest accuracy on different tasks.", | |
| xref="paper", yref="paper", | |
| x=0.5, y=-0.3, | |
| showarrow=False, | |
| font=dict(size=11, color="gray"), | |
| align="center", | |
| xanchor="center" | |
| ) | |
| return fig | |
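| # Usage sketch (illustrative, not wired into the UI): expects "<task> Prompt Average", | |
| # "<task> Best Prompt" and "<task> Best Prompt Id" columns, and remaps ids 1/2 to 7/8 in the | |
| # REL/NER/SU/LS "Best Prompt Id" columns before plotting, e.g. | |
| #   fig = boxplot_prompts_per_task(LEADERBOARD_DF.copy()) | |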
| def line_chart(dataframe): | |
| # Normalize sizes so that markers are neither too small nor too large | |
| def scale_sizes(values, min_size=8, max_size=30): | |
| vmin, vmax = min(values), max(values) | |
| return [ | |
| min_size + (val - vmin) / (vmax - vmin) * (max_size - min_size) if vmax > vmin else (min_size + max_size) / 2 | |
| for val in values | |
| ] | |
| # Split data by IS_FS | |
| df_true = dataframe[dataframe['IS_FS'] == True] | |
| df_false = dataframe[dataframe['IS_FS'] == False] | |
| # Extract x, y values and labels | |
| x_true = df_true['#Params (B)'].tolist() | |
| y_true = df_true['Avg. Comb. Perf. ⬆️'].tolist() | |
| labels_true = [re.search(r'>([^<]+)<', m).group(1) for m in df_true['Model'].tolist()] | |
| x_false = df_false['#Params (B)'].tolist() | |
| y_false = df_false['Avg. Comb. Perf. ⬆️'].tolist() | |
| labels_false = [re.search(r'>([^<]+)<', m).group(1) for m in df_false['Model'].tolist()] | |
| fig = go.Figure() | |
| # Points with IS_FS=True (10-shot) | |
| fig.add_trace(go.Scatter( | |
| x=x_true, | |
| y=y_true, | |
| mode='markers', | |
| name='10-Shot', | |
| marker=dict( | |
| color='blue', | |
| size=scale_sizes(x_true) | |
| ), | |
| hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>', | |
| customdata=labels_true | |
| )) | |
| # Points with IS_FS=False (0-shot) | |
| fig.add_trace(go.Scatter( | |
| x=x_false, | |
| y=y_false, | |
| mode='markers', | |
| name='0-Shot', | |
| marker=dict( | |
| color='red', | |
| size=scale_sizes(x_false) | |
| ), | |
| hovertemplate='<b>%{customdata}</b><br>#Params: %{x}<br>Performance: %{y}<extra></extra>', | |
| customdata=labels_false | |
| )) | |
| # Find the best-performing model overall | |
| all_y = y_true + y_false | |
| all_x = x_true + x_false | |
| all_labels = labels_true + labels_false | |
| max_idx = all_y.index(max(all_y)) | |
| max_x = all_x[max_idx] | |
| max_y = all_y[max_idx] | |
| max_label = all_labels[max_idx] | |
| # Add a visible annotation for the best model | |
| fig.add_annotation( | |
| x=max_x, | |
| y=max_y, | |
| #text=f"Top: {max_label} ({max_y:.1f}%)", | |
| text=f"{max_label}", | |
| showarrow=True, | |
| arrowhead=2, | |
| arrowsize=1, | |
| arrowwidth=2, | |
| arrowcolor="black", | |
| font=dict(size=11, color="black"), | |
| xshift=10, | |
| yshift=10, | |
| ax = -30, ay = -20, # shift the label left of and above the point | |
| xanchor = "right" # right-align the label relative to the point | |
| ) | |
| fig.update_layout( | |
| title="Avg. Combined Performance vs #Params", | |
| xaxis_title="#Params (B)", | |
| yaxis_title="Avg. Combined Performance", | |
| template="plotly_white", | |
| hovermode="closest", | |
| font=dict(family="Arial", size=10), | |
| dragmode=False, | |
| xaxis=dict( | |
| tickvals=[0, 25, 50, 75, 100, 125], | |
| ticktext=["0", "25", "50", "75", "100", "125"] | |
| ), | |
| yaxis=dict( | |
| tickvals=[0, 20, 40, 60, 80, 100], # fixed ticks | |
| range=[0, 100] # locked range | |
| ) | |
| ) | |
| # Caption | |
| fig.add_annotation( | |
| text="Accuracy generally rises with #Params, but smaller models <br>" | |
| "with 10-shot can outperform larger zero-shot models.", | |
| xref="paper", yref="paper", | |
| x=0.5, y=-0.3, # centered | |
| showarrow=False, | |
| font=dict(size=11, color="gray"), | |
| align="center", | |
| xanchor="center" # π ancora centrata rispetto al testo | |
| ) | |
| fig.update_xaxes(fixedrange=True, rangeslider_visible=False) | |
| fig.update_yaxes(fixedrange=True) | |
| return fig | |
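| # Usage sketch (illustrative): expects "#Params (B)", "Avg. Comb. Perf. ⬆️", "IS_FS" and | |
| # HTML-wrapped "Model" values; marker size scales with the parameter count: | |
| #   fig = line_chart(LEADERBOARD_DF) | |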
| # Define task metadata (icons, names, descriptions) | |
| TASK_METADATA_MULTIPLECHOICE = { | |
| #"TE": {"icon": "π", "name": "Textual Entailment", "tooltip": ""}, | |
| #"SA": {"icon": "π", "name": "Sentiment Analysis", "tooltip": ""}, | |
| #"HS": {"icon": "β οΈ", "name": "Hate Speech", "tooltip": ""}, | |
| #"AT": {"icon": "π₯", "name": "Admission Test", "tooltip": ""}, | |
| #"WIC": {"icon": "π€", "name": "Word in Context", "tooltip": ""}, | |
| #"FAQ": {"icon": "β", "name": "Frequently Asked Questions", "tooltip": ""} | |
| } | |
| # Define task metadata (icons, names, descriptions) | |
| TASK_METADATA_GENERATIVE = { | |
| "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""}, | |
| "REL": {"icon": "π", "name": "Relation Extraction", "tooltip": ""}, | |
| "RML": {"icon": "π", "name": "CRF RML", "tooltip": "CRF RML"}, | |
| "DIA": {"icon": "π₯", "name": "CRF Diagnosis", "tooltip": "CRF Diagnosis"}, | |
| "HIS": {"icon": "π", "name": "CRF History", "tooltip": "CRF History"}, | |
| } | |
| def restart_space(): | |
| """Restart the Hugging Face space.""" | |
| API.restart_space(repo_id=REPO_ID) | |
| def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): | |
| """ | |
| Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected. | |
| The table is sorted based on the "Avg. Combined Performance" field. | |
| """ | |
| if dataframe is None or dataframe.empty: | |
| raise ValueError("Leaderboard DataFrame is empty or None.") | |
| #print("????????????????????????????????", mean_of_max_per_field(dataframe)) | |
| sorted_dataframe = dataframe.sort_values(by="Avg. Comb. Perf. ⬆️", ascending=False) | |
| sorted_dataframe = sorted_dataframe.reset_index(drop=True) | |
| sorted_dataframe["Rank"] = sorted_dataframe.index + 1 | |
| # Flags to track whether the medal has already been assigned per size category and evaluation mode | |
| large_medal_fs_assigned = False | |
| medium_medal_fs_assigned = False | |
| small_medal_fs_assigned = False | |
| large_medal_0shot_assigned = False | |
| medium_medal_0shot_assigned = False | |
| small_medal_0shot_assigned = False | |
| # Temporary list holding the new values of the Model column | |
| new_model_column = [] | |
| for _, row in sorted_dataframe.iterrows(): | |
| if row['IS_FS']: # 10-Few-Shot | |
| if row["Size"] == "π΅π΅π΅" and not large_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} π΅π΅π΅π") | |
| large_medal_fs_assigned = True | |
| elif row["Size"] == "π΅π΅" and not medium_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} π΅π΅π") | |
| medium_medal_fs_assigned = True | |
| elif row["Size"] == "π΅" and not small_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} π΅π") | |
| small_medal_fs_assigned = True | |
| else: | |
| new_model_column.append(row["Model"]) | |
| else: # 0-Shot | |
| if row["Size"] == "π΅π΅π΅" and not large_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} π΅π΅π΅ποΈ") | |
| large_medal_0shot_assigned = True | |
| elif row["Size"] == "π΅π΅" and not medium_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} π΅π΅ποΈ") | |
| medium_medal_0shot_assigned = True | |
| elif row["Size"] == "π΅" and not small_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} π΅ποΈ") | |
| small_medal_0shot_assigned = True | |
| else: | |
| new_model_column.append(row["Model"]) | |
| # List of columns to update | |
| #cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"] | |
| # Apply the transformation | |
| #for col in cols_to_update: | |
| # dataframe[col] = dataframe[col].replace({1: 7, 2: 8}) | |
| # Update the Model column | |
| sorted_dataframe["Model"] = new_model_column | |
| field_list = fields(AutoEvalColumn) | |
| return Leaderboard( | |
| value=sorted_dataframe, | |
| datatype=[c.type for c in field_list], | |
| search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], | |
| hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], | |
| filter_columns=[ | |
| ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "), | |
| ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "), | |
| ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max = 100, default = [0,100], label="Select the number of parameters (B)"), | |
| ], | |
| bool_checkboxgroup_label="Evaluation Mode", | |
| interactive=False, | |
| ) | |
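| # Usage sketch (illustrative): builds the main leaderboard, ranked by "Avg. Comb. Perf. ⬆️", | |
| # awarding one badge per size class and evaluation mode (few-shot vs zero-shot): | |
| #   lb = init_leaderboard(LEADERBOARD_DF, hidden_columns=[...]) | |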
| def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None): | |
| """ | |
| Update and return the leaderboard when a specific task is selected. | |
| The table is sorted based on the "Combined Performance" field. | |
| """ | |
| if dataframe is None or dataframe.empty: | |
| raise ValueError("Leaderboard DataFrame is empty or None.") | |
| #sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False) | |
| clean_df = dataframe.assign( **{"Combined Performance": pd.to_numeric(dataframe["Combined Performance"], errors="coerce")}).loc[lambda df: df["Combined Performance"].notna() & (df["Combined Performance"] != 0)] | |
| sorted_dataframe = clean_df.sort_values(by="Combined Performance", ascending=False) | |
| # add the Rank column based on row position | |
| sorted_dataframe = sorted_dataframe.reset_index(drop=True) | |
| sorted_dataframe["Rank"] = sorted_dataframe.index + 1 | |
| # Flags to track whether the medal has already been assigned per size category and evaluation mode | |
| large_medal_fs_assigned = False | |
| medium_medal_fs_assigned = False | |
| small_medal_fs_assigned = False | |
| large_medal_0shot_assigned = False | |
| medium_medal_0shot_assigned = False | |
| small_medal_0shot_assigned = False | |
| # Temporary list holding the new values of the Model column | |
| new_model_column = [] | |
| for _, row in sorted_dataframe.iterrows(): | |
| if row['IS_FS']: # 10-Few-Shot | |
| if row["Size"] == "🔵🔵🔵" and not large_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🔵🔵🏅") | |
| large_medal_fs_assigned = True | |
| elif row["Size"] == "🔵🔵" and not medium_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🔵🏅") | |
| medium_medal_fs_assigned = True | |
| elif row["Size"] == "🔵" and not small_medal_fs_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🏅") | |
| small_medal_fs_assigned = True | |
| else: | |
| new_model_column.append(row["Model"]) | |
| else: # 0-Shot | |
| if row["Size"] == "🔵🔵🔵" and not large_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🔵🔵🎖️") | |
| large_medal_0shot_assigned = True | |
| elif row["Size"] == "🔵🔵" and not medium_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🔵🎖️") | |
| medium_medal_0shot_assigned = True | |
| elif row["Size"] == "🔵" and not small_medal_0shot_assigned: | |
| new_model_column.append(f"{row['Model']} 🔵🎖️") | |
| small_medal_0shot_assigned = True | |
| else: | |
| new_model_column.append(row["Model"]) | |
| # Update the Model column | |
| sorted_dataframe["Model"] = new_model_column | |
| pd.set_option('display.max_colwidth', None) | |
| #print("========================", dataframe['Model']) | |
| #print(sorted_dataframe['Combined Performance']) | |
| field_list = fields(AutoEvalColumn) | |
| return Leaderboard( | |
| value=sorted_dataframe, | |
| #datatype=[c.type for c in field_list], | |
| datatype=[c.type for c in field_list] + [int], | |
| #select_columns=SelectColumns( | |
| # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default], | |
| # cant_deselect=[c.name for c in field_list if c.never_hidden], | |
| # label="Select Columns to Display:", | |
| #), | |
| search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], | |
| hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], | |
| filter_columns=[ | |
| ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Shot Learning (FS): "), | |
| ColumnFilter(AutoEvalColumn.LANG.name, type="checkboxgroup", label="Languages: "), | |
| ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=100, default=[0, 100], | |
| label="Select the number of parameters (B)"), | |
| ], | |
| bool_checkboxgroup_label="Evaluation Mode", | |
| interactive=False | |
| ) | |
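| # Usage sketch (illustrative): per-task columns are renamed to the generic names expected | |
| # here before the call, e.g. for NER: | |
| #   df_ner = LEADERBOARD_DF.rename(columns={"NER": "Combined Performance", "NER Best Prompt": "Best Prompt", ...}) | |
| #   lb = update_task_leaderboard(df_ner) | |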
| def download_snapshot(repo, local_dir): | |
| """Try to download a snapshot from Hugging Face Hub.""" | |
| try: | |
| print(f"Downloading from {repo} to {local_dir}...") | |
| snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) | |
| except Exception as e: | |
| print(f"Error downloading {repo}: {e}") | |
| restart_space() | |
| # Initialize the app by downloading snapshots | |
| download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) | |
| download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) | |
| # Load leaderboard data | |
| LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) | |
| finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) | |
| #print(LEADERBOARD_DF.columns.tolist()) | |
| theoretical_max_combined_perf = mean_of_max_per_field(LEADERBOARD_DF) | |
| # Prepare the main interface | |
| demo = gr.Blocks(css=custom_css) | |
| with demo: | |
| #gr.HTML(TITLE) | |
| gr.HTML( | |
| """ | |
| <div style="display: flex; align-items: center; position: relative; width: 100%; height: 60px; padding: 10px 0;"> | |
| <h1 style=" | |
| margin: 0 auto; | |
| font-weight: 900; | |
| font-size: 5.5em; | |
| letter-spacing: 2px; | |
| text-transform: uppercase; | |
| color: red; | |
| background: linear-gradient(90deg, #1f77b4, #00c6ff); | |
| -webkit-background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| text-shadow: 2px 2px 8px rgba(0.2,0,0,0); | |
| "> | |
| ECREAM-LLM Leaderboard | |
| </h1> | |
| </div> | |
| """ | |
| ) | |
| gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") | |
| # Charts are added HERE, right below the title bar and above the tabs | |
| with gr.Row(): | |
| gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart") | |
| gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task") | |
| # === NEW: second row with the 2 extra plots (NER/REL + p1..p3) === | |
| #with gr.Row(): | |
| #gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF), elem_id="prompt-heatmap") | |
| #gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF), elem_id="best-model-table") | |
| # === NEW: gray background wrapper for combos === | |
| with gr.Row(elem_id="filters-wrap"): | |
| lang_dd = gr.Dropdown( | |
| choices=["All", "EN", "IT", "SL", "SK", "GR", "PL"], | |
| value="All", label="Language: ", scale=1 | |
| ) | |
| shot_dd = gr.Dropdown( | |
| choices=["All", "0", "10"], | |
| value="All", label="N-Shot: ", scale=1 | |
| ) | |
| with gr.Row(): | |
| heatmap_plot = gr.Plot(value=create_prompt_heatmap(LEADERBOARD_DF, None, None), elem_id="prompt-heatmap") | |
| table_plot = gr.Plot(value=create_best_model_comparison_table(LEADERBOARD_DF, None, None), elem_id="best-model-table") | |
| def _update_both(lang, shot): | |
| return ( | |
| create_prompt_heatmap(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot), | |
| create_best_model_comparison_table(LEADERBOARD_DF, None if lang == "All" else lang, None if shot == "All" else shot) | |
| ) | |
| lang_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot]) | |
| shot_dd.change(_update_both, inputs=[lang_dd, shot_dd], outputs=[heatmap_plot, table_plot]) | |
| with gr.Tabs(elem_classes="tab-buttons") as tabs: | |
| # Main leaderboard tab | |
| with gr.TabItem("π Benchmark"): | |
| leaderboard = init_leaderboard( | |
| LEADERBOARD_DF, | |
| default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"], | |
| hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size', 'LANG', 'FS', 'Model', "Avg. Comb. Perf. ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL", "RML", "DIA", "HIS"]] | |
| ) | |
| # About tab | |
| with gr.TabItem("π About"): | |
| gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") | |
| # Task-specific leaderboards | |
| for task, metadata in TASK_METADATA_MULTIPLECHOICE.items(): | |
| with gr.TabItem(f"{metadata['icon']}{task}"): | |
| task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") | |
| gr.Markdown(task_description, elem_classes="markdown-text") | |
| leaderboard = update_task_leaderboard( | |
| LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Prompt Std": "Prompt Std", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), | |
| default_selection=['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id'], | |
| hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', 'Best Prompt Id']] | |
| ) | |
| # About tab | |
| with gr.TabItem("β", interactive=False): | |
| gr.Markdown("", elem_classes="markdown-text") | |
| # Task-specific leaderboards | |
| for task, metadata in TASK_METADATA_GENERATIVE.items(): | |
| with gr.TabItem(f"{metadata['icon']}{task}"): | |
| task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") | |
| gr.Markdown(task_description, elem_classes="markdown-text1") | |
| #print (LEADERBOARD_DF) | |
| leaderboard = update_task_leaderboard( | |
| LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", | |
| f"{task} Prompt Std": "Prompt Std", | |
| f"{task} Best Prompt": "Best Prompt", | |
| f"{task} Best Prompt Id": "Best Prompt Id", | |
| task: "Combined Performance"}), | |
| default_selection=['Rank', 'Size', 'LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', 'Best Prompt', | |
| 'Best Prompt Id'], | |
| hidden_columns=[col for col in LEADERBOARD_DF.columns if | |
| col not in ['Rank', 'Size','LANG', 'FS', 'Model', 'Combined Performance', 'Prompt Average', 'Prompt Std', | |
| 'Best Prompt', 'Best Prompt Id']] | |
| ) | |
| # Citation section | |
| with gr.Accordion("π Citation", open=False): | |
| gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) | |
| with gr.Accordion("π Credits", open=False): | |
| gr.Markdown( | |
| """ | |
| **This project has been funded by the European Union under the | |
| Horizon Europe eCREAM Project (Grant Agreement No. 101057726).** | |
| """ | |
| ) | |
| # Background job to restart space | |
| scheduler = BackgroundScheduler() | |
| scheduler.add_job(restart_space, "interval", seconds=1800) | |
| scheduler.start() | |
| # Launch the app with concurrent queueing | |
| demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode | |
| show_error=True) |