import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from data_reviewer import create_data_viewer
# Define constants and enums
TITLE = "<h1>VL-RewardBench Leaderboard</h1>"
INTRODUCTION_TEXT = "https://vl-rewardbench.github.io/"
GOOGLE_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/1fPqZLF1FQFyy4n9I6GNk7MeDSGlJDVVes9yEBqN8RwU/export?gid=0&format=csv"
)
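# Note: the `/export?gid=0&format=csv` suffix is the standard Google Sheets
# export endpoint; it returns the sheet with that gid as plain CSV, which is
# what pd.read_csv below expects. The sheet must be publicly readable for
# the unauthenticated request to succeed.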
ABOUT_TEXT = """Welcome to VL-RewardBench!

We introduce VL-RewardBench, a novel benchmark designed to expose the limitations of vision-language reward models across visual perception, hallucination detection, and reasoning tasks.
Our evaluation shows, among other findings, that models fail primarily at basic visual perception rather than reasoning, and that performance on our benchmark correlates strongly (r > 0.9) with downstream vision-language tasks.

The splits are:
- General (VLFeedback + WildVision)
- Hallucination (POVID, RLAIF, RLHF-V)
- Reasoning (MMMU-Pro, MathVerse)"""
class AutoEvalColumn:
    model = {"name": "Model", "type": "str", "displayed_by_default": True, "never_hidden": True}
    license = {"name": "License", "type": "str", "displayed_by_default": False, "never_hidden": False}
    general = {"name": "General", "type": "float", "displayed_by_default": True, "never_hidden": False}
    hallucination = {"name": "Hallucination", "type": "float", "displayed_by_default": True, "never_hidden": False}
    reasoning = {"name": "Reasoning", "type": "float", "displayed_by_default": True, "never_hidden": False}
    overall = {"name": "Overall Consistency", "type": "float", "displayed_by_default": True, "never_hidden": False}
    macro = {"name": "Macro Average", "type": "float", "displayed_by_default": True, "never_hidden": False}
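# Convenience sketch (not in the original app): gather the column metadata
# dicts from AutoEvalColumn in one place. This could replace the repeated
# __dict__ filtering in init_leaderboard below. The name `column_fields` is
# our own; it is not part of gradio_leaderboard's API.
def column_fields():
    return [col for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)]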
# Create sample data
def get_sample_data():
    return pd.DataFrame(
        {
            "Model": ["model1", "model2", "model3"],
            "License": ["MIT", "Apache", "MIT"],
            "Model Type": ["base", "instruct", "chat"],
            "Precision": ["float16", "float32", "float16"],
            "Parameters (B)": [7, 13, 70],
            "Available": [True, True, False],
        }
    )
# Fetch the live results table as CSV from the published Google Sheet.
def get_result_data():
    return pd.read_csv(GOOGLE_SHEET_URL)
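# Hardening sketch (our suggestion, not part of the original Space): the
# fetch above has no error handling, so a network failure or a revoked sheet
# takes the whole app down at startup. One option is to fall back to the
# bundled sample frame so the UI still renders with placeholder rows.
def get_result_data_safe():
    try:
        return pd.read_csv(GOOGLE_SHEET_URL)
    except Exception:
        # Fallback keeps the Space alive; the sample frame lacks the score
        # columns, so the leaderboard would show only the columns present.
        return get_sample_data()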
def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[col["type"] for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)],
        select_columns=SelectColumns(
            default_selection=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col["displayed_by_default"]
            ],
            cant_deselect=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col.get("never_hidden", False)
            ],
            label="Select Columns to Display:",
        ),
        search_columns=["Model", "License"],
        filter_columns=[
            ColumnFilter("License", type="checkboxgroup", label="License"),
            ColumnFilter("Model Size", type="checkboxgroup", label="Model Size"),
        ],
        interactive=False,
    )
# Initialize the Gradio interface
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)

    with gr.Tabs() as tabs:
        with gr.TabItem("🏅 Leaderboard"):
            # Load your DataFrame here instead of the sample data
            df = get_result_data()
            leaderboard = init_leaderboard(df)
        with gr.TabItem("📊 Data Viewer"):
            dataset_name, dataset_split, sample_idx = create_data_viewer()
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

demo.launch()
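# Note: on Hugging Face Spaces, demo.launch() with no arguments is all that
# is needed; the platform handles hosting. When running locally, passing
# share=True (a standard Gradio launch option) creates a temporary public URL.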