import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter

from data_reviewer import create_data_viewer
# Define constants and enums
TITLE = "<h1>VL-RewardBench Leaderboard</h1>"
INTRODUCTION_TEXT = "https://vl-rewardbench.github.io/"
GOOGLE_SHEET_URL = (
    "https://docs.google.com/spreadsheets/d/1fPqZLF1FQFyy4n9I6GNk7MeDSGlJDVVes9yEBqN8RwU/export?gid=0&format=csv"
)
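# Note: the `/export?gid=0&format=csv` suffix is the standard Google Sheets
# export endpoint; it returns the sheet with that gid as plain CSV, which is
# what pd.read_csv below expects. The sheet must be publicly readable for
# the unauthenticated request to succeed.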
ABOUT_TEXT = """Welcome to VL-RewardBench!

We introduce VL-RewardBench, a novel benchmark designed to expose the limitations of vision-language reward models across visual perception, hallucination detection, and reasoning tasks.
Our evaluation shows, among other findings, that models fail primarily at basic visual perception rather than reasoning, and that performance on our benchmark correlates strongly (r > 0.9) with downstream vision-language tasks.

The splits are:
- General (VLFeedback + WildVision)
- Hallucination (POVID, RLAIF, RLHF-V)
- Reasoning (MMMU-Pro, MathVerse)"""
class AutoEvalColumn:
    model = {"name": "Model", "type": "str", "displayed_by_default": True, "never_hidden": True}
    license = {"name": "License", "type": "str", "displayed_by_default": False, "never_hidden": False}
    general = {"name": "General", "type": "float", "displayed_by_default": True, "never_hidden": False}
    hallucination = {"name": "Hallucination", "type": "float", "displayed_by_default": True, "never_hidden": False}
    reasoning = {"name": "Reasoning", "type": "float", "displayed_by_default": True, "never_hidden": False}
    overall = {"name": "Overall Consistency", "type": "float", "displayed_by_default": True, "never_hidden": False}
    macro = {"name": "Macro Average", "type": "float", "displayed_by_default": True, "never_hidden": False}
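# Convenience sketch (not in the original app): gather the column metadata
# dicts from AutoEvalColumn in one place. This could replace the repeated
# __dict__ filtering in init_leaderboard below. The name `column_fields` is
# our own; it is not part of gradio_leaderboard's API.
def column_fields():
    return [col for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)]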
# Create sample data
def get_sample_data():
    return pd.DataFrame(
        {
            "Model": ["model1", "model2", "model3"],
            "License": ["MIT", "Apache", "MIT"],
            "Model Type": ["base", "instruct", "chat"],
            "Precision": ["float16", "float32", "float16"],
            "Parameters (B)": [7, 13, 70],
            "Available": [True, True, False],
        }
    )
# Fetch the live results table as CSV from the published Google Sheet.
def get_result_data():
    return pd.read_csv(GOOGLE_SHEET_URL)
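# Hardening sketch (our suggestion, not part of the original Space): the
# fetch above has no error handling, so a network failure or a revoked sheet
# takes the whole app down at startup. One option is to fall back to the
# bundled sample frame so the UI still renders with placeholder rows.
def get_result_data_safe():
    try:
        return pd.read_csv(GOOGLE_SHEET_URL)
    except Exception:
        # Fallback keeps the Space alive; the sample frame lacks the score
        # columns, so the leaderboard would show only the columns present.
        return get_sample_data()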
def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[col["type"] for col in AutoEvalColumn.__dict__.values() if isinstance(col, dict)],
        select_columns=SelectColumns(
            default_selection=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col["displayed_by_default"]
            ],
            cant_deselect=[
                col["name"]
                for col in AutoEvalColumn.__dict__.values()
                if isinstance(col, dict) and col.get("never_hidden", False)
            ],
            label="Select Columns to Display:",
        ),
        search_columns=["Model", "License"],
        filter_columns=[
            ColumnFilter("License", type="checkboxgroup", label="License"),
            ColumnFilter("Model Size", type="checkboxgroup", label="Model Size"),
        ],
        interactive=False,
    )
# Initialize the Gradio interface
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT)

    with gr.Tabs() as tabs:
        with gr.TabItem("🏅 Leaderboard"):
            # Load your DataFrame here instead of the sample data
            df = get_result_data()
            leaderboard = init_leaderboard(df)
        with gr.TabItem("📊 Data Viewer"):
            dataset_name, dataset_split, sample_idx = create_data_viewer()
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(ABOUT_TEXT)

demo.launch()
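# Note: on Hugging Face Spaces, demo.launch() with no arguments is all that
# is needed; the platform handles hosting. When running locally, passing
# share=True (a standard Gradio launch option) creates a temporary public URL.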