import gradio as gr
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import os
import tempfile
from src.leaderboard.leaderboard_html import create_leaderboard_html

BASE_URL = "https://huggingface.co/datasets/zonszer/demo_source_data/resolve/main"


def load_image_from_url(url):
    """Fetch an image over HTTP and return it as a PIL Image, or None on failure."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except (requests.RequestException, OSError):
        return None


def load_file_from_url(url):
    """Download a file to a temporary path (keeping its extension) and return the path, or None on failure."""
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        file_ext = os.path.splitext(url)[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_ext) as tmp_file:
            tmp_file.write(response.content)
            return tmp_file.name
    except (requests.RequestException, OSError):
        return None
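
# Illustrative usage (comments only, never executed): both helpers fetch demo assets
# relative to BASE_URL and return None on any failure, so callers should handle None.
# The image path below appears in the commented-out demo layout further down; the
# video filename here is a hypothetical placeholder.
#   bird_eye = load_image_from_url(f"{BASE_URL}/scenes_glb/birdEye_5ZKStnWn8Zo.png")
#   clip_path = load_file_from_url(f"{BASE_URL}/example_clip.mp4")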


# Static data - reordered columns: Method, #Param., Input Type, Control Type, Model Type, Mean Traj. ↓, Acc. ↑
STATIC_DATA = [
    ["w/o WM", "72B", "RGB", "–", "VLM", 6.24, 50.27],
    ["PathDreamer [36]", "0.69B", "RGB-D; Sem; Pano", "Viewpoint", "Image Gen.", 5.28, 56.99],
    ["SE3DS [11]", "1.1B", "RGB-D; Pano", "Viewpoint", "Image Gen.", 5.29, 57.53],
    ["NWM [25]", "1B", "RGB", "Trajectory", "Video Gen.", 5.68, 57.35],
    ["SVD [6]", "1.5B", "RGB", "Image", "Video Gen.", 5.29, 57.71],
    ["LTX-Video [5]", "2B", "RGB", "Text", "Video Gen.", 5.37, 56.08],
    ["Hunyuan [4]", "13B", "RGB", "Text", "Video Gen.", 5.21, 57.71],
    ["Wan2.1 [23]", "14B", "RGB", "Text", "Video Gen.", 5.24, 58.26],
    ["Cosmos [1]", "2B", "RGB", "Text", "Video Gen.", 5.898, 52.27],
    ["Runway", "–", "–", "Text", "Video Gen.", "–", "–"],
    ["SVD† [6]", "1.5B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.02, 60.98],
    ["LTX† [5]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.49, 57.53],
    ["WAN2.1† [23]", "14B", "RGB; Pano", "Action", "Video Gen. Post-Train", "XXX", "XXX"],
    ["Cosmos† [1]", "2B", "RGB; Pano", "Action", "Video Gen. Post-Train", 5.08, 60.25],
]
COLUMNS = ["Method", "#Param.", "Input Type", "Control Type", "Model Type", "Mean Traj. ↓", "Acc. ↑"]


def create_leaderboard():
    df = pd.DataFrame(STATIC_DATA, columns=COLUMNS)
    # Sort by accuracy in descending order (highest first), handling non-numeric values
    df_clean = df.copy()
    # Replace non-numeric values with -1 for sorting (so they appear at the bottom)
    df_clean['Acc. ↑'] = pd.to_numeric(df_clean['Acc. ↑'], errors='coerce').fillna(-1)
    df_sorted = df_clean.sort_values('Acc. ↑', ascending=False)
    # Return the original df (original values) in the sorted row order
    return df.iloc[df_sorted.index].reset_index(drop=True)
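

# Illustrative sketch (an assumption, not used by the app below): the sorted DataFrame
# from create_leaderboard() could also be rendered as an interactive table inside a
# Blocks context, instead of the static HTML that create_leaderboard_html() produces.
def render_leaderboard_table():
    # gr.Dataframe displays a pandas DataFrame; interactive=False keeps it read-only.
    return gr.Dataframe(value=create_leaderboard(), interactive=False, wrap=True)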


with gr.Blocks(title="World-in-World: Building a Closed-Loop World Interface to Evaluate World Models", theme=gr.themes.Soft()) as demo:
    gr.HTML(
        "<h1>🏆 World-in-World: Building a Closed-Loop World Interface to Evaluate World Models</h1>"
    )
    with gr.Tabs():
        with gr.TabItem("🧑‍🏫 Interactive Demo"):
            with open("src/display/demo_new.html", "r", encoding="utf-8") as f:
                html_content = f.read()
            gr.HTML(html_content)
            # with gr.Row():
            #     # Left Zone: Agent's View
            #     with gr.Column(scale=2, min_width=350):
            #         gr.HTML("<h3>Agent's View</h3>")
            #         # Mimicking the blue instruction box from the image
            #         gr.HTML("""
            #             <div>
            #                 🧠 <b>Instruction:</b>
            #                 <p>Navigate to the Toaster in the room and be as close as possible to it.</p>
            #             </div>
            #         """)
            #         # Mimicking the grey planning box from the image
            #         gr.HTML("""
            #             <div>
            #                 🦾 <b>Environment Step 4-7:</b>
            #                 <p><b>Planning:</b></p>
            #                 <ul>
            #                     <li>Move leftward by 0.25.</li>
            #                     <li>Move leftward by 0.25.</li>
            #                     <li>Move forward by 0.25.</li>
            #                     <li>Move forward by 0.25.</li>
            #                 </ul>
            #             </div>
            #         """)
            #     # Middle Zone: Closed-Loop Environmental Feedback
            #     with gr.Column(scale=4, min_width=500):
            #         gr.HTML("<h3>Closed-Loop Environmental Feedback</h3>")
            #         with gr.Row():
            #             gr.Video(value=load_file_from_url(f"{BASE_URL}/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A001/world_model_gen/bbox_gen_video_1.mp4"), label="First Person View", interactive=False)
            #             gr.Image(value=load_image_from_url(f"{BASE_URL}/scenes_glb/birdEye_5ZKStnWn8Zo.png"), label="Bird's Eye View", type="pil", interactive=False)
            #             # gr.Model3D(value=load_file_from_url(f"{BASE_URL}/scenes_glb/5ZKStnWn8Zo.glb"), label="3D Scene", interactive=False)
            #     # Right Zone: World Model's Generation
            #     with gr.Column(scale=3, min_width=400):
            #         gr.HTML("<h3>World Model's Generation</h3>")
            #         # Using the new video path provided by the user
            #         gr.Video(value=load_file_from_url(f"{BASE_URL}/AR/FTwan21_lora/5ZKStnWn8Zo/E014/A005/world_model_gen/obj_centered_gen_video_1.mp4"), label="Generated View", interactive=False)
        with gr.TabItem("📊 Leaderboard"):
            gr.HTML(create_leaderboard_html())
        with gr.TabItem("📝 About"):
            gr.Markdown("""
# World-in-World: Building a Closed-Loop World Interface to Evaluate World Models

This leaderboard showcases performance metrics across different types of AI models on world-modeling tasks:

## Model Categories
- **VLM**: Vision-Language Models
- **Image Gen.**: Image Generation Models
- **Video Gen.**: Video Generation Models
- **Video Gen. Post-Train**: Video Generation Models specialized via post-training

## Metrics Explained
- **Acc. ↑**: Accuracy score (higher values indicate better performance)
- **Mean Traj. ↓**: Mean trajectory error (lower values indicate better performance)

## Notes
- † indicates post-training specialized models
- XXX indicates results pending/unavailable
- – indicates not applicable or not available

*Results represent performance on world-modeling evaluation benchmarks and may vary across different evaluation settings.*
""")


if __name__ == "__main__":
    demo.launch()