"""Gradio app for browsing the SWE-bench bash-only leaderboard.

The app shows the leaderboard in a table, lets the user pick a row, and
downloads that submission's trajectories from a public S3 bucket using the
AWS CLI.
"""

import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import pandas as pd

from src.download_swebench_leaderboard import download_leaderboard, get_leaderboard

DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
S3_BUCKET = "s3://swe-bench-experiments/bash-only"


def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it if it is not cached.

    When ``LEADERBOARD_CACHE`` does not exist, the leaderboard is fetched via
    ``download_leaderboard`` and the resulting file is renamed to the cache
    path, so later calls read from disk.
    """
    if not LEADERBOARD_CACHE.exists():
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(filename, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)


def _get_bash_only_leaderboard():
    """Return the leaderboard entry named "bash-only", or ``None`` if absent."""
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # ``.get`` rather than ``[...]`` so an entry without a name is skipped
    # instead of raising KeyError.
    return next((lb for lb in leaderboards if lb.get("name") == "bash-only"), None)


def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files under *output_dir*.

    Looks for ``<subdir>/<name>.traj.json`` first and falls back to flat
    ``*.json`` files when none are found.
    """
    nested = sum(1 for _ in output_dir.glob("*/*.traj.json"))
    if nested:
        return nested
    return sum(1 for _ in output_dir.glob("*.json"))


def get_bash_only_df() -> pd.DataFrame:
    """Build the table shown in the UI from the bash-only leaderboard.

    Returns:
        A DataFrame with one row per result (name, date, cost, instance_cost,
        instance_calls, folder, os_model, os_system), or an empty DataFrame
        when the bash-only leaderboard is not present.
    """
    bash_only = _get_bash_only_leaderboard()
    if not bash_only:
        return pd.DataFrame()

    rows = [
        {
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            # ``or 0`` also covers keys that are present but null, which
            # would otherwise make round() raise TypeError.
            "cost": round(r.get("cost") or 0, 2),
            "instance_cost": round(r.get("instance_cost") or 0, 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
            "os_system": "✅" if r.get("os_system") else "❌",
        }
        for r in bash_only.get("results", [])
    ]
    return pd.DataFrame(rows)


def get_model_details(folder: str):
    """Look up a leaderboard result by its folder id.

    Args:
        folder: The ``folder`` value of the result to find.

    Returns:
        A ``(model, error)`` pair. On success ``model`` is the result dict
        and ``error`` is ``None``; on failure ``model`` is ``None`` and
        ``error`` is a human-readable message.
    """
    if not folder:
        return None, "Select a model from the table"

    bash_only = _get_bash_only_leaderboard()
    if not bash_only:
        return None, "Leaderboard not found"

    model = next(
        (r for r in bash_only.get("results", []) if r.get("folder") == folder),
        None,
    )
    if not model:
        return None, f"Model with folder '{folder}' not found"
    return model, None


def _format_download_summary(model: dict, output_dir: Path) -> str:
    """Build the success message shown after a completed download."""
    file_count = _count_trajectory_files(output_dir)
    per_instance = model.get("per_instance_details") or {}
    resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
    total_count = len(per_instance)

    summary = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files"
    if total_count:
        summary += (
            f"\nResolved: {resolved_count}/{total_count} "
            f"({100*resolved_count/total_count:.1f}%)"
        )
    else:
        # No per-instance data: skip the percentage rather than divide by
        # zero and misreport a successful download as an error.
        summary += "\nResolved: n/a (no per-instance details)"
    return summary


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download one submission's trajectories from S3 into ``TRAJS_DIR``.

    Runs ``aws s3 cp --recursive ... --no-sign-request`` with a 10 minute
    timeout. This function is a UI callback, so every outcome, including
    failures, is reported through the returned status string instead of an
    exception.

    Args:
        folder: Folder id of the selected leaderboard result.
        progress: Gradio progress tracker (the default is how Gradio injects
            it into event handlers).

    Returns:
        A status message starting with "✅" on success or "❌" on failure.
    """
    if not folder:
        return "❌ No model selected"

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}"

    output_dir = TRAJS_DIR / folder
    # NOTE(review): a failed or timed-out download can leave a partially
    # filled directory, which this check then reports as already downloaded.
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files"

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            [
                "aws",
                "s3",
                "cp",
                "--recursive",
                s3_path,
                str(output_dir),
                "--no-sign-request",
            ],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}"
        return _format_download_summary(model, output_dir)
    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)"
    except FileNotFoundError:
        return "❌ AWS CLI not found. Install with: pip install awscli"
    except Exception as e:  # UI boundary: report instead of crashing the app
        return f"❌ Error: {e}"


def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a table selection and populate the "Selected Model" panel.

    Args:
        evt: Gradio selection event; ``evt.index`` may be a single index or a
            list/tuple whose first element is the row index.
        df: Current contents of the leaderboard table.

    Returns:
        ``(folder, name, button_update)`` for the folder textbox, the name
        textbox and the download button (which is enabled on selection).
    """
    if evt.index is None:
        return "", "", gr.update()

    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    row = df.iloc[row_idx]
    return row["folder"], row["name"], gr.update(interactive=True)


def build_app():
    """Construct the Gradio Blocks UI and wire up its event handlers."""
    df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")

        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )
            with gr.Column(scale=1):
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)
                selected_folder = gr.Textbox(label="Folder ID", interactive=False)
                # Disabled until a row is selected (see on_row_select).
                download_btn = gr.Button("📥 Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn],
        )
        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status],
        )

    return app


if __name__ == "__main__":
    app = build_app()
    app.launch()