import json import os import subprocess from pathlib import Path import gradio as gr import pandas as pd import plotly.express as px import plotly.graph_objects as go import requests from src.download_swebench_leaderboard import download_leaderboard DATA_DIR = Path("data") TRAJS_DIR = DATA_DIR / "swebench_trajs" LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json" LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json" S3_BUCKET = "s3://swe-bench-experiments/bash-only" LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json" _litellm_prices_cache = None _trajectories_cache = {} def get_litellm_prices() -> dict: global _litellm_prices_cache if _litellm_prices_cache is not None: return _litellm_prices_cache if LITELLM_PRICES_CACHE.exists(): with open(LITELLM_PRICES_CACHE) as f: _litellm_prices_cache = json.load(f) return _litellm_prices_cache try: response = requests.get(LITELLM_PRICES_URL, timeout=30) response.raise_for_status() _litellm_prices_cache = response.json() DATA_DIR.mkdir(exist_ok=True) with open(LITELLM_PRICES_CACHE, "w") as f: json.dump(_litellm_prices_cache, f) except Exception: _litellm_prices_cache = {} return _litellm_prices_cache def get_model_prices(model_name: str) -> dict | None: if not model_name: return None prices = get_litellm_prices() clean_name = model_name.replace("anthropic/", "").replace("openai/", "") candidates = [ model_name, clean_name, f"anthropic/{clean_name}", f"openai/{clean_name}", ] for key in candidates: if key in prices: return prices[key] for key, value in prices.items(): if clean_name in key or model_name in key: return value return None def load_or_download_leaderboard(): if LEADERBOARD_CACHE.exists(): with open(LEADERBOARD_CACHE) as f: return json.load(f) filename = download_leaderboard(output_dir=str(DATA_DIR)) os.rename(filename, LEADERBOARD_CACHE) with open(LEADERBOARD_CACHE) as f: return json.load(f) def get_bash_only_df(): data = load_or_download_leaderboard() leaderboards = data.get("leaderboards", []) bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None) if not bash_only: return pd.DataFrame() rows = [] for r in bash_only["results"]: rows.append({ "name": r.get("name", ""), "date": r.get("date", ""), "cost": round(r.get("cost", 0), 2), "instance_cost": round(r.get("instance_cost", 0), 4), "instance_calls": r.get("instance_calls", 0), "folder": r.get("folder", ""), "os_model": "✅" if r.get("os_model") else "❌", "os_system": "✅" if r.get("os_system") else "❌", }) return pd.DataFrame(rows) def get_model_details(folder: str): if not folder: return None, "Select a model from the table" data = load_or_download_leaderboard() leaderboards = data.get("leaderboards", []) bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None) if not bash_only: return None, "Leaderboard not found" model = next((r for r in bash_only["results"] if r.get("folder") == folder), None) if not model: return None, f"Model with folder '{folder}' not found" return model, None def check_trajectories_downloaded(folder: str) -> bool: if not folder: return False output_dir = TRAJS_DIR / folder return output_dir.exists() and any(output_dir.iterdir()) def download_trajectories_from_s3(folder: str, progress=gr.Progress()): if not folder: return "❌ No model selected", gr.update(visible=False) model, error = get_model_details(folder) if error: return f"❌ {error}", gr.update(visible=False) output_dir = TRAJS_DIR / folder if output_dir.exists() and any(output_dir.iterdir()): file_count = len(list(output_dir.glob("*/*.traj.json"))) if file_count == 0: file_count = len(list(output_dir.glob("*.json"))) return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True) s3_path = f"{S3_BUCKET}/{folder}/trajs/" output_dir.mkdir(parents=True, exist_ok=True) progress(0, desc="Starting S3 download...") try: result = subprocess.run( ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"], capture_output=True, text=True, timeout=600, ) if result.returncode != 0: return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False) file_count = len(list(output_dir.glob("*/*.traj.json"))) if file_count == 0: file_count = len(list(output_dir.glob("*.json"))) per_instance = model.get("per_instance_details", {}) resolved_count = sum(1 for v in per_instance.values() if v.get("resolved")) total_count = len(per_instance) status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)" return status, gr.update(visible=True) except subprocess.TimeoutExpired: return "❌ Download timed out (>10 min)", gr.update(visible=False) except FileNotFoundError: return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False) except Exception as e: return f"❌ Error: {e}", gr.update(visible=False) def parse_trajectory(traj_path: Path) -> dict: with open(traj_path, "r", encoding="utf-8") as f: data = json.load(f) info = data.get("info", {}) model_stats = info.get("model_stats", {}) config = info.get("config", {}) model_config = config.get("model", {}) model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", "")) result = { "instance_id": data.get("instance_id", traj_path.stem), "model_name": model_name, "api_calls": model_stats.get("api_calls", 0), "instance_cost": model_stats.get("instance_cost", 0), "prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, } messages = data.get("messages", []) for msg in messages: usage = None if "usage" in msg: usage = msg["usage"] elif "extra" in msg and isinstance(msg["extra"], dict): response = msg["extra"].get("response", {}) if isinstance(response, dict): usage = response.get("usage", {}) if usage: result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0 result["completion_tokens"] += usage.get("completion_tokens", 0) or 0 result["total_tokens"] += usage.get("total_tokens", 0) or 0 result["cache_read_tokens"] += usage.get("cache_read_input_tokens", 0) or 0 result["cache_creation_tokens"] += usage.get("cache_creation_input_tokens", 0) or 0 return result def load_all_trajectories(folder: str) -> pd.DataFrame: global _trajectories_cache if folder in _trajectories_cache: return _trajectories_cache[folder] output_dir = TRAJS_DIR / folder traj_files = list(output_dir.glob("*/*.traj.json")) if not traj_files: traj_files = list(output_dir.glob("*.traj.json")) if not traj_files: traj_files = list(output_dir.glob("*.json")) rows = [] for traj_path in traj_files: try: rows.append(parse_trajectory(traj_path)) except Exception as e: print(f"Error parsing {traj_path}: {e}") df = pd.DataFrame(rows) _trajectories_cache[folder] = df return df def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float): if df.empty: return None, None, None, None, None fig_steps = px.histogram( df, x="api_calls", nbins=30, title="Distribution of API Calls (Steps) per Instance", color_discrete_sequence=["#636EFA"], ) fig_steps.update_layout( xaxis_title="API Calls (Steps)", yaxis_title="Number of Instances", showlegend=False, margin=dict(l=40, r=20, t=40, b=40), ) fig_steps.add_annotation( text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12), ) fig_cost = px.histogram( df, x="instance_cost", nbins=30, title="Distribution of Cost per Instance ($)", color_discrete_sequence=["#00CC96"], ) fig_cost.update_layout( xaxis_title="Cost ($)", yaxis_title="Number of Instances", showlegend=False, margin=dict(l=40, r=20, t=40, b=40), ) fig_cost.add_annotation( text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12), ) total_completion = df["completion_tokens"].sum() total_cache_read = df["cache_read_tokens"].sum() total_cache_creation = df["cache_creation_tokens"].sum() # Uncached input = prompt - cache_read - cache_creation (per instance, then sum) df_temp = df.copy() df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0) total_uncached_input = df_temp["uncached_input"].sum() token_data = pd.DataFrame({ "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"], "Total Tokens": [total_uncached_input, total_cache_read, total_cache_creation, total_completion], }) fig_tokens = px.bar( token_data, x="Token Type", y="Total Tokens", title="Total Tokens by Type", color="Token Type", color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"], ) fig_tokens.update_layout( xaxis_title="Token Type", yaxis_title="Total Tokens", showlegend=False, margin=dict(l=40, r=20, t=40, b=40), ) total_all = token_data["Total Tokens"].sum() fig_tokens.add_annotation( text=f"Total: {total_all:,.0f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12), ) # Cost by token type cost_uncached_input = total_uncached_input * input_price / 1e6 cost_cache_read = total_cache_read * cache_read_price / 1e6 cost_cache_creation = total_cache_creation * cache_creation_price / 1e6 cost_completion = total_completion * completion_price / 1e6 cost_data = pd.DataFrame({ "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"], "Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion], }) fig_tokens_cost = px.bar( cost_data, x="Token Type", y="Cost ($)", title="Total Cost by Token Type ($)", color="Token Type", color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"], ) fig_tokens_cost.update_layout( xaxis_title="Token Type", yaxis_title="Cost ($)", showlegend=False, margin=dict(l=40, r=20, t=40, b=40), ) total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion fig_tokens_cost.add_annotation( text=f"Total: ${total_cost:.2f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12), ) df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True) df_sorted["instance_idx"] = range(len(df_sorted)) # Uncached input = prompt - cache_read - cache_creation df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0) fig_stacked = go.Figure() fig_stacked.add_trace(go.Bar( name="Uncached Input", x=df_sorted["instance_idx"], y=df_sorted["uncached_input_tokens"], marker_color="#EF553B", hovertemplate="Instance: %{x}
Uncached Input: %{y:,.0f}", )) fig_stacked.add_trace(go.Bar( name="Cache Read", x=df_sorted["instance_idx"], y=df_sorted["cache_read_tokens"], marker_color="#19D3F3", hovertemplate="Instance: %{x}
Cache Read: %{y:,.0f}", )) fig_stacked.add_trace(go.Bar( name="Cache Creation", x=df_sorted["instance_idx"], y=df_sorted["cache_creation_tokens"], marker_color="#FFA15A", hovertemplate="Instance: %{x}
Cache Creation: %{y:,.0f}", )) fig_stacked.add_trace(go.Bar( name="Completion", x=df_sorted["instance_idx"], y=df_sorted["completion_tokens"], marker_color="#AB63FA", hovertemplate="Instance: %{x}
Completion: %{y:,.0f}", )) fig_stacked.update_layout( barmode="stack", title="Billable Tokens per Instance (stacked)", xaxis_title="Instance (sorted by cache read)", yaxis_title="Tokens", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=50, r=20, t=60, b=40), ) return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float): if df.empty: return None df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True) df_sorted["instance_idx"] = range(len(df_sorted)) # Uncached input = prompt - cache_read - cache_creation df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0) df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6 df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6 df_sorted["cost_cache_creation"] = df_sorted["cache_creation_tokens"] * cache_creation_price / 1e6 df_sorted["cost_completion"] = df_sorted["completion_tokens"] * completion_price / 1e6 fig = go.Figure() fig.add_trace(go.Bar( name=f"Uncached Input (${input_price:.2f}/1M)", x=df_sorted["instance_idx"], y=df_sorted["cost_uncached_input"], marker_color="#EF553B", hovertemplate="Instance: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Cache Read (${cache_read_price:.2f}/1M)", x=df_sorted["instance_idx"], y=df_sorted["cost_cache_read"], marker_color="#19D3F3", hovertemplate="Instance: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Cache Creation (${cache_creation_price:.2f}/1M)", x=df_sorted["instance_idx"], y=df_sorted["cost_cache_creation"], marker_color="#FFA15A", hovertemplate="Instance: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Completion (${completion_price:.2f}/1M)", x=df_sorted["instance_idx"], y=df_sorted["cost_completion"], marker_color="#AB63FA", hovertemplate="Instance: %{x}
Cost: $%{y:.4f}", )) total_cost = ( df_sorted["cost_uncached_input"].sum() + df_sorted["cost_cache_read"].sum() + df_sorted["cost_cache_creation"].sum() + df_sorted["cost_completion"].sum() ) fig.update_layout( barmode="stack", title="Cost Breakdown per Instance", xaxis_title="Instance (sorted by cache read)", yaxis_title="Cost ($)", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=50, r=20, t=60, b=40), ) fig.add_annotation( text=f"Total: ${total_cost:.2f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=14), bgcolor="white", ) return fig def extract_model_from_folder(folder: str) -> str: """Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'""" if not folder: return "" parts = folder.split("_") if len(parts) >= 3: return "_".join(parts[2:]) return folder def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]: """Get prices from litellm based on folder name. Returns (input, cache_read, cache_creation, completion, model_name)""" model_hint = extract_model_from_folder(folder) if not model_hint: return 0, 0, 0, 0, "" prices = get_model_prices(model_hint) if prices: input_price = prices.get("input_cost_per_token", 0) * 1e6 cache_read = prices.get("cache_read_input_token_cost", 0) * 1e6 cache_creation = prices.get("cache_creation_input_token_cost", 0) * 1e6 completion = prices.get("output_cost_per_token", 0) * 1e6 return input_price, cache_read, cache_creation, completion, model_hint return 0, 0, 0, 0, model_hint def on_row_select(evt: gr.SelectData, df: pd.DataFrame): if evt.index is None: return ( "", "", gr.update(interactive=False), gr.update(visible=False), gr.update(value=0, label="💲 Input"), gr.update(value=0, label="💲 Cache Read"), gr.update(value=0, label="💲 Cache Creation"), gr.update(value=0, label="💲 Completion"), "" ) row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index row = df.iloc[row_idx] folder = row["folder"] name = row["name"] show_analyze = check_trajectories_downloaded(folder) input_price, cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder) def price_update(value, name): if value > 0: return gr.update(value=value, label=f"✅ {name}") else: return gr.update(value=value, label=f"❌ {name}") return ( folder, name, gr.update(interactive=True), gr.update(visible=show_analyze), price_update(input_price, "Input"), price_update(cache_read, "Cache Read"), price_update(cache_creation, "Cache Creation"), price_update(completion, "Completion"), model_hint ) def build_app(): leaderboard_df = get_bash_only_df() with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app: trajectories_state = gr.State(None) gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard") gr.Markdown("Select a model to use as base for cost analysis") with gr.Row(): with gr.Column(scale=3): leaderboard_table = gr.Dataframe( value=leaderboard_df, label="Bash-Only Leaderboard", interactive=False, wrap=True, ) with gr.Column(visible=False) as analysis_section: gr.Markdown("## 📊 Trajectory Analysis") with gr.Row(): plot_steps = gr.Plot(label="API Calls Distribution") plot_cost = gr.Plot(label="Cost Distribution") with gr.Row(): plot_tokens = gr.Plot(label="Token Usage by Type") plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)") with gr.Row(): plot_stacked = gr.Plot(label="Billable Tokens per Instance") with gr.Row(): plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)") with gr.Column(scale=1): selected_folder = gr.State("") gr.Markdown("### Selected Model") selected_name = gr.Textbox(label="Model Name", interactive=False) download_btn = gr.Button("📥 Download Trajectories", interactive=False) download_status = gr.Textbox(label="Status", interactive=False, lines=3) analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary") gr.Markdown("---") gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*") detected_model = gr.Textbox(label="Detected Model", interactive=False) price_input = gr.Number(label="💲 Input", value=0, precision=2) price_cache_read = gr.Number(label="💲 Cache Read", value=0, precision=2) price_cache_creation = gr.Number(label="💲 Cache Creation", value=0, precision=2) price_completion = gr.Number(label="💲 Completion", value=0, precision=2) leaderboard_table.select( fn=on_row_select, inputs=[leaderboard_table], outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model], ) download_btn.click( fn=download_trajectories_from_s3, inputs=[selected_folder], outputs=[download_status, analyze_btn], ) def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price): empty_result = ( gr.update(visible=False), None, None, None, None, None, None, ) if not folder: yield empty_result return yield ( gr.update(visible=True), None, None, None, None, None, None, ) df = load_all_trajectories(folder) if df.empty: yield empty_result return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms( df, input_price, cache_read_price, cache_creation_price, completion_price ) fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price) yield ( gr.update(visible=True), fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown, ) analyze_btn.click( fn=load_and_analyze, inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion], outputs=[ analysis_section, plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown, ], ) return app if __name__ == "__main__": app = build_app() app.queue() app.launch()