"""Gradio app for exploring the SWE-bench bash-only leaderboard: download a
model's agent trajectories from S3 and break its spend down by token type."""

import json
import os
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from src.download_swebench_leaderboard import download_leaderboard
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
_litellm_prices_cache = None
_trajectories_cache = {}

def get_litellm_prices() -> dict:
    """Return the litellm price table, trying the in-memory cache, then the
    on-disk cache, then the network; fall back to an empty dict on failure."""
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache
    if LITELLM_PRICES_CACHE.exists():
        with open(LITELLM_PRICES_CACHE) as f:
            _litellm_prices_cache = json.load(f)
        return _litellm_prices_cache
    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as f:
            json.dump(_litellm_prices_cache, f)
    except Exception:
        _litellm_prices_cache = {}
    return _litellm_prices_cache
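
# For reference, a litellm price entry looks roughly like the sketch below
# (only the keys this module reads are shown; the numbers are illustrative,
# not real prices):
#
#   {
#       "claude-opus-4-5": {
#           "input_cost_per_token": 5e-06,
#           "cache_read_input_token_cost": 5e-07,
#           "cache_creation_input_token_cost": 6.25e-06,
#           "output_cost_per_token": 2.5e-05,
#       }
#   }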

def get_model_prices(model_name: str) -> dict | None:
    """Look up a model's price entry, trying exact and provider-prefixed keys
    first, then falling back to the first key containing the cleaned name."""
    if not model_name:
        return None
    prices = get_litellm_prices()
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
    candidates = [
        model_name,
        clean_name,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
    ]
    for key in candidates:
        if key in prices:
            return prices[key]
    # Fuzzy fallback: accept any table key that contains the model name.
    for key, value in prices.items():
        if clean_name in key or model_name in key:
            return value
    return None
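
# Illustrative lookup order (the model name here is hypothetical):
#
#   get_model_prices("anthropic/claude-opus-4-5")
#   # -> tries "anthropic/claude-opus-4-5", then "claude-opus-4-5", then the
#   #    "anthropic/"- and "openai/"-prefixed variants, and finally the first
#   #    table key that merely contains either name.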

def load_or_download_leaderboard():
    """Return the cached leaderboard JSON, downloading it first if needed."""
    if LEADERBOARD_CACHE.exists():
        with open(LEADERBOARD_CACHE) as f:
            return json.load(f)
    filename = download_leaderboard(output_dir=str(DATA_DIR))
    os.rename(filename, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)

def get_bash_only_df() -> pd.DataFrame:
    """Flatten the bash-only leaderboard into a DataFrame for display."""
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return pd.DataFrame()
    rows = []
    for r in bash_only["results"]:
        rows.append({
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            "cost": round(r.get("cost", 0), 2),
            "instance_cost": round(r.get("instance_cost", 0), 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
            "os_system": "✅" if r.get("os_system") else "❌",
        })
    return pd.DataFrame(rows)

def get_model_details(folder: str):
    """Return (model_entry, error_message); exactly one of the two is None."""
    if not folder:
        return None, "Select a model from the table"
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return None, "Leaderboard not found"
    model = next((r for r in bash_only["results"] if r.get("folder") == folder), None)
    if not model:
        return None, f"Model with folder '{folder}' not found"
    return model, None

def check_trajectories_downloaded(folder: str) -> bool:
    """True if the model's trajectory directory exists and is non-empty."""
    if not folder:
        return False
    output_dir = TRAJS_DIR / folder
    return output_dir.exists() and any(output_dir.iterdir())
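
# Expected on-disk layout after a download, inferred from the glob patterns
# used in this module (names are illustrative):
#
#   data/swebench_trajs/<folder>/<instance_id>/<instance_id>.traj.json
#
# with flat "*.traj.json" or "*.json" files handled as fallbacks.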

def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Mirror a model's trajectories from the public S3 bucket via the AWS CLI.

    Returns (status_message, analyze_button_update).
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)
    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)
    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = len(list(output_dir.glob("*/*.traj.json")))
        if file_count == 0:
            file_count = len(list(output_dir.glob("*.json")))
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)
    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")
    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)
        file_count = len(list(output_dir.glob("*/*.traj.json")))
        if file_count == 0:
            file_count = len(list(output_dir.glob("*.json")))
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)
        # Guard against division by zero when no per-instance details exist.
        resolved_pct = 100 * resolved_count / total_count if total_count else 0.0
        status = (
            f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\n"
            f"Resolved: {resolved_count}/{total_count} ({resolved_pct:.1f}%)"
        )
        return status, gr.update(visible=True)
    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)

def parse_trajectory(traj_path: Path) -> dict:
    """Extract per-instance stats and aggregate token usage from one trajectory."""
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    config = info.get("config", {})
    model_config = config.get("model", {})
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))
    result = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls", 0),
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }
    messages = data.get("messages", [])
    for msg in messages:
        # Usage may live directly on the message or nested under extra.response.
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if usage:
            result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
            result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
            result["total_tokens"] += usage.get("total_tokens", 0) or 0
            result["cache_read_tokens"] += usage.get("cache_read_input_tokens", 0) or 0
            result["cache_creation_tokens"] += usage.get("cache_creation_input_tokens", 0) or 0
    return result
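
# A minimal sketch of the trajectory layout parse_trajectory() assumes, based
# only on the fields it reads (real files carry many more keys):
#
#   {
#       "instance_id": "...",
#       "info": {
#           "model_stats": {"api_calls": 42, "instance_cost": 0.1234},
#           "config": {"model": {"model_name": "...",
#                                "cost_calc_model_override": "..."}},
#       },
#       "messages": [
#           {"usage": {"prompt_tokens": 0, "completion_tokens": 0,
#                      "total_tokens": 0, "cache_read_input_tokens": 0,
#                      "cache_creation_input_tokens": 0}},
#           {"extra": {"response": {"usage": {...}}}},
#       ],
#   }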

def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file for a model into a DataFrame (memoized)."""
    global _trajectories_cache
    if folder in _trajectories_cache:
        return _trajectories_cache[folder]
    output_dir = TRAJS_DIR / folder
    traj_files = list(output_dir.glob("*/*.traj.json"))
    if not traj_files:
        traj_files = list(output_dir.glob("*.traj.json"))
    if not traj_files:
        traj_files = list(output_dir.glob("*.json"))
    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            print(f"Error parsing {traj_path}: {e}")
    df = pd.DataFrame(rows)
    _trajectories_cache[folder] = df
    return df

def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures: step and cost histograms, token and
    cost totals by type, and a per-instance stacked token chart."""
    if df.empty:
        return None, None, None, None, None
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation (per instance, then sum)
    df_temp = df.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()
    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens": [total_uncached_input, total_cache_read, total_cache_creation, total_completion],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_all = token_data["Total Tokens"].sum()
    fig_tokens.add_annotation(
        text=f"Total: {total_all:,.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Cost by token type: tokens * ($/1M tokens) / 1e6
    cost_uncached_input = total_uncached_input * input_price / 1e6
    cost_cache_read = total_cache_read * cache_read_price / 1e6
    cost_cache_creation = total_cache_creation * cache_creation_price / 1e6
    cost_completion = total_completion * completion_price / 1e6
    cost_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion],
    })
    fig_tokens_cost = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens_cost.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
    fig_tokens_cost.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))
    # Uncached input = prompt - cache_read - cache_creation
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["instance_idx"],
        y=df_sorted["uncached_input_tokens"],
        marker_color="#EF553B",
        hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_read_tokens"],
        marker_color="#19D3F3",
        hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_creation_tokens"],
        marker_color="#FFA15A",
        hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["instance_idx"],
        y=df_sorted["completion_tokens"],
        marker_color="#AB63FA",
        hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked

def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the per-instance stacked cost chart from the given $/1M prices."""
    if df.empty:
        return None
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))
    # Uncached input = prompt - cache_read - cache_creation
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
    df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
    df_sorted["cost_cache_creation"] = df_sorted["cache_creation_tokens"] * cache_creation_price / 1e6
    df_sorted["cost_completion"] = df_sorted["completion_tokens"] * completion_price / 1e6
    fig = go.Figure()
    fig.add_trace(go.Bar(
        name=f"Uncached Input (${input_price:.2f}/1M)",
        x=df_sorted["instance_idx"],
        y=df_sorted["cost_uncached_input"],
        marker_color="#EF553B",
        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}",
    ))
    fig.add_trace(go.Bar(
        name=f"Cache Read (${cache_read_price:.2f}/1M)",
        x=df_sorted["instance_idx"],
        y=df_sorted["cost_cache_read"],
        marker_color="#19D3F3",
        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}",
    ))
    fig.add_trace(go.Bar(
        name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
        x=df_sorted["instance_idx"],
        y=df_sorted["cost_cache_creation"],
        marker_color="#FFA15A",
        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}",
    ))
    fig.add_trace(go.Bar(
        name=f"Completion (${completion_price:.2f}/1M)",
        x=df_sorted["instance_idx"],
        y=df_sorted["cost_completion"],
        marker_color="#AB63FA",
        hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}",
    ))
    total_cost = (
        df_sorted["cost_uncached_input"].sum() +
        df_sorted["cost_cache_read"].sum() +
        df_sorted["cost_cache_creation"].sum() +
        df_sorted["cost_completion"].sum()
    )
    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )
    return fig

def extract_model_from_folder(folder: str) -> str:
    """Extract the model name from a folder like
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101'."""
    if not folder:
        return ""
    parts = folder.split("_")
    if len(parts) >= 3:
        return "_".join(parts[2:])
    return folder

def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]:
    """Look up litellm prices based on the folder name.

    Returns (input, cache_read, cache_creation, completion, model_name),
    all in $/1M tokens, with zeros when no price entry is found.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, 0, ""
    prices = get_model_prices(model_hint)
    if prices:
        # litellm stores $/token; convert to $/1M tokens for display.
        input_price = prices.get("input_cost_per_token", 0) * 1e6
        cache_read = prices.get("cache_read_input_token_cost", 0) * 1e6
        cache_creation = prices.get("cache_creation_input_token_cost", 0) * 1e6
        completion = prices.get("output_cost_per_token", 0) * 1e6
        return input_price, cache_read, cache_creation, completion, model_hint
    return 0, 0, 0, 0, model_hint
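
# Illustrative wiring (folder name taken from the docstring example above;
# the returned prices depend on whatever litellm currently publishes):
#
#   inp, cr, cc, out, model = get_prices_for_folder(
#       "20251124_mini-v1.16.0_claude-opus-4-5-20251101"
#   )
#   # model == "claude-opus-4-5-20251101"; prices are $/1M tokens, 0 if unknown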

def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a leaderboard row click: remember the selection, toggle the
    buttons, and pre-fill the price fields from litellm."""
    if evt.index is None:
        return (
            "", "",
            gr.update(interactive=False),
            gr.update(visible=False),
            gr.update(value=0, label="💲 Input"),
            gr.update(value=0, label="💲 Cache Read"),
            gr.update(value=0, label="💲 Cache Creation"),
            gr.update(value=0, label="💲 Completion"),
            ""
        )
    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]
    show_analyze = check_trajectories_downloaded(folder)
    input_price, cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder)

    def price_update(value, label):
        # Flag in the field label whether a litellm price was found.
        if value > 0:
            return gr.update(value=value, label=f"✅ {label}")
        return gr.update(value=value, label=f"❌ {label}")

    return (
        folder, name,
        gr.update(interactive=True),
        gr.update(visible=show_analyze),
        price_update(input_price, "Input"),
        price_update(cache_read, "Cache Read"),
        price_update(cache_creation, "Cache Creation"),
        price_update(completion, "Completion"),
        model_hint
    )

def build_app():
    """Assemble the Gradio Blocks UI and wire up its events."""
    leaderboard_df = get_bash_only_df()
    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        trajectories_state = gr.State(None)
        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as the base for cost analysis")
        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )
                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## 📊 Trajectory Analysis")
                    with gr.Row():
                        plot_steps = gr.Plot(label="API Calls Distribution")
                        plot_cost = gr.Plot(label="Cost Distribution")
                    with gr.Row():
                        plot_tokens = gr.Plot(label="Token Usage by Type")
                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
                    with gr.Row():
                        plot_stacked = gr.Plot(label="Billable Tokens per Instance")
                    with gr.Row():
                        plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
            with gr.Column(scale=1):
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)
                download_btn = gr.Button("📥 Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)
                analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
                gr.Markdown("---")
                gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                price_input = gr.Number(label="💲 Input", value=0, precision=2)
                price_cache_read = gr.Number(label="💲 Cache Read", value=0, precision=2)
                price_cache_creation = gr.Number(label="💲 Cache Creation", value=0, precision=2)
                price_completion = gr.Number(label="💲 Completion", value=0, precision=2)
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
        )
        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status, analyze_btn],
        )

        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price):
            """Generator handler: reveal the section immediately, then stream
            in the finished figures once the trajectories are parsed."""
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
            )
            if not folder:
                yield empty_result
                return
            yield (
                gr.update(visible=True),
                None, None, None, None, None, None,
            )
            df = load_all_trajectories(folder)
            if df.empty:
                yield empty_result
                return
            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
            )

        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
            ],
        )
    return app

if __name__ == "__main__":
    app = build_app()
    app.queue()
    app.launch()