import json
import os
import random
import re
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tiktoken
from src.download_swebench_leaderboard import download_leaderboard
# Tokenizer cache
_tokenizer_cache = {}
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
_litellm_prices_cache = None
_trajectories_cache = {}
_calculated_tokens_cache = {}
_trajectory_steps_cache = {}
def calculate_routing_tokens(steps: list[dict]) -> dict:
"""
    Calculate the per-model token breakdown, simulating provider-side prompt caching.
Args:
steps: list of dicts with keys:
- model: str (model name)
- system_user: int (tokens for system/user message, usually only step 0)
- completion: int (generated tokens)
- observation: int or None (env response tokens, None for last step)
Returns:
dict with per-model totals:
{model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
"""
model_caches = {}
model_totals = {}
total_context = 0
prev_observation = 0
for i, step in enumerate(steps):
model = step["model"]
system_user = step.get("system_user", 0)
completion = step.get("completion", 0)
observation = step.get("observation") or 0
if model not in model_caches:
model_caches[model] = 0
if model not in model_totals:
model_totals[model] = {
"cache_read": 0,
"uncached_input": 0,
"completion": 0,
"observation": 0,
"cache_creation": 0,
}
cache_read = model_caches[model]
if i == 0:
uncached_input = system_user
else:
full_context_needed = total_context + prev_observation
uncached_input = full_context_needed - cache_read
cache_creation = uncached_input + completion
model_caches[model] = cache_read + cache_creation
model_totals[model]["cache_read"] += cache_read
model_totals[model]["uncached_input"] += uncached_input
model_totals[model]["completion"] += completion
model_totals[model]["observation"] += observation
model_totals[model]["cache_creation"] += cache_creation
total_context = cache_read + uncached_input + completion
prev_observation = observation
return model_totals
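# Worked example (derived from the logic above): two steps on one model,
#   steps = [
#       {"model": "m", "system_user": 100, "completion": 20, "observation": 30},
#       {"model": "m", "system_user": 0, "completion": 10, "observation": None},
#   ]
# Step 0 reads nothing from cache, pays 100 uncached tokens, and writes
# 100 + 20 = 120 tokens to cache (cache_creation counts uncached input plus the
# completion). Step 1 reads those 120 back, pays only the 30 observation tokens
# uncached, and writes 30 + 10 = 40 more, so calculate_routing_tokens returns
#   {"m": {"cache_read": 120, "uncached_input": 130, "completion": 30,
#          "observation": 30, "cache_creation": 160}}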
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
"""
Parse trajectory file into step format for calculate_routing_tokens.
Returns list of steps with:
- model: base model name
- system_user: tokens for system + user message (step 0 only)
- completion: assistant response tokens
- observation: env response tokens (None for last step)
"""
with open(traj_path, "r", encoding="utf-8") as f:
data = json.load(f)
messages = data.get("messages", [])
if not messages:
return []
count_tokens, _ = get_tokenizer(model_name)
    steps = []
    system_user_tokens = 0
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if isinstance(content, list):
            content = json.dumps(content)
        tokens = count_tokens(str(content))
        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # The initial user message counts toward the first step's system/user tokens.
                system_user_tokens += tokens
            else:
                # Later user messages are the environment's response to the previous step.
                steps[-1]["observation"] = tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
            })
            system_user_tokens = 0
    return steps
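# Shape sketch: a [system, user, assistant, user, assistant] transcript yields two
# steps; the first carries the system + initial user tokens in "system_user" and
# gets its "observation" filled from the follow-up user message, the second has
# system_user=0 and observation=None.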
def get_default_overhead(model_name: str) -> float:
"""Get default tokenizer overhead for model provider"""
model_lower = model_name.lower() if model_name else ""
if "claude" in model_lower or "anthropic" in model_lower:
return 1.24
elif "gemini" in model_lower or "google" in model_lower:
return 1.0
elif "gpt" in model_lower or "openai" in model_lower or "o1" in model_lower or "o3" in model_lower:
return 1.0
else:
return 1.0
def get_tokenizer(model_name: str):
"""Get appropriate tokenizer for model. Returns (tokenizer_func, name)"""
global _tokenizer_cache
model_lower = model_name.lower() if model_name else ""
if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
tokenizer_name = "o200k_base"
elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
tokenizer_name = "cl100k_base"
elif "gemini" in model_lower or "google" in model_lower:
return lambda text: int(len(text) / 3.23), "gemini_approx"
else:
tokenizer_name = "cl100k_base"
if tokenizer_name not in _tokenizer_cache:
_tokenizer_cache[tokenizer_name] = tiktoken.get_encoding(tokenizer_name)
enc = _tokenizer_cache[tokenizer_name]
return lambda text: len(enc.encode(text)), tokenizer_name
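# Usage sketch: get_tokenizer("gpt-4o") returns an o200k_base counter,
# Claude/GPT names fall back to cl100k_base, and Gemini-family names use a
# rough chars/3.23 approximation instead of a real tokenizer.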
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
"""Apply tokenizer overhead multiplier to all token counts"""
if df.empty or overhead == 1.0:
return df
df = df.copy()
df["prompt_tokens"] = (df["prompt_tokens"] * overhead).astype(int)
df["completion_tokens"] = (df["completion_tokens"] * overhead).astype(int)
df["cache_read_tokens"] = (df["cache_read_tokens"] * overhead).astype(int)
df["cache_creation_tokens"] = (df["cache_creation_tokens"] * overhead).astype(int)
df["total_tokens"] = df["prompt_tokens"] + df["completion_tokens"]
return df
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
"""Convert all tokens to uncached input + completion (no caching)"""
if df.empty:
return df
df = df.copy()
df["cache_read_tokens"] = 0
df["cache_creation_tokens"] = 0
return df
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
"""Load trajectories with self-calculated token counts using calculate_routing_tokens"""
global _calculated_tokens_cache
cache_key = f"calculated_{folder}"
if cache_key in _calculated_tokens_cache:
return _calculated_tokens_cache[cache_key]
trajectory_steps = load_all_trajectory_steps(folder)
rows = []
for instance_id, steps in trajectory_steps.items():
if not steps:
continue
try:
model_totals = calculate_routing_tokens(steps)
step_model = steps[0].get("model", "") if steps else ""
totals = model_totals.get(step_model, {})
cache_read = totals.get("cache_read", 0)
uncached_input = totals.get("uncached_input", 0)
completion = totals.get("completion", 0)
cache_creation = totals.get("cache_creation", 0)
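            # Note: in calculate_routing_tokens, cache_creation overlaps
            # uncached_input + completion, so it is not folded into prompt_tokens.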
prompt_tokens = cache_read + uncached_input
rows.append({
"instance_id": instance_id,
"model_name": step_model,
"api_calls": len(steps),
"instance_cost": 0,
"prompt_tokens": prompt_tokens,
"completion_tokens": completion,
"total_tokens": prompt_tokens + completion,
"cache_read_tokens": cache_read,
"cache_creation_tokens": cache_creation,
})
except Exception as e:
print(f"Error calculating tokens for {instance_id}: {e}")
df = pd.DataFrame(rows)
_calculated_tokens_cache[cache_key] = df
return df
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
"""
Load all trajectories as step sequences for routing calculations.
Returns:
dict mapping instance_id -> list of steps for calculate_routing_tokens
"""
global _trajectory_steps_cache
cache_key = f"steps_{folder}"
if cache_key in _trajectory_steps_cache:
return _trajectory_steps_cache[cache_key]
output_dir = TRAJS_DIR / folder
traj_files = list(output_dir.glob("*/*.traj.json"))
if not traj_files:
traj_files = list(output_dir.glob("*/*.traj"))
if not traj_files:
traj_files = list(output_dir.glob("*.traj.json"))
if not traj_files:
traj_files = list(output_dir.glob("*.traj"))
if not traj_files:
traj_files = list(output_dir.glob("*.json"))
model_name = ""
if traj_files:
try:
with open(traj_files[0], "r") as f:
first_data = json.load(f)
config = first_data.get("info", {}).get("config", {}).get("model", {})
model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
except Exception:
pass
result = {}
for traj_path in traj_files:
try:
instance_id = traj_path.stem.replace(".traj", "")
steps = parse_trajectory_to_steps(traj_path, model_name)
if steps:
result[instance_id] = steps
except Exception as e:
print(f"Error parsing steps for {traj_path}: {e}")
_trajectory_steps_cache[cache_key] = result
return result
def get_litellm_model_list() -> list[str]:
"""Get list of model names from litellm prices"""
prices = get_litellm_prices()
return sorted(prices.keys())
def get_litellm_prices() -> dict:
global _litellm_prices_cache
if _litellm_prices_cache is not None:
return _litellm_prices_cache
if LITELLM_PRICES_CACHE.exists():
with open(LITELLM_PRICES_CACHE) as f:
_litellm_prices_cache = json.load(f)
return _litellm_prices_cache
try:
response = requests.get(LITELLM_PRICES_URL, timeout=30)
response.raise_for_status()
_litellm_prices_cache = response.json()
DATA_DIR.mkdir(exist_ok=True)
with open(LITELLM_PRICES_CACHE, "w") as f:
json.dump(_litellm_prices_cache, f)
except Exception:
_litellm_prices_cache = {}
return _litellm_prices_cache
def normalize_model_name(name: str) -> str:
"""Normalize model name for comparison: lowercase, remove separators"""
return re.sub(r'[-_./]', '', name.lower())
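# e.g. normalize_model_name("anthropic/claude-3.5-sonnet") -> "anthropicclaude35sonnet"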
def get_model_prices(model_name: str) -> dict | None:
if not model_name:
return None
prices = get_litellm_prices()
clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
name_without_date = re.sub(r'-\d{8}$', '', clean_name)
candidates = [
model_name,
clean_name,
name_without_date,
f"anthropic/{clean_name}",
f"openai/{clean_name}",
f"anthropic/{name_without_date}",
f"openai/{name_without_date}",
]
for key in candidates:
if key in prices:
return prices[key]
normalized_name = normalize_model_name(clean_name)
normalized_no_date = normalize_model_name(name_without_date)
for key, value in prices.items():
key_normalized = normalize_model_name(key)
if normalized_name in key_normalized or normalized_no_date in key_normalized:
return value
key_last_part = key.split('/')[-1] if '/' in key else key
key_last_normalized = normalize_model_name(key_last_part)
if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
return value
return None
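# Lookup sketch: for an illustrative name like "anthropic/claude-opus-4-20250514",
# the exact-match candidates are the raw name, the provider-stripped
# "claude-opus-4-20250514", the date-stripped "claude-opus-4", and the
# provider-prefixed variants of both; if none hit, a normalized substring /
# last-path-segment match is attempted against every litellm key.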
def load_or_download_leaderboard():
if LEADERBOARD_CACHE.exists():
with open(LEADERBOARD_CACHE) as f:
return json.load(f)
filename = download_leaderboard(output_dir=str(DATA_DIR))
os.rename(filename, LEADERBOARD_CACHE)
with open(LEADERBOARD_CACHE) as f:
return json.load(f)
def get_bash_only_df():
data = load_or_download_leaderboard()
leaderboards = data.get("leaderboards", [])
bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
if not bash_only:
return pd.DataFrame()
rows = []
for r in bash_only["results"]:
resolved_pct = r.get("resolved", 0)
if isinstance(resolved_pct, (int, float)):
resolved_str = f"{resolved_pct:.1f}%"
else:
resolved_str = str(resolved_pct)
rows.append({
"name": r.get("name", ""),
"% resolved": resolved_str,
"date": r.get("date", ""),
"cost": round(r.get("cost", 0), 2),
"instance_cost": round(r.get("instance_cost", 0), 4),
"instance_calls": r.get("instance_calls", 0),
"folder": r.get("folder", ""),
"os_model": "✅" if r.get("os_model") else "❌",
})
return pd.DataFrame(rows)
def get_model_details(folder: str):
if not folder:
return None, "Select a model from the table"
data = load_or_download_leaderboard()
leaderboards = data.get("leaderboards", [])
bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
if not bash_only:
return None, "Leaderboard not found"
model = next((r for r in bash_only["results"] if r.get("folder") == folder), None)
if not model:
return None, f"Model with folder '{folder}' not found"
return model, None
def check_trajectories_downloaded(folder: str) -> bool:
if not folder:
return False
output_dir = TRAJS_DIR / folder
return output_dir.exists() and any(output_dir.iterdir())
def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
if not folder:
return "❌ No model selected", gr.update(visible=False)
model, error = get_model_details(folder)
if error:
return f"❌ {error}", gr.update(visible=False)
output_dir = TRAJS_DIR / folder
if output_dir.exists() and any(output_dir.iterdir()):
file_count = len(list(output_dir.glob("*/*.traj.json")))
if file_count == 0:
file_count = len(list(output_dir.glob("*/*.traj")))
if file_count == 0:
file_count = len(list(output_dir.glob("*.json")))
return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)
s3_path = f"{S3_BUCKET}/{folder}/trajs/"
output_dir.mkdir(parents=True, exist_ok=True)
progress(0, desc="Starting S3 download...")
try:
result = subprocess.run(
["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
capture_output=True,
text=True,
timeout=600,
)
if result.returncode != 0:
return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)
file_count = len(list(output_dir.glob("*/*.traj.json")))
if file_count == 0:
file_count = len(list(output_dir.glob("*/*.traj")))
if file_count == 0:
file_count = len(list(output_dir.glob("*.json")))
if file_count == 0:
return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)
per_instance = model.get("per_instance_details", {})
resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
total_count = len(per_instance)
if total_count > 0:
resolved_pct = f"{100*resolved_count/total_count:.1f}%"
else:
resolved_pct = "N/A"
status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
return status, gr.update(visible=True)
except subprocess.TimeoutExpired:
return "❌ Download timed out (>10 min)", gr.update(visible=False)
except FileNotFoundError:
return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
except Exception as e:
return f"❌ Error: {e}", gr.update(visible=False)
def parse_trajectory(traj_path: Path) -> dict:
with open(traj_path, "r", encoding="utf-8") as f:
data = json.load(f)
info = data.get("info", {})
model_stats = info.get("model_stats", {})
config = info.get("config", {})
model_config = config.get("model", {})
model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))
result = {
"instance_id": data.get("instance_id", traj_path.stem),
"model_name": model_name,
"api_calls": model_stats.get("api_calls", 0),
"instance_cost": model_stats.get("instance_cost", 0),
"prompt_tokens": 0,
"completion_tokens": 0,
"total_tokens": 0,
"cache_read_tokens": 0,
"cache_creation_tokens": 0,
}
messages = data.get("messages", [])
for msg in messages:
usage = None
if "usage" in msg:
usage = msg["usage"]
elif "extra" in msg and isinstance(msg["extra"], dict):
response = msg["extra"].get("response", {})
if isinstance(response, dict):
usage = response.get("usage", {})
if usage:
result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
result["total_tokens"] += usage.get("total_tokens", 0) or 0
result["cache_read_tokens"] += usage.get("cache_read_input_tokens", 0) or 0
result["cache_creation_tokens"] += usage.get("cache_creation_input_tokens", 0) or 0
return result
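# Usage shapes handled above (sketch): either message-level
#   {"role": "assistant", "usage": {"prompt_tokens": ..., "completion_tokens": ...}}
# or nested {"extra": {"response": {"usage": {...}}}}; cache counts use the
# Anthropic-style keys cache_read_input_tokens / cache_creation_input_tokens.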
def load_all_trajectories(folder: str) -> pd.DataFrame:
global _trajectories_cache
if folder in _trajectories_cache:
return _trajectories_cache[folder]
output_dir = TRAJS_DIR / folder
traj_files = list(output_dir.glob("*/*.traj.json"))
if not traj_files:
traj_files = list(output_dir.glob("*/*.traj"))
if not traj_files:
traj_files = list(output_dir.glob("*.traj.json"))
if not traj_files:
traj_files = list(output_dir.glob("*.traj"))
if not traj_files:
traj_files = list(output_dir.glob("*.json"))
rows = []
for traj_path in traj_files:
try:
rows.append(parse_trajectory(traj_path))
except Exception as e:
print(f"Error parsing {traj_path}: {e}")
df = pd.DataFrame(rows)
_trajectories_cache[folder] = df
return df
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
"""Create Total Cost by Token Type chart (can be called separately for price updates)"""
if df.empty:
return None
total_completion = df["completion_tokens"].sum()
total_cache_read = df["cache_read_tokens"].sum()
total_cache_creation = df["cache_creation_tokens"].sum()
df_temp = df.copy()
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
total_uncached_input = df_temp["uncached_input"].sum()
cost_uncached_input = total_uncached_input * input_price / 1e6
cost_cache_read = total_cache_read * cache_read_price / 1e6
cost_cache_creation = total_cache_creation * cache_creation_price / 1e6
cost_completion = total_completion * completion_price / 1e6
cost_data = pd.DataFrame({
"Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
"Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion],
})
fig = px.bar(
cost_data,
x="Token Type",
y="Cost ($)",
title="Total Cost by Token Type ($)",
color="Token Type",
color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
)
fig.update_layout(
xaxis_title="Token Type",
yaxis_title="Cost ($)",
showlegend=False,
margin=dict(l=40, r=20, t=40, b=40),
)
total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
fig.add_annotation(
text=f"Total: ${total_cost:.2f}",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=12),
)
return fig
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
"""Create only token-related charts (for source switching)"""
if df.empty:
return None, None, None
total_completion = df["completion_tokens"].sum()
total_cache_read = df["cache_read_tokens"].sum()
total_cache_creation = df["cache_creation_tokens"].sum()
df_temp = df.copy()
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
total_uncached_input = df_temp["uncached_input"].sum()
token_data = pd.DataFrame({
"Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
"Total Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
})
fig_tokens = px.bar(
token_data,
x="Token Type",
y="Total Tokens (M)",
title="Total Tokens by Type",
color="Token Type",
color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
)
fig_tokens.update_layout(
xaxis_title="Token Type",
yaxis_title="Tokens (M)",
showlegend=False,
margin=dict(l=40, r=20, t=40, b=40),
)
total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
fig_tokens.add_annotation(
text=f"Total: {total_all/1e6:.2f}M",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=12),
)
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
# Stacked bar chart - sort by total tokens (sum of all stacked)
df_sorted = df.copy()
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
df_sorted["trajectory_idx"] = range(len(df_sorted))
fig_stacked = go.Figure()
fig_stacked.add_trace(go.Bar(
name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"] / 1e6,
marker_color="#EF553B", hovertemplate="Trajectory: %{x}
Uncached Input: %{y:.2f}M",
))
fig_stacked.add_trace(go.Bar(
name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"] / 1e6,
marker_color="#19D3F3", hovertemplate="Trajectory: %{x}
Cache Read: %{y:.2f}M",
))
fig_stacked.add_trace(go.Bar(
name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"] / 1e6,
marker_color="#FFA15A", hovertemplate="Trajectory: %{x}
Cache Creation: %{y:.2f}M",
))
fig_stacked.add_trace(go.Bar(
name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"] / 1e6,
marker_color="#AB63FA", hovertemplate="Trajectory: %{x}
Completion: %{y:.2f}M",
))
fig_stacked.update_layout(
barmode="stack",
title="Tokens per Trajectory (stacked)",
xaxis_title="Trajectory (sorted by total tokens)",
yaxis_title="Tokens (M)",
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
margin=dict(l=50, r=20, t=60, b=40),
)
return fig_tokens, fig_tokens_cost, fig_stacked
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
if df.empty:
return None, None, None, None, None
fig_steps = px.histogram(
df,
x="api_calls",
nbins=30,
title="Distribution of API Calls (Steps) per Trajectory",
color_discrete_sequence=["#636EFA"],
)
fig_steps.update_layout(
xaxis_title="API Calls (Steps)",
yaxis_title="Number of Trajectories",
showlegend=False,
margin=dict(l=40, r=20, t=40, b=40),
)
fig_steps.add_annotation(
text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=12),
)
fig_cost = px.histogram(
df,
x="instance_cost",
nbins=30,
title="Distribution of Cost Reported by Leaderboard ($)",
color_discrete_sequence=["#00CC96"],
)
fig_cost.update_layout(
xaxis_title="Cost ($)",
yaxis_title="Number of Trajectories",
showlegend=False,
margin=dict(l=40, r=20, t=40, b=40),
)
fig_cost.add_annotation(
text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=12),
)
total_completion = df["completion_tokens"].sum()
total_cache_read = df["cache_read_tokens"].sum()
total_cache_creation = df["cache_creation_tokens"].sum()
# Uncached input = prompt - cache_read - cache_creation (per trajectory, then sum)
df_temp = df.copy()
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
total_uncached_input = df_temp["uncached_input"].sum()
token_data = pd.DataFrame({
"Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
"Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
})
fig_tokens = px.bar(
token_data,
x="Token Type",
y="Tokens (M)",
title="Total Tokens by Type",
color="Token Type",
color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
)
fig_tokens.update_layout(
xaxis_title="Token Type",
yaxis_title="Tokens (M)",
showlegend=False,
margin=dict(l=40, r=20, t=40, b=40),
)
total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
fig_tokens.add_annotation(
text=f"Total: {total_all/1e6:.2f}M",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=12),
)
# Cost by token type (use separate function)
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
# Sort by total tokens (sum of all stacked)
df_sorted = df.copy()
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
df_sorted["trajectory_idx"] = range(len(df_sorted))
fig_stacked = go.Figure()
fig_stacked.add_trace(go.Bar(
name="Uncached Input",
x=df_sorted["trajectory_idx"],
y=df_sorted["uncached_input_tokens"] / 1e6,
marker_color="#EF553B",
hovertemplate="Trajectory: %{x}
Uncached Input: %{y:.3f}M",
))
fig_stacked.add_trace(go.Bar(
name="Cache Read",
x=df_sorted["trajectory_idx"],
y=df_sorted["cache_read_tokens"] / 1e6,
marker_color="#19D3F3",
hovertemplate="Trajectory: %{x}
Cache Read: %{y:.3f}M",
))
fig_stacked.add_trace(go.Bar(
name="Cache Creation",
x=df_sorted["trajectory_idx"],
y=df_sorted["cache_creation_tokens"] / 1e6,
marker_color="#FFA15A",
hovertemplate="Trajectory: %{x}
Cache Creation: %{y:.3f}M",
))
fig_stacked.add_trace(go.Bar(
name="Completion",
x=df_sorted["trajectory_idx"],
y=df_sorted["completion_tokens"] / 1e6,
marker_color="#AB63FA",
hovertemplate="Trajectory: %{x}
Completion: %{y:.3f}M",
))
fig_stacked.update_layout(
barmode="stack",
title="Tokens per Trajectory (stacked)",
xaxis_title="Trajectory (sorted by total tokens)",
yaxis_title="Tokens (M)",
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
margin=dict(l=50, r=20, t=60, b=40),
)
return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
if df.empty:
return None
# Sort by total tokens (sum of all stacked)
df_sorted = df.copy()
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
df_sorted["trajectory_idx"] = range(len(df_sorted))
df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
df_sorted["cost_cache_creation"] = df_sorted["cache_creation_tokens"] * cache_creation_price / 1e6
df_sorted["cost_completion"] = df_sorted["completion_tokens"] * completion_price / 1e6
fig = go.Figure()
fig.add_trace(go.Bar(
name=f"Uncached Input (${input_price:.2f}/1M)",
x=df_sorted["trajectory_idx"],
y=df_sorted["cost_uncached_input"],
marker_color="#EF553B",
hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}",
))
fig.add_trace(go.Bar(
name=f"Cache Read (${cache_read_price:.2f}/1M)",
x=df_sorted["trajectory_idx"],
y=df_sorted["cost_cache_read"],
marker_color="#19D3F3",
hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}",
))
fig.add_trace(go.Bar(
name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
x=df_sorted["trajectory_idx"],
y=df_sorted["cost_cache_creation"],
marker_color="#FFA15A",
hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}",
))
fig.add_trace(go.Bar(
name=f"Completion (${completion_price:.2f}/1M)",
x=df_sorted["trajectory_idx"],
y=df_sorted["cost_completion"],
marker_color="#AB63FA",
hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}",
))
total_cost = (
df_sorted["cost_uncached_input"].sum() +
df_sorted["cost_cache_read"].sum() +
df_sorted["cost_cache_creation"].sum() +
df_sorted["cost_completion"].sum()
)
fig.update_layout(
barmode="stack",
title="Cost per Trajectory",
xaxis_title="Trajectory (sorted by total tokens)",
yaxis_title="Cost ($)",
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
margin=dict(l=50, r=20, t=60, b=40),
)
fig.add_annotation(
text=f"Total: ${total_cost:.2f}",
xref="paper", yref="paper",
x=0.95, y=0.95, showarrow=False,
font=dict(size=14),
bgcolor="white",
)
return fig
def extract_model_from_folder(folder: str) -> str:
"""Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'"""
if not folder:
return ""
parts = folder.split("_")
if len(parts) >= 3:
return "_".join(parts[2:])
return folder
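# e.g. extract_model_from_folder("20251124_mini-v1.16.0_claude-opus-4-5-20251101")
# -> "claude-opus-4-5-20251101"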
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
"""Get prices from litellm based on folder name.
Returns (prices_dict, model_name) where prices_dict has 'value' and 'found' for each price type."""
model_hint = extract_model_from_folder(folder)
result = {
"input": {"value": 0, "found": False},
"cache_read": {"value": 0, "found": False},
"cache_creation": {"value": 0, "found": False},
"completion": {"value": 0, "found": False},
}
if not model_hint:
return result, ""
prices = get_model_prices(model_hint)
if prices:
# Get values from litellm
input_price = prices.get("input_cost_per_token", 0) * 1e6
cache_read = prices.get("cache_read_input_token_cost", 0) * 1e6
cache_creation = prices.get("cache_creation_input_token_cost", 0) * 1e6
completion = prices.get("output_cost_per_token", 0) * 1e6
result["input"] = {"value": input_price, "found": input_price > 0}
result["cache_read"] = {"value": cache_read, "found": cache_read > 0}
result["cache_creation"] = {"value": cache_creation, "found": cache_creation > 0}
result["completion"] = {"value": completion, "found": completion > 0}
# Apply fallback estimates based on standard ratios
# Cache Read = Input * 0.1 (90% discount)
# Cache Creation = Input * 1.25 (25% premium)
# Completion = Input * 5 (typical ratio)
if input_price > 0:
if not result["cache_read"]["found"]:
result["cache_read"]["value"] = input_price * 0.1
if not result["cache_creation"]["found"]:
result["cache_creation"]["value"] = input_price * 1.25
if not result["completion"]["found"]:
result["completion"]["value"] = input_price * 5
elif completion > 0:
# If we only have completion, estimate input from it
estimated_input = completion / 5
if not result["input"]["found"]:
result["input"]["value"] = estimated_input
if not result["cache_read"]["found"]:
result["cache_read"]["value"] = estimated_input * 0.1
if not result["cache_creation"]["found"]:
result["cache_creation"]["value"] = estimated_input * 1.25
return result, model_hint
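# Example of the fallback ratios: if litellm only lists input_cost at $3/1M,
# the estimates become cache_read = $0.30, cache_creation = $3.75, and
# completion = $15.00 per 1M tokens.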
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
if evt.index is None:
return (
"", "",
gr.update(visible=False),
gr.update(value=0, label="Input"),
gr.update(value=0, label="Cache Read"),
gr.update(value=0, label="Cache Creation"),
gr.update(value=0, label="Completion"),
"",
gr.update(value=1.0),
)
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
row = df.iloc[row_idx]
folder = row["folder"]
name = row["name"]
prices_dict, model_hint = get_prices_for_folder(folder)
default_overhead = get_default_overhead(model_hint)
def price_update(price_info, name):
value = price_info["value"]
if price_info["found"]:
return gr.update(value=value, label=f"✅ {name}")
elif value > 0:
return gr.update(value=value, label=f"❌ {name} (est.)")
else:
return gr.update(value=0, label=f"❌ {name}")
return (
folder, name,
gr.update(visible=True),
price_update(prices_dict["input"], "Input"),
price_update(prices_dict["cache_read"], "Cache Read"),
price_update(prices_dict["cache_creation"], "Cache Creation"),
price_update(prices_dict["completion"], "Completion"),
model_hint,
gr.update(value=default_overhead),
)
def create_routed_token_chart(base_tokens: dict, additional_models: list):
"""
Create grouped bar chart for tokens by type, comparing base vs additional models.
Args:
base_tokens: dict with uncached_input, cache_read, cache_creation, completion
additional_models: list of (model_name, tokens_dict) tuples
"""
categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
fig = go.Figure()
base_total = sum(base_tokens.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"])
base_values = [
base_tokens.get("uncached_input", 0) / 1e6,
base_tokens.get("cache_read", 0) / 1e6,
base_tokens.get("cache_creation", 0) / 1e6,
base_tokens.get("completion", 0) / 1e6,
]
fig.add_trace(go.Bar(name="Base Model", x=categories, y=base_values, marker_color=colors[0]))
model_totals = [("Base Model", base_total)]
for i, (model_name, tokens) in enumerate(additional_models):
model_total = sum(tokens.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"])
model_totals.append((model_name or f"Model {i+1}", model_total))
values = [
tokens.get("uncached_input", 0) / 1e6,
tokens.get("cache_read", 0) / 1e6,
tokens.get("cache_creation", 0) / 1e6,
tokens.get("completion", 0) / 1e6,
]
color = colors[(i + 1) % len(colors)]
fig.add_trace(go.Bar(name=model_name or f"Model {i+1}", x=categories, y=values, marker_color=color))
grand_total = sum(t for _, t in model_totals)
annotation_lines = [f"Total: {grand_total/1e6:.2f}M"]
for name, total in model_totals:
annotation_lines.append(f"{name}: {total/1e6:.2f}M")
fig.update_layout(
title="Tokens by Type (per Model)",
yaxis_title="Tokens (M)",
barmode="group",
margin=dict(l=40, r=40, t=80, b=40),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.add_annotation(
text="
".join(annotation_lines),
xref="paper", yref="paper",
x=0.02, y=0.98, showarrow=False,
font=dict(size=11),
align="left",
bgcolor="rgba(255,255,255,0.8)",
bordercolor="gray",
borderwidth=1,
)
return fig
def create_routed_cost_chart(base_costs: dict, additional_models: list):
"""
Create grouped bar chart for costs by type, comparing base vs additional models.
Args:
base_costs: dict with uncached_input, cache_read, cache_creation, completion
additional_models: list of (model_name, costs_dict) tuples
"""
categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]
fig = go.Figure()
base_total = sum(base_costs.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"])
base_values = [
base_costs.get("uncached_input", 0),
base_costs.get("cache_read", 0),
base_costs.get("cache_creation", 0),
base_costs.get("completion", 0),
]
fig.add_trace(go.Bar(name="Base Model", x=categories, y=base_values, marker_color=colors[0]))
model_totals = [("Base Model", base_total)]
for i, (model_name, costs) in enumerate(additional_models):
model_total = sum(costs.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"])
model_totals.append((model_name or f"Model {i+1}", model_total))
values = [
costs.get("uncached_input", 0),
costs.get("cache_read", 0),
costs.get("cache_creation", 0),
costs.get("completion", 0),
]
color = colors[(i + 1) % len(colors)]
fig.add_trace(go.Bar(name=model_name or f"Model {i+1}", x=categories, y=values, marker_color=color))
grand_total = sum(t for _, t in model_totals)
annotation_lines = [f"Total: ${grand_total:.2f}"]
for name, total in model_totals:
annotation_lines.append(f"{name}: ${total:.2f}")
fig.update_layout(
title="Cost by Type (per Model) ($)",
yaxis_title="Cost ($)",
barmode="group",
margin=dict(l=40, r=40, t=80, b=40),
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.add_annotation(
text="
".join(annotation_lines),
xref="paper", yref="paper",
x=0.02, y=0.98, showarrow=False,
font=dict(size=11),
align="left",
bgcolor="rgba(255,255,255,0.8)",
bordercolor="gray",
borderwidth=1,
)
return fig
def build_app():
leaderboard_df = get_bash_only_df()
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
trajectories_state = gr.State(None)
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
gr.Markdown("Select a model to use as base for cost analysis")
with gr.Row():
with gr.Column(scale=3):
leaderboard_table = gr.Dataframe(
value=leaderboard_df,
label="Bash-Only Leaderboard",
interactive=False,
wrap=True,
)
with gr.Column(visible=False) as analysis_section:
gr.Markdown("## 📊 Trajectory Analysis")
with gr.Row():
plot_steps = gr.Plot(label="API Calls Distribution")
plot_cost = gr.Plot(label="Cost Distribution")
with gr.Row():
plot_tokens = gr.Plot(label="Token Usage by Type")
plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
with gr.Row():
plot_stacked = gr.Plot(label="Tokens per Trajectory")
plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
with gr.Row(visible=False) as routing_plots_row:
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
routing_cost_plot = gr.Plot(label="Cost by Type (per Model)")
with gr.Column(scale=1):
selected_folder = gr.State("")
gr.Markdown("### Selected Model")
selected_name = gr.Textbox(label="Model Name", interactive=False)
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
download_status = gr.Textbox(label="Status", interactive=False, lines=3)
gr.Markdown("---")
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
detected_model = gr.Textbox(label="Detected Model", interactive=False)
with gr.Row():
price_input = gr.Number(label="Input", value=0, precision=2, scale=1)
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1)
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1)
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1)
gr.Markdown("---")
gr.Markdown("### 📊 Token Count Source")
token_source = gr.Radio(
choices=["Metadata", "Calculated"],
value="Metadata",
)
thinking_overhead = gr.Number(
label="🔢 Tokenizer Overhead",
value=1.21,
precision=2,
info="Multiplier for Calculated tokens (tiktoken → native)",
visible=False,
)
use_cache = gr.Checkbox(
label="Use Cache",
value=True,
info="If disabled, all tokens are Uncached Input or Completion",
visible=False,
)
gr.Markdown("---")
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False)
with gr.Column(visible=False) as routing_section:
gr.Markdown("### 🔀 Routing Models")
with gr.Column():
with gr.Group():
gr.Markdown("#### Route to Model 1")
routing_model_1 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1)
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False)
with gr.Column(visible=False) as routing_block_2:
with gr.Group():
gr.Markdown("#### Route to Model 2")
routing_model_2 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)
with gr.Column(visible=False) as routing_block_3:
with gr.Group():
gr.Markdown("#### Route to Model 3")
routing_model_3 = gr.Dropdown(
label="Model (type 3+ chars to search)",
choices=[],
allow_custom_value=True,
interactive=True,
)
with gr.Row():
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)
gr.Markdown("---")
gr.Markdown("### 🎯 Router Strategy")
selected_strategy = gr.Radio(
choices=["Random weights", "Every k-th step", "Replace part of trajectory"],
value="Random weights",
label="Strategy",
interactive=True,
)
random_hint = gr.Markdown("*Weights must sum to 1.0*", visible=True)
weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True, visible=True)
weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True, visible=True)
weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
every_k_hint = gr.Markdown("*First model has priority on overlaps*", visible=False)
k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True, visible=False)
k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False)
k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False)
part_hint = gr.Markdown("*Ranges must not overlap*", visible=False)
part_mode = gr.Radio(
choices=["Indexes", "Percentages"],
value="Percentages",
label="Mode",
interactive=True,
visible=False,
)
start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True, visible=False)
end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True, visible=False)
start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False)
end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False)
start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False)
end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False)
gr.Markdown("---")
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
routing_result = gr.Markdown(visible=False)
def toggle_routing_section():
return gr.update(visible=True)
add_routing_btn.click(
fn=toggle_routing_section,
outputs=[routing_section],
)
def on_strategy_change(strategy):
is_random = strategy == "Random weights"
is_every_k = strategy == "Every k-th step"
is_part = strategy == "Replace part of trajectory"
print(f"DEBUG on_strategy_change: strategy={strategy}")
return (
gr.update(visible=is_random),
gr.update(visible=is_random),
gr.update(visible=is_random),
gr.update(visible=is_every_k),
gr.update(visible=is_every_k),
gr.update(visible=is_part),
gr.update(visible=is_part),
gr.update(visible=is_part),
gr.update(visible=is_part),
)
selected_strategy.change(
fn=on_strategy_change,
inputs=[selected_strategy],
outputs=[
random_hint, weight_base, weight_model_1,
every_k_hint, k_model_1,
part_hint, part_mode, start_1, end_1,
],
)
def filter_models(query):
"""Filter models based on search query (starts at 3 chars)"""
if not query or len(query) < 3:
return gr.update(choices=[])
all_models = get_litellm_model_list()
query_lower = query.lower()
filtered = [m for m in all_models if query_lower in m.lower()][:50]
return gr.update(choices=filtered)
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
def get_routing_prices_with_labels(model_name):
"""Get all 4 prices for a routing model with found/estimated labels"""
if not model_name:
return (
gr.update(value=0, label="Input"),
gr.update(value=0, label="Cache Read"),
gr.update(value=0, label="Cache Creation"),
gr.update(value=0, label="Completion"),
)
prices = get_litellm_prices()
model_prices = prices.get(model_name, {})
input_price = model_prices.get("input_cost_per_token", 0) * 1e6
cache_read = model_prices.get("cache_read_input_token_cost", 0) * 1e6
cache_creation = model_prices.get("cache_creation_input_token_cost", 0) * 1e6
completion = model_prices.get("output_cost_per_token", 0) * 1e6
input_found = input_price > 0
cache_read_found = cache_read > 0
cache_creation_found = cache_creation > 0
completion_found = completion > 0
if not cache_read_found and input_price > 0:
cache_read = input_price * 0.1
if not cache_creation_found and input_price > 0:
cache_creation = input_price * 1.25
def label(name, found):
return f"✅ {name}" if found else f"❌ {name}"
return (
gr.update(value=input_price, label=label("Input", input_found)),
gr.update(value=cache_read, label=label("Cache Read", cache_read_found)),
gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)),
gr.update(value=completion, label=label("Completion", completion_found)),
)
def on_routing_model_1_select(model_name):
prices = get_routing_prices_with_labels(model_name)
show_btn = bool(model_name)
return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)
def on_routing_model_2_select(model_name):
prices = get_routing_prices_with_labels(model_name)
show_btn = bool(model_name)
return *prices, gr.update(visible=show_btn)
def on_routing_model_3_select(model_name):
return get_routing_prices_with_labels(model_name)
routing_model_1.change(
fn=on_routing_model_1_select,
inputs=[routing_model_1],
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn],
)
def show_model_2(strategy):
is_random = strategy == "Random weights"
is_every_k = strategy == "Every k-th step"
is_part = strategy == "Replace part of trajectory"
return (
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=is_random),
gr.update(visible=is_every_k),
gr.update(visible=is_part),
gr.update(visible=is_part),
)
add_model_2_btn.click(
fn=show_model_2,
inputs=[selected_strategy],
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, start_2, end_2],
)
routing_model_2.change(
fn=on_routing_model_2_select,
inputs=[routing_model_2],
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn],
)
def show_model_3(strategy):
is_random = strategy == "Random weights"
is_every_k = strategy == "Every k-th step"
is_part = strategy == "Replace part of trajectory"
return (
gr.update(visible=True),
gr.update(visible=False),
gr.update(visible=is_random),
gr.update(visible=is_every_k),
gr.update(visible=is_part),
gr.update(visible=is_part),
)
add_model_3_btn.click(
fn=show_model_3,
inputs=[selected_strategy],
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, start_3, end_3],
)
routing_model_3.change(
fn=on_routing_model_3_select,
inputs=[routing_model_3],
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
)
def run_routing(
state_data,
base_input, base_cache_read, base_cache_creation, base_completion,
routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion,
routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion,
strategy_val,
weight_base_val, weight_1_val, weight_2_val, weight_3_val,
k_1_val, k_2_val, k_3_val,
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
source, overhead, with_cache
):
if state_data is None:
yield (
gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."),
gr.update(visible=False),
None, None,
)
return
if not routing_model_1_val:
yield (
gr.update(visible=True, value="❌ Please select at least one routing model."),
gr.update(visible=False),
None, None,
)
return
trajectory_steps = state_data.get("steps", {})
if not trajectory_steps:
yield (
gr.update(visible=True, value="❌ No trajectory steps data available."),
gr.update(visible=False),
None, None,
)
return
df_calc = state_data.get("calculated")
if df_calc is not None and not df_calc.empty:
                # Apply the tokenizer overhead only when the "Calculated" source is
                # active, matching the per-step simulation below.
                df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead if source == "Calculated" else 1.0)
if not with_cache:
df_for_cost = apply_no_cache(df_for_cost)
df_temp = df_for_cost.copy()
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
total_original_cost_from_df = (
df_temp["uncached_input"].sum() * base_input / 1e6 +
df_for_cost["cache_read_tokens"].sum() * base_cache_read / 1e6 +
df_for_cost["cache_creation_tokens"].sum() * base_cache_creation / 1e6 +
df_for_cost["completion_tokens"].sum() * base_completion / 1e6
)
else:
total_original_cost_from_df = None
base_prices = {
"input": base_input,
"cache_read": base_cache_read,
"cache_creation": base_cache_creation,
"completion": base_completion,
}
routing_models = []
if routing_model_1_val:
routing_models.append({
"name": routing_model_1_val,
"prices": {"input": r1_input, "cache_read": r1_cache_read, "cache_creation": r1_cache_creation, "completion": r1_completion},
})
if routing_model_2_val:
routing_models.append({
"name": routing_model_2_val,
"prices": {"input": r2_input, "cache_read": r2_cache_read, "cache_creation": r2_cache_creation, "completion": r2_completion},
})
if routing_model_3_val:
routing_models.append({
"name": routing_model_3_val,
"prices": {"input": r3_input, "cache_read": r3_cache_read, "cache_creation": r3_cache_creation, "completion": r3_completion},
})
if strategy_val == "Replace part of trajectory":
ranges = [(start_1_val, end_1_val)]
if len(routing_models) > 1:
ranges.append((start_2_val, end_2_val))
if len(routing_models) > 2:
ranges.append((start_3_val, end_3_val))
for i, (s, e) in enumerate(ranges):
if s >= e:
yield (gr.update(visible=True, value=f"❌ Model {i+1}: Start must be less than End"), gr.update(visible=False), None, None)
return
for i in range(len(ranges)):
for j in range(i+1, len(ranges)):
s1, e1 = ranges[i]
s2, e2 = ranges[j]
if not (e1 <= s2 or e2 <= s1):
yield (gr.update(visible=True, value=f"❌ Model {i+1} and Model {j+1} ranges overlap"), gr.update(visible=False), None, None)
return
weights = None
if strategy_val == "Random weights":
weights = [weight_base_val, weight_1_val]
if len(routing_models) > 1:
weights.append(weight_2_val)
if len(routing_models) > 2:
weights.append(weight_3_val)
total_weight = sum(weights)
if abs(total_weight - 1.0) > 0.01:
yield (gr.update(visible=True, value=f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})"), gr.update(visible=False), None, None)
return
k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)]
part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)]
BASE_MODEL = "__base__"
model_keys = [BASE_MODEL] + [f"__routing_{i}__" for i in range(len(routing_models))]
all_tokens = {key: {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for key in model_keys}
total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
for instance_id, steps in trajectory_steps.items():
if not steps:
continue
total_steps = len(steps)
step_to_model = {}
if strategy_val == "Random weights":
model_choices = [BASE_MODEL] + [f"__routing_{j}__" for j in range(len(routing_models))]
for i in range(total_steps):
step_to_model[i] = random.choices(model_choices, weights=weights)[0]
elif strategy_val == "Every k-th step":
for j, k_val in enumerate(k_values):
if k_val and k_val > 0:
for i in range(total_steps):
if (i + 1) % int(k_val) == 0:
if i not in step_to_model:
step_to_model[i] = f"__routing_{j}__"
elif strategy_val == "Replace part of trajectory":
for j, (start_val, end_val) in enumerate(part_ranges):
if part_mode_val == "Percentages":
start_idx = int(total_steps * start_val / 100)
end_idx = int(total_steps * end_val / 100)
else:
start_idx = int(start_val)
end_idx = min(int(end_val), total_steps)
for i in range(start_idx, end_idx):
step_to_model[i] = f"__routing_{j}__"
modified_steps = []
for i, step in enumerate(steps):
model = step_to_model.get(i, BASE_MODEL)
modified_steps.append({
"model": model,
"system_user": step.get("system_user", 0),
"completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
"observation": step.get("observation"),
})
model_totals = calculate_routing_tokens(modified_steps)
for key in model_keys:
totals = model_totals.get(key, {})
all_tokens[key]["cache_read"] += totals.get("cache_read", 0)
all_tokens[key]["uncached_input"] += totals.get("uncached_input", 0)
all_tokens[key]["completion"] += totals.get("completion", 0)
all_tokens[key]["cache_creation"] += totals.get("cache_creation", 0)
original_steps = []
for step in steps:
original_steps.append({
"model": BASE_MODEL,
"system_user": step.get("system_user", 0),
"completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
"observation": step.get("observation"),
})
original_totals = calculate_routing_tokens(original_steps)
orig = original_totals.get(BASE_MODEL, {})
total_original_tokens["cache_read"] += orig.get("cache_read", 0)
total_original_tokens["uncached_input"] += orig.get("uncached_input", 0)
total_original_tokens["completion"] += orig.get("completion", 0)
total_original_tokens["cache_creation"] += orig.get("cache_creation", 0)
def calc_cost(tokens: dict, prices: dict) -> float:
return (
tokens["uncached_input"] * prices["input"] / 1e6 +
tokens["cache_read"] * prices["cache_read"] / 1e6 +
tokens["cache_creation"] * prices["cache_creation"] / 1e6 +
tokens["completion"] * prices["completion"] / 1e6
)
def tokens_to_costs(tokens: dict, prices: dict) -> dict:
price_map = {"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}
return {k: tokens[k] * prices[price_map[k]] / 1e6 for k in tokens}
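            # Example (using the default fallback ratios): 1M uncached @ $3/1M +
            # 2M cache reads @ $0.30/1M + 0.5M cache writes @ $3.75/1M +
            # 0.1M completion @ $15/1M = $6.975.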
total_base_tokens = all_tokens[BASE_MODEL]
base_costs = tokens_to_costs(total_base_tokens, base_prices)
total_base_cost = calc_cost(total_base_tokens, base_prices)
routing_costs_list = []
total_routing_cost = 0
for i, rm in enumerate(routing_models):
key = f"__routing_{i}__"
tokens = all_tokens[key]
costs = tokens_to_costs(tokens, rm["prices"])
cost = calc_cost(tokens, rm["prices"])
routing_costs_list.append({"name": rm["name"], "tokens": tokens, "costs": costs, "cost": cost})
total_routing_cost += cost
if total_original_cost_from_df is not None:
total_original_cost = total_original_cost_from_df
else:
total_original_cost = calc_cost(total_original_tokens, base_prices)
total_routed_cost = total_base_cost + total_routing_cost
savings = total_original_cost - total_routed_cost
savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0
result_lines = [
"## 🚀 Routing Results",
"",
"| Metric | Value |",
"|--------|-------|",
f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
f"| **Routed Cost** | ${total_routed_cost:.2f} |",
f"| ↳ Base model portion | ${total_base_cost:.2f} |",
]
for rc in routing_costs_list:
result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
result_lines.append(f"| **Savings** | ${savings:.2f} ({savings_pct:+.1f}%) |")
result_text = "\n".join(result_lines)
additional_token_models = [(rc["name"], rc["tokens"]) for rc in routing_costs_list]
additional_cost_models = [(rc["name"], rc["costs"]) for rc in routing_costs_list]
yield (
gr.update(visible=True, value="⏳ Creating charts..."),
gr.update(visible=True),
None,
None,
)
tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
yield (
gr.update(visible=True, value=result_text),
gr.update(visible=True),
tokens_chart,
cost_chart,
)
route_btn.click(
fn=run_routing,
inputs=[
trajectories_state,
price_input, price_cache_read, price_cache_creation, price_completion,
routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
selected_strategy,
weight_base, weight_model_1, weight_model_2, weight_model_3,
k_model_1, k_model_2, k_model_3,
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
token_source, thinking_overhead, use_cache,
],
outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
def update_calculated_options_visibility(source):
is_calc = source == "Calculated"
return gr.update(visible=is_calc), gr.update(visible=is_calc)
token_source.change(
fn=update_calculated_options_visibility,
inputs=[token_source],
outputs=[thinking_overhead, use_cache],
)
leaderboard_table.select(
fn=on_row_select,
inputs=[leaderboard_table],
outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
)
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache, progress=gr.Progress()):
empty_result = (
"",
gr.update(visible=False),
None, None, None, None, None, None,
None,
gr.update(visible=False),
)
if not folder:
yield empty_result
return
if not check_trajectories_downloaded(folder):
yield (
"⏳ Downloading trajectories...",
gr.update(visible=False),
None, None, None, None, None, None,
None,
gr.update(visible=False),
)
status, _ = download_trajectories_from_s3(folder)
if "❌" in status:
yield (
status,
gr.update(visible=False),
None, None, None, None, None, None,
None,
gr.update(visible=False),
)
return
yield (
"⏳ Loading trajectories...",
gr.update(visible=True),
None, None, None, None, None, None,
None,
gr.update(visible=False),
)
df_meta = load_all_trajectories(folder)
df_calc = load_all_trajectories_calculated(folder)
df_calc["api_calls"] = df_meta["api_calls"].values
df_calc["instance_cost"] = df_meta["instance_cost"].values
trajectory_steps = load_all_trajectory_steps(folder)
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
if source == "Metadata":
df = df_meta
else:
df = apply_thinking_overhead(df_calc.copy(), overhead)
if not with_cache:
df = apply_no_cache(df)
if df.empty:
yield (
"❌ No trajectories found",
gr.update(visible=False),
None, None, None, None, None, None,
None,
gr.update(visible=False),
)
return
fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
df, input_price, cache_read_price, cache_creation_price, completion_price
)
fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
yield (
f"✅ Loaded {len(df)} trajectories",
gr.update(visible=True),
fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
state_data,
gr.update(visible=True),
)
analyze_btn.click(
fn=load_and_analyze,
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
outputs=[
download_status,
analysis_section,
plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
trajectories_state,
add_routing_btn,
],
)
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
if state_data is None:
return None, None
if source == "Metadata":
df = state_data["meta"]
else:
df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
if not with_cache:
df = apply_no_cache(df)
if df.empty:
return None, None
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
return fig_tokens_cost, fig_cost_breakdown
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
price_outputs = [plot_tokens_cost, plot_cost_breakdown]
price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
"""Recalculate only token-dependent charts when source changes"""
if state_data is None:
return None, None, None, None
if source == "Metadata":
df = state_data["meta"]
else:
df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
if not with_cache:
df = apply_no_cache(df)
if df.empty:
return None, None, None, None
fig_tokens, fig_tokens_cost, fig_stacked = create_token_charts(
df, input_price, cache_read_price, cache_creation_price, completion_price
)
fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown
source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
token_source.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
thinking_overhead.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
use_cache.change(
fn=on_source_change,
inputs=source_change_inputs,
outputs=source_change_outputs,
)
return app
if __name__ == "__main__":
app = build_app()
app.queue()
app.launch()