import json
import os
import random
import re
import subprocess
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tiktoken

from src.download_swebench_leaderboard import download_leaderboard

# Tokenizer cache
_tokenizer_cache = {}

DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"

_litellm_prices_cache = None
_trajectories_cache = {}
_calculated_tokens_cache = {}
_trajectory_steps_cache = {}


def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Calculate token breakdown per model with proper caching simulation.

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (tokens for system/user message, usually only step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    model_caches = {}
    model_totals = {}
    total_context = 0
    prev_observation = 0

    for i, step in enumerate(steps):
        model = step["model"]
        system_user = step.get("system_user", 0)
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        if model not in model_caches:
            model_caches[model] = 0
        if model not in model_totals:
            model_totals[model] = {
                "cache_read": 0,
                "uncached_input": 0,
                "completion": 0,
                "observation": 0,
                "cache_creation": 0,
            }

        cache_read = model_caches[model]
        if i == 0:
            uncached_input = system_user
        else:
            full_context_needed = total_context + prev_observation
            uncached_input = full_context_needed - cache_read

        cache_creation = uncached_input + completion
        model_caches[model] = cache_read + cache_creation

        model_totals[model]["cache_read"] += cache_read
        model_totals[model]["uncached_input"] += uncached_input
        model_totals[model]["completion"] += completion
        model_totals[model]["observation"] += observation
        model_totals[model]["cache_creation"] += cache_creation

        total_context = cache_read + uncached_input + completion
        prev_observation = observation

    return model_totals
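
# Illustrative walk-through of the cache simulation above (not used by the app).
# With a single model and two steps, step 0 pays its prompt uncached and caches
# prompt + completion; step 1 reads that cache back and pays only the new
# observation tokens uncached.
def _example_calculate_routing_tokens():
    steps = [
        {"model": "m", "system_user": 100, "completion": 10, "observation": 20},
        {"model": "m", "system_user": 0, "completion": 10, "observation": None},
    ]
    totals = calculate_routing_tokens(steps)["m"]
    assert totals["uncached_input"] == 100 + 20  # step-0 prompt + step-1 observation
    assert totals["cache_read"] == 110           # step 1 reads step 0's cache (100 + 10)
    assert totals["completion"] == 20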
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Returns list of steps with:
        - model: base model name
        - system_user: tokens for system + user message (step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None for last step)
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    system_user_tokens = 0

    # A for loop (rather than a manual index) guarantees progress even for
    # messages whose role is none of system/user/assistant.
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if isinstance(content, list):
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # Initial user message counts toward the step-0 prompt
                system_user_tokens += tokens
            else:
                # Later user messages are environment observations for the previous step
                steps[-1]["observation"] = tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
            })
            system_user_tokens = 0

    return steps


def get_default_overhead(model_name: str) -> float:
    """Get default tokenizer overhead for a model provider."""
    model_lower = model_name.lower() if model_name else ""
    if "claude" in model_lower or "anthropic" in model_lower:
        return 1.24
    elif "gemini" in model_lower or "google" in model_lower:
        return 1.0
    elif "gpt" in model_lower or "openai" in model_lower or "o1" in model_lower or "o3" in model_lower:
        return 1.0
    else:
        return 1.0


def get_tokenizer(model_name: str):
    """Get appropriate tokenizer for model. Returns (tokenizer_func, name)."""
    global _tokenizer_cache
    model_lower = model_name.lower() if model_name else ""
    if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
        tokenizer_name = "o200k_base"
    elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
        tokenizer_name = "cl100k_base"
    elif "gemini" in model_lower or "google" in model_lower:
        return lambda text: int(len(text) / 3.23), "gemini_approx"
    else:
        tokenizer_name = "cl100k_base"
    if tokenizer_name not in _tokenizer_cache:
        _tokenizer_cache[tokenizer_name] = tiktoken.get_encoding(tokenizer_name)
    enc = _tokenizer_cache[tokenizer_name]
    return lambda text: len(enc.encode(text)), tokenizer_name


def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Apply tokenizer overhead multiplier to all token counts."""
    if df.empty or overhead == 1.0:
        return df
    df = df.copy()
    df["prompt_tokens"] = (df["prompt_tokens"] * overhead).astype(int)
    df["completion_tokens"] = (df["completion_tokens"] * overhead).astype(int)
    df["cache_read_tokens"] = (df["cache_read_tokens"] * overhead).astype(int)
    df["cache_creation_tokens"] = (df["cache_creation_tokens"] * overhead).astype(int)
    df["total_tokens"] = df["prompt_tokens"] + df["completion_tokens"]
    return df


def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Convert all tokens to uncached input + completion (no caching)."""
    if df.empty:
        return df
    df = df.copy()
    df["cache_read_tokens"] = 0
    df["cache_creation_tokens"] = 0
    return df
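
# Quick sanity check of the tokenizer selection above (illustrative only).
# Gemini models get a characters-per-token heuristic; everything else falls
# back to a tiktoken encoding, so exact counts depend on tiktoken data.
def _example_get_tokenizer():
    _, name = get_tokenizer("claude-opus-4")
    assert name == "cl100k_base"
    _, name_gem = get_tokenizer("gemini-1.5-pro")
    assert name_gem == "gemini_approx"  # ~3.23 chars/token, no tiktoken needed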
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Load trajectories with self-calculated token counts using calculate_routing_tokens."""
    global _calculated_tokens_cache
    cache_key = f"calculated_{folder}"
    if cache_key in _calculated_tokens_cache:
        return _calculated_tokens_cache[cache_key]

    trajectory_steps = load_all_trajectory_steps(folder)
    rows = []
    for instance_id, steps in trajectory_steps.items():
        if not steps:
            continue
        try:
            model_totals = calculate_routing_tokens(steps)
            step_model = steps[0].get("model", "") if steps else ""
            totals = model_totals.get(step_model, {})
            cache_read = totals.get("cache_read", 0)
            uncached_input = totals.get("uncached_input", 0)
            completion = totals.get("completion", 0)
            cache_creation = totals.get("cache_creation", 0)
            prompt_tokens = cache_read + uncached_input
            rows.append({
                "instance_id": instance_id,
                "model_name": step_model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion,
                "total_tokens": prompt_tokens + completion,
                "cache_read_tokens": cache_read,
                "cache_creation_tokens": cache_creation,
            })
        except Exception as e:
            print(f"Error calculating tokens for {instance_id}: {e}")

    df = pd.DataFrame(rows)
    _calculated_tokens_cache[cache_key] = df
    return df


def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache
    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder
    traj_files = list(output_dir.glob("*/*.traj.json"))
    if not traj_files:
        traj_files = list(output_dir.glob("*/*.traj"))
    if not traj_files:
        traj_files = list(output_dir.glob("*.traj.json"))
    if not traj_files:
        traj_files = list(output_dir.glob("*.traj"))
    if not traj_files:
        traj_files = list(output_dir.glob("*.json"))

    model_name = ""
    if traj_files:
        try:
            with open(traj_files[0], "r") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception:
            pass

    result = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            print(f"Error parsing steps for {traj_path}: {e}")

    _trajectory_steps_cache[cache_key] = result
    return result


def get_litellm_model_list() -> list[str]:
    """Get list of model names from litellm prices."""
    prices = get_litellm_prices()
    return sorted(prices.keys())


def get_litellm_prices() -> dict:
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache
    if LITELLM_PRICES_CACHE.exists():
        with open(LITELLM_PRICES_CACHE) as f:
            _litellm_prices_cache = json.load(f)
        return _litellm_prices_cache
    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as f:
            json.dump(_litellm_prices_cache, f)
    except Exception:
        _litellm_prices_cache = {}
    return _litellm_prices_cache


def normalize_model_name(name: str) -> str:
    """Normalize model name for comparison: lowercase, remove separators."""
    return re.sub(r'[-_./]', '', name.lower())
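
# Illustrative: normalization makes litellm keys and leaderboard folder hints
# comparable regardless of separator or provider-prefix conventions.
def _example_normalize_model_name():
    assert normalize_model_name("claude-opus-4-5") == "claudeopus45"
    assert normalize_model_name("Claude_Opus.4/5") == "claudeopus45"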
def get_model_prices(model_name: str) -> dict | None:
    if not model_name:
        return None
    prices = get_litellm_prices()
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
    name_without_date = re.sub(r'-\d{8}$', '', clean_name)
    candidates = [
        model_name,
        clean_name,
        name_without_date,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
        f"anthropic/{name_without_date}",
        f"openai/{name_without_date}",
    ]
    for key in candidates:
        if key in prices:
            return prices[key]

    normalized_name = normalize_model_name(clean_name)
    normalized_no_date = normalize_model_name(name_without_date)
    for key, value in prices.items():
        key_normalized = normalize_model_name(key)
        if normalized_name in key_normalized or normalized_no_date in key_normalized:
            return value
        key_last_part = key.split('/')[-1] if '/' in key else key
        key_last_normalized = normalize_model_name(key_last_part)
        if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
            return value
    return None


def load_or_download_leaderboard():
    if LEADERBOARD_CACHE.exists():
        with open(LEADERBOARD_CACHE) as f:
            return json.load(f)
    filename = download_leaderboard(output_dir=str(DATA_DIR))
    os.rename(filename, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)


def get_bash_only_df():
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return pd.DataFrame()
    rows = []
    for r in bash_only["results"]:
        resolved_pct = r.get("resolved", 0)
        if isinstance(resolved_pct, (int, float)):
            resolved_str = f"{resolved_pct:.1f}%"
        else:
            resolved_str = str(resolved_pct)
        rows.append({
            "name": r.get("name", ""),
            "% resolved": resolved_str,
            "date": r.get("date", ""),
            "cost": round(r.get("cost", 0), 2),
            "instance_cost": round(r.get("instance_cost", 0), 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
        })
    return pd.DataFrame(rows)


def get_model_details(folder: str):
    if not folder:
        return None, "Select a model from the table"
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return None, "Leaderboard not found"
    model = next((r for r in bash_only["results"] if r.get("folder") == folder), None)
    if not model:
        return None, f"Model with folder '{folder}' not found"
    return model, None


def check_trajectories_downloaded(folder: str) -> bool:
    if not folder:
        return False
    output_dir = TRAJS_DIR / folder
    return output_dir.exists() and any(output_dir.iterdir())
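
# Illustrative shape of the cached leaderboard JSON, inferred from the
# accessors above (fields beyond these are not assumed):
#
# {
#   "leaderboards": [
#     {
#       "name": "bash-only",
#       "results": [
#         {"name": "...", "resolved": 74.2, "date": "...", "cost": 123.4,
#          "instance_cost": 0.25, "instance_calls": 40,
#          "folder": "20251124_mini-v1.16.0_claude-opus-4-5-20251101",
#          "os_model": false,
#          "per_instance_details": {"<instance_id>": {"resolved": true}}}
#       ]
#     }
#   ]
# }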
model.get("per_instance_details", {}) resolved_count = sum(1 for v in per_instance.values() if v.get("resolved")) total_count = len(per_instance) if total_count > 0: resolved_pct = f"{100*resolved_count/total_count:.1f}%" else: resolved_pct = "N/A" status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})" return status, gr.update(visible=True) except subprocess.TimeoutExpired: return "❌ Download timed out (>10 min)", gr.update(visible=False) except FileNotFoundError: return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False) except Exception as e: return f"❌ Error: {e}", gr.update(visible=False) def parse_trajectory(traj_path: Path) -> dict: with open(traj_path, "r", encoding="utf-8") as f: data = json.load(f) info = data.get("info", {}) model_stats = info.get("model_stats", {}) config = info.get("config", {}) model_config = config.get("model", {}) model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", "")) result = { "instance_id": data.get("instance_id", traj_path.stem), "model_name": model_name, "api_calls": model_stats.get("api_calls", 0), "instance_cost": model_stats.get("instance_cost", 0), "prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, } messages = data.get("messages", []) for msg in messages: usage = None if "usage" in msg: usage = msg["usage"] elif "extra" in msg and isinstance(msg["extra"], dict): response = msg["extra"].get("response", {}) if isinstance(response, dict): usage = response.get("usage", {}) if usage: result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0 result["completion_tokens"] += usage.get("completion_tokens", 0) or 0 result["total_tokens"] += usage.get("total_tokens", 0) or 0 result["cache_read_tokens"] += usage.get("cache_read_input_tokens", 0) or 0 result["cache_creation_tokens"] += usage.get("cache_creation_input_tokens", 0) or 0 return result def load_all_trajectories(folder: str) -> pd.DataFrame: global _trajectories_cache if folder in _trajectories_cache: return _trajectories_cache[folder] output_dir = TRAJS_DIR / folder traj_files = list(output_dir.glob("*/*.traj.json")) if not traj_files: traj_files = list(output_dir.glob("*/*.traj")) if not traj_files: traj_files = list(output_dir.glob("*.traj.json")) if not traj_files: traj_files = list(output_dir.glob("*.traj")) if not traj_files: traj_files = list(output_dir.glob("*.json")) rows = [] for traj_path in traj_files: try: rows.append(parse_trajectory(traj_path)) except Exception as e: print(f"Error parsing {traj_path}: {e}") df = pd.DataFrame(rows) _trajectories_cache[folder] = df return df def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float): """Create Total Cost by Token Type chart (can be called separately for price updates)""" if df.empty: return None total_completion = df["completion_tokens"].sum() total_cache_read = df["cache_read_tokens"].sum() total_cache_creation = df["cache_creation_tokens"].sum() df_temp = df.copy() df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0) total_uncached_input = df_temp["uncached_input"].sum() cost_uncached_input = total_uncached_input * input_price / 1e6 cost_cache_read = total_cache_read * cache_read_price / 1e6 cost_cache_creation = total_cache_creation * cache_creation_price / 
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float,
                              cache_creation_price: float, completion_price: float):
    """Create Total Cost by Token Type chart (can be called separately for price updates)."""
    if df.empty:
        return None

    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    df_temp = df.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"]
                                 - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    cost_uncached_input = total_uncached_input * input_price / 1e6
    cost_cache_read = total_cache_read * cache_read_price / 1e6
    cost_cache_creation = total_cache_creation * cache_creation_price / 1e6
    cost_completion = total_completion * completion_price / 1e6

    cost_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion],
    })
    fig = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95,
        showarrow=False,
        font=dict(size=12),
    )
    return fig


def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float,
                        cache_creation_price: float, completion_price: float):
    """Create only token-related charts (for source switching)."""
    if df.empty:
        return None, None, None

    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    df_temp = df.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"]
                                 - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6,
                             total_cache_creation / 1e6, total_completion / 1e6],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens (M)",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95,
        showarrow=False,
        font=dict(size=12),
    )

    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price,
                                                cache_creation_price, completion_price)

    # Stacked bar chart - sort by total tokens (sum of all stacked)
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"]
                                          - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["total_stacked"] = (df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"]
                                  + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"])
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["uncached_input_tokens"] / 1e6,
        marker_color="#EF553B",
        hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:.2f}M",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_read_tokens"] / 1e6,
        marker_color="#19D3F3",
        hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:.2f}M",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_creation_tokens"] / 1e6,
        marker_color="#FFA15A",
        hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:.2f}M",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["completion_tokens"] / 1e6,
        marker_color="#AB63FA",
        hovertemplate="Trajectory: %{x}<br>Completion: %{y:.2f}M",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Tokens per Trajectory (stacked)",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_tokens, fig_tokens_cost, fig_stacked
Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"] / 1e6, marker_color="#EF553B", hovertemplate="Trajectory: %{x}
Uncached Input: %{y:.3f}M", )) fig_stacked.add_trace(go.Bar( name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"] / 1e6, marker_color="#19D3F3", hovertemplate="Trajectory: %{x}
Cache Read: %{y:.3f}M", )) fig_stacked.add_trace(go.Bar( name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"] / 1e6, marker_color="#FFA15A", hovertemplate="Trajectory: %{x}
Cache Creation: %{y:.3f}M", )) fig_stacked.add_trace(go.Bar( name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"] / 1e6, marker_color="#AB63FA", hovertemplate="Trajectory: %{x}
Completion: %{y:.3f}M", )) fig_stacked.update_layout( barmode="stack", title="Tokens per Trajectory (stacked)", xaxis_title="Trajectory (sorted by total tokens)", yaxis_title="Tokens (M)", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=50, r=20, t=60, b=40), ) return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float): if df.empty: return None # Sort by total tokens (sum of all stacked) df_sorted = df.copy() df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0) df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"] df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True) df_sorted["trajectory_idx"] = range(len(df_sorted)) df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6 df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6 df_sorted["cost_cache_creation"] = df_sorted["cache_creation_tokens"] * cache_creation_price / 1e6 df_sorted["cost_completion"] = df_sorted["completion_tokens"] * completion_price / 1e6 fig = go.Figure() fig.add_trace(go.Bar( name=f"Uncached Input (${input_price:.2f}/1M)", x=df_sorted["trajectory_idx"], y=df_sorted["cost_uncached_input"], marker_color="#EF553B", hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Cache Read (${cache_read_price:.2f}/1M)", x=df_sorted["trajectory_idx"], y=df_sorted["cost_cache_read"], marker_color="#19D3F3", hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Cache Creation (${cache_creation_price:.2f}/1M)", x=df_sorted["trajectory_idx"], y=df_sorted["cost_cache_creation"], marker_color="#FFA15A", hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}", )) fig.add_trace(go.Bar( name=f"Completion (${completion_price:.2f}/1M)", x=df_sorted["trajectory_idx"], y=df_sorted["cost_completion"], marker_color="#AB63FA", hovertemplate="Trajectory: %{x}
Cost: $%{y:.4f}", )) total_cost = ( df_sorted["cost_uncached_input"].sum() + df_sorted["cost_cache_read"].sum() + df_sorted["cost_cache_creation"].sum() + df_sorted["cost_completion"].sum() ) fig.update_layout( barmode="stack", title="Cost per Trajectory", xaxis_title="Trajectory (sorted by total tokens)", yaxis_title="Cost ($)", legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), margin=dict(l=50, r=20, t=60, b=40), ) fig.add_annotation( text=f"Total: ${total_cost:.2f}", xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=14), bgcolor="white", ) return fig def extract_model_from_folder(folder: str) -> str: """Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'""" if not folder: return "" parts = folder.split("_") if len(parts) >= 3: return "_".join(parts[2:]) return folder def get_prices_for_folder(folder: str) -> tuple[dict, str]: """Get prices from litellm based on folder name. Returns (prices_dict, model_name) where prices_dict has 'value' and 'found' for each price type.""" model_hint = extract_model_from_folder(folder) result = { "input": {"value": 0, "found": False}, "cache_read": {"value": 0, "found": False}, "cache_creation": {"value": 0, "found": False}, "completion": {"value": 0, "found": False}, } if not model_hint: return result, "" prices = get_model_prices(model_hint) if prices: # Get values from litellm input_price = prices.get("input_cost_per_token", 0) * 1e6 cache_read = prices.get("cache_read_input_token_cost", 0) * 1e6 cache_creation = prices.get("cache_creation_input_token_cost", 0) * 1e6 completion = prices.get("output_cost_per_token", 0) * 1e6 result["input"] = {"value": input_price, "found": input_price > 0} result["cache_read"] = {"value": cache_read, "found": cache_read > 0} result["cache_creation"] = {"value": cache_creation, "found": cache_creation > 0} result["completion"] = {"value": completion, "found": completion > 0} # Apply fallback estimates based on standard ratios # Cache Read = Input * 0.1 (90% discount) # Cache Creation = Input * 1.25 (25% premium) # Completion = Input * 5 (typical ratio) if input_price > 0: if not result["cache_read"]["found"]: result["cache_read"]["value"] = input_price * 0.1 if not result["cache_creation"]["found"]: result["cache_creation"]["value"] = input_price * 1.25 if not result["completion"]["found"]: result["completion"]["value"] = input_price * 5 elif completion > 0: # If we only have completion, estimate input from it estimated_input = completion / 5 if not result["input"]["found"]: result["input"]["value"] = estimated_input if not result["cache_read"]["found"]: result["cache_read"]["value"] = estimated_input * 0.1 if not result["cache_creation"]["found"]: result["cache_creation"]["value"] = estimated_input * 1.25 return result, model_hint def on_row_select(evt: gr.SelectData, df: pd.DataFrame): if evt.index is None: return ( "", "", gr.update(visible=False), gr.update(value=0, label="Input"), gr.update(value=0, label="Cache Read"), gr.update(value=0, label="Cache Creation"), gr.update(value=0, label="Completion"), "", gr.update(value=1.0), ) row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index row = df.iloc[row_idx] folder = row["folder"] name = row["name"] prices_dict, model_hint = get_prices_for_folder(folder) default_overhead = get_default_overhead(model_hint) def price_update(price_info, name): value = price_info["value"] if price_info["found"]: return gr.update(value=value, label=f"✅ {name}") elif value > 0: 
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    if evt.index is None:
        return (
            "",
            "",
            gr.update(visible=False),
            gr.update(value=0, label="Input"),
            gr.update(value=0, label="Cache Read"),
            gr.update(value=0, label="Cache Creation"),
            gr.update(value=0, label="Completion"),
            "",
            gr.update(value=1.0),
        )

    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]

    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def price_update(price_info, name):
        value = price_info["value"]
        if price_info["found"]:
            return gr.update(value=value, label=f"✅ {name}")
        elif value > 0:
            return gr.update(value=value, label=f"❌ {name} (est.)")
        else:
            return gr.update(value=0, label=f"❌ {name}")

    return (
        folder,
        name,
        gr.update(visible=True),
        price_update(prices_dict["input"], "Input"),
        price_update(prices_dict["cache_read"], "Cache Read"),
        price_update(prices_dict["cache_creation"], "Cache Creation"),
        price_update(prices_dict["completion"], "Completion"),
        model_hint,
        gr.update(value=default_overhead),
    )
".join(annotation_lines), xref="paper", yref="paper", x=0.02, y=0.98, showarrow=False, font=dict(size=11), align="left", bgcolor="rgba(255,255,255,0.8)", bordercolor="gray", borderwidth=1, ) return fig def create_routed_cost_chart(base_costs: dict, additional_models: list): """ Create grouped bar chart for costs by type, comparing base vs additional models. Args: base_costs: dict with uncached_input, cache_read, cache_creation, completion additional_models: list of (model_name, costs_dict) tuples """ import plotly.graph_objects as go categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"] colors = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"] fig = go.Figure() base_total = sum(base_costs.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"]) base_values = [ base_costs.get("uncached_input", 0), base_costs.get("cache_read", 0), base_costs.get("cache_creation", 0), base_costs.get("completion", 0), ] fig.add_trace(go.Bar(name="Base Model", x=categories, y=base_values, marker_color=colors[0])) model_totals = [("Base Model", base_total)] for i, (model_name, costs) in enumerate(additional_models): model_total = sum(costs.get(k, 0) for k in ["uncached_input", "cache_read", "cache_creation", "completion"]) model_totals.append((model_name or f"Model {i+1}", model_total)) values = [ costs.get("uncached_input", 0), costs.get("cache_read", 0), costs.get("cache_creation", 0), costs.get("completion", 0), ] color = colors[(i + 1) % len(colors)] fig.add_trace(go.Bar(name=model_name or f"Model {i+1}", x=categories, y=values, marker_color=color)) grand_total = sum(t for _, t in model_totals) annotation_lines = [f"Total: ${grand_total:.2f}"] for name, total in model_totals: annotation_lines.append(f"{name}: ${total:.2f}") fig.update_layout( title="Cost by Type (per Model) ($)", yaxis_title="Cost ($)", barmode="group", margin=dict(l=40, r=40, t=80, b=40), legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1), ) fig.add_annotation( text="
".join(annotation_lines), xref="paper", yref="paper", x=0.02, y=0.98, showarrow=False, font=dict(size=11), align="left", bgcolor="rgba(255,255,255,0.8)", bordercolor="gray", borderwidth=1, ) return fig def build_app(): leaderboard_df = get_bash_only_df() with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app: trajectories_state = gr.State(None) gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard") gr.Markdown("Select a model to use as base for cost analysis") with gr.Row(): with gr.Column(scale=3): leaderboard_table = gr.Dataframe( value=leaderboard_df, label="Bash-Only Leaderboard", interactive=False, wrap=True, ) with gr.Column(visible=False) as analysis_section: gr.Markdown("## 📊 Trajectory Analysis") with gr.Row(): plot_steps = gr.Plot(label="API Calls Distribution") plot_cost = gr.Plot(label="Cost Distribution") with gr.Row(): plot_tokens = gr.Plot(label="Token Usage by Type") plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)") with gr.Row(): plot_stacked = gr.Plot(label="Tokens per Trajectory") plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)") with gr.Row(visible=False) as routing_plots_row: routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)") routing_cost_plot = gr.Plot(label="Cost by Type (per Model)") with gr.Column(scale=1): selected_folder = gr.State("") gr.Markdown("### Selected Model") selected_name = gr.Textbox(label="Model Name", interactive=False) analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary") download_status = gr.Textbox(label="Status", interactive=False, lines=3) gr.Markdown("---") gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*") detected_model = gr.Textbox(label="Detected Model", interactive=False) with gr.Row(): price_input = gr.Number(label="Input", value=0, precision=2, scale=1) price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1) price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1) price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1) gr.Markdown("---") gr.Markdown("### 📊 Token Count Source") token_source = gr.Radio( choices=["Metadata", "Calculated"], value="Metadata", ) thinking_overhead = gr.Number( label="🔢 Tokenizer Overhead", value=1.21, precision=2, info="Multiplier for Calculated tokens (tiktoken → native)", visible=False, ) use_cache = gr.Checkbox( label="Use Cache", value=True, info="If disabled, all tokens are Uncached Input or Completion", visible=False, ) gr.Markdown("---") add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False) with gr.Column(visible=False) as routing_section: gr.Markdown("### 🔀 Routing Models") with gr.Column(): with gr.Group(): gr.Markdown("#### Route to Model 1") routing_model_1 = gr.Dropdown( label="Model (type 3+ chars to search)", choices=[], allow_custom_value=True, interactive=True, ) with gr.Row(): routing_price_1_input = gr.Number(label="Input", precision=3, scale=1) routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1) add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False) with gr.Column(visible=False) as routing_block_2: with gr.Group(): gr.Markdown("#### Route to Model 2") routing_model_2 = gr.Dropdown( label="Model (type 3+ chars to search)", 
                        with gr.Column(visible=False) as routing_block_2:
                            with gr.Group():
                                gr.Markdown("#### Route to Model 2")
                                routing_model_2 = gr.Dropdown(
                                    label="Model (type 3+ chars to search)",
                                    choices=[],
                                    allow_custom_value=True,
                                    interactive=True,
                                )
                                with gr.Row():
                                    routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
                                    routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
                                    routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
                                    routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)
                            add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)

                        with gr.Column(visible=False) as routing_block_3:
                            with gr.Group():
                                gr.Markdown("#### Route to Model 3")
                                routing_model_3 = gr.Dropdown(
                                    label="Model (type 3+ chars to search)",
                                    choices=[],
                                    allow_custom_value=True,
                                    interactive=True,
                                )
                                with gr.Row():
                                    routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
                                    routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
                                    routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
                                    routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)

                    gr.Markdown("---")
                    gr.Markdown("### 🎯 Router Strategy")
                    selected_strategy = gr.Radio(
                        choices=["Random weights", "Every k-th step", "Replace part of trajectory"],
                        value="Random weights",
                        label="Strategy",
                        interactive=True,
                    )
                    random_hint = gr.Markdown("*Weights must sum to 1.0*", visible=True)
                    weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True, visible=True)
                    weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True, visible=True)
                    weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
                    weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
                    every_k_hint = gr.Markdown("*First model has priority on overlaps*", visible=False)
                    k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True, visible=False)
                    k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False)
                    k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False)
                    part_hint = gr.Markdown("*Ranges must not overlap*", visible=False)
                    part_mode = gr.Radio(
                        choices=["Indexes", "Percentages"],
                        value="Percentages",
                        label="Mode",
                        interactive=True,
                        visible=False,
                    )
                    start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True, visible=False)
                    end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True, visible=False)
                    start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False)
                    end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False)
                    start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False)
                    end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False)

                    gr.Markdown("---")
                    route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
                    routing_result = gr.Markdown(visible=False)

        def toggle_routing_section():
            return gr.update(visible=True)

        add_routing_btn.click(
            fn=toggle_routing_section,
            outputs=[routing_section],
        )

        def on_strategy_change(strategy):
            is_random = strategy == "Random weights"
            is_every_k = strategy == "Every k-th step"
            is_part = strategy == "Replace part of trajectory"
            print(f"DEBUG on_strategy_change: strategy={strategy}")
            return (
                gr.update(visible=is_random),
                gr.update(visible=is_random),
                gr.update(visible=is_random),
                gr.update(visible=is_every_k),
                gr.update(visible=is_every_k),
                gr.update(visible=is_part),
                gr.update(visible=is_part),
                gr.update(visible=is_part),
                gr.update(visible=is_part),
            )

        selected_strategy.change(
            fn=on_strategy_change,
            inputs=[selected_strategy],
            outputs=[
                random_hint, weight_base, weight_model_1,
                every_k_hint, k_model_1,
                part_hint, part_mode, start_1, end_1,
            ],
        )

        def filter_models(query):
            """Filter models based on search query (starts at 3 chars)."""
            if not query or len(query) < 3:
                return gr.update(choices=[])
            all_models = get_litellm_model_list()
            query_lower = query.lower()
            filtered = [m for m in all_models if query_lower in m.lower()][:50]
            return gr.update(choices=filtered)

        routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
        routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
        routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])

        def get_routing_prices_with_labels(model_name):
            """Get all 4 prices for a routing model with found/estimated labels."""
            if not model_name:
                return (
                    gr.update(value=0, label="Input"),
                    gr.update(value=0, label="Cache Read"),
                    gr.update(value=0, label="Cache Creation"),
                    gr.update(value=0, label="Completion"),
                )
            prices = get_litellm_prices()
            model_prices = prices.get(model_name, {})
            input_price = model_prices.get("input_cost_per_token", 0) * 1e6
            cache_read = model_prices.get("cache_read_input_token_cost", 0) * 1e6
            cache_creation = model_prices.get("cache_creation_input_token_cost", 0) * 1e6
            completion = model_prices.get("output_cost_per_token", 0) * 1e6

            input_found = input_price > 0
            cache_read_found = cache_read > 0
            cache_creation_found = cache_creation > 0
            completion_found = completion > 0

            if not cache_read_found and input_price > 0:
                cache_read = input_price * 0.1
            if not cache_creation_found and input_price > 0:
                cache_creation = input_price * 1.25

            def label(name, found):
                return f"✅ {name}" if found else f"❌ {name}"

            return (
                gr.update(value=input_price, label=label("Input", input_found)),
                gr.update(value=cache_read, label=label("Cache Read", cache_read_found)),
                gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)),
                gr.update(value=completion, label=label("Completion", completion_found)),
            )

        def on_routing_model_1_select(model_name):
            prices = get_routing_prices_with_labels(model_name)
            show_btn = bool(model_name)
            return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)

        def on_routing_model_2_select(model_name):
            prices = get_routing_prices_with_labels(model_name)
            show_btn = bool(model_name)
            return *prices, gr.update(visible=show_btn)

        def on_routing_model_3_select(model_name):
            return get_routing_prices_with_labels(model_name)

        routing_model_1.change(
            fn=on_routing_model_1_select,
            inputs=[routing_model_1],
            outputs=[routing_price_1_input, routing_price_1_cache_read,
                     routing_price_1_cache_creation, routing_price_1_completion,
                     add_model_2_btn, route_btn],
        )

        def show_model_2(strategy):
            is_random = strategy == "Random weights"
            is_every_k = strategy == "Every k-th step"
            is_part = strategy == "Replace part of trajectory"
            return (
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=is_random),
                gr.update(visible=is_every_k),
                gr.update(visible=is_part),
                gr.update(visible=is_part),
            )
        add_model_2_btn.click(
            fn=show_model_2,
            inputs=[selected_strategy],
            outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, start_2, end_2],
        )

        routing_model_2.change(
            fn=on_routing_model_2_select,
            inputs=[routing_model_2],
            outputs=[routing_price_2_input, routing_price_2_cache_read,
                     routing_price_2_cache_creation, routing_price_2_completion,
                     add_model_3_btn],
        )

        def show_model_3(strategy):
            is_random = strategy == "Random weights"
            is_every_k = strategy == "Every k-th step"
            is_part = strategy == "Replace part of trajectory"
            return (
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=is_random),
                gr.update(visible=is_every_k),
                gr.update(visible=is_part),
                gr.update(visible=is_part),
            )

        add_model_3_btn.click(
            fn=show_model_3,
            inputs=[selected_strategy],
            outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, start_3, end_3],
        )

        routing_model_3.change(
            fn=on_routing_model_3_select,
            inputs=[routing_model_3],
            outputs=[routing_price_3_input, routing_price_3_cache_read,
                     routing_price_3_cache_creation, routing_price_3_completion],
        )

        def run_routing(
            state_data,
            base_input, base_cache_read, base_cache_creation, base_completion,
            routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
            routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion,
            routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion,
            strategy_val, weight_base_val, weight_1_val, weight_2_val, weight_3_val,
            k_1_val, k_2_val, k_3_val,
            part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
            source, overhead, with_cache,
        ):
            if state_data is None:
                yield (
                    gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."),
                    gr.update(visible=False),
                    None,
                    None,
                )
                return
            if not routing_model_1_val:
                yield (
                    gr.update(visible=True, value="❌ Please select at least one routing model."),
                    gr.update(visible=False),
                    None,
                    None,
                )
                return

            trajectory_steps = state_data.get("steps", {})
            if not trajectory_steps:
                yield (
                    gr.update(visible=True, value="❌ No trajectory steps data available."),
                    gr.update(visible=False),
                    None,
                    None,
                )
                return

            df_calc = state_data.get("calculated")
            if df_calc is not None and not df_calc.empty:
                df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead)
                if not with_cache:
                    df_for_cost = apply_no_cache(df_for_cost)
                df_temp = df_for_cost.copy()
                df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"]
                                             - df_temp["cache_creation_tokens"]).clip(lower=0)
                total_original_cost_from_df = (
                    df_temp["uncached_input"].sum() * base_input / 1e6
                    + df_for_cost["cache_read_tokens"].sum() * base_cache_read / 1e6
                    + df_for_cost["cache_creation_tokens"].sum() * base_cache_creation / 1e6
                    + df_for_cost["completion_tokens"].sum() * base_completion / 1e6
                )
            else:
                total_original_cost_from_df = None

            base_prices = {
                "input": base_input,
                "cache_read": base_cache_read,
                "cache_creation": base_cache_creation,
                "completion": base_completion,
            }
            routing_models = []
            if routing_model_1_val:
                routing_models.append({
                    "name": routing_model_1_val,
                    "prices": {"input": r1_input, "cache_read": r1_cache_read,
                               "cache_creation": r1_cache_creation, "completion": r1_completion},
                })
            if routing_model_2_val:
                routing_models.append({
                    "name": routing_model_2_val,
                    "prices": {"input": r2_input, "cache_read": r2_cache_read,
                               "cache_creation": r2_cache_creation, "completion": r2_completion},
                })
"completion": r3_completion}, }) if strategy_val == "Replace part of trajectory": ranges = [(start_1_val, end_1_val)] if len(routing_models) > 1: ranges.append((start_2_val, end_2_val)) if len(routing_models) > 2: ranges.append((start_3_val, end_3_val)) for i, (s, e) in enumerate(ranges): if s >= e: yield (gr.update(visible=True, value=f"❌ Model {i+1}: Start must be less than End"), gr.update(visible=False), None, None) return for i in range(len(ranges)): for j in range(i+1, len(ranges)): s1, e1 = ranges[i] s2, e2 = ranges[j] if not (e1 <= s2 or e2 <= s1): yield (gr.update(visible=True, value=f"❌ Model {i+1} and Model {j+1} ranges overlap"), gr.update(visible=False), None, None) return weights = None if strategy_val == "Random weights": weights = [weight_base_val, weight_1_val] if len(routing_models) > 1: weights.append(weight_2_val) if len(routing_models) > 2: weights.append(weight_3_val) total_weight = sum(weights) if abs(total_weight - 1.0) > 0.01: yield (gr.update(visible=True, value=f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})"), gr.update(visible=False), None, None) return k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)] part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)] BASE_MODEL = "__base__" model_keys = [BASE_MODEL] + [f"__routing_{i}__" for i in range(len(routing_models))] all_tokens = {key: {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for key in model_keys} total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for instance_id, steps in trajectory_steps.items(): if not steps: continue total_steps = len(steps) step_to_model = {} if strategy_val == "Random weights": model_choices = [BASE_MODEL] + [f"__routing_{j}__" for j in range(len(routing_models))] for i in range(total_steps): step_to_model[i] = random.choices(model_choices, weights=weights)[0] elif strategy_val == "Every k-th step": for j, k_val in enumerate(k_values): if k_val and k_val > 0: for i in range(total_steps): if (i + 1) % int(k_val) == 0: if i not in step_to_model: step_to_model[i] = f"__routing_{j}__" elif strategy_val == "Replace part of trajectory": for j, (start_val, end_val) in enumerate(part_ranges): if part_mode_val == "Percentages": start_idx = int(total_steps * start_val / 100) end_idx = int(total_steps * end_val / 100) else: start_idx = int(start_val) end_idx = min(int(end_val), total_steps) for i in range(start_idx, end_idx): step_to_model[i] = f"__routing_{j}__" modified_steps = [] for i, step in enumerate(steps): model = step_to_model.get(i, BASE_MODEL) modified_steps.append({ "model": model, "system_user": step.get("system_user", 0), "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)), "observation": step.get("observation"), }) model_totals = calculate_routing_tokens(modified_steps) for key in model_keys: totals = model_totals.get(key, {}) all_tokens[key]["cache_read"] += totals.get("cache_read", 0) all_tokens[key]["uncached_input"] += totals.get("uncached_input", 0) all_tokens[key]["completion"] += totals.get("completion", 0) all_tokens[key]["cache_creation"] += totals.get("cache_creation", 0) original_steps = [] for step in steps: original_steps.append({ "model": BASE_MODEL, "system_user": step.get("system_user", 0), "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)), "observation": step.get("observation"), }) original_totals = 
            def calc_cost(tokens: dict, prices: dict) -> float:
                return (
                    tokens["uncached_input"] * prices["input"] / 1e6
                    + tokens["cache_read"] * prices["cache_read"] / 1e6
                    + tokens["cache_creation"] * prices["cache_creation"] / 1e6
                    + tokens["completion"] * prices["completion"] / 1e6
                )

            def tokens_to_costs(tokens: dict, prices: dict) -> dict:
                price_map = {"uncached_input": "input", "cache_read": "cache_read",
                             "cache_creation": "cache_creation", "completion": "completion"}
                return {k: tokens[k] * prices[price_map[k]] / 1e6 for k in tokens}

            total_base_tokens = all_tokens[BASE_MODEL]
            base_costs = tokens_to_costs(total_base_tokens, base_prices)
            total_base_cost = calc_cost(total_base_tokens, base_prices)

            routing_costs_list = []
            total_routing_cost = 0
            for i, rm in enumerate(routing_models):
                key = f"__routing_{i}__"
                tokens = all_tokens[key]
                costs = tokens_to_costs(tokens, rm["prices"])
                cost = calc_cost(tokens, rm["prices"])
                routing_costs_list.append({"name": rm["name"], "tokens": tokens, "costs": costs, "cost": cost})
                total_routing_cost += cost

            if total_original_cost_from_df is not None:
                total_original_cost = total_original_cost_from_df
            else:
                total_original_cost = calc_cost(total_original_tokens, base_prices)

            total_routed_cost = total_base_cost + total_routing_cost
            savings = total_original_cost - total_routed_cost
            savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

            result_lines = [
                "## 🚀 Routing Results",
                "",
                "| Metric | Value |",
                "|--------|-------|",
                f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
                f"| **Routed Cost** | ${total_routed_cost:.2f} |",
                f"| ↳ Base model portion | ${total_base_cost:.2f} |",
            ]
            for rc in routing_costs_list:
                result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
            result_lines.append(f"| **Savings** | ${savings:.2f} ({savings_pct:+.1f}%) |")
            result_text = "\n".join(result_lines)

            additional_token_models = [(rc["name"], rc["tokens"]) for rc in routing_costs_list]
            additional_cost_models = [(rc["name"], rc["costs"]) for rc in routing_costs_list]

            yield (
                gr.update(visible=True, value="⏳ Creating charts..."),
                gr.update(visible=True),
                None,
                None,
            )
            tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
            cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
            yield (
                gr.update(visible=True, value=result_text),
                gr.update(visible=True),
                tokens_chart,
                cost_chart,
            )
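
        # Illustrative cost arithmetic used by calc_cost above (prices are $/1M tokens):
        # 2,000,000 cache-read tokens at $0.30/1M, 100,000 uncached tokens at
        # $3.00/1M, and 50,000 completion tokens at $15.00/1M
        #   -> 2e6*0.30/1e6 + 1e5*3.00/1e6 + 5e4*15.00/1e6 = $0.60 + $0.30 + $0.75 = $1.65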
        route_btn.click(
            fn=run_routing,
            inputs=[
                trajectories_state,
                price_input, price_cache_read, price_cache_creation, price_completion,
                routing_model_1, routing_price_1_input, routing_price_1_cache_read,
                routing_price_1_cache_creation, routing_price_1_completion,
                routing_model_2, routing_price_2_input, routing_price_2_cache_read,
                routing_price_2_cache_creation, routing_price_2_completion,
                routing_model_3, routing_price_3_input, routing_price_3_cache_read,
                routing_price_3_cache_creation, routing_price_3_completion,
                selected_strategy,
                weight_base, weight_model_1, weight_model_2, weight_model_3,
                k_model_1, k_model_2, k_model_3,
                part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                token_source, thinking_overhead, use_cache,
            ],
            outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
        )

        def update_calculated_options_visibility(source):
            is_calc = source == "Calculated"
            return gr.update(visible=is_calc), gr.update(visible=is_calc)

        token_source.change(
            fn=update_calculated_options_visibility,
            inputs=[token_source],
            outputs=[thinking_overhead, use_cache],
        )

        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, analyze_btn,
                     price_input, price_cache_read, price_cache_creation, price_completion,
                     detected_model, thinking_overhead],
        )

        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price,
                             completion_price, source, overhead, with_cache, progress=gr.Progress()):
            empty_result = (
                "",
                gr.update(visible=False),
                None, None, None, None, None, None,
                None,
                gr.update(visible=False),
            )
            if not folder:
                yield empty_result
                return

            if not check_trajectories_downloaded(folder):
                yield (
                    "⏳ Downloading trajectories...",
                    gr.update(visible=False),
                    None, None, None, None, None, None,
                    None,
                    gr.update(visible=False),
                )
                status, _ = download_trajectories_from_s3(folder)
                if "❌" in status:
                    yield (
                        status,
                        gr.update(visible=False),
                        None, None, None, None, None, None,
                        None,
                        gr.update(visible=False),
                    )
                    return

            yield (
                "⏳ Loading trajectories...",
                gr.update(visible=True),
                None, None, None, None, None, None,
                None,
                gr.update(visible=False),
            )

            df_meta = load_all_trajectories(folder)
            df_calc = load_all_trajectories_calculated(folder)
            # NOTE: positional copy — assumes both frames enumerate the same
            # trajectory files in the same glob order
            df_calc["api_calls"] = df_meta["api_calls"].values
            df_calc["instance_cost"] = df_meta["instance_cost"].values
            trajectory_steps = load_all_trajectory_steps(folder)
            state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}

            if source == "Metadata":
                df = df_meta
            else:
                df = apply_thinking_overhead(df_calc.copy(), overhead)
                if not with_cache:
                    df = apply_no_cache(df)

            if df.empty:
                yield (
                    "❌ No trajectories found",
                    gr.update(visible=False),
                    None, None, None, None, None, None,
                    None,
                    gr.update(visible=False),
                )
                return

            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price,
                                                       cache_creation_price, completion_price)

            yield (
                f"✅ Loaded {len(df)} trajectories",
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
                state_data,
                gr.update(visible=True),
            )

        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation,
                    price_completion, token_source, thinking_overhead, use_cache],
            outputs=[
                download_status,
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
                trajectories_state,
                add_routing_btn,
            ],
        )

        def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price,
                              completion_price, source, overhead, with_cache):
            if state_data is None:
                return None, None
            if source == "Metadata":
                df = state_data["meta"]
            else:
                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
                if not with_cache:
                    df = apply_no_cache(df)
            if df.empty:
                return None, None
            fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price,
                                                        cache_creation_price, completion_price)
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price,
                                                       cache_creation_price, completion_price)
            return fig_tokens_cost, fig_cost_breakdown
        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation,
                        price_completion, token_source, thinking_overhead, use_cache]
        price_outputs = [plot_tokens_cost, plot_cost_breakdown]
        price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
        price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
        price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
        price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)

        def on_source_change(state_data, input_price, cache_read_price, cache_creation_price,
                             completion_price, source, overhead, with_cache):
            """Recalculate only token-dependent charts when source changes."""
            if state_data is None:
                return None, None, None, None
            if source == "Metadata":
                df = state_data["meta"]
            else:
                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
                if not with_cache:
                    df = apply_no_cache(df)
            if df.empty:
                return None, None, None, None
            fig_tokens, fig_tokens_cost, fig_stacked = create_token_charts(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price,
                                                       cache_creation_price, completion_price)
            return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown

        source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation,
                                price_completion, token_source, thinking_overhead, use_cache]
        source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
        token_source.change(
            fn=on_source_change,
            inputs=source_change_inputs,
            outputs=source_change_outputs,
        )
        thinking_overhead.change(
            fn=on_source_change,
            inputs=source_change_inputs,
            outputs=source_change_outputs,
        )
        use_cache.change(
            fn=on_source_change,
            inputs=source_change_inputs,
            outputs=source_change_outputs,
        )

    return app


if __name__ == "__main__":
    app = build_app()
    app.queue()
    app.launch()