Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

IgorSlinko committed 10 days ago

Commit

748e866

1 Parent(s): 3855d11

Add single trajectory statistics section (v0.3.36)

- New accordion "One trajectory statistics. Calculated from .traj messages"
- Dropdown to select issue ID
- Tokens per step stacked bar chart
- Cost per step stacked bar chart (uses current prices)
- Charts update when issue is selected or prices change

Files changed (1) hide show

app.py +223 -1

app.py CHANGED Viewed

@@ -132,6 +132,168 @@ def calculate_routing_tokens(steps: list[dict]) -> dict:
     return model_totals
 def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
     """
     Parse trajectory file into step format for calculate_routing_tokens.
@@ -1353,7 +1515,7 @@ def build_app():
         """)
         trajectories_state = gr.State(None)
-        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.32`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
@@ -1396,6 +1558,14 @@ def build_app():
                         with gr.Row():
                             plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
                     with gr.Accordion("Calculated with routing", open=True, visible=False) as routing_plots_row:
                         with gr.Row():
                             routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
@@ -2213,6 +2383,10 @@ def build_app():
                 None, None, None, None,
                 None,
                 gr.update(visible=False),
             )
             if not folder:
@@ -2230,6 +2404,10 @@ def build_app():
                     None, None, None, None,
                     None,
                     gr.update(visible=False),
                 )
                 progress(0.3, desc="Downloading")
                 status, _ = download_trajectories_from_s3(folder)
@@ -2243,6 +2421,10 @@ def build_app():
                         None, None, None, None,
                         None,
                         gr.update(visible=False),
                     )
                     return
             progress(0.45, desc="Loading trajectories")
@@ -2255,6 +2437,10 @@ def build_app():
                 None, None, None, None,
                 None,
                 gr.update(visible=False),
             )
             progress(0.6, desc="Reading metadata")
@@ -2285,6 +2471,10 @@ def build_app():
                     None, None, None, None,
                     None,
                     gr.update(visible=False),
                 )
                 return
@@ -2311,6 +2501,9 @@ def build_app():
                 df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
             )
             progress(1, desc="Done")
             yield (
                 f"✅ Loaded {len(df_meta)} trajectories",
@@ -2320,8 +2513,23 @@ def build_app():
                 fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
                 state_data,
                 gr.update(visible=True),
             )
         analyze_btn.click(
             fn=load_and_analyze,
             inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
@@ -2333,7 +2541,15 @@ def build_app():
                 plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
                 trajectories_state,
                 add_routing_btn,
             ],
         )
         def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
@@ -2391,6 +2607,12 @@ def build_app():
         calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
         calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]
         thinking_overhead.change(
             fn=on_calc_options_change,
             inputs=calc_options_inputs,

     return model_totals
def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Calculate token breakdown per step with a simple prompt-cache simulation.

    Model implemented here: at every step everything cached so far is read
    back (``cache_read``); the only new input is the previous step's
    observation (``system_user`` on the first step); that new input plus the
    step's completion is then written to the cache (``cache_creation``).

    Args:
        steps: Per-step dicts. Keys read are ``system_user`` (first step
            only), ``completion`` and ``observation``. A missing key or a
            ``None`` value counts as 0 tokens.

    Returns:
        One dict per input step, in order:
        [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    result = []
    cache_size = 0
    total_context = 0
    prev_observation = 0
    for i, step in enumerate(steps):
        # `or 0` treats an explicit None the same as a missing key, so the
        # arithmetic below never sees None.
        completion = step.get("completion") or 0
        observation = step.get("observation") or 0

        # Everything written to the cache by earlier steps is read back.
        cache_read = cache_size
        if i == 0:
            uncached_input = step.get("system_user") or 0
        else:
            # Context so far plus the observation produced by the previous
            # step, minus the part already served from the cache.
            full_context_needed = total_context + prev_observation
            uncached_input = full_context_needed - cache_read

        # New input and the generated completion are both added to the cache.
        cache_creation = uncached_input + completion
        cache_size = cache_read + cache_creation
        result.append({
            "step": i,
            "cache_read": cache_read,
            "uncached_input": uncached_input,
            "completion": completion,
            "cache_creation": cache_creation,
        })
        total_context = cache_read + uncached_input + completion
        prev_observation = observation
    return result
def create_single_trajectory_chart(steps: list[dict]):
    """
    Create stacked bar chart for a single trajectory showing tokens per step.

    Args:
        steps: Per-step dicts, passed unchanged to calculate_per_step_tokens.

    Returns:
        A plotly Figure with one stacked bar per step (values in thousands
        of tokens), or None when ``steps`` is empty.
    """
    if not steps:
        return None
    # Imported after the guard so that empty input never needs plotly.
    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    # (legend name, key in the per-step dict, bar colour), in stacking order.
    # NOTE(review): cache_creation as computed by calculate_per_step_tokens
    # already contains uncached_input + completion, so the stacked height
    # counts those tokens twice — confirm this is the intended picture.
    series = (
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    )

    fig = go.Figure()
    for name, key, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] / 1e3 for d in per_step_data],
            marker_color=color,
            # x already reads "Step N"; do not prefix it with "Step " again.
            hovertemplate=f"%{{x}}<br>{name}: %{{y:.2f}}K<extra></extra>",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """
    Create stacked bar chart for a single trajectory showing cost per step.

    Each token count from calculate_per_step_tokens is multiplied by its
    price and divided by 1e6, i.e. prices are per million tokens.

    Args:
        steps: Per-step dicts, passed unchanged to calculate_per_step_tokens.
        input_price: Price applied to uncached input tokens.
        cache_read_price: Price applied to cache-read tokens.
        cache_creation_price: Price applied to cache-creation tokens.
        completion_price: Price applied to completion tokens.

    Returns:
        A plotly Figure with one stacked bar per step (values in dollars),
        or None when ``steps`` is empty.
    """
    if not steps:
        return None
    # Imported after the guard so that empty input never needs plotly.
    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    # (legend name, key in the per-step dict, price, bar colour), in
    # stacking order.
    # NOTE(review): cache_creation as computed by calculate_per_step_tokens
    # already contains uncached_input + completion, so those tokens are
    # charged both at their own price and at cache_creation_price — confirm
    # this matches the intended pricing model.
    series = (
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    )

    fig = go.Figure()
    for name, key, price, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in per_step_data],
            marker_color=color,
            # x already reads "Step N"; do not prefix it with "Step " again.
            hovertemplate=f"%{{x}}<br>{name}: $%{{y:.4f}}<extra></extra>",
        ))
    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    return fig
 def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
     """
     Parse trajectory file into step format for calculate_routing_tokens.
         """)
         trajectories_state = gr.State(None)
+        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.36`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
                         with gr.Row():
                             plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
+                    with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
+                        with gr.Row():
+                            single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
+                        with gr.Row():
+                            single_traj_plot = gr.Plot(label="Tokens per Step (stacked)")
+                        with gr.Row():
+                            single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
                     with gr.Accordion("Calculated with routing", open=True, visible=False) as routing_plots_row:
                         with gr.Row():
                             routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
                 None, None, None, None,
                 None,
                 gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(),
+                gr.update(),
+                gr.update(),
             )
             if not folder:
                     None, None, None, None,
                     None,
                     gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(),
+                    gr.update(),
+                    gr.update(),
                 )
                 progress(0.3, desc="Downloading")
                 status, _ = download_trajectories_from_s3(folder)
                         None, None, None, None,
                         None,
                         gr.update(visible=False),
+                        gr.update(visible=False),
+                        gr.update(),
+                        gr.update(),
+                        gr.update(),
                     )
                     return
             progress(0.45, desc="Loading trajectories")
                 None, None, None, None,
                 None,
                 gr.update(visible=False),
+                gr.update(visible=False),
+                gr.update(),
+                gr.update(),
+                gr.update(),
             )
             progress(0.6, desc="Reading metadata")
                     None, None, None, None,
                     None,
                     gr.update(visible=False),
+                    gr.update(visible=False),
+                    gr.update(),
+                    gr.update(),
+                    gr.update(),
                 )
                 return
                 df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
             )
+            issue_ids = sorted(trajectory_steps.keys())
+            first_issue = issue_ids[0] if issue_ids else None
             progress(1, desc="Done")
             yield (
                 f"✅ Loaded {len(df_meta)} trajectories",
                 fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
                 state_data,
                 gr.update(visible=True),
+                gr.update(visible=True),
+                gr.update(choices=issue_ids, value=first_issue),
+                gr.update(),
+                gr.update(),
             )
+        def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
+            if state_data is None or not issue_id:
+                return None, None
+            trajectory_steps = state_data.get("steps", {})
+            if issue_id not in trajectory_steps:
+                return None, None
+            steps = trajectory_steps[issue_id]
+            tokens_chart = create_single_trajectory_chart(steps)
+            cost_chart = create_single_trajectory_cost_chart(steps, input_price, cache_read_price, cache_creation_price, completion_price)
+            return tokens_chart, cost_chart
         analyze_btn.click(
             fn=load_and_analyze,
             inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
                 plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
                 trajectories_state,
                 add_routing_btn,
+                single_traj_accordion,
+                single_traj_dropdown,
+                single_traj_plot,
+                single_traj_cost_plot,
             ],
+        ).then(
+            fn=on_single_traj_select,
+            inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
+            outputs=[single_traj_plot, single_traj_cost_plot],
         )
         def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
         calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
         calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]
+        single_traj_dropdown.change(
+            fn=on_single_traj_select,
+            inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
+            outputs=[single_traj_plot, single_traj_cost_plot],
+        )
         thinking_overhead.change(
             fn=on_calc_options_change,
             inputs=calc_options_inputs,