IgorSlinko committed on
Commit
748e866
·
1 Parent(s): 3855d11

Add single trajectory statistics section (v0.3.36)

Browse files

- New accordion "One trajectory statistics. Calculated from .traj messages"
- Dropdown to select issue ID
- Tokens per step stacked bar chart
- Cost per step stacked bar chart (uses current prices)
- Charts update when issue is selected or prices change

Files changed (1) hide show
  1. app.py +223 -1
app.py CHANGED
@@ -132,6 +132,168 @@ def calculate_routing_tokens(steps: list[dict]) -> dict:
132
  return model_totals
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
136
  """
137
  Parse trajectory file into step format for calculate_routing_tokens.
@@ -1353,7 +1515,7 @@ def build_app():
1353
  """)
1354
  trajectories_state = gr.State(None)
1355
 
1356
- gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.32`")
1357
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1358
 
1359
  with gr.Row():
@@ -1396,6 +1558,14 @@ def build_app():
1396
  with gr.Row():
1397
  plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
1398
 
 
 
 
 
 
 
 
 
1399
  with gr.Accordion("Calculated with routing", open=True, visible=False) as routing_plots_row:
1400
  with gr.Row():
1401
  routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
@@ -2213,6 +2383,10 @@ def build_app():
2213
  None, None, None, None,
2214
  None,
2215
  gr.update(visible=False),
 
 
 
 
2216
  )
2217
 
2218
  if not folder:
@@ -2230,6 +2404,10 @@ def build_app():
2230
  None, None, None, None,
2231
  None,
2232
  gr.update(visible=False),
 
 
 
 
2233
  )
2234
  progress(0.3, desc="Downloading")
2235
  status, _ = download_trajectories_from_s3(folder)
@@ -2243,6 +2421,10 @@ def build_app():
2243
  None, None, None, None,
2244
  None,
2245
  gr.update(visible=False),
 
 
 
 
2246
  )
2247
  return
2248
  progress(0.45, desc="Loading trajectories")
@@ -2255,6 +2437,10 @@ def build_app():
2255
  None, None, None, None,
2256
  None,
2257
  gr.update(visible=False),
 
 
 
 
2258
  )
2259
 
2260
  progress(0.6, desc="Reading metadata")
@@ -2285,6 +2471,10 @@ def build_app():
2285
  None, None, None, None,
2286
  None,
2287
  gr.update(visible=False),
 
 
 
 
2288
  )
2289
  return
2290
 
@@ -2311,6 +2501,9 @@ def build_app():
2311
  df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
2312
  )
2313
 
 
 
 
2314
  progress(1, desc="Done")
2315
  yield (
2316
  f"✅ Loaded {len(df_meta)} trajectories",
@@ -2320,8 +2513,23 @@ def build_app():
2320
  fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
2321
  state_data,
2322
  gr.update(visible=True),
 
 
 
 
2323
  )
2324
 
 
 
 
 
 
 
 
 
 
 
 
2325
  analyze_btn.click(
2326
  fn=load_and_analyze,
2327
  inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
@@ -2333,7 +2541,15 @@ def build_app():
2333
  plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
2334
  trajectories_state,
2335
  add_routing_btn,
 
 
 
 
2336
  ],
 
 
 
 
2337
  )
2338
 
2339
  def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
@@ -2391,6 +2607,12 @@ def build_app():
2391
  calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
2392
  calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]
2393
 
 
 
 
 
 
 
2394
  thinking_overhead.change(
2395
  fn=on_calc_options_change,
2396
  inputs=calc_options_inputs,
 
132
  return model_totals
133
 
134
 
135
def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Calculate token breakdown per step with proper caching simulation.

    The simulation assumes that everything sent or generated at a step is
    written to the cache and read back in full at the next step, so from the
    second step on the only uncached input is the previous step's observation.

    Args:
        steps: Ordered per-step records. Each dict may carry ``system_user``,
            ``completion`` and ``observation`` token counts; a missing key or
            an explicit ``None`` is treated as 0.

    Returns:
        One dict per step, in order:
        [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    result = []
    cache_size = 0
    total_context = 0
    prev_observation = 0

    for i, step in enumerate(steps):
        # Coalesce None the same way for every field; previously only
        # ``observation`` tolerated an explicit None and the others raised
        # TypeError in the arithmetic below.
        system_user = step.get("system_user") or 0
        completion = step.get("completion") or 0
        observation = step.get("observation") or 0

        # Everything cached so far is re-read at this step.
        cache_read = cache_size

        if i == 0:
            uncached_input = system_user
        else:
            # Context carried over plus the previous tool output, minus the
            # part that is served from the cache.
            full_context_needed = total_context + prev_observation
            uncached_input = full_context_needed - cache_read

        # New input and the generated completion are both written to the cache.
        cache_creation = uncached_input + completion
        cache_size = cache_read + cache_creation

        result.append({
            "step": i,
            "cache_read": cache_read,
            "uncached_input": uncached_input,
            "completion": completion,
            "cache_creation": cache_creation,
        })

        total_context = cache_read + uncached_input + completion
        prev_observation = observation

    return result
175
+
176
+
177
def create_single_trajectory_chart(steps: list[dict]):
    """Create stacked bar chart for a single trajectory showing tokens per step.

    Args:
        steps: Per-step token records in the format accepted by
            ``calculate_per_step_tokens``.

    Returns:
        A plotly ``Figure`` with one stacked bar per step (values in thousands
        of tokens), or ``None`` when ``steps`` is empty.
    """
    if not steps:
        return None

    # Imported after the guard so that empty input never needs plotly.
    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    # (legend name, per-step key, bar colour), listed in stacking order.
    series = (
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    )

    fig = go.Figure()
    for name, key, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] / 1e3 for d in per_step_data],
            marker_color=color,
            # %{x} already reads "Step N"; prefixing it again rendered
            # "Step Step N" in the hover box.
            hovertemplate=f"%{{x}}<br>{name}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
235
+
236
+
237
def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create stacked bar chart for a single trajectory showing cost per step.

    Args:
        steps: Per-step token records in the format accepted by
            ``calculate_per_step_tokens``.
        input_price: Price applied to uncached input tokens.
        cache_read_price: Price applied to cache-read tokens.
        cache_creation_price: Price applied to cache-creation tokens.
        completion_price: Price applied to completion tokens.

    Prices are applied as ``tokens * price / 1e6``, i.e. they are treated as
    per-million-token rates.

    Returns:
        A plotly ``Figure`` with one stacked bar per step (values in dollars),
        or ``None`` when ``steps`` is empty.
    """
    if not steps:
        return None

    # Imported after the guard so that empty input never needs plotly.
    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)
    x_labels = [f"Step {d['step']}" for d in per_step_data]

    # (legend name, per-step key, unit price, bar colour), in stacking order.
    series = (
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    )

    fig = go.Figure()
    for name, key, price, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in per_step_data],
            marker_color=color,
            # %{x} already reads "Step N"; prefixing it again rendered
            # "Step Step N" in the hover box.
            hovertemplate=f"%{{x}}<br>{name}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
295
+
296
+
297
  def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
298
  """
299
  Parse trajectory file into step format for calculate_routing_tokens.
 
1515
  """)
1516
  trajectories_state = gr.State(None)
1517
 
1518
+ gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.36`")
1519
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1520
 
1521
  with gr.Row():
 
1558
  with gr.Row():
1559
  plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")
1560
 
1561
+ with gr.Accordion("One trajectory statistics. Calculated from .traj messages", open=False, visible=False) as single_traj_accordion:
1562
+ with gr.Row():
1563
+ single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
1564
+ with gr.Row():
1565
+ single_traj_plot = gr.Plot(label="Tokens per Step (stacked)")
1566
+ with gr.Row():
1567
+ single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")
1568
+
1569
  with gr.Accordion("Calculated with routing", open=True, visible=False) as routing_plots_row:
1570
  with gr.Row():
1571
  routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
 
2383
  None, None, None, None,
2384
  None,
2385
  gr.update(visible=False),
2386
+ gr.update(visible=False),
2387
+ gr.update(),
2388
+ gr.update(),
2389
+ gr.update(),
2390
  )
2391
 
2392
  if not folder:
 
2404
  None, None, None, None,
2405
  None,
2406
  gr.update(visible=False),
2407
+ gr.update(visible=False),
2408
+ gr.update(),
2409
+ gr.update(),
2410
+ gr.update(),
2411
  )
2412
  progress(0.3, desc="Downloading")
2413
  status, _ = download_trajectories_from_s3(folder)
 
2421
  None, None, None, None,
2422
  None,
2423
  gr.update(visible=False),
2424
+ gr.update(visible=False),
2425
+ gr.update(),
2426
+ gr.update(),
2427
+ gr.update(),
2428
  )
2429
  return
2430
  progress(0.45, desc="Loading trajectories")
 
2437
  None, None, None, None,
2438
  None,
2439
  gr.update(visible=False),
2440
+ gr.update(visible=False),
2441
+ gr.update(),
2442
+ gr.update(),
2443
+ gr.update(),
2444
  )
2445
 
2446
  progress(0.6, desc="Reading metadata")
 
2471
  None, None, None, None,
2472
  None,
2473
  gr.update(visible=False),
2474
+ gr.update(visible=False),
2475
+ gr.update(),
2476
+ gr.update(),
2477
+ gr.update(),
2478
  )
2479
  return
2480
 
 
2501
  df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
2502
  )
2503
 
2504
+ issue_ids = sorted(trajectory_steps.keys())
2505
+ first_issue = issue_ids[0] if issue_ids else None
2506
+
2507
  progress(1, desc="Done")
2508
  yield (
2509
  f"✅ Loaded {len(df_meta)} trajectories",
 
2513
  fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
2514
  state_data,
2515
  gr.update(visible=True),
2516
+ gr.update(visible=True),
2517
+ gr.update(choices=issue_ids, value=first_issue),
2518
+ gr.update(),
2519
+ gr.update(),
2520
  )
2521
 
2522
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Build the per-step token and cost charts for the selected issue.

    Returns a ``(tokens_chart, cost_chart)`` pair. Both entries are ``None``
    when no state is loaded, no issue is selected, or the selected issue has
    no entry under ``state_data["steps"]``.
    """
    no_charts = (None, None)

    if state_data is None or not issue_id:
        return no_charts

    steps_by_issue = state_data.get("steps", {})
    if issue_id not in steps_by_issue:
        return no_charts

    selected_steps = steps_by_issue[issue_id]
    return (
        create_single_trajectory_chart(selected_steps),
        create_single_trajectory_cost_chart(
            selected_steps,
            input_price,
            cache_read_price,
            cache_creation_price,
            completion_price,
        ),
    )
2532
+
2533
  analyze_btn.click(
2534
  fn=load_and_analyze,
2535
  inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
 
2541
  plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
2542
  trajectories_state,
2543
  add_routing_btn,
2544
+ single_traj_accordion,
2545
+ single_traj_dropdown,
2546
+ single_traj_plot,
2547
+ single_traj_cost_plot,
2548
  ],
2549
+ ).then(
2550
+ fn=on_single_traj_select,
2551
+ inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2552
+ outputs=[single_traj_plot, single_traj_cost_plot],
2553
  )
2554
 
2555
  def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
 
2607
  calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
2608
  calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]
2609
 
2610
+ single_traj_dropdown.change(
2611
+ fn=on_single_traj_select,
2612
+ inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
2613
+ outputs=[single_traj_plot, single_traj_cost_plot],
2614
+ )
2615
+
2616
  thinking_overhead.change(
2617
  fn=on_calc_options_change,
2618
  inputs=calc_options_inputs,