Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

IgorSlinko committed 12 days ago

Commit

bb3fde6

1 Parent(s): 843e0a2

Add Grep routing strategy (v0.3.17)

- Add 'Grep' strategy matching words in assistant messages
- Support | (OR) and & (AND) operators (cannot mix)
- Use word boundaries (\b) for whole word matching
- Default: M1='ls|find', M2='cat|echo|printf|tee', M3='python&.py'
- First model has priority on overlaps
- Store assistant message content in step data for grep matching

Files changed (1) hide show

app.py +61 -5

app.py CHANGED Viewed

@@ -188,6 +188,7 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
                 "system_user": system_user_tokens if not steps else 0,
                 "completion": tokens,
                 "observation": None,
             }
             steps.append(step)
             system_user_tokens = 0
@@ -224,6 +225,7 @@ def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) ->
             "system_user": system_user_tokens,
             "completion": completion_tokens,
             "observation": observation_tokens,
         }
         steps.append(step)
@@ -1321,7 +1323,7 @@ def build_app():
         """)
         trajectories_state = gr.State(None)
-        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.16`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
@@ -1489,7 +1491,7 @@ def build_app():
                     gr.Markdown("### 🎯 Router Strategy")
                     selected_strategy = gr.Radio(
-                        choices=["Random router", "Every k-th step", "Python list slices", "Replace part of trajectory"],
                         value="Random router",
                         label="",
                         interactive=True,
@@ -1515,6 +1517,12 @@ def build_app():
                         slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
                         slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)
                     with gr.Column(visible=False) as part_block:
                         part_hint = gr.Markdown("*Ranges must not overlap*")
                         part_mode = gr.Radio(
@@ -1547,6 +1555,7 @@ def build_app():
             show_random = strategy == "Random router"
             show_every_k = strategy == "Every k-th step"
             show_slice = strategy == "Python list slices"
             show_part = strategy == "Replace part of trajectory"
             has_m2 = num_models >= 2
             has_m3 = num_models >= 3
@@ -1554,6 +1563,7 @@ def build_app():
                 gr.update(visible=show_random),       # random_block
                 gr.update(visible=show_every_k),      # every_k_block
                 gr.update(visible=show_slice),        # slice_block
                 gr.update(visible=show_part),         # part_block
                 gr.update(visible=show_random),       # random_hint
                 gr.update(visible=show_random),       # weight_base
@@ -1568,6 +1578,10 @@ def build_app():
                 gr.update(visible=show_slice),        # slice_model_1
                 gr.update(visible=show_slice and has_m2), # slice_model_2
                 gr.update(visible=show_slice and has_m3), # slice_model_3
                 gr.update(visible=show_part),         # part_hint
                 gr.update(visible=show_part),         # part_mode
                 gr.update(visible=show_part),         # start_1
@@ -1582,10 +1596,11 @@ def build_app():
             fn=on_strategy_change,
             inputs=[selected_strategy, num_routing_models],
             outputs=[
-                random_block, every_k_block, slice_block, part_block,
                 random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
                 every_k_hint, k_model_1, k_model_2, k_model_3,
                 slice_hint, slice_model_1, slice_model_2, slice_model_3,
                 part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
             ],
         )
@@ -1678,6 +1693,7 @@ def build_app():
             is_random = strategy == "Random router"
             is_every_k = strategy == "Every k-th step"
             is_slice = strategy == "Python list slices"
             is_part = strategy == "Replace part of trajectory"
             return (
                 gr.update(visible=True),   # show block 2
@@ -1685,6 +1701,7 @@ def build_app():
                 gr.update(visible=is_random),  # weight2
                 gr.update(visible=is_every_k), # k2
                 gr.update(visible=is_slice),   # slice2
                 gr.update(visible=is_part),    # start2
                 gr.update(visible=is_part),    # end2
                 2,
@@ -1693,7 +1710,7 @@ def build_app():
         add_model_2_btn.click(
             fn=show_model_2,
             inputs=[selected_strategy],
-            outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, start_2, end_2, num_routing_models],
         )
         routing_model_2.change(
@@ -1706,6 +1723,7 @@ def build_app():
             is_random = strategy == "Random router"
             is_every_k = strategy == "Every k-th step"
             is_slice = strategy == "Python list slices"
             is_part = strategy == "Replace part of trajectory"
             return (
                 gr.update(visible=True),   # show block 3
@@ -1713,6 +1731,7 @@ def build_app():
                 gr.update(visible=is_random),  # weight3
                 gr.update(visible=is_every_k), # k3
                 gr.update(visible=is_slice),   # slice3
                 gr.update(visible=is_part),    # start3
                 gr.update(visible=is_part),    # end3
                 3,
@@ -1721,7 +1740,7 @@ def build_app():
         add_model_3_btn.click(
             fn=show_model_3,
             inputs=[selected_strategy],
-            outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, start_3, end_3, num_routing_models],
         )
         routing_model_3.change(
@@ -1740,6 +1759,7 @@ def build_app():
             weight_base_val, weight_1_val, weight_2_val, weight_3_val,
             k_1_val, k_2_val, k_3_val,
             slice_1_val, slice_2_val, slice_3_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
             overhead, with_cache
         ):
@@ -1841,8 +1861,35 @@ def build_app():
             k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)]
             slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)]
             part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)]
             def parse_slice(slice_str, length):
                 """Parse Python slice notation like [0::3] and return list of indices"""
                 slice_str = slice_str.strip()
@@ -1899,6 +1946,14 @@ def build_app():
                             except Exception:
                                 pass
                 elif strategy_val == "Replace part of trajectory":
                     for j, (start_val, end_val) in enumerate(part_ranges):
                         if part_mode_val == "Percentages":
@@ -2025,6 +2080,7 @@ def build_app():
                 weight_base, weight_model_1, weight_model_2, weight_model_3,
                 k_model_1, k_model_2, k_model_3,
                 slice_model_1, slice_model_2, slice_model_3,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
             ],

                 "system_user": system_user_tokens if not steps else 0,
                 "completion": tokens,
                 "observation": None,
+                "content": str(content),
             }
             steps.append(step)
             system_user_tokens = 0
             "system_user": system_user_tokens,
             "completion": completion_tokens,
             "observation": observation_tokens,
+            "content": str(response_text) if response_text else "",
         }
         steps.append(step)
         """)
         trajectories_state = gr.State(None)
+        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.17`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
                     gr.Markdown("### 🎯 Router Strategy")
                     selected_strategy = gr.Radio(
+                        choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Replace part of trajectory"],
                         value="Random router",
                         label="",
                         interactive=True,
                         slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
                         slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)
+                    with gr.Column(visible=False) as grep_block:
+                        grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*")
+                        grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True)
+                        grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
+                        grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
                     with gr.Column(visible=False) as part_block:
                         part_hint = gr.Markdown("*Ranges must not overlap*")
                         part_mode = gr.Radio(
             show_random = strategy == "Random router"
             show_every_k = strategy == "Every k-th step"
             show_slice = strategy == "Python list slices"
+            show_grep = strategy == "Grep"
             show_part = strategy == "Replace part of trajectory"
             has_m2 = num_models >= 2
             has_m3 = num_models >= 3
                 gr.update(visible=show_random),       # random_block
                 gr.update(visible=show_every_k),      # every_k_block
                 gr.update(visible=show_slice),        # slice_block
+                gr.update(visible=show_grep),         # grep_block
                 gr.update(visible=show_part),         # part_block
                 gr.update(visible=show_random),       # random_hint
                 gr.update(visible=show_random),       # weight_base
                 gr.update(visible=show_slice),        # slice_model_1
                 gr.update(visible=show_slice and has_m2), # slice_model_2
                 gr.update(visible=show_slice and has_m3), # slice_model_3
+                gr.update(visible=show_grep),         # grep_hint
+                gr.update(visible=show_grep),         # grep_model_1
+                gr.update(visible=show_grep and has_m2), # grep_model_2
+                gr.update(visible=show_grep and has_m3), # grep_model_3
                 gr.update(visible=show_part),         # part_hint
                 gr.update(visible=show_part),         # part_mode
                 gr.update(visible=show_part),         # start_1
             fn=on_strategy_change,
             inputs=[selected_strategy, num_routing_models],
             outputs=[
+                random_block, every_k_block, slice_block, grep_block, part_block,
                 random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
                 every_k_hint, k_model_1, k_model_2, k_model_3,
                 slice_hint, slice_model_1, slice_model_2, slice_model_3,
+                grep_hint, grep_model_1, grep_model_2, grep_model_3,
                 part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
             ],
         )
             is_random = strategy == "Random router"
             is_every_k = strategy == "Every k-th step"
             is_slice = strategy == "Python list slices"
+            is_grep = strategy == "Grep"
             is_part = strategy == "Replace part of trajectory"
             return (
                 gr.update(visible=True),   # show block 2
                 gr.update(visible=is_random),  # weight2
                 gr.update(visible=is_every_k), # k2
                 gr.update(visible=is_slice),   # slice2
+                gr.update(visible=is_grep),    # grep2
                 gr.update(visible=is_part),    # start2
                 gr.update(visible=is_part),    # end2
                 2,
         add_model_2_btn.click(
             fn=show_model_2,
             inputs=[selected_strategy],
+            outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models],
         )
         routing_model_2.change(
             is_random = strategy == "Random router"
             is_every_k = strategy == "Every k-th step"
             is_slice = strategy == "Python list slices"
+            is_grep = strategy == "Grep"
             is_part = strategy == "Replace part of trajectory"
             return (
                 gr.update(visible=True),   # show block 3
                 gr.update(visible=is_random),  # weight3
                 gr.update(visible=is_every_k), # k3
                 gr.update(visible=is_slice),   # slice3
+                gr.update(visible=is_grep),    # grep3
                 gr.update(visible=is_part),    # start3
                 gr.update(visible=is_part),    # end3
                 3,
         add_model_3_btn.click(
             fn=show_model_3,
             inputs=[selected_strategy],
+            outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models],
         )
         routing_model_3.change(
             weight_base_val, weight_1_val, weight_2_val, weight_3_val,
             k_1_val, k_2_val, k_3_val,
             slice_1_val, slice_2_val, slice_3_val,
+            grep_1_val, grep_2_val, grep_3_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
             overhead, with_cache
         ):
             k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)]
             slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)]
+            grep_values = [grep_1_val, grep_2_val, grep_3_val][:len(routing_models)]
             part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)]
+            if strategy_val == "Grep":
+                for i, gv in enumerate(grep_values):
+                    if gv and "|" in gv and "&" in gv:
+                        yield (gr.update(visible=True, value=f"❌ M{i+1} grep: cannot mix | and & operators"), gr.update(visible=False), None, None)
+                        return
+            def grep_matches(text, pattern):
+                """Check if text matches grep pattern (words with | or &)"""
+                if not pattern or not text:
+                    return False
+                pattern = pattern.strip()
+                if "|" in pattern:
+                    words = [w.strip() for w in pattern.split("|") if w.strip()]
+                    for word in words:
+                        if re.search(r'\b' + re.escape(word) + r'\b', text):
+                            return True
+                    return False
+                elif "&" in pattern:
+                    words = [w.strip() for w in pattern.split("&") if w.strip()]
+                    for word in words:
+                        if not re.search(r'\b' + re.escape(word) + r'\b', text):
+                            return False
+                    return True
+                else:
+                    return bool(re.search(r'\b' + re.escape(pattern) + r'\b', text))
             def parse_slice(slice_str, length):
                 """Parse Python slice notation like [0::3] and return list of indices"""
                 slice_str = slice_str.strip()
                             except Exception:
                                 pass
+                elif strategy_val == "Grep":
+                    for i, step in enumerate(steps):
+                        content = step.get("content", "")
+                        for j, grep_val in enumerate(grep_values):
+                            if grep_val and i not in step_to_model:
+                                if grep_matches(content, grep_val):
+                                    step_to_model[i] = f"__routing_{j}__"
                 elif strategy_val == "Replace part of trajectory":
                     for j, (start_val, end_val) in enumerate(part_ranges):
                         if part_mode_val == "Percentages":
                 weight_base, weight_model_1, weight_model_2, weight_model_3,
                 k_model_1, k_model_2, k_model_3,
                 slice_model_1, slice_model_2, slice_model_3,
+                grep_model_1, grep_model_2, grep_model_3,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
             ],