Commit
·
bb3fde6
1
Parent(s):
843e0a2
Add Grep routing strategy (v0.3.17)
Browse files
- Add 'Grep' strategy matching words in assistant messages
- Support | (OR) and & (AND) operators (cannot mix)
- Use word boundaries (\b) for whole word matching
- Default: M1='ls|find', M2='cat|echo|printf|tee', M3='python&.py'
- First model has priority on overlaps
- Store assistant message content in step data for grep matching
app.py
CHANGED
|
@@ -188,6 +188,7 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
|
|
| 188 |
"system_user": system_user_tokens if not steps else 0,
|
| 189 |
"completion": tokens,
|
| 190 |
"observation": None,
|
|
|
|
| 191 |
}
|
| 192 |
steps.append(step)
|
| 193 |
system_user_tokens = 0
|
|
@@ -224,6 +225,7 @@ def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) ->
|
|
| 224 |
"system_user": system_user_tokens,
|
| 225 |
"completion": completion_tokens,
|
| 226 |
"observation": observation_tokens,
|
|
|
|
| 227 |
}
|
| 228 |
steps.append(step)
|
| 229 |
|
|
@@ -1321,7 +1323,7 @@ def build_app():
|
|
| 1321 |
""")
|
| 1322 |
trajectories_state = gr.State(None)
|
| 1323 |
|
| 1324 |
-
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.
|
| 1325 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1326 |
|
| 1327 |
with gr.Row():
|
|
@@ -1489,7 +1491,7 @@ def build_app():
|
|
| 1489 |
gr.Markdown("### 🎯 Router Strategy")
|
| 1490 |
|
| 1491 |
selected_strategy = gr.Radio(
|
| 1492 |
-
choices=["Random router", "Every k-th step", "Python list slices", "Replace part of trajectory"],
|
| 1493 |
value="Random router",
|
| 1494 |
label="",
|
| 1495 |
interactive=True,
|
|
@@ -1515,6 +1517,12 @@ def build_app():
|
|
| 1515 |
slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
|
| 1516 |
slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)
|
| 1517 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1518 |
with gr.Column(visible=False) as part_block:
|
| 1519 |
part_hint = gr.Markdown("*Ranges must not overlap*")
|
| 1520 |
part_mode = gr.Radio(
|
|
@@ -1547,6 +1555,7 @@ def build_app():
|
|
| 1547 |
show_random = strategy == "Random router"
|
| 1548 |
show_every_k = strategy == "Every k-th step"
|
| 1549 |
show_slice = strategy == "Python list slices"
|
|
|
|
| 1550 |
show_part = strategy == "Replace part of trajectory"
|
| 1551 |
has_m2 = num_models >= 2
|
| 1552 |
has_m3 = num_models >= 3
|
|
@@ -1554,6 +1563,7 @@ def build_app():
|
|
| 1554 |
gr.update(visible=show_random), # random_block
|
| 1555 |
gr.update(visible=show_every_k), # every_k_block
|
| 1556 |
gr.update(visible=show_slice), # slice_block
|
|
|
|
| 1557 |
gr.update(visible=show_part), # part_block
|
| 1558 |
gr.update(visible=show_random), # random_hint
|
| 1559 |
gr.update(visible=show_random), # weight_base
|
|
@@ -1568,6 +1578,10 @@ def build_app():
|
|
| 1568 |
gr.update(visible=show_slice), # slice_model_1
|
| 1569 |
gr.update(visible=show_slice and has_m2), # slice_model_2
|
| 1570 |
gr.update(visible=show_slice and has_m3), # slice_model_3
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1571 |
gr.update(visible=show_part), # part_hint
|
| 1572 |
gr.update(visible=show_part), # part_mode
|
| 1573 |
gr.update(visible=show_part), # start_1
|
|
@@ -1582,10 +1596,11 @@ def build_app():
|
|
| 1582 |
fn=on_strategy_change,
|
| 1583 |
inputs=[selected_strategy, num_routing_models],
|
| 1584 |
outputs=[
|
| 1585 |
-
random_block, every_k_block, slice_block, part_block,
|
| 1586 |
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 1587 |
every_k_hint, k_model_1, k_model_2, k_model_3,
|
| 1588 |
slice_hint, slice_model_1, slice_model_2, slice_model_3,
|
|
|
|
| 1589 |
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 1590 |
],
|
| 1591 |
)
|
|
@@ -1678,6 +1693,7 @@ def build_app():
|
|
| 1678 |
is_random = strategy == "Random router"
|
| 1679 |
is_every_k = strategy == "Every k-th step"
|
| 1680 |
is_slice = strategy == "Python list slices"
|
|
|
|
| 1681 |
is_part = strategy == "Replace part of trajectory"
|
| 1682 |
return (
|
| 1683 |
gr.update(visible=True), # show block 2
|
|
@@ -1685,6 +1701,7 @@ def build_app():
|
|
| 1685 |
gr.update(visible=is_random), # weight2
|
| 1686 |
gr.update(visible=is_every_k), # k2
|
| 1687 |
gr.update(visible=is_slice), # slice2
|
|
|
|
| 1688 |
gr.update(visible=is_part), # start2
|
| 1689 |
gr.update(visible=is_part), # end2
|
| 1690 |
2,
|
|
@@ -1693,7 +1710,7 @@ def build_app():
|
|
| 1693 |
add_model_2_btn.click(
|
| 1694 |
fn=show_model_2,
|
| 1695 |
inputs=[selected_strategy],
|
| 1696 |
-
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, start_2, end_2, num_routing_models],
|
| 1697 |
)
|
| 1698 |
|
| 1699 |
routing_model_2.change(
|
|
@@ -1706,6 +1723,7 @@ def build_app():
|
|
| 1706 |
is_random = strategy == "Random router"
|
| 1707 |
is_every_k = strategy == "Every k-th step"
|
| 1708 |
is_slice = strategy == "Python list slices"
|
|
|
|
| 1709 |
is_part = strategy == "Replace part of trajectory"
|
| 1710 |
return (
|
| 1711 |
gr.update(visible=True), # show block 3
|
|
@@ -1713,6 +1731,7 @@ def build_app():
|
|
| 1713 |
gr.update(visible=is_random), # weight3
|
| 1714 |
gr.update(visible=is_every_k), # k3
|
| 1715 |
gr.update(visible=is_slice), # slice3
|
|
|
|
| 1716 |
gr.update(visible=is_part), # start3
|
| 1717 |
gr.update(visible=is_part), # end3
|
| 1718 |
3,
|
|
@@ -1721,7 +1740,7 @@ def build_app():
|
|
| 1721 |
add_model_3_btn.click(
|
| 1722 |
fn=show_model_3,
|
| 1723 |
inputs=[selected_strategy],
|
| 1724 |
-
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, start_3, end_3, num_routing_models],
|
| 1725 |
)
|
| 1726 |
|
| 1727 |
routing_model_3.change(
|
|
@@ -1740,6 +1759,7 @@ def build_app():
|
|
| 1740 |
weight_base_val, weight_1_val, weight_2_val, weight_3_val,
|
| 1741 |
k_1_val, k_2_val, k_3_val,
|
| 1742 |
slice_1_val, slice_2_val, slice_3_val,
|
|
|
|
| 1743 |
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
|
| 1744 |
overhead, with_cache
|
| 1745 |
):
|
|
@@ -1841,8 +1861,35 @@ def build_app():
|
|
| 1841 |
|
| 1842 |
k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)]
|
| 1843 |
slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)]
|
|
|
|
| 1844 |
part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)]
|
| 1845 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1846 |
def parse_slice(slice_str, length):
|
| 1847 |
"""Parse Python slice notation like [0::3] and return list of indices"""
|
| 1848 |
slice_str = slice_str.strip()
|
|
@@ -1899,6 +1946,14 @@ def build_app():
|
|
| 1899 |
except Exception:
|
| 1900 |
pass
|
| 1901 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1902 |
elif strategy_val == "Replace part of trajectory":
|
| 1903 |
for j, (start_val, end_val) in enumerate(part_ranges):
|
| 1904 |
if part_mode_val == "Percentages":
|
|
@@ -2025,6 +2080,7 @@ def build_app():
|
|
| 2025 |
weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 2026 |
k_model_1, k_model_2, k_model_3,
|
| 2027 |
slice_model_1, slice_model_2, slice_model_3,
|
|
|
|
| 2028 |
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 2029 |
thinking_overhead, use_cache,
|
| 2030 |
],
|
|
|
|
| 188 |
"system_user": system_user_tokens if not steps else 0,
|
| 189 |
"completion": tokens,
|
| 190 |
"observation": None,
|
| 191 |
+
"content": str(content),
|
| 192 |
}
|
| 193 |
steps.append(step)
|
| 194 |
system_user_tokens = 0
|
|
|
|
| 225 |
"system_user": system_user_tokens,
|
| 226 |
"completion": completion_tokens,
|
| 227 |
"observation": observation_tokens,
|
| 228 |
+
"content": str(response_text) if response_text else "",
|
| 229 |
}
|
| 230 |
steps.append(step)
|
| 231 |
|
|
|
|
| 1323 |
""")
|
| 1324 |
trajectories_state = gr.State(None)
|
| 1325 |
|
| 1326 |
+
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.17`")
|
| 1327 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1328 |
|
| 1329 |
with gr.Row():
|
|
|
|
| 1491 |
gr.Markdown("### 🎯 Router Strategy")
|
| 1492 |
|
| 1493 |
selected_strategy = gr.Radio(
|
| 1494 |
+
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Replace part of trajectory"],
|
| 1495 |
value="Random router",
|
| 1496 |
label="",
|
| 1497 |
interactive=True,
|
|
|
|
| 1517 |
slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
|
| 1518 |
slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)
|
| 1519 |
|
| 1520 |
+
with gr.Column(visible=False) as grep_block:
|
| 1521 |
+
grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*")
|
| 1522 |
+
grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True)
|
| 1523 |
+
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
|
| 1524 |
+
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
|
| 1525 |
+
|
| 1526 |
with gr.Column(visible=False) as part_block:
|
| 1527 |
part_hint = gr.Markdown("*Ranges must not overlap*")
|
| 1528 |
part_mode = gr.Radio(
|
|
|
|
| 1555 |
show_random = strategy == "Random router"
|
| 1556 |
show_every_k = strategy == "Every k-th step"
|
| 1557 |
show_slice = strategy == "Python list slices"
|
| 1558 |
+
show_grep = strategy == "Grep"
|
| 1559 |
show_part = strategy == "Replace part of trajectory"
|
| 1560 |
has_m2 = num_models >= 2
|
| 1561 |
has_m3 = num_models >= 3
|
|
|
|
| 1563 |
gr.update(visible=show_random), # random_block
|
| 1564 |
gr.update(visible=show_every_k), # every_k_block
|
| 1565 |
gr.update(visible=show_slice), # slice_block
|
| 1566 |
+
gr.update(visible=show_grep), # grep_block
|
| 1567 |
gr.update(visible=show_part), # part_block
|
| 1568 |
gr.update(visible=show_random), # random_hint
|
| 1569 |
gr.update(visible=show_random), # weight_base
|
|
|
|
| 1578 |
gr.update(visible=show_slice), # slice_model_1
|
| 1579 |
gr.update(visible=show_slice and has_m2), # slice_model_2
|
| 1580 |
gr.update(visible=show_slice and has_m3), # slice_model_3
|
| 1581 |
+
gr.update(visible=show_grep), # grep_hint
|
| 1582 |
+
gr.update(visible=show_grep), # grep_model_1
|
| 1583 |
+
gr.update(visible=show_grep and has_m2), # grep_model_2
|
| 1584 |
+
gr.update(visible=show_grep and has_m3), # grep_model_3
|
| 1585 |
gr.update(visible=show_part), # part_hint
|
| 1586 |
gr.update(visible=show_part), # part_mode
|
| 1587 |
gr.update(visible=show_part), # start_1
|
|
|
|
| 1596 |
fn=on_strategy_change,
|
| 1597 |
inputs=[selected_strategy, num_routing_models],
|
| 1598 |
outputs=[
|
| 1599 |
+
random_block, every_k_block, slice_block, grep_block, part_block,
|
| 1600 |
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 1601 |
every_k_hint, k_model_1, k_model_2, k_model_3,
|
| 1602 |
slice_hint, slice_model_1, slice_model_2, slice_model_3,
|
| 1603 |
+
grep_hint, grep_model_1, grep_model_2, grep_model_3,
|
| 1604 |
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 1605 |
],
|
| 1606 |
)
|
|
|
|
| 1693 |
is_random = strategy == "Random router"
|
| 1694 |
is_every_k = strategy == "Every k-th step"
|
| 1695 |
is_slice = strategy == "Python list slices"
|
| 1696 |
+
is_grep = strategy == "Grep"
|
| 1697 |
is_part = strategy == "Replace part of trajectory"
|
| 1698 |
return (
|
| 1699 |
gr.update(visible=True), # show block 2
|
|
|
|
| 1701 |
gr.update(visible=is_random), # weight2
|
| 1702 |
gr.update(visible=is_every_k), # k2
|
| 1703 |
gr.update(visible=is_slice), # slice2
|
| 1704 |
+
gr.update(visible=is_grep), # grep2
|
| 1705 |
gr.update(visible=is_part), # start2
|
| 1706 |
gr.update(visible=is_part), # end2
|
| 1707 |
2,
|
|
|
|
| 1710 |
add_model_2_btn.click(
|
| 1711 |
fn=show_model_2,
|
| 1712 |
inputs=[selected_strategy],
|
| 1713 |
+
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models],
|
| 1714 |
)
|
| 1715 |
|
| 1716 |
routing_model_2.change(
|
|
|
|
| 1723 |
is_random = strategy == "Random router"
|
| 1724 |
is_every_k = strategy == "Every k-th step"
|
| 1725 |
is_slice = strategy == "Python list slices"
|
| 1726 |
+
is_grep = strategy == "Grep"
|
| 1727 |
is_part = strategy == "Replace part of trajectory"
|
| 1728 |
return (
|
| 1729 |
gr.update(visible=True), # show block 3
|
|
|
|
| 1731 |
gr.update(visible=is_random), # weight3
|
| 1732 |
gr.update(visible=is_every_k), # k3
|
| 1733 |
gr.update(visible=is_slice), # slice3
|
| 1734 |
+
gr.update(visible=is_grep), # grep3
|
| 1735 |
gr.update(visible=is_part), # start3
|
| 1736 |
gr.update(visible=is_part), # end3
|
| 1737 |
3,
|
|
|
|
| 1740 |
add_model_3_btn.click(
|
| 1741 |
fn=show_model_3,
|
| 1742 |
inputs=[selected_strategy],
|
| 1743 |
+
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models],
|
| 1744 |
)
|
| 1745 |
|
| 1746 |
routing_model_3.change(
|
|
|
|
| 1759 |
weight_base_val, weight_1_val, weight_2_val, weight_3_val,
|
| 1760 |
k_1_val, k_2_val, k_3_val,
|
| 1761 |
slice_1_val, slice_2_val, slice_3_val,
|
| 1762 |
+
grep_1_val, grep_2_val, grep_3_val,
|
| 1763 |
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
|
| 1764 |
overhead, with_cache
|
| 1765 |
):
|
|
|
|
| 1861 |
|
| 1862 |
k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)]
|
| 1863 |
slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)]
|
| 1864 |
+
grep_values = [grep_1_val, grep_2_val, grep_3_val][:len(routing_models)]
|
| 1865 |
part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)]
|
| 1866 |
|
| 1867 |
+
if strategy_val == "Grep":
|
| 1868 |
+
for i, gv in enumerate(grep_values):
|
| 1869 |
+
if gv and "|" in gv and "&" in gv:
|
| 1870 |
+
yield (gr.update(visible=True, value=f"❌ M{i+1} grep: cannot mix | and & operators"), gr.update(visible=False), None, None)
|
| 1871 |
+
return
|
| 1872 |
+
|
| 1873 |
+
def grep_matches(text, pattern):
    """Return True when *text* satisfies the grep-style *pattern*.

    The pattern is a list of literal terms joined by a single operator:
    ``|`` means any term may match (OR), ``&`` means every term must
    match (AND). A pattern without an operator is treated as one term.
    If both operators appear, ``|`` takes precedence as the separator.

    Terms are matched literally (regex metacharacters are escaped) and as
    whole words: an edge of the term that is a word character must sit on
    a word boundary. Edges that are not word characters (e.g. the dot in
    ``.py`` or the dashes in ``--help``) are left unanchored, because
    ``\\b`` next to a non-word character would instead demand an adjacent
    word character and silently reject inputs such as ``*.py``.

    Args:
        text: Text to search (e.g. an assistant message).
        pattern: Terms joined by ``|`` or ``&``.

    Returns:
        bool: False for empty text, an empty pattern, or a pattern that
        contains no terms once separators and whitespace are removed.
    """
    if not pattern or not text:
        return False
    pattern = pattern.strip()

    if "|" in pattern:
        separator, combine = "|", any
    elif "&" in pattern:
        separator, combine = "&", all
    else:
        separator, combine = None, any

    raw_terms = [pattern] if separator is None else pattern.split(separator)
    terms = [term.strip() for term in raw_terms if term.strip()]
    if not terms:
        # Without this guard all([]) would make a bare "&" match every text.
        return False

    def contains_term(term):
        # Anchor only the edges that are word characters; see docstring.
        prefix = r'\b' if re.match(r'\w', term[0]) else ''
        suffix = r'\b' if re.match(r'\w', term[-1]) else ''
        return re.search(prefix + re.escape(term) + suffix, text) is not None

    return combine(contains_term(term) for term in terms)
|
| 1892 |
+
|
| 1893 |
def parse_slice(slice_str, length):
|
| 1894 |
"""Parse Python slice notation like [0::3] and return list of indices"""
|
| 1895 |
slice_str = slice_str.strip()
|
|
|
|
| 1946 |
except Exception:
|
| 1947 |
pass
|
| 1948 |
|
| 1949 |
+
elif strategy_val == "Grep":
|
| 1950 |
+
for i, step in enumerate(steps):
|
| 1951 |
+
content = step.get("content", "")
|
| 1952 |
+
for j, grep_val in enumerate(grep_values):
|
| 1953 |
+
if grep_val and i not in step_to_model:
|
| 1954 |
+
if grep_matches(content, grep_val):
|
| 1955 |
+
step_to_model[i] = f"__routing_{j}__"
|
| 1956 |
+
|
| 1957 |
elif strategy_val == "Replace part of trajectory":
|
| 1958 |
for j, (start_val, end_val) in enumerate(part_ranges):
|
| 1959 |
if part_mode_val == "Percentages":
|
|
|
|
| 2080 |
weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 2081 |
k_model_1, k_model_2, k_model_3,
|
| 2082 |
slice_model_1, slice_model_2, slice_model_3,
|
| 2083 |
+
grep_model_1, grep_model_2, grep_model_3,
|
| 2084 |
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 2085 |
thinking_overhead, use_cache,
|
| 2086 |
],
|