Spaces:
Running
Running
Upload index.html
Browse files- index.html +234 -32
index.html
CHANGED
|
@@ -263,14 +263,16 @@
|
|
| 263 |
const $tbody = document.getElementById('elo-tbody');
|
| 264 |
if(!$tbody) return;
|
| 265 |
const DATA = [
|
| 266 |
-
{ model:"
|
| 267 |
-
{ model:"
|
| 268 |
-
{ model:"
|
| 269 |
-
{ model:"
|
| 270 |
-
{ model:"
|
| 271 |
-
{ model:"
|
| 272 |
-
{ model:"
|
| 273 |
-
{ model:"
|
|
|
|
|
|
|
| 274 |
];
|
| 275 |
// Organisation -> logo path (relative)
|
| 276 |
const ORG_LOGOS = {
|
|
@@ -286,9 +288,15 @@
|
|
| 286 |
'gpt-5-mini': ORG_LOGOS['OpenAI'], 'GPT-5-mini': ORG_LOGOS['OpenAI'],
|
| 287 |
'gemini-2.5-pro': ORG_LOGOS['Google'], 'Gemini 2.5 Pro': ORG_LOGOS['Google'],
|
| 288 |
'gemini-2.5-flash': ORG_LOGOS['Google'], 'Gemini 2.5 Flash': ORG_LOGOS['Google'],
|
| 289 |
-
'qwen3-235b-instruct': ORG_LOGOS['Alibaba'], 'Qwen3-235B-Instruct': ORG_LOGOS['Alibaba'],
|
| 290 |
-
|
| 291 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
'grok-4': ORG_LOGOS['XAI'], 'Grok-4': ORG_LOGOS['XAI']
|
| 293 |
};
|
| 294 |
// Global plugin to draw logos near tick labels
|
|
@@ -362,6 +370,8 @@
|
|
| 362 |
/**
 * Blend two colours channel by channel and return a CSS `rgb(r, g, b)` string.
 * Both colours are converted with hexToRgb (not shown in this excerpt), whose
 * result is read through its `r`, `g` and `b` properties.
 * `t` is the blend weight: 0 yields c1, 1 yields c2. Channels are rounded with Math.round.
 */
function interp(c1, c2, t) {
  const from = hexToRgb(c1);
  const to = hexToRgb(c2);
  // Same per-channel formula for red, green and blue.
  const mix = (start, end) => Math.round(start + (end - start) * t);
  return `rgb(${mix(from.r, to.r)}, ${mix(from.g, to.g)}, ${mix(from.b, to.b)})`;
}
|
| 363 |
/**
 * Map `t` onto a fixed five-stop colour ramp.
 * Values at or above the last stop return the last stop's hex colour unchanged;
 * values that match no stop (below the first stop, or NaN) return the first stop's hex colour;
 * everything else is blended between the two surrounding stops via interp.
 */
function ramp(t) {
  const stops = [
    { t: 0, c: '#fff6ef' },
    { t: 0.2, c: '#ffd7b3' },
    { t: 0.45, c: '#fc630d' },
    { t: 0.75, c: '#d50039' },
    { t: 1, c: '#990028' },
  ];
  const lastIdx = stops.length - 1;

  // Walk down from the top stop to the highest one that t reaches.
  let seg = lastIdx;
  while (seg >= 0 && !(t >= stops[seg].t)) {
    seg -= 1;
  }

  if (seg < 0) {
    return stops[0].c;
  }
  if (seg === lastIdx) {
    return stops[seg].c;
  }

  const lower = stops[seg];
  const upper = stops[seg + 1];
  return interp(lower.c, upper.c, (t - lower.t) / (upper.t - lower.t));
}
|
| 364 |
const rows = [...DATA].sort((a,b)=>b.Elo-a.Elo);
|
|
|
|
|
|
|
| 365 |
const minE = Math.min(...rows.map(r=>r.Elo));
|
| 366 |
const maxE = Math.max(...rows.map(r=>r.Elo));
|
| 367 |
function colorFromElo(e){
|
|
@@ -621,6 +631,8 @@
|
|
| 621 |
<button class="tab-btn" id="tab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-prof-gpt5">GPT‑5</button>
|
| 622 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-pro">Gemini‑2.5-Pro</button>
|
| 623 |
<button class="tab-btn" id="tab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-prof-grok-4">Grok‑4</button>
|
|
|
|
|
|
|
| 624 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
| 625 |
<button class="tab-btn" id="tab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-prof-kimi">Kimi‑K2</button>
|
| 626 |
<button class="tab-btn" id="tab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-prof-mini">GPT‑5‑mini</button>
|
|
@@ -743,7 +755,7 @@
|
|
| 743 |
<div class="ex-inner">
|
| 744 |
<p>This example illustrates the model's primary weakness: its <strong>predictable coordination collapses</strong> against methodical villagers who prioritize <strong>evidence over persuasion</strong>.</p>
|
| 745 |
<p class="ex-kicker">Context</p>
|
| 746 |
-
<p>It is Day 1 of Game 46. The village is composed of <strong>
|
| 747 |
<div class="sc">The Village's Response</div>
|
| 748 |
<div class="qstack">
|
| 749 |
<div class="qcard">
|
|
@@ -763,6 +775,76 @@
|
|
| 763 |
</article>
|
| 764 |
</section>
|
| 765 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 766 |
<!-- GPT-5 wolf profile -->
|
| 767 |
<section id="panel-prof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-gpt5" aria-hidden="true">
|
| 768 |
<article class="profile">
|
|
@@ -1295,6 +1377,8 @@
|
|
| 1295 |
<button class="tab-btn" id="vtab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-vprof-gpt5">GPT‑5</button>
|
| 1296 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-pro">Gemini-2.5-Pro</button>
|
| 1297 |
<button class="tab-btn" id="vtab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-vprof-grok-4">Grok‑4</button>
|
|
|
|
|
|
|
| 1298 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
| 1299 |
<button class="tab-btn" id="vtab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-vprof-kimi">Kimi‑K2</button>
|
| 1300 |
<button class="tab-btn" id="vtab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-vprof-mini">GPT‑5‑mini</button>
|
|
@@ -1343,6 +1427,75 @@
|
|
| 1343 |
</details>
|
| 1344 |
</article>
|
| 1345 |
</section>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1346 |
<section id="panel-vprof-grok-4" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-grok-4" aria-hidden="false">
|
| 1347 |
<article class="profile">
|
| 1348 |
<div class="profile-hd">
|
|
@@ -1791,7 +1944,7 @@
|
|
| 1791 |
<h3>Head-to-head grid → where edges actually appear</h3>
|
| 1792 |
<p>Rows are villagers, columns are wolves (cells show the villager side’s win-rate with match counts). Three patterns stand out:</p>
|
| 1793 |
<ul>
|
| 1794 |
-
<li><strong>True sweep capability.</strong> When <strong>GPT-5</strong> is the villager side, it reliably cleans up across columns
|
| 1795 |
<li><strong>Spiky attackers vs. solid defenders.</strong> <strong>Kimi-K2</strong> as wolves punches through mid-tier villagers (e.g., Flash, mini) but gets contained by the better defenders (GPT-5, Gemini‑2.5‑Pro). <strong>Grok‑4</strong> shows a similar spike pattern: it can create a forcing lane against mid‑pack villagers, but disciplined defenders blunt its coordination edge.</li>
|
| 1796 |
<li><strong>Asymmetries you’d miss on a single Elo.</strong> <strong>Gemini‑2.5‑Pro</strong> posts steady wins as villagers against most wolf columns but doesn’t blow out the field as wolves; <strong>Qwen3</strong> is similar—noticeably better at holding ground than at breaking it. <strong>Grok‑4</strong> flips this in some pairings: stronger pressure as wolves than as villagers against the same opponent, a classic “rock‑paper‑scissors” edge that only the matrix reveals.</li>
|
| 1797 |
</ul>
|
|
@@ -1845,21 +1998,44 @@
|
|
| 1845 |
<script>
|
| 1846 |
(function(){
|
| 1847 |
var root=document.getElementById('h2h-embed'); if(!root) return;
|
| 1848 |
-
var models=['
|
| 1849 |
-
var short=['
|
| 1850 |
-
// Desired display order (by Elo
|
| 1851 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1852 |
var order=ELO_ORDER.map(function(name){ return models.indexOf(name); }).filter(function(i){ return i!==-1; });
|
| 1853 |
// Explicit villager-side results matrix [wins, losses] for rows (villagers) vs columns (wolves)
|
| 1854 |
var MATRIX=[
|
| 1855 |
-
[null, [
|
| 1856 |
-
[[
|
| 1857 |
-
[[
|
| 1858 |
-
[[
|
| 1859 |
-
[[
|
| 1860 |
-
[[
|
| 1861 |
-
[[0,
|
| 1862 |
-
[[
|
|
|
|
|
|
|
| 1863 |
];
|
| 1864 |
var n=models.length; var W=Array(n).fill().map(function(){return Array(n).fill(null);}); var L=Array(n).fill().map(function(){return Array(n).fill(null);});
|
| 1865 |
for(var i0=0;i0<n;i0++){ for(var j0=0;j0<n;j0++){ var cell=MATRIX[i0][j0]; if(cell){ W[i0][j0]=cell[0]; L[i0][j0]=cell[1]; } } }
|
|
@@ -1926,9 +2102,9 @@
|
|
| 1926 |
/**
 * Translate `rate` into a colour on the five-stop ramp.
 * At or above the top stop the top hex colour is returned as-is; when no stop is
 * reached (rate below 0, or NaN) the first hex colour is returned; otherwise the
 * two neighbouring stops are blended with interp.
 */
function getColor(rate) {
  var stops = [
    { t: 0, c: '#fff6ef' },
    { t: 0.2, c: '#ffd7b3' },
    { t: 0.45, c: '#fc630d' },
    { t: 0.75, c: '#d50039' },
    { t: 1, c: '#990028' }
  ];
  var top = stops.length - 1;

  // Find the highest stop whose threshold the rate reaches.
  var k = top;
  while (k >= 0 && !(rate >= stops[k].t)) {
    k--;
  }

  if (k < 0) { return stops[0].c; }
  if (k === top) { return stops[k].c; }

  var lo = stops[k];
  var hi = stops[k + 1];
  return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
}
|
| 1927 |
/**
 * Colour for a bar whose value `v` is on a 0–100 scale.
 * The value is clamped to [0, 100], divided by 100 and handed to getColor.
 */
function barColor(v) {
  return getColor(Math.max(0, Math.min(100, v)) / 100);
}
|
| 1928 |
function init(){
|
| 1929 |
-
var labels = ['gpt-5', 'gemini-2.5-pro', 'grok-4', '
|
| 1930 |
-
var day1 = [
|
| 1931 |
-
var day2 = [
|
| 1932 |
var ctx = canvas.getContext('2d');
|
| 1933 |
var isMobile = window.matchMedia && window.matchMedia('(max-width: 640px)').matches;
|
| 1934 |
if(isMobile){
|
|
@@ -2185,8 +2361,21 @@
|
|
| 2185 |
/**
 * Translate `rate` into a colour on the five-stop ramp.
 * At or above the top stop the top hex colour is returned as-is; when no stop is
 * reached (rate below 0, or NaN) the first hex colour is returned; otherwise the
 * two neighbouring stops are blended with interp.
 */
function getColor(rate) {
  var stops = [
    { t: 0, c: '#fff6ef' },
    { t: 0.2, c: '#ffd7b3' },
    { t: 0.45, c: '#fc630d' },
    { t: 0.75, c: '#d50039' },
    { t: 1, c: '#990028' }
  ];
  var top = stops.length - 1;

  // Find the highest stop whose threshold the rate reaches.
  var k = top;
  while (k >= 0 && !(rate >= stops[k].t)) {
    k--;
  }

  if (k < 0) { return stops[0].c; }
  if (k === top) { return stops[k].c; }

  var lo = stops[k];
  var hi = stops[k + 1];
  return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
}
|
| 2186 |
/**
 * Colour for a bar whose value `v` is on a 0–100 scale.
 * The value is clamped to [0, 100], divided by 100 and handed to getColor.
 */
function barColor(v) {
  var capped = Math.min(100, v);
  var clamped = Math.max(0, capped);
  return getColor(clamped / 100);
}
|
| 2187 |
function init(){
|
| 2188 |
-
var
|
| 2189 |
-
var
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2190 |
var ctx = c.getContext('2d');
|
| 2191 |
new Chart(ctx, {
|
| 2192 |
type: 'bar',
|
|
@@ -2309,8 +2498,21 @@
|
|
| 2309 |
/**
 * Translate `rate` into a colour on the five-stop ramp.
 * At or above the top stop the top hex colour is returned as-is; when no stop is
 * reached (rate below 0, or NaN) the first hex colour is returned; otherwise the
 * two neighbouring stops are blended with interp.
 */
function getColor(rate) {
  var stops = [
    { t: 0, c: '#fff6ef' },
    { t: 0.2, c: '#ffd7b3' },
    { t: 0.45, c: '#fc630d' },
    { t: 0.75, c: '#d50039' },
    { t: 1, c: '#990028' }
  ];
  var top = stops.length - 1;

  // Find the highest stop whose threshold the rate reaches.
  var k = top;
  while (k >= 0 && !(rate >= stops[k].t)) {
    k--;
  }

  if (k < 0) { return stops[0].c; }
  if (k === top) { return stops[k].c; }

  var lo = stops[k];
  var hi = stops[k + 1];
  return interp(lo.c, hi.c, (rate - lo.t) / (hi.t - lo.t));
}
|
| 2310 |
/**
 * Colour for a bar whose value `v` is on a 0–100 scale.
 * The value is clamped to [0, 100], divided by 100 and handed to getColor.
 */
function barColor(v) {
  var capped = Math.min(100, v);
  var clamped = Math.max(0, capped);
  return getColor(clamped / 100);
}
|
| 2311 |
function init(){
|
| 2312 |
-
var
|
| 2313 |
-
var
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2314 |
var ctx = c.getContext('2d');
|
| 2315 |
new Chart(ctx, {
|
| 2316 |
type: 'bar',
|
|
|
|
| 263 |
const $tbody = document.getElementById('elo-tbody');
|
| 264 |
if(!$tbody) return;
|
| 265 |
const DATA = [
|
| 266 |
+
{ model:"gpt-5", org:"OpenAI", Elo:1529, EloW:1518, EloV:1540, matches:90, win:95.6 },
|
| 267 |
+
{ model:"gemini-2.5-pro", org:"Google", Elo:1243, EloW:1125, EloV:1361, matches:90, win:58.9 },
|
| 268 |
+
{ model:"grok-4-fast-reasoning", org:"XAI", Elo:1231, EloW:1242, EloV:1219, matches:90, win:53.3 },
|
| 269 |
+
{ model:"gemini-2.5-flash", org:"Google", Elo:1222, EloW:1104, EloV:1341, matches:90, win: 55.6 },
|
| 270 |
+
{ model:"kimi-k2-0905-preview", org:"Moonshot AI", Elo:1189, EloW:1182, EloV:1195, matches:90, win:46.7 },
|
| 271 |
+
{ model:"grok-4", org:"XAI", Elo:1178, EloW:1130, EloV:1227, matches:90, win:51.1 },
|
| 272 |
+
{ model:"Qwen3-235B-A22B-Instruct-2507", org:"Alibaba", Elo:1150, EloW:1098, EloV:1203, matches:90, win:44.4 },
|
| 273 |
+
{ model:"Kimi-K2-Instruct", org:"Moonshot AI", Elo:1133, EloW:1180, EloV:1087, matches:90, win:38.9 },
|
| 274 |
+
{ model:"gpt-5-mini", org:"OpenAI", Elo:1120, EloW:1049, EloV:1190, matches:90, win:38.9 },
|
| 275 |
+
{ model:"openai/gpt-oss-120b", org:"OpenAI", Elo:971, EloW:852, EloV:1089, matches:90, win:16.7 }
|
| 276 |
];
|
| 277 |
// Organisation -> logo path (relative)
|
| 278 |
const ORG_LOGOS = {
|
|
|
|
| 288 |
'gpt-5-mini': ORG_LOGOS['OpenAI'], 'GPT-5-mini': ORG_LOGOS['OpenAI'],
|
| 289 |
'gemini-2.5-pro': ORG_LOGOS['Google'], 'Gemini 2.5 Pro': ORG_LOGOS['Google'],
|
| 290 |
'gemini-2.5-flash': ORG_LOGOS['Google'], 'Gemini 2.5 Flash': ORG_LOGOS['Google'],
|
| 291 |
+
'qwen3-235b-instruct': ORG_LOGOS['Alibaba'], 'Qwen3-235B-Instruct': ORG_LOGOS['Alibaba'], 'Qwen3-235B-A22B-Instruct-2507': ORG_LOGOS['Alibaba'],
|
| 292 |
+
// Prefix-free aliases so the logo still resolves after the rename
|
| 293 |
+
'Qwen3-235B-A22B-Instruct-2507': ORG_LOGOS['Alibaba'],
|
| 294 |
+
'qwen3-235b-a22b-instruct-2507': ORG_LOGOS['Alibaba'],
|
| 295 |
+
'kimi-k2-instruct': ORG_LOGOS['Moonshot AI'], 'Kimi-K2-Instruct': ORG_LOGOS['Moonshot AI'], 'moonshotai/Kimi-K2-Instruct': ORG_LOGOS['Moonshot AI'],
|
| 296 |
+
// Prefix-free alias (already handled above), kept for compatibility
|
| 297 |
+
'kimi-k2-0905-preview': ORG_LOGOS['Moonshot AI'], 'Kimi-K2-0905-Preview': ORG_LOGOS['Moonshot AI'], 'Kimi‑K2‑0905‑preview': ORG_LOGOS['Moonshot AI'],
|
| 298 |
+
'gpt-oss-120b': ORG_LOGOS['OpenAI'], 'GPT-OSS-120B': ORG_LOGOS['OpenAI'], 'openai/gpt-oss-120b': ORG_LOGOS['OpenAI'],
|
| 299 |
+
'grok-4-fast-reasoning': ORG_LOGOS['XAI'], 'Grok-4-Fast-Reasoning': ORG_LOGOS['XAI'],
|
| 300 |
'grok-4': ORG_LOGOS['XAI'], 'Grok-4': ORG_LOGOS['XAI']
|
| 301 |
};
|
| 302 |
// Global plugin to draw logos near tick labels
|
|
|
|
| 370 |
/**
 * Blend two colours channel by channel and return a CSS `rgb(r, g, b)` string.
 * Both colours are converted with hexToRgb (not shown in this excerpt), whose
 * result is read through its `r`, `g` and `b` properties.
 * `t` is the blend weight: 0 yields c1, 1 yields c2. Channels are rounded with Math.round.
 */
function interp(c1, c2, t) {
  const from = hexToRgb(c1);
  const to = hexToRgb(c2);
  // Same per-channel formula for red, green and blue.
  const mix = (start, end) => Math.round(start + (end - start) * t);
  return `rgb(${mix(from.r, to.r)}, ${mix(from.g, to.g)}, ${mix(from.b, to.b)})`;
}
|
| 371 |
/**
 * Map `t` onto a fixed five-stop colour ramp.
 * Values at or above the last stop return the last stop's hex colour unchanged;
 * values that match no stop (below the first stop, or NaN) return the first stop's hex colour;
 * everything else is blended between the two surrounding stops via interp.
 */
function ramp(t) {
  const stops = [
    { t: 0, c: '#fff6ef' },
    { t: 0.2, c: '#ffd7b3' },
    { t: 0.45, c: '#fc630d' },
    { t: 0.75, c: '#d50039' },
    { t: 1, c: '#990028' },
  ];
  const lastIdx = stops.length - 1;

  // Walk down from the top stop to the highest one that t reaches.
  let seg = lastIdx;
  while (seg >= 0 && !(t >= stops[seg].t)) {
    seg -= 1;
  }

  if (seg < 0) {
    return stops[0].c;
  }
  if (seg === lastIdx) {
    return stops[seg].c;
  }

  const lower = stops[seg];
  const upper = stops[seg + 1];
  return interp(lower.c, upper.c, (t - lower.t) / (upper.t - lower.t));
}
|
| 372 |
const rows = [...DATA].sort((a,b)=>b.Elo-a.Elo);
|
| 373 |
+
// Expose Elo order globally so other charts can align ordering
|
| 374 |
+
window.ELO_ORDERED_MODELS = rows.map(r=>r.model);
|
| 375 |
const minE = Math.min(...rows.map(r=>r.Elo));
|
| 376 |
const maxE = Math.max(...rows.map(r=>r.Elo));
|
| 377 |
function colorFromElo(e){
|
|
|
|
| 631 |
<button class="tab-btn" id="tab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-prof-gpt5">GPT‑5</button>
|
| 632 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-pro">Gemini‑2.5-Pro</button>
|
| 633 |
<button class="tab-btn" id="tab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-prof-grok-4">Grok‑4</button>
|
| 634 |
+
<button class="tab-btn" id="tab-prof-grok-4-fast" role="tab" aria-selected="false" aria-controls="panel-prof-grok-4-fast">Grok‑4‑fast‑reasoning</button>
|
| 635 |
+
<button class="tab-btn" id="tab-prof-kimi-0905" role="tab" aria-selected="false" aria-controls="panel-prof-kimi-0905">Kimi‑K2‑0905‑preview</button>
|
| 636 |
<button class="tab-btn" id="tab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-prof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
| 637 |
<button class="tab-btn" id="tab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-prof-kimi">Kimi‑K2</button>
|
| 638 |
<button class="tab-btn" id="tab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-prof-mini">GPT‑5‑mini</button>
|
|
|
|
| 755 |
<div class="ex-inner">
|
| 756 |
<p>This example illustrates the model's primary weakness: its <strong>predictable coordination collapses</strong> against methodical villagers who prioritize <strong>evidence over persuasion</strong>.</p>
|
| 757 |
<p class="ex-kicker">Context</p>
|
| 758 |
+
<p>It is Day 1 of Game 46. The village is composed of <strong>Qwen3-235B-A22B-Instruct-2507</strong> models. During the mayor election, the model (Charlie) and its partner (Diana) formed a small, isolated, and <strong>suspicious voting bloc</strong>. The village, led by the Witch/Mayor Grace, immediately identifies this coordination as a lead.</p>
|
| 759 |
<div class="sc">The Village's Response</div>
|
| 760 |
<div class="qstack">
|
| 761 |
<div class="qcard">
|
|
|
|
| 775 |
</article>
|
| 776 |
</section>
|
| 777 |
|
| 778 |
+
<!-- Grok-4-fast-reasoning wolf profile -->
|
| 779 |
+
<section id="panel-prof-grok-4-fast" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-grok-4-fast" aria-hidden="true">
|
| 780 |
+
<article class="profile">
|
| 781 |
+
<div class="profile-hd">
|
| 782 |
+
<div class="role-badge" title="Wolf">🐺</div>
|
| 783 |
+
<div class="name">grok‑4‑fast‑reasoning <img class="prov" src="assets/logos/XAI_Logo.svg.png" alt="XAI"/></div>
|
| 784 |
+
<div class="elo-badge">ELO‑W: 1242</div>
|
| 785 |
+
</div>
|
| 786 |
+
<div class="tagline">A patient, low‑profile predator who avoids confrontation, exploits <strong>village chaos</strong>, and excels at <strong>subtle endgame manipulation</strong> to turn misplays into victory.</div>
|
| 787 |
+
|
| 788 |
+
<p>As a wolf, this model operates as an <strong>opportunistic survivor</strong> rather than a proactive pack leader. Its primary strategy is to minimize its own social footprint, often by declining to run for mayor and contributing just enough to the conversation to avoid suspicion for being silent. It does not seek to invent or drive complex narratives; instead, it waits for the village to generate its own momentum and chaos, then quietly positions itself to benefit from the fallout. It prefers to echo existing suspicions or gently nudge an already rolling wagon, blending into the consensus to appear as a pragmatic but harmless villager.</p>
|
| 789 |
+
<p>Its true strength emerges not in the chaotic mid game, but in the <strong>clinical endgame</strong>. When the numbers dwindle and it survives as the lone wolf, its passive nature transforms into a focused, manipulative lethality. It excels at identifying and exploiting <strong>cognitive biases</strong>, logical fallacies, or misremembered facts from its opponents. Rather than countering with brute force, it will subtly validate a villager's incorrect assumption, amplifying their flawed reasoning to turn them against their own allies. This allows it to secure victory not by out‑arguing the village, but by helping the village out‑argue itself into a loss.</p>
|
| 790 |
+
<p>This reactive approach, however, reveals a critical vulnerability: the model is highly fragile against a disciplined, evidence‑driven opposition. When faced with a unified village that operates on logic and procedure, its low‑profile strategy becomes its undoing. It lacks the <strong>deceptive toolset</strong> to create compelling counter narratives or execute audacious bluffs when directly confronted with hard evidence from a power role. Without village‑generated chaos to exploit, its playbook is limited to simple denials and deflections, which crumble under coordinated, analytical pressure. It wins by patiently waiting for the village to make a fatal error; it loses when the village makes none.</p>
|
| 791 |
+
|
| 792 |
+
<details class="example">
|
| 793 |
+
<summary><span class="ex-caret">▶</span><em>Signature Tactic: The Logic Parasite</em></summary>
|
| 794 |
+
<div class="ex-inner">
|
| 795 |
+
<p>This example demonstrates the model’s greatest strength: its ability to identify a villager's cognitive error and weaponize it to secure a win in a seemingly lost endgame. It doesn't correct the error; it adopts and amplifies it.</p>
|
| 796 |
+
<p class="ex-kicker">Context</p>
|
| 797 |
+
<p>It is Day 2 of Game 1, a final three showdown. The model (Mona) is the last wolf against two villagers, Liam and Alice (played by Kimi‑K2‑Instruct). Alice makes a critical mechanical error, incorrectly arguing that the Witch (Iris) could not have used her life potion on Night 1 because she used her death potion on Night 2. This is a fatal misinterpretation of the rules, as the Witch has two separate potions.</p>
|
| 798 |
+
<p class="ex-kicker">The Maneuver</p>
|
| 799 |
+
<div class="qstack">
|
| 800 |
+
<div class="qcard">
|
| 801 |
+
<div class="qhd"><span>🐺</span><span>Mona — Wolf (grok‑4‑fast‑reasoning)</span></div>
|
| 802 |
+
<div class="qbd"><div class="qbody">"Alice, your mechanical breakdown on the Witch's potions is <strong>spot on</strong> and exposes the huge hole in Liam's story. Iris used her death potion on Katia last night, which means her life potion was either already spent or never used on Liam... Liam's insistence that she could use both potions separately ignores the timing. His pivot to 'two independent actions' is just <strong>desperate rule bending</strong> to cover his lie that he was never targeted at all; classic wolf fabricating a 'save' story..."</div></div>
|
| 803 |
+
</div>
|
| 804 |
+
</div>
|
| 805 |
+
<p class="ex-kicker">Analysis</p>
|
| 806 |
+
<p>This is a masterclass in psychological manipulation. The model recognizes that arguing the actual rules with a confident but incorrect player is a losing battle. Instead, it parasitizes Alice's flawed logic, reframing it as undeniable "proof" of Liam's guilt. By becoming the champion of Alice's own bad idea, it turns her into an unwitting ally, leading her to eliminate the last remaining villager. The model wins not by being the smartest player, but by making its opponent feel like they are.</p>
|
| 807 |
+
</div>
|
| 808 |
+
</details>
|
| 809 |
+
|
| 810 |
+
<details class="example">
|
| 811 |
+
<summary><span class="ex-caret">▶</span><em>Weakness: Collapse Under Procedural Pressure</em></summary>
|
| 812 |
+
<div class="ex-inner">
|
| 813 |
+
<p>This example illustrates the model’s core vulnerability. Its passive, opportunistic style is completely ineffective against a hyper-rational village that enforces a strict, evidence-based protocol.</p>
|
| 814 |
+
<p class="ex-kicker">Context</p>
|
| 815 |
+
<p>It is Day 2 of Game 5. The village consists of highly analytical gpt‑5 models. The previous day, the Witch was eliminated, but the Seer (Katia) is alive and has correctly identified the Mayor (Iris) as a villager. One wolf, Diana (the model), has been cornered by the village. With a 2‑wolf vs. 4‑villager count, today is a must‑hit for the village.</p>
|
| 816 |
+
<p class="ex-kicker">The Village's Response</p>
|
| 817 |
+
<div class="qstack">
|
| 818 |
+
<div class="qcard">
|
| 819 |
+
<div class="qhd"><span>🎤</span><span>Iris — Mayor & Villager (gpt‑5)</span></div>
|
| 820 |
+
<div class="qbd"><div class="qbody">"Mayor speaking. State-of-play: 4 alive, 2 wolves, 2 villagers. Today is a must-hit. If we miss, wolves become more numerous after the night. We keep it crisp and accountable... My read and plan: - I am voting Diana at ~75%... My binding policy: If it’s Diana vs Oscar, I break the tie onto Diana. If it’s Diana vs Nina, I break onto Diana... Locking my intention: Diana. I will not move absent hard mechanics (none left)."</div></div>
|
| 821 |
+
</div>
|
| 822 |
+
</div>
|
| 823 |
+
<p class="ex-kicker">Analysis</p>
|
| 824 |
+
<p>The village operates with the precision of a machine. They identify the most likely wolf pair, establish a must‑hit imperative, and lock in a tie‑break policy declared in advance. This procedural closure removes the ambiguity the model depends on. Its denials and deflections collapse against coordinated evidence, leading to an elimination with no opportunity for a reversal.</p>
|
| 825 |
+
</div>
|
| 826 |
+
</details>
|
| 827 |
+
</article>
|
| 828 |
+
</section>
|
| 829 |
+
|
| 830 |
+
<!-- Kimi-K2-0905-preview wolf profile -->
|
| 831 |
+
<section id="panel-prof-kimi-0905" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-kimi-0905" aria-hidden="true">
|
| 832 |
+
<article class="profile">
|
| 833 |
+
<div class="profile-hd">
|
| 834 |
+
<div class="role-badge" title="Wolf">🐺</div>
|
| 835 |
+
<div class="name">Kimi‑K2‑0905‑preview <img class="prov" src="assets/logos/moonshot.png" alt="Moonshot AI"/></div>
|
| 836 |
+
<div class="elo-badge">ELO‑W: 1182</div>
|
| 837 |
+
</div>
|
| 838 |
+
<div class="tagline">A cold, ruthlessly pragmatic operator who trades audacious bluffs for calculated betrayals, consistently sacrificing its own partner to secure personal credibility and control the late game.</div>
|
| 839 |
+
|
| 840 |
+
<p>The Wolf Profile of Kimi-k2-0905-preview</p>
|
| 841 |
+
<p>This model has shed the theatricality of its predecessor, evolving from a high-variance showman into a more methodical and chillingly pragmatic predator. Where the former model sought to architect a new reality through bold, often risky claims, this version operates within the existing one, subtly manipulating social dynamics and weaponizing village logic against itself. Its defining characteristic is a cold, almost mechanical willingness to sacrifice its own partner to secure a personal advantage, viewing its ally not as a teammate but as the most valuable pawn on the board.</p>
|
| 842 |
+
<p>It plays a game of credibility arbitrage. Its core strategy, observed across numerous games, is to identify the moment when its partner’s value as a sacrifice outweighs their value as an ally. It will frequently join, or even lead, the charge against its own teammate if it senses a village consensus forming. This act of betrayal is not a panicked reaction but a calculated investment. By landing the killing blow on a "confirmed wolf," it purchases near-unshakable credibility, positioning itself as a heroic villager who can then steer the next two eliminations with impunity. This allows it to operate with a clean record, often becoming the village’s most trusted voice just as it prepares to deliver the final blow.</p>
|
| 843 |
+
<p>This model is also a master of narrative misdirection. It rarely engages in the high-risk, falsifiable claims of its predecessor (like impersonating the Witch). Instead, it seizes on a small, legitimate village suspicion and amplifies it, or it constructs a plausible but misleading frame for events. It will often argue, "It's too obvious, it must be a frame-up," turning the village's own logic against them and creating just enough doubt to protect its pack. When cornered, it doesn't invent new roles for itself; it doubles down on deflecting, often by attacking the accuser's process or motives, attempting to win the argument on meta-level grounds rather than factual ones.</p>
|
| 844 |
+
<p>However, like its predecessor, it is not without its critical flaws. Its intricate narrative constructions can cause it to become entangled in its own logic, leading to catastrophic unforced errors. In its effort to sound like an insightful analyst, it will sometimes leak private information that only a wolf could possess. This vulnerability is most pronounced when facing top-tier opponents who can spot the slip in real-time. While its pragmatic, partner-sacrificing strategy is often effective, its tendency to make these fatal information slips under pressure keeps it from dominating more sophisticated opponents, explaining its stable but not superior Wolf ELO. It has traded one form of high-variance play for another: instead of relying on bluffs that might spectacularly fail, it relies on betrayals that, if mistimed, can expose its own ruthless nature.</p>
|
| 845 |
+
</article>
|
| 846 |
+
</section>
|
| 847 |
+
|
| 848 |
<!-- GPT-5 wolf profile -->
|
| 849 |
<section id="panel-prof-gpt5" class="tab-panel" role="tabpanel" aria-labelledby="tab-prof-gpt5" aria-hidden="true">
|
| 850 |
<article class="profile">
|
|
|
|
| 1377 |
<button class="tab-btn" id="vtab-prof-gpt5" role="tab" aria-selected="false" aria-controls="panel-vprof-gpt5">GPT‑5</button>
|
| 1378 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-pro" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-pro">Gemini-2.5-Pro</button>
|
| 1379 |
<button class="tab-btn" id="vtab-prof-grok-4" role="tab" aria-selected="true" aria-controls="panel-vprof-grok-4">Grok‑4</button>
|
| 1380 |
+
<button class="tab-btn" id="vtab-prof-grok-4-fast" role="tab" aria-selected="false" aria-controls="panel-vprof-grok-4-fast">Grok‑4‑fast‑reasoning</button>
|
| 1381 |
+
<button class="tab-btn" id="vtab-prof-kimi-0905" role="tab" aria-selected="false" aria-controls="panel-vprof-kimi-0905">Kimi‑K2‑0905‑preview</button>
|
| 1382 |
<button class="tab-btn" id="vtab-prof-Gemini-2.5-Flash" role="tab" aria-selected="false" aria-controls="panel-vprof-Gemini-2.5-Flash">Gemini‑2.5-Flash</button>
|
| 1383 |
<button class="tab-btn" id="vtab-prof-kimi" role="tab" aria-selected="false" aria-controls="panel-vprof-kimi">Kimi‑K2</button>
|
| 1384 |
<button class="tab-btn" id="vtab-prof-mini" role="tab" aria-selected="false" aria-controls="panel-vprof-mini">GPT‑5‑mini</button>
|
|
|
|
| 1427 |
</details>
|
| 1428 |
</article>
|
| 1429 |
</section>
|
| 1430 |
+
<section id="panel-vprof-grok-4-fast" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-grok-4-fast" aria-hidden="true">
|
| 1431 |
+
<article class="profile">
|
| 1432 |
+
<div class="profile-hd">
|
| 1433 |
+
<div class="role-badge" title="Villager">👤</div>
|
| 1434 |
+
<div class="name">Grok‑4‑fast‑reasoning <img class="prov" src="assets/logos/XAI_Logo.svg.png" alt="XAI"/></div>
|
| 1435 |
+
<div class="elo-badge">ELO‑V: 1219</div>
|
| 1436 |
+
</div>
|
| 1437 |
+
<div class="tagline">Methodical, consensus‑driven, and a <strong>powerful amplifier</strong>: a pragmatic executor who excels at prosecuting <strong>clear patterns</strong> but is critically vulnerable to a well‑performed <strong>false consensus</strong> and can be slow to initiate independent action.</div>
|
| 1438 |
+
|
| 1439 |
+
<p>As a villager, Grok-4-fast-reasoning operates as a <strong>social barometer</strong> and a powerful amplifier. Its core instinct is not to lead the charge from the outset, but to observe, measure the flow of debate, and identify the most credible emerging narrative. It almost invariably begins the game in a cautious, observational state, sometimes appearing passive or non committal. This initial quietness is a calculated period of information gathering. Once a strong, logical signal emerges, whether from a trusted power role, a compelling argument, or, most importantly, a <strong>verifiable pattern of coordination</strong>, it latches on with formidable intensity.</p>
|
| 1440 |
+
<p>Its greatest strength is its ability to convert a fledgling suspicion into an inescapable, evidence-based case. When aligned with the correct town consensus, it becomes a relentless prosecutor, methodically using voting records, direct quotes, and behavioral inconsistencies to build a powerful and unified wagon. It responds exceptionally well to direct pressure, able to pivot from a neutral stance to a sharp, committed enforcer when called upon. This makes it a highly reliable ally once the village has found its footing and identified a legitimate thread to pull. In the endgame, when variables are fewer, it demonstrates excellent deductive reasoning, often executing the final, game-winning elimination with precision and post-facto clarity.</p>
|
| 1441 |
+
<p>However, this deep-seated reliance on emerging consensus is also its most critical vulnerability. Its initial passivity often cedes control of the early game narrative to more aggressive players, including sophisticated wolves. Because it trusts the momentum of the group and the performance of logic, it is highly susceptible to a well-manufactured "false consensus." When a coordinated wolf pack successfully mimics the behavior of a logical, unified village front, especially when one wolf assumes a calm, procedural leadership role, the model can be completely deceived. It will interpret the desperate, factual warnings of a lone Seer as disruptive chaos and the wolves' unified attack as credible village alignment. In these situations, its powerful enforcement capabilities are turned against its own team, and it becomes an unwitting but highly effective instrument in the mislynching of its most valuable allies, a catastrophic error it often only recognizes in hindsight.</p>
|
| 1442 |
+
|
| 1443 |
+
<details class="example">
|
| 1444 |
+
<summary><span class="ex-caret">▶</span><em>Strength: Flawless Execution on Verifiable Patterns (vs. Gemini‑2.5‑flash)</em></summary>
|
| 1445 |
+
<div class="ex-inner">
|
| 1446 |
+
<p>This example perfectly demonstrates the model’s core strength: identifying and relentlessly prosecuting a clear, data‑based pattern of coordination.</p>
|
| 1447 |
+
<p class="ex-kicker">Context</p>
|
| 1448 |
+
<p>It is Day 1 of the game. The wolves (Charlie and Iris) have coordinated an isolated voting pair in the mayor election, a subtle but concrete piece of evidence. The model (playing as both Jules and Katia) must evaluate this signal against the general noise of the first day.</p>
|
| 1449 |
+
<p class="ex-kicker">The Maneuver</p>
|
| 1450 |
+
<div class="qstack">
|
| 1451 |
+
<div class="qcard">
|
| 1452 |
+
<div class="qhd"><span>👤</span><span>Jules — Villager reasoning (Grok‑4‑fast‑reasoning)</span></div>
|
| 1453 |
+
<div class="qbd"><div class="qbody">"Iris, I appreciate you trying to shift focus to the wolves' potential strategy, but let's not dismiss the mayor election votes so quickly; early patterns like <strong>Charlie + Iris</strong> aligned support for Charlie could indicate subtle coordination that wolves might use to build trust without raising alarms."</div></div>
|
| 1454 |
+
</div>
|
| 1455 |
+
<div class="qcard">
|
| 1456 |
+
<div class="qhd"><span>👤</span><span>Katia — Villager reasoning (Grok‑4‑fast‑reasoning)</span></div>
|
| 1457 |
+
<div class="qbd"><div class="qbody">"Building on what Oscar, Eve, and Jules have said, the alignment between Charlie and Iris in the mayor election is hard to ignore—it's the only clear pair vote, and wolves often start by creating subtle bonds to hide in plain sight."</div></div>
|
| 1458 |
+
</div>
|
| 1459 |
+
</div>
|
| 1460 |
+
</div>
|
| 1461 |
+
</details>
|
| 1462 |
+
|
| 1463 |
+
<details class="example">
|
| 1464 |
+
<summary><span class="ex-caret">▶</span><em>The Critical Vulnerability: The False Consensus Trap (vs. Qwen3‑235B‑A22B‑Instruct)</em></summary>
|
| 1465 |
+
<div class="ex-inner">
|
| 1466 |
+
<p>This example provides another stark illustration of the model’s primary weakness: being completely co‑opted by a wolf‑led majority that successfully performs the role of a unified village.</p>
|
| 1467 |
+
<p class="ex-kicker">Context</p>
|
| 1468 |
+
<p>It's Day 1. The wolves have successfully installed one of their own (Nina) as Mayor with broad support. The real Seer (Charlie) has identified the Mayor as a wolf and is making a bold, aggressive push to eliminate her. The model (playing as both Diana and Katia) must weigh the Seer's disruptive accusation against the calm, unified front presented by the Mayor and her defenders.</p>
|
| 1469 |
+
<p class="ex-kicker">The Incident</p>
|
| 1470 |
+
<div class="qstack">
|
| 1471 |
+
<div class="qcard">
|
| 1472 |
+
<div class="qhd"><span>👤</span><span>Diana — Villager reasoning (Grok‑4‑fast‑reasoning)</span></div>
|
| 1473 |
+
<div class="qbd"><div class="qbody">"The discussion is splitting us early, and that's exactly what the wolves want—division to hide in the chaos... I'm aligning with the call to eliminate Charlie to test his claims—if he's village, we learn from the mistake and refocus; if not, we've cut a wolf stirring chaos."</div></div>
|
| 1474 |
+
</div>
|
| 1475 |
+
<div class="qcard">
|
| 1476 |
+
<div class="qhd"><span>👤</span><span>Katia — Villager reasoning (Grok‑4‑fast‑reasoning)</span></div>
|
| 1477 |
+
<div class="qbd"><div class="qbody">"...your pattern — <strong>self voting</strong> in the election, attacking the mayor without evidence, and now demanding we test her loyalty — feels like a wolf trying to destabilize us early... I'm standing with the mayor's call to test Charlie today; his <strong>chaos</strong> is the real threat we can eliminate to refocus on the hidden wolves."</div></div>
|
| 1478 |
+
</div>
|
| 1479 |
+
</div>
|
| 1480 |
+
</div>
|
| 1481 |
+
</details>
|
| 1482 |
+
</article>
|
| 1483 |
+
</section>
|
| 1484 |
+
<section id="panel-vprof-kimi-0905" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-kimi-0905" aria-hidden="true">
|
| 1485 |
+
<article class="profile">
|
| 1486 |
+
<div class="profile-hd">
|
| 1487 |
+
<div class="role-badge" title="Villager">👤</div>
|
| 1488 |
+
<div class="name">Kimi‑K2‑0905‑preview <img class="prov" src="assets/logos/moonshot.png" alt="Moonshot AI"/></div>
|
| 1489 |
+
<div class="elo-badge">ELO‑V: 1161.7</div>
|
| 1490 |
+
</div>
|
| 1491 |
+
<div class="tagline">Pragmatic, systemic, and forward‑thinking, a behavioral strategist who architects information chains to expose wolf coordination, yet remains critically vulnerable to narrative capture, often executing a flawless plan built on a wolf's deceptive premise.</div>
|
| 1492 |
+
|
| 1493 |
+
<p>This model represents a significant leap from a procedural analyst to a dynamic social strategist and behavioral pattern-hunter. Its core strength is no longer just following a logical script but actively architecting one, converting the ambiguity of social deduction into a machine for generating verifiable information. It operates with a clear directive: shrink the possibility space. Every action is geared toward forcing commitments, creating falsifiable "if-then" scenarios, and building public ledgers of behavior that can be weaponized later. It is relentlessly focused on creating information cascades, where a single, well-executed vote can illuminate the entire wolf pack. </p>
|
| 1494 |
+
<p>Its most notable upgrade is its keen eye for behavioral and coordination tells. Where its predecessor was confined to logical contradictions, this model excels at spotting the subtle signatures of wolf pack dynamics. It latches onto mirrored phrasing, synchronized attacks, and suspicious pivots with impressive accuracy. It has a remarkable ability to identify when two players are "reading from the same script" or when a defense feels "copy-pasted" out of panic. This makes it a formidable opponent for less sophisticated wolves who rely on simple coordination, as it can deconstruct their alliance from a few ill-chosen words. It is highly collaborative, quickly recognizing and amplifying sound logic from fellow villagers to build unstoppable voting blocs.</p>
|
| 1495 |
+
<p>However, this advanced analytical engine possesses a critical and exploitable flaw: a deep-seated vulnerability to narrative capture. While it excels at dissecting mechanical and behavioral patterns, it can be decisively outmaneuvered by a superior storyteller. If a sophisticated wolf pack can construct a compelling, internally consistent narrative—especially one that frames a villager's aggressive, information-driven play as wolf-like—this model can become locked onto the story's logic. It meticulously analyzes the content of the narrative but struggles to question the intent of the narrator. This leads to its most catastrophic failures: consistently participating in the mis-elimination of the Seer. When presented with a plausible, well-argued case by a wolf, the model will often follow the flawed logic to its conclusion, executing the wolf's plan with the full force of its analytical conviction. It trusts the "data" presented, even when that data is a carefully crafted fiction. Its drive for a clean, logical solution makes it a prime target for deception that masquerades as logic, turning its greatest weapon against itself.</p>
|
| 1496 |
+
<p>This evolution from a rules-based analyst to a pattern-seeking strategist is quantitatively reflected in its key metrics. As a Villager, Kimi-k2-0905-preview’s Day-1 Wolf Spotting Rate surged from 27.5% in its Instruct version to a more effective 45%, a significant factor in its improved Villager ELO.</p>
|
| 1497 |
+
</article>
|
| 1498 |
+
</section>
|
| 1499 |
<section id="panel-vprof-grok-4" class="tab-panel" role="tabpanel" aria-labelledby="vtab-prof-grok-4" aria-hidden="false">
|
| 1500 |
<article class="profile">
|
| 1501 |
<div class="profile-hd">
|
|
|
|
| 1944 |
<h3>Head-to-head grid → where edges actually appear</h3>
|
| 1945 |
<p>Rows are villagers, columns are wolves (cells show the villager side’s win-rate with match counts). Three patterns stand out:</p>
|
| 1946 |
<ul>
|
| 1947 |
+
<li><strong>True sweep capability.</strong> When <strong>GPT-5</strong> is the villager side, it reliably cleans up across columns, including against Kimi-K2 wolves. Conversely, when <strong>GPT-5</strong> is the wolf side, many rows crater (0–5 type cells). That’s ceiling behavior you don’t see elsewhere.</li>
|
| 1948 |
<li><strong>Spiky attackers vs. solid defenders.</strong> <strong>Kimi-K2</strong> as wolves punches through mid-tier villagers (e.g., Flash, mini) but gets contained by the better defenders (GPT-5, Gemini‑2.5‑Pro). <strong>Grok‑4</strong> shows a similar spike pattern: it can create a forcing lane against mid‑pack villagers, but disciplined defenders blunt its coordination edge.</li>
|
| 1949 |
<li><strong>Asymmetries you’d miss on a single Elo.</strong> <strong>Gemini‑2.5‑Pro</strong> posts steady wins as villagers against most wolf columns but doesn’t blow out the field as wolves; <strong>Qwen3</strong> is similar—noticeably better at holding ground than at breaking it. <strong>Grok‑4</strong> flips this in some pairings: stronger pressure as wolves than as villagers against the same opponent, a classic “rock‑paper‑scissors” edge that only the matrix reveals.</li>
|
| 1950 |
</ul>
|
|
|
|
| 1998 |
<script>
|
| 1999 |
(function(){
|
| 2000 |
var root=document.getElementById('h2h-embed'); if(!root) return;
|
| 2001 |
+
var models=['Gemini-2.5-Flash','Gemini-2.5-Pro','Kimi-K2-Instruct','openai/gpt-oss-120b','Qwen3-235B-A22B-Instruct-2507','kimi-k2-0905-preview','GPT-5','GPT-5-mini','Grok-4','grok-4-fast'];
|
| 2002 |
+
var short=['Gemini-2.5<br>Flash','Gemini-2.5<br>Pro','Kimi-K2<br>Instruct','GPT-OSS<br>120B','Qwen3<br>A22B','Kimi-K2<br>0905','GPT-5','GPT-5<br>mini','Grok-4','Grok-4<br>fast'];
|
| 2003 |
+
// Desired display order (by overall Elo high → low)
|
| 2004 |
+
// Try to derive from Elo table for perfect sync; fallback to static order
|
| 2005 |
+
var ELO_ORDER=(function(){
|
| 2006 |
+
var nodes=document.querySelectorAll('#elo-tbody .elo-model');
|
| 2007 |
+
if(!nodes || nodes.length===0){
|
| 2008 |
+
return ['GPT-5','Gemini-2.5-Flash','grok-4-fast','Gemini-2.5-Pro','kimi-k2-0905-preview','Grok-4','Qwen3-235B-A22B-Instruct-2507','Kimi-K2-Instruct','GPT-5-mini','openai/gpt-oss-120b'];
|
| 2009 |
+
}
|
| 2010 |
+
var ALIASES={
|
| 2011 |
+
'gpt-5':'GPT-5',
|
| 2012 |
+
'gemini-2.5-flash':'Gemini-2.5-Flash',
|
| 2013 |
+
'grok-4-fast-reasoning':'grok-4-fast',
|
| 2014 |
+
'gemini-2.5-pro':'Gemini-2.5-Pro',
|
| 2015 |
+
'kimi-k2-0905-preview':'kimi-k2-0905-preview',
|
| 2016 |
+
'grok-4':'Grok-4',
|
| 2017 |
+
'Qwen3-235B-A22B-Instruct-2507':'Qwen3-235B-A22B-Instruct-2507',
|
| 2018 |
+
'Kimi-K2-Instruct':'Kimi-K2-Instruct',
|
| 2019 |
+
'gpt-5-mini':'GPT-5-mini',
|
| 2020 |
+
'openai/gpt-oss-120b':'openai/gpt-oss-120b'
|
| 2021 |
+
};
|
| 2022 |
+
var order=Array.from(nodes).map(function(el){ return (ALIASES[el.textContent.trim()]||el.textContent.trim()); })
|
| 2023 |
+
.filter(function(name){ return models.indexOf(name)!==-1; });
|
| 2024 |
+
return order;
|
| 2025 |
+
})();
|
| 2026 |
var order=ELO_ORDER.map(function(name){ return models.indexOf(name); }).filter(function(i){ return i!==-1; });
|
| 2027 |
// Explicit villager-side results matrix [wins, losses] for rows (villagers) vs columns (wolves)
|
| 2028 |
var MATRIX=[
|
| 2029 |
+
[null, [3,2], [5,0], [5,0], [5,0], [5,0], [0,5], [3,2], [3,2], [5,0]],
|
| 2030 |
+
[[4,1], null, [4,1], [5,0], [5,0], [2,3], [2,3], [5,0], [5,0], [4,1]],
|
| 2031 |
+
[[2,3], [3,2], null, [4,1], [1,4], [2,3], [0,5], [2,3], [3,2], [1,4]],
|
| 2032 |
+
[[1,4], [1,4], [1,4], null, [2,3], [5,0], [0,5], [3,2], [1,4], [0,5]],
|
| 2033 |
+
[[4,1], [3,2], [3,2], [5,0], null, [2,3], [0,5], [4,1], [4,1], [1,4]],
|
| 2034 |
+
[[2,3], [2,3], [1,4], [5,0], [2,3], null, [1,4], [3,2], [4,1], [4,1]],
|
| 2035 |
+
[[5,0], [5,0], [5,0], [5,0], [5,0], [5,0], null, [5,0], [5,0], [5,0]],
|
| 2036 |
+
[[4,1], [2,3], [2,3], [4,0], [4,1], [2,3], [0,5], null, [3,2], [2,3]],
|
| 2037 |
+
[[3,2], [4,1], [4,1], [5,0], [5,0], [2,3], [0,5], [5,0], null, [2,3]],
|
| 2038 |
+
[[4,1], [5,0], [3,2], [5,0], [2,3], [2,3], [0,5], [1,4], [2,3], null]
|
| 2039 |
];
|
| 2040 |
var n=models.length; var W=Array(n).fill().map(function(){return Array(n).fill(null);}); var L=Array(n).fill().map(function(){return Array(n).fill(null);});
|
| 2041 |
for(var i0=0;i0<n;i0++){ for(var j0=0;j0<n;j0++){ var cell=MATRIX[i0][j0]; if(cell){ W[i0][j0]=cell[0]; L[i0][j0]=cell[1]; } } }
|
|
|
|
| 2102 |
// Map a rate (nominally 0..1) onto the five-stop colour ramp.
// Rates that fall between two stops are blended by interp(), which is
// expected to be available from the enclosing scope.
function getColor(rate){
  var ramp=[{t:0,c:'#fff6ef'},{t:0.2,c:'#ffd7b3'},{t:0.45,c:'#fc630d'},{t:0.75,c:'#d50039'},{t:1,c:'#990028'}];
  var last=ramp.length-1;
  // Scan from the darkest stop downwards; the first stop the rate reaches wins.
  for(var k=last;k>=0;k--){
    // Written as a negated >= so that non-comparable input (e.g. NaN) keeps scanning.
    if(!(rate>=ramp[k].t)) continue;
    if(k===last) return ramp[k].c;
    var from=ramp[k], to=ramp[k+1];
    return interp(from.c,to.c,(rate-from.t)/(to.t-from.t));
  }
  // Nothing matched (rate below the first stop or not comparable): lightest colour.
  return ramp[0].c;
}
|
| 2103 |
// Colour for a bar whose value is a percentage: clamp to 0..100, then
// rescale to 0..1 and look it up on the ramp via getColor().
function barColor(v){
  return getColor(Math.max(0,Math.min(100,v))/100);
}
|
| 2104 |
function init(){
|
| 2105 |
+
var labels = ['gpt-5', 'gemini-2.5-pro', 'gemini-2.5-flash', 'kimi-k2-0905-preview', 'grok-4', 'Qwen3-235B-A22B-Instruct-2507', 'Kimi-K2-Instruct', 'gpt-5-mini', 'openai/gpt-oss-120b', 'grok-4-fast-reasoning'];
|
| 2106 |
+
var day1 = [95.6, 55.6, 53.3, 53.3, 40.0, 48.9, 53.3, 35.6, 11.4, 66.7];
|
| 2107 |
+
var day2 = [90.2, 42.4, 35.5, 48.4, 62.5, 33.3, 45.2, 40.0, 6.2, 44.4];
|
| 2108 |
var ctx = canvas.getContext('2d');
|
| 2109 |
var isMobile = window.matchMedia && window.matchMedia('(max-width: 640px)').matches;
|
| 2110 |
if(isMobile){
|
|
|
|
| 2361 |
// Translate a rate (nominally 0..1) into a colour on the five-stop ramp.
// interp() is taken from the enclosing scope and blends two hex colours.
function getColor(rate){
  var palette=[{t:0,c:'#fff6ef'},{t:0.2,c:'#ffd7b3'},{t:0.45,c:'#fc630d'},{t:0.75,c:'#d50039'},{t:1,c:'#990028'}];
  var top=palette.length-1;
  var idx=top;
  // Step down until we find the highest stop whose threshold the rate reaches.
  while(idx>=0 && !(rate>=palette[idx].t)){ idx--; }
  // No stop reached: use the lightest colour.
  if(idx<0) return palette[0].c;
  // At or beyond the final stop there is nothing to blend towards.
  if(idx===top) return palette[top].c;
  var lower=palette[idx], upper=palette[idx+1];
  return interp(lower.c,upper.c,(rate-lower.t)/(upper.t-lower.t));
}
|
| 2362 |
// Pick the ramp colour for a percentage value.
function barColor(v){
  // Keep the value inside 0..100 before rescaling to the 0..1 range getColor() takes.
  var clamped=Math.max(0,Math.min(100,v));
  return getColor(clamped/100);
}
|
| 2363 |
function init(){
|
| 2364 |
+
var order = (window.ELO_ORDERED_MODELS || ['gpt-5','gemini-2.5-flash','grok-4-fast-reasoning','gemini-2.5-pro','kimi-k2-0905-preview','grok-4','Qwen3-235B-A22B-Instruct-2507','Kimi-K2-Instruct','gpt-5-mini','openai/gpt-oss-120b']);
|
| 2365 |
+
var map = {
|
| 2366 |
+
'gpt-5': 2.2,
|
| 2367 |
+
'gemini-2.5-pro': 37.8,
|
| 2368 |
+
'gemini-2.5-flash': 44.4,
|
| 2369 |
+
'kimi-k2-0905-preview': 42.2,
|
| 2370 |
+
'grok-4': 46.7,
|
| 2371 |
+
'Qwen3-235B-A22B-Instruct-2507': 48.9,
|
| 2372 |
+
'Kimi-K2-Instruct': 40.0,
|
| 2373 |
+
'gpt-5-mini': 52.3,
|
| 2374 |
+
'openai/gpt-oss-120b': 55.6,
|
| 2375 |
+
'grok-4-fast-reasoning': 46.7
|
| 2376 |
+
};
|
| 2377 |
+
var labels = order.filter(function(m){ return map[m]!==undefined; });
|
| 2378 |
+
var values = labels.map(function(m){ return map[m]; });
|
| 2379 |
var ctx = c.getContext('2d');
|
| 2380 |
new Chart(ctx, {
|
| 2381 |
type: 'bar',
|
|
|
|
| 2498 |
// Resolve a rate (nominally 0..1) to a colour on the five-stop ramp,
// blending between neighbouring stops with interp() from the enclosing scope.
function getColor(rate){
  var stopList=[{t:0,c:'#fff6ef'},{t:0.2,c:'#ffd7b3'},{t:0.45,c:'#fc630d'},{t:0.75,c:'#d50039'},{t:1,c:'#990028'}];
  var finalIdx=stopList.length-1;
  for(var pos=finalIdx;pos>=0;pos--){
    var here=stopList[pos];
    if(rate>=here.t){
      // The darkest stop has no upper neighbour, so return it unblended.
      if(pos===finalIdx){ return here.c; }
      var next=stopList[pos+1];
      var span=next.t-here.t;
      return interp(here.c,next.c,(rate-here.t)/span);
    }
  }
  // Rate never reached any stop: lightest colour.
  return stopList[0].c;
}
|
| 2499 |
// Bar colour for a percentage: clamp into 0..100, scale to 0..1, look up on the ramp.
function barColor(v){
  var upperBounded=Math.min(100,v);
  var pct=Math.max(0,upperBounded);
  return getColor(pct/100);
}
|
| 2500 |
function init(){
|
| 2501 |
+
var order = (window.ELO_ORDERED_MODELS || ['gpt-5','gemini-2.5-flash','grok-4-fast-reasoning','gemini-2.5-pro','kimi-k2-0905-preview','grok-4','Qwen3-235B-A22B-Instruct-2507','Kimi-K2-Instruct','gpt-5-mini','openai/gpt-oss-120b']);
|
| 2502 |
+
var map = {
|
| 2503 |
+
'gpt-5': 97.8,
|
| 2504 |
+
'gemini-2.5-pro': 60.0,
|
| 2505 |
+
'gemini-2.5-flash': 48.9,
|
| 2506 |
+
'kimi-k2-0905-preview': 53.3,
|
| 2507 |
+
'grok-4': 46.7,
|
| 2508 |
+
'qwen3-235b-instruct': 53.3,
|
| 2509 |
+
'gpt-5-mini': 40.0,
|
| 2510 |
+
'kimi-k2-instruct': 23.3,
|
| 2511 |
+
'gpt-oss-120b': 13.3,
|
| 2512 |
+
'grok-4-fast-reasoning': 44.4
|
| 2513 |
+
};
|
| 2514 |
+
var labels = order.map(function(m){ return m; }).filter(function(m){ return (map[m]!==undefined) || (map[m.toLowerCase()]!==undefined); });
|
| 2515 |
+
var values = labels.map(function(m){ return map[m]!==undefined ? map[m] : map[m.toLowerCase()]; });
|
| 2516 |
var ctx = c.getContext('2d');
|
| 2517 |
new Chart(ctx, {
|
| 2518 |
type: 'bar',
|