deepmage121 committed
Commit 4b56c7c · 1 parent: b3c0458

interim update in parser

Files changed (5):
  1. app.py +180 -106
  2. data_loader.py +253 -132
  3. eval.schema.json +221 -63
  4. hf_operations.py +58 -40
  5. ui_components.py +93 -77
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import gradio as gr
3
  import pandas as pd
4
  from pathlib import Path
@@ -11,7 +10,7 @@ from data_loader import (
11
  clear_cache,
12
  search_model_across_leaderboards,
13
  get_model_suggestions_fast,
14
- DATA_DIR
15
  )
16
  from ui_components import (
17
  get_theme,
@@ -29,12 +28,12 @@ PAGE_SIZE = 50
29
  def get_leaderboard_data(selected_leaderboard, progress=gr.Progress()):
30
  if not selected_leaderboard:
31
  return pd.DataFrame(), {}
32
-
33
  metadata = get_eval_metadata(selected_leaderboard)
34
-
35
  def progress_callback(value, desc):
36
  progress(value, desc=desc)
37
-
38
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
39
  return df, metadata
40
 
@@ -42,27 +41,32 @@ def get_leaderboard_data(selected_leaderboard, progress=gr.Progress()):
42
  def filter_and_paginate(df, search_query, sort_column, selected_columns, current_page):
43
  if df.empty:
44
  return df.copy(), 1, 1
45
-
46
  df = df.copy()
47
  all_columns = list(df.columns)
48
-
49
  if selected_columns:
50
- cols = ["Model"] + [c for c in all_columns if c in selected_columns and c != "Model"]
 
 
51
  df = df[cols]
52
-
53
  if search_query:
54
- mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
55
  df = df[mask]
56
-
57
  if sort_column and sort_column in df.columns:
58
- df = df.sort_values(by=sort_column, ascending=False, na_position='last')
59
-
60
  total_rows = len(df)
61
  total_pages = max(1, (total_rows + PAGE_SIZE - 1) // PAGE_SIZE)
62
  current_page = max(1, min(current_page, total_pages))
63
  start = (current_page - 1) * PAGE_SIZE
64
  end = start + PAGE_SIZE
65
-
66
  return df.iloc[start:end], current_page, total_pages
67
 
68
 
@@ -74,9 +78,9 @@ def search_model(model_query):
74
  <p>Enter a model name to see its benchmarks across all leaderboards</p>
75
  </div>
76
  """
77
-
78
  results, _ = search_model_across_leaderboards(model_query)
79
-
80
  if not results:
81
  return f"""
82
  <div class="no-results">
@@ -84,74 +88,88 @@ def search_model(model_query):
84
  <p>Try a different model name or check the spelling</p>
85
  </div>
86
  """
87
-
88
  model_name = list(results.keys())[0]
89
  model_data = results[model_name]
90
-
91
  return format_model_card(model_name, model_data)
92
 
93
 
94
  def compare_models(selected_models):
95
  if not selected_models:
96
- return """
 
97
  <div class="no-results">
98
  <h3>Select models to compare</h3>
99
  <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
100
  </div>
101
- """, None
102
-
 
 
103
  all_results = {}
104
  for model_name in selected_models:
105
  results, _ = search_model_across_leaderboards(model_name)
106
  if results:
107
  matched_model = list(results.keys())[0]
108
  all_results[matched_model] = results[matched_model]
109
-
110
  plot = create_radar_plot(list(all_results.keys()), all_results)
111
-
112
  if len(all_results) == 1:
113
  model_name = list(all_results.keys())[0]
114
  return format_model_card(model_name, all_results[model_name]), plot
115
  elif len(all_results) > 1:
116
  return format_model_comparison(list(all_results.keys()), all_results), plot
117
  else:
118
- return """
 
119
  <div class="no-results">
120
  <h3>No results found</h3>
121
  <p>Try selecting different models</p>
122
  </div>
123
- """, None
 
 
124
 
125
 
126
  def get_model_suggestions(value):
127
  query = value or ""
128
  if not query or len(query) < 2:
129
  return gr.update(choices=[], value=[])
130
-
131
  matches = get_model_suggestions_fast(query, limit=10)
132
  return gr.update(choices=matches, value=[])
133
 
134
 
135
- def export_leaderboard_to_csv(full_df, selected_leaderboard, search_query, selected_columns):
 
 
136
  """Export the current leaderboard view to CSV."""
137
  if full_df.empty:
138
  return None
139
-
140
  df = full_df.copy()
141
-
142
  # Apply column selection
143
  if selected_columns:
144
- cols = ["Model"] + [c for c in df.columns if c in selected_columns and c != "Model"]
 
 
145
  df = df[cols]
146
-
147
  # Apply search filter
148
  if search_query:
149
- mask = df.astype(str).apply(lambda row: row.str.contains(search_query, case=False, na=False).any(), axis=1)
150
  df = df[mask]
151
-
152
  # Save to CSV with absolute path
153
  from pathlib import Path
154
  import tempfile
 
155
  temp_dir = Path(tempfile.gettempdir())
156
  filename = temp_dir / f"{selected_leaderboard.replace(' ', '_')}_leaderboard.csv"
157
  df.to_csv(filename, index=False)
@@ -162,17 +180,17 @@ def export_comparison_to_csv(selected_models):
162
  """Export model comparison to CSV."""
163
  if not selected_models:
164
  return None
165
-
166
  all_results = {}
167
  for model_name in selected_models:
168
  results, _ = search_model_across_leaderboards(model_name)
169
  if results:
170
  matched_model = list(results.keys())[0]
171
  all_results[matched_model] = results[matched_model]
172
-
173
  if not all_results:
174
  return None
175
-
176
  # Build comparison table
177
  rows = []
178
  for model_name, model_data in all_results.items():
@@ -184,14 +202,15 @@ def export_comparison_to_csv(selected_models):
184
  "Developer": data.get("developer"),
185
  "Params (B)": data.get("params"),
186
  "Architecture": data.get("architecture"),
187
- "Precision": data.get("precision")
188
  }
189
  row.update(results)
190
  rows.append(row)
191
-
192
  df = pd.DataFrame(rows)
193
  from pathlib import Path
194
  import tempfile
 
195
  temp_dir = Path(tempfile.gettempdir())
196
  filename = temp_dir / "model_comparison.csv"
197
  df.to_csv(filename, index=False)
@@ -205,8 +224,12 @@ initial_leaderboard = initial_leaderboards[0] if initial_leaderboards else None
205
 
206
  if initial_leaderboard:
207
  _init_df, _init_metadata = get_leaderboard_data(initial_leaderboard)
208
- _init_columns = [c for c in _init_df.columns if c != "Model"] if not _init_df.empty else []
209
- _init_df_display, _, _init_total_pages = filter_and_paginate(_init_df, "", "Average", None, 1)
210
  else:
211
  _init_df = pd.DataFrame()
212
  _init_metadata = {}
@@ -214,12 +237,13 @@ else:
214
  _init_df_display = pd.DataFrame()
215
  _init_total_pages = 1
216
 
217
- with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css()) as demo:
218
-
 
219
  full_df_state = gr.State(value=_init_df)
220
  metadata_state = gr.State(value=_init_metadata)
221
  current_page_state = gr.State(value=1)
222
-
223
  gr.HTML("""
224
  <div class="app-header">
225
  <div class="logo-mark">E³</div>
@@ -232,7 +256,7 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
232
  </div>
233
  </div>
234
  """)
235
-
236
  with gr.Tabs():
237
  with gr.TabItem("Leaderboards"):
238
  with gr.Column(elem_classes="controls-bar"):
@@ -242,21 +266,25 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
242
  choices=initial_leaderboards,
243
  value=initial_leaderboard,
244
  label="Leaderboard",
245
- interactive=True
246
  )
247
  with gr.Column(scale=1, min_width=120):
248
- refresh_btn = gr.Button("↻ Refresh", variant="secondary", size="sm")
 
 
249
  with gr.Column(scale=1, min_width=120):
250
- export_btn = gr.DownloadButton("📥 Export CSV", variant="secondary", size="sm")
 
 
251
 
252
  search_box = gr.Textbox(
253
- label="Filter",
254
- placeholder="Filter models...",
255
- show_label=True
256
  )
257
-
258
- header_view = gr.HTML(value=format_leaderboard_header(initial_leaderboard, _init_metadata))
259
-
 
 
260
  with gr.Row(elem_classes="column-selector-bar"):
261
  with gr.Column(scale=5, min_width=320):
262
  column_selector = gr.Dropdown(
@@ -265,9 +293,9 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
265
  label="Columns to Display",
266
  multiselect=True,
267
  interactive=True,
268
- elem_classes="column-selector-dropdown"
269
  )
270
-
271
  leaderboard_table = gr.Dataframe(
272
  value=_init_df_display,
273
  label=None,
@@ -275,17 +303,21 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
275
  wrap=False,
276
  elem_classes="dataframe",
277
  )
278
-
279
  with gr.Row(elem_classes="pagination-bar"):
280
  prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
281
- page_info = gr.Markdown(value=f"1 / {_init_total_pages}", elem_classes="page-info")
 
 
282
  next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)
283
-
284
- metrics_view = gr.HTML(value=format_metric_details(initial_leaderboard, _init_metadata))
285
-
 
 
286
  with gr.TabItem("🔍 Model Lookup"):
287
  gr.Markdown("### Find and compare models across all leaderboards")
288
-
289
  selected_models_state = gr.State(value=[])
290
  default_compare_html = """
291
  <div class="no-results">
@@ -293,7 +325,7 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
293
  <p>Type in the dropdown to search, then select a model to add it</p>
294
  </div>
295
  """
296
-
297
  with gr.Row(elem_classes="controls-bar"):
298
  with gr.Column(scale=3):
299
  model_search_box = gr.Textbox(
@@ -312,25 +344,29 @@ with gr.Blocks(title="Every Eval Ever", theme=get_theme(), css=get_custom_css())
312
  elem_classes=["match-pills"],
313
  )
314
  with gr.Column(scale=1, min_width=80):
315
- clear_models_btn = gr.Button("Clear", variant="secondary", size="sm")
316
-
 
 
317
  selected_models_group = gr.CheckboxGroup(
318
  choices=[],
319
  value=[],
320
  label="Selected Models (click to remove)",
321
  interactive=True,
322
- elem_classes="selected-models-group"
323
  )
324
-
325
  with gr.Row():
326
  with gr.Column(scale=4):
327
  pass
328
  with gr.Column(scale=1, min_width=120):
329
- export_comparison_btn = gr.DownloadButton("📥 Export CSV", variant="secondary", size="sm")
330
-
 
 
331
  radar_view = gr.Plot(label="Radar Comparison")
332
  model_card_view = gr.HTML(value=default_compare_html)
333
-
334
  with gr.Accordion("📤 How to Submit Data", open=False):
335
  gr.Markdown("""
336
  Submit via GitHub Pull Request:
@@ -341,12 +377,12 @@ Submit via GitHub Pull Request:
341
 
342
  [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) - [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
343
  """)
344
-
345
  def load_leaderboard(leaderboard_name):
346
  df, metadata = get_leaderboard_data(leaderboard_name)
347
  columns = [c for c in df.columns if c != "Model"] if not df.empty else []
348
  df_display, page, total_pages = filter_and_paginate(df, "", "Average", None, 1)
349
-
350
  return (
351
  df, # full_df_state
352
  metadata, # metadata_state
@@ -357,83 +393,103 @@ Submit via GitHub Pull Request:
357
  gr.update(choices=columns, value=columns), # column_selector
358
  f"1 / {total_pages}", # page_info
359
  )
360
-
361
  def update_table(full_df, search_query, selected_columns, current_page):
362
  df_display, page, total_pages = filter_and_paginate(
363
  full_df, search_query, "Average", selected_columns, current_page
364
  )
365
  return df_display, f"{page} / {total_pages}", page
366
-
367
  def go_page(full_df, search_query, selected_columns, current_page, delta):
368
  new_page = max(1, current_page + delta)
369
  df_display, page, total_pages = filter_and_paginate(
370
  full_df, search_query, "Average", selected_columns, new_page
371
  )
372
  return df_display, f"{page} / {total_pages}", page
373
-
374
  leaderboard_selector.change(
375
  fn=load_leaderboard,
376
  inputs=[leaderboard_selector],
377
- outputs=[full_df_state, metadata_state, current_page_state, leaderboard_table, header_view, metrics_view, column_selector, page_info]
 
378
  )
379
-
380
  search_box.input(
381
  fn=lambda df, q, cols: update_table(df, q, cols, 1),
382
  inputs=[full_df_state, search_box, column_selector],
383
- outputs=[leaderboard_table, page_info, current_page_state]
384
  )
385
-
386
  def on_column_change(df, q, cols):
387
  if not cols:
388
  cols = [c for c in df.columns if c != "Model"]
389
  return update_table(df, q, cols, 1)
390
-
391
  column_selector.change(
392
  fn=on_column_change,
393
  inputs=[full_df_state, search_box, column_selector],
394
- outputs=[leaderboard_table, page_info, current_page_state]
395
  )
396
-
397
  prev_btn.click(
398
  fn=lambda df, q, cols, p: go_page(df, q, cols, p, -1),
399
  inputs=[full_df_state, search_box, column_selector, current_page_state],
400
- outputs=[leaderboard_table, page_info, current_page_state]
401
  )
402
-
403
  next_btn.click(
404
  fn=lambda df, q, cols, p: go_page(df, q, cols, p, 1),
405
  inputs=[full_df_state, search_box, column_selector, current_page_state],
406
- outputs=[leaderboard_table, page_info, current_page_state]
407
  )
408
-
409
  refresh_btn.click(
410
  fn=lambda: (clear_cache(), gr.update(choices=get_available_leaderboards()))[1],
411
- outputs=[leaderboard_selector]
412
  )
413
-
414
  export_btn.click(
415
  fn=export_leaderboard_to_csv,
416
  inputs=[full_df_state, leaderboard_selector, search_box, column_selector],
417
- outputs=[export_btn]
418
  )
419
-
420
  def add_models_from_search(selected_from_results, current_selected):
421
  selected_from_results = selected_from_results or []
422
  current_selected = current_selected or []
423
  merged = list(dict.fromkeys(current_selected + selected_from_results))
424
- comparison_html, plot = compare_models(merged) if merged else (default_compare_html, None)
 
 
425
  return (
426
  merged,
427
  gr.update(choices=[], value=[]),
428
  gr.update(choices=merged, value=merged),
429
  comparison_html,
430
- plot
431
  )
432
-
433
  def update_selection(selected_list):
434
- comparison_html, plot = compare_models(selected_list) if selected_list else (default_compare_html, None)
435
- return selected_list, gr.update(choices=selected_list, value=selected_list), comparison_html, plot
-
 
 
437
  def clear_all_models():
438
  return (
439
  [],
@@ -441,9 +497,9 @@ Submit via GitHub Pull Request:
441
  gr.update(choices=[], value=[]),
442
  gr.update(choices=[], value=[]),
443
  default_compare_html,
444
- None
445
  )
446
-
447
  search_button.click(
448
  fn=get_model_suggestions,
449
  inputs=[model_search_box],
@@ -456,30 +512,48 @@ Submit via GitHub Pull Request:
456
  outputs=[search_results],
457
  queue=False,
458
  )
459
-
460
  search_results.change(
461
  fn=add_models_from_search,
462
  inputs=[search_results, selected_models_state],
463
- outputs=[selected_models_state, search_results, selected_models_group, model_card_view, radar_view],
 
 
 
 
 
 
464
  )
465
-
466
  selected_models_group.change(
467
  fn=update_selection,
468
  inputs=[selected_models_group],
469
- outputs=[selected_models_state, selected_models_group, model_card_view, radar_view]
 
 
 
 
 
470
  )
471
-
472
  clear_models_btn.click(
473
  fn=clear_all_models,
474
- outputs=[selected_models_state, model_search_box, search_results, selected_models_group, model_card_view, radar_view]
 
 
 
 
 
 
 
475
  )
476
-
477
  export_comparison_btn.click(
478
  fn=export_comparison_to_csv,
479
  inputs=[selected_models_state],
480
- outputs=[export_comparison_btn]
481
  )
482
-
483
  DATA_DIR.mkdir(exist_ok=True)
484
 
485
  if __name__ == "__main__":
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  from pathlib import Path
 
10
  clear_cache,
11
  search_model_across_leaderboards,
12
  get_model_suggestions_fast,
13
+ DATA_DIR,
14
  )
15
  from ui_components import (
16
  get_theme,
 
28
  def get_leaderboard_data(selected_leaderboard, progress=gr.Progress()):
29
  if not selected_leaderboard:
30
  return pd.DataFrame(), {}
31
+
32
  metadata = get_eval_metadata(selected_leaderboard)
33
+
34
  def progress_callback(value, desc):
35
  progress(value, desc=desc)
36
+
37
  df = build_leaderboard_table(selected_leaderboard, "", progress_callback)
38
  return df, metadata
39
 
 
41
  def filter_and_paginate(df, search_query, sort_column, selected_columns, current_page):
42
  if df.empty:
43
  return df.copy(), 1, 1
44
+
45
  df = df.copy()
46
  all_columns = list(df.columns)
47
+
48
  if selected_columns:
49
+ cols = ["Model"] + [
50
+ c for c in all_columns if c in selected_columns and c != "Model"
51
+ ]
52
  df = df[cols]
53
+
54
  if search_query:
55
+ mask = df.astype(str).apply(
56
+ lambda row: row.str.contains(search_query, case=False, na=False).any(),
57
+ axis=1,
58
+ )
59
  df = df[mask]
60
+
61
  if sort_column and sort_column in df.columns:
62
+ df = df.sort_values(by=sort_column, ascending=False, na_position="last")
63
+
64
  total_rows = len(df)
65
  total_pages = max(1, (total_rows + PAGE_SIZE - 1) // PAGE_SIZE)
66
  current_page = max(1, min(current_page, total_pages))
67
  start = (current_page - 1) * PAGE_SIZE
68
  end = start + PAGE_SIZE
69
+
70
  return df.iloc[start:end], current_page, total_pages
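As a quick illustration, here is what the filter-and-paginate step above amounts to on a toy frame; PAGE_SIZE comes from this file, the data is made up:

import pandas as pd

PAGE_SIZE = 50

df = pd.DataFrame({
    "Model": ["org/model-a", "org/model-b"],
    "Average": [71.2, 65.4],
})

# Row-wise, case-insensitive substring filter (same mask as in filter_and_paginate).
query = "model-a"
mask = df.astype(str).apply(
    lambda row: row.str.contains(query, case=False, na=False).any(), axis=1
)
filtered = df[mask]

# Ceiling division for the page count, then slice out the requested page.
total_pages = max(1, (len(filtered) + PAGE_SIZE - 1) // PAGE_SIZE)
page = 1
start, end = (page - 1) * PAGE_SIZE, page * PAGE_SIZE
print(filtered.iloc[start:end], total_pages)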
71
 
72
 
 
78
  <p>Enter a model name to see its benchmarks across all leaderboards</p>
79
  </div>
80
  """
81
+
82
  results, _ = search_model_across_leaderboards(model_query)
83
+
84
  if not results:
85
  return f"""
86
  <div class="no-results">
 
88
  <p>Try a different model name or check the spelling</p>
89
  </div>
90
  """
91
+
92
  model_name = list(results.keys())[0]
93
  model_data = results[model_name]
94
+
95
  return format_model_card(model_name, model_data)
96
 
97
 
98
  def compare_models(selected_models):
99
  if not selected_models:
100
+ return (
101
+ """
102
  <div class="no-results">
103
  <h3>Select models to compare</h3>
104
  <p>Choose multiple models from the dropdown to see a side-by-side comparison</p>
105
  </div>
106
+ """,
107
+ None,
108
+ )
109
+
110
  all_results = {}
111
  for model_name in selected_models:
112
  results, _ = search_model_across_leaderboards(model_name)
113
  if results:
114
  matched_model = list(results.keys())[0]
115
  all_results[matched_model] = results[matched_model]
116
+
117
  plot = create_radar_plot(list(all_results.keys()), all_results)
118
+
119
  if len(all_results) == 1:
120
  model_name = list(all_results.keys())[0]
121
  return format_model_card(model_name, all_results[model_name]), plot
122
  elif len(all_results) > 1:
123
  return format_model_comparison(list(all_results.keys()), all_results), plot
124
  else:
125
+ return (
126
+ """
127
  <div class="no-results">
128
  <h3>No results found</h3>
129
  <p>Try selecting different models</p>
130
  </div>
131
+ """,
132
+ None,
133
+ )
134
 
135
 
136
  def get_model_suggestions(value):
137
  query = value or ""
138
  if not query or len(query) < 2:
139
  return gr.update(choices=[], value=[])
140
+
141
  matches = get_model_suggestions_fast(query, limit=10)
142
  return gr.update(choices=matches, value=[])
143
 
144
 
145
+ def export_leaderboard_to_csv(
146
+ full_df, selected_leaderboard, search_query, selected_columns
147
+ ):
148
  """Export the current leaderboard view to CSV."""
149
  if full_df.empty:
150
  return None
151
+
152
  df = full_df.copy()
153
+
154
  # Apply column selection
155
  if selected_columns:
156
+ cols = ["Model"] + [
157
+ c for c in df.columns if c in selected_columns and c != "Model"
158
+ ]
159
  df = df[cols]
160
+
161
  # Apply search filter
162
  if search_query:
163
+ mask = df.astype(str).apply(
164
+ lambda row: row.str.contains(search_query, case=False, na=False).any(),
165
+ axis=1,
166
+ )
167
  df = df[mask]
168
+
169
  # Save to CSV with absolute path
170
  from pathlib import Path
171
  import tempfile
172
+
173
  temp_dir = Path(tempfile.gettempdir())
174
  filename = temp_dir / f"{selected_leaderboard.replace(' ', '_')}_leaderboard.csv"
175
  df.to_csv(filename, index=False)
 
180
  """Export model comparison to CSV."""
181
  if not selected_models:
182
  return None
183
+
184
  all_results = {}
185
  for model_name in selected_models:
186
  results, _ = search_model_across_leaderboards(model_name)
187
  if results:
188
  matched_model = list(results.keys())[0]
189
  all_results[matched_model] = results[matched_model]
190
+
191
  if not all_results:
192
  return None
193
+
194
  # Build comparison table
195
  rows = []
196
  for model_name, model_data in all_results.items():
 
202
  "Developer": data.get("developer"),
203
  "Params (B)": data.get("params"),
204
  "Architecture": data.get("architecture"),
205
+ "Precision": data.get("precision"),
206
  }
207
  row.update(results)
208
  rows.append(row)
209
+
210
  df = pd.DataFrame(rows)
211
  from pathlib import Path
212
  import tempfile
213
+
214
  temp_dir = Path(tempfile.gettempdir())
215
  filename = temp_dir / "model_comparison.csv"
216
  df.to_csv(filename, index=False)
 
224
 
225
  if initial_leaderboard:
226
  _init_df, _init_metadata = get_leaderboard_data(initial_leaderboard)
227
+ _init_columns = (
228
+ [c for c in _init_df.columns if c != "Model"] if not _init_df.empty else []
229
+ )
230
+ _init_df_display, _, _init_total_pages = filter_and_paginate(
231
+ _init_df, "", "Average", None, 1
232
+ )
233
  else:
234
  _init_df = pd.DataFrame()
235
  _init_metadata = {}
 
237
  _init_df_display = pd.DataFrame()
238
  _init_total_pages = 1
239
 
240
+ with gr.Blocks(
241
+ title="Every Eval Ever", theme=get_theme(), css=get_custom_css()
242
+ ) as demo:
243
  full_df_state = gr.State(value=_init_df)
244
  metadata_state = gr.State(value=_init_metadata)
245
  current_page_state = gr.State(value=1)
246
+
247
  gr.HTML("""
248
  <div class="app-header">
249
  <div class="logo-mark">E³</div>
 
256
  </div>
257
  </div>
258
  """)
259
+
260
  with gr.Tabs():
261
  with gr.TabItem("Leaderboards"):
262
  with gr.Column(elem_classes="controls-bar"):
 
266
  choices=initial_leaderboards,
267
  value=initial_leaderboard,
268
  label="Leaderboard",
269
+ interactive=True,
270
  )
271
  with gr.Column(scale=1, min_width=120):
272
+ refresh_btn = gr.Button(
273
+ "↻ Refresh", variant="secondary", size="sm"
274
+ )
275
  with gr.Column(scale=1, min_width=120):
276
+ export_btn = gr.DownloadButton(
277
+ "📥 Export CSV", variant="secondary", size="sm"
278
+ )
279
 
280
  search_box = gr.Textbox(
281
+ label="Filter", placeholder="Filter models...", show_label=True
 
 
282
  )
283
+
284
+ header_view = gr.HTML(
285
+ value=format_leaderboard_header(initial_leaderboard, _init_metadata)
286
+ )
287
+
288
  with gr.Row(elem_classes="column-selector-bar"):
289
  with gr.Column(scale=5, min_width=320):
290
  column_selector = gr.Dropdown(
 
293
  label="Columns to Display",
294
  multiselect=True,
295
  interactive=True,
296
+ elem_classes="column-selector-dropdown",
297
  )
298
+
299
  leaderboard_table = gr.Dataframe(
300
  value=_init_df_display,
301
  label=None,
 
303
  wrap=False,
304
  elem_classes="dataframe",
305
  )
306
+
307
  with gr.Row(elem_classes="pagination-bar"):
308
  prev_btn = gr.Button("←", variant="secondary", size="sm", min_width=60)
309
+ page_info = gr.Markdown(
310
+ value=f"1 / {_init_total_pages}", elem_classes="page-info"
311
+ )
312
  next_btn = gr.Button("→", variant="secondary", size="sm", min_width=60)
313
+
314
+ metrics_view = gr.HTML(
315
+ value=format_metric_details(initial_leaderboard, _init_metadata)
316
+ )
317
+
318
  with gr.TabItem("🔍 Model Lookup"):
319
  gr.Markdown("### Find and compare models across all leaderboards")
320
+
321
  selected_models_state = gr.State(value=[])
322
  default_compare_html = """
323
  <div class="no-results">
 
325
  <p>Type in the dropdown to search, then select a model to add it</p>
326
  </div>
327
  """
328
+
329
  with gr.Row(elem_classes="controls-bar"):
330
  with gr.Column(scale=3):
331
  model_search_box = gr.Textbox(
 
344
  elem_classes=["match-pills"],
345
  )
346
  with gr.Column(scale=1, min_width=80):
347
+ clear_models_btn = gr.Button(
348
+ "Clear", variant="secondary", size="sm"
349
+ )
350
+
351
  selected_models_group = gr.CheckboxGroup(
352
  choices=[],
353
  value=[],
354
  label="Selected Models (click to remove)",
355
  interactive=True,
356
+ elem_classes="selected-models-group",
357
  )
358
+
359
  with gr.Row():
360
  with gr.Column(scale=4):
361
  pass
362
  with gr.Column(scale=1, min_width=120):
363
+ export_comparison_btn = gr.DownloadButton(
364
+ "📥 Export CSV", variant="secondary", size="sm"
365
+ )
366
+
367
  radar_view = gr.Plot(label="Radar Comparison")
368
  model_card_view = gr.HTML(value=default_compare_html)
369
+
370
  with gr.Accordion("📤 How to Submit Data", open=False):
371
  gr.Markdown("""
372
  Submit via GitHub Pull Request:
 
377
 
378
  [Submission Guide](https://github.com/evaleval/every_eval_ever#contributor-guide) - [JSON Schema](https://github.com/evaleval/every_eval_ever/blob/main/eval.schema.json)
379
  """)
380
+
381
  def load_leaderboard(leaderboard_name):
382
  df, metadata = get_leaderboard_data(leaderboard_name)
383
  columns = [c for c in df.columns if c != "Model"] if not df.empty else []
384
  df_display, page, total_pages = filter_and_paginate(df, "", "Average", None, 1)
385
+
386
  return (
387
  df, # full_df_state
388
  metadata, # metadata_state
 
393
  gr.update(choices=columns, value=columns), # column_selector
394
  f"1 / {total_pages}", # page_info
395
  )
396
+
397
  def update_table(full_df, search_query, selected_columns, current_page):
398
  df_display, page, total_pages = filter_and_paginate(
399
  full_df, search_query, "Average", selected_columns, current_page
400
  )
401
  return df_display, f"{page} / {total_pages}", page
402
+
403
  def go_page(full_df, search_query, selected_columns, current_page, delta):
404
  new_page = max(1, current_page + delta)
405
  df_display, page, total_pages = filter_and_paginate(
406
  full_df, search_query, "Average", selected_columns, new_page
407
  )
408
  return df_display, f"{page} / {total_pages}", page
409
+
410
  leaderboard_selector.change(
411
  fn=load_leaderboard,
412
  inputs=[leaderboard_selector],
413
+ outputs=[
414
+ full_df_state,
415
+ metadata_state,
416
+ current_page_state,
417
+ leaderboard_table,
418
+ header_view,
419
+ metrics_view,
420
+ column_selector,
421
+ page_info,
422
+ ],
423
  )
424
+
425
  search_box.input(
426
  fn=lambda df, q, cols: update_table(df, q, cols, 1),
427
  inputs=[full_df_state, search_box, column_selector],
428
+ outputs=[leaderboard_table, page_info, current_page_state],
429
  )
430
+
431
  def on_column_change(df, q, cols):
432
  if not cols:
433
  cols = [c for c in df.columns if c != "Model"]
434
  return update_table(df, q, cols, 1)
435
+
436
  column_selector.change(
437
  fn=on_column_change,
438
  inputs=[full_df_state, search_box, column_selector],
439
+ outputs=[leaderboard_table, page_info, current_page_state],
440
  )
441
+
442
  prev_btn.click(
443
  fn=lambda df, q, cols, p: go_page(df, q, cols, p, -1),
444
  inputs=[full_df_state, search_box, column_selector, current_page_state],
445
+ outputs=[leaderboard_table, page_info, current_page_state],
446
  )
447
+
448
  next_btn.click(
449
  fn=lambda df, q, cols, p: go_page(df, q, cols, p, 1),
450
  inputs=[full_df_state, search_box, column_selector, current_page_state],
451
+ outputs=[leaderboard_table, page_info, current_page_state],
452
  )
453
+
454
  refresh_btn.click(
455
  fn=lambda: (clear_cache(), gr.update(choices=get_available_leaderboards()))[1],
456
+ outputs=[leaderboard_selector],
457
  )
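The refresh handler above runs a cache clear and returns fresh dropdown choices from a single lambda by indexing into a tuple; a standalone sketch of the same pattern with stand-in functions:

def clear_cache():
    print("cache cleared")

def get_available_leaderboards():
    return ["Leaderboard A", "Leaderboard B"]

# (side_effect(), value)[1] evaluates both expressions left to right and
# keeps only the second one, so the lambda stays a single expression.
refresh = lambda: (clear_cache(), get_available_leaderboards())[1]
print(refresh())  # ['Leaderboard A', 'Leaderboard B']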
458
+
459
  export_btn.click(
460
  fn=export_leaderboard_to_csv,
461
  inputs=[full_df_state, leaderboard_selector, search_box, column_selector],
462
+ outputs=[export_btn],
463
  )
464
+
465
  def add_models_from_search(selected_from_results, current_selected):
466
  selected_from_results = selected_from_results or []
467
  current_selected = current_selected or []
468
  merged = list(dict.fromkeys(current_selected + selected_from_results))
469
+ comparison_html, plot = (
470
+ compare_models(merged) if merged else (default_compare_html, None)
471
+ )
472
  return (
473
  merged,
474
  gr.update(choices=[], value=[]),
475
  gr.update(choices=merged, value=merged),
476
  comparison_html,
477
+ plot,
478
  )
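list(dict.fromkeys(...)) above is an order-preserving de-duplication; a quick illustration with placeholder model ids:

current_selected = ["org/model-a", "org/model-b"]
selected_from_results = ["org/model-b", "org/model-c"]

# dict keys are unique and keep insertion order (Python 3.7+), so earlier
# selections win and duplicates from the search results are dropped.
merged = list(dict.fromkeys(current_selected + selected_from_results))
print(merged)  # ['org/model-a', 'org/model-b', 'org/model-c']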
479
+
480
  def update_selection(selected_list):
481
+ comparison_html, plot = (
482
+ compare_models(selected_list)
483
+ if selected_list
484
+ else (default_compare_html, None)
485
+ )
486
+ return (
487
+ selected_list,
488
+ gr.update(choices=selected_list, value=selected_list),
489
+ comparison_html,
490
+ plot,
491
+ )
492
+
493
  def clear_all_models():
494
  return (
495
  [],
 
497
  gr.update(choices=[], value=[]),
498
  gr.update(choices=[], value=[]),
499
  default_compare_html,
500
+ None,
501
  )
502
+
503
  search_button.click(
504
  fn=get_model_suggestions,
505
  inputs=[model_search_box],
 
512
  outputs=[search_results],
513
  queue=False,
514
  )
515
+
516
  search_results.change(
517
  fn=add_models_from_search,
518
  inputs=[search_results, selected_models_state],
519
+ outputs=[
520
+ selected_models_state,
521
+ search_results,
522
+ selected_models_group,
523
+ model_card_view,
524
+ radar_view,
525
+ ],
526
  )
527
+
528
  selected_models_group.change(
529
  fn=update_selection,
530
  inputs=[selected_models_group],
531
+ outputs=[
532
+ selected_models_state,
533
+ selected_models_group,
534
+ model_card_view,
535
+ radar_view,
536
+ ],
537
  )
538
+
539
  clear_models_btn.click(
540
  fn=clear_all_models,
541
+ outputs=[
542
+ selected_models_state,
543
+ model_search_box,
544
+ search_results,
545
+ selected_models_group,
546
+ model_card_view,
547
+ radar_view,
548
+ ],
549
  )
550
+
551
  export_comparison_btn.click(
552
  fn=export_comparison_to_csv,
553
  inputs=[selected_models_state],
554
+ outputs=[export_comparison_btn],
555
  )
556
+
557
  DATA_DIR.mkdir(exist_ok=True)
558
 
559
  if __name__ == "__main__":
data_loader.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Data Loader: Load from HuggingFace, parse JSON files, and build tables.
3
  """
 
4
  import json
5
  import pandas as pd
6
  from pathlib import Path
@@ -26,70 +27,133 @@ def load_hf_dataset_on_startup():
26
  print("Loading dataset from HuggingFace...")
27
  try:
28
  dataset = load_dataset("evaleval/every_eval_ever")
29
-
30
  for split_name, split_data in dataset.items():
31
  print(f"Loading split: {split_name} ({len(split_data)} rows)")
32
-
33
  df = split_data.to_pandas()
34
  parsed_items = []
35
-
36
  for _, row in df.iterrows():
37
- evaluation_results = json.loads(row['evaluation_results'])
-
 
 
 
39
  results = {}
40
  for eval_result in evaluation_results:
41
  eval_name = eval_result.get("evaluation_name")
42
  score = eval_result.get("score_details", {}).get("score")
43
  if eval_name and score is not None:
44
  results[eval_name] = score
45
-
46
- additional_details = {}
47
- if pd.notna(row.get('additional_details')):
48
- additional_details = json.loads(row['additional_details'])
49
-
50
  parsed_item = {
51
- "leaderboard": row['_leaderboard'],
52
- "provider": row['source_organization_name'],
53
- "model": row['model_id'],
54
- "developer": row['model_developer'],
55
- "params": additional_details.get('params_billions'),
56
- "architecture": additional_details.get('architecture', 'Unknown'),
57
- "precision": additional_details.get('precision', 'Unknown'),
 
 
 
 
 
 
 
 
58
  "results": results,
59
  "raw_data": {
60
- "schema_version": row['schema_version'],
61
- "evaluation_id": row['evaluation_id'],
62
- "retrieved_timestamp": row['retrieved_timestamp'],
63
- "source_data": json.loads(row['source_data']),
64
- "evaluation_source": {
65
- "evaluation_source_name": row['evaluation_source_name'],
66
- "evaluation_source_type": row['evaluation_source_type']
67
- },
68
- "source_metadata": {
69
- "source_organization_name": row['source_organization_name'],
70
- "evaluator_relationship": row['evaluator_relationship'],
71
- },
72
- "model_info": {
73
- "name": row['model_name'],
74
- "id": row['model_id'],
75
- "developer": row['model_developer'],
76
- },
77
  "evaluation_results": evaluation_results,
78
- "additional_details": additional_details
79
- }
80
  }
81
-
82
- if pd.notna(row.get('source_organization_url')):
83
- parsed_item["raw_data"]["source_metadata"]["source_organization_url"] = row['source_organization_url']
84
- if pd.notna(row.get('source_organization_logo_url')):
85
- parsed_item["raw_data"]["source_metadata"]["source_organization_logo_url"] = row['source_organization_logo_url']
86
- if pd.notna(row.get('model_inference_platform')):
87
- parsed_item["raw_data"]["model_info"]["inference_platform"] = row['model_inference_platform']
88
-
89
  parsed_items.append(parsed_item)
90
-
91
  HF_DATASET_CACHE[split_name] = parsed_items
92
-
93
  print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
94
  _build_search_index()
95
  return True
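For reference, a minimal sketch of the datasets calls this loader builds on; the dataset id is taken from this file, and the columns you get back depend on the published parquet:

from datasets import load_dataset

# load_dataset returns a DatasetDict; each split is treated as one leaderboard here.
dataset = load_dataset("evaleval/every_eval_ever")
for split_name, split_data in dataset.items():
    df = split_data.to_pandas()  # one row per evaluation record
    print(split_name, len(df), list(df.columns)[:5])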
@@ -102,20 +166,39 @@ def load_hf_dataset_on_startup():
102
  def parse_eval_json(file_path):
103
  """Parses a single JSON file to extract model, provider, and results."""
104
  try:
105
- with open(file_path, 'r') as f:
106
  data = json.load(f)
107
-
108
- leaderboard_name = data.get("evaluation_source", {}).get("evaluation_source_name", "Unknown Leaderboard")
109
- provider_name = data.get("source_metadata", {}).get("source_organization_name", "Unknown Provider")
110
- model_id = data.get("model_info", {}).get("id", "Unknown Model")
111
- developer_name = data.get("model_info", {}).get("developer", "Unknown Developer")
112
-
113
- params = data.get("model_info", {}).get("params_billions", None)
114
- architecture = data.get("model_info", {}).get("architecture", "Unknown")
115
- precision = data.get("additional_details", {}).get("precision", "Unknown")
 
 
 
116
  if precision == "Unknown":
117
- precision = data.get("model_info", {}).get("precision", "Unknown")
118
-
119
  results = {}
120
  if "evaluation_results" in data:
121
  for res in data["evaluation_results"]:
@@ -123,7 +206,7 @@ def parse_eval_json(file_path):
123
  score = res.get("score_details", {}).get("score", None)
124
  if score is not None:
125
  results[eval_name] = score
126
-
127
  return {
128
  "leaderboard": leaderboard_name,
129
  "provider": provider_name,
@@ -133,7 +216,7 @@ def parse_eval_json(file_path):
133
  "architecture": architecture,
134
  "precision": precision,
135
  "results": results,
136
- "raw_data": data
137
  }
138
  except Exception as e:
139
  print(f"Error parsing {file_path}: {e}")
@@ -144,7 +227,7 @@ def get_available_leaderboards():
144
  """Returns available leaderboards from HF cache or local directory."""
145
  if HF_DATASET_CACHE:
146
  return list(HF_DATASET_CACHE.keys())
147
-
148
  if not DATA_DIR.exists():
149
  return []
150
  return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
@@ -162,102 +245,135 @@ def get_eval_metadata(selected_leaderboard):
162
  """Extracts evaluation metadata from the leaderboard data."""
163
  if not selected_leaderboard:
164
  return {}
165
-
166
  eval_metadata = {"evals": {}, "source_info": {}}
167
-
168
  if selected_leaderboard in HF_DATASET_CACHE:
169
  parsed_items = HF_DATASET_CACHE[selected_leaderboard]
170
  if parsed_items:
171
  parsed = parsed_items[0]
172
-
173
- source_meta = parsed["raw_data"].get("source_metadata", {})
174
- source_data_list = parsed["raw_data"].get("source_data", [])
175
- url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
176
-
 
 
 
 
 
 
177
  eval_metadata["source_info"] = {
178
  "organization": source_meta.get("source_organization_name", "Unknown"),
179
  "relationship": source_meta.get("evaluator_relationship", "Unknown"),
180
- "url": url
181
  }
182
-
183
  if "evaluation_results" in parsed["raw_data"]:
184
  for res in parsed["raw_data"]["evaluation_results"]:
185
  eval_name = res.get("evaluation_name", "Unknown Metric")
186
  if eval_name not in eval_metadata["evals"]:
187
  metric_config = res.get("metric_config", {})
188
  eval_metadata["evals"][eval_name] = {
189
- "description": metric_config.get("evaluation_description", "No description available"),
 
 
190
  "score_type": metric_config.get("score_type", "unknown"),
191
- "lower_is_better": metric_config.get("lower_is_better", False),
 
 
192
  "min_score": metric_config.get("min_score"),
193
  "max_score": metric_config.get("max_score"),
194
  "level_names": metric_config.get("level_names", []),
195
  "level_metadata": metric_config.get("level_metadata", []),
196
- "has_unknown_level": metric_config.get("has_unknown_level", False)
 
 
197
  }
198
  return eval_metadata
199
-
200
  # Fall back to file system
201
  for json_file in walk_eval_files(selected_leaderboard):
202
  parsed = parse_eval_json(json_file)
203
  if parsed:
204
  if not eval_metadata["source_info"]:
205
- source_meta = parsed["raw_data"].get("source_metadata", {})
206
- source_data_list = parsed["raw_data"].get("source_data", [])
207
- url = source_data_list[0] if isinstance(source_data_list, list) and source_data_list else "#"
208
-
209
- eval_metadata["source_info"] = {
210
- "organization": source_meta.get("source_organization_name", "Unknown"),
211
- "relationship": source_meta.get("evaluator_relationship", "Unknown"),
212
- "url": url
213
- }
214
-
 
 
 
 
 
 
 
 
215
  if "evaluation_results" in parsed["raw_data"]:
216
  for res in parsed["raw_data"]["evaluation_results"]:
217
  eval_name = res.get("evaluation_name", "Unknown Metric")
218
  if eval_name not in eval_metadata["evals"]:
219
  metric_config = res.get("metric_config", {})
220
  eval_metadata["evals"][eval_name] = {
221
- "description": metric_config.get("evaluation_description", "No description available"),
 
 
222
  "score_type": metric_config.get("score_type", "unknown"),
223
- "lower_is_better": metric_config.get("lower_is_better", False),
 
 
224
  "min_score": metric_config.get("min_score"),
225
  "max_score": metric_config.get("max_score"),
226
  "level_names": metric_config.get("level_names", []),
227
  "level_metadata": metric_config.get("level_metadata", []),
228
- "has_unknown_level": metric_config.get("has_unknown_level", False)
 
 
229
  }
230
  break
231
-
232
  return eval_metadata
233
 
234
 
235
- def build_leaderboard_table(selected_leaderboard, search_query="", progress_callback=None):
 
 
236
  """Builds the leaderboard DataFrame from cache or files."""
237
  if not selected_leaderboard:
238
  return pd.DataFrame()
239
-
240
  if selected_leaderboard in LEADERBOARD_CACHE:
241
  df, _ = LEADERBOARD_CACHE[selected_leaderboard]
242
  else:
243
  rows = []
244
-
245
  if selected_leaderboard in HF_DATASET_CACHE:
246
  if progress_callback:
247
- progress_callback(0, desc=f"Loading {selected_leaderboard} from cache...")
248
-
 
 
249
  parsed_items = HF_DATASET_CACHE[selected_leaderboard]
250
-
251
  for i, parsed in enumerate(parsed_items):
252
  if i % 100 == 0 and progress_callback:
253
- progress_callback((i / len(parsed_items)), desc=f"Processing {selected_leaderboard}...")
254
-
 
 
 
255
  row = {
256
- "Model": parsed["model"],
257
  "Developer": parsed["developer"],
258
  "Params (B)": parsed["params"],
259
  "Arch": parsed["architecture"],
260
- "Precision": parsed["precision"]
261
  }
262
  row.update(parsed["results"])
263
  rows.append(row)
@@ -265,41 +381,45 @@ def build_leaderboard_table(selected_leaderboard, search_query="", progress_call
265
  # Fall back to file system
266
  if progress_callback:
267
  progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
268
-
269
  all_files = list(walk_eval_files(selected_leaderboard))
270
  total_files = len(all_files)
271
-
272
  for i, json_file in enumerate(all_files):
273
  if i % 100 == 0 and progress_callback:
274
- progress_callback((i / total_files), desc=f"Loading {selected_leaderboard}...")
275
-
 
 
276
  parsed = parse_eval_json(json_file)
277
  if parsed:
278
  row = {
279
- "Model": parsed["model"],
280
  "Developer": parsed["developer"],
281
  "Params (B)": parsed["params"],
282
  "Arch": parsed["architecture"],
283
- "Precision": parsed["precision"]
284
  }
285
  row.update(parsed["results"])
286
  rows.append(row)
287
-
288
  if not rows:
289
- df = pd.DataFrame(columns=["Model", "Developer", "Params (B)", "Arch", "Precision"])
 
 
290
  LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
291
  return df
292
-
293
  df = pd.DataFrame(rows)
294
- df = df.dropna(axis=1, how='all')
295
-
296
  if df.empty:
297
- LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
298
- return df
299
 
300
- numeric_cols = df.select_dtypes(include=['float', 'int']).columns
301
  df[numeric_cols] = df[numeric_cols].round(2)
302
-
303
  # Add Average Score
304
  eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
305
  if len(eval_only_cols) > 0:
@@ -310,18 +430,20 @@ def build_leaderboard_table(selected_leaderboard, search_query="", progress_call
310
  # Model detail columns: Arch, Precision (moved to end)
311
  base_cols = ["Model", "Developer", "Params (B)", "Average"]
312
  model_detail_cols = ["Arch", "Precision"]
313
- eval_cols = [c for c in df.columns if c not in base_cols and c not in model_detail_cols]
 
 
314
  base_cols = [c for c in base_cols if c in df.columns]
315
  model_detail_cols = [c for c in model_detail_cols if c in df.columns]
316
-
317
  final_cols = base_cols + sorted(eval_cols) + model_detail_cols
318
  df = df[final_cols]
319
-
320
  if "Average" in df.columns:
321
  df = df.sort_values("Average", ascending=False)
322
-
323
  LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
324
-
325
  return df
326
 
327
 
@@ -357,10 +479,10 @@ def get_model_suggestions_fast(query, limit=15):
357
  global LAST_QUERY, LAST_RESULTS
358
  if not query or len(query) < 2 or not MODEL_SEARCH_INDEX:
359
  return []
360
-
361
  query_lower = query.lower()
362
  results = []
363
-
364
  # Incremental reuse: if user keeps typing the same prefix, reuse last pool
365
  base_pool = None
366
  if LAST_QUERY and query_lower.startswith(LAST_QUERY) and LAST_RESULTS:
@@ -368,7 +490,7 @@ def get_model_suggestions_fast(query, limit=15):
368
  else:
369
  prefix_key = query_lower[:2]
370
  base_pool = MODEL_PREFIX_MAP.get(prefix_key, MODEL_SEARCH_INDEX)
371
-
372
  # 1) Prefix match on model names
373
  if base_pool is MODEL_SEARCH_INDEX:
374
  idx = bisect_left(MODEL_SEARCH_INDEX, (query_lower,))
@@ -385,7 +507,7 @@ def get_model_suggestions_fast(query, limit=15):
385
  results.append((0, len(name_lower), name_orig))
386
  if len(results) >= limit:
387
  break
388
-
389
  # 2) Substring fallback on the narrowed pool
390
  if len(results) < limit:
391
  seen = {r[2] for r in results}
@@ -402,13 +524,13 @@ def get_model_suggestions_fast(query, limit=15):
402
  results.append((pos, len(name_lower), name_orig))
403
  if len(results) >= limit * 2:
404
  break
405
-
406
  results.sort(key=lambda x: (x[0], x[1]))
407
-
408
  # Update incremental cache
409
  LAST_QUERY = query_lower
410
  LAST_RESULTS = base_pool if base_pool is not None else MODEL_SEARCH_INDEX
411
-
412
  return [r[2] for r in results[:limit]]
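The prefix match above assumes MODEL_SEARCH_INDEX is a sorted list of (lowercased name, original name) pairs, which matches how it is unpacked here; _build_search_index itself is not part of this diff. A self-contained sketch of the bisect_left scan:

from bisect import bisect_left

MODEL_SEARCH_INDEX = sorted([
    ("llama-3-70b", "Llama-3-70B"),
    ("llama-3-8b", "Llama-3-8B"),
    ("mistral-7b", "Mistral-7B"),
])

def prefix_matches(query, limit=10):
    q = query.lower()
    out = []
    # bisect_left finds the first tuple >= (q,); every entry from there on
    # that still startswith(q) is a prefix match, so we stop at the first miss.
    idx = bisect_left(MODEL_SEARCH_INDEX, (q,))
    for name_lower, name_orig in MODEL_SEARCH_INDEX[idx:]:
        if not name_lower.startswith(q):
            break
        out.append(name_orig)
        if len(out) >= limit:
            break
    return out

print(prefix_matches("llama"))  # ['Llama-3-70B', 'Llama-3-8B']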
413
 
414
 
@@ -416,10 +538,10 @@ def search_model_across_leaderboards(model_query):
416
  """Search for a model across all leaderboards and return aggregated results."""
417
  if not model_query or not HF_DATASET_CACHE:
418
  return {}, []
419
-
420
  # Use fast fuzzy search for suggestions
421
  matches = get_model_suggestions_fast(model_query, limit=20)
422
-
423
  # Get detailed results only for matched models
424
  results = {}
425
  for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
@@ -433,9 +555,9 @@ def search_model_across_leaderboards(model_query):
433
  "params": item.get("params"),
434
  "architecture": item.get("architecture"),
435
  "precision": item.get("precision"),
436
- "results": item.get("results", {})
437
  }
438
-
439
  return results, matches
440
 
441
 
@@ -443,11 +565,10 @@ def get_all_model_names():
443
  """Get all unique model names across all leaderboards."""
444
  if not HF_DATASET_CACHE:
445
  return []
446
-
447
  models = set()
448
  for parsed_items in HF_DATASET_CACHE.values():
449
  for item in parsed_items:
450
  models.add(item.get("model", ""))
451
-
452
- return sorted(models)
453
 
 
 
1
  """
2
  Data Loader: Load from HuggingFace, parse JSON files, and build tables.
3
  """
4
+
5
  import json
6
  import pandas as pd
7
  from pathlib import Path
 
27
  print("Loading dataset from HuggingFace...")
28
  try:
29
  dataset = load_dataset("evaleval/every_eval_ever")
30
+
31
  for split_name, split_data in dataset.items():
32
  print(f"Loading split: {split_name} ({len(split_data)} rows)")
33
+
34
  df = split_data.to_pandas()
35
  parsed_items = []
36
+
37
  for _, row in df.iterrows():
38
+ # New schema (v0.1.0) stores complex fields as JSON strings in parquet
39
+ # and uses unified top-level keys that mirror eval.schema.json.
40
+ #
41
+ # We keep this flexible so it works with both the old dataset
42
+ # (flattened columns) and the new one (JSON columns).
43
+ def _safe_json_load(value):
44
+ if isinstance(value, str):
45
+ try:
46
+ return json.loads(value)
47
+ except Exception:
48
+ return value
49
+ return value
50
+
51
+ # --- Core structured fields ---
52
+ evaluation_results = (
53
+ _safe_json_load(row.get("evaluation_results", "[]")) or []
54
+ )
55
+ source_metadata = (
56
+ _safe_json_load(row.get("source_metadata", "{}")) or {}
57
+ )
58
+ source_data = _safe_json_load(row.get("source_data", "[]")) or []
59
+ model_info = _safe_json_load(row.get("model_info", "{}")) or {}
60
+
61
+ # Some older parquet versions had flattened columns instead of JSON blobs.
62
+ # We transparently patch those into the new structure if present.
63
+ if not source_metadata:
64
+ # Old columns: source_organization_name, evaluator_relationship,
65
+ # source_organization_url, source_organization_logo_url, evaluation_source_name, evaluation_source_type
66
+ sm = {}
67
+ if pd.notna(row.get("evaluation_source_name", None)):
68
+ sm["source_name"] = row["evaluation_source_name"]
69
+ if pd.notna(row.get("evaluation_source_type", None)):
70
+ sm["source_type"] = row["evaluation_source_type"]
71
+ if pd.notna(row.get("source_organization_name", None)):
72
+ sm["source_organization_name"] = row["source_organization_name"]
73
+ if pd.notna(row.get("source_organization_url", None)):
74
+ sm["source_organization_url"] = row["source_organization_url"]
75
+ if pd.notna(row.get("source_organization_logo_url", None)):
76
+ sm["source_organization_logo_url"] = row[
77
+ "source_organization_logo_url"
78
+ ]
79
+ if pd.notna(row.get("evaluator_relationship", None)):
80
+ sm["evaluator_relationship"] = row["evaluator_relationship"]
81
+ source_metadata = sm
82
+
83
+ if not source_data:
84
+ # Old schema used `source_data` as list of URLs already; if we see a
85
+ # plain string, wrap it into a list for consistency.
86
+ raw_sd = row.get("source_data")
87
+ if isinstance(raw_sd, str) and raw_sd:
88
+ source_data = [raw_sd]
89
+
90
+ if not model_info:
91
+ # Old flattened model columns
92
+ mi = {}
93
+ if pd.notna(row.get("model_name", None)):
94
+ mi["name"] = row["model_name"]
95
+ if pd.notna(row.get("model_id", None)):
96
+ mi["id"] = row["model_id"]
97
+ if pd.notna(row.get("model_developer", None)):
98
+ mi["developer"] = row["model_developer"]
99
+ if pd.notna(row.get("model_inference_platform", None)):
100
+ mi["inference_platform"] = row["model_inference_platform"]
101
+ model_info = mi
102
+
103
+ additional_details = {}
104
+ # New schema: additional_details lives inside model_info
105
+ if isinstance(model_info, dict):
106
+ additional_details = model_info.get("additional_details") or {}
107
+
108
+ # Old schema sometimes had an `additional_details` top-level column
109
+ # with JSON, we still honour that as a source of params/precision/arch.
110
+ if not additional_details and pd.notna(
111
+ row.get("additional_details", None)
112
+ ):
113
+ additional_details = (
114
+ _safe_json_load(row["additional_details"]) or {}
115
+ )
116
+
117
  results = {}
118
  for eval_result in evaluation_results:
119
  eval_name = eval_result.get("evaluation_name")
120
  score = eval_result.get("score_details", {}).get("score")
121
  if eval_name and score is not None:
122
  results[eval_name] = score
123
+
 
 
 
 
124
  parsed_item = {
125
+ "leaderboard": row.get("_leaderboard", "unknown_leaderboard"),
126
+ # Provider is the organization owning the source/leaderboard
127
+ "provider": source_metadata.get(
128
+ "source_organization_name", "Unknown Provider"
129
+ ),
130
+ # Prefer the canonical model id from the new schema; fall back to old columns
131
+ "model": model_info.get("id")
132
+ or row.get("_model")
133
+ or row.get("model_id", "Unknown Model"),
134
+ "developer": model_info.get("developer")
135
+ or row.get("_developer")
136
+ or row.get("model_developer", "Unknown Developer"),
137
+ "params": additional_details.get("params_billions"),
138
+ "architecture": additional_details.get("architecture", "Unknown"),
139
+ "precision": additional_details.get("precision", "Unknown"),
140
  "results": results,
141
  "raw_data": {
142
+ "schema_version": row.get("schema_version"),
143
+ "evaluation_id": row.get("evaluation_id"),
144
+ "retrieved_timestamp": row.get("retrieved_timestamp"),
145
+ "source_data": source_data,
146
+ "source_metadata": source_metadata,
147
+ "model_info": model_info,
 
 
 
 
 
 
 
 
 
 
 
148
  "evaluation_results": evaluation_results,
149
+ "additional_details": additional_details,
150
+ },
151
  }
152
+
 
 
 
 
 
 
 
153
  parsed_items.append(parsed_item)
154
+
155
  HF_DATASET_CACHE[split_name] = parsed_items
156
+
157
  print(f"Loaded {len(HF_DATASET_CACHE)} leaderboard(s) from HuggingFace")
158
  _build_search_index()
159
  return True
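A small illustration of the _safe_json_load helper introduced above: JSON strings coming out of the parquet are decoded, while values that are already structured, or that fail to parse, pass through unchanged. The sample row is invented:

import json

def _safe_json_load(value):
    if isinstance(value, str):
        try:
            return json.loads(value)
        except Exception:
            return value
    return value

row = {
    # new-style parquet: complex fields stored as JSON strings
    "model_info": '{"id": "org/model-a", "developer": "Example Org"}',
    # old-style parquet: already-flattened scalar column
    "model_developer": "Example Org",
}

print(_safe_json_load(row["model_info"]))       # {'id': 'org/model-a', 'developer': 'Example Org'}
print(_safe_json_load(row["model_developer"]))  # 'Example Org' (returned as-is)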
 
166
  def parse_eval_json(file_path):
167
  """Parses a single JSON file to extract model, provider, and results."""
168
  try:
169
+ with open(file_path, "r") as f:
170
  data = json.load(f)
171
+
172
+ # New schema (v0.1.0) removes `evaluation_source` and moves most
173
+ # metadata into `source_metadata` and `model_info.additional_details`.
174
+ source_meta = data.get("source_metadata", {}) or {}
175
+ model_info = data.get("model_info", {}) or {}
176
+
177
+ # Leaderboard name:
178
+ # - new schema: source_metadata.source_name
179
+ # - old schema: evaluation_source.evaluation_source_name
180
+ leaderboard_name = source_meta.get("source_name")
181
+ if not leaderboard_name:
182
+ leaderboard_name = data.get("evaluation_source", {}).get(
183
+ "evaluation_source_name", "Unknown Leaderboard"
184
+ )
185
+
186
+ provider_name = source_meta.get("source_organization_name", "Unknown Provider")
187
+ model_id = model_info.get("id", "Unknown Model")
188
+ developer_name = model_info.get("developer", "Unknown Developer")
189
+
190
+ # Model-level details:
191
+ additional_details = model_info.get("additional_details") or {}
192
+ # Backwards compatibility with old layout
193
+ if not additional_details:
194
+ additional_details = data.get("additional_details", {}) or {}
195
+
196
+ params = additional_details.get("params_billions")
197
+ architecture = additional_details.get("architecture", "Unknown")
198
+ precision = additional_details.get("precision", "Unknown")
199
  if precision == "Unknown":
200
+ precision = model_info.get("precision", "Unknown")
201
+
202
  results = {}
203
  if "evaluation_results" in data:
204
  for res in data["evaluation_results"]:
 
206
  score = res.get("score_details", {}).get("score", None)
207
  if score is not None:
208
  results[eval_name] = score
209
+
210
  return {
211
  "leaderboard": leaderboard_name,
212
  "provider": provider_name,
 
216
  "architecture": architecture,
217
  "precision": precision,
218
  "results": results,
219
+ "raw_data": data,
220
  }
221
  except Exception as e:
222
  print(f"Error parsing {file_path}: {e}")
 
227
  """Returns available leaderboards from HF cache or local directory."""
228
  if HF_DATASET_CACHE:
229
  return list(HF_DATASET_CACHE.keys())
230
+
231
  if not DATA_DIR.exists():
232
  return []
233
  return [d.name for d in DATA_DIR.iterdir() if d.is_dir()]
 
245
  """Extracts evaluation metadata from the leaderboard data."""
246
  if not selected_leaderboard:
247
  return {}
248
+
249
  eval_metadata = {"evals": {}, "source_info": {}}
250
+
251
  if selected_leaderboard in HF_DATASET_CACHE:
252
  parsed_items = HF_DATASET_CACHE[selected_leaderboard]
253
  if parsed_items:
254
  parsed = parsed_items[0]
255
+
256
+ source_meta = parsed["raw_data"].get("source_metadata", {}) or {}
257
+ source_data_val = parsed["raw_data"].get("source_data", [])
258
+
259
+ # source_data can be either:
260
+ # - list[str] (URLs) OR
261
+ # - object describing a HF dataset. For the latter, we skip the URL.
262
+ url = "#"
263
+ if isinstance(source_data_val, list) and source_data_val:
264
+ url = source_data_val[0]
265
+
266
  eval_metadata["source_info"] = {
267
  "organization": source_meta.get("source_organization_name", "Unknown"),
268
  "relationship": source_meta.get("evaluator_relationship", "Unknown"),
269
+ "url": url,
270
  }
271
+
272
  if "evaluation_results" in parsed["raw_data"]:
273
  for res in parsed["raw_data"]["evaluation_results"]:
274
  eval_name = res.get("evaluation_name", "Unknown Metric")
275
  if eval_name not in eval_metadata["evals"]:
276
  metric_config = res.get("metric_config", {})
277
  eval_metadata["evals"][eval_name] = {
278
+ "description": metric_config.get(
279
+ "evaluation_description", "No description available"
280
+ ),
281
  "score_type": metric_config.get("score_type", "unknown"),
282
+ "lower_is_better": metric_config.get(
283
+ "lower_is_better", False
284
+ ),
285
  "min_score": metric_config.get("min_score"),
286
  "max_score": metric_config.get("max_score"),
287
  "level_names": metric_config.get("level_names", []),
288
  "level_metadata": metric_config.get("level_metadata", []),
289
+ "has_unknown_level": metric_config.get(
290
+ "has_unknown_level", False
291
+ ),
292
  }
293
  return eval_metadata
294
+
295
  # Fall back to file system
296
  for json_file in walk_eval_files(selected_leaderboard):
297
  parsed = parse_eval_json(json_file)
298
  if parsed:
299
  if not eval_metadata["source_info"]:
300
+ source_meta = parsed["raw_data"].get("source_metadata", {})
301
+ source_data_list = parsed["raw_data"].get("source_data", [])
302
+ url = (
303
+ source_data_list[0]
304
+ if isinstance(source_data_list, list) and source_data_list
305
+ else "#"
306
+ )
307
+
308
+ eval_metadata["source_info"] = {
309
+ "organization": source_meta.get(
310
+ "source_organization_name", "Unknown"
311
+ ),
312
+ "relationship": source_meta.get(
313
+ "evaluator_relationship", "Unknown"
314
+ ),
315
+ "url": url,
316
+ }
317
+
318
  if "evaluation_results" in parsed["raw_data"]:
319
  for res in parsed["raw_data"]["evaluation_results"]:
320
  eval_name = res.get("evaluation_name", "Unknown Metric")
321
  if eval_name not in eval_metadata["evals"]:
322
  metric_config = res.get("metric_config", {})
323
  eval_metadata["evals"][eval_name] = {
324
+ "description": metric_config.get(
325
+ "evaluation_description", "No description available"
326
+ ),
327
  "score_type": metric_config.get("score_type", "unknown"),
328
+ "lower_is_better": metric_config.get(
329
+ "lower_is_better", False
330
+ ),
331
  "min_score": metric_config.get("min_score"),
332
  "max_score": metric_config.get("max_score"),
333
  "level_names": metric_config.get("level_names", []),
334
  "level_metadata": metric_config.get("level_metadata", []),
335
+ "has_unknown_level": metric_config.get(
336
+ "has_unknown_level", False
337
+ ),
338
  }
339
  break
340
+
341
  return eval_metadata
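The dict this function returns, sketched for a single metric; the key names come from the code above, the values are purely illustrative:

eval_metadata = {
    "source_info": {
        "organization": "Example Org",
        "relationship": "third_party",
        "url": "https://example.org/leaderboard",
    },
    "evals": {
        "MMLU": {
            "description": "No description available",
            "score_type": "accuracy",
            "lower_is_better": False,
            "min_score": 0,
            "max_score": 1,
            "level_names": [],
            "level_metadata": [],
            "has_unknown_level": False,
        },
    },
}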
342
 
343
 
344
+ def build_leaderboard_table(
345
+ selected_leaderboard, search_query="", progress_callback=None
346
+ ):
347
  """Builds the leaderboard DataFrame from cache or files."""
348
  if not selected_leaderboard:
349
  return pd.DataFrame()
350
+
351
  if selected_leaderboard in LEADERBOARD_CACHE:
352
  df, _ = LEADERBOARD_CACHE[selected_leaderboard]
353
  else:
354
  rows = []
355
+
356
  if selected_leaderboard in HF_DATASET_CACHE:
357
  if progress_callback:
358
+ progress_callback(
359
+ 0, desc=f"Loading {selected_leaderboard} from cache..."
360
+ )
361
+
362
  parsed_items = HF_DATASET_CACHE[selected_leaderboard]
363
+
364
  for i, parsed in enumerate(parsed_items):
365
  if i % 100 == 0 and progress_callback:
366
+ progress_callback(
367
+ (i / len(parsed_items)),
368
+ desc=f"Processing {selected_leaderboard}...",
369
+ )
370
+
371
  row = {
372
+ "Model": parsed["model"],
373
  "Developer": parsed["developer"],
374
  "Params (B)": parsed["params"],
375
  "Arch": parsed["architecture"],
376
+ "Precision": parsed["precision"],
377
  }
378
  row.update(parsed["results"])
379
  rows.append(row)
 
381
  # Fall back to file system
382
  if progress_callback:
383
  progress_callback(0, desc=f"Scanning {selected_leaderboard}...")
384
+
385
  all_files = list(walk_eval_files(selected_leaderboard))
386
  total_files = len(all_files)
387
+
388
  for i, json_file in enumerate(all_files):
389
  if i % 100 == 0 and progress_callback:
390
+ progress_callback(
391
+ (i / total_files), desc=f"Loading {selected_leaderboard}..."
392
+ )
393
+
394
  parsed = parse_eval_json(json_file)
395
  if parsed:
396
  row = {
397
+ "Model": parsed["model"],
398
  "Developer": parsed["developer"],
399
  "Params (B)": parsed["params"],
400
  "Arch": parsed["architecture"],
401
+ "Precision": parsed["precision"],
402
  }
403
  row.update(parsed["results"])
404
  rows.append(row)
405
+
406
  if not rows:
407
+ df = pd.DataFrame(
408
+ columns=["Model", "Developer", "Params (B)", "Arch", "Precision"]
409
+ )
410
  LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
411
  return df
412
+
413
  df = pd.DataFrame(rows)
414
+ df = df.dropna(axis=1, how="all")
415
+
416
  if df.empty:
417
+ LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
418
+ return df
419
 
420
+ numeric_cols = df.select_dtypes(include=["float", "int"]).columns
421
  df[numeric_cols] = df[numeric_cols].round(2)
422
+
423
  # Add Average Score
424
  eval_only_cols = [c for c in numeric_cols if c not in ["Params (B)"]]
425
  if len(eval_only_cols) > 0:
 
430
  # Model detail columns: Arch, Precision (moved to end)
431
  base_cols = ["Model", "Developer", "Params (B)", "Average"]
432
  model_detail_cols = ["Arch", "Precision"]
433
+ eval_cols = [
434
+ c for c in df.columns if c not in base_cols and c not in model_detail_cols
435
+ ]
436
  base_cols = [c for c in base_cols if c in df.columns]
437
  model_detail_cols = [c for c in model_detail_cols if c in df.columns]
438
+
439
  final_cols = base_cols + sorted(eval_cols) + model_detail_cols
440
  df = df[final_cols]
441
+
442
  if "Average" in df.columns:
443
  df = df.sort_values("Average", ascending=False)
444
+
445
  LEADERBOARD_CACHE[selected_leaderboard] = (df, None)
446
+
447
  return df
448
 
449
 
 
479
  global LAST_QUERY, LAST_RESULTS
480
  if not query or len(query) < 2 or not MODEL_SEARCH_INDEX:
481
  return []
482
+
483
  query_lower = query.lower()
484
  results = []
485
+
486
  # Incremental reuse: if user keeps typing the same prefix, reuse last pool
487
  base_pool = None
488
  if LAST_QUERY and query_lower.startswith(LAST_QUERY) and LAST_RESULTS:
 
490
  else:
491
  prefix_key = query_lower[:2]
492
  base_pool = MODEL_PREFIX_MAP.get(prefix_key, MODEL_SEARCH_INDEX)
493
+
494
  # 1) Prefix match on model names
495
  if base_pool is MODEL_SEARCH_INDEX:
496
  idx = bisect_left(MODEL_SEARCH_INDEX, (query_lower,))
 
507
  results.append((0, len(name_lower), name_orig))
508
  if len(results) >= limit:
509
  break
510
+
511
  # 2) Substring fallback on the narrowed pool
512
  if len(results) < limit:
513
  seen = {r[2] for r in results}
 
524
  results.append((pos, len(name_lower), name_orig))
525
  if len(results) >= limit * 2:
526
  break
527
+
528
  results.sort(key=lambda x: (x[0], x[1]))
529
+
530
  # Update incremental cache
531
  LAST_QUERY = query_lower
532
  LAST_RESULTS = base_pool if base_pool is not None else MODEL_SEARCH_INDEX
533
+
534
  return [r[2] for r in results[:limit]]
535
 
536
 
 
538
  """Search for a model across all leaderboards and return aggregated results."""
539
  if not model_query or not HF_DATASET_CACHE:
540
  return {}, []
541
+
542
  # Use fast fuzzy search for suggestions
543
  matches = get_model_suggestions_fast(model_query, limit=20)
544
+
545
  # Get detailed results only for matched models
546
  results = {}
547
  for leaderboard_name, parsed_items in HF_DATASET_CACHE.items():
 
555
  "params": item.get("params"),
556
  "architecture": item.get("architecture"),
557
  "precision": item.get("precision"),
558
+ "results": item.get("results", {}),
559
  }
560
+
561
  return results, matches
562
 
563
 
 
565
  """Get all unique model names across all leaderboards."""
566
  if not HF_DATASET_CACHE:
567
  return []
568
+
569
  models = set()
570
  for parsed_items in HF_DATASET_CACHE.values():
571
  for item in parsed_items:
572
  models.add(item.get("model", ""))
 
 
573
 
574
+ return sorted(models)
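Note: the fast-suggestion path in data_loader.py above keeps a sorted, lowercased index and uses bisect for prefix matches, with a substring scan as a fallback. Below is a minimal, self-contained sketch of that lookup pattern; the helper names (build_index, suggest) and the sample model list are illustrative only, not the module's actual API.

from bisect import bisect_left


def build_index(model_names):
    # Sorted (lowercased, original) pairs so bisect can locate a prefix range.
    return sorted((name.lower(), name) for name in model_names)


def suggest(index, query, limit=5):
    q = query.lower()
    results = []
    # 1) Prefix matches via binary search on the lowercased names.
    i = bisect_left(index, (q,))
    while i < len(index) and index[i][0].startswith(q):
        results.append(index[i][1])
        i += 1
        if len(results) >= limit:
            return results
    # 2) Substring fallback over the rest of the index.
    seen = set(results)
    for lowered, original in index:
        if original not in seen and q in lowered:
            results.append(original)
            if len(results) >= limit:
                break
    return results


print(suggest(build_index([
    "meta-llama/Llama-3.1-8B-Instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-7B-Instruct",
]), "qwen"))  # ['Qwen/Qwen2.5-7B-Instruct']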
eval.schema.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "$schema": "http://json-schema.org/draft-07/schema#",
3
- "version": "0.0.1",
4
  "type": "object",
5
  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
6
  "required": [
7
  "schema_version",
8
  "evaluation_id",
9
- "evaluation_source",
10
  "retrieved_timestamp",
11
  "source_data",
12
  "source_metadata",
13
  "model_info",
14
  "evaluation_results"
15
  ],
 
16
  "properties": {
17
  "schema_version": {
18
  "type": "string",
@@ -20,49 +20,82 @@
20
  },
21
  "evaluation_id": {
22
  "type": "string",
23
- "description": "Unique identifier for this specific evaluation run. Use org_name/eval_name/retrieved_timestamp format"
24
  },
25
  "retrieved_timestamp": {
26
  "type": "string",
27
- "description": "Timestamp for when this record was created"
28
  },
29
  "source_data": {
30
- "type": "array",
31
- "description": "URLs for the source of the evaluation data",
32
- "items": {
33
- "type": "string"
34
- }
35
- },
36
- "evaluation_source": {
37
- "type": "object",
38
- "description": "Details about evaluation origin. There are options that evaluations come from leaderboards (e.g. Live Code Bench Pro) or evaluation platforms (e.g. lm-eval, inspect ai, HELM...).",
39
- "required": [
40
- "evaluation_source_name",
41
- "evaluation_source_type"
42
- ],
43
- "properties": {
44
- "evaluation_source_name": {
45
- "type": "string",
46
- "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation."
47
  },
48
- "evaluation_source_type": {
49
- "type": "string",
50
- "enum": [
51
- "leaderboard",
52
- "evaluation_platform"
53
  ],
54
- "description": "Type of evaluation source, e.g., leaderboard or evaluation platform"
55
  }
56
- }
57
  },
58
  "source_metadata": {
59
  "type": "object",
60
  "description": "Metadata about the source of the leaderboard data",
61
  "required": [
 
62
  "source_organization_name",
63
  "evaluator_relationship"
64
  ],
65
  "properties": {
 
 
 
 
 
 
 
 
 
 
 
 
66
  "source_organization_name": {
67
  "type": "string",
68
  "description": "Name of the organization that provides the data"
@@ -101,7 +134,7 @@
101
  },
102
  "id": {
103
  "type": "string",
104
- "description": "Model name standarized to HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct)"
105
  },
106
  "developer": {
107
  "type": "string",
@@ -109,7 +142,16 @@
109
  },
110
  "inference_platform": {
111
  "type": "string",
112
- "description": "Description of platform used to run the evaluations (e.g. local machine, Bedrock)"
 
 
 
 
 
 
 
 
 
113
  }
114
  }
115
  },
@@ -213,7 +255,6 @@
213
  }
214
  },
215
  "score_details": {
216
- "type": "object",
217
  "description": "The score for the evaluation and related details",
218
  "required": [
219
  "score"
@@ -237,37 +278,45 @@
237
  "generation_config": {
238
  "type": "object",
239
  "generation_args": {
240
- "type": "object",
241
- "description": "Parameters used to generate results - properties may vary by model type",
242
- "properties": {
243
- "temperature": {
244
- "type": [
245
- "null",
246
- "number"
247
- ],
248
- "description": "Sampling temperature"
249
- },
250
- "top_p": {
251
- "type": [
252
- "null",
253
- "number"
254
- ],
255
- "description": "Nucleus sampling parameter"
256
- },
257
- "top_k": {
258
- "type": [
259
- "null",
260
- "number"
261
- ],
262
- "description": "Top-k sampling parameter"
263
- },
264
- "max_tokens": {
265
- "type": "integer",
266
- "minimum": 1,
267
- "description": "Maximum number of tokens to generate"
268
- }
269
  },
270
- "additionalProperties": true
271
  },
272
  "additional_details": {
273
  "type": "string",
@@ -276,7 +325,116 @@
276
  }
277
  }
278
  }
279
-
280
  }
281
  }
282
- }
 
1
  {
2
  "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "version": "0.1.0",
4
  "type": "object",
5
  "description": "Schema for storing and validating LLMs evaluation data, including model configuration, prompts, instances, Output, and evaluation metrics",
6
  "required": [
7
  "schema_version",
8
  "evaluation_id",
 
9
  "retrieved_timestamp",
10
  "source_data",
11
  "source_metadata",
12
  "model_info",
13
  "evaluation_results"
14
  ],
15
+ "additionalProperties": false,
16
  "properties": {
17
  "schema_version": {
18
  "type": "string",
 
20
  },
21
  "evaluation_id": {
22
  "type": "string",
23
+ "description": "Unique identifier for this specific evaluation run. Use eval_name/model_id/retrieved_timestamp format"
24
  },
25
  "retrieved_timestamp": {
26
  "type": "string",
27
+ "description": "Timestamp for when this record was created - using Unix Epoch time format"
28
  },
29
  "source_data": {
30
+ "description": "Source of dataset used for evaluation. There are two options supported: HuggingFace dataset or url for other data source.",
31
+ "oneOf": [
32
+ {
33
+ "type": "array",
34
+ "description": "URLs for the source of the evaluation data",
35
+ "items": {
36
+ "type": "string"
37
+ }
 
 
 
 
 
 
 
 
 
38
  },
39
+ {
40
+ "type": "object",
41
+ "description": "Details about HuggingFace dataset used for evaluation",
42
+ "required": [
43
+ "dataset_name"
44
  ],
45
+ "properties": {
46
+ "dataset_name": {
47
+ "type": "string",
48
+ "description": "Name of the source dataset"
49
+ },
50
+ "hf_repo": {
51
+ "type": "string",
52
+ "description": "HuggingFace repository identifier"
53
+ },
54
+ "hf_split": {
55
+ "type": "string",
56
+ "description": "One of train, val or test."
57
+ },
58
+ "samples_number": {
59
+ "type": "integer",
60
+ "description": "Number of samples in the dataset"
61
+ },
62
+ "sample_ids": {
63
+ "type": "array",
64
+ "description": "Array of sample ids used for evaluation",
65
+ "items": {
66
+ "type": ["integer", "string"]
67
+ }
68
+ },
69
+ "additional_details": {
70
+ "type": "object",
71
+ "description": "Additional dataset info parameters",
72
+ "additionalProperties": true
73
+ }
74
+ }
75
  }
76
+ ]
77
  },
78
  "source_metadata": {
79
  "type": "object",
80
  "description": "Metadata about the source of the leaderboard data",
81
  "required": [
82
+ "source_type",
83
  "source_organization_name",
84
  "evaluator_relationship"
85
  ],
86
  "properties": {
87
+ "source_name": {
88
+ "type": "string",
89
+ "description": "Name of the source (e.g. title of the source leaderboard or name of the platform used for the evaluation)."
90
+ },
91
+ "source_type": {
92
+ "type": "string",
93
+ "enum": [
94
+ "documentation",
95
+ "evaluation_run"
96
+ ],
97
+ "description": "Whether the data comes from a direct evaluation run or from documentation"
98
+ },
99
  "source_organization_name": {
100
  "type": "string",
101
  "description": "Name of the organization that provides the data"
 
134
  },
135
  "id": {
136
  "type": "string",
137
+ "description": "Model name in HuggingFace format (e.g. meta-llama/Llama-3.1-8B-Instruct for models available on HuggingFace or openai/azure/gpt-4o-mini-2024-07-18 for closed API models)"
138
  },
139
  "developer": {
140
  "type": "string",
 
142
  },
143
  "inference_platform": {
144
  "type": "string",
145
+ "description": "Name of inference platform which provides an access to models by API to run the evaluations or provides models weights to run them locally (e.g. HuggingFace, Bedrock, Together AI)"
146
+ },
147
+ "inference_engine": {
148
+ "type": "string",
149
+ "description": "Name of inference engine which provides an access to optimized models to use them for local evaluations (e.g. vLLM, Ollama)."
150
+ },
151
+ "additional_details": {
152
+ "type": "object",
153
+ "description": "Additional model configuration parameters",
154
+ "additionalProperties": true
155
  }
156
  }
157
  },
 
255
  }
256
  },
257
  "score_details": {
 
258
  "description": "The score for the evaluation and related details",
259
  "required": [
260
  "score"
 
278
  "generation_config": {
279
  "type": "object",
280
  "generation_args": {
281
+ "type": "object",
282
+ "description": "Parameters used to generate results - properties may vary by model type",
283
+ "properties": {
284
+ "temperature": {
285
+ "type": [
286
+ "null",
287
+ "number"
288
+ ],
289
+ "description": "Sampling temperature"
290
  },
291
+ "top_p": {
292
+ "type": [
293
+ "null",
294
+ "number"
295
+ ],
296
+ "description": "Nucleus sampling parameter"
297
+ },
298
+ "top_k": {
299
+ "type": [
300
+ "null",
301
+ "number"
302
+ ],
303
+ "description": "Top-k sampling parameter"
304
+ },
305
+ "max_tokens": {
306
+ "type": "integer",
307
+ "minimum": 1,
308
+ "description": "Maximum number of tokens to generate"
309
+ },
310
+ "execution_command": {
311
+ "type": "string",
312
+ "description": "Command used to run the model to generate results"
313
+ },
314
+ "reasoning": {
315
+ "type": "boolean",
316
+ "description": "Whether reasoning orchain-of-thought was used to generate results"
317
+ }
318
+ },
319
+ "additionalProperties": true
320
  },
321
  "additional_details": {
322
  "type": "string",
 
325
  }
326
  }
327
  }
328
+ },
329
+ "detailed_evaluation_results_per_samples": {
330
+ "description": "Detailed eval results for all individual samples in the evaluation. This can be provided as source link or list of DetailedEvaluationResultsPerSample objects.",
331
+ "anyOf": [
332
+ {
333
+ "type": "string",
334
+ "description": "Link to detailed evaluation data for all samples"
335
+ },
336
+ {
337
+ "type": "array",
338
+ "description": "Array of evaluation results",
339
+ "items": {
340
+ "type": "object",
341
+ "required": [
342
+ "sample_id",
343
+ "input",
344
+ "ground_truth",
345
+ "response"
346
+ ],
347
+ "properties": {
348
+ "sample_id": {
349
+ "type": "string",
350
+ "description": "Simple sample ID"
351
+ },
352
+ "input": {
353
+ "type": "string",
354
+ "description": "Raw input for the model"
355
+ },
356
+ "prompt": {
357
+ "type": "string",
358
+ "description": "Full prompt for the model"
359
+ },
360
+ "ground_truth": {
361
+ "description": "Target response that may include one or multiple correct answers.",
362
+ "oneOf": [
363
+ {
364
+ "type": "string"
365
+ },
366
+ {
367
+ "type": "array",
368
+ "items": {
369
+ "type": "string"
370
+ }
371
+ }
372
+ ]
373
+ },
374
+ "response": {
375
+ "type": "string",
376
+ "description": "Response from the model"
377
+ },
378
+ "choices": {
379
+ "description": "Either an array of possible responses (list of strings) or an array of string pairs [choice, response].",
380
+ "oneOf": [
381
+ {
382
+ "type": "array",
383
+ "items": {
384
+ "type": "string"
385
+ }
386
+ },
387
+ {
388
+ "type": "array",
389
+ "items": {
390
+ "type": "array",
391
+ "items": [
392
+ {
393
+ "type": "string"
394
+ },
395
+ {
396
+ "type": "string"
397
+ }
398
+ ],
399
+ "minItems": 2,
400
+ "maxItems": 2
401
+ }
402
+ }
403
+ ]
404
+ },
405
+ "full_logprobs": {
406
+ "type": "array",
407
+ "description": "Full log probabilities generated for this sample",
408
+ "items": {
409
+ "type": "array",
410
+ "items": {
411
+ "type": "object",
412
+ "required": [
413
+ "token_id",
414
+ "logprob",
415
+ "decoded_token"
416
+ ],
417
+ "properties": {
418
+ "token_id": {
419
+ "type": "number",
420
+ "description": "Id of token for which we keep its logprob"
421
+ },
422
+ "logprob": {
423
+ "type": "number",
424
+ "description": "Log probability of the token"
425
+ },
426
+ "decoded_token": {
427
+ "type": "string",
428
+ "description": "The decoded string representation of the token"
429
+ }
430
+ }
431
+ }
432
+ }
433
+ }
434
+ }
435
+ }
436
+ }
437
+ ]
438
  }
439
  }
440
+ }
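For orientation, here is a rough sketch of a record shaped like the 0.1.0 schema above, checked with jsonschema the same way hf_operations.py does. All field values are placeholders, and nested required fields that this diff collapses (for example inside model_info and evaluation_results) may need to be filled in for full validation.

import json
from pathlib import Path

from jsonschema import Draft7Validator

record = {
    "schema_version": "0.1.0",
    "evaluation_id": "my_eval/acme/model-1/1735689600",  # placeholder id
    "retrieved_timestamp": "1735689600",                  # Unix epoch string
    "source_data": {                                      # HF-dataset variant
        "dataset_name": "my_eval_set",
        "hf_repo": "acme/my_eval_set",
        "hf_split": "test",
    },
    "source_metadata": {
        "source_name": "Acme Leaderboard",
        "source_type": "evaluation_run",
        "source_organization_name": "Acme",
        "evaluator_relationship": "third_party",          # placeholder value
    },
    "model_info": {"name": "model-1", "id": "acme/model-1", "developer": "Acme"},
    "evaluation_results": [],                             # fill with real results
}

schema = json.loads(Path("eval.schema.json").read_text())
for error in Draft7Validator(schema).iter_errors(record):
    print(error.message)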
hf_operations.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  HuggingFace Operations: Upload data, create PRs, validate schemas.
3
  """
 
4
  from huggingface_hub import HfApi, login
5
  import pandas as pd
6
  import json
@@ -10,17 +11,17 @@ from jsonschema import validate, ValidationError, Draft7Validator
10
 
11
  # Load schema once at module level
12
  SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
13
- with open(SCHEMA_PATH, 'r') as f:
14
  EVAL_SCHEMA = json.load(f)
15
-
16
 
17
  def validate_json_against_schema(json_data):
18
  """
19
  Validate a JSON object against eval.schema.json.
20
-
21
  Args:
22
  json_data: Dict containing the evaluation data
23
-
24
  Returns:
25
  (bool, str): (is_valid, error_message)
26
  """
@@ -38,7 +39,7 @@ def validate_json_against_schema(json_data):
38
  def upload_to_hf_dataset(parquet_file, split_name, repo_id):
39
  """
40
  Upload a parquet file as a new split to the HF dataset.
41
-
42
  Args:
43
  parquet_file: Path to parquet file
44
  split_name: Name of the split (leaderboard name)
@@ -51,14 +52,14 @@ def upload_to_hf_dataset(parquet_file, split_name, repo_id):
51
  def check_hf_authentication():
52
  """
53
  Check if user is authenticated with HuggingFace.
54
-
55
  Returns:
56
  (bool, str): (is_authenticated, username or error_message)
57
  """
58
  try:
59
  api = HfApi()
60
  user_info = api.whoami()
61
- return True, user_info['name']
62
  except Exception as e:
63
  return False, "Not authenticated. Run: huggingface-cli login"
64
 
@@ -66,18 +67,18 @@ def check_hf_authentication():
66
  def check_duplicate_pr_exists(leaderboard_name, repo_id):
67
  """
68
  Check if a PR already exists for this leaderboard.
69
-
70
  Args:
71
  leaderboard_name: Name of the leaderboard
72
  repo_id: HuggingFace dataset repository ID
73
-
74
  Returns:
75
  (bool, str or None): (exists, pr_url if exists)
76
  """
77
  try:
78
  api = HfApi()
79
  discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
80
-
81
  # Check for open PRs with matching title
82
  pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
83
  for discussion in discussions:
@@ -85,7 +86,7 @@ def check_duplicate_pr_exists(leaderboard_name, repo_id):
85
  if pr_title_pattern in discussion.title.lower():
86
  pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
87
  return True, pr_url
88
-
89
  return False, None
90
  except Exception as e:
91
  # If we can't check, assume no duplicate (fail open)
@@ -96,12 +97,12 @@ def check_duplicate_pr_exists(leaderboard_name, repo_id):
96
  def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
97
  """
98
  Create a pull request to add a new leaderboard split.
99
-
100
  Args:
101
  leaderboard_name: Name of the new leaderboard
102
  parquet_file: Path to parquet file
103
  repo_id: HuggingFace dataset repository ID
104
-
105
  Returns:
106
  (success, pr_url or error_message)
107
  """
@@ -109,28 +110,28 @@ def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
109
  is_auth, auth_result = check_hf_authentication()
110
  if not is_auth:
111
  return False, f"❌ {auth_result}"
112
-
113
  # 2. Check for duplicate PR
114
  has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
115
  if has_duplicate:
116
  return False, f"⚠️ PR already exists: {duplicate_url}"
117
-
118
  # 3. Validate parquet file exists and has data
119
  parquet_path = Path(parquet_file)
120
  if not parquet_path.exists():
121
  return False, "❌ Parquet file not found"
122
-
123
  df = pd.read_parquet(parquet_file)
124
  if len(df) == 0:
125
  return False, "❌ Parquet file is empty"
126
-
127
  # 4. Create PR
128
  try:
129
  api = HfApi()
130
-
131
  # Upload the parquet file to the branch
132
  commit_message = f"Add new leaderboard: {leaderboard_name}"
133
-
134
  # Upload file and create PR
135
  commit_info = api.upload_file(
136
  path_or_fileobj=parquet_file,
@@ -140,12 +141,16 @@ def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
140
  commit_message=commit_message,
141
  create_pr=True,
142
  )
143
-
144
  # Extract PR URL from commit info
145
- pr_url = commit_info.pr_url if hasattr(commit_info, 'pr_url') else f"https://huggingface.co/datasets/{repo_id}/discussions"
146
-
 
 
 
 
147
  return True, f"PR created ({len(df)} rows): {pr_url}"
148
-
149
  except Exception as e:
150
  return False, f"❌ Failed to create PR: {str(e)}"
151
 
@@ -153,37 +158,50 @@ def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
153
  def validate_schema(parquet_file):
154
  """
155
  Validate that a parquet file matches the expected schema.
156
-
157
  Args:
158
  parquet_file: Path to parquet file to validate
159
-
160
  Returns:
161
  (bool, str): (is_valid, error_message)
162
  """
163
  try:
164
  df = pd.read_parquet(parquet_file)
165
-
166
  # Required columns
167
  required_cols = [
168
- '_leaderboard', '_developer', '_model', '_uuid',
169
- 'schema_version', 'evaluation_id', 'retrieved_timestamp',
170
- 'source_data', 'evaluation_source_name', 'evaluation_source_type',
171
- 'source_organization_name', 'evaluator_relationship',
172
- 'model_name', 'model_id', 'model_developer',
173
- 'evaluation_results'
174
  ]
175
-
176
  missing = [col for col in required_cols if col not in df.columns]
177
  if missing:
178
  return False, f"Missing required columns: {', '.join(missing)}"
179
-
180
  # Check data types (all should be strings)
181
  for col in df.columns:
182
- if df[col].dtype not in ['object', 'string']:
183
- return False, f"Column '{col}' has wrong type: {df[col].dtype} (expected string)"
184
-
 
 
 
185
  return True, "Schema validation passed"
186
-
187
  except Exception as e:
188
  return False, f"Validation error: {str(e)}"
189
 
@@ -192,11 +210,11 @@ def export_to_json(parquet_file, output_dir):
192
  """
193
  Export parquet data back to JSON files.
194
  Uses the parquet_to_folder function from json_to_parquet.py
195
-
196
  Args:
197
  parquet_file: Path to parquet file
198
  output_dir: Directory to write JSON files to
199
  """
200
  from json_to_parquet import parquet_to_folder
201
- parquet_to_folder(parquet_file, output_dir)
202
 
 
 
1
  """
2
  HuggingFace Operations: Upload data, create PRs, validate schemas.
3
  """
4
+
5
  from huggingface_hub import HfApi, login
6
  import pandas as pd
7
  import json
 
11
 
12
  # Load schema once at module level
13
  SCHEMA_PATH = Path(__file__).parent / "eval.schema.json"
14
+ with open(SCHEMA_PATH, "r") as f:
15
  EVAL_SCHEMA = json.load(f)
16
+
17
 
18
  def validate_json_against_schema(json_data):
19
  """
20
  Validate a JSON object against eval.schema.json.
21
+
22
  Args:
23
  json_data: Dict containing the evaluation data
24
+
25
  Returns:
26
  (bool, str): (is_valid, error_message)
27
  """
 
39
  def upload_to_hf_dataset(parquet_file, split_name, repo_id):
40
  """
41
  Upload a parquet file as a new split to the HF dataset.
42
+
43
  Args:
44
  parquet_file: Path to parquet file
45
  split_name: Name of the split (leaderboard name)
 
52
  def check_hf_authentication():
53
  """
54
  Check if user is authenticated with HuggingFace.
55
+
56
  Returns:
57
  (bool, str): (is_authenticated, username or error_message)
58
  """
59
  try:
60
  api = HfApi()
61
  user_info = api.whoami()
62
+ return True, user_info["name"]
63
  except Exception as e:
64
  return False, "Not authenticated. Run: huggingface-cli login"
65
 
 
67
  def check_duplicate_pr_exists(leaderboard_name, repo_id):
68
  """
69
  Check if a PR already exists for this leaderboard.
70
+
71
  Args:
72
  leaderboard_name: Name of the leaderboard
73
  repo_id: HuggingFace dataset repository ID
74
+
75
  Returns:
76
  (bool, str or None): (exists, pr_url if exists)
77
  """
78
  try:
79
  api = HfApi()
80
  discussions = api.get_repo_discussions(repo_id=repo_id, repo_type="dataset")
81
+
82
  # Check for open PRs with matching title
83
  pr_title_pattern = f"add new leaderboard: {leaderboard_name.lower()}"
84
  for discussion in discussions:
 
86
  if pr_title_pattern in discussion.title.lower():
87
  pr_url = f"https://huggingface.co/datasets/{repo_id}/discussions/{discussion.num}"
88
  return True, pr_url
89
+
90
  return False, None
91
  except Exception as e:
92
  # If we can't check, assume no duplicate (fail open)
 
97
  def create_pr_for_new_leaderboard(leaderboard_name, parquet_file, repo_id):
98
  """
99
  Create a pull request to add a new leaderboard split.
100
+
101
  Args:
102
  leaderboard_name: Name of the new leaderboard
103
  parquet_file: Path to parquet file
104
  repo_id: HuggingFace dataset repository ID
105
+
106
  Returns:
107
  (success, pr_url or error_message)
108
  """
 
110
  is_auth, auth_result = check_hf_authentication()
111
  if not is_auth:
112
  return False, f"❌ {auth_result}"
113
+
114
  # 2. Check for duplicate PR
115
  has_duplicate, duplicate_url = check_duplicate_pr_exists(leaderboard_name, repo_id)
116
  if has_duplicate:
117
  return False, f"⚠️ PR already exists: {duplicate_url}"
118
+
119
  # 3. Validate parquet file exists and has data
120
  parquet_path = Path(parquet_file)
121
  if not parquet_path.exists():
122
  return False, "❌ Parquet file not found"
123
+
124
  df = pd.read_parquet(parquet_file)
125
  if len(df) == 0:
126
  return False, "❌ Parquet file is empty"
127
+
128
  # 4. Create PR
129
  try:
130
  api = HfApi()
131
+
132
  # Upload the parquet file to the branch
133
  commit_message = f"Add new leaderboard: {leaderboard_name}"
134
+
135
  # Upload file and create PR
136
  commit_info = api.upload_file(
137
  path_or_fileobj=parquet_file,
 
141
  commit_message=commit_message,
142
  create_pr=True,
143
  )
144
+
145
  # Extract PR URL from commit info
146
+ pr_url = (
147
+ commit_info.pr_url
148
+ if hasattr(commit_info, "pr_url")
149
+ else f"https://huggingface.co/datasets/{repo_id}/discussions"
150
+ )
151
+
152
  return True, f"PR created ({len(df)} rows): {pr_url}"
153
+
154
  except Exception as e:
155
  return False, f"❌ Failed to create PR: {str(e)}"
156
 
 
158
  def validate_schema(parquet_file):
159
  """
160
  Validate that a parquet file matches the expected schema.
161
+
162
  Args:
163
  parquet_file: Path to parquet file to validate
164
+
165
  Returns:
166
  (bool, str): (is_valid, error_message)
167
  """
168
  try:
169
  df = pd.read_parquet(parquet_file)
170
+
171
  # Required columns
172
  required_cols = [
173
+ "_leaderboard",
174
+ "_developer",
175
+ "_model",
176
+ "_uuid",
177
+ "schema_version",
178
+ "evaluation_id",
179
+ "retrieved_timestamp",
180
+ "source_data",
181
+ "evaluation_source_name",
182
+ "evaluation_source_type",
183
+ "source_organization_name",
184
+ "evaluator_relationship",
185
+ "model_name",
186
+ "model_id",
187
+ "model_developer",
188
+ "evaluation_results",
189
  ]
190
+
191
  missing = [col for col in required_cols if col not in df.columns]
192
  if missing:
193
  return False, f"Missing required columns: {', '.join(missing)}"
194
+
195
  # Check data types (all should be strings)
196
  for col in df.columns:
197
+ if df[col].dtype not in ["object", "string"]:
198
+ return (
199
+ False,
200
+ f"Column '{col}' has wrong type: {df[col].dtype} (expected string)",
201
+ )
202
+
203
  return True, "Schema validation passed"
204
+
205
  except Exception as e:
206
  return False, f"Validation error: {str(e)}"
207
 
 
210
  """
211
  Export parquet data back to JSON files.
212
  Uses the parquet_to_folder function from json_to_parquet.py
213
+
214
  Args:
215
  parquet_file: Path to parquet file
216
  output_dir: Directory to write JSON files to
217
  """
218
  from json_to_parquet import parquet_to_folder
 
219
 
220
+ parquet_to_folder(parquet_file, output_dir)
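A hypothetical end-to-end use of the helpers above, validating a parquet export and then opening a PR; the file path and repo id are placeholders.

from hf_operations import create_pr_for_new_leaderboard, validate_schema

parquet_file = "out/my_leaderboard.parquet"  # placeholder path
repo_id = "my-org/leaderboard-evals"         # placeholder dataset repo

ok, message = validate_schema(parquet_file)
print(message)
if ok:
    created, result = create_pr_for_new_leaderboard(
        "my_leaderboard", parquet_file, repo_id
    )
    print(result)  # PR URL on success, otherwise an error message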
ui_components.py CHANGED
@@ -752,17 +752,19 @@ table tr:hover td {
752
  def format_leaderboard_header(selected_leaderboard, metadata):
753
  if not selected_leaderboard:
754
  return '<div style="text-align: center; padding: 2rem; color: #525252;">Select a leaderboard to explore</div>'
755
-
756
  if not metadata or not metadata.get("evals"):
757
  return f'<div class="info-banner"><h3>{selected_leaderboard}</h3></div>'
758
-
759
  source_info = metadata.get("source_info", {})
760
  org = source_info.get("organization", "Unknown")
761
  url = source_info.get("url", "#")
762
  eval_names = sorted(list(metadata["evals"].keys()))
763
-
764
- eval_tags = "".join([f'<span class="eval-tag">{name}</span>' for name in eval_names])
765
-
 
 
766
  return f'''
767
  <div class="info-banner">
768
  <div class="leaderboard-header">
@@ -785,14 +787,16 @@ def format_metric_details(selected_leaderboard, metadata):
785
 
786
  cards_html = ""
787
  for i, (eval_name, info) in enumerate(evals.items()):
788
- score_type = info.get('score_type', '').upper() or "—"
789
- direction = "Lower is better" if info.get('lower_is_better') else "Higher is better"
790
- arrow = "↓" if info.get('lower_is_better') else ""
 
 
791
 
792
  details = ""
793
- if info.get('score_type') == "continuous" and info.get('min_score') is not None:
794
  details = f"Range: [{info['min_score']} – {info['max_score']}]"
795
- elif info.get('score_type') == "levels" and info.get('level_names'):
796
  details = f"Levels: {', '.join(str(l) for l in info['level_names'])}"
797
 
798
  card_id = f"mc{i}"
@@ -804,7 +808,7 @@ def format_metric_details(selected_leaderboard, metadata):
804
  <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
805
  </label>
806
  <div class="metric-card-body">
807
- <div>{info.get('description', 'No description')}</div>
808
  <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
809
  <span style="font-size: 0.75rem; color: #525252;">{details}</span>
810
  <span class="metric-type-badge">{score_type}</span>
@@ -813,25 +817,25 @@ def format_metric_details(selected_leaderboard, metadata):
813
  </div>
814
  '''
815
 
816
- return f'''
817
  <div class="metrics-section">
818
  <h3>Metric Reference</h3>
819
  <div class="metrics-grid">{cards_html}</div>
820
  </div>
821
- '''
822
 
823
 
824
  def format_model_card(model_name, model_data):
825
  if not model_data:
826
  return '<div class="no-results"><h3>No results found</h3><p>Try a different model name</p></div>'
827
-
828
  first = list(model_data.values())[0]
829
  developer = first.get("developer", "Unknown")
830
  params = first.get("params")
831
  arch = first.get("architecture", "Unknown")
832
  params_str = f"{params}B" if params else "—"
833
-
834
- html = f'''
835
  <div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">
836
  <h2 style="margin: 0 0 0.5rem 0; color: #0a0a0a;">{model_name}</h2>
837
  <div style="color: #525252; margin-bottom: 1rem;">
@@ -839,80 +843,86 @@ def format_model_card(model_name, model_data):
839
  <span>Params: {params_str}</span> ·
840
  <span>Arch: {arch}</span>
841
  </div>
842
- '''
843
-
844
  for leaderboard_name, data in model_data.items():
845
  results = data.get("results", {})
846
  if not results:
847
  continue
848
-
849
  scores = [v for v in results.values() if v is not None]
850
  avg = sum(scores) / len(scores) if scores else None
851
  avg_str = f"{avg:.2f}" if avg else "—"
852
-
853
  html += f'<div style="margin-bottom: 1rem;"><h4 style="color: #0a0a0a;">{leaderboard_name} <span style="color: #525252;">(avg: {avg_str})</span></h4>'
854
  html += '<div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">'
855
-
856
- for metric_name, score in sorted(results.items(), key=lambda x: x[1] if x[1] else 0, reverse=True):
 
 
857
  score_display = f"{score:.2f}" if score is not None else "—"
858
  html += f'<div style="padding: 0.4rem 0.8rem; border-radius: 6px; background: #f5f5f5; border: 1px solid #e5e5e5;"><span style="color: #525252;">{metric_name}:</span> <strong style="color: #0a0a0a;">{score_display}</strong></div>'
859
-
860
- html += '</div></div>'
861
-
862
- html += '</div>'
863
  return html
864
 
865
 
866
  def format_model_comparison(selected_models, all_results):
867
  if not selected_models or not all_results:
868
  return '<div class="no-results"><h3>Select models to compare</h3><p>Choose models from the dropdown</p></div>'
869
-
870
  all_leaderboards = set()
871
  model_data_dict = {}
872
-
873
  for model_name in selected_models:
874
  if model_name in all_results:
875
  model_data_dict[model_name] = all_results[model_name]
876
  for lb in all_results[model_name].keys():
877
  all_leaderboards.add(lb)
878
-
879
  if not model_data_dict:
880
  return '<div class="no-results"><h3>No data found</h3></div>'
881
-
882
  all_leaderboards = sorted(all_leaderboards)
883
-
884
  html = '<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">'
885
-
886
  for leaderboard_name in all_leaderboards:
887
  metrics = set()
888
  for md in model_data_dict.values():
889
  if leaderboard_name in md:
890
  metrics.update(md[leaderboard_name].get("results", {}).keys())
891
-
892
  metrics = sorted(metrics)
893
  if not metrics:
894
  continue
895
-
896
  html += f'<h3 style="margin: 1rem 0 0.5rem; color: #0a0a0a;">{leaderboard_name}</h3>'
897
  html += '<div style="overflow-x: auto;"><table class="heatmap-table"><thead><tr><th>Metric</th>'
898
-
899
  for model_name in selected_models:
900
  short = model_name[:20] + "…" if len(model_name) > 20 else model_name
901
  html += f'<th title="{model_name}">{short}</th>'
902
- html += '</tr></thead><tbody>'
903
-
904
  for metric_name in metrics:
905
  html += f'<tr><td class="metric-name">{metric_name}</td>'
906
-
907
  scores = {}
908
  for m in selected_models:
909
  if m in model_data_dict and leaderboard_name in model_data_dict[m]:
910
- scores[m] = model_data_dict[m][leaderboard_name].get("results", {}).get(metric_name)
911
-
 
 
 
 
912
  valid = [v for v in scores.values() if v is not None]
913
  max_s = max(valid) if valid else None
914
  min_s = min(valid) if valid else None
915
-
916
  for model_name in selected_models:
917
  score = scores.get(model_name)
918
  if score is not None:
@@ -921,7 +931,15 @@ def format_model_comparison(selected_models, all_results):
921
  cls = "best"
922
  elif max_s > min_s:
923
  pct = (score - min_s) / (max_s - min_s)
924
- cls = "good" if pct >= 0.75 else "mid" if pct >= 0.5 else "low" if pct >= 0.25 else "worst"
 
 
 
 
 
 
 
 
925
  else:
926
  cls = ""
927
  else:
@@ -929,11 +947,11 @@ def format_model_comparison(selected_models, all_results):
929
  html += f'<td class="score-cell {cls}">{score:.2f}</td>'
930
  else:
931
  html += '<td class="score-cell na">—</td>'
932
- html += '</tr>'
933
-
934
- html += '</tbody></table></div>'
935
-
936
- html += '</div>'
937
  return html
938
 
939
 
@@ -947,13 +965,14 @@ def create_radar_plot(selected_models, all_results):
947
  for model in selected_models:
948
  if model not in all_results:
949
  continue
950
-
951
  model_data = all_results[model]
952
  for lb_name, lb_data in model_data.items():
953
  leaderboards_involved.add(lb_name)
954
  results = lb_data.get("results", {})
955
  for metric, score in results.items():
956
- if score is None: continue
 
957
  key = f"{lb_name}: {metric}"
958
  if key not in metric_data:
959
  metric_data[key] = {}
@@ -967,17 +986,17 @@ def create_radar_plot(selected_models, all_results):
967
  meta_cache[lb] = get_eval_metadata(lb)
968
 
969
  fig = go.Figure()
970
-
971
  categories = sorted(metric_data.keys())
972
-
973
  for model in selected_models:
974
  r_values = []
975
  theta_values = []
976
  hover_texts = []
977
-
978
  for cat in categories:
979
  lb_name, metric_name = cat.split(": ", 1)
980
-
981
  val = metric_data[cat].get(model)
982
  if val is None:
983
  r_values.append(None)
@@ -987,15 +1006,15 @@ def create_radar_plot(selected_models, all_results):
987
  meta = meta_cache.get(lb_name, {}).get("evals", {}).get(metric_name, {})
988
  min_s = meta.get("min_score")
989
  max_s = meta.get("max_score")
990
-
991
  observed_vals = []
992
  for m in selected_models:
993
  v = metric_data[cat].get(m)
994
  if v is not None:
995
  observed_vals.append(v)
996
-
997
  observed_max = max(observed_vals) if observed_vals else 1.0
998
-
999
  if min_s is None:
1000
  min_s = 0
1001
  if max_s is None:
@@ -1004,42 +1023,39 @@ def create_radar_plot(selected_models, all_results):
1004
  else:
1005
  max_s = 1
1006
  max_s = max(max_s, observed_max)
1007
-
1008
  if max_s == min_s:
1009
  norm_val = 1.0
1010
  else:
1011
  norm_val = (val - min_s) / (max_s - min_s)
1012
-
1013
  norm_val = max(0.0, min(1.0, norm_val))
1014
-
1015
  r_values.append(norm_val)
1016
  theta_values.append(cat)
1017
  hover_texts.append(f"{cat}<br>Score: {val:.2f} (Norm: {norm_val:.2f})")
1018
-
1019
  if r_values:
1020
  r_values.append(r_values[0])
1021
  theta_values.append(theta_values[0])
1022
  hover_texts.append(hover_texts[0])
1023
-
1024
- fig.add_trace(go.Scatterpolar(
1025
- r=r_values,
1026
- theta=theta_values,
1027
- name=model,
1028
- hovertext=hover_texts,
1029
- hoverinfo="text",
1030
- fill='toself'
1031
- ))
1032
 
1033
- fig.update_layout(
1034
- polar=dict(
1035
- radialaxis=dict(
1036
- visible=True,
1037
- range=[0, 1]
 
 
 
1038
  )
1039
- ),
 
 
 
1040
  showlegend=True,
1041
  margin=dict(l=80, r=80, t=20, b=20),
1042
- title="Model Comparison Radar (Normalized Scores)"
1043
  )
1044
-
1045
  return fig
 
752
  def format_leaderboard_header(selected_leaderboard, metadata):
753
  if not selected_leaderboard:
754
  return '<div style="text-align: center; padding: 2rem; color: #525252;">Select a leaderboard to explore</div>'
755
+
756
  if not metadata or not metadata.get("evals"):
757
  return f'<div class="info-banner"><h3>{selected_leaderboard}</h3></div>'
758
+
759
  source_info = metadata.get("source_info", {})
760
  org = source_info.get("organization", "Unknown")
761
  url = source_info.get("url", "#")
762
  eval_names = sorted(list(metadata["evals"].keys()))
763
+
764
+ eval_tags = "".join(
765
+ [f'<span class="eval-tag">{name}</span>' for name in eval_names]
766
+ )
767
+
768
  return f'''
769
  <div class="info-banner">
770
  <div class="leaderboard-header">
 
787
 
788
  cards_html = ""
789
  for i, (eval_name, info) in enumerate(evals.items()):
790
+ score_type = info.get("score_type", "").upper() or "—"
791
+ direction = (
792
+ "Lower is better" if info.get("lower_is_better") else "Higher is better"
793
+ )
794
+ arrow = "↓" if info.get("lower_is_better") else "↑"
795
 
796
  details = ""
797
+ if info.get("score_type") == "continuous" and info.get("min_score") is not None:
798
  details = f"Range: [{info['min_score']} – {info['max_score']}]"
799
+ elif info.get("score_type") == "levels" and info.get("level_names"):
800
  details = f"Levels: {', '.join(str(l) for l in info['level_names'])}"
801
 
802
  card_id = f"mc{i}"
 
808
  <span class="metric-card-direction"><span class="arrow">{arrow}</span> {direction}</span>
809
  </label>
810
  <div class="metric-card-body">
811
+ <div>{info.get("description", "No description")}</div>
812
  <div style="display: flex; justify-content: space-between; align-items: center; margin-top: 0.5rem;">
813
  <span style="font-size: 0.75rem; color: #525252;">{details}</span>
814
  <span class="metric-type-badge">{score_type}</span>
 
817
  </div>
818
  '''
819
 
820
+ return f"""
821
  <div class="metrics-section">
822
  <h3>Metric Reference</h3>
823
  <div class="metrics-grid">{cards_html}</div>
824
  </div>
825
+ """
826
 
827
 
828
  def format_model_card(model_name, model_data):
829
  if not model_data:
830
  return '<div class="no-results"><h3>No results found</h3><p>Try a different model name</p></div>'
831
+
832
  first = list(model_data.values())[0]
833
  developer = first.get("developer", "Unknown")
834
  params = first.get("params")
835
  arch = first.get("architecture", "Unknown")
836
  params_str = f"{params}B" if params else "—"
837
+
838
+ html = f"""
839
  <div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">
840
  <h2 style="margin: 0 0 0.5rem 0; color: #0a0a0a;">{model_name}</h2>
841
  <div style="color: #525252; margin-bottom: 1rem;">
 
843
  <span>Params: {params_str}</span> ·
844
  <span>Arch: {arch}</span>
845
  </div>
846
+ """
847
+
848
  for leaderboard_name, data in model_data.items():
849
  results = data.get("results", {})
850
  if not results:
851
  continue
852
+
853
  scores = [v for v in results.values() if v is not None]
854
  avg = sum(scores) / len(scores) if scores else None
855
  avg_str = f"{avg:.2f}" if avg else "—"
856
+
857
  html += f'<div style="margin-bottom: 1rem;"><h4 style="color: #0a0a0a;">{leaderboard_name} <span style="color: #525252;">(avg: {avg_str})</span></h4>'
858
  html += '<div style="display: flex; flex-wrap: wrap; gap: 0.5rem;">'
859
+
860
+ for metric_name, score in sorted(
861
+ results.items(), key=lambda x: x[1] if x[1] else 0, reverse=True
862
+ ):
863
  score_display = f"{score:.2f}" if score is not None else "—"
864
  html += f'<div style="padding: 0.4rem 0.8rem; border-radius: 6px; background: #f5f5f5; border: 1px solid #e5e5e5;"><span style="color: #525252;">{metric_name}:</span> <strong style="color: #0a0a0a;">{score_display}</strong></div>'
865
+
866
+ html += "</div></div>"
867
+
868
+ html += "</div>"
869
  return html
870
 
871
 
872
  def format_model_comparison(selected_models, all_results):
873
  if not selected_models or not all_results:
874
  return '<div class="no-results"><h3>Select models to compare</h3><p>Choose models from the dropdown</p></div>'
875
+
876
  all_leaderboards = set()
877
  model_data_dict = {}
878
+
879
  for model_name in selected_models:
880
  if model_name in all_results:
881
  model_data_dict[model_name] = all_results[model_name]
882
  for lb in all_results[model_name].keys():
883
  all_leaderboards.add(lb)
884
+
885
  if not model_data_dict:
886
  return '<div class="no-results"><h3>No data found</h3></div>'
887
+
888
  all_leaderboards = sorted(all_leaderboards)
889
+
890
  html = '<div style="padding: 1rem; background: #ffffff; border-radius: 10px; border: 1px solid #e5e5e5;">'
891
+
892
  for leaderboard_name in all_leaderboards:
893
  metrics = set()
894
  for md in model_data_dict.values():
895
  if leaderboard_name in md:
896
  metrics.update(md[leaderboard_name].get("results", {}).keys())
897
+
898
  metrics = sorted(metrics)
899
  if not metrics:
900
  continue
901
+
902
  html += f'<h3 style="margin: 1rem 0 0.5rem; color: #0a0a0a;">{leaderboard_name}</h3>'
903
  html += '<div style="overflow-x: auto;"><table class="heatmap-table"><thead><tr><th>Metric</th>'
904
+
905
  for model_name in selected_models:
906
  short = model_name[:20] + "…" if len(model_name) > 20 else model_name
907
  html += f'<th title="{model_name}">{short}</th>'
908
+ html += "</tr></thead><tbody>"
909
+
910
  for metric_name in metrics:
911
  html += f'<tr><td class="metric-name">{metric_name}</td>'
912
+
913
  scores = {}
914
  for m in selected_models:
915
  if m in model_data_dict and leaderboard_name in model_data_dict[m]:
916
+ scores[m] = (
917
+ model_data_dict[m][leaderboard_name]
918
+ .get("results", {})
919
+ .get(metric_name)
920
+ )
921
+
922
  valid = [v for v in scores.values() if v is not None]
923
  max_s = max(valid) if valid else None
924
  min_s = min(valid) if valid else None
925
+
926
  for model_name in selected_models:
927
  score = scores.get(model_name)
928
  if score is not None:
 
931
  cls = "best"
932
  elif max_s > min_s:
933
  pct = (score - min_s) / (max_s - min_s)
934
+ cls = (
935
+ "good"
936
+ if pct >= 0.75
937
+ else "mid"
938
+ if pct >= 0.5
939
+ else "low"
940
+ if pct >= 0.25
941
+ else "worst"
942
+ )
943
  else:
944
  cls = ""
945
  else:
 
947
  html += f'<td class="score-cell {cls}">{score:.2f}</td>'
948
  else:
949
  html += '<td class="score-cell na">—</td>'
950
+ html += "</tr>"
951
+
952
+ html += "</tbody></table></div>"
953
+
954
+ html += "</div>"
955
  return html
956
 
957
 
 
965
  for model in selected_models:
966
  if model not in all_results:
967
  continue
968
+
969
  model_data = all_results[model]
970
  for lb_name, lb_data in model_data.items():
971
  leaderboards_involved.add(lb_name)
972
  results = lb_data.get("results", {})
973
  for metric, score in results.items():
974
+ if score is None:
975
+ continue
976
  key = f"{lb_name}: {metric}"
977
  if key not in metric_data:
978
  metric_data[key] = {}
 
986
  meta_cache[lb] = get_eval_metadata(lb)
987
 
988
  fig = go.Figure()
989
+
990
  categories = sorted(metric_data.keys())
991
+
992
  for model in selected_models:
993
  r_values = []
994
  theta_values = []
995
  hover_texts = []
996
+
997
  for cat in categories:
998
  lb_name, metric_name = cat.split(": ", 1)
999
+
1000
  val = metric_data[cat].get(model)
1001
  if val is None:
1002
  r_values.append(None)
 
1006
  meta = meta_cache.get(lb_name, {}).get("evals", {}).get(metric_name, {})
1007
  min_s = meta.get("min_score")
1008
  max_s = meta.get("max_score")
1009
+
1010
  observed_vals = []
1011
  for m in selected_models:
1012
  v = metric_data[cat].get(m)
1013
  if v is not None:
1014
  observed_vals.append(v)
1015
+
1016
  observed_max = max(observed_vals) if observed_vals else 1.0
1017
+
1018
  if min_s is None:
1019
  min_s = 0
1020
  if max_s is None:
 
1023
  else:
1024
  max_s = 1
1025
  max_s = max(max_s, observed_max)
1026
+
1027
  if max_s == min_s:
1028
  norm_val = 1.0
1029
  else:
1030
  norm_val = (val - min_s) / (max_s - min_s)
1031
+
1032
  norm_val = max(0.0, min(1.0, norm_val))
1033
+
1034
  r_values.append(norm_val)
1035
  theta_values.append(cat)
1036
  hover_texts.append(f"{cat}<br>Score: {val:.2f} (Norm: {norm_val:.2f})")
1037
+
1038
  if r_values:
1039
  r_values.append(r_values[0])
1040
  theta_values.append(theta_values[0])
1041
  hover_texts.append(hover_texts[0])
 
 
 
 
 
 
 
 
 
1042
 
1043
+ fig.add_trace(
1044
+ go.Scatterpolar(
1045
+ r=r_values,
1046
+ theta=theta_values,
1047
+ name=model,
1048
+ hovertext=hover_texts,
1049
+ hoverinfo="text",
1050
+ fill="toself",
1051
  )
1052
+ )
1053
+
1054
+ fig.update_layout(
1055
+ polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
1056
  showlegend=True,
1057
  margin=dict(l=80, r=80, t=20, b=20),
1058
+ title="Model Comparison Radar (Normalized Scores)",
1059
  )
1060
+
1061
  return fig
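The radar plot above min-max rescales each score against the metric's declared range (widened by the best observed value) and clamps the result to [0, 1]. A quick standalone check of that arithmetic:

def normalize(val, min_s, max_s):
    # Degenerate range falls back to 1.0, mirroring the branch above.
    if max_s == min_s:
        return 1.0
    return max(0.0, min(1.0, (val - min_s) / (max_s - min_s)))


assert normalize(75, 0, 100) == 0.75
assert normalize(1.2, 0, 1) == 1.0  # out-of-range values are clamped
assert normalize(5, 5, 5) == 1.0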