diff --git "a/parse.ipynb" "b/parse.ipynb"
new file mode 100644
--- /dev/null
+++ "b/parse.ipynb"
@@ -0,0 +1,6638 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 104,
+ "id": "73fc3ddb-9d22-4b9b-960a-f78b5111c898",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "15c618d3-e5a2-4ae8-8e2e-df916cc7d465",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json, pathlib, pandas as pd\n",
+ "from pprint import pprint\n",
+ "import os\n",
+ "from pathlib import Path\n",
+ "from collections import Counter\n",
+ "from io import StringIO\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "pd.set_option(\"display.max_rows\", None) # show ALL rows\n",
+ "pd.set_option(\"display.max_columns\", None) # show ALL columns\n",
+ "\n",
+ "\n",
+ "# Time Conversion function\n",
+ "def format_time(seconds: float) -> str:\n",
+ " seconds = int(seconds)\n",
+ " hours, remainder = divmod(seconds, 3600)\n",
+ " minutes = remainder // 60 # drop leftover seconds\n",
+ "\n",
+ " parts = []\n",
+ " if hours > 0:\n",
+ " parts.append(f\"{hours}h\")\n",
+ " if minutes > 0 or not parts: # if no hours and no minutes, show 0m\n",
+ " parts.append(f\"{minutes}m\")\n",
+ "\n",
+ " return \" \".join(parts)\n",
+ "\n",
+ "\n",
+ "def list_json_files(directory: str):\n",
+ " \"\"\"\n",
+ "    Lists all .json files in a given directory and returns \n",
+ " their full paths as a list.\n",
+ " \"\"\"\n",
+ " json_files = []\n",
+ " for file in os.listdir(directory):\n",
+ " if file.endswith(\".json\"):\n",
+ " full_path = os.path.join(directory, file)\n",
+ " json_files.append(full_path)\n",
+ " return json_files\n",
+ "\n",
+ "\n",
+ "def format_params(n: int) -> str:\n",
+ " \"\"\"\n",
+ " Convert raw parameter count (int) into human-friendly string.\n",
+ " Examples:\n",
+ " 6851947264 -> \"7B\"\n",
+ " 12500000000 -> \"12.5B\"\n",
+ " 560000000 -> \"560M\"\n",
+ " \"\"\"\n",
+ " if n >= 1_000_000_000: # billions\n",
+ " val = n / 1_000_000_000\n",
+ " if val.is_integer():\n",
+ " return f\"{int(val)}B\"\n",
+ " else:\n",
+ " return f\"{val:.1f}B\"\n",
+ " elif n >= 1_000_000: # millions\n",
+ " val = n / 1_000_000\n",
+ " if val.is_integer():\n",
+ " return f\"{int(val)}M\"\n",
+ " else:\n",
+ " return f\"{val:.1f}M\"\n",
+ " elif n >= 1_000: # thousands (rare for params, but included)\n",
+ " val = n / 1_000\n",
+ " if val.is_integer():\n",
+ " return f\"{int(val)}K\"\n",
+ " else:\n",
+ " return f\"{val:.1f}K\"\n",
+ " else:\n",
+ " return str(n)\n",
+ "\n",
+ "\n",
+ "metric_map = {\n",
+ " \"mmlu\":\"acc,none\" ,\n",
+ " \"hellaswag\": \"acc_norm,none\",\n",
+ " \"arc_challenge\": \"acc_norm,none\", # prefer normalized accuracy\n",
+ " \"bbh\": \"exact_match,get-answer\",\n",
+ " \"gsm8k\":\"exact_match,strict-match\" ,\n",
+ " \"gpqa_main_zeroshot\":\"acc_norm,none\",\n",
+ " \"anli_r1\": \"acc,none\",\n",
+ " \"anli_r2\": \"acc,none\",\n",
+ " \"anli_r3\": \"acc,none\",\n",
+ " \"piqa\":\"acc_norm,none\" ,\n",
+ " \"winogrande\": \"acc,none\",\n",
+ " \"boolq\": \"acc,none\",\n",
+ " \"truthfulqa_mc1\":\"acc,none\" ,\n",
+ " \"truthfulqa_mc2\":\"acc,none\" ,\n",
+ " \"drop\": \"f1,none\",\n",
+ " \"nq_open\":\"exact_match,remove_whitespace\" ,\n",
+ " \"openbookqa\":\"acc_norm,none\" ,\n",
+ " \"sciq\": \"acc_norm,none\",\n",
+ " \"triviaqa\":\"exact_match,remove_whitespace\" ,\n",
+ " \"qnli\":\"acc,none\" ,\n",
+ "}\n",
+ "\n",
+ "# Tasks from most important to least important\n",
+ "# tasks = [mmlu, hellaswag, arc_challenge, bbh, gsm8k, gpqa_main_zeroshot, ANLI (r1, r2, r3), piqa, winogrande, boolq, TruthfulQA (mc1, mc2), drop, nq_open, openbookqa, sciq, triviaqa, qnli]\n",
+ "\n",
+ "# Path list \n",
+ "directory = \"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results\"\n",
+ "all_json_paths = list_json_files(directory)\n",
+ "\n",
+ "def parse_results(json_path: str, metric_map: dict) -> pd.DataFrame:\n",
+ "\n",
+ " with open(json_path,'r') as f:\n",
+ " data = json.load(f)\n",
+ "\n",
+ " # Extract core info\n",
+ " model_name = data.get(\"model_name\")\n",
+ " model_name = model_name.split(\"/\")[-1]\n",
+ " total_time_raw = float(data.get(\"total_evaluation_time_seconds\", 0))\n",
+ " total_time = format_time(float(data.get(\"total_evaluation_time_seconds\", 0)))\n",
+ " batch_size = data[\"config\"].get(\"batch_size\")\n",
+ " batch_sizes = data[\"config\"].get(\"batch_sizes\")\n",
+ " parameters = format_params(data[\"config\"].get(\"model_num_parameters\"))\n",
+ " parameters_raw = data[\"config\"].get(\"model_num_parameters\")\n",
+ " \n",
+ "\n",
+ " rows = []\n",
+ " for task, metric_key in metric_map.items():\n",
+ " # Skip tasks not present in the results\n",
+ " if task not in data[\"results\"]:\n",
+ " raise ValueError(f\"'{task}' not in results! \") \n",
+ " \n",
+ " metrics = data[\"results\"][task]\n",
+ " \n",
+ " # If the metric_key isn't in this task's results, raise error\n",
+ " if metric_key not in metrics:\n",
+ " raise ValueError(\n",
+ " f\"Expected metric '{metric_key}' not found for task '{task}'. \"\n",
+ " f\"Available keys: {list(metrics.keys())}\"\n",
+ " )\n",
+ " \n",
+ " acc = metrics[metric_key]\n",
+ " \n",
+ " row = {\n",
+ " \"model_name\": model_name,\n",
+ " # \"task\": task,\n",
+ " \"task\": task + \"(\" + metric_key + \")\",\n",
+ " \"score\": acc,\n",
+ " \"total_time\": total_time,\n",
+ " \"total_time_raw\" : total_time_raw,\n",
+ " \"batch_size\": batch_size,\n",
+ " \"batch_sizes\": batch_sizes,\n",
+ " \"parameters\": parameters,\n",
+ " \"parameters_raw\": parameters_raw,\n",
+ " }\n",
+ " rows.append(row)\n",
+ "\n",
+ " # Convert to tidy dataframe\n",
+ " return pd.DataFrame(rows)\n",
+ "\n",
+ "\n",
+ "dfs = [parse_results(path, metric_map) for path in all_json_paths]\n",
+ "master_df = pd.concat(dfs, ignore_index=True)\n",
+ "\n",
+ "\n",
+ "# display(master_df)\n",
+ "\n",
+ "\n",
+ "# Wide format: one row per model, columns = tasks\n",
+ "#Check for duplicate rows \n",
+ "key_cols = [\"task\", 'score', 'model_name']\n",
+ "dups_mask = master_df.duplicated(key_cols, keep=False)\n",
+ "# dups = master_df.loc[dups_mask]\n",
+ "# display(dups)\n",
+ "\n",
+ "if dups_mask.any():\n",
+ " dups = master_df.loc[dups_mask, key_cols]\n",
+ " raise ValueError(f\"Duplicate rows found for keys:\\n{dups}\")\n",
+ "\n",
+ "wide_df = master_df.pivot_table(\n",
+ " index=[\"model_name\", \"parameters\"],\n",
+ " columns=[\"task\"],\n",
+ " values=\"score\",\n",
+ " aggfunc=\"mean\"\n",
+ ").reset_index()\n",
+ "\n",
+ "# select the metadata columns you want from the long df\n",
+ "meta_cols = [\n",
+ " \"model_name\", \n",
+ " \"parameters\", \n",
+ " \"parameters_raw\",\n",
+ " \"total_time\", \n",
+ " \"total_time_raw\", \n",
+ " \"batch_size\", \n",
+ " \"batch_sizes\", \n",
+ "]\n",
+ "\n",
+ "# drop duplicate rows by model_name + parameters\n",
+ "df_meta = master_df[meta_cols].drop_duplicates(subset=[\"model_name\", \"parameters\"])\n",
+ "\n",
+ "# merge the metadata back into your wide dataframe\n",
+ "df_wide_merged = df_meta.merge(wide_df, on=[\"model_name\", \"parameters\"], how=\"left\")\n",
+ "\n",
+ "\n",
+ "# display(df_wide_merged.drop(columns=[\"parameters_raw\", \"total_time_raw\", \"batch_sizes\"]))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "324364b8-b59a-4450-8723-0c4057488513",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "gpu_dir = Path(\"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage\")\n",
+ "gpu_files = list(gpu_dir.glob(\"*_gpu_usage.csv\"))\n",
+ "\n",
+ "def model_from_filename(p: Path) -> str:\n",
+ " return p.stem.replace(\"_gpu_usage\", \"\").strip()\n",
+ "\n",
+ "model_names_gpu = [model_from_filename(x) for x in gpu_files]\n",
+ "\n",
+ "# Check if match with result\n",
+ "set_gpu = set(model_names_gpu)\n",
+ "set_results = set(wide_df['model_name'])\n",
+ "extra_in_gpu = set_gpu - set_results\n",
+ "missing_in_gpu = set_results - set_gpu\n",
+ "# print(\"Extra models in GPU logs:\", extra_in_gpu)\n",
+ "# print(\"Models in results with no GPU log:\", missing_in_gpu)\n",
+ "\n",
+ "\n",
+ "# Check for Dulicates\n",
+ "def print_duplicates(name_list, label=\"\"):\n",
+ " counts = Counter(name_list)\n",
+ " dups = [name for name, cnt in counts.items() if cnt > 1]\n",
+ " if dups:\n",
+ " print(f\"Duplicates in {label}:\")\n",
+ " for name in dups:\n",
+ " print(f\" {name} (count = {counts[name]})\")\n",
+ " else:\n",
+ " print(f\"No duplicates found in {label}.\")\n",
+ "# print_duplicates(model_names_gpu, \"GPU logs\")\n",
+ "\n",
+ "\n",
+ "def read_last_run_csv(path: Path) -> pd.DataFrame:\n",
+ " \"\"\"\n",
+ " Return a DataFrame for only the *last* '==== New Run ... ====' block.\n",
+ " Assumes next line after the marker is the CSV header.\n",
+ " \"\"\"\n",
+ " lines = path.read_text(encoding=\"utf-8\").splitlines()\n",
+ " # locate all run markers\n",
+ " run_idx = [i for i, line in enumerate(lines) if line.startswith(\"==== New Run:\")]\n",
+ " if not run_idx:\n",
+ " raise ValueError(f\"No '==== New Run' marker found in {path}\")\n",
+ " start = run_idx[-1] + 1 # header line index\n",
+ "\n",
+ " # slice from header to end and parse CSV\n",
+ " block = \"\\n\".join(lines[start:])\n",
+ " df = pd.read_csv(StringIO(block))\n",
+ "\n",
+ " # optional cleanup: strip units and cast to numbers if these columns exist\n",
+ " if \" utilization.gpu [%]\" in df.columns:\n",
+ " df[\" utilization.gpu [%]\"] = (\n",
+ " df[\" utilization.gpu [%]\"].astype(str).str.replace(\"%\", \"\", regex=False).str.strip().astype(\"float\")\n",
+ " )\n",
+ " if \" memory.used [MiB]\" in df.columns:\n",
+ " df[\" memory.used [MiB]\"] = (\n",
+ " df[\" memory.used [MiB]\"].astype(str).str.replace(\"MiB\", \"\", regex=False).str.strip().astype(\"float\")\n",
+ " )\n",
+ " # parse timestamp if desired\n",
+ " if \"timestamp\" in df.columns:\n",
+ " df[\"timestamp\"] = pd.to_datetime(df[\"timestamp\"], errors=\"coerce\")\n",
+ "\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "def eq_full_util_time(df, util_col=\" utilization.gpu [%]\", interval_sec=60):\n",
+ " # clip just in case and cast to float\n",
+ " u = pd.to_numeric(df[util_col], errors=\"coerce\")\n",
+ " # u = pd.to_numeric(df[util_col], errors=\"coerce\").fillna(0).clip(0, 100)\n",
+ " eq_full_sec = float((u / 100 * interval_sec).sum())\n",
+ " full_sec = float(len(u)*interval_sec)\n",
+ "\n",
+ " # pretty formatter\n",
+ " h, rem = divmod(int(round(full_sec)), 3600)\n",
+ " m, s = divmod(rem, 60)\n",
+ " pretty_full = f\"{h}h {m}m\"\n",
+ " h, rem = divmod(int(round(eq_full_sec)), 3600)\n",
+ " m, s = divmod(rem, 60)\n",
+ " pretty = f\"{h}h {m}m\"\n",
+ " return pretty, pretty_full, eq_full_sec\n",
+ "\n",
+ "\n",
+ "gpu_df = [read_last_run_csv(df) for df in gpu_files]\n",
+ "\n",
+ "\n",
+ "results = []\n",
+ "for name, df in zip(model_names_gpu, gpu_df):\n",
+ " pretty, pretty_full, full_sec_raw = eq_full_util_time(df) # unpack values\n",
+ " results.append((name, pretty, full_sec_raw, pretty_full)) # collect tuple\n",
+ "\n",
+ "# Turn into DataFrame\n",
+ "gpu_util_df = pd.DataFrame(results, columns=[\"model_name\", \"gpu_util_time\", \"gpu_util_time_raw\", 'full_time_from_gpu_log'])\n",
+ "\n",
+ "result_gpu_merged = gpu_util_df.merge(df_wide_merged, on=[\"model_name\"], how=\"left\")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "id": "2fa54bc3-81f2-492c-832c-26e4f9a7cff3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Overall Rank | \n",
+ " Model Name | \n",
+ " GPU Util Time | \n",
+ " gpu_util_time_raw | \n",
+ " full_time_from_gpu_log | \n",
+ " Parameters | \n",
+ " parameters_raw | \n",
+ " Total Time | \n",
+ " total_time_raw | \n",
+ " batch_size | \n",
+ " batch_sizes | \n",
+ " anli_r1(acc,none) | \n",
+ " anli_r2(acc,none) | \n",
+ " anli_r3(acc,none) | \n",
+ " arc_challenge(acc_norm,none) | \n",
+ " bbh(exact_match,get-answer) | \n",
+ " boolq(acc,none) | \n",
+ " drop(f1,none) | \n",
+ " gpqa_main_zeroshot(acc_norm,none) | \n",
+ " gsm8k(exact_match,strict-match) | \n",
+ " hellaswag(acc_norm,none) | \n",
+ " mmlu(acc,none) | \n",
+ " nq_open(exact_match,remove_whitespace) | \n",
+ " openbookqa(acc_norm,none) | \n",
+ " piqa(acc_norm,none) | \n",
+ " qnli(acc,none) | \n",
+ " sciq(acc_norm,none) | \n",
+ " triviaqa(exact_match,remove_whitespace) | \n",
+ " truthfulqa_mc1(acc,none) | \n",
+ " truthfulqa_mc2(acc,none) | \n",
+ " winogrande(acc,none) | \n",
+ " gsm8k(exact_match,strict-match)_rank | \n",
+ " bbh(exact_match,get-answer)_rank | \n",
+ " arc_challenge(acc_norm,none)_rank | \n",
+ " anli_r1(acc,none)_rank | \n",
+ " anli_r2(acc,none)_rank | \n",
+ " anli_r3(acc,none)_rank | \n",
+ " gpqa_main_zeroshot(acc_norm,none)_rank | \n",
+ " hellaswag(acc_norm,none)_rank | \n",
+ " piqa(acc_norm,none)_rank | \n",
+ " winogrande(acc,none)_rank | \n",
+ " boolq(acc,none)_rank | \n",
+ " openbookqa(acc_norm,none)_rank | \n",
+ " sciq(acc_norm,none)_rank | \n",
+ " qnli(acc,none)_rank | \n",
+ " mmlu(acc,none)_rank | \n",
+ " nq_open(exact_match,remove_whitespace)_rank | \n",
+ " drop(f1,none)_rank | \n",
+ " truthfulqa_mc1(acc,none)_rank | \n",
+ " truthfulqa_mc2(acc,none)_rank | \n",
+ " triviaqa(exact_match,remove_whitespace)_rank | \n",
+ " Reasoning & Math Mean Score | \n",
+ " Reasoning & Math Avg. Rank | \n",
+ " Commonsense & NLI Mean Score | \n",
+ " Commonsense & NLI Avg. Rank | \n",
+ " Knowledge & Reading Mean Score | \n",
+ " Knowledge & Reading Avg. Rank | \n",
+ " Mean Score | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 1 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 14h 8m | \n",
+ " 50906.4 | \n",
+ " 15h 47m | \n",
+ " 12.2B | \n",
+ " 12187325040 | \n",
+ " 15h 45m | \n",
+ " 56750.865892 | \n",
+ " auto | \n",
+ " [2] | \n",
+ " 0.603 | \n",
+ " 0.560 | \n",
+ " 0.595833 | \n",
+ " 0.610922 | \n",
+ " 0.801874 | \n",
+ " 0.874618 | \n",
+ " 0.139566 | \n",
+ " 0.337054 | \n",
+ " 0.877180 | \n",
+ " 0.818761 | \n",
+ " 0.716137 | \n",
+ " 0.157064 | \n",
+ " 0.498 | \n",
+ " 0.780740 | \n",
+ " 0.745744 | \n",
+ " 0.954 | \n",
+ " 0.275245 | \n",
+ " 0.405141 | \n",
+ " 0.581183 | \n",
+ " 0.744278 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 5.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 9.0 | \n",
+ " 3.0 | \n",
+ " 19.0 | \n",
+ " 4.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 6.0 | \n",
+ " 8.0 | \n",
+ " 6.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 23.0 | \n",
+ " 0.6266 | \n",
+ " 1 | \n",
+ " 0.7737 | \n",
+ " 3 | \n",
+ " 0.3791 | \n",
+ " 10 | \n",
+ " 0.6038 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 17h 29m | \n",
+ " 62956.2 | \n",
+ " 29h 46m | \n",
+ " 14.8B | \n",
+ " 14768307200 | \n",
+ " 29h 45m | \n",
+ " 107151.802065 | \n",
+ " 1 | \n",
+ " [] | \n",
+ " 0.646 | \n",
+ " 0.570 | \n",
+ " 0.556667 | \n",
+ " 0.600683 | \n",
+ " 0.432960 | \n",
+ " 0.891743 | \n",
+ " 0.090410 | \n",
+ " 0.397321 | \n",
+ " 0.898408 | \n",
+ " 0.787692 | \n",
+ " 0.769477 | \n",
+ " 0.092244 | \n",
+ " 0.460 | \n",
+ " 0.794886 | \n",
+ " 0.844225 | \n",
+ " 0.966 | \n",
+ " 0.407490 | \n",
+ " 0.406365 | \n",
+ " 0.589404 | \n",
+ " 0.720600 | \n",
+ " 1.0 | \n",
+ " 29.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 12.0 | \n",
+ " 14.0 | \n",
+ " 11.0 | \n",
+ " 1.0 | \n",
+ " 8.0 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 15.0 | \n",
+ " 18.0 | \n",
+ " 7.0 | \n",
+ " 6.0 | \n",
+ " 13.0 | \n",
+ " 0.5860 | \n",
+ " 3 | \n",
+ " 0.7807 | \n",
+ " 2 | \n",
+ " 0.3926 | \n",
+ " 7 | \n",
+ " 0.5961 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 3 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 6h 59m | \n",
+ " 25150.8 | \n",
+ " 7h 52m | \n",
+ " 8.0B | \n",
+ " 8030261248 | \n",
+ " 7h 51m | \n",
+ " 28278.859470 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.556 | \n",
+ " 0.513 | \n",
+ " 0.480000 | \n",
+ " 0.603242 | \n",
+ " 0.617877 | \n",
+ " 0.872783 | \n",
+ " 0.251569 | \n",
+ " 0.332589 | \n",
+ " 0.750569 | \n",
+ " 0.797849 | \n",
+ " 0.643071 | \n",
+ " 0.170637 | \n",
+ " 0.462 | \n",
+ " 0.818281 | \n",
+ " 0.730002 | \n",
+ " 0.964 | \n",
+ " 0.565927 | \n",
+ " 0.352509 | \n",
+ " 0.497601 | \n",
+ " 0.763220 | \n",
+ " 16.0 | \n",
+ " 10.0 | \n",
+ " 3.0 | \n",
+ " 9.0 | \n",
+ " 8.0 | \n",
+ " 11.0 | \n",
+ " 11.0 | \n",
+ " 6.0 | \n",
+ " 3.0 | \n",
+ " 1.0 | \n",
+ " 4.0 | \n",
+ " 7.0 | \n",
+ " 2.0 | \n",
+ " 9.0 | \n",
+ " 14.0 | \n",
+ " 5.0 | \n",
+ " 3.0 | \n",
+ " 15.0 | \n",
+ " 19.0 | \n",
+ " 3.0 | \n",
+ " 0.5505 | \n",
+ " 6 | \n",
+ " 0.7726 | \n",
+ " 5 | \n",
+ " 0.4136 | \n",
+ " 2 | \n",
+ " 0.5871 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 4 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 13h 44m | \n",
+ " 49497.0 | \n",
+ " 15h 33m | \n",
+ " 8.2B | \n",
+ " 8190735360 | \n",
+ " 15h 31m | \n",
+ " 55918.467860 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.669 | \n",
+ " 0.542 | \n",
+ " 0.555833 | \n",
+ " 0.562287 | \n",
+ " 0.797573 | \n",
+ " 0.865749 | \n",
+ " 0.109877 | \n",
+ " 0.350446 | \n",
+ " 0.872631 | \n",
+ " 0.748656 | \n",
+ " 0.728956 | \n",
+ " 0.073684 | \n",
+ " 0.418 | \n",
+ " 0.775299 | \n",
+ " 0.781805 | \n",
+ " 0.958 | \n",
+ " 0.320609 | \n",
+ " 0.363525 | \n",
+ " 0.543140 | \n",
+ " 0.680347 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 11.0 | \n",
+ " 3.0 | \n",
+ " 5.0 | \n",
+ " 4.0 | \n",
+ " 5.0 | \n",
+ " 24.0 | \n",
+ " 21.0 | \n",
+ " 22.0 | \n",
+ " 6.0 | \n",
+ " 20.0 | \n",
+ " 4.0 | \n",
+ " 7.0 | \n",
+ " 3.0 | \n",
+ " 16.0 | \n",
+ " 13.0 | \n",
+ " 13.0 | \n",
+ " 13.0 | \n",
+ " 18.0 | \n",
+ " 0.6214 | \n",
+ " 2 | \n",
+ " 0.7468 | \n",
+ " 8 | \n",
+ " 0.3566 | \n",
+ " 14 | \n",
+ " 0.5859 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 8h 33m | \n",
+ " 30831.6 | \n",
+ " 9h 38m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 9h 36m | \n",
+ " 34616.604248 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.685 | \n",
+ " 0.549 | \n",
+ " 0.552500 | \n",
+ " 0.552901 | \n",
+ " 0.448779 | \n",
+ " 0.863303 | \n",
+ " 0.071089 | \n",
+ " 0.328125 | \n",
+ " 0.762699 | \n",
+ " 0.804919 | \n",
+ " 0.718060 | \n",
+ " 0.045706 | \n",
+ " 0.486 | \n",
+ " 0.803047 | \n",
+ " 0.804503 | \n",
+ " 0.937 | \n",
+ " 0.325401 | \n",
+ " 0.477356 | \n",
+ " 0.648483 | \n",
+ " 0.711918 | \n",
+ " 12.0 | \n",
+ " 27.0 | \n",
+ " 12.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 5.0 | \n",
+ " 12.0 | \n",
+ " 5.0 | \n",
+ " 10.0 | \n",
+ " 14.0 | \n",
+ " 7.0 | \n",
+ " 2.0 | \n",
+ " 12.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 23.0 | \n",
+ " 27.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 17.0 | \n",
+ " 0.5541 | \n",
+ " 5 | \n",
+ " 0.7730 | \n",
+ " 4 | \n",
+ " 0.3810 | \n",
+ " 9 | \n",
+ " 0.5788 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 6 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 29h 32m | \n",
+ " 106374.6 | \n",
+ " 52h 45m | \n",
+ " 14.8B | \n",
+ " 14770033664 | \n",
+ " 52h 44m | \n",
+ " 189869.409404 | \n",
+ " 1 | \n",
+ " [] | \n",
+ " 0.721 | \n",
+ " 0.634 | \n",
+ " 0.617500 | \n",
+ " 0.615188 | \n",
+ " 0.106896 | \n",
+ " 0.886239 | \n",
+ " 0.071276 | \n",
+ " 0.354911 | \n",
+ " 0.792267 | \n",
+ " 0.841964 | \n",
+ " 0.783079 | \n",
+ " 0.061496 | \n",
+ " 0.476 | \n",
+ " 0.817193 | \n",
+ " 0.853926 | \n",
+ " 0.929 | \n",
+ " 0.039289 | \n",
+ " 0.510404 | \n",
+ " 0.683015 | \n",
+ " 0.754538 | \n",
+ " 9.0 | \n",
+ " 41.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " 17.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 19.0 | \n",
+ " 25.0 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 35.0 | \n",
+ " 0.5488 | \n",
+ " 7 | \n",
+ " 0.7941 | \n",
+ " 1 | \n",
+ " 0.3581 | \n",
+ " 13 | \n",
+ " 0.5775 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 7 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 10h 26m | \n",
+ " 37569.6 | \n",
+ " 11h 44m | \n",
+ " 8.8B | \n",
+ " 8829407232 | \n",
+ " 11h 43m | \n",
+ " 42212.112622 | \n",
+ " 2 | \n",
+ " [] | \n",
+ " 0.532 | \n",
+ " 0.480 | \n",
+ " 0.439167 | \n",
+ " 0.546928 | \n",
+ " 0.712026 | \n",
+ " 0.858104 | \n",
+ " 0.445686 | \n",
+ " 0.294643 | \n",
+ " 0.639121 | \n",
+ " 0.778929 | \n",
+ " 0.689289 | \n",
+ " 0.153186 | \n",
+ " 0.456 | \n",
+ " 0.806311 | \n",
+ " 0.508695 | \n",
+ " 0.952 | \n",
+ " 0.543803 | \n",
+ " 0.321909 | \n",
+ " 0.467572 | \n",
+ " 0.726125 | \n",
+ " 25.0 | \n",
+ " 5.0 | \n",
+ " 15.0 | \n",
+ " 12.0 | \n",
+ " 12.0 | \n",
+ " 18.0 | \n",
+ " 20.0 | \n",
+ " 14.0 | \n",
+ " 6.0 | \n",
+ " 9.0 | \n",
+ " 10.0 | \n",
+ " 9.0 | \n",
+ " 7.0 | \n",
+ " 28.0 | \n",
+ " 8.0 | \n",
+ " 10.0 | \n",
+ " 1.0 | \n",
+ " 19.0 | \n",
+ " 26.0 | \n",
+ " 4.0 | \n",
+ " 0.5206 | \n",
+ " 16 | \n",
+ " 0.7266 | \n",
+ " 15 | \n",
+ " 0.4369 | \n",
+ " 1 | \n",
+ " 0.5676 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 8 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 10h 10m | \n",
+ " 36621.0 | \n",
+ " 11h 18m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 11h 17m | \n",
+ " 40632.813397 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.585 | \n",
+ " 0.533 | \n",
+ " 0.556667 | \n",
+ " 0.585324 | \n",
+ " 0.277223 | \n",
+ " 0.852599 | \n",
+ " 0.057047 | \n",
+ " 0.339286 | \n",
+ " 0.795299 | \n",
+ " 0.789982 | \n",
+ " 0.716636 | \n",
+ " 0.157618 | \n",
+ " 0.480 | \n",
+ " 0.816104 | \n",
+ " 0.678199 | \n",
+ " 0.950 | \n",
+ " 0.420531 | \n",
+ " 0.425949 | \n",
+ " 0.600072 | \n",
+ " 0.727703 | \n",
+ " 8.0 | \n",
+ " 38.0 | \n",
+ " 7.0 | \n",
+ " 6.0 | \n",
+ " 6.0 | \n",
+ " 3.0 | \n",
+ " 8.0 | \n",
+ " 11.0 | \n",
+ " 5.0 | \n",
+ " 8.0 | \n",
+ " 12.0 | \n",
+ " 3.0 | \n",
+ " 8.0 | \n",
+ " 11.0 | \n",
+ " 5.0 | \n",
+ " 7.0 | \n",
+ " 29.0 | \n",
+ " 3.0 | \n",
+ " 3.0 | \n",
+ " 12.0 | \n",
+ " 0.5245 | \n",
+ " 15 | \n",
+ " 0.7564 | \n",
+ " 7 | \n",
+ " 0.3963 | \n",
+ " 6 | \n",
+ " 0.5672 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 9 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 10h 52m | \n",
+ " 39147.6 | \n",
+ " 12h 20m | \n",
+ " 8.0B | \n",
+ " 8030261248 | \n",
+ " 12h 19m | \n",
+ " 44363.249360 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.482 | \n",
+ " 0.467 | \n",
+ " 0.443333 | \n",
+ " 0.550341 | \n",
+ " 0.715558 | \n",
+ " 0.841590 | \n",
+ " 0.193729 | \n",
+ " 0.343750 | \n",
+ " 0.754359 | \n",
+ " 0.792073 | \n",
+ " 0.679319 | \n",
+ " 0.177562 | \n",
+ " 0.432 | \n",
+ " 0.806311 | \n",
+ " 0.501373 | \n",
+ " 0.962 | \n",
+ " 0.518168 | \n",
+ " 0.365973 | \n",
+ " 0.541154 | \n",
+ " 0.738753 | \n",
+ " 15.0 | \n",
+ " 4.0 | \n",
+ " 13.0 | \n",
+ " 18.0 | \n",
+ " 14.0 | \n",
+ " 17.0 | \n",
+ " 6.0 | \n",
+ " 9.0 | \n",
+ " 6.0 | \n",
+ " 6.0 | \n",
+ " 16.0 | \n",
+ " 16.0 | \n",
+ " 3.0 | \n",
+ " 30.0 | \n",
+ " 12.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 12.0 | \n",
+ " 14.0 | \n",
+ " 7.0 | \n",
+ " 0.5366 | \n",
+ " 12 | \n",
+ " 0.7249 | \n",
+ " 17 | \n",
+ " 0.4127 | \n",
+ " 3 | \n",
+ " 0.5653 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 10 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 12h 15m | \n",
+ " 44120.4 | \n",
+ " 13h 55m | \n",
+ " 8.8B | \n",
+ " 8829407232 | \n",
+ " 13h 54m | \n",
+ " 50056.331345 | \n",
+ " 2 | \n",
+ " [] | \n",
+ " 0.535 | \n",
+ " 0.509 | \n",
+ " 0.525833 | \n",
+ " 0.587031 | \n",
+ " 0.610659 | \n",
+ " 0.868196 | \n",
+ " 0.125326 | \n",
+ " 0.303571 | \n",
+ " 0.708112 | \n",
+ " 0.787293 | \n",
+ " 0.684091 | \n",
+ " 0.009418 | \n",
+ " 0.436 | \n",
+ " 0.803591 | \n",
+ " 0.787662 | \n",
+ " 0.954 | \n",
+ " 0.338665 | \n",
+ " 0.374541 | \n",
+ " 0.547934 | \n",
+ " 0.746646 | \n",
+ " 18.0 | \n",
+ " 12.0 | \n",
+ " 6.0 | \n",
+ " 11.0 | \n",
+ " 9.0 | \n",
+ " 6.0 | \n",
+ " 18.0 | \n",
+ " 13.0 | \n",
+ " 9.0 | \n",
+ " 3.0 | \n",
+ " 5.0 | \n",
+ " 14.0 | \n",
+ " 6.0 | \n",
+ " 6.0 | \n",
+ " 9.0 | \n",
+ " 36.0 | \n",
+ " 9.0 | \n",
+ " 10.0 | \n",
+ " 11.0 | \n",
+ " 15.0 | \n",
+ " 0.5399 | \n",
+ " 9 | \n",
+ " 0.7691 | \n",
+ " 6 | \n",
+ " 0.3467 | \n",
+ " 15 | \n",
+ " 0.5621 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 11 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 9h 27m | \n",
+ " 34053.6 | \n",
+ " 10h 47m | \n",
+ " 8.0B | \n",
+ " 8019808256 | \n",
+ " 10h 46m | \n",
+ " 38770.339256 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.488 | \n",
+ " 0.487 | \n",
+ " 0.465833 | \n",
+ " 0.562287 | \n",
+ " 0.692520 | \n",
+ " 0.860245 | \n",
+ " 0.071413 | \n",
+ " 0.341518 | \n",
+ " 0.774829 | \n",
+ " 0.791077 | \n",
+ " 0.640721 | \n",
+ " 0.157618 | \n",
+ " 0.466 | \n",
+ " 0.823177 | \n",
+ " 0.494966 | \n",
+ " 0.956 | \n",
+ " 0.527809 | \n",
+ " 0.325581 | \n",
+ " 0.486670 | \n",
+ " 0.737964 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " 11.0 | \n",
+ " 16.0 | \n",
+ " 11.0 | \n",
+ " 14.0 | \n",
+ " 7.0 | \n",
+ " 10.0 | \n",
+ " 2.0 | \n",
+ " 7.0 | \n",
+ " 8.0 | \n",
+ " 6.0 | \n",
+ " 5.0 | \n",
+ " 39.0 | \n",
+ " 15.0 | \n",
+ " 7.0 | \n",
+ " 24.0 | \n",
+ " 18.0 | \n",
+ " 22.0 | \n",
+ " 5.0 | \n",
+ " 0.5446 | \n",
+ " 8 | \n",
+ " 0.7328 | \n",
+ " 12 | \n",
+ " 0.3683 | \n",
+ " 12 | \n",
+ " 0.5576 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 12 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 5h 46m | \n",
+ " 20809.8 | \n",
+ " 6h 31m | \n",
+ " 8.0B | \n",
+ " 8030261248 | \n",
+ " 6h 30m | \n",
+ " 23440.234421 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.484 | \n",
+ " 0.458 | \n",
+ " 0.448333 | \n",
+ " 0.563993 | \n",
+ " 0.679005 | \n",
+ " 0.831193 | \n",
+ " 0.163977 | \n",
+ " 0.310268 | \n",
+ " 0.756634 | \n",
+ " 0.759211 | \n",
+ " 0.638727 | \n",
+ " 0.159003 | \n",
+ " 0.430 | \n",
+ " 0.787269 | \n",
+ " 0.546403 | \n",
+ " 0.932 | \n",
+ " 0.511202 | \n",
+ " 0.363525 | \n",
+ " 0.517142 | \n",
+ " 0.716654 | \n",
+ " 14.0 | \n",
+ " 8.0 | \n",
+ " 10.0 | \n",
+ " 17.0 | \n",
+ " 18.0 | \n",
+ " 16.0 | \n",
+ " 16.0 | \n",
+ " 19.0 | \n",
+ " 18.0 | \n",
+ " 13.0 | \n",
+ " 19.0 | \n",
+ " 17.0 | \n",
+ " 15.0 | \n",
+ " 20.0 | \n",
+ " 16.0 | \n",
+ " 6.0 | \n",
+ " 5.0 | \n",
+ " 13.0 | \n",
+ " 17.0 | \n",
+ " 8.0 | \n",
+ " 0.5286 | \n",
+ " 13 | \n",
+ " 0.7147 | \n",
+ " 22 | \n",
+ " 0.3923 | \n",
+ " 8 | \n",
+ " 0.5528 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 13 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 3m | \n",
+ " 18234.6 | \n",
+ " 5h 52m | \n",
+ " 4.0B | \n",
+ " 4022468096 | \n",
+ " 5h 51m | \n",
+ " 21077.943646 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.550 | \n",
+ " 0.461 | \n",
+ " 0.513333 | \n",
+ " 0.539249 | \n",
+ " 0.752265 | \n",
+ " 0.850459 | \n",
+ " 0.097707 | \n",
+ " 0.325893 | \n",
+ " 0.856710 | \n",
+ " 0.683330 | \n",
+ " 0.683592 | \n",
+ " 0.014681 | \n",
+ " 0.402 | \n",
+ " 0.751360 | \n",
+ " 0.808713 | \n",
+ " 0.932 | \n",
+ " 0.225033 | \n",
+ " 0.367197 | \n",
+ " 0.547575 | \n",
+ " 0.658248 | \n",
+ " 5.0 | \n",
+ " 3.0 | \n",
+ " 17.0 | \n",
+ " 10.0 | \n",
+ " 17.0 | \n",
+ " 8.0 | \n",
+ " 13.0 | \n",
+ " 29.0 | \n",
+ " 27.0 | \n",
+ " 27.0 | \n",
+ " 13.0 | \n",
+ " 23.0 | \n",
+ " 15.0 | \n",
+ " 3.0 | \n",
+ " 10.0 | \n",
+ " 34.0 | \n",
+ " 16.0 | \n",
+ " 11.0 | \n",
+ " 12.0 | \n",
+ " 26.0 | \n",
+ " 0.5712 | \n",
+ " 4 | \n",
+ " 0.7266 | \n",
+ " 16 | \n",
+ " 0.3226 | \n",
+ " 21 | \n",
+ " 0.5510 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 14 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 7h 28m | \n",
+ " 26916.0 | \n",
+ " 8h 28m | \n",
+ " 7.2B | \n",
+ " 7241994240 | \n",
+ " 8h 27m | \n",
+ " 30434.329021 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.531 | \n",
+ " 0.496 | \n",
+ " 0.500000 | \n",
+ " 0.565700 | \n",
+ " 0.573798 | \n",
+ " 0.868196 | \n",
+ " 0.109754 | \n",
+ " 0.276786 | \n",
+ " 0.685368 | \n",
+ " 0.804919 | \n",
+ " 0.605113 | \n",
+ " 0.040443 | \n",
+ " 0.434 | \n",
+ " 0.798694 | \n",
+ " 0.556471 | \n",
+ " 0.917 | \n",
+ " 0.471132 | \n",
+ " 0.413709 | \n",
+ " 0.591156 | \n",
+ " 0.719811 | \n",
+ " 21.0 | \n",
+ " 17.0 | \n",
+ " 9.0 | \n",
+ " 13.0 | \n",
+ " 10.0 | \n",
+ " 9.0 | \n",
+ " 25.0 | \n",
+ " 5.0 | \n",
+ " 12.0 | \n",
+ " 12.0 | \n",
+ " 5.0 | \n",
+ " 15.0 | \n",
+ " 20.0 | \n",
+ " 18.0 | \n",
+ " 20.0 | \n",
+ " 25.0 | \n",
+ " 14.0 | \n",
+ " 6.0 | \n",
+ " 5.0 | \n",
+ " 11.0 | \n",
+ " 0.5184 | \n",
+ " 17 | \n",
+ " 0.7284 | \n",
+ " 13 | \n",
+ " 0.3719 | \n",
+ " 11 | \n",
+ " 0.5480 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 15 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 7h 41m | \n",
+ " 27676.8 | \n",
+ " 8h 39m | \n",
+ " 7.2B | \n",
+ " 7248023552 | \n",
+ " 8h 38m | \n",
+ " 31084.838324 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.476 | \n",
+ " 0.443 | \n",
+ " 0.448333 | \n",
+ " 0.589590 | \n",
+ " 0.562586 | \n",
+ " 0.858410 | \n",
+ " 0.089972 | \n",
+ " 0.283482 | \n",
+ " 0.489765 | \n",
+ " 0.828919 | \n",
+ " 0.597137 | \n",
+ " 0.153740 | \n",
+ " 0.470 | \n",
+ " 0.826986 | \n",
+ " 0.514552 | \n",
+ " 0.943 | \n",
+ " 0.568324 | \n",
+ " 0.421053 | \n",
+ " 0.596813 | \n",
+ " 0.740331 | \n",
+ " 28.0 | \n",
+ " 18.0 | \n",
+ " 5.0 | \n",
+ " 20.0 | \n",
+ " 20.0 | \n",
+ " 16.0 | \n",
+ " 24.0 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 5.0 | \n",
+ " 9.0 | \n",
+ " 5.0 | \n",
+ " 9.0 | \n",
+ " 26.0 | \n",
+ " 22.0 | \n",
+ " 9.0 | \n",
+ " 19.0 | \n",
+ " 4.0 | \n",
+ " 4.0 | \n",
+ " 2.0 | \n",
+ " 0.4704 | \n",
+ " 22 | \n",
+ " 0.7403 | \n",
+ " 9 | \n",
+ " 0.4045 | \n",
+ " 5 | \n",
+ " 0.5451 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 16 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 3h 50m | \n",
+ " 13811.4 | \n",
+ " 4h 52m | \n",
+ " 4.3B | \n",
+ " 4300079472 | \n",
+ " 4h 51m | \n",
+ " 17460.233507 | \n",
+ " auto | \n",
+ " [4] | \n",
+ " 0.492 | \n",
+ " 0.471 | \n",
+ " 0.468333 | \n",
+ " 0.570819 | \n",
+ " 0.709415 | \n",
+ " 0.839755 | \n",
+ " 0.089284 | \n",
+ " 0.287946 | \n",
+ " 0.761941 | \n",
+ " 0.741386 | \n",
+ " 0.575559 | \n",
+ " 0.109418 | \n",
+ " 0.466 | \n",
+ " 0.772035 | \n",
+ " 0.565989 | \n",
+ " 0.931 | \n",
+ " 0.314813 | \n",
+ " 0.348837 | \n",
+ " 0.518821 | \n",
+ " 0.700868 | \n",
+ " 13.0 | \n",
+ " 6.0 | \n",
+ " 8.0 | \n",
+ " 15.0 | \n",
+ " 13.0 | \n",
+ " 13.0 | \n",
+ " 23.0 | \n",
+ " 26.0 | \n",
+ " 22.0 | \n",
+ " 17.0 | \n",
+ " 17.0 | \n",
+ " 6.0 | \n",
+ " 16.0 | \n",
+ " 16.0 | \n",
+ " 24.0 | \n",
+ " 13.0 | \n",
+ " 20.0 | \n",
+ " 16.0 | \n",
+ " 16.0 | \n",
+ " 19.0 | \n",
+ " 0.5374 | \n",
+ " 11 | \n",
+ " 0.7167 | \n",
+ " 19 | \n",
+ " 0.3261 | \n",
+ " 20 | \n",
+ " 0.5368 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 17 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 7h 1m | \n",
+ " 25318.8 | \n",
+ " 8h 5m | \n",
+ " 6.1B | \n",
+ " 6061035520 | \n",
+ " 8h 4m | \n",
+ " 29040.429802 | \n",
+ " 2 | \n",
+ " [] | \n",
+ " 0.477 | \n",
+ " 0.453 | \n",
+ " 0.460000 | \n",
+ " 0.539249 | \n",
+ " 0.547842 | \n",
+ " 0.847401 | \n",
+ " 0.116081 | \n",
+ " 0.357143 | \n",
+ " 0.670205 | \n",
+ " 0.767477 | \n",
+ " 0.617861 | \n",
+ " 0.027147 | \n",
+ " 0.436 | \n",
+ " 0.787813 | \n",
+ " 0.679480 | \n",
+ " 0.934 | \n",
+ " 0.330974 | \n",
+ " 0.376989 | \n",
+ " 0.534371 | \n",
+ " 0.709550 | \n",
+ " 22.0 | \n",
+ " 21.0 | \n",
+ " 17.0 | \n",
+ " 19.0 | \n",
+ " 19.0 | \n",
+ " 15.0 | \n",
+ " 3.0 | \n",
+ " 16.0 | \n",
+ " 17.0 | \n",
+ " 15.0 | \n",
+ " 15.0 | \n",
+ " 14.0 | \n",
+ " 14.0 | \n",
+ " 10.0 | \n",
+ " 18.0 | \n",
+ " 29.0 | \n",
+ " 12.0 | \n",
+ " 9.0 | \n",
+ " 15.0 | \n",
+ " 16.0 | \n",
+ " 0.5006 | \n",
+ " 19 | \n",
+ " 0.7374 | \n",
+ " 10 | \n",
+ " 0.3339 | \n",
+ " 19 | \n",
+ " 0.5335 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 18 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 3h 54m | \n",
+ " 14091.6 | \n",
+ " 4h 29m | \n",
+ " 6.1B | \n",
+ " 6061035520 | \n",
+ " 4h 28m | \n",
+ " 16094.199661 | \n",
+ " auto | \n",
+ " [8] | \n",
+ " 0.448 | \n",
+ " 0.407 | \n",
+ " 0.406667 | \n",
+ " 0.496587 | \n",
+ " 0.575488 | \n",
+ " 0.801529 | \n",
+ " 0.399462 | \n",
+ " 0.290179 | \n",
+ " 0.522365 | \n",
+ " 0.754133 | \n",
+ " 0.624270 | \n",
+ " 0.178116 | \n",
+ " 0.422 | \n",
+ " 0.801415 | \n",
+ " 0.598572 | \n",
+ " 0.941 | \n",
+ " 0.495207 | \n",
+ " 0.299878 | \n",
+ " 0.440750 | \n",
+ " 0.720600 | \n",
+ " 27.0 | \n",
+ " 16.0 | \n",
+ " 20.0 | \n",
+ " 21.0 | \n",
+ " 26.0 | \n",
+ " 27.0 | \n",
+ " 22.0 | \n",
+ " 22.0 | \n",
+ " 11.0 | \n",
+ " 11.0 | \n",
+ " 23.0 | \n",
+ " 19.0 | \n",
+ " 10.0 | \n",
+ " 13.0 | \n",
+ " 17.0 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " 23.0 | \n",
+ " 32.0 | \n",
+ " 10.0 | \n",
+ " 0.4495 | \n",
+ " 24 | \n",
+ " 0.7199 | \n",
+ " 18 | \n",
+ " 0.4063 | \n",
+ " 4 | \n",
+ " 0.5312 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 19 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 10h 11m | \n",
+ " 36684.6 | \n",
+ " 11h 31m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 11h 30m | \n",
+ " 41431.857967 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.573 | \n",
+ " 0.525 | \n",
+ " 0.522500 | \n",
+ " 0.540102 | \n",
+ " 0.577484 | \n",
+ " 0.856269 | \n",
+ " 0.052028 | \n",
+ " 0.314732 | \n",
+ " 0.646702 | \n",
+ " 0.806015 | \n",
+ " 0.699402 | \n",
+ " 0.013296 | \n",
+ " 0.462 | \n",
+ " 0.805767 | \n",
+ " 0.547135 | \n",
+ " 0.916 | \n",
+ " 0.008136 | \n",
+ " 0.405141 | \n",
+ " 0.573437 | \n",
+ " 0.698500 | \n",
+ " 23.0 | \n",
+ " 15.0 | \n",
+ " 16.0 | \n",
+ " 7.0 | \n",
+ " 7.0 | \n",
+ " 7.0 | \n",
+ " 15.0 | \n",
+ " 4.0 | \n",
+ " 7.0 | \n",
+ " 18.0 | \n",
+ " 11.0 | \n",
+ " 7.0 | \n",
+ " 21.0 | \n",
+ " 19.0 | \n",
+ " 7.0 | \n",
+ " 35.0 | \n",
+ " 31.0 | \n",
+ " 8.0 | \n",
+ " 9.0 | \n",
+ " 39.0 | \n",
+ " 0.5285 | \n",
+ " 14 | \n",
+ " 0.7274 | \n",
+ " 14 | \n",
+ " 0.2919 | \n",
+ " 24 | \n",
+ " 0.5271 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 20 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 15h 30m | \n",
+ " 55855.2 | \n",
+ " 17h 59m | \n",
+ " 8.2B | \n",
+ " 8190735360 | \n",
+ " 17h 57m | \n",
+ " 64675.539163 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.511 | \n",
+ " 0.464 | \n",
+ " 0.476667 | \n",
+ " 0.549488 | \n",
+ " 0.584088 | \n",
+ " 0.848318 | \n",
+ " 0.053279 | \n",
+ " 0.372768 | \n",
+ " 0.812737 | \n",
+ " 0.756423 | \n",
+ " 0.682951 | \n",
+ " 0.018283 | \n",
+ " 0.430 | \n",
+ " 0.756801 | \n",
+ " 0.557752 | \n",
+ " 0.941 | \n",
+ " 0.029481 | \n",
+ " 0.357405 | \n",
+ " 0.559013 | \n",
+ " 0.675612 | \n",
+ " 7.0 | \n",
+ " 14.0 | \n",
+ " 14.0 | \n",
+ " 14.0 | \n",
+ " 16.0 | \n",
+ " 12.0 | \n",
+ " 2.0 | \n",
+ " 20.0 | \n",
+ " 25.0 | \n",
+ " 24.0 | \n",
+ " 14.0 | \n",
+ " 17.0 | \n",
+ " 10.0 | \n",
+ " 17.0 | \n",
+ " 11.0 | \n",
+ " 33.0 | \n",
+ " 30.0 | \n",
+ " 14.0 | \n",
+ " 10.0 | \n",
+ " 36.0 | \n",
+ " 0.5387 | \n",
+ " 10 | \n",
+ " 0.7094 | \n",
+ " 23 | \n",
+ " 0.2834 | \n",
+ " 28 | \n",
+ " 0.5219 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " 21 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 5h 57m | \n",
+ " 21477.0 | \n",
+ " 7h 13m | \n",
+ " 3.2B | \n",
+ " 3212749824 | \n",
+ " 7h 12m | \n",
+ " 25939.885959 | \n",
+ " auto | \n",
+ " [2] | \n",
+ " 0.447 | \n",
+ " 0.418 | \n",
+ " 0.430833 | \n",
+ " 0.459044 | \n",
+ " 0.556443 | \n",
+ " 0.784709 | \n",
+ " 0.155394 | \n",
+ " 0.328125 | \n",
+ " 0.642153 | \n",
+ " 0.705437 | \n",
+ " 0.605184 | \n",
+ " 0.139058 | \n",
+ " 0.358 | \n",
+ " 0.755169 | \n",
+ " 0.545122 | \n",
+ " 0.932 | \n",
+ " 0.338943 | \n",
+ " 0.326805 | \n",
+ " 0.497579 | \n",
+ " 0.670876 | \n",
+ " 24.0 | \n",
+ " 20.0 | \n",
+ " 25.0 | \n",
+ " 22.0 | \n",
+ " 23.0 | \n",
+ " 21.0 | \n",
+ " 12.0 | \n",
+ " 27.0 | \n",
+ " 26.0 | \n",
+ " 25.0 | \n",
+ " 26.0 | \n",
+ " 28.0 | \n",
+ " 15.0 | \n",
+ " 21.0 | \n",
+ " 19.0 | \n",
+ " 12.0 | \n",
+ " 7.0 | \n",
+ " 17.0 | \n",
+ " 20.0 | \n",
+ " 14.0 | \n",
+ " 0.4688 | \n",
+ " 23 | \n",
+ " 0.6788 | \n",
+ " 30 | \n",
+ " 0.3438 | \n",
+ " 16 | \n",
+ " 0.5048 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " 22 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 6h 30m | \n",
+ " 23452.2 | \n",
+ " 7h 49m | \n",
+ " 3.1B | \n",
+ " 3085938688 | \n",
+ " 7h 48m | \n",
+ " 28089.516568 | \n",
+ " auto:4 | \n",
+ " [2, 64, 64, 64, 64] | \n",
+ " 0.562 | \n",
+ " 0.466 | \n",
+ " 0.494167 | \n",
+ " 0.482082 | \n",
+ " 0.249117 | \n",
+ " 0.801223 | \n",
+ " 0.077333 | \n",
+ " 0.321429 | \n",
+ " 0.101592 | \n",
+ " 0.749054 | \n",
+ " 0.654964 | \n",
+ " 0.008310 | \n",
+ " 0.422 | \n",
+ " 0.780740 | \n",
+ " 0.797913 | \n",
+ " 0.913 | \n",
+ " 0.300992 | \n",
+ " 0.416157 | \n",
+ " 0.586055 | \n",
+ " 0.692976 | \n",
+ " 41.0 | \n",
+ " 39.0 | \n",
+ " 22.0 | \n",
+ " 8.0 | \n",
+ " 15.0 | \n",
+ " 10.0 | \n",
+ " 14.0 | \n",
+ " 23.0 | \n",
+ " 19.0 | \n",
+ " 20.0 | \n",
+ " 24.0 | \n",
+ " 19.0 | \n",
+ " 24.0 | \n",
+ " 5.0 | \n",
+ " 13.0 | \n",
+ " 37.0 | \n",
+ " 21.0 | \n",
+ " 5.0 | \n",
+ " 7.0 | \n",
+ " 21.0 | \n",
+ " 0.3823 | \n",
+ " 32 | \n",
+ " 0.7367 | \n",
+ " 11 | \n",
+ " 0.3406 | \n",
+ " 17 | \n",
+ " 0.4939 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " 23 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 24h 38m | \n",
+ " 88696.2 | \n",
+ " 27h 23m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 27h 21m | \n",
+ " 98517.403245 | \n",
+ " auto | \n",
+ " [4] | \n",
+ " 0.387 | \n",
+ " 0.407 | \n",
+ " 0.382500 | \n",
+ " 0.502560 | \n",
+ " 0.672401 | \n",
+ " 0.745566 | \n",
+ " 0.043235 | \n",
+ " 0.308036 | \n",
+ " 0.847612 | \n",
+ " 0.652858 | \n",
+ " 0.579903 | \n",
+ " 0.050970 | \n",
+ " 0.392 | \n",
+ " 0.745375 | \n",
+ " 0.498078 | \n",
+ " 0.929 | \n",
+ " 0.218346 | \n",
+ " 0.320685 | \n",
+ " 0.483219 | \n",
+ " 0.647987 | \n",
+ " 6.0 | \n",
+ " 9.0 | \n",
+ " 18.0 | \n",
+ " 30.0 | \n",
+ " 26.0 | \n",
+ " 31.0 | \n",
+ " 17.0 | \n",
+ " 31.0 | \n",
+ " 29.0 | \n",
+ " 29.0 | \n",
+ " 33.0 | \n",
+ " 24.0 | \n",
+ " 17.0 | \n",
+ " 33.0 | \n",
+ " 23.0 | \n",
+ " 22.0 | \n",
+ " 33.0 | \n",
+ " 20.0 | \n",
+ " 23.0 | \n",
+ " 27.0 | \n",
+ " 0.5010 | \n",
+ " 18 | \n",
+ " 0.6587 | \n",
+ " 32 | \n",
+ " 0.2827 | \n",
+ " 29 | \n",
+ " 0.4907 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " 24 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 9h 8m | \n",
+ " 32906.4 | \n",
+ " 10h 8m | \n",
+ " 6.9B | \n",
+ " 6910365696 | \n",
+ " 10h 6m | \n",
+ " 36412.969244 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.423 | \n",
+ " 0.419 | \n",
+ " 0.420833 | \n",
+ " 0.496587 | \n",
+ " 0.454769 | \n",
+ " 0.833028 | \n",
+ " 0.103048 | \n",
+ " 0.292411 | \n",
+ " 0.463988 | \n",
+ " 0.777236 | \n",
+ " 0.498789 | \n",
+ " 0.063435 | \n",
+ " 0.460 | \n",
+ " 0.801415 | \n",
+ " 0.496980 | \n",
+ " 0.893 | \n",
+ " 0.311190 | \n",
+ " 0.348837 | \n",
+ " 0.478933 | \n",
+ " 0.701657 | \n",
+ " 29.0 | \n",
+ " 26.0 | \n",
+ " 20.0 | \n",
+ " 26.0 | \n",
+ " 22.0 | \n",
+ " 23.0 | \n",
+ " 21.0 | \n",
+ " 15.0 | \n",
+ " 11.0 | \n",
+ " 16.0 | \n",
+ " 18.0 | \n",
+ " 8.0 | \n",
+ " 29.0 | \n",
+ " 35.0 | \n",
+ " 32.0 | \n",
+ " 18.0 | \n",
+ " 15.0 | \n",
+ " 16.0 | \n",
+ " 24.0 | \n",
+ " 20.0 | \n",
+ " 0.4244 | \n",
+ " 27 | \n",
+ " 0.7090 | \n",
+ " 24 | \n",
+ " 0.3007 | \n",
+ " 23 | \n",
+ " 0.4869 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " 25 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 10h 36m | \n",
+ " 38179.2 | \n",
+ " 11h 47m | \n",
+ " 8.0B | \n",
+ " 8030261248 | \n",
+ " 11h 46m | \n",
+ " 42405.489811 | \n",
+ " auto:5 | \n",
+ " [1, 64, 64, 64, 64, 64] | \n",
+ " 0.404 | \n",
+ " 0.410 | \n",
+ " 0.388333 | \n",
+ " 0.423208 | \n",
+ " 0.603748 | \n",
+ " 0.828746 | \n",
+ " 0.071225 | \n",
+ " 0.274554 | \n",
+ " 0.624716 | \n",
+ " 0.742979 | \n",
+ " 0.532688 | \n",
+ " 0.058449 | \n",
+ " 0.410 | \n",
+ " 0.775843 | \n",
+ " 0.514735 | \n",
+ " 0.899 | \n",
+ " 0.194048 | \n",
+ " 0.321909 | \n",
+ " 0.504460 | \n",
+ " 0.677979 | \n",
+ " 26.0 | \n",
+ " 13.0 | \n",
+ " 31.0 | \n",
+ " 29.0 | \n",
+ " 25.0 | \n",
+ " 29.0 | \n",
+ " 26.0 | \n",
+ " 25.0 | \n",
+ " 20.0 | \n",
+ " 23.0 | \n",
+ " 20.0 | \n",
+ " 21.0 | \n",
+ " 27.0 | \n",
+ " 25.0 | \n",
+ " 27.0 | \n",
+ " 20.0 | \n",
+ " 26.0 | \n",
+ " 19.0 | \n",
+ " 18.0 | \n",
+ " 28.0 | \n",
+ " 0.4469 | \n",
+ " 26 | \n",
+ " 0.6928 | \n",
+ " 27 | \n",
+ " 0.2805 | \n",
+ " 30 | \n",
+ " 0.4830 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " 26 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 17h 38m | \n",
+ " 63506.4 | \n",
+ " 19h 22m | \n",
+ " 13.0B | \n",
+ " 13015864320 | \n",
+ " 19h 21m | \n",
+ " 69687.765642 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.377 | \n",
+ " 0.390 | \n",
+ " 0.385000 | \n",
+ " 0.489761 | \n",
+ " 0.477653 | \n",
+ " 0.806422 | \n",
+ " 0.030132 | \n",
+ " 0.254464 | \n",
+ " 0.229719 | \n",
+ " 0.793866 | \n",
+ " 0.520937 | \n",
+ " 0.236288 | \n",
+ " 0.452 | \n",
+ " 0.805223 | \n",
+ " 0.495332 | \n",
+ " 0.935 | \n",
+ " 0.608839 | \n",
+ " 0.259486 | \n",
+ " 0.368992 | \n",
+ " 0.722178 | \n",
+ " 36.0 | \n",
+ " 25.0 | \n",
+ " 21.0 | \n",
+ " 31.0 | \n",
+ " 29.0 | \n",
+ " 30.0 | \n",
+ " 32.0 | \n",
+ " 8.0 | \n",
+ " 8.0 | \n",
+ " 10.0 | \n",
+ " 22.0 | \n",
+ " 10.0 | \n",
+ " 13.0 | \n",
+ " 38.0 | \n",
+ " 31.0 | \n",
+ " 1.0 | \n",
+ " 38.0 | \n",
+ " 33.0 | \n",
+ " 40.0 | \n",
+ " 1.0 | \n",
+ " 0.3719 | \n",
+ " 33 | \n",
+ " 0.7157 | \n",
+ " 20 | \n",
+ " 0.3374 | \n",
+ " 18 | \n",
+ " 0.4819 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " 27 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 15h 37m | \n",
+ " 56271.6 | \n",
+ " 17h 9m | \n",
+ " 13.0B | \n",
+ " 13015864320 | \n",
+ " 17h 8m | \n",
+ " 61732.053618 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.430 | \n",
+ " 0.430 | \n",
+ " 0.414167 | \n",
+ " 0.501706 | \n",
+ " 0.477960 | \n",
+ " 0.816514 | \n",
+ " 0.091509 | \n",
+ " 0.299107 | \n",
+ " 0.347233 | \n",
+ " 0.796654 | \n",
+ " 0.531263 | \n",
+ " 0.103047 | \n",
+ " 0.440 | \n",
+ " 0.793254 | \n",
+ " 0.543840 | \n",
+ " 0.905 | \n",
+ " 0.272459 | \n",
+ " 0.280294 | \n",
+ " 0.439624 | \n",
+ " 0.711918 | \n",
+ " 31.0 | \n",
+ " 24.0 | \n",
+ " 19.0 | \n",
+ " 25.0 | \n",
+ " 21.0 | \n",
+ " 24.0 | \n",
+ " 19.0 | \n",
+ " 7.0 | \n",
+ " 15.0 | \n",
+ " 14.0 | \n",
+ " 21.0 | \n",
+ " 12.0 | \n",
+ " 26.0 | \n",
+ " 22.0 | \n",
+ " 28.0 | \n",
+ " 14.0 | \n",
+ " 17.0 | \n",
+ " 30.0 | \n",
+ " 33.0 | \n",
+ " 24.0 | \n",
+ " 0.4143 | \n",
+ " 28 | \n",
+ " 0.7153 | \n",
+ " 21 | \n",
+ " 0.2864 | \n",
+ " 26 | \n",
+ " 0.4813 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " 28 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 5h 43m | \n",
+ " 20637.0 | \n",
+ " 6h 29m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 6h 28m | \n",
+ " 23311.022941 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.445 | \n",
+ " 0.418 | \n",
+ " 0.410000 | \n",
+ " 0.437713 | \n",
+ " 0.556904 | \n",
+ " 0.778287 | \n",
+ " 0.041198 | \n",
+ " 0.334821 | \n",
+ " 0.786202 | \n",
+ " 0.602569 | \n",
+ " 0.526350 | \n",
+ " 0.032133 | \n",
+ " 0.360 | \n",
+ " 0.716540 | \n",
+ " 0.520959 | \n",
+ " 0.918 | \n",
+ " 0.059240 | \n",
+ " 0.288862 | \n",
+ " 0.456319 | \n",
+ " 0.599053 | \n",
+ " 10.0 | \n",
+ " 19.0 | \n",
+ " 28.0 | \n",
+ " 23.0 | \n",
+ " 23.0 | \n",
+ " 25.0 | \n",
+ " 10.0 | \n",
+ " 34.0 | \n",
+ " 33.0 | \n",
+ " 33.0 | \n",
+ " 28.0 | \n",
+ " 27.0 | \n",
+ " 19.0 | \n",
+ " 24.0 | \n",
+ " 29.0 | \n",
+ " 28.0 | \n",
+ " 35.0 | \n",
+ " 28.0 | \n",
+ " 29.0 | \n",
+ " 34.0 | \n",
+ " 0.4841 | \n",
+ " 21 | \n",
+ " 0.6422 | \n",
+ " 34 | \n",
+ " 0.2340 | \n",
+ " 35 | \n",
+ " 0.4644 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " 29 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 2h 36m | \n",
+ " 9398.4 | \n",
+ " 3h 21m | \n",
+ " 1.5B | \n",
+ " 1543714304 | \n",
+ " 3h 20m | \n",
+ " 12036.565195 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.448 | \n",
+ " 0.392 | \n",
+ " 0.431667 | \n",
+ " 0.468430 | \n",
+ " 0.369221 | \n",
+ " 0.781346 | \n",
+ " 0.039052 | \n",
+ " 0.283482 | \n",
+ " 0.319181 | \n",
+ " 0.682932 | \n",
+ " 0.600555 | \n",
+ " 0.041551 | \n",
+ " 0.406 | \n",
+ " 0.758433 | \n",
+ " 0.566722 | \n",
+ " 0.939 | \n",
+ " 0.282601 | \n",
+ " 0.312118 | \n",
+ " 0.465748 | \n",
+ " 0.627466 | \n",
+ " 33.0 | \n",
+ " 37.0 | \n",
+ " 23.0 | \n",
+ " 21.0 | \n",
+ " 28.0 | \n",
+ " 20.0 | \n",
+ " 24.0 | \n",
+ " 30.0 | \n",
+ " 24.0 | \n",
+ " 30.0 | \n",
+ " 27.0 | \n",
+ " 22.0 | \n",
+ " 11.0 | \n",
+ " 15.0 | \n",
+ " 21.0 | \n",
+ " 24.0 | \n",
+ " 36.0 | \n",
+ " 21.0 | \n",
+ " 27.0 | \n",
+ " 22.0 | \n",
+ " 0.3874 | \n",
+ " 31 | \n",
+ " 0.6803 | \n",
+ " 29 | \n",
+ " 0.2903 | \n",
+ " 25 | \n",
+ " 0.4608 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " 30 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 3h 36m | \n",
+ " 13010.4 | \n",
+ " 4h 26m | \n",
+ " 1.7B | \n",
+ " 1720574976 | \n",
+ " 4h 25m | \n",
+ " 15915.268575 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.410 | \n",
+ " 0.404 | \n",
+ " 0.434167 | \n",
+ " 0.434300 | \n",
+ " 0.482568 | \n",
+ " 0.776453 | \n",
+ " 0.075260 | \n",
+ " 0.290179 | \n",
+ " 0.689917 | \n",
+ " 0.603764 | \n",
+ " 0.553767 | \n",
+ " 0.022161 | \n",
+ " 0.376 | \n",
+ " 0.720348 | \n",
+ " 0.510525 | \n",
+ " 0.914 | \n",
+ " 0.134975 | \n",
+ " 0.294982 | \n",
+ " 0.458812 | \n",
+ " 0.608524 | \n",
+ " 20.0 | \n",
+ " 23.0 | \n",
+ " 29.0 | \n",
+ " 28.0 | \n",
+ " 27.0 | \n",
+ " 19.0 | \n",
+ " 22.0 | \n",
+ " 33.0 | \n",
+ " 32.0 | \n",
+ " 31.0 | \n",
+ " 30.0 | \n",
+ " 26.0 | \n",
+ " 23.0 | \n",
+ " 27.0 | \n",
+ " 25.0 | \n",
+ " 30.0 | \n",
+ " 23.0 | \n",
+ " 25.0 | \n",
+ " 28.0 | \n",
+ " 32.0 | \n",
+ " 0.4493 | \n",
+ " 25 | \n",
+ " 0.6442 | \n",
+ " 33 | \n",
+ " 0.2567 | \n",
+ " 34 | \n",
+ " 0.4597 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " 31 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 4h 57m | \n",
+ " 17861.4 | \n",
+ " 5h 38m | \n",
+ " 7.6B | \n",
+ " 7615616512 | \n",
+ " 5h 37m | \n",
+ " 20230.489569 | \n",
+ " auto | \n",
+ " [4] | \n",
+ " 0.431 | \n",
+ " 0.415 | \n",
+ " 0.429167 | \n",
+ " 0.430887 | \n",
+ " 0.614038 | \n",
+ " 0.606116 | \n",
+ " 0.027299 | \n",
+ " 0.287946 | \n",
+ " 0.890068 | \n",
+ " 0.588130 | \n",
+ " 0.537245 | \n",
+ " 0.019945 | \n",
+ " 0.334 | \n",
+ " 0.685528 | \n",
+ " 0.677467 | \n",
+ " 0.858 | \n",
+ " 0.007468 | \n",
+ " 0.298654 | \n",
+ " 0.475035 | \n",
+ " 0.579321 | \n",
+ " 2.0 | \n",
+ " 11.0 | \n",
+ " 30.0 | \n",
+ " 24.0 | \n",
+ " 24.0 | \n",
+ " 22.0 | \n",
+ " 23.0 | \n",
+ " 35.0 | \n",
+ " 35.0 | \n",
+ " 35.0 | \n",
+ " 39.0 | \n",
+ " 30.0 | \n",
+ " 32.0 | \n",
+ " 12.0 | \n",
+ " 26.0 | \n",
+ " 32.0 | \n",
+ " 40.0 | \n",
+ " 24.0 | \n",
+ " 25.0 | \n",
+ " 40.0 | \n",
+ " 0.4997 | \n",
+ " 20 | \n",
+ " 0.6184 | \n",
+ " 37 | \n",
+ " 0.2276 | \n",
+ " 36 | \n",
+ " 0.4596 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " 32 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 7m | \n",
+ " 22072.8 | \n",
+ " 6h 59m | \n",
+ " 6.7B | \n",
+ " 6738415616 | \n",
+ " 6h 57m | \n",
+ " 25079.294749 | \n",
+ " auto | \n",
+ " [4] | \n",
+ " 0.417 | \n",
+ " 0.410 | \n",
+ " 0.407500 | \n",
+ " 0.442833 | \n",
+ " 0.401321 | \n",
+ " 0.797859 | \n",
+ " 0.117497 | \n",
+ " 0.261161 | \n",
+ " 0.231994 | \n",
+ " 0.754830 | \n",
+ " 0.463609 | \n",
+ " 0.066759 | \n",
+ " 0.438 | \n",
+ " 0.771491 | \n",
+ " 0.580084 | \n",
+ " 0.878 | \n",
+ " 0.190370 | \n",
+ " 0.302326 | \n",
+ " 0.453217 | \n",
+ " 0.664562 | \n",
+ " 35.0 | \n",
+ " 33.0 | \n",
+ " 27.0 | \n",
+ " 27.0 | \n",
+ " 25.0 | \n",
+ " 26.0 | \n",
+ " 31.0 | \n",
+ " 21.0 | \n",
+ " 23.0 | \n",
+ " 26.0 | \n",
+ " 25.0 | \n",
+ " 13.0 | \n",
+ " 31.0 | \n",
+ " 14.0 | \n",
+ " 33.0 | \n",
+ " 17.0 | \n",
+ " 11.0 | \n",
+ " 22.0 | \n",
+ " 30.0 | \n",
+ " 29.0 | \n",
+ " 0.3674 | \n",
+ " 35 | \n",
+ " 0.6978 | \n",
+ " 25 | \n",
+ " 0.2656 | \n",
+ " 32 | \n",
+ " 0.4525 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " 33 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 4h 59m | \n",
+ " 17980.2 | \n",
+ " 5h 43m | \n",
+ " 6.7B | \n",
+ " 6738415616 | \n",
+ " 5h 42m | \n",
+ " 20539.258032 | \n",
+ " auto | \n",
+ " [4] | \n",
+ " 0.364 | \n",
+ " 0.372 | \n",
+ " 0.375833 | \n",
+ " 0.462457 | \n",
+ " 0.399017 | \n",
+ " 0.777370 | \n",
+ " 0.036335 | \n",
+ " 0.241071 | \n",
+ " 0.137983 | \n",
+ " 0.760008 | \n",
+ " 0.418530 | \n",
+ " 0.188920 | \n",
+ " 0.442 | \n",
+ " 0.790533 | \n",
+ " 0.499176 | \n",
+ " 0.910 | \n",
+ " 0.525078 | \n",
+ " 0.252142 | \n",
+ " 0.389716 | \n",
+ " 0.689818 | \n",
+ " 40.0 | \n",
+ " 34.0 | \n",
+ " 24.0 | \n",
+ " 33.0 | \n",
+ " 31.0 | \n",
+ " 33.0 | \n",
+ " 34.0 | \n",
+ " 18.0 | \n",
+ " 16.0 | \n",
+ " 21.0 | \n",
+ " 29.0 | \n",
+ " 11.0 | \n",
+ " 25.0 | \n",
+ " 31.0 | \n",
+ " 37.0 | \n",
+ " 2.0 | \n",
+ " 37.0 | \n",
+ " 34.0 | \n",
+ " 38.0 | \n",
+ " 6.0 | \n",
+ " 0.3361 | \n",
+ " 39 | \n",
+ " 0.6956 | \n",
+ " 26 | \n",
+ " 0.3018 | \n",
+ " 22 | \n",
+ " 0.4516 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " 34 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 6h 26m | \n",
+ " 23180.4 | \n",
+ " 7h 12m | \n",
+ " 6.9B | \n",
+ " 6910365696 | \n",
+ " 7h 11m | \n",
+ " 25877.186720 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.340 | \n",
+ " 0.363 | \n",
+ " 0.377500 | \n",
+ " 0.445392 | \n",
+ " 0.423744 | \n",
+ " 0.723547 | \n",
+ " 0.042181 | \n",
+ " 0.252232 | \n",
+ " 0.162244 | \n",
+ " 0.760605 | \n",
+ " 0.442814 | \n",
+ " 0.150970 | \n",
+ " 0.434 | \n",
+ " 0.797606 | \n",
+ " 0.495881 | \n",
+ " 0.915 | \n",
+ " 0.500390 | \n",
+ " 0.232558 | \n",
+ " 0.349214 | \n",
+ " 0.693765 | \n",
+ " 38.0 | \n",
+ " 30.0 | \n",
+ " 26.0 | \n",
+ " 37.0 | \n",
+ " 32.0 | \n",
+ " 32.0 | \n",
+ " 33.0 | \n",
+ " 17.0 | \n",
+ " 13.0 | \n",
+ " 19.0 | \n",
+ " 34.0 | \n",
+ " 15.0 | \n",
+ " 22.0 | \n",
+ " 37.0 | \n",
+ " 36.0 | \n",
+ " 11.0 | \n",
+ " 34.0 | \n",
+ " 36.0 | \n",
+ " 41.0 | \n",
+ " 9.0 | \n",
+ " 0.3377 | \n",
+ " 38 | \n",
+ " 0.6886 | \n",
+ " 28 | \n",
+ " 0.2864 | \n",
+ " 27 | \n",
+ " 0.4451 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " 35 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 7h 12m | \n",
+ " 25973.4 | \n",
+ " 8h 3m | \n",
+ " 6.9B | \n",
+ " 6910365696 | \n",
+ " 8h 2m | \n",
+ " 28925.110783 | \n",
+ " 3 | \n",
+ " [] | \n",
+ " 0.368 | \n",
+ " 0.389 | \n",
+ " 0.405000 | \n",
+ " 0.489761 | \n",
+ " 0.524651 | \n",
+ " 0.755963 | \n",
+ " 0.119027 | \n",
+ " 0.272321 | \n",
+ " 0.142532 | \n",
+ " 0.689604 | \n",
+ " 0.524996 | \n",
+ " 0.039335 | \n",
+ " 0.424 | \n",
+ " 0.750272 | \n",
+ " 0.498993 | \n",
+ " 0.928 | \n",
+ " 0.174654 | \n",
+ " 0.287638 | \n",
+ " 0.402884 | \n",
+ " 0.651144 | \n",
+ " 39.0 | \n",
+ " 22.0 | \n",
+ " 21.0 | \n",
+ " 32.0 | \n",
+ " 30.0 | \n",
+ " 28.0 | \n",
+ " 27.0 | \n",
+ " 28.0 | \n",
+ " 28.0 | \n",
+ " 28.0 | \n",
+ " 32.0 | \n",
+ " 18.0 | \n",
+ " 18.0 | \n",
+ " 32.0 | \n",
+ " 30.0 | \n",
+ " 26.0 | \n",
+ " 10.0 | \n",
+ " 29.0 | \n",
+ " 37.0 | \n",
+ " 31.0 | \n",
+ " 0.3702 | \n",
+ " 34 | \n",
+ " 0.6711 | \n",
+ " 31 | \n",
+ " 0.2581 | \n",
+ " 33 | \n",
+ " 0.4419 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " 36 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 2h 35m | \n",
+ " 9307.8 | \n",
+ " 3h 32m | \n",
+ " 1.2B | \n",
+ " 1235814400 | \n",
+ " 3h 30m | \n",
+ " 12653.736082 | \n",
+ " auto | \n",
+ " [2] | \n",
+ " 0.338 | \n",
+ " 0.334 | \n",
+ " 0.372500 | \n",
+ " 0.380546 | \n",
+ " 0.378129 | \n",
+ " 0.694801 | \n",
+ " 0.163484 | \n",
+ " 0.274554 | \n",
+ " 0.337377 | \n",
+ " 0.608843 | \n",
+ " 0.458909 | \n",
+ " 0.056510 | \n",
+ " 0.346 | \n",
+ " 0.742111 | \n",
+ " 0.494600 | \n",
+ " 0.897 | \n",
+ " 0.249944 | \n",
+ " 0.271726 | \n",
+ " 0.438300 | \n",
+ " 0.601421 | \n",
+ " 32.0 | \n",
+ " 36.0 | \n",
+ " 32.0 | \n",
+ " 38.0 | \n",
+ " 37.0 | \n",
+ " 34.0 | \n",
+ " 26.0 | \n",
+ " 32.0 | \n",
+ " 30.0 | \n",
+ " 32.0 | \n",
+ " 35.0 | \n",
+ " 29.0 | \n",
+ " 28.0 | \n",
+ " 40.0 | \n",
+ " 34.0 | \n",
+ " 21.0 | \n",
+ " 6.0 | \n",
+ " 31.0 | \n",
+ " 34.0 | \n",
+ " 25.0 | \n",
+ " 0.3450 | \n",
+ " 37 | \n",
+ " 0.6264 | \n",
+ " 36 | \n",
+ " 0.2731 | \n",
+ " 31 | \n",
+ " 0.4219 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " 37 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 4h 52m | \n",
+ " 17533.8 | \n",
+ " 6h 51m | \n",
+ " 999.9M | \n",
+ " 999885952 | \n",
+ " 6h 50m | \n",
+ " 24641.929494 | \n",
+ " auto | \n",
+ " [1] | \n",
+ " 0.332 | \n",
+ " 0.354 | \n",
+ " 0.356667 | \n",
+ " 0.380546 | \n",
+ " 0.382276 | \n",
+ " 0.758104 | \n",
+ " 0.076157 | \n",
+ " 0.265625 | \n",
+ " 0.247157 | \n",
+ " 0.578271 | \n",
+ " 0.385914 | \n",
+ " 0.035734 | \n",
+ " 0.388 | \n",
+ " 0.720892 | \n",
+ " 0.494051 | \n",
+ " 0.858 | \n",
+ " 0.189701 | \n",
+ " 0.246022 | \n",
+ " 0.387463 | \n",
+ " 0.589582 | \n",
+ " 34.0 | \n",
+ " 35.0 | \n",
+ " 32.0 | \n",
+ " 39.0 | \n",
+ " 34.0 | \n",
+ " 36.0 | \n",
+ " 30.0 | \n",
+ " 36.0 | \n",
+ " 31.0 | \n",
+ " 34.0 | \n",
+ " 31.0 | \n",
+ " 25.0 | \n",
+ " 32.0 | \n",
+ " 41.0 | \n",
+ " 39.0 | \n",
+ " 27.0 | \n",
+ " 22.0 | \n",
+ " 35.0 | \n",
+ " 39.0 | \n",
+ " 30.0 | \n",
+ " 0.3312 | \n",
+ " 40 | \n",
+ " 0.6267 | \n",
+ " 35 | \n",
+ " 0.2202 | \n",
+ " 38 | \n",
+ " 0.4013 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " 38 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 2h 52m | \n",
+ " 10353.6 | \n",
+ " 3h 42m | \n",
+ " 1.8B | \n",
+ " 1777088000 | \n",
+ " 3h 40m | \n",
+ " 13254.913052 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.356 | \n",
+ " 0.362 | \n",
+ " 0.362500 | \n",
+ " 0.346416 | \n",
+ " 0.405928 | \n",
+ " 0.680122 | \n",
+ " 0.050686 | \n",
+ " 0.272321 | \n",
+ " 0.701289 | \n",
+ " 0.446724 | \n",
+ " 0.360632 | \n",
+ " 0.006371 | \n",
+ " 0.308 | \n",
+ " 0.657780 | \n",
+ " 0.505400 | \n",
+ " 0.845 | \n",
+ " 0.009028 | \n",
+ " 0.293758 | \n",
+ " 0.451742 | \n",
+ " 0.549329 | \n",
+ " 19.0 | \n",
+ " 32.0 | \n",
+ " 34.0 | \n",
+ " 34.0 | \n",
+ " 33.0 | \n",
+ " 35.0 | \n",
+ " 27.0 | \n",
+ " 39.0 | \n",
+ " 37.0 | \n",
+ " 38.0 | \n",
+ " 36.0 | \n",
+ " 32.0 | \n",
+ " 33.0 | \n",
+ " 29.0 | \n",
+ " 41.0 | \n",
+ " 38.0 | \n",
+ " 32.0 | \n",
+ " 26.0 | \n",
+ " 31.0 | \n",
+ " 38.0 | \n",
+ " 0.4009 | \n",
+ " 30 | \n",
+ " 0.5703 | \n",
+ " 39 | \n",
+ " 0.1954 | \n",
+ " 41 | \n",
+ " 0.3986 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " 39 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 2h 39m | \n",
+ " 9542.4 | \n",
+ " 3h 26m | \n",
+ " 1.5B | \n",
+ " 1543714304 | \n",
+ " 3h 25m | \n",
+ " 12324.098490 | \n",
+ " auto:4 | \n",
+ " [6, 64, 64, 64, 64] | \n",
+ " 0.342 | \n",
+ " 0.341 | \n",
+ " 0.353333 | \n",
+ " 0.365188 | \n",
+ " 0.437260 | \n",
+ " 0.569419 | \n",
+ " 0.023086 | \n",
+ " 0.283482 | \n",
+ " 0.736922 | \n",
+ " 0.416550 | \n",
+ " 0.378792 | \n",
+ " 0.003878 | \n",
+ " 0.286 | \n",
+ " 0.613711 | \n",
+ " 0.497346 | \n",
+ " 0.718 | \n",
+ " 0.004291 | \n",
+ " 0.290086 | \n",
+ " 0.489501 | \n",
+ " 0.525651 | \n",
+ " 17.0 | \n",
+ " 28.0 | \n",
+ " 33.0 | \n",
+ " 36.0 | \n",
+ " 36.0 | \n",
+ " 37.0 | \n",
+ " 24.0 | \n",
+ " 40.0 | \n",
+ " 38.0 | \n",
+ " 39.0 | \n",
+ " 40.0 | \n",
+ " 33.0 | \n",
+ " 35.0 | \n",
+ " 34.0 | \n",
+ " 40.0 | \n",
+ " 39.0 | \n",
+ " 41.0 | \n",
+ " 27.0 | \n",
+ " 21.0 | \n",
+ " 41.0 | \n",
+ " 0.4085 | \n",
+ " 29 | \n",
+ " 0.5181 | \n",
+ " 41 | \n",
+ " 0.1983 | \n",
+ " 40 | \n",
+ " 0.3838 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " 40 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 2h 53m | \n",
+ " 10404.6 | \n",
+ " 3h 46m | \n",
+ " 596.0M | \n",
+ " 596049920 | \n",
+ " 3h 45m | \n",
+ " 13547.446141 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.343 | \n",
+ " 0.319 | \n",
+ " 0.344167 | \n",
+ " 0.342150 | \n",
+ " 0.414836 | \n",
+ " 0.639144 | \n",
+ " 0.060544 | \n",
+ " 0.270089 | \n",
+ " 0.412434 | \n",
+ " 0.471918 | \n",
+ " 0.401296 | \n",
+ " 0.020499 | \n",
+ " 0.320 | \n",
+ " 0.675190 | \n",
+ " 0.496064 | \n",
+ " 0.833 | \n",
+ " 0.019282 | \n",
+ " 0.270502 | \n",
+ " 0.427742 | \n",
+ " 0.551697 | \n",
+ " 30.0 | \n",
+ " 31.0 | \n",
+ " 35.0 | \n",
+ " 35.0 | \n",
+ " 38.0 | \n",
+ " 39.0 | \n",
+ " 28.0 | \n",
+ " 38.0 | \n",
+ " 36.0 | \n",
+ " 37.0 | \n",
+ " 38.0 | \n",
+ " 31.0 | \n",
+ " 34.0 | \n",
+ " 36.0 | \n",
+ " 38.0 | \n",
+ " 31.0 | \n",
+ " 28.0 | \n",
+ " 32.0 | \n",
+ " 35.0 | \n",
+ " 37.0 | \n",
+ " 0.3494 | \n",
+ " 36 | \n",
+ " 0.5696 | \n",
+ " 40 | \n",
+ " 0.2000 | \n",
+ " 39 | \n",
+ " 0.3816 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " 41 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 1h 48m | \n",
+ " 6532.8 | \n",
+ " 2h 35m | \n",
+ " 494.0M | \n",
+ " 494032768 | \n",
+ " 2h 34m | \n",
+ " 9253.074769 | \n",
+ " 6 | \n",
+ " [] | \n",
+ " 0.324 | \n",
+ " 0.342 | \n",
+ " 0.347500 | \n",
+ " 0.337031 | \n",
+ " 0.213792 | \n",
+ " 0.676758 | \n",
+ " 0.028644 | \n",
+ " 0.267857 | \n",
+ " 0.207733 | \n",
+ " 0.524099 | \n",
+ " 0.457556 | \n",
+ " 0.020499 | \n",
+ " 0.346 | \n",
+ " 0.704026 | \n",
+ " 0.536884 | \n",
+ " 0.883 | \n",
+ " 0.134195 | \n",
+ " 0.271726 | \n",
+ " 0.418387 | \n",
+ " 0.556433 | \n",
+ " 37.0 | \n",
+ " 40.0 | \n",
+ " 36.0 | \n",
+ " 40.0 | \n",
+ " 35.0 | \n",
+ " 38.0 | \n",
+ " 29.0 | \n",
+ " 37.0 | \n",
+ " 34.0 | \n",
+ " 36.0 | \n",
+ " 37.0 | \n",
+ " 29.0 | \n",
+ " 30.0 | \n",
+ " 23.0 | \n",
+ " 35.0 | \n",
+ " 31.0 | \n",
+ " 39.0 | \n",
+ " 31.0 | \n",
+ " 36.0 | \n",
+ " 33.0 | \n",
+ " 0.2914 | \n",
+ " 41 | \n",
+ " 0.6039 | \n",
+ " 38 | \n",
+ " 0.2218 | \n",
+ " 37 | \n",
+ " 0.3799 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Overall Rank Model Name GPU Util Time \\\n",
+ "0 1 google_gemma-3-12b-it 14h 8m \n",
+ "1 2 Qwen_Qwen3-14B (8bit) 17h 29m \n",
+ "2 3 openchat_openchat-3.6-8b-20240522 6h 59m \n",
+ "3 4 Qwen_Qwen3-8B 13h 44m \n",
+ "4 5 Qwen_Qwen2.5-7B-Instruct 8h 33m \n",
+ "5 6 Qwen_Qwen2.5-14B-Instruct (8bit) 29h 32m \n",
+ "6 7 01-ai_Yi-1.5-9B 10h 26m \n",
+ "7 8 Qwen_Qwen2.5-7B-Instruct-1M 10h 10m \n",
+ "8 9 meta-llama_Llama-3.1-8B-Instruct 10h 52m \n",
+ "9 10 01-ai_Yi-1.5-9B-Chat 12h 15m \n",
+ "10 11 mistralai_Ministral-8B-Instruct-2410 9h 27m \n",
+ "11 12 meta-llama_Meta-Llama-3-8B-Instruct 5h 46m \n",
+ "12 13 Qwen_Qwen3-4B 5h 3m \n",
+ "13 14 NousResearch_Hermes-2-Pro-Mistral-7B 7h 28m \n",
+ "14 15 mistralai_Mistral-7B-Instruct-v0.3 7h 41m \n",
+ "15 16 google_gemma-3-4b-it 3h 50m \n",
+ "16 17 01-ai_Yi-1.5-6B-Chat 7h 1m \n",
+ "17 18 01-ai_Yi-1.5-6B 3h 54m \n",
+ "18 19 Qwen_Qwen2-7B-Instruct 10h 11m \n",
+ "19 20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 15h 30m \n",
+ "20 21 meta-llama_Llama-3.2-3B-Instruct 5h 57m \n",
+ "21 22 Qwen_Qwen2.5-3B-Instruct 6h 30m \n",
+ "22 23 Qwen_Qwen2.5-Math-7B 24h 38m \n",
+ "23 24 deepseek-ai_deepseek-llm-7b-chat 9h 8m \n",
+ "24 25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 10h 36m \n",
+ "25 26 meta-llama_Llama-2-13b-hf 17h 38m \n",
+ "26 27 meta-llama_Llama-2-13b-chat-hf 15h 37m \n",
+ "27 28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 5h 43m \n",
+ "28 29 Qwen_Qwen2.5-1.5B-Instruct 2h 36m \n",
+ "29 30 Qwen_Qwen3-1.7B 3h 36m \n",
+ "30 31 Qwen_Qwen2.5-Math-7B-Instruct 4h 57m \n",
+ "31 32 meta-llama_Llama-2-7b-chat-hf 6h 7m \n",
+ "32 33 meta-llama_Llama-2-7b-hf 4h 59m \n",
+ "33 34 deepseek-ai_deepseek-llm-7b-base 6h 26m \n",
+ "34 35 deepseek-ai_deepseek-math-7b-rl 7h 12m \n",
+ "35 36 meta-llama_Llama-3.2-1B-Instruct 2h 35m \n",
+ "36 37 google_gemma-3-1b-it 4h 52m \n",
+ "37 38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 2h 52m \n",
+ "38 39 Qwen_Qwen2.5-Math-1.5B-Instruct 2h 39m \n",
+ "39 40 Qwen_Qwen3-0.6B 2h 53m \n",
+ "40 41 Qwen_Qwen2.5-0.5B-Instruct 1h 48m \n",
+ "\n",
+ " gpu_util_time_raw full_time_from_gpu_log Parameters parameters_raw \\\n",
+ "0 50906.4 15h 47m 12.2B 12187325040 \n",
+ "1 62956.2 29h 46m 14.8B 14768307200 \n",
+ "2 25150.8 7h 52m 8.0B 8030261248 \n",
+ "3 49497.0 15h 33m 8.2B 8190735360 \n",
+ "4 30831.6 9h 38m 7.6B 7615616512 \n",
+ "5 106374.6 52h 45m 14.8B 14770033664 \n",
+ "6 37569.6 11h 44m 8.8B 8829407232 \n",
+ "7 36621.0 11h 18m 7.6B 7615616512 \n",
+ "8 39147.6 12h 20m 8.0B 8030261248 \n",
+ "9 44120.4 13h 55m 8.8B 8829407232 \n",
+ "10 34053.6 10h 47m 8.0B 8019808256 \n",
+ "11 20809.8 6h 31m 8.0B 8030261248 \n",
+ "12 18234.6 5h 52m 4.0B 4022468096 \n",
+ "13 26916.0 8h 28m 7.2B 7241994240 \n",
+ "14 27676.8 8h 39m 7.2B 7248023552 \n",
+ "15 13811.4 4h 52m 4.3B 4300079472 \n",
+ "16 25318.8 8h 5m 6.1B 6061035520 \n",
+ "17 14091.6 4h 29m 6.1B 6061035520 \n",
+ "18 36684.6 11h 31m 7.6B 7615616512 \n",
+ "19 55855.2 17h 59m 8.2B 8190735360 \n",
+ "20 21477.0 7h 13m 3.2B 3212749824 \n",
+ "21 23452.2 7h 49m 3.1B 3085938688 \n",
+ "22 88696.2 27h 23m 7.6B 7615616512 \n",
+ "23 32906.4 10h 8m 6.9B 6910365696 \n",
+ "24 38179.2 11h 47m 8.0B 8030261248 \n",
+ "25 63506.4 19h 22m 13.0B 13015864320 \n",
+ "26 56271.6 17h 9m 13.0B 13015864320 \n",
+ "27 20637.0 6h 29m 7.6B 7615616512 \n",
+ "28 9398.4 3h 21m 1.5B 1543714304 \n",
+ "29 13010.4 4h 26m 1.7B 1720574976 \n",
+ "30 17861.4 5h 38m 7.6B 7615616512 \n",
+ "31 22072.8 6h 59m 6.7B 6738415616 \n",
+ "32 17980.2 5h 43m 6.7B 6738415616 \n",
+ "33 23180.4 7h 12m 6.9B 6910365696 \n",
+ "34 25973.4 8h 3m 6.9B 6910365696 \n",
+ "35 9307.8 3h 32m 1.2B 1235814400 \n",
+ "36 17533.8 6h 51m 999.9M 999885952 \n",
+ "37 10353.6 3h 42m 1.8B 1777088000 \n",
+ "38 9542.4 3h 26m 1.5B 1543714304 \n",
+ "39 10404.6 3h 46m 596.0M 596049920 \n",
+ "40 6532.8 2h 35m 494.0M 494032768 \n",
+ "\n",
+ " Total Time total_time_raw batch_size batch_sizes \\\n",
+ "0 15h 45m 56750.865892 auto [2] \n",
+ "1 29h 45m 107151.802065 1 [] \n",
+ "2 7h 51m 28278.859470 3 [] \n",
+ "3 15h 31m 55918.467860 auto [1] \n",
+ "4 9h 36m 34616.604248 3 [] \n",
+ "5 52h 44m 189869.409404 1 [] \n",
+ "6 11h 43m 42212.112622 2 [] \n",
+ "7 11h 17m 40632.813397 auto [1] \n",
+ "8 12h 19m 44363.249360 auto [1] \n",
+ "9 13h 54m 50056.331345 2 [] \n",
+ "10 10h 46m 38770.339256 auto [1] \n",
+ "11 6h 30m 23440.234421 3 [] \n",
+ "12 5h 51m 21077.943646 6 [] \n",
+ "13 8h 27m 30434.329021 3 [] \n",
+ "14 8h 38m 31084.838324 3 [] \n",
+ "15 4h 51m 17460.233507 auto [4] \n",
+ "16 8h 4m 29040.429802 2 [] \n",
+ "17 4h 28m 16094.199661 auto [8] \n",
+ "18 11h 30m 41431.857967 auto [1] \n",
+ "19 17h 57m 64675.539163 auto [1] \n",
+ "20 7h 12m 25939.885959 auto [2] \n",
+ "21 7h 48m 28089.516568 auto:4 [2, 64, 64, 64, 64] \n",
+ "22 27h 21m 98517.403245 auto [4] \n",
+ "23 10h 6m 36412.969244 3 [] \n",
+ "24 11h 46m 42405.489811 auto:5 [1, 64, 64, 64, 64, 64] \n",
+ "25 19h 21m 69687.765642 auto [1] \n",
+ "26 17h 8m 61732.053618 auto [1] \n",
+ "27 6h 28m 23311.022941 3 [] \n",
+ "28 3h 20m 12036.565195 6 [] \n",
+ "29 4h 25m 15915.268575 6 [] \n",
+ "30 5h 37m 20230.489569 auto [4] \n",
+ "31 6h 57m 25079.294749 auto [4] \n",
+ "32 5h 42m 20539.258032 auto [4] \n",
+ "33 7h 11m 25877.186720 3 [] \n",
+ "34 8h 2m 28925.110783 3 [] \n",
+ "35 3h 30m 12653.736082 auto [2] \n",
+ "36 6h 50m 24641.929494 auto [1] \n",
+ "37 3h 40m 13254.913052 6 [] \n",
+ "38 3h 25m 12324.098490 auto:4 [6, 64, 64, 64, 64] \n",
+ "39 3h 45m 13547.446141 6 [] \n",
+ "40 2h 34m 9253.074769 6 [] \n",
+ "\n",
+ " anli_r1(acc,none) anli_r2(acc,none) anli_r3(acc,none) \\\n",
+ "0 0.603 0.560 0.595833 \n",
+ "1 0.646 0.570 0.556667 \n",
+ "2 0.556 0.513 0.480000 \n",
+ "3 0.669 0.542 0.555833 \n",
+ "4 0.685 0.549 0.552500 \n",
+ "5 0.721 0.634 0.617500 \n",
+ "6 0.532 0.480 0.439167 \n",
+ "7 0.585 0.533 0.556667 \n",
+ "8 0.482 0.467 0.443333 \n",
+ "9 0.535 0.509 0.525833 \n",
+ "10 0.488 0.487 0.465833 \n",
+ "11 0.484 0.458 0.448333 \n",
+ "12 0.550 0.461 0.513333 \n",
+ "13 0.531 0.496 0.500000 \n",
+ "14 0.476 0.443 0.448333 \n",
+ "15 0.492 0.471 0.468333 \n",
+ "16 0.477 0.453 0.460000 \n",
+ "17 0.448 0.407 0.406667 \n",
+ "18 0.573 0.525 0.522500 \n",
+ "19 0.511 0.464 0.476667 \n",
+ "20 0.447 0.418 0.430833 \n",
+ "21 0.562 0.466 0.494167 \n",
+ "22 0.387 0.407 0.382500 \n",
+ "23 0.423 0.419 0.420833 \n",
+ "24 0.404 0.410 0.388333 \n",
+ "25 0.377 0.390 0.385000 \n",
+ "26 0.430 0.430 0.414167 \n",
+ "27 0.445 0.418 0.410000 \n",
+ "28 0.448 0.392 0.431667 \n",
+ "29 0.410 0.404 0.434167 \n",
+ "30 0.431 0.415 0.429167 \n",
+ "31 0.417 0.410 0.407500 \n",
+ "32 0.364 0.372 0.375833 \n",
+ "33 0.340 0.363 0.377500 \n",
+ "34 0.368 0.389 0.405000 \n",
+ "35 0.338 0.334 0.372500 \n",
+ "36 0.332 0.354 0.356667 \n",
+ "37 0.356 0.362 0.362500 \n",
+ "38 0.342 0.341 0.353333 \n",
+ "39 0.343 0.319 0.344167 \n",
+ "40 0.324 0.342 0.347500 \n",
+ "\n",
+ " arc_challenge(acc_norm,none) bbh(exact_match,get-answer) \\\n",
+ "0 0.610922 0.801874 \n",
+ "1 0.600683 0.432960 \n",
+ "2 0.603242 0.617877 \n",
+ "3 0.562287 0.797573 \n",
+ "4 0.552901 0.448779 \n",
+ "5 0.615188 0.106896 \n",
+ "6 0.546928 0.712026 \n",
+ "7 0.585324 0.277223 \n",
+ "8 0.550341 0.715558 \n",
+ "9 0.587031 0.610659 \n",
+ "10 0.562287 0.692520 \n",
+ "11 0.563993 0.679005 \n",
+ "12 0.539249 0.752265 \n",
+ "13 0.565700 0.573798 \n",
+ "14 0.589590 0.562586 \n",
+ "15 0.570819 0.709415 \n",
+ "16 0.539249 0.547842 \n",
+ "17 0.496587 0.575488 \n",
+ "18 0.540102 0.577484 \n",
+ "19 0.549488 0.584088 \n",
+ "20 0.459044 0.556443 \n",
+ "21 0.482082 0.249117 \n",
+ "22 0.502560 0.672401 \n",
+ "23 0.496587 0.454769 \n",
+ "24 0.423208 0.603748 \n",
+ "25 0.489761 0.477653 \n",
+ "26 0.501706 0.477960 \n",
+ "27 0.437713 0.556904 \n",
+ "28 0.468430 0.369221 \n",
+ "29 0.434300 0.482568 \n",
+ "30 0.430887 0.614038 \n",
+ "31 0.442833 0.401321 \n",
+ "32 0.462457 0.399017 \n",
+ "33 0.445392 0.423744 \n",
+ "34 0.489761 0.524651 \n",
+ "35 0.380546 0.378129 \n",
+ "36 0.380546 0.382276 \n",
+ "37 0.346416 0.405928 \n",
+ "38 0.365188 0.437260 \n",
+ "39 0.342150 0.414836 \n",
+ "40 0.337031 0.213792 \n",
+ "\n",
+ " boolq(acc,none) drop(f1,none) gpqa_main_zeroshot(acc_norm,none) \\\n",
+ "0 0.874618 0.139566 0.337054 \n",
+ "1 0.891743 0.090410 0.397321 \n",
+ "2 0.872783 0.251569 0.332589 \n",
+ "3 0.865749 0.109877 0.350446 \n",
+ "4 0.863303 0.071089 0.328125 \n",
+ "5 0.886239 0.071276 0.354911 \n",
+ "6 0.858104 0.445686 0.294643 \n",
+ "7 0.852599 0.057047 0.339286 \n",
+ "8 0.841590 0.193729 0.343750 \n",
+ "9 0.868196 0.125326 0.303571 \n",
+ "10 0.860245 0.071413 0.341518 \n",
+ "11 0.831193 0.163977 0.310268 \n",
+ "12 0.850459 0.097707 0.325893 \n",
+ "13 0.868196 0.109754 0.276786 \n",
+ "14 0.858410 0.089972 0.283482 \n",
+ "15 0.839755 0.089284 0.287946 \n",
+ "16 0.847401 0.116081 0.357143 \n",
+ "17 0.801529 0.399462 0.290179 \n",
+ "18 0.856269 0.052028 0.314732 \n",
+ "19 0.848318 0.053279 0.372768 \n",
+ "20 0.784709 0.155394 0.328125 \n",
+ "21 0.801223 0.077333 0.321429 \n",
+ "22 0.745566 0.043235 0.308036 \n",
+ "23 0.833028 0.103048 0.292411 \n",
+ "24 0.828746 0.071225 0.274554 \n",
+ "25 0.806422 0.030132 0.254464 \n",
+ "26 0.816514 0.091509 0.299107 \n",
+ "27 0.778287 0.041198 0.334821 \n",
+ "28 0.781346 0.039052 0.283482 \n",
+ "29 0.776453 0.075260 0.290179 \n",
+ "30 0.606116 0.027299 0.287946 \n",
+ "31 0.797859 0.117497 0.261161 \n",
+ "32 0.777370 0.036335 0.241071 \n",
+ "33 0.723547 0.042181 0.252232 \n",
+ "34 0.755963 0.119027 0.272321 \n",
+ "35 0.694801 0.163484 0.274554 \n",
+ "36 0.758104 0.076157 0.265625 \n",
+ "37 0.680122 0.050686 0.272321 \n",
+ "38 0.569419 0.023086 0.283482 \n",
+ "39 0.639144 0.060544 0.270089 \n",
+ "40 0.676758 0.028644 0.267857 \n",
+ "\n",
+ " gsm8k(exact_match,strict-match) hellaswag(acc_norm,none) mmlu(acc,none) \\\n",
+ "0 0.877180 0.818761 0.716137 \n",
+ "1 0.898408 0.787692 0.769477 \n",
+ "2 0.750569 0.797849 0.643071 \n",
+ "3 0.872631 0.748656 0.728956 \n",
+ "4 0.762699 0.804919 0.718060 \n",
+ "5 0.792267 0.841964 0.783079 \n",
+ "6 0.639121 0.778929 0.689289 \n",
+ "7 0.795299 0.789982 0.716636 \n",
+ "8 0.754359 0.792073 0.679319 \n",
+ "9 0.708112 0.787293 0.684091 \n",
+ "10 0.774829 0.791077 0.640721 \n",
+ "11 0.756634 0.759211 0.638727 \n",
+ "12 0.856710 0.683330 0.683592 \n",
+ "13 0.685368 0.804919 0.605113 \n",
+ "14 0.489765 0.828919 0.597137 \n",
+ "15 0.761941 0.741386 0.575559 \n",
+ "16 0.670205 0.767477 0.617861 \n",
+ "17 0.522365 0.754133 0.624270 \n",
+ "18 0.646702 0.806015 0.699402 \n",
+ "19 0.812737 0.756423 0.682951 \n",
+ "20 0.642153 0.705437 0.605184 \n",
+ "21 0.101592 0.749054 0.654964 \n",
+ "22 0.847612 0.652858 0.579903 \n",
+ "23 0.463988 0.777236 0.498789 \n",
+ "24 0.624716 0.742979 0.532688 \n",
+ "25 0.229719 0.793866 0.520937 \n",
+ "26 0.347233 0.796654 0.531263 \n",
+ "27 0.786202 0.602569 0.526350 \n",
+ "28 0.319181 0.682932 0.600555 \n",
+ "29 0.689917 0.603764 0.553767 \n",
+ "30 0.890068 0.588130 0.537245 \n",
+ "31 0.231994 0.754830 0.463609 \n",
+ "32 0.137983 0.760008 0.418530 \n",
+ "33 0.162244 0.760605 0.442814 \n",
+ "34 0.142532 0.689604 0.524996 \n",
+ "35 0.337377 0.608843 0.458909 \n",
+ "36 0.247157 0.578271 0.385914 \n",
+ "37 0.701289 0.446724 0.360632 \n",
+ "38 0.736922 0.416550 0.378792 \n",
+ "39 0.412434 0.471918 0.401296 \n",
+ "40 0.207733 0.524099 0.457556 \n",
+ "\n",
+ " nq_open(exact_match,remove_whitespace) openbookqa(acc_norm,none) \\\n",
+ "0 0.157064 0.498 \n",
+ "1 0.092244 0.460 \n",
+ "2 0.170637 0.462 \n",
+ "3 0.073684 0.418 \n",
+ "4 0.045706 0.486 \n",
+ "5 0.061496 0.476 \n",
+ "6 0.153186 0.456 \n",
+ "7 0.157618 0.480 \n",
+ "8 0.177562 0.432 \n",
+ "9 0.009418 0.436 \n",
+ "10 0.157618 0.466 \n",
+ "11 0.159003 0.430 \n",
+ "12 0.014681 0.402 \n",
+ "13 0.040443 0.434 \n",
+ "14 0.153740 0.470 \n",
+ "15 0.109418 0.466 \n",
+ "16 0.027147 0.436 \n",
+ "17 0.178116 0.422 \n",
+ "18 0.013296 0.462 \n",
+ "19 0.018283 0.430 \n",
+ "20 0.139058 0.358 \n",
+ "21 0.008310 0.422 \n",
+ "22 0.050970 0.392 \n",
+ "23 0.063435 0.460 \n",
+ "24 0.058449 0.410 \n",
+ "25 0.236288 0.452 \n",
+ "26 0.103047 0.440 \n",
+ "27 0.032133 0.360 \n",
+ "28 0.041551 0.406 \n",
+ "29 0.022161 0.376 \n",
+ "30 0.019945 0.334 \n",
+ "31 0.066759 0.438 \n",
+ "32 0.188920 0.442 \n",
+ "33 0.150970 0.434 \n",
+ "34 0.039335 0.424 \n",
+ "35 0.056510 0.346 \n",
+ "36 0.035734 0.388 \n",
+ "37 0.006371 0.308 \n",
+ "38 0.003878 0.286 \n",
+ "39 0.020499 0.320 \n",
+ "40 0.020499 0.346 \n",
+ "\n",
+ " piqa(acc_norm,none) qnli(acc,none) sciq(acc_norm,none) \\\n",
+ "0 0.780740 0.745744 0.954 \n",
+ "1 0.794886 0.844225 0.966 \n",
+ "2 0.818281 0.730002 0.964 \n",
+ "3 0.775299 0.781805 0.958 \n",
+ "4 0.803047 0.804503 0.937 \n",
+ "5 0.817193 0.853926 0.929 \n",
+ "6 0.806311 0.508695 0.952 \n",
+ "7 0.816104 0.678199 0.950 \n",
+ "8 0.806311 0.501373 0.962 \n",
+ "9 0.803591 0.787662 0.954 \n",
+ "10 0.823177 0.494966 0.956 \n",
+ "11 0.787269 0.546403 0.932 \n",
+ "12 0.751360 0.808713 0.932 \n",
+ "13 0.798694 0.556471 0.917 \n",
+ "14 0.826986 0.514552 0.943 \n",
+ "15 0.772035 0.565989 0.931 \n",
+ "16 0.787813 0.679480 0.934 \n",
+ "17 0.801415 0.598572 0.941 \n",
+ "18 0.805767 0.547135 0.916 \n",
+ "19 0.756801 0.557752 0.941 \n",
+ "20 0.755169 0.545122 0.932 \n",
+ "21 0.780740 0.797913 0.913 \n",
+ "22 0.745375 0.498078 0.929 \n",
+ "23 0.801415 0.496980 0.893 \n",
+ "24 0.775843 0.514735 0.899 \n",
+ "25 0.805223 0.495332 0.935 \n",
+ "26 0.793254 0.543840 0.905 \n",
+ "27 0.716540 0.520959 0.918 \n",
+ "28 0.758433 0.566722 0.939 \n",
+ "29 0.720348 0.510525 0.914 \n",
+ "30 0.685528 0.677467 0.858 \n",
+ "31 0.771491 0.580084 0.878 \n",
+ "32 0.790533 0.499176 0.910 \n",
+ "33 0.797606 0.495881 0.915 \n",
+ "34 0.750272 0.498993 0.928 \n",
+ "35 0.742111 0.494600 0.897 \n",
+ "36 0.720892 0.494051 0.858 \n",
+ "37 0.657780 0.505400 0.845 \n",
+ "38 0.613711 0.497346 0.718 \n",
+ "39 0.675190 0.496064 0.833 \n",
+ "40 0.704026 0.536884 0.883 \n",
+ "\n",
+ " triviaqa(exact_match,remove_whitespace) truthfulqa_mc1(acc,none) \\\n",
+ "0 0.275245 0.405141 \n",
+ "1 0.407490 0.406365 \n",
+ "2 0.565927 0.352509 \n",
+ "3 0.320609 0.363525 \n",
+ "4 0.325401 0.477356 \n",
+ "5 0.039289 0.510404 \n",
+ "6 0.543803 0.321909 \n",
+ "7 0.420531 0.425949 \n",
+ "8 0.518168 0.365973 \n",
+ "9 0.338665 0.374541 \n",
+ "10 0.527809 0.325581 \n",
+ "11 0.511202 0.363525 \n",
+ "12 0.225033 0.367197 \n",
+ "13 0.471132 0.413709 \n",
+ "14 0.568324 0.421053 \n",
+ "15 0.314813 0.348837 \n",
+ "16 0.330974 0.376989 \n",
+ "17 0.495207 0.299878 \n",
+ "18 0.008136 0.405141 \n",
+ "19 0.029481 0.357405 \n",
+ "20 0.338943 0.326805 \n",
+ "21 0.300992 0.416157 \n",
+ "22 0.218346 0.320685 \n",
+ "23 0.311190 0.348837 \n",
+ "24 0.194048 0.321909 \n",
+ "25 0.608839 0.259486 \n",
+ "26 0.272459 0.280294 \n",
+ "27 0.059240 0.288862 \n",
+ "28 0.282601 0.312118 \n",
+ "29 0.134975 0.294982 \n",
+ "30 0.007468 0.298654 \n",
+ "31 0.190370 0.302326 \n",
+ "32 0.525078 0.252142 \n",
+ "33 0.500390 0.232558 \n",
+ "34 0.174654 0.287638 \n",
+ "35 0.249944 0.271726 \n",
+ "36 0.189701 0.246022 \n",
+ "37 0.009028 0.293758 \n",
+ "38 0.004291 0.290086 \n",
+ "39 0.019282 0.270502 \n",
+ "40 0.134195 0.271726 \n",
+ "\n",
+ " truthfulqa_mc2(acc,none) winogrande(acc,none) \\\n",
+ "0 0.581183 0.744278 \n",
+ "1 0.589404 0.720600 \n",
+ "2 0.497601 0.763220 \n",
+ "3 0.543140 0.680347 \n",
+ "4 0.648483 0.711918 \n",
+ "5 0.683015 0.754538 \n",
+ "6 0.467572 0.726125 \n",
+ "7 0.600072 0.727703 \n",
+ "8 0.541154 0.738753 \n",
+ "9 0.547934 0.746646 \n",
+ "10 0.486670 0.737964 \n",
+ "11 0.517142 0.716654 \n",
+ "12 0.547575 0.658248 \n",
+ "13 0.591156 0.719811 \n",
+ "14 0.596813 0.740331 \n",
+ "15 0.518821 0.700868 \n",
+ "16 0.534371 0.709550 \n",
+ "17 0.440750 0.720600 \n",
+ "18 0.573437 0.698500 \n",
+ "19 0.559013 0.675612 \n",
+ "20 0.497579 0.670876 \n",
+ "21 0.586055 0.692976 \n",
+ "22 0.483219 0.647987 \n",
+ "23 0.478933 0.701657 \n",
+ "24 0.504460 0.677979 \n",
+ "25 0.368992 0.722178 \n",
+ "26 0.439624 0.711918 \n",
+ "27 0.456319 0.599053 \n",
+ "28 0.465748 0.627466 \n",
+ "29 0.458812 0.608524 \n",
+ "30 0.475035 0.579321 \n",
+ "31 0.453217 0.664562 \n",
+ "32 0.389716 0.689818 \n",
+ "33 0.349214 0.693765 \n",
+ "34 0.402884 0.651144 \n",
+ "35 0.438300 0.601421 \n",
+ "36 0.387463 0.589582 \n",
+ "37 0.451742 0.549329 \n",
+ "38 0.489501 0.525651 \n",
+ "39 0.427742 0.551697 \n",
+ "40 0.418387 0.556433 \n",
+ "\n",
+ " gsm8k(exact_match,strict-match)_rank bbh(exact_match,get-answer)_rank \\\n",
+ "0 3.0 1.0 \n",
+ "1 1.0 29.0 \n",
+ "2 16.0 10.0 \n",
+ "3 4.0 2.0 \n",
+ "4 12.0 27.0 \n",
+ "5 9.0 41.0 \n",
+ "6 25.0 5.0 \n",
+ "7 8.0 38.0 \n",
+ "8 15.0 4.0 \n",
+ "9 18.0 12.0 \n",
+ "10 11.0 7.0 \n",
+ "11 14.0 8.0 \n",
+ "12 5.0 3.0 \n",
+ "13 21.0 17.0 \n",
+ "14 28.0 18.0 \n",
+ "15 13.0 6.0 \n",
+ "16 22.0 21.0 \n",
+ "17 27.0 16.0 \n",
+ "18 23.0 15.0 \n",
+ "19 7.0 14.0 \n",
+ "20 24.0 20.0 \n",
+ "21 41.0 39.0 \n",
+ "22 6.0 9.0 \n",
+ "23 29.0 26.0 \n",
+ "24 26.0 13.0 \n",
+ "25 36.0 25.0 \n",
+ "26 31.0 24.0 \n",
+ "27 10.0 19.0 \n",
+ "28 33.0 37.0 \n",
+ "29 20.0 23.0 \n",
+ "30 2.0 11.0 \n",
+ "31 35.0 33.0 \n",
+ "32 40.0 34.0 \n",
+ "33 38.0 30.0 \n",
+ "34 39.0 22.0 \n",
+ "35 32.0 36.0 \n",
+ "36 34.0 35.0 \n",
+ "37 19.0 32.0 \n",
+ "38 17.0 28.0 \n",
+ "39 30.0 31.0 \n",
+ "40 37.0 40.0 \n",
+ "\n",
+ " arc_challenge(acc_norm,none)_rank anli_r1(acc,none)_rank \\\n",
+ "0 2.0 5.0 \n",
+ "1 4.0 4.0 \n",
+ "2 3.0 9.0 \n",
+ "3 11.0 3.0 \n",
+ "4 12.0 2.0 \n",
+ "5 1.0 1.0 \n",
+ "6 15.0 12.0 \n",
+ "7 7.0 6.0 \n",
+ "8 13.0 18.0 \n",
+ "9 6.0 11.0 \n",
+ "10 11.0 16.0 \n",
+ "11 10.0 17.0 \n",
+ "12 17.0 10.0 \n",
+ "13 9.0 13.0 \n",
+ "14 5.0 20.0 \n",
+ "15 8.0 15.0 \n",
+ "16 17.0 19.0 \n",
+ "17 20.0 21.0 \n",
+ "18 16.0 7.0 \n",
+ "19 14.0 14.0 \n",
+ "20 25.0 22.0 \n",
+ "21 22.0 8.0 \n",
+ "22 18.0 30.0 \n",
+ "23 20.0 26.0 \n",
+ "24 31.0 29.0 \n",
+ "25 21.0 31.0 \n",
+ "26 19.0 25.0 \n",
+ "27 28.0 23.0 \n",
+ "28 23.0 21.0 \n",
+ "29 29.0 28.0 \n",
+ "30 30.0 24.0 \n",
+ "31 27.0 27.0 \n",
+ "32 24.0 33.0 \n",
+ "33 26.0 37.0 \n",
+ "34 21.0 32.0 \n",
+ "35 32.0 38.0 \n",
+ "36 32.0 39.0 \n",
+ "37 34.0 34.0 \n",
+ "38 33.0 36.0 \n",
+ "39 35.0 35.0 \n",
+ "40 36.0 40.0 \n",
+ "\n",
+ " anli_r2(acc,none)_rank anli_r3(acc,none)_rank \\\n",
+ "0 3.0 2.0 \n",
+ "1 2.0 3.0 \n",
+ "2 8.0 11.0 \n",
+ "3 5.0 4.0 \n",
+ "4 4.0 5.0 \n",
+ "5 1.0 1.0 \n",
+ "6 12.0 18.0 \n",
+ "7 6.0 3.0 \n",
+ "8 14.0 17.0 \n",
+ "9 9.0 6.0 \n",
+ "10 11.0 14.0 \n",
+ "11 18.0 16.0 \n",
+ "12 17.0 8.0 \n",
+ "13 10.0 9.0 \n",
+ "14 20.0 16.0 \n",
+ "15 13.0 13.0 \n",
+ "16 19.0 15.0 \n",
+ "17 26.0 27.0 \n",
+ "18 7.0 7.0 \n",
+ "19 16.0 12.0 \n",
+ "20 23.0 21.0 \n",
+ "21 15.0 10.0 \n",
+ "22 26.0 31.0 \n",
+ "23 22.0 23.0 \n",
+ "24 25.0 29.0 \n",
+ "25 29.0 30.0 \n",
+ "26 21.0 24.0 \n",
+ "27 23.0 25.0 \n",
+ "28 28.0 20.0 \n",
+ "29 27.0 19.0 \n",
+ "30 24.0 22.0 \n",
+ "31 25.0 26.0 \n",
+ "32 31.0 33.0 \n",
+ "33 32.0 32.0 \n",
+ "34 30.0 28.0 \n",
+ "35 37.0 34.0 \n",
+ "36 34.0 36.0 \n",
+ "37 33.0 35.0 \n",
+ "38 36.0 37.0 \n",
+ "39 38.0 39.0 \n",
+ "40 35.0 38.0 \n",
+ "\n",
+ " gpqa_main_zeroshot(acc_norm,none)_rank hellaswag(acc_norm,none)_rank \\\n",
+ "0 9.0 3.0 \n",
+ "1 1.0 12.0 \n",
+ "2 11.0 6.0 \n",
+ "3 5.0 24.0 \n",
+ "4 12.0 5.0 \n",
+ "5 4.0 1.0 \n",
+ "6 20.0 14.0 \n",
+ "7 8.0 11.0 \n",
+ "8 6.0 9.0 \n",
+ "9 18.0 13.0 \n",
+ "10 7.0 10.0 \n",
+ "11 16.0 19.0 \n",
+ "12 13.0 29.0 \n",
+ "13 25.0 5.0 \n",
+ "14 24.0 2.0 \n",
+ "15 23.0 26.0 \n",
+ "16 3.0 16.0 \n",
+ "17 22.0 22.0 \n",
+ "18 15.0 4.0 \n",
+ "19 2.0 20.0 \n",
+ "20 12.0 27.0 \n",
+ "21 14.0 23.0 \n",
+ "22 17.0 31.0 \n",
+ "23 21.0 15.0 \n",
+ "24 26.0 25.0 \n",
+ "25 32.0 8.0 \n",
+ "26 19.0 7.0 \n",
+ "27 10.0 34.0 \n",
+ "28 24.0 30.0 \n",
+ "29 22.0 33.0 \n",
+ "30 23.0 35.0 \n",
+ "31 31.0 21.0 \n",
+ "32 34.0 18.0 \n",
+ "33 33.0 17.0 \n",
+ "34 27.0 28.0 \n",
+ "35 26.0 32.0 \n",
+ "36 30.0 36.0 \n",
+ "37 27.0 39.0 \n",
+ "38 24.0 40.0 \n",
+ "39 28.0 38.0 \n",
+ "40 29.0 37.0 \n",
+ "\n",
+ " piqa(acc_norm,none)_rank winogrande(acc,none)_rank boolq(acc,none)_rank \\\n",
+ "0 19.0 4.0 3.0 \n",
+ "1 14.0 11.0 1.0 \n",
+ "2 3.0 1.0 4.0 \n",
+ "3 21.0 22.0 6.0 \n",
+ "4 10.0 14.0 7.0 \n",
+ "5 4.0 2.0 2.0 \n",
+ "6 6.0 9.0 10.0 \n",
+ "7 5.0 8.0 12.0 \n",
+ "8 6.0 6.0 16.0 \n",
+ "9 9.0 3.0 5.0 \n",
+ "10 2.0 7.0 8.0 \n",
+ "11 18.0 13.0 19.0 \n",
+ "12 27.0 27.0 13.0 \n",
+ "13 12.0 12.0 5.0 \n",
+ "14 1.0 5.0 9.0 \n",
+ "15 22.0 17.0 17.0 \n",
+ "16 17.0 15.0 15.0 \n",
+ "17 11.0 11.0 23.0 \n",
+ "18 7.0 18.0 11.0 \n",
+ "19 25.0 24.0 14.0 \n",
+ "20 26.0 25.0 26.0 \n",
+ "21 19.0 20.0 24.0 \n",
+ "22 29.0 29.0 33.0 \n",
+ "23 11.0 16.0 18.0 \n",
+ "24 20.0 23.0 20.0 \n",
+ "25 8.0 10.0 22.0 \n",
+ "26 15.0 14.0 21.0 \n",
+ "27 33.0 33.0 28.0 \n",
+ "28 24.0 30.0 27.0 \n",
+ "29 32.0 31.0 30.0 \n",
+ "30 35.0 35.0 39.0 \n",
+ "31 23.0 26.0 25.0 \n",
+ "32 16.0 21.0 29.0 \n",
+ "33 13.0 19.0 34.0 \n",
+ "34 28.0 28.0 32.0 \n",
+ "35 30.0 32.0 35.0 \n",
+ "36 31.0 34.0 31.0 \n",
+ "37 37.0 38.0 36.0 \n",
+ "38 38.0 39.0 40.0 \n",
+ "39 36.0 37.0 38.0 \n",
+ "40 34.0 36.0 37.0 \n",
+ "\n",
+ " openbookqa(acc_norm,none)_rank sciq(acc_norm,none)_rank \\\n",
+ "0 1.0 6.0 \n",
+ "1 8.0 1.0 \n",
+ "2 7.0 2.0 \n",
+ "3 20.0 4.0 \n",
+ "4 2.0 12.0 \n",
+ "5 4.0 17.0 \n",
+ "6 9.0 7.0 \n",
+ "7 3.0 8.0 \n",
+ "8 16.0 3.0 \n",
+ "9 14.0 6.0 \n",
+ "10 6.0 5.0 \n",
+ "11 17.0 15.0 \n",
+ "12 23.0 15.0 \n",
+ "13 15.0 20.0 \n",
+ "14 5.0 9.0 \n",
+ "15 6.0 16.0 \n",
+ "16 14.0 14.0 \n",
+ "17 19.0 10.0 \n",
+ "18 7.0 21.0 \n",
+ "19 17.0 10.0 \n",
+ "20 28.0 15.0 \n",
+ "21 19.0 24.0 \n",
+ "22 24.0 17.0 \n",
+ "23 8.0 29.0 \n",
+ "24 21.0 27.0 \n",
+ "25 10.0 13.0 \n",
+ "26 12.0 26.0 \n",
+ "27 27.0 19.0 \n",
+ "28 22.0 11.0 \n",
+ "29 26.0 23.0 \n",
+ "30 30.0 32.0 \n",
+ "31 13.0 31.0 \n",
+ "32 11.0 25.0 \n",
+ "33 15.0 22.0 \n",
+ "34 18.0 18.0 \n",
+ "35 29.0 28.0 \n",
+ "36 25.0 32.0 \n",
+ "37 32.0 33.0 \n",
+ "38 33.0 35.0 \n",
+ "39 31.0 34.0 \n",
+ "40 29.0 30.0 \n",
+ "\n",
+ " qnli(acc,none)_rank mmlu(acc,none)_rank \\\n",
+ "0 8.0 6.0 \n",
+ "1 2.0 2.0 \n",
+ "2 9.0 14.0 \n",
+ "3 7.0 3.0 \n",
+ "4 4.0 4.0 \n",
+ "5 1.0 1.0 \n",
+ "6 28.0 8.0 \n",
+ "7 11.0 5.0 \n",
+ "8 30.0 12.0 \n",
+ "9 6.0 9.0 \n",
+ "10 39.0 15.0 \n",
+ "11 20.0 16.0 \n",
+ "12 3.0 10.0 \n",
+ "13 18.0 20.0 \n",
+ "14 26.0 22.0 \n",
+ "15 16.0 24.0 \n",
+ "16 10.0 18.0 \n",
+ "17 13.0 17.0 \n",
+ "18 19.0 7.0 \n",
+ "19 17.0 11.0 \n",
+ "20 21.0 19.0 \n",
+ "21 5.0 13.0 \n",
+ "22 33.0 23.0 \n",
+ "23 35.0 32.0 \n",
+ "24 25.0 27.0 \n",
+ "25 38.0 31.0 \n",
+ "26 22.0 28.0 \n",
+ "27 24.0 29.0 \n",
+ "28 15.0 21.0 \n",
+ "29 27.0 25.0 \n",
+ "30 12.0 26.0 \n",
+ "31 14.0 33.0 \n",
+ "32 31.0 37.0 \n",
+ "33 37.0 36.0 \n",
+ "34 32.0 30.0 \n",
+ "35 40.0 34.0 \n",
+ "36 41.0 39.0 \n",
+ "37 29.0 41.0 \n",
+ "38 34.0 40.0 \n",
+ "39 36.0 38.0 \n",
+ "40 23.0 35.0 \n",
+ "\n",
+ " nq_open(exact_match,remove_whitespace)_rank drop(f1,none)_rank \\\n",
+ "0 8.0 8.0 \n",
+ "1 15.0 18.0 \n",
+ "2 5.0 3.0 \n",
+ "3 16.0 13.0 \n",
+ "4 23.0 27.0 \n",
+ "5 19.0 25.0 \n",
+ "6 10.0 1.0 \n",
+ "7 7.0 29.0 \n",
+ "8 4.0 4.0 \n",
+ "9 36.0 9.0 \n",
+ "10 7.0 24.0 \n",
+ "11 6.0 5.0 \n",
+ "12 34.0 16.0 \n",
+ "13 25.0 14.0 \n",
+ "14 9.0 19.0 \n",
+ "15 13.0 20.0 \n",
+ "16 29.0 12.0 \n",
+ "17 3.0 2.0 \n",
+ "18 35.0 31.0 \n",
+ "19 33.0 30.0 \n",
+ "20 12.0 7.0 \n",
+ "21 37.0 21.0 \n",
+ "22 22.0 33.0 \n",
+ "23 18.0 15.0 \n",
+ "24 20.0 26.0 \n",
+ "25 1.0 38.0 \n",
+ "26 14.0 17.0 \n",
+ "27 28.0 35.0 \n",
+ "28 24.0 36.0 \n",
+ "29 30.0 23.0 \n",
+ "30 32.0 40.0 \n",
+ "31 17.0 11.0 \n",
+ "32 2.0 37.0 \n",
+ "33 11.0 34.0 \n",
+ "34 26.0 10.0 \n",
+ "35 21.0 6.0 \n",
+ "36 27.0 22.0 \n",
+ "37 38.0 32.0 \n",
+ "38 39.0 41.0 \n",
+ "39 31.0 28.0 \n",
+ "40 31.0 39.0 \n",
+ "\n",
+ " truthfulqa_mc1(acc,none)_rank truthfulqa_mc2(acc,none)_rank \\\n",
+ "0 8.0 8.0 \n",
+ "1 7.0 6.0 \n",
+ "2 15.0 19.0 \n",
+ "3 13.0 13.0 \n",
+ "4 2.0 2.0 \n",
+ "5 1.0 1.0 \n",
+ "6 19.0 26.0 \n",
+ "7 3.0 3.0 \n",
+ "8 12.0 14.0 \n",
+ "9 10.0 11.0 \n",
+ "10 18.0 22.0 \n",
+ "11 13.0 17.0 \n",
+ "12 11.0 12.0 \n",
+ "13 6.0 5.0 \n",
+ "14 4.0 4.0 \n",
+ "15 16.0 16.0 \n",
+ "16 9.0 15.0 \n",
+ "17 23.0 32.0 \n",
+ "18 8.0 9.0 \n",
+ "19 14.0 10.0 \n",
+ "20 17.0 20.0 \n",
+ "21 5.0 7.0 \n",
+ "22 20.0 23.0 \n",
+ "23 16.0 24.0 \n",
+ "24 19.0 18.0 \n",
+ "25 33.0 40.0 \n",
+ "26 30.0 33.0 \n",
+ "27 28.0 29.0 \n",
+ "28 21.0 27.0 \n",
+ "29 25.0 28.0 \n",
+ "30 24.0 25.0 \n",
+ "31 22.0 30.0 \n",
+ "32 34.0 38.0 \n",
+ "33 36.0 41.0 \n",
+ "34 29.0 37.0 \n",
+ "35 31.0 34.0 \n",
+ "36 35.0 39.0 \n",
+ "37 26.0 31.0 \n",
+ "38 27.0 21.0 \n",
+ "39 32.0 35.0 \n",
+ "40 31.0 36.0 \n",
+ "\n",
+ " triviaqa(exact_match,remove_whitespace)_rank Reasoning & Math Mean Score \\\n",
+ "0 23.0 0.6266 \n",
+ "1 13.0 0.5860 \n",
+ "2 3.0 0.5505 \n",
+ "3 18.0 0.6214 \n",
+ "4 17.0 0.5541 \n",
+ "5 35.0 0.5488 \n",
+ "6 4.0 0.5206 \n",
+ "7 12.0 0.5245 \n",
+ "8 7.0 0.5366 \n",
+ "9 15.0 0.5399 \n",
+ "10 5.0 0.5446 \n",
+ "11 8.0 0.5286 \n",
+ "12 26.0 0.5712 \n",
+ "13 11.0 0.5184 \n",
+ "14 2.0 0.4704 \n",
+ "15 19.0 0.5374 \n",
+ "16 16.0 0.5006 \n",
+ "17 10.0 0.4495 \n",
+ "18 39.0 0.5285 \n",
+ "19 36.0 0.5387 \n",
+ "20 14.0 0.4688 \n",
+ "21 21.0 0.3823 \n",
+ "22 27.0 0.5010 \n",
+ "23 20.0 0.4244 \n",
+ "24 28.0 0.4469 \n",
+ "25 1.0 0.3719 \n",
+ "26 24.0 0.4143 \n",
+ "27 34.0 0.4841 \n",
+ "28 22.0 0.3874 \n",
+ "29 32.0 0.4493 \n",
+ "30 40.0 0.4997 \n",
+ "31 29.0 0.3674 \n",
+ "32 6.0 0.3361 \n",
+ "33 9.0 0.3377 \n",
+ "34 31.0 0.3702 \n",
+ "35 25.0 0.3450 \n",
+ "36 30.0 0.3312 \n",
+ "37 38.0 0.4009 \n",
+ "38 41.0 0.4085 \n",
+ "39 37.0 0.3494 \n",
+ "40 33.0 0.2914 \n",
+ "\n",
+ " Reasoning & Math Avg. Rank Commonsense & NLI Mean Score \\\n",
+ "0 1 0.7737 \n",
+ "1 3 0.7807 \n",
+ "2 6 0.7726 \n",
+ "3 2 0.7468 \n",
+ "4 5 0.7730 \n",
+ "5 7 0.7941 \n",
+ "6 16 0.7266 \n",
+ "7 15 0.7564 \n",
+ "8 12 0.7249 \n",
+ "9 9 0.7691 \n",
+ "10 8 0.7328 \n",
+ "11 13 0.7147 \n",
+ "12 4 0.7266 \n",
+ "13 17 0.7284 \n",
+ "14 22 0.7403 \n",
+ "15 11 0.7167 \n",
+ "16 19 0.7374 \n",
+ "17 24 0.7199 \n",
+ "18 14 0.7274 \n",
+ "19 10 0.7094 \n",
+ "20 23 0.6788 \n",
+ "21 32 0.7367 \n",
+ "22 18 0.6587 \n",
+ "23 27 0.7090 \n",
+ "24 26 0.6928 \n",
+ "25 33 0.7157 \n",
+ "26 28 0.7153 \n",
+ "27 21 0.6422 \n",
+ "28 31 0.6803 \n",
+ "29 25 0.6442 \n",
+ "30 20 0.6184 \n",
+ "31 35 0.6978 \n",
+ "32 39 0.6956 \n",
+ "33 38 0.6886 \n",
+ "34 34 0.6711 \n",
+ "35 37 0.6264 \n",
+ "36 40 0.6267 \n",
+ "37 30 0.5703 \n",
+ "38 29 0.5181 \n",
+ "39 36 0.5696 \n",
+ "40 41 0.6039 \n",
+ "\n",
+ " Commonsense & NLI Avg. Rank Knowledge & Reading Mean Score \\\n",
+ "0 3 0.3791 \n",
+ "1 2 0.3926 \n",
+ "2 5 0.4136 \n",
+ "3 8 0.3566 \n",
+ "4 4 0.3810 \n",
+ "5 1 0.3581 \n",
+ "6 15 0.4369 \n",
+ "7 7 0.3963 \n",
+ "8 17 0.4127 \n",
+ "9 6 0.3467 \n",
+ "10 12 0.3683 \n",
+ "11 22 0.3923 \n",
+ "12 16 0.3226 \n",
+ "13 13 0.3719 \n",
+ "14 9 0.4045 \n",
+ "15 19 0.3261 \n",
+ "16 10 0.3339 \n",
+ "17 18 0.4063 \n",
+ "18 14 0.2919 \n",
+ "19 23 0.2834 \n",
+ "20 30 0.3438 \n",
+ "21 11 0.3406 \n",
+ "22 32 0.2827 \n",
+ "23 24 0.3007 \n",
+ "24 27 0.2805 \n",
+ "25 20 0.3374 \n",
+ "26 21 0.2864 \n",
+ "27 34 0.2340 \n",
+ "28 29 0.2903 \n",
+ "29 33 0.2567 \n",
+ "30 37 0.2276 \n",
+ "31 25 0.2656 \n",
+ "32 26 0.3018 \n",
+ "33 28 0.2864 \n",
+ "34 31 0.2581 \n",
+ "35 36 0.2731 \n",
+ "36 35 0.2202 \n",
+ "37 39 0.1954 \n",
+ "38 41 0.1983 \n",
+ "39 40 0.2000 \n",
+ "40 38 0.2218 \n",
+ "\n",
+ " Knowledge & Reading Avg. Rank Mean Score \n",
+ "0 10 0.6038 \n",
+ "1 7 0.5961 \n",
+ "2 2 0.5871 \n",
+ "3 14 0.5859 \n",
+ "4 9 0.5788 \n",
+ "5 13 0.5775 \n",
+ "6 1 0.5676 \n",
+ "7 6 0.5672 \n",
+ "8 3 0.5653 \n",
+ "9 15 0.5621 \n",
+ "10 12 0.5576 \n",
+ "11 8 0.5528 \n",
+ "12 21 0.5510 \n",
+ "13 11 0.5480 \n",
+ "14 5 0.5451 \n",
+ "15 20 0.5368 \n",
+ "16 19 0.5335 \n",
+ "17 4 0.5312 \n",
+ "18 24 0.5271 \n",
+ "19 28 0.5219 \n",
+ "20 16 0.5048 \n",
+ "21 17 0.4939 \n",
+ "22 29 0.4907 \n",
+ "23 23 0.4869 \n",
+ "24 30 0.4830 \n",
+ "25 18 0.4819 \n",
+ "26 26 0.4813 \n",
+ "27 35 0.4644 \n",
+ "28 25 0.4608 \n",
+ "29 34 0.4597 \n",
+ "30 36 0.4596 \n",
+ "31 32 0.4525 \n",
+ "32 22 0.4516 \n",
+ "33 27 0.4451 \n",
+ "34 33 0.4419 \n",
+ "35 31 0.4219 \n",
+ "36 38 0.4013 \n",
+ "37 41 0.3986 \n",
+ "38 40 0.3838 \n",
+ "39 39 0.3816 \n",
+ "40 37 0.3799 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'total_time_raw': '18d 7h 55m', 'gpu_util_time_raw': '14d 23h 41m'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "GROUPS = {\n",
+ " \"Reasoning & Math\": [\n",
+ " \"gsm8k(exact_match,strict-match)\", \n",
+ " \"bbh(exact_match,get-answer)\", \n",
+ " \"arc_challenge(acc_norm,none)\", 'anli_r1(acc,none)',\n",
+ " 'anli_r2(acc,none)', 'anli_r3(acc,none)',\n",
+ " \"gpqa_main_zeroshot(acc_norm,none)\",\n",
+ " ],\n",
+ " \"Commonsense & NLI\": [\n",
+ " \"hellaswag(acc_norm,none)\",\n",
+ " \"piqa(acc_norm,none)\", \"winogrande(acc,none)\", \"boolq(acc,none)\",\n",
+ " \"openbookqa(acc_norm,none)\", \"sciq(acc_norm,none)\", \"qnli(acc,none)\",\n",
+ " ],\n",
+ " \"Knowledge & Reading\": [\n",
+ " \"mmlu(acc,none)\", \"nq_open(exact_match,remove_whitespace)\", \"drop(f1,none)\",\n",
+ " \"truthfulqa_mc1(acc,none)\", 'truthfulqa_mc2(acc,none)','triviaqa(exact_match,remove_whitespace)',\n",
+ " ],\n",
+ "}\n",
+ "\n",
+ "\n",
+ "\n",
+ "def add_task_ranks(df, task_cols):\n",
+ " df = df.copy()\n",
+ " for col in task_cols:\n",
+ " if col not in df.columns: \n",
+ " raise ValueError(f\"No task: {col}\")\n",
+ " # rank: 1 = best; NaN scores get ranked at the bottom\n",
+ " df[f\"{col}_rank\"] = df[col].rank(ascending=False, method=\"dense\", na_option=\"bottom\")\n",
+ " return df\n",
+ "\n",
+ "def add_group_ranks(df, groups):\n",
+ " df = df.copy()\n",
+ " for gname, cols in groups.items():\n",
+ " # strip task name before \"(\" if any\n",
+ " tasks = [c for c in cols]\n",
+ " mean_col = f\"{gname}_mean\"\n",
+ " rank_col = f\"{gname}_rank\"\n",
+ " df[mean_col] = df[tasks].mean(axis=1)\n",
+ " df[rank_col] = df[mean_col].rank(ascending=False, method=\"dense\", na_option=\"bottom\").astype(int)\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "def add_overall_rank(df, groups):\n",
+ " df = df.copy()\n",
+ " all_tasks = [c for cols in groups.values() for c in cols]\n",
+ "\n",
+ " # overall mean score across all tasks\n",
+ " df[\"overall_mean\"] = df[all_tasks].mean(axis=1, skipna=True)\n",
+ "\n",
+ " # higher = better → rank descending\n",
+ " df[\"overall_rank\"] = df[\"overall_mean\"].rank(\n",
+ " ascending=False, method=\"dense\", na_option=\"bottom\"\n",
+ " ).astype(int)\n",
+ " return df\n",
+ "\n",
+ "\n",
+ "all_task_cols = [c for cols in GROUPS.values() for c in cols]\n",
+ "\n",
+ "df_task_ranked = add_task_ranks(result_gpu_merged, all_task_cols)\n",
+ "df_group_ranked = add_group_ranks(df_task_ranked, GROUPS)\n",
+ "leaderboard = add_overall_rank(df_group_ranked, GROUPS)\n",
+ "\n",
+ "\n",
+ "col = \"overall_rank\" # the one you want first\n",
+ "cols = [col] + [c for c in leaderboard.columns if c != col]\n",
+ "df = leaderboard[cols]\n",
+ "df = df.sort_values(by=col, ascending=True).reset_index(drop=True)\n",
+ "\n",
+ "# Add quantization marker\n",
+ "targets = ['Qwen_Qwen3-14B', 'Qwen_Qwen2.5-14B-Instruct'] # use hyphen\n",
+ "mask = df['model_name'].isin(targets)\n",
+ "df.loc[mask, 'model_name'] = df.loc[mask, 'model_name'] + ' (8bit)'\n",
+ "\n",
+ "# display(df)\n",
+ "\n",
+ "df_display = df.rename(columns={\n",
+ " \"overall_rank\": \"Overall Rank\",\n",
+ " \"model_name\": \"Model Name\",\n",
+ " \"gpu_util_time\": \"GPU Util Time\",\n",
+ " \"total_time\": \"Total Time\",\n",
+ " \"parameters\": \"Parameters\",\n",
+ " 'Reasoning & Math_rank': 'Reasoning & Math Avg. Rank',\n",
+ " 'Commonsense & NLI_rank': 'Commonsense & NLI Avg. Rank',\n",
+ " 'Knowledge & Reading_rank': 'Knowledge & Reading Avg. Rank',\n",
+ " 'overall_mean': 'Mean Score',\n",
+ " 'Reasoning & Math_mean': 'Reasoning & Math Mean Score',\n",
+ " 'Commonsense & NLI_mean': 'Commonsense & NLI Mean Score',\n",
+ " 'Knowledge & Reading_mean': 'Knowledge & Reading Mean Score',\n",
+ "})\n",
+ "\n",
+ "cols_to_round = [\"Mean Score\", \"Reasoning & Math Mean Score\", \"Commonsense & NLI Mean Score\", \"Knowledge & Reading Mean Score\"] \n",
+ "df_display[cols_to_round] = df_display[cols_to_round].round(4)\n",
+ "\n",
+ "display(df_display)\n",
+ "df.to_csv(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_master.csv\")\n",
+ "\n",
+ "\n",
+ " \n",
+ "# Total time calculation\n",
+ "def format_seconds(secs: int) -> str:\n",
+ " days, rem = divmod(int(secs), 86400) # 86400 sec = 1 day\n",
+ " hours, rem = divmod(rem, 3600) # 3600 sec = 1 hour\n",
+ " minutes, _ = divmod(rem, 60)\n",
+ " return f\"{days}d {hours}h {minutes}m\"\n",
+ "\n",
+ "# Example usage with df_display\n",
+ "totals = {}\n",
+ "for col in [\"total_time_raw\", \"gpu_util_time_raw\"]:\n",
+ " total_secs = df_display[col].sum()\n",
+ " totals[col] = format_seconds(total_secs)\n",
+ "\n",
+ "print(totals)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "b3ce5953-3a36-436a-ba4c-46bedd2b4c56",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "overall\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model Name | \n",
+ " Total Time | \n",
+ " GPU Util Time | \n",
+ " Mean Score | \n",
+ " Overall Rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 15h 45m | \n",
+ " 14h 8m | \n",
+ " 0.6038 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 29h 45m | \n",
+ " 17h 29m | \n",
+ " 0.5961 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 7h 51m | \n",
+ " 6h 59m | \n",
+ " 0.5871 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 15h 31m | \n",
+ " 13h 44m | \n",
+ " 0.5859 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 9h 36m | \n",
+ " 8h 33m | \n",
+ " 0.5788 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 52h 44m | \n",
+ " 29h 32m | \n",
+ " 0.5775 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 11h 43m | \n",
+ " 10h 26m | \n",
+ " 0.5676 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 11h 17m | \n",
+ " 10h 10m | \n",
+ " 0.5672 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 12h 19m | \n",
+ " 10h 52m | \n",
+ " 0.5653 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 13h 54m | \n",
+ " 12h 15m | \n",
+ " 0.5621 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 10h 46m | \n",
+ " 9h 27m | \n",
+ " 0.5576 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 6h 30m | \n",
+ " 5h 46m | \n",
+ " 0.5528 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 51m | \n",
+ " 5h 3m | \n",
+ " 0.5510 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 8h 27m | \n",
+ " 7h 28m | \n",
+ " 0.5480 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 8h 38m | \n",
+ " 7h 41m | \n",
+ " 0.5451 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 4h 51m | \n",
+ " 3h 50m | \n",
+ " 0.5368 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 8h 4m | \n",
+ " 7h 1m | \n",
+ " 0.5335 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 4h 28m | \n",
+ " 3h 54m | \n",
+ " 0.5312 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 11h 30m | \n",
+ " 10h 11m | \n",
+ " 0.5271 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 17h 57m | \n",
+ " 15h 30m | \n",
+ " 0.5219 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 7h 12m | \n",
+ " 5h 57m | \n",
+ " 0.5048 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 7h 48m | \n",
+ " 6h 30m | \n",
+ " 0.4939 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 27h 21m | \n",
+ " 24h 38m | \n",
+ " 0.4907 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 10h 6m | \n",
+ " 9h 8m | \n",
+ " 0.4869 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 11h 46m | \n",
+ " 10h 36m | \n",
+ " 0.4830 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 19h 21m | \n",
+ " 17h 38m | \n",
+ " 0.4819 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 17h 8m | \n",
+ " 15h 37m | \n",
+ " 0.4813 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 6h 28m | \n",
+ " 5h 43m | \n",
+ " 0.4644 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 3h 20m | \n",
+ " 2h 36m | \n",
+ " 0.4608 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 4h 25m | \n",
+ " 3h 36m | \n",
+ " 0.4597 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 5h 37m | \n",
+ " 4h 57m | \n",
+ " 0.4596 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 57m | \n",
+ " 6h 7m | \n",
+ " 0.4525 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 5h 42m | \n",
+ " 4h 59m | \n",
+ " 0.4516 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 7h 11m | \n",
+ " 6h 26m | \n",
+ " 0.4451 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 8h 2m | \n",
+ " 7h 12m | \n",
+ " 0.4419 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 3h 30m | \n",
+ " 2h 35m | \n",
+ " 0.4219 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 6h 50m | \n",
+ " 4h 52m | \n",
+ " 0.4013 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 3h 40m | \n",
+ " 2h 52m | \n",
+ " 0.3986 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 3h 25m | \n",
+ " 2h 39m | \n",
+ " 0.3838 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 3h 45m | \n",
+ " 2h 53m | \n",
+ " 0.3816 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 2h 34m | \n",
+ " 1h 48m | \n",
+ " 0.3799 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model Name Total Time GPU Util Time \\\n",
+ "1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
+ "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
+ "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
+ "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
+ "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
+ "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
+ "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
+ "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
+ "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
+ "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
+ "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
+ "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
+ "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
+ "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
+ "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
+ "16 google_gemma-3-4b-it 4h 51m 3h 50m \n",
+ "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
+ "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
+ "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
+ "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
+ "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
+ "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
+ "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
+ "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
+ "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
+ "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
+ "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
+ "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
+ "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
+ "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
+ "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
+ "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
+ "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
+ "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
+ "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
+ "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
+ "37 google_gemma-3-1b-it 6h 50m 4h 52m \n",
+ "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
+ "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
+ "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
+ "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
+ "\n",
+ " Mean Score Overall Rank \n",
+ "1 0.6038 1 \n",
+ "2 0.5961 2 \n",
+ "3 0.5871 3 \n",
+ "4 0.5859 4 \n",
+ "5 0.5788 5 \n",
+ "6 0.5775 6 \n",
+ "7 0.5676 7 \n",
+ "8 0.5672 8 \n",
+ "9 0.5653 9 \n",
+ "10 0.5621 10 \n",
+ "11 0.5576 11 \n",
+ "12 0.5528 12 \n",
+ "13 0.5510 13 \n",
+ "14 0.5480 14 \n",
+ "15 0.5451 15 \n",
+ "16 0.5368 16 \n",
+ "17 0.5335 17 \n",
+ "18 0.5312 18 \n",
+ "19 0.5271 19 \n",
+ "20 0.5219 20 \n",
+ "21 0.5048 21 \n",
+ "22 0.4939 22 \n",
+ "23 0.4907 23 \n",
+ "24 0.4869 24 \n",
+ "25 0.4830 25 \n",
+ "26 0.4819 26 \n",
+ "27 0.4813 27 \n",
+ "28 0.4644 28 \n",
+ "29 0.4608 29 \n",
+ "30 0.4597 30 \n",
+ "31 0.4596 31 \n",
+ "32 0.4525 32 \n",
+ "33 0.4516 33 \n",
+ "34 0.4451 34 \n",
+ "35 0.4419 35 \n",
+ "36 0.4219 36 \n",
+ "37 0.4013 37 \n",
+ "38 0.3986 38 \n",
+ "39 0.3838 39 \n",
+ "40 0.3816 40 \n",
+ "41 0.3799 41 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "| Model Name | Total Time | GPU Util Time | Mean Score | Overall Rank |\n",
+ "|:------------------------------------------|:-------------|:----------------|-------------:|---------------:|\n",
+ "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6038 | 1 |\n",
+ "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.5961 | 2 |\n",
+ "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5871 | 3 |\n",
+ "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.5859 | 4 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5788 | 5 |\n",
+ "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5775 | 6 |\n",
+ "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5676 | 7 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5672 | 8 |\n",
+ "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5653 | 9 |\n",
+ "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5621 | 10 |\n",
+ "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5576 | 11 |\n",
+ "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5528 | 12 |\n",
+ "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.551 | 13 |\n",
+ "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.548 | 14 |\n",
+ "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.5451 | 15 |\n",
+ "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5368 | 16 |\n",
+ "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5335 | 17 |\n",
+ "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.5312 | 18 |\n",
+ "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5271 | 19 |\n",
+ "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5219 | 20 |\n",
+ "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.5048 | 21 |\n",
+ "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.4939 | 22 |\n",
+ "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.4907 | 23 |\n",
+ "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4869 | 24 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.483 | 25 |\n",
+ "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.4819 | 26 |\n",
+ "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4813 | 27 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4644 | 28 |\n",
+ "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.4608 | 29 |\n",
+ "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4597 | 30 |\n",
+ "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4596 | 31 |\n",
+ "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.4525 | 32 |\n",
+ "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.4516 | 33 |\n",
+ "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.4451 | 34 |\n",
+ "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.4419 | 35 |\n",
+ "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.4219 | 36 |\n",
+ "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.4013 | 37 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.3986 | 38 |\n",
+ "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.3838 | 39 |\n",
+ "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3816 | 40 |\n",
+ "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.3799 | 41 |\n",
+ "\n",
+ "\n",
+ "reasoning_and_math\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model Name | \n",
+ " Total Time | \n",
+ " GPU Util Time | \n",
+ " Reasoning & Math Mean Score | \n",
+ " Reasoning & Math Avg. Rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 15h 45m | \n",
+ " 14h 8m | \n",
+ " 0.6266 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 15h 31m | \n",
+ " 13h 44m | \n",
+ " 0.6214 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 29h 45m | \n",
+ " 17h 29m | \n",
+ " 0.5860 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 51m | \n",
+ " 5h 3m | \n",
+ " 0.5712 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 9h 36m | \n",
+ " 8h 33m | \n",
+ " 0.5541 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 7h 51m | \n",
+ " 6h 59m | \n",
+ " 0.5505 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 52h 44m | \n",
+ " 29h 32m | \n",
+ " 0.5488 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 10h 46m | \n",
+ " 9h 27m | \n",
+ " 0.5446 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 13h 54m | \n",
+ " 12h 15m | \n",
+ " 0.5399 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 17h 57m | \n",
+ " 15h 30m | \n",
+ " 0.5387 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 4h 51m | \n",
+ " 3h 50m | \n",
+ " 0.5374 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 12h 19m | \n",
+ " 10h 52m | \n",
+ " 0.5366 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 6h 30m | \n",
+ " 5h 46m | \n",
+ " 0.5286 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 11h 30m | \n",
+ " 10h 11m | \n",
+ " 0.5285 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 11h 17m | \n",
+ " 10h 10m | \n",
+ " 0.5245 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 11h 43m | \n",
+ " 10h 26m | \n",
+ " 0.5206 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 8h 27m | \n",
+ " 7h 28m | \n",
+ " 0.5184 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 27h 21m | \n",
+ " 24h 38m | \n",
+ " 0.5010 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 8h 4m | \n",
+ " 7h 1m | \n",
+ " 0.5006 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 5h 37m | \n",
+ " 4h 57m | \n",
+ " 0.4997 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 6h 28m | \n",
+ " 5h 43m | \n",
+ " 0.4841 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 8h 38m | \n",
+ " 7h 41m | \n",
+ " 0.4704 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 7h 12m | \n",
+ " 5h 57m | \n",
+ " 0.4688 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 4h 28m | \n",
+ " 3h 54m | \n",
+ " 0.4495 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 4h 25m | \n",
+ " 3h 36m | \n",
+ " 0.4493 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 11h 46m | \n",
+ " 10h 36m | \n",
+ " 0.4469 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 10h 6m | \n",
+ " 9h 8m | \n",
+ " 0.4244 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 17h 8m | \n",
+ " 15h 37m | \n",
+ " 0.4143 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 3h 25m | \n",
+ " 2h 39m | \n",
+ " 0.4085 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 3h 40m | \n",
+ " 2h 52m | \n",
+ " 0.4009 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 3h 20m | \n",
+ " 2h 36m | \n",
+ " 0.3874 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 7h 48m | \n",
+ " 6h 30m | \n",
+ " 0.3823 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 19h 21m | \n",
+ " 17h 38m | \n",
+ " 0.3719 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 8h 2m | \n",
+ " 7h 12m | \n",
+ " 0.3702 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 57m | \n",
+ " 6h 7m | \n",
+ " 0.3674 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 3h 45m | \n",
+ " 2h 53m | \n",
+ " 0.3494 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 3h 30m | \n",
+ " 2h 35m | \n",
+ " 0.3450 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 7h 11m | \n",
+ " 6h 26m | \n",
+ " 0.3377 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 5h 42m | \n",
+ " 4h 59m | \n",
+ " 0.3361 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 6h 50m | \n",
+ " 4h 52m | \n",
+ " 0.3312 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 2h 34m | \n",
+ " 1h 48m | \n",
+ " 0.2914 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model Name Total Time GPU Util Time \\\n",
+ "1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
+ "2 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
+ "3 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
+ "4 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
+ "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
+ "6 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
+ "7 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
+ "8 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
+ "9 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
+ "10 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
+ "11 google_gemma-3-4b-it 4h 51m 3h 50m \n",
+ "12 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
+ "13 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
+ "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
+ "15 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
+ "16 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
+ "17 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
+ "18 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
+ "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
+ "20 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
+ "21 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
+ "22 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
+ "23 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
+ "24 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
+ "25 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
+ "26 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
+ "27 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
+ "28 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
+ "29 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
+ "30 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
+ "31 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
+ "32 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
+ "33 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
+ "34 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
+ "35 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
+ "36 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
+ "37 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
+ "38 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
+ "39 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
+ "40 google_gemma-3-1b-it 6h 50m 4h 52m \n",
+ "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
+ "\n",
+ " Reasoning & Math Mean Score Reasoning & Math Avg. Rank \n",
+ "1 0.6266 1 \n",
+ "2 0.6214 2 \n",
+ "3 0.5860 3 \n",
+ "4 0.5712 4 \n",
+ "5 0.5541 5 \n",
+ "6 0.5505 6 \n",
+ "7 0.5488 7 \n",
+ "8 0.5446 8 \n",
+ "9 0.5399 9 \n",
+ "10 0.5387 10 \n",
+ "11 0.5374 11 \n",
+ "12 0.5366 12 \n",
+ "13 0.5286 13 \n",
+ "14 0.5285 14 \n",
+ "15 0.5245 15 \n",
+ "16 0.5206 16 \n",
+ "17 0.5184 17 \n",
+ "18 0.5010 18 \n",
+ "19 0.5006 19 \n",
+ "20 0.4997 20 \n",
+ "21 0.4841 21 \n",
+ "22 0.4704 22 \n",
+ "23 0.4688 23 \n",
+ "24 0.4495 24 \n",
+ "25 0.4493 25 \n",
+ "26 0.4469 26 \n",
+ "27 0.4244 27 \n",
+ "28 0.4143 28 \n",
+ "29 0.4085 29 \n",
+ "30 0.4009 30 \n",
+ "31 0.3874 31 \n",
+ "32 0.3823 32 \n",
+ "33 0.3719 33 \n",
+ "34 0.3702 34 \n",
+ "35 0.3674 35 \n",
+ "36 0.3494 36 \n",
+ "37 0.3450 37 \n",
+ "38 0.3377 38 \n",
+ "39 0.3361 39 \n",
+ "40 0.3312 40 \n",
+ "41 0.2914 41 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "| Model Name | Total Time | GPU Util Time | Reasoning & Math Mean Score | Reasoning & Math Avg. Rank |\n",
+ "|:------------------------------------------|:-------------|:----------------|------------------------------:|-----------------------------:|\n",
+ "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6266 | 1 |\n",
+ "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.6214 | 2 |\n",
+ "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.586 | 3 |\n",
+ "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.5712 | 4 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5541 | 5 |\n",
+ "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5505 | 6 |\n",
+ "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5488 | 7 |\n",
+ "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5446 | 8 |\n",
+ "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5399 | 9 |\n",
+ "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5387 | 10 |\n",
+ "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5374 | 11 |\n",
+ "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5366 | 12 |\n",
+ "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5286 | 13 |\n",
+ "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5285 | 14 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5245 | 15 |\n",
+ "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5206 | 16 |\n",
+ "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.5184 | 17 |\n",
+ "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.501 | 18 |\n",
+ "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5006 | 19 |\n",
+ "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4997 | 20 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4841 | 21 |\n",
+ "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4704 | 22 |\n",
+ "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.4688 | 23 |\n",
+ "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4495 | 24 |\n",
+ "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4493 | 25 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.4469 | 26 |\n",
+ "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4244 | 27 |\n",
+ "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4143 | 28 |\n",
+ "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.4085 | 29 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.4009 | 30 |\n",
+ "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.3874 | 31 |\n",
+ "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3823 | 32 |\n",
+ "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3719 | 33 |\n",
+ "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.3702 | 34 |\n",
+ "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.3674 | 35 |\n",
+ "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3494 | 36 |\n",
+ "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.345 | 37 |\n",
+ "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.3377 | 38 |\n",
+ "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3361 | 39 |\n",
+ "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.3312 | 40 |\n",
+ "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2914 | 41 |\n",
+ "\n",
+ "\n",
+ "commonsense_and_nli\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model Name | \n",
+ " Total Time | \n",
+ " GPU Util Time | \n",
+ " Commonsense & NLI Mean Score | \n",
+ " Commonsense & NLI Avg. Rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 52h 44m | \n",
+ " 29h 32m | \n",
+ " 0.7941 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 29h 45m | \n",
+ " 17h 29m | \n",
+ " 0.7807 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 15h 45m | \n",
+ " 14h 8m | \n",
+ " 0.7737 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 9h 36m | \n",
+ " 8h 33m | \n",
+ " 0.7730 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 7h 51m | \n",
+ " 6h 59m | \n",
+ " 0.7726 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 13h 54m | \n",
+ " 12h 15m | \n",
+ " 0.7691 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 11h 17m | \n",
+ " 10h 10m | \n",
+ " 0.7564 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 15h 31m | \n",
+ " 13h 44m | \n",
+ " 0.7468 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 8h 38m | \n",
+ " 7h 41m | \n",
+ " 0.7403 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 8h 4m | \n",
+ " 7h 1m | \n",
+ " 0.7374 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 7h 48m | \n",
+ " 6h 30m | \n",
+ " 0.7367 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 10h 46m | \n",
+ " 9h 27m | \n",
+ " 0.7328 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 8h 27m | \n",
+ " 7h 28m | \n",
+ " 0.7284 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 11h 30m | \n",
+ " 10h 11m | \n",
+ " 0.7274 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 11h 43m | \n",
+ " 10h 26m | \n",
+ " 0.7266 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 51m | \n",
+ " 5h 3m | \n",
+ " 0.7266 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 12h 19m | \n",
+ " 10h 52m | \n",
+ " 0.7249 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 4h 28m | \n",
+ " 3h 54m | \n",
+ " 0.7199 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 4h 51m | \n",
+ " 3h 50m | \n",
+ " 0.7167 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 19h 21m | \n",
+ " 17h 38m | \n",
+ " 0.7157 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 17h 8m | \n",
+ " 15h 37m | \n",
+ " 0.7153 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 6h 30m | \n",
+ " 5h 46m | \n",
+ " 0.7147 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 17h 57m | \n",
+ " 15h 30m | \n",
+ " 0.7094 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 10h 6m | \n",
+ " 9h 8m | \n",
+ " 0.7090 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 57m | \n",
+ " 6h 7m | \n",
+ " 0.6978 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 5h 42m | \n",
+ " 4h 59m | \n",
+ " 0.6956 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 11h 46m | \n",
+ " 10h 36m | \n",
+ " 0.6928 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 7h 11m | \n",
+ " 6h 26m | \n",
+ " 0.6886 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 3h 20m | \n",
+ " 2h 36m | \n",
+ " 0.6803 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 7h 12m | \n",
+ " 5h 57m | \n",
+ " 0.6788 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 8h 2m | \n",
+ " 7h 12m | \n",
+ " 0.6711 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 27h 21m | \n",
+ " 24h 38m | \n",
+ " 0.6587 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 4h 25m | \n",
+ " 3h 36m | \n",
+ " 0.6442 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 6h 28m | \n",
+ " 5h 43m | \n",
+ " 0.6422 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 6h 50m | \n",
+ " 4h 52m | \n",
+ " 0.6267 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 3h 30m | \n",
+ " 2h 35m | \n",
+ " 0.6264 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 5h 37m | \n",
+ " 4h 57m | \n",
+ " 0.6184 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 2h 34m | \n",
+ " 1h 48m | \n",
+ " 0.6039 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 3h 40m | \n",
+ " 2h 52m | \n",
+ " 0.5703 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 3h 45m | \n",
+ " 2h 53m | \n",
+ " 0.5696 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 3h 25m | \n",
+ " 2h 39m | \n",
+ " 0.5181 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model Name Total Time GPU Util Time \\\n",
+ "1 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
+ "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
+ "3 google_gemma-3-12b-it 15h 45m 14h 8m \n",
+ "4 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
+ "5 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
+ "6 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
+ "7 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
+ "8 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
+ "9 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
+ "10 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
+ "11 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
+ "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
+ "13 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
+ "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
+ "15 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
+ "16 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
+ "17 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
+ "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
+ "19 google_gemma-3-4b-it 4h 51m 3h 50m \n",
+ "20 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
+ "21 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
+ "22 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
+ "23 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
+ "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
+ "25 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
+ "26 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
+ "27 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
+ "28 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
+ "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
+ "30 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
+ "31 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
+ "32 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
+ "33 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
+ "34 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
+ "35 google_gemma-3-1b-it 6h 50m 4h 52m \n",
+ "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
+ "37 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
+ "38 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
+ "39 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
+ "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
+ "41 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
+ "\n",
+ " Commonsense & NLI Mean Score Commonsense & NLI Avg. Rank \n",
+ "1 0.7941 1 \n",
+ "2 0.7807 2 \n",
+ "3 0.7737 3 \n",
+ "4 0.7730 4 \n",
+ "5 0.7726 5 \n",
+ "6 0.7691 6 \n",
+ "7 0.7564 7 \n",
+ "8 0.7468 8 \n",
+ "9 0.7403 9 \n",
+ "10 0.7374 10 \n",
+ "11 0.7367 11 \n",
+ "12 0.7328 12 \n",
+ "13 0.7284 13 \n",
+ "14 0.7274 14 \n",
+ "15 0.7266 15 \n",
+ "16 0.7266 16 \n",
+ "17 0.7249 17 \n",
+ "18 0.7199 18 \n",
+ "19 0.7167 19 \n",
+ "20 0.7157 20 \n",
+ "21 0.7153 21 \n",
+ "22 0.7147 22 \n",
+ "23 0.7094 23 \n",
+ "24 0.7090 24 \n",
+ "25 0.6978 25 \n",
+ "26 0.6956 26 \n",
+ "27 0.6928 27 \n",
+ "28 0.6886 28 \n",
+ "29 0.6803 29 \n",
+ "30 0.6788 30 \n",
+ "31 0.6711 31 \n",
+ "32 0.6587 32 \n",
+ "33 0.6442 33 \n",
+ "34 0.6422 34 \n",
+ "35 0.6267 35 \n",
+ "36 0.6264 36 \n",
+ "37 0.6184 37 \n",
+ "38 0.6039 38 \n",
+ "39 0.5703 39 \n",
+ "40 0.5696 40 \n",
+ "41 0.5181 41 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "| Model Name | Total Time | GPU Util Time | Commonsense & NLI Mean Score | Commonsense & NLI Avg. Rank |\n",
+ "|:------------------------------------------|:-------------|:----------------|-------------------------------:|------------------------------:|\n",
+ "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.7941 | 1 |\n",
+ "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.7807 | 2 |\n",
+ "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.7737 | 3 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.773 | 4 |\n",
+ "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.7726 | 5 |\n",
+ "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.7691 | 6 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.7564 | 7 |\n",
+ "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.7468 | 8 |\n",
+ "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.7403 | 9 |\n",
+ "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.7374 | 10 |\n",
+ "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.7367 | 11 |\n",
+ "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.7328 | 12 |\n",
+ "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.7284 | 13 |\n",
+ "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.7274 | 14 |\n",
+ "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.7266 | 15 |\n",
+ "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.7266 | 16 |\n",
+ "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.7249 | 17 |\n",
+ "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.7199 | 18 |\n",
+ "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.7167 | 19 |\n",
+ "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.7157 | 20 |\n",
+ "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.7153 | 21 |\n",
+ "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.7147 | 22 |\n",
+ "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.7094 | 23 |\n",
+ "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.709 | 24 |\n",
+ "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.6978 | 25 |\n",
+ "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.6956 | 26 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.6928 | 27 |\n",
+ "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.6886 | 28 |\n",
+ "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.6803 | 29 |\n",
+ "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.6788 | 30 |\n",
+ "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.6711 | 31 |\n",
+ "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.6587 | 32 |\n",
+ "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.6442 | 33 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.6422 | 34 |\n",
+ "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.6267 | 35 |\n",
+ "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.6264 | 36 |\n",
+ "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.6184 | 37 |\n",
+ "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.6039 | 38 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.5703 | 39 |\n",
+ "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.5696 | 40 |\n",
+ "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.5181 | 41 |\n",
+ "\n",
+ "\n",
+ "knowledge_and_reading\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model Name | \n",
+ " Total Time | \n",
+ " GPU Util Time | \n",
+ " Knowledge & Reading Mean Score | \n",
+ " Knowledge & Reading Avg. Rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 11h 43m | \n",
+ " 10h 26m | \n",
+ " 0.4369 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 7h 51m | \n",
+ " 6h 59m | \n",
+ " 0.4136 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 12h 19m | \n",
+ " 10h 52m | \n",
+ " 0.4127 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 4h 28m | \n",
+ " 3h 54m | \n",
+ " 0.4063 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 8h 38m | \n",
+ " 7h 41m | \n",
+ " 0.4045 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 11h 17m | \n",
+ " 10h 10m | \n",
+ " 0.3963 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 29h 45m | \n",
+ " 17h 29m | \n",
+ " 0.3926 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 6h 30m | \n",
+ " 5h 46m | \n",
+ " 0.3923 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 9h 36m | \n",
+ " 8h 33m | \n",
+ " 0.3810 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 15h 45m | \n",
+ " 14h 8m | \n",
+ " 0.3791 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 8h 27m | \n",
+ " 7h 28m | \n",
+ " 0.3719 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 10h 46m | \n",
+ " 9h 27m | \n",
+ " 0.3683 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 52h 44m | \n",
+ " 29h 32m | \n",
+ " 0.3581 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 15h 31m | \n",
+ " 13h 44m | \n",
+ " 0.3566 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 13h 54m | \n",
+ " 12h 15m | \n",
+ " 0.3467 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 7h 12m | \n",
+ " 5h 57m | \n",
+ " 0.3438 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 7h 48m | \n",
+ " 6h 30m | \n",
+ " 0.3406 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 19h 21m | \n",
+ " 17h 38m | \n",
+ " 0.3374 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 8h 4m | \n",
+ " 7h 1m | \n",
+ " 0.3339 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 4h 51m | \n",
+ " 3h 50m | \n",
+ " 0.3261 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 51m | \n",
+ " 5h 3m | \n",
+ " 0.3226 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 5h 42m | \n",
+ " 4h 59m | \n",
+ " 0.3018 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 10h 6m | \n",
+ " 9h 8m | \n",
+ " 0.3007 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 11h 30m | \n",
+ " 10h 11m | \n",
+ " 0.2919 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 3h 20m | \n",
+ " 2h 36m | \n",
+ " 0.2903 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 17h 8m | \n",
+ " 15h 37m | \n",
+ " 0.2864 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 7h 11m | \n",
+ " 6h 26m | \n",
+ " 0.2864 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 17h 57m | \n",
+ " 15h 30m | \n",
+ " 0.2834 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 27h 21m | \n",
+ " 24h 38m | \n",
+ " 0.2827 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 11h 46m | \n",
+ " 10h 36m | \n",
+ " 0.2805 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 3h 30m | \n",
+ " 2h 35m | \n",
+ " 0.2731 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 57m | \n",
+ " 6h 7m | \n",
+ " 0.2656 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 8h 2m | \n",
+ " 7h 12m | \n",
+ " 0.2581 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 4h 25m | \n",
+ " 3h 36m | \n",
+ " 0.2567 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 6h 28m | \n",
+ " 5h 43m | \n",
+ " 0.2340 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 5h 37m | \n",
+ " 4h 57m | \n",
+ " 0.2276 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 2h 34m | \n",
+ " 1h 48m | \n",
+ " 0.2218 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 6h 50m | \n",
+ " 4h 52m | \n",
+ " 0.2202 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 3h 45m | \n",
+ " 2h 53m | \n",
+ " 0.2000 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 3h 25m | \n",
+ " 2h 39m | \n",
+ " 0.1983 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 3h 40m | \n",
+ " 2h 52m | \n",
+ " 0.1954 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model Name Total Time GPU Util Time \\\n",
+ "1 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
+ "2 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
+ "3 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
+ "4 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
+ "5 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
+ "6 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
+ "7 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
+ "8 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
+ "9 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
+ "10 google_gemma-3-12b-it 15h 45m 14h 8m \n",
+ "11 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
+ "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
+ "13 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
+ "14 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
+ "15 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
+ "16 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
+ "17 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
+ "18 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
+ "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
+ "20 google_gemma-3-4b-it 4h 51m 3h 50m \n",
+ "21 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
+ "22 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
+ "23 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
+ "24 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
+ "25 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
+ "26 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
+ "27 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
+ "28 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
+ "29 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
+ "30 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
+ "31 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
+ "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
+ "33 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
+ "34 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
+ "35 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
+ "36 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
+ "37 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
+ "38 google_gemma-3-1b-it 6h 50m 4h 52m \n",
+ "39 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
+ "40 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
+ "41 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
+ "\n",
+ " Knowledge & Reading Mean Score Knowledge & Reading Avg. Rank \n",
+ "1 0.4369 1 \n",
+ "2 0.4136 2 \n",
+ "3 0.4127 3 \n",
+ "4 0.4063 4 \n",
+ "5 0.4045 5 \n",
+ "6 0.3963 6 \n",
+ "7 0.3926 7 \n",
+ "8 0.3923 8 \n",
+ "9 0.3810 9 \n",
+ "10 0.3791 10 \n",
+ "11 0.3719 11 \n",
+ "12 0.3683 12 \n",
+ "13 0.3581 13 \n",
+ "14 0.3566 14 \n",
+ "15 0.3467 15 \n",
+ "16 0.3438 16 \n",
+ "17 0.3406 17 \n",
+ "18 0.3374 18 \n",
+ "19 0.3339 19 \n",
+ "20 0.3261 20 \n",
+ "21 0.3226 21 \n",
+ "22 0.3018 22 \n",
+ "23 0.3007 23 \n",
+ "24 0.2919 24 \n",
+ "25 0.2903 25 \n",
+ "26 0.2864 26 \n",
+ "27 0.2864 27 \n",
+ "28 0.2834 28 \n",
+ "29 0.2827 29 \n",
+ "30 0.2805 30 \n",
+ "31 0.2731 31 \n",
+ "32 0.2656 32 \n",
+ "33 0.2581 33 \n",
+ "34 0.2567 34 \n",
+ "35 0.2340 35 \n",
+ "36 0.2276 36 \n",
+ "37 0.2218 37 \n",
+ "38 0.2202 38 \n",
+ "39 0.2000 39 \n",
+ "40 0.1983 40 \n",
+ "41 0.1954 41 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "| Model Name | Total Time | GPU Util Time | Knowledge & Reading Mean Score | Knowledge & Reading Avg. Rank |\n",
+ "|:------------------------------------------|:-------------|:----------------|---------------------------------:|--------------------------------:|\n",
+ "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.4369 | 1 |\n",
+ "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.4136 | 2 |\n",
+ "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.4127 | 3 |\n",
+ "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4063 | 4 |\n",
+ "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4045 | 5 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.3963 | 6 |\n",
+ "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.3926 | 7 |\n",
+ "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.3923 | 8 |\n",
+ "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.381 | 9 |\n",
+ "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.3791 | 10 |\n",
+ "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.3719 | 11 |\n",
+ "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.3683 | 12 |\n",
+ "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.3581 | 13 |\n",
+ "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.3566 | 14 |\n",
+ "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.3467 | 15 |\n",
+ "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.3438 | 16 |\n",
+ "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3406 | 17 |\n",
+ "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3374 | 18 |\n",
+ "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.3339 | 19 |\n",
+ "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.3261 | 20 |\n",
+ "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.3226 | 21 |\n",
+ "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3018 | 22 |\n",
+ "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.3007 | 23 |\n",
+ "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.2919 | 24 |\n",
+ "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.2903 | 25 |\n",
+ "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.2864 | 26 |\n",
+ "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.2864 | 27 |\n",
+ "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.2834 | 28 |\n",
+ "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.2827 | 29 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.2805 | 30 |\n",
+ "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.2731 | 31 |\n",
+ "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.2656 | 32 |\n",
+ "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.2581 | 33 |\n",
+ "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.2567 | 34 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.234 | 35 |\n",
+ "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.2276 | 36 |\n",
+ "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2218 | 37 |\n",
+ "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.2202 | 38 |\n",
+ "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.2 | 39 |\n",
+ "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.1983 | 40 |\n",
+ "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.1954 | 41 |\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "column_map = {\n",
+ " \"overall\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Mean Score', \"Overall Rank\"],\n",
+ " \"reasoning_and_math\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Reasoning & Math Mean Score', \"Reasoning & Math Avg. Rank\"],\n",
+ " \"commonsense_and_nli\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Commonsense & NLI Mean Score', \"Commonsense & NLI Avg. Rank\"],\n",
+ " \"knowledge_and_reading\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Knowledge & Reading Mean Score', \"Knowledge & Reading Avg. Rank\"]\n",
+ "}\n",
+ "\n",
+ "\n",
+ "\n",
+ "# Produce sub-dataframes and export them to csv and excel file.\n",
+ "with pd.ExcelWriter(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_all_results.xlsx\") as writer:\n",
+ " df_display.to_excel(writer, sheet_name=\"Master\", index=False)\n",
+ " \n",
+ " for name, cols in column_map.items():\n",
+ " sub_df = df_display[cols].copy()\n",
+ " rank_col = [c for c in sub_df.columns if 'Rank' in c][0]\n",
+ " sub_df = sub_df.sort_values(by=rank_col, ascending=True).reset_index(drop=True)\n",
+ " sub_df.index = sub_df.index + 1\n",
+ " print(name)\n",
+ " if name == 'overall':\n",
+ " overall_df = sub_df\n",
+ " display(sub_df)\n",
+ " \n",
+ " # sub_df.to_csv(f\"/mnt/data8tb/Documents/project/benchmark_project/{name}_rank.csv\")\n",
+ " # sub_df.to_excel(writer, sheet_name=name, index=False)\n",
+ "\n",
+ " table_md = sub_df.to_markdown(index=False)\n",
+ " print(table_md)\n",
+ "\n",
+ " sub_df.to_html(f\"{name}.html\", index=False)\n",
+ " print()\n",
+ " print()\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "5642b72a-e416-482b-b45b-8376fd2571b7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Model Name | \n",
+ " Total Time | \n",
+ " GPU Util Time | \n",
+ " Mean Score | \n",
+ " Overall Rank | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " google_gemma-3-12b-it | \n",
+ " 15h 45m | \n",
+ " 14h 8m | \n",
+ " 0.6038 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Qwen_Qwen3-14B (8bit) | \n",
+ " 29h 45m | \n",
+ " 17h 29m | \n",
+ " 0.5961 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " openchat_openchat-3.6-8b-20240522 | \n",
+ " 7h 51m | \n",
+ " 6h 59m | \n",
+ " 0.5871 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Qwen_Qwen3-8B | \n",
+ " 15h 31m | \n",
+ " 13h 44m | \n",
+ " 0.5859 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Qwen_Qwen2.5-7B-Instruct | \n",
+ " 9h 36m | \n",
+ " 8h 33m | \n",
+ " 0.5788 | \n",
+ " 5 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " Qwen_Qwen2.5-14B-Instruct (8bit) | \n",
+ " 52h 44m | \n",
+ " 29h 32m | \n",
+ " 0.5775 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 01-ai_Yi-1.5-9B | \n",
+ " 11h 43m | \n",
+ " 10h 26m | \n",
+ " 0.5676 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Qwen_Qwen2.5-7B-Instruct-1M | \n",
+ " 11h 17m | \n",
+ " 10h 10m | \n",
+ " 0.5672 | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " meta-llama_Llama-3.1-8B-Instruct | \n",
+ " 12h 19m | \n",
+ " 10h 52m | \n",
+ " 0.5653 | \n",
+ " 9 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 01-ai_Yi-1.5-9B-Chat | \n",
+ " 13h 54m | \n",
+ " 12h 15m | \n",
+ " 0.5621 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " mistralai_Ministral-8B-Instruct-2410 | \n",
+ " 10h 46m | \n",
+ " 9h 27m | \n",
+ " 0.5576 | \n",
+ " 11 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " meta-llama_Meta-Llama-3-8B-Instruct | \n",
+ " 6h 30m | \n",
+ " 5h 46m | \n",
+ " 0.5528 | \n",
+ " 12 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " Qwen_Qwen3-4B | \n",
+ " 5h 51m | \n",
+ " 5h 3m | \n",
+ " 0.5510 | \n",
+ " 13 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " NousResearch_Hermes-2-Pro-Mistral-7B | \n",
+ " 8h 27m | \n",
+ " 7h 28m | \n",
+ " 0.5480 | \n",
+ " 14 | \n",
+ "
\n",
+ " \n",
+ " | 15 | \n",
+ " mistralai_Mistral-7B-Instruct-v0.3 | \n",
+ " 8h 38m | \n",
+ " 7h 41m | \n",
+ " 0.5451 | \n",
+ " 15 | \n",
+ "
\n",
+ " \n",
+ " | 16 | \n",
+ " google_gemma-3-4b-it | \n",
+ " 4h 51m | \n",
+ " 3h 50m | \n",
+ " 0.5368 | \n",
+ " 16 | \n",
+ "
\n",
+ " \n",
+ " | 17 | \n",
+ " 01-ai_Yi-1.5-6B-Chat | \n",
+ " 8h 4m | \n",
+ " 7h 1m | \n",
+ " 0.5335 | \n",
+ " 17 | \n",
+ "
\n",
+ " \n",
+ " | 18 | \n",
+ " 01-ai_Yi-1.5-6B | \n",
+ " 4h 28m | \n",
+ " 3h 54m | \n",
+ " 0.5312 | \n",
+ " 18 | \n",
+ "
\n",
+ " \n",
+ " | 19 | \n",
+ " Qwen_Qwen2-7B-Instruct | \n",
+ " 11h 30m | \n",
+ " 10h 11m | \n",
+ " 0.5271 | \n",
+ " 19 | \n",
+ "
\n",
+ " \n",
+ " | 20 | \n",
+ " deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | \n",
+ " 17h 57m | \n",
+ " 15h 30m | \n",
+ " 0.5219 | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " | 21 | \n",
+ " meta-llama_Llama-3.2-3B-Instruct | \n",
+ " 7h 12m | \n",
+ " 5h 57m | \n",
+ " 0.5048 | \n",
+ " 21 | \n",
+ "
\n",
+ " \n",
+ " | 22 | \n",
+ " Qwen_Qwen2.5-3B-Instruct | \n",
+ " 7h 48m | \n",
+ " 6h 30m | \n",
+ " 0.4939 | \n",
+ " 22 | \n",
+ "
\n",
+ " \n",
+ " | 23 | \n",
+ " Qwen_Qwen2.5-Math-7B | \n",
+ " 27h 21m | \n",
+ " 24h 38m | \n",
+ " 0.4907 | \n",
+ " 23 | \n",
+ "
\n",
+ " \n",
+ " | 24 | \n",
+ " deepseek-ai_deepseek-llm-7b-chat | \n",
+ " 10h 6m | \n",
+ " 9h 8m | \n",
+ " 0.4869 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 25 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Llama-8B | \n",
+ " 11h 46m | \n",
+ " 10h 36m | \n",
+ " 0.4830 | \n",
+ " 25 | \n",
+ "
\n",
+ " \n",
+ " | 26 | \n",
+ " meta-llama_Llama-2-13b-hf | \n",
+ " 19h 21m | \n",
+ " 17h 38m | \n",
+ " 0.4819 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " | 27 | \n",
+ " meta-llama_Llama-2-13b-chat-hf | \n",
+ " 17h 8m | \n",
+ " 15h 37m | \n",
+ " 0.4813 | \n",
+ " 27 | \n",
+ "
\n",
+ " \n",
+ " | 28 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | \n",
+ " 6h 28m | \n",
+ " 5h 43m | \n",
+ " 0.4644 | \n",
+ " 28 | \n",
+ "
\n",
+ " \n",
+ " | 29 | \n",
+ " Qwen_Qwen2.5-1.5B-Instruct | \n",
+ " 3h 20m | \n",
+ " 2h 36m | \n",
+ " 0.4608 | \n",
+ " 29 | \n",
+ "
\n",
+ " \n",
+ " | 30 | \n",
+ " Qwen_Qwen3-1.7B | \n",
+ " 4h 25m | \n",
+ " 3h 36m | \n",
+ " 0.4597 | \n",
+ " 30 | \n",
+ "
\n",
+ " \n",
+ " | 31 | \n",
+ " Qwen_Qwen2.5-Math-7B-Instruct | \n",
+ " 5h 37m | \n",
+ " 4h 57m | \n",
+ " 0.4596 | \n",
+ " 31 | \n",
+ "
\n",
+ " \n",
+ " | 32 | \n",
+ " meta-llama_Llama-2-7b-chat-hf | \n",
+ " 6h 57m | \n",
+ " 6h 7m | \n",
+ " 0.4525 | \n",
+ " 32 | \n",
+ "
\n",
+ " \n",
+ " | 33 | \n",
+ " meta-llama_Llama-2-7b-hf | \n",
+ " 5h 42m | \n",
+ " 4h 59m | \n",
+ " 0.4516 | \n",
+ " 33 | \n",
+ "
\n",
+ " \n",
+ " | 34 | \n",
+ " deepseek-ai_deepseek-llm-7b-base | \n",
+ " 7h 11m | \n",
+ " 6h 26m | \n",
+ " 0.4451 | \n",
+ " 34 | \n",
+ "
\n",
+ " \n",
+ " | 35 | \n",
+ " deepseek-ai_deepseek-math-7b-rl | \n",
+ " 8h 2m | \n",
+ " 7h 12m | \n",
+ " 0.4419 | \n",
+ " 35 | \n",
+ "
\n",
+ " \n",
+ " | 36 | \n",
+ " meta-llama_Llama-3.2-1B-Instruct | \n",
+ " 3h 30m | \n",
+ " 2h 35m | \n",
+ " 0.4219 | \n",
+ " 36 | \n",
+ "
\n",
+ " \n",
+ " | 37 | \n",
+ " google_gemma-3-1b-it | \n",
+ " 6h 50m | \n",
+ " 4h 52m | \n",
+ " 0.4013 | \n",
+ " 37 | \n",
+ "
\n",
+ " \n",
+ " | 38 | \n",
+ " deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | \n",
+ " 3h 40m | \n",
+ " 2h 52m | \n",
+ " 0.3986 | \n",
+ " 38 | \n",
+ "
\n",
+ " \n",
+ " | 39 | \n",
+ " Qwen_Qwen2.5-Math-1.5B-Instruct | \n",
+ " 3h 25m | \n",
+ " 2h 39m | \n",
+ " 0.3838 | \n",
+ " 39 | \n",
+ "
\n",
+ " \n",
+ " | 40 | \n",
+ " Qwen_Qwen3-0.6B | \n",
+ " 3h 45m | \n",
+ " 2h 53m | \n",
+ " 0.3816 | \n",
+ " 40 | \n",
+ "
\n",
+ " \n",
+ " | 41 | \n",
+ " Qwen_Qwen2.5-0.5B-Instruct | \n",
+ " 2h 34m | \n",
+ " 1h 48m | \n",
+ " 0.3799 | \n",
+ " 41 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Model Name Total Time GPU Util Time \\\n",
+ "1 google_gemma-3-12b-it 15h 45m 14h 8m \n",
+ "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n",
+ "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n",
+ "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n",
+ "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n",
+ "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n",
+ "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n",
+ "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n",
+ "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n",
+ "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n",
+ "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n",
+ "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n",
+ "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n",
+ "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n",
+ "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n",
+ "16 google_gemma-3-4b-it 4h 51m 3h 50m \n",
+ "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n",
+ "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n",
+ "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n",
+ "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n",
+ "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n",
+ "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n",
+ "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n",
+ "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n",
+ "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n",
+ "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n",
+ "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n",
+ "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n",
+ "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n",
+ "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n",
+ "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n",
+ "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n",
+ "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n",
+ "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n",
+ "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n",
+ "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n",
+ "37 google_gemma-3-1b-it 6h 50m 4h 52m \n",
+ "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n",
+ "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n",
+ "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n",
+ "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n",
+ "\n",
+ " Mean Score Overall Rank \n",
+ "1 0.6038 1 \n",
+ "2 0.5961 2 \n",
+ "3 0.5871 3 \n",
+ "4 0.5859 4 \n",
+ "5 0.5788 5 \n",
+ "6 0.5775 6 \n",
+ "7 0.5676 7 \n",
+ "8 0.5672 8 \n",
+ "9 0.5653 9 \n",
+ "10 0.5621 10 \n",
+ "11 0.5576 11 \n",
+ "12 0.5528 12 \n",
+ "13 0.5510 13 \n",
+ "14 0.5480 14 \n",
+ "15 0.5451 15 \n",
+ "16 0.5368 16 \n",
+ "17 0.5335 17 \n",
+ "18 0.5312 18 \n",
+ "19 0.5271 19 \n",
+ "20 0.5219 20 \n",
+ "21 0.5048 21 \n",
+ "22 0.4939 22 \n",
+ "23 0.4907 23 \n",
+ "24 0.4869 24 \n",
+ "25 0.4830 25 \n",
+ "26 0.4819 26 \n",
+ "27 0.4813 27 \n",
+ "28 0.4644 28 \n",
+ "29 0.4608 29 \n",
+ "30 0.4597 30 \n",
+ "31 0.4596 31 \n",
+ "32 0.4525 32 \n",
+ "33 0.4516 33 \n",
+ "34 0.4451 34 \n",
+ "35 0.4419 35 \n",
+ "36 0.4219 36 \n",
+ "37 0.4013 37 \n",
+ "38 0.3986 38 \n",
+ "39 0.3838 39 \n",
+ "40 0.3816 40 \n",
+ "41 0.3799 41 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "display(overall_df)\n",
+ "overall_df.to_html(\"overall.html\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1a04411e-c749-428f-89bd-2c23ac74af71",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7368bca2-dd44-4393-be0e-320f737af82b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}