{ "cells": [ { "cell_type": "code", "execution_count": 23, "id": "15c618d3-e5a2-4ae8-8e2e-df916cc7d465", "metadata": {}, "outputs": [], "source": [ "import json, pathlib, pandas as pd\n", "from pprint import pprint\n", "import os\n", "from pathlib import Path\n", "from collections import Counter\n", "from io import StringIO\n", "import numpy as np\n", "\n", "\n", "pd.set_option(\"display.max_rows\", None)     # show ALL rows\n", "pd.set_option(\"display.max_columns\", None)  # show ALL columns\n", "\n", "\n", "# Time conversion function\n", "def format_time(seconds: float) -> str:\n", "    seconds = int(seconds)\n", "    hours, remainder = divmod(seconds, 3600)\n", "    minutes = remainder // 60  # drop leftover seconds\n", "\n", "    parts = []\n", "    if hours > 0:\n", "        parts.append(f\"{hours}h\")\n", "    if minutes > 0 or not parts:  # if no hours and no minutes, show 0m\n", "        parts.append(f\"{minutes}m\")\n", "\n", "    return \" \".join(parts)\n", "\n", "\n", "def list_json_files(directory: str):\n", "    \"\"\"\n", "    Reads all .json files in a given directory and returns\n", "    their full paths as a list.\n", "    \"\"\"\n", "    json_files = []\n", "    for file in os.listdir(directory):\n", "        if file.endswith(\".json\"):\n", "            full_path = os.path.join(directory, file)\n", "            json_files.append(full_path)\n", "    return json_files\n", "\n", "\n", "def format_params(n: int) -> str:\n", "    \"\"\"\n", "    Convert a raw parameter count (int) into a human-friendly string.\n", "    Examples:\n", "        6851947264  -> \"6.9B\"\n", "        12500000000 -> \"12.5B\"\n", "        560000000   -> \"560M\"\n", "    \"\"\"\n", "    if n >= 1_000_000_000:  # billions\n", "        val = n / 1_000_000_000\n", "        if val.is_integer():\n", "            return f\"{int(val)}B\"\n", "        else:\n", "            return f\"{val:.1f}B\"\n", "    elif n >= 1_000_000:  # millions\n", "        val = n / 1_000_000\n", "        if val.is_integer():\n", "            return f\"{int(val)}M\"\n", "        else:\n", "            return f\"{val:.1f}M\"\n", "    elif n >= 1_000:  # thousands (rare for params, but included)\n", "        val = n / 1_000\n", "        if val.is_integer():\n", "            return f\"{int(val)}K\"\n", "        else:\n", "            return f\"{val:.1f}K\"\n", "    else:\n", "        return str(n)\n", "\n", "\n", "metric_map = {\n", "    \"mmlu\": \"acc,none\",\n", "    \"hellaswag\": \"acc_norm,none\",\n", "    \"arc_challenge\": \"acc_norm,none\",  # prefer normalized accuracy\n", "    \"bbh\": \"exact_match,get-answer\",\n", "    \"gsm8k\": \"exact_match,strict-match\",\n", "    \"gpqa_main_zeroshot\": \"acc_norm,none\",\n", "    \"anli_r1\": \"acc,none\",\n", "    \"anli_r2\": \"acc,none\",\n", "    \"anli_r3\": \"acc,none\",\n", "    \"piqa\": \"acc_norm,none\",\n", "    \"winogrande\": \"acc,none\",\n", "    \"boolq\": \"acc,none\",\n", "    \"truthfulqa_mc1\": \"acc,none\",\n", "    \"truthfulqa_mc2\": \"acc,none\",\n", "    \"drop\": \"f1,none\",\n", "    \"nq_open\": \"exact_match,remove_whitespace\",\n", "    \"openbookqa\": \"acc_norm,none\",\n", "    \"sciq\": \"acc_norm,none\",\n", "    \"triviaqa\": \"exact_match,remove_whitespace\",\n", "    \"qnli\": \"acc,none\",\n", "}\n", "\n", "# Tasks from most important to least important\n", "# tasks = [mmlu, hellaswag, arc_challenge, bbh, gsm8k, gpqa_main_zeroshot, ANLI (r1, r2, r3), piqa, winogrande, boolq, TruthfulQA (mc1, mc2), drop, nq_open, openbookqa, sciq, triviaqa, qnli]\n", "\n", "# Path list\n", "directory = \"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results\"\n", "all_json_paths = list_json_files(directory)\n", "\n", "def 
parse_results(json_path: str, metric_map: dict) -> pd.DataFrame:\n", "\n", "    with open(json_path, \"r\") as f:\n", "        data = json.load(f)\n", "\n", "    # Extract core info\n", "    model_name = data.get(\"model_name\")\n", "    model_name = model_name.split(\"/\")[-1]\n", "    total_time_raw = float(data.get(\"total_evaluation_time_seconds\", 0))\n", "    total_time = format_time(total_time_raw)\n", "    batch_size = data[\"config\"].get(\"batch_size\")\n", "    batch_sizes = data[\"config\"].get(\"batch_sizes\")\n", "    parameters = format_params(data[\"config\"].get(\"model_num_parameters\"))\n", "    parameters_raw = data[\"config\"].get(\"model_num_parameters\")\n", "\n", "    rows = []\n", "    for task, metric_key in metric_map.items():\n", "        # Fail loudly if an expected task is missing from the results\n", "        if task not in data[\"results\"]:\n", "            raise ValueError(f\"'{task}' not in results!\")\n", "\n", "        metrics = data[\"results\"][task]\n", "\n", "        # If the metric_key isn't in this task's results, raise an error\n", "        if metric_key not in metrics:\n", "            raise ValueError(\n", "                f\"Expected metric '{metric_key}' not found for task '{task}'. \"\n", "                f\"Available keys: {list(metrics.keys())}\"\n", "            )\n", "\n", "        acc = metrics[metric_key]\n", "\n", "        row = {\n", "            \"model_name\": model_name,\n", "            \"task\": task + \"(\" + metric_key + \")\",\n", "            \"score\": acc,\n", "            \"total_time\": total_time,\n", "            \"total_time_raw\": total_time_raw,\n", "            \"batch_size\": batch_size,\n", "            \"batch_sizes\": batch_sizes,\n", "            \"parameters\": parameters,\n", "            \"parameters_raw\": parameters_raw,\n", "        }\n", "        rows.append(row)\n", "\n", "    # Convert to a tidy dataframe\n", "    return pd.DataFrame(rows)\n", "\n", "\n", "dfs = [parse_results(path, metric_map) for path in all_json_paths]\n", "master_df = pd.concat(dfs, ignore_index=True)\n", "\n", "# display(master_df)\n", "\n", "\n", "# Wide format: one row per model, columns = tasks\n", "# Check for duplicate rows first\n", "key_cols = [\"task\", \"score\", \"model_name\"]\n", "dups_mask = master_df.duplicated(key_cols, keep=False)\n", "\n", "if dups_mask.any():\n", "    dups = master_df.loc[dups_mask, key_cols]\n", "    raise ValueError(f\"Duplicate rows found for keys:\\n{dups}\")\n", "\n", "wide_df = master_df.pivot_table(\n", "    index=[\"model_name\", \"parameters\"],\n", "    columns=[\"task\"],\n", "    values=\"score\",\n", "    aggfunc=\"mean\"\n", ").reset_index()\n", "\n", "# Select the metadata columns to carry over from the long df\n", "meta_cols = [\n", "    \"model_name\",\n", "    \"parameters\",\n", "    \"parameters_raw\",\n", "    \"total_time\",\n", "    \"total_time_raw\",\n", "    \"batch_size\",\n", "    \"batch_sizes\",\n", "]\n", "\n", "# Drop duplicate rows by model_name + parameters\n", "df_meta = master_df[meta_cols].drop_duplicates(subset=[\"model_name\", \"parameters\"])\n", "\n", "# Merge the metadata back into the wide dataframe\n", "df_wide_merged = df_meta.merge(wide_df, on=[\"model_name\", \"parameters\"], how=\"left\")\n", "\n", "# display(df_wide_merged.drop(columns=[\"parameters_raw\", \"total_time_raw\", \"batch_sizes\"]))\n" ] }, { 
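"cell_type": "markdown", "id": "helper-sanity-note", "metadata": {}, "source": [ "Quick sanity checks for the formatting helpers above (a throwaway sketch; the expected strings follow directly from the definitions of `format_time` and `format_params`)." ] }, { "cell_type": "code", "execution_count": null, "id": "helper-sanity-check", "metadata": {}, "outputs": [], "source": [ "# Spot-check the helpers on a few hand-computed values (sketch).\n", "assert format_time(3665) == \"1h 1m\"   # leftover seconds are dropped\n", "assert format_time(59) == \"0m\"        # under a minute still shows 0m\n", "assert format_params(12187325040) == \"12.2B\"\n", "assert format_params(596049920) == \"596.0M\"\n" ] }, { 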
"cell_type": "code", "execution_count": 24, "id": "324364b8-b59a-4450-8723-0c4057488513", "metadata": {}, "outputs": [], "source": [ "gpu_dir = Path(\"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage\")\n", "gpu_files = list(gpu_dir.glob(\"*_gpu_usage.csv\"))\n", "\n", "def model_from_filename(p: Path) -> str:\n", "    return p.stem.replace(\"_gpu_usage\", \"\").strip()\n", "\n", "model_names_gpu = [model_from_filename(x) for x in gpu_files]\n", "\n", "# Check that the GPU-log model names match the results\n", "set_gpu = set(model_names_gpu)\n", "set_results = set(wide_df[\"model_name\"])\n", "extra_in_gpu = set_gpu - set_results\n", "missing_in_gpu = set_results - set_gpu\n", "# print(\"Extra models in GPU logs:\", extra_in_gpu)\n", "# print(\"Models in results with no GPU log:\", missing_in_gpu)\n", "\n", "\n", "# Check for duplicates\n", "def print_duplicates(name_list, label=\"\"):\n", "    counts = Counter(name_list)\n", "    dups = [name for name, cnt in counts.items() if cnt > 1]\n", "    if dups:\n", "        print(f\"Duplicates in {label}:\")\n", "        for name in dups:\n", "            print(f\"  {name} (count = {counts[name]})\")\n", "    else:\n", "        print(f\"No duplicates found in {label}.\")\n", "# print_duplicates(model_names_gpu, \"GPU logs\")\n", "\n", "\n", "def read_last_run_csv(path: Path) -> pd.DataFrame:\n", "    \"\"\"\n", "    Return a DataFrame for only the *last* '==== New Run ... ====' block.\n", "    Assumes the line after the marker is the CSV header.\n", "    \"\"\"\n", "    lines = path.read_text(encoding=\"utf-8\").splitlines()\n", "    # Locate all run markers\n", "    run_idx = [i for i, line in enumerate(lines) if line.startswith(\"==== New Run:\")]\n", "    if not run_idx:\n", "        raise ValueError(f\"No '==== New Run' marker found in {path}\")\n", "    start = run_idx[-1] + 1  # header line index\n", "\n", "    # Slice from the header to the end and parse as CSV\n", "    block = \"\\n\".join(lines[start:])\n", "    df = pd.read_csv(StringIO(block))\n", "\n", "    # Optional cleanup: strip units and cast to numbers if these columns exist\n", "    if \" utilization.gpu [%]\" in df.columns:\n", "        df[\" utilization.gpu [%]\"] = (\n", "            df[\" utilization.gpu [%]\"].astype(str).str.replace(\"%\", \"\", regex=False).str.strip().astype(\"float\")\n", "        )\n", "    if \" memory.used [MiB]\" in df.columns:\n", "        df[\" memory.used [MiB]\"] = (\n", "            df[\" memory.used [MiB]\"].astype(str).str.replace(\"MiB\", \"\", regex=False).str.strip().astype(\"float\")\n", "        )\n", "    # Parse the timestamp if present\n", "    if \"timestamp\" in df.columns:\n", "        df[\"timestamp\"] = pd.to_datetime(df[\"timestamp\"], errors=\"coerce\")\n", "\n", "    return df\n", "\n", "\n", "def eq_full_util_time(df, util_col=\" utilization.gpu [%]\", interval_sec=60):\n", "    # Cast to numeric; a clipped variant is kept commented out just in case\n", "    u = pd.to_numeric(df[util_col], errors=\"coerce\")\n", "    # u = pd.to_numeric(df[util_col], errors=\"coerce\").fillna(0).clip(0, 100)\n", "    eq_full_sec = float((u / 100 * interval_sec).sum())\n", "    full_sec = float(len(u) * interval_sec)\n", "\n", "    # Pretty formatters\n", "    h, rem = divmod(int(round(full_sec)), 3600)\n", "    m, _ = divmod(rem, 60)\n", "    pretty_full = f\"{h}h {m}m\"\n", "    h, rem = divmod(int(round(eq_full_sec)), 3600)\n", "    m, _ = divmod(rem, 60)\n", "    pretty = f\"{h}h {m}m\"\n", "    return pretty, pretty_full, eq_full_sec\n", "\n", "\n", "gpu_dfs = [read_last_run_csv(path) for path in gpu_files]\n", "\n", "results = []\n", "for name, df in zip(model_names_gpu, gpu_dfs):\n", "    pretty, pretty_full, eq_full_sec = eq_full_util_time(df)   # unpack values\n", "    results.append((name, pretty, eq_full_sec, pretty_full))   # collect tuple\n", "\n", "# Turn into a DataFrame\n", "gpu_util_df = pd.DataFrame(results, columns=[\"model_name\", \"gpu_util_time\", \"gpu_util_time_raw\", \"full_time_from_gpu_log\"])\n", "\n", "result_gpu_merged = gpu_util_df.merge(df_wide_merged, on=[\"model_name\"], how=\"left\")\n" ] }, { 
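"cell_type": "markdown", "id": "gpu-util-crosscheck-note", "metadata": {}, "source": [ "A small cross-check (a sketch, assuming `result_gpu_merged` carries both timing columns): the equivalent-full-utilization seconds derived from the GPU logs should never exceed the harness wall-clock time, and their ratio gives an average GPU utilization per run." ] }, { "cell_type": "code", "execution_count": null, "id": "gpu-util-crosscheck", "metadata": {}, "outputs": [], "source": [ "# Sketch: average GPU utilization per run, plus a flag for any run where\n", "# the equivalent-full-util time exceeds wall-clock (a log/merge mismatch).\n", "util_check = result_gpu_merged[[\"model_name\", \"gpu_util_time_raw\", \"total_time_raw\"]].copy()\n", "util_check[\"avg_gpu_util_pct\"] = 100 * util_check[\"gpu_util_time_raw\"] / util_check[\"total_time_raw\"]\n", "\n", "suspicious = util_check[util_check[\"gpu_util_time_raw\"] > util_check[\"total_time_raw\"]]\n", "if not suspicious.empty:\n", "    print(\"Runs where GPU-util time exceeds wall-clock:\")\n", "    print(suspicious)\n" ] }, { 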
"cell_type": "code", "execution_count": 44, "id": "2fa54bc3-81f2-492c-832c-26e4f9a7cff3", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
[Styled HTML rendering of the ranked results table omitted; the same data appears in the text/plain output below.]
" ], "text/plain": [ " Overall Rank Model Name GPU Util Time \\\n", "0 1 google_gemma-3-12b-it 14h 8m \n", "1 2 Qwen_Qwen3-14B (8bit) 17h 29m \n", "2 3 openchat_openchat-3.6-8b-20240522 6h 59m \n", "3 4 Qwen_Qwen3-8B 13h 44m \n", "4 5 Qwen_Qwen2.5-7B-Instruct 8h 33m \n", "5 6 Qwen_Qwen2.5-14B-Instruct (8bit) 29h 32m \n", "6 7 01-ai_Yi-1.5-9B 10h 26m \n", "7 8 Qwen_Qwen2.5-7B-Instruct-1M 10h 10m \n", "8 9 meta-llama_Llama-3.1-8B-Instruct 10h 52m \n", "9 10 01-ai_Yi-1.5-9B-Chat 12h 15m \n", "10 11 mistralai_Ministral-8B-Instruct-2410 9h 27m \n", "11 12 meta-llama_Meta-Llama-3-8B-Instruct 5h 46m \n", "12 13 Qwen_Qwen3-4B 5h 3m \n", "13 14 NousResearch_Hermes-2-Pro-Mistral-7B 7h 28m \n", "14 15 mistralai_Mistral-7B-Instruct-v0.3 7h 41m \n", "15 16 google_gemma-3-4b-it 3h 50m \n", "16 17 01-ai_Yi-1.5-6B-Chat 7h 1m \n", "17 18 01-ai_Yi-1.5-6B 3h 54m \n", "18 19 Qwen_Qwen2-7B-Instruct 10h 11m \n", "19 20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 15h 30m \n", "20 21 meta-llama_Llama-3.2-3B-Instruct 5h 57m \n", "21 22 Qwen_Qwen2.5-3B-Instruct 6h 30m \n", "22 23 Qwen_Qwen2.5-Math-7B 24h 38m \n", "23 24 deepseek-ai_deepseek-llm-7b-chat 9h 8m \n", "24 25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 10h 36m \n", "25 26 meta-llama_Llama-2-13b-hf 17h 38m \n", "26 27 meta-llama_Llama-2-13b-chat-hf 15h 37m \n", "27 28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 5h 43m \n", "28 29 Qwen_Qwen2.5-1.5B-Instruct 2h 36m \n", "29 30 Qwen_Qwen3-1.7B 3h 36m \n", "30 31 Qwen_Qwen2.5-Math-7B-Instruct 4h 57m \n", "31 32 meta-llama_Llama-2-7b-chat-hf 6h 7m \n", "32 33 meta-llama_Llama-2-7b-hf 4h 59m \n", "33 34 deepseek-ai_deepseek-llm-7b-base 6h 26m \n", "34 35 deepseek-ai_deepseek-math-7b-rl 7h 12m \n", "35 36 meta-llama_Llama-3.2-1B-Instruct 2h 35m \n", "36 37 google_gemma-3-1b-it 4h 52m \n", "37 38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 2h 52m \n", "38 39 Qwen_Qwen2.5-Math-1.5B-Instruct 2h 39m \n", "39 40 Qwen_Qwen3-0.6B 2h 53m \n", "40 41 Qwen_Qwen2.5-0.5B-Instruct 1h 48m \n", "\n", " gpu_util_time_raw full_time_from_gpu_log Parameters parameters_raw \\\n", "0 50906.4 15h 47m 12.2B 12187325040 \n", "1 62956.2 29h 46m 14.8B 14768307200 \n", "2 25150.8 7h 52m 8.0B 8030261248 \n", "3 49497.0 15h 33m 8.2B 8190735360 \n", "4 30831.6 9h 38m 7.6B 7615616512 \n", "5 106374.6 52h 45m 14.8B 14770033664 \n", "6 37569.6 11h 44m 8.8B 8829407232 \n", "7 36621.0 11h 18m 7.6B 7615616512 \n", "8 39147.6 12h 20m 8.0B 8030261248 \n", "9 44120.4 13h 55m 8.8B 8829407232 \n", "10 34053.6 10h 47m 8.0B 8019808256 \n", "11 20809.8 6h 31m 8.0B 8030261248 \n", "12 18234.6 5h 52m 4.0B 4022468096 \n", "13 26916.0 8h 28m 7.2B 7241994240 \n", "14 27676.8 8h 39m 7.2B 7248023552 \n", "15 13811.4 4h 52m 4.3B 4300079472 \n", "16 25318.8 8h 5m 6.1B 6061035520 \n", "17 14091.6 4h 29m 6.1B 6061035520 \n", "18 36684.6 11h 31m 7.6B 7615616512 \n", "19 55855.2 17h 59m 8.2B 8190735360 \n", "20 21477.0 7h 13m 3.2B 3212749824 \n", "21 23452.2 7h 49m 3.1B 3085938688 \n", "22 88696.2 27h 23m 7.6B 7615616512 \n", "23 32906.4 10h 8m 6.9B 6910365696 \n", "24 38179.2 11h 47m 8.0B 8030261248 \n", "25 63506.4 19h 22m 13.0B 13015864320 \n", "26 56271.6 17h 9m 13.0B 13015864320 \n", "27 20637.0 6h 29m 7.6B 7615616512 \n", "28 9398.4 3h 21m 1.5B 1543714304 \n", "29 13010.4 4h 26m 1.7B 1720574976 \n", "30 17861.4 5h 38m 7.6B 7615616512 \n", "31 22072.8 6h 59m 6.7B 6738415616 \n", "32 17980.2 5h 43m 6.7B 6738415616 \n", "33 23180.4 7h 12m 6.9B 6910365696 \n", "34 25973.4 8h 3m 6.9B 6910365696 \n", "35 9307.8 3h 32m 1.2B 1235814400 \n", "36 17533.8 6h 51m 999.9M 999885952 \n", "37 
10353.6 3h 42m 1.8B 1777088000 \n", "38 9542.4 3h 26m 1.5B 1543714304 \n", "39 10404.6 3h 46m 596.0M 596049920 \n", "40 6532.8 2h 35m 494.0M 494032768 \n", "\n", " Total Time total_time_raw batch_size batch_sizes \\\n", "0 15h 45m 56750.865892 auto [2] \n", "1 29h 45m 107151.802065 1 [] \n", "2 7h 51m 28278.859470 3 [] \n", "3 15h 31m 55918.467860 auto [1] \n", "4 9h 36m 34616.604248 3 [] \n", "5 52h 44m 189869.409404 1 [] \n", "6 11h 43m 42212.112622 2 [] \n", "7 11h 17m 40632.813397 auto [1] \n", "8 12h 19m 44363.249360 auto [1] \n", "9 13h 54m 50056.331345 2 [] \n", "10 10h 46m 38770.339256 auto [1] \n", "11 6h 30m 23440.234421 3 [] \n", "12 5h 51m 21077.943646 6 [] \n", "13 8h 27m 30434.329021 3 [] \n", "14 8h 38m 31084.838324 3 [] \n", "15 4h 51m 17460.233507 auto [4] \n", "16 8h 4m 29040.429802 2 [] \n", "17 4h 28m 16094.199661 auto [8] \n", "18 11h 30m 41431.857967 auto [1] \n", "19 17h 57m 64675.539163 auto [1] \n", "20 7h 12m 25939.885959 auto [2] \n", "21 7h 48m 28089.516568 auto:4 [2, 64, 64, 64, 64] \n", "22 27h 21m 98517.403245 auto [4] \n", "23 10h 6m 36412.969244 3 [] \n", "24 11h 46m 42405.489811 auto:5 [1, 64, 64, 64, 64, 64] \n", "25 19h 21m 69687.765642 auto [1] \n", "26 17h 8m 61732.053618 auto [1] \n", "27 6h 28m 23311.022941 3 [] \n", "28 3h 20m 12036.565195 6 [] \n", "29 4h 25m 15915.268575 6 [] \n", "30 5h 37m 20230.489569 auto [4] \n", "31 6h 57m 25079.294749 auto [4] \n", "32 5h 42m 20539.258032 auto [4] \n", "33 7h 11m 25877.186720 3 [] \n", "34 8h 2m 28925.110783 3 [] \n", "35 3h 30m 12653.736082 auto [2] \n", "36 6h 50m 24641.929494 auto [1] \n", "37 3h 40m 13254.913052 6 [] \n", "38 3h 25m 12324.098490 auto:4 [6, 64, 64, 64, 64] \n", "39 3h 45m 13547.446141 6 [] \n", "40 2h 34m 9253.074769 6 [] \n", "\n", " anli_r1(acc,none) anli_r2(acc,none) anli_r3(acc,none) \\\n", "0 0.603 0.560 0.595833 \n", "1 0.646 0.570 0.556667 \n", "2 0.556 0.513 0.480000 \n", "3 0.669 0.542 0.555833 \n", "4 0.685 0.549 0.552500 \n", "5 0.721 0.634 0.617500 \n", "6 0.532 0.480 0.439167 \n", "7 0.585 0.533 0.556667 \n", "8 0.482 0.467 0.443333 \n", "9 0.535 0.509 0.525833 \n", "10 0.488 0.487 0.465833 \n", "11 0.484 0.458 0.448333 \n", "12 0.550 0.461 0.513333 \n", "13 0.531 0.496 0.500000 \n", "14 0.476 0.443 0.448333 \n", "15 0.492 0.471 0.468333 \n", "16 0.477 0.453 0.460000 \n", "17 0.448 0.407 0.406667 \n", "18 0.573 0.525 0.522500 \n", "19 0.511 0.464 0.476667 \n", "20 0.447 0.418 0.430833 \n", "21 0.562 0.466 0.494167 \n", "22 0.387 0.407 0.382500 \n", "23 0.423 0.419 0.420833 \n", "24 0.404 0.410 0.388333 \n", "25 0.377 0.390 0.385000 \n", "26 0.430 0.430 0.414167 \n", "27 0.445 0.418 0.410000 \n", "28 0.448 0.392 0.431667 \n", "29 0.410 0.404 0.434167 \n", "30 0.431 0.415 0.429167 \n", "31 0.417 0.410 0.407500 \n", "32 0.364 0.372 0.375833 \n", "33 0.340 0.363 0.377500 \n", "34 0.368 0.389 0.405000 \n", "35 0.338 0.334 0.372500 \n", "36 0.332 0.354 0.356667 \n", "37 0.356 0.362 0.362500 \n", "38 0.342 0.341 0.353333 \n", "39 0.343 0.319 0.344167 \n", "40 0.324 0.342 0.347500 \n", "\n", " arc_challenge(acc_norm,none) bbh(exact_match,get-answer) \\\n", "0 0.610922 0.801874 \n", "1 0.600683 0.432960 \n", "2 0.603242 0.617877 \n", "3 0.562287 0.797573 \n", "4 0.552901 0.448779 \n", "5 0.615188 0.106896 \n", "6 0.546928 0.712026 \n", "7 0.585324 0.277223 \n", "8 0.550341 0.715558 \n", "9 0.587031 0.610659 \n", "10 0.562287 0.692520 \n", "11 0.563993 0.679005 \n", "12 0.539249 0.752265 \n", "13 0.565700 0.573798 \n", "14 0.589590 0.562586 \n", "15 0.570819 0.709415 \n", "16 0.539249 
0.547842 \n", "17 0.496587 0.575488 \n", "18 0.540102 0.577484 \n", "19 0.549488 0.584088 \n", "20 0.459044 0.556443 \n", "21 0.482082 0.249117 \n", "22 0.502560 0.672401 \n", "23 0.496587 0.454769 \n", "24 0.423208 0.603748 \n", "25 0.489761 0.477653 \n", "26 0.501706 0.477960 \n", "27 0.437713 0.556904 \n", "28 0.468430 0.369221 \n", "29 0.434300 0.482568 \n", "30 0.430887 0.614038 \n", "31 0.442833 0.401321 \n", "32 0.462457 0.399017 \n", "33 0.445392 0.423744 \n", "34 0.489761 0.524651 \n", "35 0.380546 0.378129 \n", "36 0.380546 0.382276 \n", "37 0.346416 0.405928 \n", "38 0.365188 0.437260 \n", "39 0.342150 0.414836 \n", "40 0.337031 0.213792 \n", "\n", " boolq(acc,none) drop(f1,none) gpqa_main_zeroshot(acc_norm,none) \\\n", "0 0.874618 0.139566 0.337054 \n", "1 0.891743 0.090410 0.397321 \n", "2 0.872783 0.251569 0.332589 \n", "3 0.865749 0.109877 0.350446 \n", "4 0.863303 0.071089 0.328125 \n", "5 0.886239 0.071276 0.354911 \n", "6 0.858104 0.445686 0.294643 \n", "7 0.852599 0.057047 0.339286 \n", "8 0.841590 0.193729 0.343750 \n", "9 0.868196 0.125326 0.303571 \n", "10 0.860245 0.071413 0.341518 \n", "11 0.831193 0.163977 0.310268 \n", "12 0.850459 0.097707 0.325893 \n", "13 0.868196 0.109754 0.276786 \n", "14 0.858410 0.089972 0.283482 \n", "15 0.839755 0.089284 0.287946 \n", "16 0.847401 0.116081 0.357143 \n", "17 0.801529 0.399462 0.290179 \n", "18 0.856269 0.052028 0.314732 \n", "19 0.848318 0.053279 0.372768 \n", "20 0.784709 0.155394 0.328125 \n", "21 0.801223 0.077333 0.321429 \n", "22 0.745566 0.043235 0.308036 \n", "23 0.833028 0.103048 0.292411 \n", "24 0.828746 0.071225 0.274554 \n", "25 0.806422 0.030132 0.254464 \n", "26 0.816514 0.091509 0.299107 \n", "27 0.778287 0.041198 0.334821 \n", "28 0.781346 0.039052 0.283482 \n", "29 0.776453 0.075260 0.290179 \n", "30 0.606116 0.027299 0.287946 \n", "31 0.797859 0.117497 0.261161 \n", "32 0.777370 0.036335 0.241071 \n", "33 0.723547 0.042181 0.252232 \n", "34 0.755963 0.119027 0.272321 \n", "35 0.694801 0.163484 0.274554 \n", "36 0.758104 0.076157 0.265625 \n", "37 0.680122 0.050686 0.272321 \n", "38 0.569419 0.023086 0.283482 \n", "39 0.639144 0.060544 0.270089 \n", "40 0.676758 0.028644 0.267857 \n", "\n", " gsm8k(exact_match,strict-match) hellaswag(acc_norm,none) mmlu(acc,none) \\\n", "0 0.877180 0.818761 0.716137 \n", "1 0.898408 0.787692 0.769477 \n", "2 0.750569 0.797849 0.643071 \n", "3 0.872631 0.748656 0.728956 \n", "4 0.762699 0.804919 0.718060 \n", "5 0.792267 0.841964 0.783079 \n", "6 0.639121 0.778929 0.689289 \n", "7 0.795299 0.789982 0.716636 \n", "8 0.754359 0.792073 0.679319 \n", "9 0.708112 0.787293 0.684091 \n", "10 0.774829 0.791077 0.640721 \n", "11 0.756634 0.759211 0.638727 \n", "12 0.856710 0.683330 0.683592 \n", "13 0.685368 0.804919 0.605113 \n", "14 0.489765 0.828919 0.597137 \n", "15 0.761941 0.741386 0.575559 \n", "16 0.670205 0.767477 0.617861 \n", "17 0.522365 0.754133 0.624270 \n", "18 0.646702 0.806015 0.699402 \n", "19 0.812737 0.756423 0.682951 \n", "20 0.642153 0.705437 0.605184 \n", "21 0.101592 0.749054 0.654964 \n", "22 0.847612 0.652858 0.579903 \n", "23 0.463988 0.777236 0.498789 \n", "24 0.624716 0.742979 0.532688 \n", "25 0.229719 0.793866 0.520937 \n", "26 0.347233 0.796654 0.531263 \n", "27 0.786202 0.602569 0.526350 \n", "28 0.319181 0.682932 0.600555 \n", "29 0.689917 0.603764 0.553767 \n", "30 0.890068 0.588130 0.537245 \n", "31 0.231994 0.754830 0.463609 \n", "32 0.137983 0.760008 0.418530 \n", "33 0.162244 0.760605 0.442814 \n", "34 0.142532 0.689604 0.524996 \n", "35 
0.337377 0.608843 0.458909 \n", "36 0.247157 0.578271 0.385914 \n", "37 0.701289 0.446724 0.360632 \n", "38 0.736922 0.416550 0.378792 \n", "39 0.412434 0.471918 0.401296 \n", "40 0.207733 0.524099 0.457556 \n", "\n", " nq_open(exact_match,remove_whitespace) openbookqa(acc_norm,none) \\\n", "0 0.157064 0.498 \n", "1 0.092244 0.460 \n", "2 0.170637 0.462 \n", "3 0.073684 0.418 \n", "4 0.045706 0.486 \n", "5 0.061496 0.476 \n", "6 0.153186 0.456 \n", "7 0.157618 0.480 \n", "8 0.177562 0.432 \n", "9 0.009418 0.436 \n", "10 0.157618 0.466 \n", "11 0.159003 0.430 \n", "12 0.014681 0.402 \n", "13 0.040443 0.434 \n", "14 0.153740 0.470 \n", "15 0.109418 0.466 \n", "16 0.027147 0.436 \n", "17 0.178116 0.422 \n", "18 0.013296 0.462 \n", "19 0.018283 0.430 \n", "20 0.139058 0.358 \n", "21 0.008310 0.422 \n", "22 0.050970 0.392 \n", "23 0.063435 0.460 \n", "24 0.058449 0.410 \n", "25 0.236288 0.452 \n", "26 0.103047 0.440 \n", "27 0.032133 0.360 \n", "28 0.041551 0.406 \n", "29 0.022161 0.376 \n", "30 0.019945 0.334 \n", "31 0.066759 0.438 \n", "32 0.188920 0.442 \n", "33 0.150970 0.434 \n", "34 0.039335 0.424 \n", "35 0.056510 0.346 \n", "36 0.035734 0.388 \n", "37 0.006371 0.308 \n", "38 0.003878 0.286 \n", "39 0.020499 0.320 \n", "40 0.020499 0.346 \n", "\n", " piqa(acc_norm,none) qnli(acc,none) sciq(acc_norm,none) \\\n", "0 0.780740 0.745744 0.954 \n", "1 0.794886 0.844225 0.966 \n", "2 0.818281 0.730002 0.964 \n", "3 0.775299 0.781805 0.958 \n", "4 0.803047 0.804503 0.937 \n", "5 0.817193 0.853926 0.929 \n", "6 0.806311 0.508695 0.952 \n", "7 0.816104 0.678199 0.950 \n", "8 0.806311 0.501373 0.962 \n", "9 0.803591 0.787662 0.954 \n", "10 0.823177 0.494966 0.956 \n", "11 0.787269 0.546403 0.932 \n", "12 0.751360 0.808713 0.932 \n", "13 0.798694 0.556471 0.917 \n", "14 0.826986 0.514552 0.943 \n", "15 0.772035 0.565989 0.931 \n", "16 0.787813 0.679480 0.934 \n", "17 0.801415 0.598572 0.941 \n", "18 0.805767 0.547135 0.916 \n", "19 0.756801 0.557752 0.941 \n", "20 0.755169 0.545122 0.932 \n", "21 0.780740 0.797913 0.913 \n", "22 0.745375 0.498078 0.929 \n", "23 0.801415 0.496980 0.893 \n", "24 0.775843 0.514735 0.899 \n", "25 0.805223 0.495332 0.935 \n", "26 0.793254 0.543840 0.905 \n", "27 0.716540 0.520959 0.918 \n", "28 0.758433 0.566722 0.939 \n", "29 0.720348 0.510525 0.914 \n", "30 0.685528 0.677467 0.858 \n", "31 0.771491 0.580084 0.878 \n", "32 0.790533 0.499176 0.910 \n", "33 0.797606 0.495881 0.915 \n", "34 0.750272 0.498993 0.928 \n", "35 0.742111 0.494600 0.897 \n", "36 0.720892 0.494051 0.858 \n", "37 0.657780 0.505400 0.845 \n", "38 0.613711 0.497346 0.718 \n", "39 0.675190 0.496064 0.833 \n", "40 0.704026 0.536884 0.883 \n", "\n", " triviaqa(exact_match,remove_whitespace) truthfulqa_mc1(acc,none) \\\n", "0 0.275245 0.405141 \n", "1 0.407490 0.406365 \n", "2 0.565927 0.352509 \n", "3 0.320609 0.363525 \n", "4 0.325401 0.477356 \n", "5 0.039289 0.510404 \n", "6 0.543803 0.321909 \n", "7 0.420531 0.425949 \n", "8 0.518168 0.365973 \n", "9 0.338665 0.374541 \n", "10 0.527809 0.325581 \n", "11 0.511202 0.363525 \n", "12 0.225033 0.367197 \n", "13 0.471132 0.413709 \n", "14 0.568324 0.421053 \n", "15 0.314813 0.348837 \n", "16 0.330974 0.376989 \n", "17 0.495207 0.299878 \n", "18 0.008136 0.405141 \n", "19 0.029481 0.357405 \n", "20 0.338943 0.326805 \n", "21 0.300992 0.416157 \n", "22 0.218346 0.320685 \n", "23 0.311190 0.348837 \n", "24 0.194048 0.321909 \n", "25 0.608839 0.259486 \n", "26 0.272459 0.280294 \n", "27 0.059240 0.288862 \n", "28 0.282601 0.312118 \n", "29 0.134975 0.294982 
\n", "30 0.007468 0.298654 \n", "31 0.190370 0.302326 \n", "32 0.525078 0.252142 \n", "33 0.500390 0.232558 \n", "34 0.174654 0.287638 \n", "35 0.249944 0.271726 \n", "36 0.189701 0.246022 \n", "37 0.009028 0.293758 \n", "38 0.004291 0.290086 \n", "39 0.019282 0.270502 \n", "40 0.134195 0.271726 \n", "\n", " truthfulqa_mc2(acc,none) winogrande(acc,none) \\\n", "0 0.581183 0.744278 \n", "1 0.589404 0.720600 \n", "2 0.497601 0.763220 \n", "3 0.543140 0.680347 \n", "4 0.648483 0.711918 \n", "5 0.683015 0.754538 \n", "6 0.467572 0.726125 \n", "7 0.600072 0.727703 \n", "8 0.541154 0.738753 \n", "9 0.547934 0.746646 \n", "10 0.486670 0.737964 \n", "11 0.517142 0.716654 \n", "12 0.547575 0.658248 \n", "13 0.591156 0.719811 \n", "14 0.596813 0.740331 \n", "15 0.518821 0.700868 \n", "16 0.534371 0.709550 \n", "17 0.440750 0.720600 \n", "18 0.573437 0.698500 \n", "19 0.559013 0.675612 \n", "20 0.497579 0.670876 \n", "21 0.586055 0.692976 \n", "22 0.483219 0.647987 \n", "23 0.478933 0.701657 \n", "24 0.504460 0.677979 \n", "25 0.368992 0.722178 \n", "26 0.439624 0.711918 \n", "27 0.456319 0.599053 \n", "28 0.465748 0.627466 \n", "29 0.458812 0.608524 \n", "30 0.475035 0.579321 \n", "31 0.453217 0.664562 \n", "32 0.389716 0.689818 \n", "33 0.349214 0.693765 \n", "34 0.402884 0.651144 \n", "35 0.438300 0.601421 \n", "36 0.387463 0.589582 \n", "37 0.451742 0.549329 \n", "38 0.489501 0.525651 \n", "39 0.427742 0.551697 \n", "40 0.418387 0.556433 \n", "\n", " gsm8k(exact_match,strict-match)_rank bbh(exact_match,get-answer)_rank \\\n", "0 3.0 1.0 \n", "1 1.0 29.0 \n", "2 16.0 10.0 \n", "3 4.0 2.0 \n", "4 12.0 27.0 \n", "5 9.0 41.0 \n", "6 25.0 5.0 \n", "7 8.0 38.0 \n", "8 15.0 4.0 \n", "9 18.0 12.0 \n", "10 11.0 7.0 \n", "11 14.0 8.0 \n", "12 5.0 3.0 \n", "13 21.0 17.0 \n", "14 28.0 18.0 \n", "15 13.0 6.0 \n", "16 22.0 21.0 \n", "17 27.0 16.0 \n", "18 23.0 15.0 \n", "19 7.0 14.0 \n", "20 24.0 20.0 \n", "21 41.0 39.0 \n", "22 6.0 9.0 \n", "23 29.0 26.0 \n", "24 26.0 13.0 \n", "25 36.0 25.0 \n", "26 31.0 24.0 \n", "27 10.0 19.0 \n", "28 33.0 37.0 \n", "29 20.0 23.0 \n", "30 2.0 11.0 \n", "31 35.0 33.0 \n", "32 40.0 34.0 \n", "33 38.0 30.0 \n", "34 39.0 22.0 \n", "35 32.0 36.0 \n", "36 34.0 35.0 \n", "37 19.0 32.0 \n", "38 17.0 28.0 \n", "39 30.0 31.0 \n", "40 37.0 40.0 \n", "\n", " arc_challenge(acc_norm,none)_rank anli_r1(acc,none)_rank \\\n", "0 2.0 5.0 \n", "1 4.0 4.0 \n", "2 3.0 9.0 \n", "3 11.0 3.0 \n", "4 12.0 2.0 \n", "5 1.0 1.0 \n", "6 15.0 12.0 \n", "7 7.0 6.0 \n", "8 13.0 18.0 \n", "9 6.0 11.0 \n", "10 11.0 16.0 \n", "11 10.0 17.0 \n", "12 17.0 10.0 \n", "13 9.0 13.0 \n", "14 5.0 20.0 \n", "15 8.0 15.0 \n", "16 17.0 19.0 \n", "17 20.0 21.0 \n", "18 16.0 7.0 \n", "19 14.0 14.0 \n", "20 25.0 22.0 \n", "21 22.0 8.0 \n", "22 18.0 30.0 \n", "23 20.0 26.0 \n", "24 31.0 29.0 \n", "25 21.0 31.0 \n", "26 19.0 25.0 \n", "27 28.0 23.0 \n", "28 23.0 21.0 \n", "29 29.0 28.0 \n", "30 30.0 24.0 \n", "31 27.0 27.0 \n", "32 24.0 33.0 \n", "33 26.0 37.0 \n", "34 21.0 32.0 \n", "35 32.0 38.0 \n", "36 32.0 39.0 \n", "37 34.0 34.0 \n", "38 33.0 36.0 \n", "39 35.0 35.0 \n", "40 36.0 40.0 \n", "\n", " anli_r2(acc,none)_rank anli_r3(acc,none)_rank \\\n", "0 3.0 2.0 \n", "1 2.0 3.0 \n", "2 8.0 11.0 \n", "3 5.0 4.0 \n", "4 4.0 5.0 \n", "5 1.0 1.0 \n", "6 12.0 18.0 \n", "7 6.0 3.0 \n", "8 14.0 17.0 \n", "9 9.0 6.0 \n", "10 11.0 14.0 \n", "11 18.0 16.0 \n", "12 17.0 8.0 \n", "13 10.0 9.0 \n", "14 20.0 16.0 \n", "15 13.0 13.0 \n", "16 19.0 15.0 \n", "17 26.0 27.0 \n", "18 7.0 7.0 \n", "19 16.0 12.0 \n", "20 23.0 21.0 \n", "21 
15.0 10.0 \n", "22 26.0 31.0 \n", "23 22.0 23.0 \n", "24 25.0 29.0 \n", "25 29.0 30.0 \n", "26 21.0 24.0 \n", "27 23.0 25.0 \n", "28 28.0 20.0 \n", "29 27.0 19.0 \n", "30 24.0 22.0 \n", "31 25.0 26.0 \n", "32 31.0 33.0 \n", "33 32.0 32.0 \n", "34 30.0 28.0 \n", "35 37.0 34.0 \n", "36 34.0 36.0 \n", "37 33.0 35.0 \n", "38 36.0 37.0 \n", "39 38.0 39.0 \n", "40 35.0 38.0 \n", "\n", " gpqa_main_zeroshot(acc_norm,none)_rank hellaswag(acc_norm,none)_rank \\\n", "0 9.0 3.0 \n", "1 1.0 12.0 \n", "2 11.0 6.0 \n", "3 5.0 24.0 \n", "4 12.0 5.0 \n", "5 4.0 1.0 \n", "6 20.0 14.0 \n", "7 8.0 11.0 \n", "8 6.0 9.0 \n", "9 18.0 13.0 \n", "10 7.0 10.0 \n", "11 16.0 19.0 \n", "12 13.0 29.0 \n", "13 25.0 5.0 \n", "14 24.0 2.0 \n", "15 23.0 26.0 \n", "16 3.0 16.0 \n", "17 22.0 22.0 \n", "18 15.0 4.0 \n", "19 2.0 20.0 \n", "20 12.0 27.0 \n", "21 14.0 23.0 \n", "22 17.0 31.0 \n", "23 21.0 15.0 \n", "24 26.0 25.0 \n", "25 32.0 8.0 \n", "26 19.0 7.0 \n", "27 10.0 34.0 \n", "28 24.0 30.0 \n", "29 22.0 33.0 \n", "30 23.0 35.0 \n", "31 31.0 21.0 \n", "32 34.0 18.0 \n", "33 33.0 17.0 \n", "34 27.0 28.0 \n", "35 26.0 32.0 \n", "36 30.0 36.0 \n", "37 27.0 39.0 \n", "38 24.0 40.0 \n", "39 28.0 38.0 \n", "40 29.0 37.0 \n", "\n", " piqa(acc_norm,none)_rank winogrande(acc,none)_rank boolq(acc,none)_rank \\\n", "0 19.0 4.0 3.0 \n", "1 14.0 11.0 1.0 \n", "2 3.0 1.0 4.0 \n", "3 21.0 22.0 6.0 \n", "4 10.0 14.0 7.0 \n", "5 4.0 2.0 2.0 \n", "6 6.0 9.0 10.0 \n", "7 5.0 8.0 12.0 \n", "8 6.0 6.0 16.0 \n", "9 9.0 3.0 5.0 \n", "10 2.0 7.0 8.0 \n", "11 18.0 13.0 19.0 \n", "12 27.0 27.0 13.0 \n", "13 12.0 12.0 5.0 \n", "14 1.0 5.0 9.0 \n", "15 22.0 17.0 17.0 \n", "16 17.0 15.0 15.0 \n", "17 11.0 11.0 23.0 \n", "18 7.0 18.0 11.0 \n", "19 25.0 24.0 14.0 \n", "20 26.0 25.0 26.0 \n", "21 19.0 20.0 24.0 \n", "22 29.0 29.0 33.0 \n", "23 11.0 16.0 18.0 \n", "24 20.0 23.0 20.0 \n", "25 8.0 10.0 22.0 \n", "26 15.0 14.0 21.0 \n", "27 33.0 33.0 28.0 \n", "28 24.0 30.0 27.0 \n", "29 32.0 31.0 30.0 \n", "30 35.0 35.0 39.0 \n", "31 23.0 26.0 25.0 \n", "32 16.0 21.0 29.0 \n", "33 13.0 19.0 34.0 \n", "34 28.0 28.0 32.0 \n", "35 30.0 32.0 35.0 \n", "36 31.0 34.0 31.0 \n", "37 37.0 38.0 36.0 \n", "38 38.0 39.0 40.0 \n", "39 36.0 37.0 38.0 \n", "40 34.0 36.0 37.0 \n", "\n", " openbookqa(acc_norm,none)_rank sciq(acc_norm,none)_rank \\\n", "0 1.0 6.0 \n", "1 8.0 1.0 \n", "2 7.0 2.0 \n", "3 20.0 4.0 \n", "4 2.0 12.0 \n", "5 4.0 17.0 \n", "6 9.0 7.0 \n", "7 3.0 8.0 \n", "8 16.0 3.0 \n", "9 14.0 6.0 \n", "10 6.0 5.0 \n", "11 17.0 15.0 \n", "12 23.0 15.0 \n", "13 15.0 20.0 \n", "14 5.0 9.0 \n", "15 6.0 16.0 \n", "16 14.0 14.0 \n", "17 19.0 10.0 \n", "18 7.0 21.0 \n", "19 17.0 10.0 \n", "20 28.0 15.0 \n", "21 19.0 24.0 \n", "22 24.0 17.0 \n", "23 8.0 29.0 \n", "24 21.0 27.0 \n", "25 10.0 13.0 \n", "26 12.0 26.0 \n", "27 27.0 19.0 \n", "28 22.0 11.0 \n", "29 26.0 23.0 \n", "30 30.0 32.0 \n", "31 13.0 31.0 \n", "32 11.0 25.0 \n", "33 15.0 22.0 \n", "34 18.0 18.0 \n", "35 29.0 28.0 \n", "36 25.0 32.0 \n", "37 32.0 33.0 \n", "38 33.0 35.0 \n", "39 31.0 34.0 \n", "40 29.0 30.0 \n", "\n", " qnli(acc,none)_rank mmlu(acc,none)_rank \\\n", "0 8.0 6.0 \n", "1 2.0 2.0 \n", "2 9.0 14.0 \n", "3 7.0 3.0 \n", "4 4.0 4.0 \n", "5 1.0 1.0 \n", "6 28.0 8.0 \n", "7 11.0 5.0 \n", "8 30.0 12.0 \n", "9 6.0 9.0 \n", "10 39.0 15.0 \n", "11 20.0 16.0 \n", "12 3.0 10.0 \n", "13 18.0 20.0 \n", "14 26.0 22.0 \n", "15 16.0 24.0 \n", "16 10.0 18.0 \n", "17 13.0 17.0 \n", "18 19.0 7.0 \n", "19 17.0 11.0 \n", "20 21.0 19.0 \n", "21 5.0 13.0 \n", "22 33.0 23.0 \n", "23 35.0 32.0 \n", "24 25.0 
27.0 \n", "25 38.0 31.0 \n", "26 22.0 28.0 \n", "27 24.0 29.0 \n", "28 15.0 21.0 \n", "29 27.0 25.0 \n", "30 12.0 26.0 \n", "31 14.0 33.0 \n", "32 31.0 37.0 \n", "33 37.0 36.0 \n", "34 32.0 30.0 \n", "35 40.0 34.0 \n", "36 41.0 39.0 \n", "37 29.0 41.0 \n", "38 34.0 40.0 \n", "39 36.0 38.0 \n", "40 23.0 35.0 \n", "\n", " nq_open(exact_match,remove_whitespace)_rank drop(f1,none)_rank \\\n", "0 8.0 8.0 \n", "1 15.0 18.0 \n", "2 5.0 3.0 \n", "3 16.0 13.0 \n", "4 23.0 27.0 \n", "5 19.0 25.0 \n", "6 10.0 1.0 \n", "7 7.0 29.0 \n", "8 4.0 4.0 \n", "9 36.0 9.0 \n", "10 7.0 24.0 \n", "11 6.0 5.0 \n", "12 34.0 16.0 \n", "13 25.0 14.0 \n", "14 9.0 19.0 \n", "15 13.0 20.0 \n", "16 29.0 12.0 \n", "17 3.0 2.0 \n", "18 35.0 31.0 \n", "19 33.0 30.0 \n", "20 12.0 7.0 \n", "21 37.0 21.0 \n", "22 22.0 33.0 \n", "23 18.0 15.0 \n", "24 20.0 26.0 \n", "25 1.0 38.0 \n", "26 14.0 17.0 \n", "27 28.0 35.0 \n", "28 24.0 36.0 \n", "29 30.0 23.0 \n", "30 32.0 40.0 \n", "31 17.0 11.0 \n", "32 2.0 37.0 \n", "33 11.0 34.0 \n", "34 26.0 10.0 \n", "35 21.0 6.0 \n", "36 27.0 22.0 \n", "37 38.0 32.0 \n", "38 39.0 41.0 \n", "39 31.0 28.0 \n", "40 31.0 39.0 \n", "\n", " truthfulqa_mc1(acc,none)_rank truthfulqa_mc2(acc,none)_rank \\\n", "0 8.0 8.0 \n", "1 7.0 6.0 \n", "2 15.0 19.0 \n", "3 13.0 13.0 \n", "4 2.0 2.0 \n", "5 1.0 1.0 \n", "6 19.0 26.0 \n", "7 3.0 3.0 \n", "8 12.0 14.0 \n", "9 10.0 11.0 \n", "10 18.0 22.0 \n", "11 13.0 17.0 \n", "12 11.0 12.0 \n", "13 6.0 5.0 \n", "14 4.0 4.0 \n", "15 16.0 16.0 \n", "16 9.0 15.0 \n", "17 23.0 32.0 \n", "18 8.0 9.0 \n", "19 14.0 10.0 \n", "20 17.0 20.0 \n", "21 5.0 7.0 \n", "22 20.0 23.0 \n", "23 16.0 24.0 \n", "24 19.0 18.0 \n", "25 33.0 40.0 \n", "26 30.0 33.0 \n", "27 28.0 29.0 \n", "28 21.0 27.0 \n", "29 25.0 28.0 \n", "30 24.0 25.0 \n", "31 22.0 30.0 \n", "32 34.0 38.0 \n", "33 36.0 41.0 \n", "34 29.0 37.0 \n", "35 31.0 34.0 \n", "36 35.0 39.0 \n", "37 26.0 31.0 \n", "38 27.0 21.0 \n", "39 32.0 35.0 \n", "40 31.0 36.0 \n", "\n", " triviaqa(exact_match,remove_whitespace)_rank Reasoning & Math Mean Score \\\n", "0 23.0 0.6266 \n", "1 13.0 0.5860 \n", "2 3.0 0.5505 \n", "3 18.0 0.6214 \n", "4 17.0 0.5541 \n", "5 35.0 0.5488 \n", "6 4.0 0.5206 \n", "7 12.0 0.5245 \n", "8 7.0 0.5366 \n", "9 15.0 0.5399 \n", "10 5.0 0.5446 \n", "11 8.0 0.5286 \n", "12 26.0 0.5712 \n", "13 11.0 0.5184 \n", "14 2.0 0.4704 \n", "15 19.0 0.5374 \n", "16 16.0 0.5006 \n", "17 10.0 0.4495 \n", "18 39.0 0.5285 \n", "19 36.0 0.5387 \n", "20 14.0 0.4688 \n", "21 21.0 0.3823 \n", "22 27.0 0.5010 \n", "23 20.0 0.4244 \n", "24 28.0 0.4469 \n", "25 1.0 0.3719 \n", "26 24.0 0.4143 \n", "27 34.0 0.4841 \n", "28 22.0 0.3874 \n", "29 32.0 0.4493 \n", "30 40.0 0.4997 \n", "31 29.0 0.3674 \n", "32 6.0 0.3361 \n", "33 9.0 0.3377 \n", "34 31.0 0.3702 \n", "35 25.0 0.3450 \n", "36 30.0 0.3312 \n", "37 38.0 0.4009 \n", "38 41.0 0.4085 \n", "39 37.0 0.3494 \n", "40 33.0 0.2914 \n", "\n", " Reasoning & Math Avg. 
Rank Commonsense & NLI Mean Score \\\n", "0 1 0.7737 \n", "1 3 0.7807 \n", "2 6 0.7726 \n", "3 2 0.7468 \n", "4 5 0.7730 \n", "5 7 0.7941 \n", "6 16 0.7266 \n", "7 15 0.7564 \n", "8 12 0.7249 \n", "9 9 0.7691 \n", "10 8 0.7328 \n", "11 13 0.7147 \n", "12 4 0.7266 \n", "13 17 0.7284 \n", "14 22 0.7403 \n", "15 11 0.7167 \n", "16 19 0.7374 \n", "17 24 0.7199 \n", "18 14 0.7274 \n", "19 10 0.7094 \n", "20 23 0.6788 \n", "21 32 0.7367 \n", "22 18 0.6587 \n", "23 27 0.7090 \n", "24 26 0.6928 \n", "25 33 0.7157 \n", "26 28 0.7153 \n", "27 21 0.6422 \n", "28 31 0.6803 \n", "29 25 0.6442 \n", "30 20 0.6184 \n", "31 35 0.6978 \n", "32 39 0.6956 \n", "33 38 0.6886 \n", "34 34 0.6711 \n", "35 37 0.6264 \n", "36 40 0.6267 \n", "37 30 0.5703 \n", "38 29 0.5181 \n", "39 36 0.5696 \n", "40 41 0.6039 \n", "\n", " Commonsense & NLI Avg. Rank Knowledge & Reading Mean Score \\\n", "0 3 0.3791 \n", "1 2 0.3926 \n", "2 5 0.4136 \n", "3 8 0.3566 \n", "4 4 0.3810 \n", "5 1 0.3581 \n", "6 15 0.4369 \n", "7 7 0.3963 \n", "8 17 0.4127 \n", "9 6 0.3467 \n", "10 12 0.3683 \n", "11 22 0.3923 \n", "12 16 0.3226 \n", "13 13 0.3719 \n", "14 9 0.4045 \n", "15 19 0.3261 \n", "16 10 0.3339 \n", "17 18 0.4063 \n", "18 14 0.2919 \n", "19 23 0.2834 \n", "20 30 0.3438 \n", "21 11 0.3406 \n", "22 32 0.2827 \n", "23 24 0.3007 \n", "24 27 0.2805 \n", "25 20 0.3374 \n", "26 21 0.2864 \n", "27 34 0.2340 \n", "28 29 0.2903 \n", "29 33 0.2567 \n", "30 37 0.2276 \n", "31 25 0.2656 \n", "32 26 0.3018 \n", "33 28 0.2864 \n", "34 31 0.2581 \n", "35 36 0.2731 \n", "36 35 0.2202 \n", "37 39 0.1954 \n", "38 41 0.1983 \n", "39 40 0.2000 \n", "40 38 0.2218 \n", "\n", " Knowledge & Reading Avg. Rank Mean Score \n", "0 10 0.6038 \n", "1 7 0.5961 \n", "2 2 0.5871 \n", "3 14 0.5859 \n", "4 9 0.5788 \n", "5 13 0.5775 \n", "6 1 0.5676 \n", "7 6 0.5672 \n", "8 3 0.5653 \n", "9 15 0.5621 \n", "10 12 0.5576 \n", "11 8 0.5528 \n", "12 21 0.5510 \n", "13 11 0.5480 \n", "14 5 0.5451 \n", "15 20 0.5368 \n", "16 19 0.5335 \n", "17 4 0.5312 \n", "18 24 0.5271 \n", "19 28 0.5219 \n", "20 16 0.5048 \n", "21 17 0.4939 \n", "22 29 0.4907 \n", "23 23 0.4869 \n", "24 30 0.4830 \n", "25 18 0.4819 \n", "26 26 0.4813 \n", "27 35 0.4644 \n", "28 25 0.4608 \n", "29 34 0.4597 \n", "30 36 0.4596 \n", "31 32 0.4525 \n", "32 22 0.4516 \n", "33 27 0.4451 \n", "34 33 0.4419 \n", "35 31 0.4219 \n", "36 38 0.4013 \n", "37 41 0.3986 \n", "38 40 0.3838 \n", "39 39 0.3816 \n", "40 37 0.3799 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'total_time_raw': '18d 7h 55m', 'gpu_util_time_raw': '14d 23h 41m'}\n" ] } ], "source": [ "\n", "GROUPS = {\n", " \"Reasoning & Math\": [\n", " \"gsm8k(exact_match,strict-match)\", \n", " \"bbh(exact_match,get-answer)\", \n", " \"arc_challenge(acc_norm,none)\", 'anli_r1(acc,none)',\n", " 'anli_r2(acc,none)', 'anli_r3(acc,none)',\n", " \"gpqa_main_zeroshot(acc_norm,none)\",\n", " ],\n", " \"Commonsense & NLI\": [\n", " \"hellaswag(acc_norm,none)\",\n", " \"piqa(acc_norm,none)\", \"winogrande(acc,none)\", \"boolq(acc,none)\",\n", " \"openbookqa(acc_norm,none)\", \"sciq(acc_norm,none)\", \"qnli(acc,none)\",\n", " ],\n", " \"Knowledge & Reading\": [\n", " \"mmlu(acc,none)\", \"nq_open(exact_match,remove_whitespace)\", \"drop(f1,none)\",\n", " \"truthfulqa_mc1(acc,none)\", 'truthfulqa_mc2(acc,none)','triviaqa(exact_match,remove_whitespace)',\n", " ],\n", "}\n", "\n", "\n", "\n", "def add_task_ranks(df, task_cols):\n", " df = df.copy()\n", " for col in task_cols:\n", " if col not in 
df.columns:\n",
"            raise ValueError(f\"Task column '{col}' is missing from the results dataframe\")\n",
"        # rank: 1 = best; NaN scores get ranked at the bottom\n",
"        df[f\"{col}_rank\"] = df[col].rank(ascending=False, method=\"dense\", na_option=\"bottom\")\n",
"    return df\n",
"\n",
"def add_group_ranks(df, groups):\n",
"    df = df.copy()\n",
"    for gname, cols in groups.items():\n",
"        # column names already include the metric suffix, e.g. \"mmlu(acc,none)\"\n",
"        tasks = list(cols)\n",
"        mean_col = f\"{gname}_mean\"\n",
"        rank_col = f\"{gname}_rank\"\n",
"        df[mean_col] = df[tasks].mean(axis=1)\n",
"        df[rank_col] = df[mean_col].rank(ascending=False, method=\"dense\", na_option=\"bottom\").astype(int)\n",
"    return df\n",
"\n",
"\n",
"def add_overall_rank(df, groups):\n",
"    df = df.copy()\n",
"    all_tasks = [c for cols in groups.values() for c in cols]\n",
"\n",
"    # overall mean score across all tasks\n",
"    df[\"overall_mean\"] = df[all_tasks].mean(axis=1, skipna=True)\n",
"\n",
"    # higher = better → rank descending\n",
"    df[\"overall_rank\"] = df[\"overall_mean\"].rank(\n",
"        ascending=False, method=\"dense\", na_option=\"bottom\"\n",
"    ).astype(int)\n",
"    return df\n",
"\n",
"\n",
"all_task_cols = [c for cols in GROUPS.values() for c in cols]\n",
"\n",
"df_task_ranked = add_task_ranks(result_gpu_merged, all_task_cols)\n",
"df_group_ranked = add_group_ranks(df_task_ranked, GROUPS)\n",
"leaderboard = add_overall_rank(df_group_ranked, GROUPS)\n",
"\n",
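"# Optional sanity check (a minimal sketch, not required by the pipeline above):\n",
"# with method=\"dense\", tied mean scores share a rank, so ranks can repeat;\n",
"# sorting by the overall mean and asserting the rank column never decreases\n",
"# confirms that rank order and score order agree.\n",
"_check = leaderboard.sort_values(\"overall_mean\", ascending=False)\n",
"assert _check[\"overall_rank\"].is_monotonic_increasing, \"rank/score order mismatch\"\n",
"\n",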
"\n",
"col = \"overall_rank\"  # the column to place first\n",
"cols = [col] + [c for c in leaderboard.columns if c != col]\n",
"df = leaderboard[cols]\n",
"df = df.sort_values(by=col, ascending=True).reset_index(drop=True)\n",
"\n",
"# Mark the models that were evaluated with 8-bit quantization\n",
"targets = ['Qwen_Qwen3-14B', 'Qwen_Qwen2.5-14B-Instruct']  # model names use hyphens here, not underscores\n",
"mask = df['model_name'].isin(targets)\n",
"df.loc[mask, 'model_name'] = df.loc[mask, 'model_name'] + ' (8bit)'\n",
"\n",
"# display(df)\n",
"\n",
"df_display = df.rename(columns={\n",
"    \"overall_rank\": \"Overall Rank\",\n",
"    \"model_name\": \"Model Name\",\n",
"    \"gpu_util_time\": \"GPU Util Time\",\n",
"    \"total_time\": \"Total Time\",\n",
"    \"parameters\": \"Parameters\",\n",
"    'Reasoning & Math_rank': 'Reasoning & Math Avg. Rank',\n",
"    'Commonsense & NLI_rank': 'Commonsense & NLI Avg. Rank',\n",
"    'Knowledge & Reading_rank': 'Knowledge & Reading Avg. Rank',\n",
"    'overall_mean': 'Mean Score',\n",
"    'Reasoning & Math_mean': 'Reasoning & Math Mean Score',\n",
"    'Commonsense & NLI_mean': 'Commonsense & NLI Mean Score',\n",
"    'Knowledge & Reading_mean': 'Knowledge & Reading Mean Score',\n",
"})\n",
"\n",
"cols_to_round = [\"Mean Score\", \"Reasoning & Math Mean Score\", \"Commonsense & NLI Mean Score\", \"Knowledge & Reading Mean Score\"]\n",
"df_display[cols_to_round] = df_display[cols_to_round].round(4)\n",
"\n",
"display(df_display)\n",
"# save the full master table (raw column names) alongside the display copy\n",
"df.to_csv(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_master.csv\")\n",
"\n",
"\n",
"# Total time calculation\n",
"def format_seconds(secs: int) -> str:\n",
"    days, rem = divmod(int(secs), 86400)  # 86400 sec = 1 day\n",
"    hours, rem = divmod(rem, 3600)  # 3600 sec = 1 hour\n",
"    minutes, _ = divmod(rem, 60)\n",
"    return f\"{days}d {hours}h {minutes}m\"\n",
"\n",
"# Sum the raw per-model times to get grand totals for the whole benchmark run\n",
"totals = {}\n",
"for col in [\"total_time_raw\", \"gpu_util_time_raw\"]:\n",
"    total_secs = df_display[col].sum()\n",
"    totals[col] = format_seconds(total_secs)\n",
"\n",
"print(totals)" ] }, { "cell_type": "code", "execution_count": 48, "id": "b3ce5953-3a36-436a-ba4c-46bedd2b4c56", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "overall\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotal TimeGPU Util TimeMean ScoreOverall Rank
1google_gemma-3-12b-it15h 45m14h 8m0.60381
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.59612
3openchat_openchat-3.6-8b-202405227h 51m6h 59m0.58713
4Qwen_Qwen3-8B15h 31m13h 44m0.58594
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.57885
6Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.57756
701-ai_Yi-1.5-9B11h 43m10h 26m0.56767
8Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.56728
9meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.56539
1001-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.562110
11mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.557611
12meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.552812
13Qwen_Qwen3-4B5h 51m5h 3m0.551013
14NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.548014
15mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.545115
16google_gemma-3-4b-it4h 51m3h 50m0.536816
1701-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.533517
1801-ai_Yi-1.5-6B4h 28m3h 54m0.531218
19Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.527119
20deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.521920
21meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.504821
22Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.493922
23Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.490723
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.486924
25deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.483025
26meta-llama_Llama-2-13b-hf19h 21m17h 38m0.481926
27meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.481327
28deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.464428
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.460829
30Qwen_Qwen3-1.7B4h 25m3h 36m0.459730
31Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.459631
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.452532
33meta-llama_Llama-2-7b-hf5h 42m4h 59m0.451633
34deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.445134
35deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.441935
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.421936
37google_gemma-3-1b-it6h 50m4h 52m0.401337
38deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.398638
39Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.383839
40Qwen_Qwen3-0.6B3h 45m2h 53m0.381640
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.379941
\n", "
" ], "text/plain": [ " Model Name Total Time GPU Util Time \\\n", "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n", "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n", "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", "16 google_gemma-3-4b-it 4h 51m 3h 50m \n", "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", "37 google_gemma-3-1b-it 6h 50m 4h 52m \n", "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", "\n", " Mean Score Overall Rank \n", "1 0.6038 1 \n", "2 0.5961 2 \n", "3 0.5871 3 \n", "4 0.5859 4 \n", "5 0.5788 5 \n", "6 0.5775 6 \n", "7 0.5676 7 \n", "8 0.5672 8 \n", "9 0.5653 9 \n", "10 0.5621 10 \n", "11 0.5576 11 \n", "12 0.5528 12 \n", "13 0.5510 13 \n", "14 0.5480 14 \n", "15 0.5451 15 \n", "16 0.5368 16 \n", "17 0.5335 17 \n", "18 0.5312 18 \n", "19 0.5271 19 \n", "20 0.5219 20 \n", "21 0.5048 21 \n", "22 0.4939 22 \n", "23 0.4907 23 \n", "24 0.4869 24 \n", "25 0.4830 25 \n", "26 0.4819 26 \n", "27 0.4813 27 \n", "28 0.4644 28 \n", "29 0.4608 29 \n", "30 0.4597 30 \n", "31 0.4596 31 \n", "32 0.4525 32 \n", "33 0.4516 33 \n", "34 0.4451 34 \n", "35 0.4419 35 \n", "36 0.4219 36 \n", "37 0.4013 37 \n", "38 0.3986 38 \n", "39 0.3838 39 \n", "40 0.3816 40 \n", "41 0.3799 41 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "| Model Name | Total Time | GPU Util Time | Mean Score | Overall Rank |\n", "|:------------------------------------------|:-------------|:----------------|-------------:|---------------:|\n", "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6038 | 1 |\n", "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.5961 | 2 |\n", "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5871 | 3 |\n", "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.5859 | 4 |\n", "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 
8h 33m | 0.5788 | 5 |\n", "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5775 | 6 |\n", "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5676 | 7 |\n", "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5672 | 8 |\n", "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5653 | 9 |\n", "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5621 | 10 |\n", "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5576 | 11 |\n", "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5528 | 12 |\n", "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.551 | 13 |\n", "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.548 | 14 |\n", "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.5451 | 15 |\n", "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5368 | 16 |\n", "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5335 | 17 |\n", "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.5312 | 18 |\n", "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5271 | 19 |\n", "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5219 | 20 |\n", "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.5048 | 21 |\n", "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.4939 | 22 |\n", "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.4907 | 23 |\n", "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4869 | 24 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.483 | 25 |\n", "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.4819 | 26 |\n", "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4813 | 27 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4644 | 28 |\n", "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.4608 | 29 |\n", "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4597 | 30 |\n", "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4596 | 31 |\n", "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.4525 | 32 |\n", "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.4516 | 33 |\n", "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.4451 | 34 |\n", "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.4419 | 35 |\n", "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.4219 | 36 |\n", "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.4013 | 37 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.3986 | 38 |\n", "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.3838 | 39 |\n", "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3816 | 40 |\n", "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.3799 | 41 |\n", "\n", "\n", "reasoning_and_math\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotal TimeGPU Util TimeReasoning & Math Mean ScoreReasoning & Math Avg. Rank
1google_gemma-3-12b-it15h 45m14h 8m0.62661
2Qwen_Qwen3-8B15h 31m13h 44m0.62142
3Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.58603
4Qwen_Qwen3-4B5h 51m5h 3m0.57124
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.55415
6openchat_openchat-3.6-8b-202405227h 51m6h 59m0.55056
7Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.54887
8mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.54468
901-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.53999
10deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.538710
11google_gemma-3-4b-it4h 51m3h 50m0.537411
12meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.536612
13meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.528613
14Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.528514
15Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.524515
1601-ai_Yi-1.5-9B11h 43m10h 26m0.520616
17NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.518417
18Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.501018
1901-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.500619
20Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.499720
21deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.484121
22mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.470422
23meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.468823
2401-ai_Yi-1.5-6B4h 28m3h 54m0.449524
25Qwen_Qwen3-1.7B4h 25m3h 36m0.449325
26deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.446926
27deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.424427
28meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.414328
29Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.408529
30deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.400930
31Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.387431
32Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.382332
33meta-llama_Llama-2-13b-hf19h 21m17h 38m0.371933
34deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.370234
35meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.367435
36Qwen_Qwen3-0.6B3h 45m2h 53m0.349436
37meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.345037
38deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.337738
39meta-llama_Llama-2-7b-hf5h 42m4h 59m0.336139
40google_gemma-3-1b-it6h 50m4h 52m0.331240
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.291441
\n", "
" ], "text/plain": [ " Model Name Total Time GPU Util Time \\\n", "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", "2 Qwen_Qwen3-8B 15h 31m 13h 44m \n", "3 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", "4 Qwen_Qwen3-4B 5h 51m 5h 3m \n", "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", "6 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", "7 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", "8 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", "9 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", "10 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", "11 google_gemma-3-4b-it 4h 51m 3h 50m \n", "12 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", "13 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", "15 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", "16 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", "17 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", "18 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", "20 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", "21 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", "22 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", "23 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", "24 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", "25 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", "26 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", "27 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", "28 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", "29 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", "30 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", "31 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", "32 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", "33 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", "34 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", "35 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", "36 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", "37 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", "38 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", "39 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", "40 google_gemma-3-1b-it 6h 50m 4h 52m \n", "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", "\n", " Reasoning & Math Mean Score Reasoning & Math Avg. Rank \n", "1 0.6266 1 \n", "2 0.6214 2 \n", "3 0.5860 3 \n", "4 0.5712 4 \n", "5 0.5541 5 \n", "6 0.5505 6 \n", "7 0.5488 7 \n", "8 0.5446 8 \n", "9 0.5399 9 \n", "10 0.5387 10 \n", "11 0.5374 11 \n", "12 0.5366 12 \n", "13 0.5286 13 \n", "14 0.5285 14 \n", "15 0.5245 15 \n", "16 0.5206 16 \n", "17 0.5184 17 \n", "18 0.5010 18 \n", "19 0.5006 19 \n", "20 0.4997 20 \n", "21 0.4841 21 \n", "22 0.4704 22 \n", "23 0.4688 23 \n", "24 0.4495 24 \n", "25 0.4493 25 \n", "26 0.4469 26 \n", "27 0.4244 27 \n", "28 0.4143 28 \n", "29 0.4085 29 \n", "30 0.4009 30 \n", "31 0.3874 31 \n", "32 0.3823 32 \n", "33 0.3719 33 \n", "34 0.3702 34 \n", "35 0.3674 35 \n", "36 0.3494 36 \n", "37 0.3450 37 \n", "38 0.3377 38 \n", "39 0.3361 39 \n", "40 0.3312 40 \n", "41 0.2914 41 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "| Model Name | Total Time | GPU Util Time | Reasoning & Math Mean Score | Reasoning & Math Avg. 
Rank |\n", "|:------------------------------------------|:-------------|:----------------|------------------------------:|-----------------------------:|\n", "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6266 | 1 |\n", "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.6214 | 2 |\n", "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.586 | 3 |\n", "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.5712 | 4 |\n", "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5541 | 5 |\n", "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5505 | 6 |\n", "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5488 | 7 |\n", "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5446 | 8 |\n", "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5399 | 9 |\n", "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5387 | 10 |\n", "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5374 | 11 |\n", "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5366 | 12 |\n", "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5286 | 13 |\n", "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5285 | 14 |\n", "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5245 | 15 |\n", "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5206 | 16 |\n", "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.5184 | 17 |\n", "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.501 | 18 |\n", "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5006 | 19 |\n", "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4997 | 20 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4841 | 21 |\n", "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4704 | 22 |\n", "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.4688 | 23 |\n", "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4495 | 24 |\n", "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4493 | 25 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.4469 | 26 |\n", "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4244 | 27 |\n", "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4143 | 28 |\n", "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.4085 | 29 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.4009 | 30 |\n", "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.3874 | 31 |\n", "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3823 | 32 |\n", "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3719 | 33 |\n", "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.3702 | 34 |\n", "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.3674 | 35 |\n", "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3494 | 36 |\n", "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.345 | 37 |\n", "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.3377 | 38 |\n", "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3361 | 39 |\n", "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.3312 | 40 |\n", "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2914 | 41 |\n", "\n", "\n", "commonsense_and_nli\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotal TimeGPU Util TimeCommonsense & NLI Mean ScoreCommonsense & NLI Avg. Rank
1Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.79411
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.78072
3google_gemma-3-12b-it15h 45m14h 8m0.77373
4Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.77304
5openchat_openchat-3.6-8b-202405227h 51m6h 59m0.77265
601-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.76916
7Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.75647
8Qwen_Qwen3-8B15h 31m13h 44m0.74688
9mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.74039
1001-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.737410
11Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.736711
12mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.732812
13NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.728413
14Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.727414
1501-ai_Yi-1.5-9B11h 43m10h 26m0.726615
16Qwen_Qwen3-4B5h 51m5h 3m0.726616
17meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.724917
1801-ai_Yi-1.5-6B4h 28m3h 54m0.719918
19google_gemma-3-4b-it4h 51m3h 50m0.716719
20meta-llama_Llama-2-13b-hf19h 21m17h 38m0.715720
21meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.715321
22meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.714722
23deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.709423
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.709024
25meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.697825
26meta-llama_Llama-2-7b-hf5h 42m4h 59m0.695626
27deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.692827
28deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.688628
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.680329
30meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.678830
31deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.671131
32Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.658732
33Qwen_Qwen3-1.7B4h 25m3h 36m0.644233
34deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.642234
35google_gemma-3-1b-it6h 50m4h 52m0.626735
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.626436
37Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.618437
38Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.603938
39deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.570339
40Qwen_Qwen3-0.6B3h 45m2h 53m0.569640
41Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.518141
\n", "
" ], "text/plain": [ " Model Name Total Time GPU Util Time \\\n", "1 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", "3 google_gemma-3-12b-it 15h 45m 14h 8m \n", "4 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", "5 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", "6 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", "7 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", "8 Qwen_Qwen3-8B 15h 31m 13h 44m \n", "9 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", "10 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", "11 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", "13 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", "15 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", "16 Qwen_Qwen3-4B 5h 51m 5h 3m \n", "17 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", "19 google_gemma-3-4b-it 4h 51m 3h 50m \n", "20 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", "21 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", "22 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", "23 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", "25 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", "26 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", "27 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", "28 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", "30 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", "31 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", "32 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", "33 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", "34 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", "35 google_gemma-3-1b-it 6h 50m 4h 52m \n", "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", "37 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", "38 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", "39 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", "41 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", "\n", " Commonsense & NLI Mean Score Commonsense & NLI Avg. Rank \n", "1 0.7941 1 \n", "2 0.7807 2 \n", "3 0.7737 3 \n", "4 0.7730 4 \n", "5 0.7726 5 \n", "6 0.7691 6 \n", "7 0.7564 7 \n", "8 0.7468 8 \n", "9 0.7403 9 \n", "10 0.7374 10 \n", "11 0.7367 11 \n", "12 0.7328 12 \n", "13 0.7284 13 \n", "14 0.7274 14 \n", "15 0.7266 15 \n", "16 0.7266 16 \n", "17 0.7249 17 \n", "18 0.7199 18 \n", "19 0.7167 19 \n", "20 0.7157 20 \n", "21 0.7153 21 \n", "22 0.7147 22 \n", "23 0.7094 23 \n", "24 0.7090 24 \n", "25 0.6978 25 \n", "26 0.6956 26 \n", "27 0.6928 27 \n", "28 0.6886 28 \n", "29 0.6803 29 \n", "30 0.6788 30 \n", "31 0.6711 31 \n", "32 0.6587 32 \n", "33 0.6442 33 \n", "34 0.6422 34 \n", "35 0.6267 35 \n", "36 0.6264 36 \n", "37 0.6184 37 \n", "38 0.6039 38 \n", "39 0.5703 39 \n", "40 0.5696 40 \n", "41 0.5181 41 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "| Model Name | Total Time | GPU Util Time | Commonsense & NLI Mean Score | Commonsense & NLI Avg. 
Rank |\n", "|:------------------------------------------|:-------------|:----------------|-------------------------------:|------------------------------:|\n", "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.7941 | 1 |\n", "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.7807 | 2 |\n", "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.7737 | 3 |\n", "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.773 | 4 |\n", "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.7726 | 5 |\n", "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.7691 | 6 |\n", "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.7564 | 7 |\n", "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.7468 | 8 |\n", "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.7403 | 9 |\n", "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.7374 | 10 |\n", "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.7367 | 11 |\n", "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.7328 | 12 |\n", "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.7284 | 13 |\n", "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.7274 | 14 |\n", "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.7266 | 15 |\n", "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.7266 | 16 |\n", "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.7249 | 17 |\n", "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.7199 | 18 |\n", "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.7167 | 19 |\n", "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.7157 | 20 |\n", "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.7153 | 21 |\n", "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.7147 | 22 |\n", "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.7094 | 23 |\n", "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.709 | 24 |\n", "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.6978 | 25 |\n", "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.6956 | 26 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.6928 | 27 |\n", "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.6886 | 28 |\n", "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.6803 | 29 |\n", "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.6788 | 30 |\n", "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.6711 | 31 |\n", "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.6587 | 32 |\n", "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.6442 | 33 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.6422 | 34 |\n", "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.6267 | 35 |\n", "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.6264 | 36 |\n", "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.6184 | 37 |\n", "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.6039 | 38 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.5703 | 39 |\n", "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.5696 | 40 |\n", "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.5181 | 41 |\n", "\n", "\n", "knowledge_and_reading\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotal TimeGPU Util TimeKnowledge & Reading Mean ScoreKnowledge & Reading Avg. Rank
101-ai_Yi-1.5-9B11h 43m10h 26m0.43691
2openchat_openchat-3.6-8b-202405227h 51m6h 59m0.41362
3meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.41273
401-ai_Yi-1.5-6B4h 28m3h 54m0.40634
5mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.40455
6Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.39636
7Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.39267
8meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.39238
9Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.38109
10google_gemma-3-12b-it15h 45m14h 8m0.379110
11NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.371911
12mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.368312
13Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.358113
14Qwen_Qwen3-8B15h 31m13h 44m0.356614
1501-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.346715
16meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.343816
17Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.340617
18meta-llama_Llama-2-13b-hf19h 21m17h 38m0.337418
1901-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.333919
20google_gemma-3-4b-it4h 51m3h 50m0.326120
21Qwen_Qwen3-4B5h 51m5h 3m0.322621
22meta-llama_Llama-2-7b-hf5h 42m4h 59m0.301822
23deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.300723
24Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.291924
25Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.290325
26meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.286426
27deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.286427
28deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.283428
29Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.282729
30deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.280530
31meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.273131
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.265632
33deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.258133
34Qwen_Qwen3-1.7B4h 25m3h 36m0.256734
35deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.234035
36Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.227636
37Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.221837
38google_gemma-3-1b-it6h 50m4h 52m0.220238
39Qwen_Qwen3-0.6B3h 45m2h 53m0.200039
40Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.198340
41deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.195441
\n", "
" ], "text/plain": [ " Model Name Total Time GPU Util Time \\\n", "1 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", "2 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", "3 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", "4 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", "5 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", "6 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", "7 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", "8 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", "9 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", "10 google_gemma-3-12b-it 15h 45m 14h 8m \n", "11 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", "13 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", "14 Qwen_Qwen3-8B 15h 31m 13h 44m \n", "15 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", "16 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", "17 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", "18 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", "20 google_gemma-3-4b-it 4h 51m 3h 50m \n", "21 Qwen_Qwen3-4B 5h 51m 5h 3m \n", "22 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", "23 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", "24 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", "25 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", "26 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", "27 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", "28 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", "29 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", "30 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", "31 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", "33 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", "34 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", "35 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", "36 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", "37 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", "38 google_gemma-3-1b-it 6h 50m 4h 52m \n", "39 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", "40 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", "41 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", "\n", " Knowledge & Reading Mean Score Knowledge & Reading Avg. Rank \n", "1 0.4369 1 \n", "2 0.4136 2 \n", "3 0.4127 3 \n", "4 0.4063 4 \n", "5 0.4045 5 \n", "6 0.3963 6 \n", "7 0.3926 7 \n", "8 0.3923 8 \n", "9 0.3810 9 \n", "10 0.3791 10 \n", "11 0.3719 11 \n", "12 0.3683 12 \n", "13 0.3581 13 \n", "14 0.3566 14 \n", "15 0.3467 15 \n", "16 0.3438 16 \n", "17 0.3406 17 \n", "18 0.3374 18 \n", "19 0.3339 19 \n", "20 0.3261 20 \n", "21 0.3226 21 \n", "22 0.3018 22 \n", "23 0.3007 23 \n", "24 0.2919 24 \n", "25 0.2903 25 \n", "26 0.2864 26 \n", "27 0.2864 27 \n", "28 0.2834 28 \n", "29 0.2827 29 \n", "30 0.2805 30 \n", "31 0.2731 31 \n", "32 0.2656 32 \n", "33 0.2581 33 \n", "34 0.2567 34 \n", "35 0.2340 35 \n", "36 0.2276 36 \n", "37 0.2218 37 \n", "38 0.2202 38 \n", "39 0.2000 39 \n", "40 0.1983 40 \n", "41 0.1954 41 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "| Model Name | Total Time | GPU Util Time | Knowledge & Reading Mean Score | Knowledge & Reading Avg. 
Rank |\n", "|:------------------------------------------|:-------------|:----------------|---------------------------------:|--------------------------------:|\n", "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.4369 | 1 |\n", "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.4136 | 2 |\n", "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.4127 | 3 |\n", "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4063 | 4 |\n", "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4045 | 5 |\n", "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.3963 | 6 |\n", "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.3926 | 7 |\n", "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.3923 | 8 |\n", "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.381 | 9 |\n", "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.3791 | 10 |\n", "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.3719 | 11 |\n", "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.3683 | 12 |\n", "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.3581 | 13 |\n", "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.3566 | 14 |\n", "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.3467 | 15 |\n", "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.3438 | 16 |\n", "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3406 | 17 |\n", "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3374 | 18 |\n", "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.3339 | 19 |\n", "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.3261 | 20 |\n", "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.3226 | 21 |\n", "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3018 | 22 |\n", "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.3007 | 23 |\n", "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.2919 | 24 |\n", "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.2903 | 25 |\n", "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.2864 | 26 |\n", "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.2864 | 27 |\n", "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.2834 | 28 |\n", "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.2827 | 29 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.2805 | 30 |\n", "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.2731 | 31 |\n", "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.2656 | 32 |\n", "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.2581 | 33 |\n", "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.2567 | 34 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.234 | 35 |\n", "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.2276 | 36 |\n", "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2218 | 37 |\n", "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.2202 | 38 |\n", "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.2 | 39 |\n", "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.1983 | 40 |\n", "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.1954 | 41 |\n", "\n", "\n" ] } ], "source": [ "column_map = {\n", " \"overall\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Mean Score', \"Overall Rank\"],\n", " \"reasoning_and_math\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Reasoning & Math Mean Score', \"Reasoning & Math Avg. Rank\"],\n", " \"commonsense_and_nli\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Commonsense & NLI Mean Score', \"Commonsense & NLI Avg. 
Rank\"],\n", " \"knowledge_and_reading\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Knowledge & Reading Mean Score', \"Knowledge & Reading Avg. Rank\"]\n", "}\n", "\n", "\n", "\n", "# Produce sub-dataframes and export them to csv and excel file.\n", "with pd.ExcelWriter(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_all_results.xlsx\") as writer:\n", " df_display.to_excel(writer, sheet_name=\"Master\", index=False)\n", " \n", " for name, cols in column_map.items():\n", " sub_df = df_display[cols].copy()\n", " rank_col = [c for c in sub_df.columns if 'Rank' in c][0]\n", " sub_df = sub_df.sort_values(by=rank_col, ascending=True).reset_index(drop=True)\n", " sub_df.index = sub_df.index + 1\n", " print(name)\n", " if name == 'overall':\n", " overall_df = sub_df\n", " display(sub_df)\n", " \n", " # sub_df.to_csv(f\"/mnt/data8tb/Documents/project/benchmark_project/{name}_rank.csv\")\n", " # sub_df.to_excel(writer, sheet_name=name, index=False)\n", "\n", " table_md = sub_df.to_markdown(index=False)\n", " print(table_md)\n", "\n", " sub_df.to_html(f\"{name}.html\", index=False)\n", " print()\n", " print()\n", "\n" ] }, { "cell_type": "code", "execution_count": 47, "id": "5642b72a-e416-482b-b45b-8376fd2571b7", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Model NameTotal TimeGPU Util TimeMean ScoreOverall Rank
1google_gemma-3-12b-it15h 45m14h 8m0.60381
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.59612
3openchat_openchat-3.6-8b-202405227h 51m6h 59m0.58713
4Qwen_Qwen3-8B15h 31m13h 44m0.58594
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.57885
6Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.57756
701-ai_Yi-1.5-9B11h 43m10h 26m0.56767
8Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.56728
9meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.56539
1001-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.562110
11mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.557611
12meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.552812
13Qwen_Qwen3-4B5h 51m5h 3m0.551013
14NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.548014
15mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.545115
16google_gemma-3-4b-it4h 51m3h 50m0.536816
1701-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.533517
1801-ai_Yi-1.5-6B4h 28m3h 54m0.531218
19Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.527119
20deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.521920
21meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.504821
22Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.493922
23Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.490723
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.486924
25deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.483025
26meta-llama_Llama-2-13b-hf19h 21m17h 38m0.481926
27meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.481327
28deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.464428
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.460829
30Qwen_Qwen3-1.7B4h 25m3h 36m0.459730
31Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.459631
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.452532
33meta-llama_Llama-2-7b-hf5h 42m4h 59m0.451633
34deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.445134
35deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.441935
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.421936
37google_gemma-3-1b-it6h 50m4h 52m0.401337
38deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.398638
39Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.383839
40Qwen_Qwen3-0.6B3h 45m2h 53m0.381640
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.379941
\n", "
" ], "text/plain": [ " Model Name Total Time GPU Util Time \\\n", "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n", "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n", "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", "16 google_gemma-3-4b-it 4h 51m 3h 50m \n", "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", "37 google_gemma-3-1b-it 6h 50m 4h 52m \n", "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", "\n", " Mean Score Overall Rank \n", "1 0.6038 1 \n", "2 0.5961 2 \n", "3 0.5871 3 \n", "4 0.5859 4 \n", "5 0.5788 5 \n", "6 0.5775 6 \n", "7 0.5676 7 \n", "8 0.5672 8 \n", "9 0.5653 9 \n", "10 0.5621 10 \n", "11 0.5576 11 \n", "12 0.5528 12 \n", "13 0.5510 13 \n", "14 0.5480 14 \n", "15 0.5451 15 \n", "16 0.5368 16 \n", "17 0.5335 17 \n", "18 0.5312 18 \n", "19 0.5271 19 \n", "20 0.5219 20 \n", "21 0.5048 21 \n", "22 0.4939 22 \n", "23 0.4907 23 \n", "24 0.4869 24 \n", "25 0.4830 25 \n", "26 0.4819 26 \n", "27 0.4813 27 \n", "28 0.4644 28 \n", "29 0.4608 29 \n", "30 0.4597 30 \n", "31 0.4596 31 \n", "32 0.4525 32 \n", "33 0.4516 33 \n", "34 0.4451 34 \n", "35 0.4419 35 \n", "36 0.4219 36 \n", "37 0.4013 37 \n", "38 0.3986 38 \n", "39 0.3838 39 \n", "40 0.3816 40 \n", "41 0.3799 41 " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(overall_df)\n", "overall_df.to_html(\"overall.html\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "1a04411e-c749-428f-89bd-2c23ac74af71", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7368bca2-dd44-4393-be0e-320f737af82b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": 
"ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }