diff --git "a/parse.ipynb" "b/parse.ipynb" new file mode 100644--- /dev/null +++ "b/parse.ipynb" @@ -0,0 +1,6638 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 104, + "id": "73fc3ddb-9d22-4b9b-960a-f78b5111c898", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "15c618d3-e5a2-4ae8-8e2e-df916cc7d465", + "metadata": {}, + "outputs": [], + "source": [ + "import json, pathlib, pandas as pd\n", + "from pprint import pprint\n", + "import os\n", + "from pathlib import Path\n", + "from collections import Counter\n", + "from io import StringIO\n", + "import numpy as np\n", + "\n", + "\n", + "pd.set_option(\"display.max_rows\", None) # show ALL rows\n", + "pd.set_option(\"display.max_columns\", None) # show ALL columns\n", + "\n", + "\n", + "# Time Conversion function\n", + "def format_time(seconds: float) -> str:\n", + " seconds = int(seconds)\n", + " hours, remainder = divmod(seconds, 3600)\n", + " minutes = remainder // 60 # drop leftover seconds\n", + "\n", + " parts = []\n", + " if hours > 0:\n", + " parts.append(f\"{hours}h\")\n", + " if minutes > 0 or not parts: # if no hours and no minutes, show 0m\n", + " parts.append(f\"{minutes}m\")\n", + "\n", + " return \" \".join(parts)\n", + "\n", + "\n", + "def list_json_files(directory: str):\n", + " \"\"\"\n", + " Reads all .json files in a given directory and returns \n", + " their full paths as a list.\n", + " \"\"\"\n", + " json_files = []\n", + " for file in os.listdir(directory):\n", + " if file.endswith(\".json\"):\n", + " full_path = os.path.join(directory, file)\n", + " json_files.append(full_path)\n", + " return json_files\n", + "\n", + "\n", + "def format_params(n: int) -> str:\n", + " \"\"\"\n", + " Convert raw parameter count (int) into human-friendly string.\n", + " Examples:\n", + " 6851947264 -> \"7B\"\n", + " 12500000000 -> \"12.5B\"\n", + " 560000000 -> \"560M\"\n", + " \"\"\"\n", + " if n >= 1_000_000_000: # billions\n", + " val = n / 1_000_000_000\n", + " if val.is_integer():\n", + " return f\"{int(val)}B\"\n", + " else:\n", + " return f\"{val:.1f}B\"\n", + " elif n >= 1_000_000: # millions\n", + " val = n / 1_000_000\n", + " if val.is_integer():\n", + " return f\"{int(val)}M\"\n", + " else:\n", + " return f\"{val:.1f}M\"\n", + " elif n >= 1_000: # thousands (rare for params, but included)\n", + " val = n / 1_000\n", + " if val.is_integer():\n", + " return f\"{int(val)}K\"\n", + " else:\n", + " return f\"{val:.1f}K\"\n", + " else:\n", + " return str(n)\n", + "\n", + "\n", + "metric_map = {\n", + " \"mmlu\":\"acc,none\" ,\n", + " \"hellaswag\": \"acc_norm,none\",\n", + " \"arc_challenge\": \"acc_norm,none\", # prefer normalized accuracy\n", + " \"bbh\": \"exact_match,get-answer\",\n", + " \"gsm8k\":\"exact_match,strict-match\" ,\n", + " \"gpqa_main_zeroshot\":\"acc_norm,none\",\n", + " \"anli_r1\": \"acc,none\",\n", + " \"anli_r2\": \"acc,none\",\n", + " \"anli_r3\": \"acc,none\",\n", + " \"piqa\":\"acc_norm,none\" ,\n", + " \"winogrande\": \"acc,none\",\n", + " \"boolq\": \"acc,none\",\n", + " \"truthfulqa_mc1\":\"acc,none\" ,\n", + " \"truthfulqa_mc2\":\"acc,none\" ,\n", + " \"drop\": \"f1,none\",\n", + " \"nq_open\":\"exact_match,remove_whitespace\" ,\n", + " \"openbookqa\":\"acc_norm,none\" ,\n", + " \"sciq\": \"acc_norm,none\",\n", + " \"triviaqa\":\"exact_match,remove_whitespace\" ,\n", + " \"qnli\":\"acc,none\" ,\n", + "}\n", + "\n", + "# Tasks from most important to least important\n", + "# tasks = [mmlu, hellaswag, arc_challenge, bbh, 
+    "metric_map = {\n",
+    "    \"mmlu\": \"acc,none\",\n",
+    "    \"hellaswag\": \"acc_norm,none\",\n",
+    "    \"arc_challenge\": \"acc_norm,none\",  # prefer normalized accuracy\n",
+    "    \"bbh\": \"exact_match,get-answer\",\n",
+    "    \"gsm8k\": \"exact_match,strict-match\",\n",
+    "    \"gpqa_main_zeroshot\": \"acc_norm,none\",\n",
+    "    \"anli_r1\": \"acc,none\",\n",
+    "    \"anli_r2\": \"acc,none\",\n",
+    "    \"anli_r3\": \"acc,none\",\n",
+    "    \"piqa\": \"acc_norm,none\",\n",
+    "    \"winogrande\": \"acc,none\",\n",
+    "    \"boolq\": \"acc,none\",\n",
+    "    \"truthfulqa_mc1\": \"acc,none\",\n",
+    "    \"truthfulqa_mc2\": \"acc,none\",\n",
+    "    \"drop\": \"f1,none\",\n",
+    "    \"nq_open\": \"exact_match,remove_whitespace\",\n",
+    "    \"openbookqa\": \"acc_norm,none\",\n",
+    "    \"sciq\": \"acc_norm,none\",\n",
+    "    \"triviaqa\": \"exact_match,remove_whitespace\",\n",
+    "    \"qnli\": \"acc,none\",\n",
+    "}\n",
+    "\n",
+    "# Tasks from most important to least important\n",
+    "# tasks = [mmlu, hellaswag, arc_challenge, bbh, gsm8k, gpqa_main_zeroshot, ANLI (r1, r2, r3), piqa, winogrande, boolq, TruthfulQA (mc1, mc2), drop, nq_open, openbookqa, sciq, triviaqa, qnli]\n",
+    "\n",
+    "# Collect result-file paths\n",
+    "directory = \"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/results\"\n",
+    "all_json_paths = list_json_files(directory)\n",
+    "\n",
+    "\n",
+    "def parse_results(json_path: str, metric_map: dict) -> pd.DataFrame:\n",
+    "\n",
+    "    with open(json_path, 'r') as f:\n",
+    "        data = json.load(f)\n",
+    "\n",
+    "    # Extract core info\n",
+    "    model_name = data.get(\"model_name\")\n",
+    "    model_name = model_name.split(\"/\")[-1]\n",
+    "    total_time_raw = float(data.get(\"total_evaluation_time_seconds\", 0))\n",
+    "    total_time = format_time(total_time_raw)\n",
+    "    batch_size = data[\"config\"].get(\"batch_size\")\n",
+    "    batch_sizes = data[\"config\"].get(\"batch_sizes\")\n",
+    "    parameters = format_params(data[\"config\"].get(\"model_num_parameters\"))\n",
+    "    parameters_raw = data[\"config\"].get(\"model_num_parameters\")\n",
+    "\n",
+    "    rows = []\n",
+    "    for task, metric_key in metric_map.items():\n",
+    "        # Fail fast if a task is missing from the results\n",
+    "        if task not in data[\"results\"]:\n",
+    "            raise ValueError(f\"'{task}' not in results!\")\n",
+    "\n",
+    "        metrics = data[\"results\"][task]\n",
+    "\n",
+    "        # If the metric_key isn't in this task's results, raise an error\n",
+    "        if metric_key not in metrics:\n",
+    "            raise ValueError(\n",
+    "                f\"Expected metric '{metric_key}' not found for task '{task}'. \"\n",
+    "                f\"Available keys: {list(metrics.keys())}\"\n",
+    "            )\n",
+    "\n",
+    "        acc = metrics[metric_key]\n",
+    "\n",
+    "        row = {\n",
+    "            \"model_name\": model_name,\n",
+    "            # \"task\": task,\n",
+    "            \"task\": task + \"(\" + metric_key + \")\",\n",
+    "            \"score\": acc,\n",
+    "            \"total_time\": total_time,\n",
+    "            \"total_time_raw\": total_time_raw,\n",
+    "            \"batch_size\": batch_size,\n",
+    "            \"batch_sizes\": batch_sizes,\n",
+    "            \"parameters\": parameters,\n",
+    "            \"parameters_raw\": parameters_raw,\n",
+    "        }\n",
+    "        rows.append(row)\n",
+    "\n",
+    "    # Convert to a tidy (long-format) dataframe\n",
+    "    return pd.DataFrame(rows)\n",
+    "\n",
+    "\n",
+    "dfs = [parse_results(path, metric_map) for path in all_json_paths]\n",
+    "master_df = pd.concat(dfs, ignore_index=True)\n",
+    "\n",
+    "# display(master_df)\n",
+    "\n",
+    "\n",
+    "# Wide format: one row per model, columns = tasks.\n",
+    "# Check for duplicate rows first: pivot_table would silently average them.\n",
+    "key_cols = [\"task\", \"score\", \"model_name\"]\n",
+    "dups_mask = master_df.duplicated(key_cols, keep=False)\n",
+    "# dups = master_df.loc[dups_mask]\n",
+    "# display(dups)\n",
+    "\n",
+    "if dups_mask.any():\n",
+    "    dups = master_df.loc[dups_mask, key_cols]\n",
+    "    raise ValueError(f\"Duplicate rows found for keys:\\n{dups}\")\n",
+    "\n",
+    "wide_df = master_df.pivot_table(\n",
+    "    index=[\"model_name\", \"parameters\"],\n",
+    "    columns=[\"task\"],\n",
+    "    values=\"score\",\n",
+    "    aggfunc=\"mean\"\n",
+    ").reset_index()\n",
+    "\n",
+    "# Select the metadata columns to carry over from the long df\n",
+    "meta_cols = [\n",
+    "    \"model_name\",\n",
+    "    \"parameters\",\n",
+    "    \"parameters_raw\",\n",
+    "    \"total_time\",\n",
+    "    \"total_time_raw\",\n",
+    "    \"batch_size\",\n",
+    "    \"batch_sizes\",\n",
+    "]\n",
+    "\n",
+    "# Drop duplicate rows by model_name + parameters\n",
+    "df_meta = master_df[meta_cols].drop_duplicates(subset=[\"model_name\", \"parameters\"])\n",
+    "\n",
+    "# Merge the metadata back into the wide dataframe\n",
+    "df_wide_merged = df_meta.merge(wide_df, on=[\"model_name\", \"parameters\"], how=\"left\")\n",
+    "\n",
+    "# display(df_wide_merged.drop(columns=[\"parameters_raw\", \"total_time_raw\", \"batch_sizes\"]))\n",
+    "\n"
+   ]
+  },
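The cell above goes long to wide: `parse_results` yields one row per (model, task), and `pivot_table` turns that into one row per model with a column per task, after a guard that rejects duplicated keys (which `aggfunc="mean"` would otherwise silently average). A minimal, self-contained sketch of that step on synthetic data (model names and scores here are invented, not taken from the results directory):

```python
import pandas as pd

# Synthetic long-format rows shaped like parse_results' output
long_df = pd.DataFrame({
    "model_name": ["m1", "m1", "m2", "m2"],
    "parameters": ["7B", "7B", "1.5B", "1.5B"],
    "task": ["mmlu(acc,none)", "gsm8k(exact_match,strict-match)"] * 2,
    "score": [0.71, 0.88, 0.60, 0.32],
})

# Refuse duplicated (task, score, model) keys before pivoting
dups_mask = long_df.duplicated(["task", "score", "model_name"], keep=False)
assert not dups_mask.any(), "duplicate rows would be averaged by the pivot"

# One row per model, one column per task
wide = long_df.pivot_table(
    index=["model_name", "parameters"],
    columns=["task"],
    values="score",
    aggfunc="mean",
).reset_index()
print(wide)
```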
on=[\"model_name\", \"parameters\"], how=\"left\")\n", + "\n", + "\n", + "# display(df_wide_merged.drop(columns=[\"parameters_raw\", \"total_time_raw\", \"batch_sizes\"]))\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "324364b8-b59a-4450-8723-0c4057488513", + "metadata": {}, + "outputs": [], + "source": [ + "gpu_dir = Path(\"/mnt/data8tb/Documents/lm-evaluation-harness/results/bench_project_1/gpu_usage\")\n", + "gpu_files = list(gpu_dir.glob(\"*_gpu_usage.csv\"))\n", + "\n", + "def model_from_filename(p: Path) -> str:\n", + " return p.stem.replace(\"_gpu_usage\", \"\").strip()\n", + "\n", + "model_names_gpu = [model_from_filename(x) for x in gpu_files]\n", + "\n", + "# Check if match with result\n", + "set_gpu = set(model_names_gpu)\n", + "set_results = set(wide_df['model_name'])\n", + "extra_in_gpu = set_gpu - set_results\n", + "missing_in_gpu = set_results - set_gpu\n", + "# print(\"Extra models in GPU logs:\", extra_in_gpu)\n", + "# print(\"Models in results with no GPU log:\", missing_in_gpu)\n", + "\n", + "\n", + "# Check for Dulicates\n", + "def print_duplicates(name_list, label=\"\"):\n", + " counts = Counter(name_list)\n", + " dups = [name for name, cnt in counts.items() if cnt > 1]\n", + " if dups:\n", + " print(f\"Duplicates in {label}:\")\n", + " for name in dups:\n", + " print(f\" {name} (count = {counts[name]})\")\n", + " else:\n", + " print(f\"No duplicates found in {label}.\")\n", + "# print_duplicates(model_names_gpu, \"GPU logs\")\n", + "\n", + "\n", + "def read_last_run_csv(path: Path) -> pd.DataFrame:\n", + " \"\"\"\n", + " Return a DataFrame for only the *last* '==== New Run ... ====' block.\n", + " Assumes next line after the marker is the CSV header.\n", + " \"\"\"\n", + " lines = path.read_text(encoding=\"utf-8\").splitlines()\n", + " # locate all run markers\n", + " run_idx = [i for i, line in enumerate(lines) if line.startswith(\"==== New Run:\")]\n", + " if not run_idx:\n", + " raise ValueError(f\"No '==== New Run' marker found in {path}\")\n", + " start = run_idx[-1] + 1 # header line index\n", + "\n", + " # slice from header to end and parse CSV\n", + " block = \"\\n\".join(lines[start:])\n", + " df = pd.read_csv(StringIO(block))\n", + "\n", + " # optional cleanup: strip units and cast to numbers if these columns exist\n", + " if \" utilization.gpu [%]\" in df.columns:\n", + " df[\" utilization.gpu [%]\"] = (\n", + " df[\" utilization.gpu [%]\"].astype(str).str.replace(\"%\", \"\", regex=False).str.strip().astype(\"float\")\n", + " )\n", + " if \" memory.used [MiB]\" in df.columns:\n", + " df[\" memory.used [MiB]\"] = (\n", + " df[\" memory.used [MiB]\"].astype(str).str.replace(\"MiB\", \"\", regex=False).str.strip().astype(\"float\")\n", + " )\n", + " # parse timestamp if desired\n", + " if \"timestamp\" in df.columns:\n", + " df[\"timestamp\"] = pd.to_datetime(df[\"timestamp\"], errors=\"coerce\")\n", + "\n", + " return df\n", + "\n", + "\n", + "def eq_full_util_time(df, util_col=\" utilization.gpu [%]\", interval_sec=60):\n", + " # clip just in case and cast to float\n", + " u = pd.to_numeric(df[util_col], errors=\"coerce\")\n", + " # u = pd.to_numeric(df[util_col], errors=\"coerce\").fillna(0).clip(0, 100)\n", + " eq_full_sec = float((u / 100 * interval_sec).sum())\n", + " full_sec = float(len(u)*interval_sec)\n", + "\n", + " # pretty formatter\n", + " h, rem = divmod(int(round(full_sec)), 3600)\n", + " m, s = divmod(rem, 60)\n", + " pretty_full = f\"{h}h {m}m\"\n", + " h, rem = divmod(int(round(eq_full_sec)), 3600)\n", + " 
+    "def eq_full_util_time(df, util_col=\" utilization.gpu [%]\", interval_sec=60):\n",
+    "    # Cast to float; non-numeric samples become NaN\n",
+    "    u = pd.to_numeric(df[util_col], errors=\"coerce\")\n",
+    "    # u = pd.to_numeric(df[util_col], errors=\"coerce\").fillna(0).clip(0, 100)\n",
+    "    eq_full_sec = float((u / 100 * interval_sec).sum())\n",
+    "    full_sec = float(len(u) * interval_sec)\n",
+    "\n",
+    "    # Pretty formatter\n",
+    "    h, rem = divmod(int(round(full_sec)), 3600)\n",
+    "    m, s = divmod(rem, 60)\n",
+    "    pretty_full = f\"{h}h {m}m\"\n",
+    "    h, rem = divmod(int(round(eq_full_sec)), 3600)\n",
+    "    m, s = divmod(rem, 60)\n",
+    "    pretty = f\"{h}h {m}m\"\n",
+    "    return pretty, pretty_full, eq_full_sec\n",
+    "\n",
+    "\n",
+    "gpu_dfs = [read_last_run_csv(path) for path in gpu_files]\n",
+    "\n",
+    "\n",
+    "results = []\n",
+    "for name, df in zip(model_names_gpu, gpu_dfs):\n",
+    "    pretty, pretty_full, eq_full_sec = eq_full_util_time(df)  # unpack values\n",
+    "    results.append((name, pretty, eq_full_sec, pretty_full))  # collect tuple\n",
+    "\n",
+    "# Turn into a DataFrame\n",
+    "gpu_util_df = pd.DataFrame(results, columns=[\"model_name\", \"gpu_util_time\", \"gpu_util_time_raw\", \"full_time_from_gpu_log\"])\n",
+    "\n",
+    "result_gpu_merged = gpu_util_df.merge(df_wide_merged, on=[\"model_name\"], how=\"left\")\n"
+   ]
+  },
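Before the leaderboard cell, a quick end-to-end check of the GPU-log handling above: `read_last_run_csv` keeps only the block after the *last* `==== New Run:` marker, and `eq_full_util_time` integrates the sampled utilization into seconds at 100% utilization. The sketch below replays both steps on an invented nvidia-smi-style log (the marker format, leading-space column names, and 60 s sampling interval mirror the notebook's assumptions; the log content is made up):

```python
from io import StringIO

import pandas as pd

log_text = """==== New Run: 2025-01-01 ====
timestamp, utilization.gpu [%], memory.used [MiB]
2025-01-01 00:00:00, 90 %, 10000 MiB
2025-01-01 00:01:00, 50 %, 11000 MiB
==== New Run: 2025-01-02 ====
timestamp, utilization.gpu [%], memory.used [MiB]
2025-01-02 00:00:00, 80 %, 9000 MiB
2025-01-02 00:01:00, 40 %, 9500 MiB
"""

# Keep only the last run's block (header + rows)
lines = log_text.splitlines()
start = max(i for i, line in enumerate(lines) if line.startswith("==== New Run:")) + 1
df = pd.read_csv(StringIO("\n".join(lines[start:])))

# nvidia-smi CSV headers keep a leading space after each comma
util = (
    df[" utilization.gpu [%]"]
    .astype(str)
    .str.replace("%", "", regex=False)
    .str.strip()
    .astype(float)
)

# 80% and 40% of two 60 s intervals -> 72 equivalent full-utilization seconds
eq_full_sec = float((util / 100 * 60).sum())
print(eq_full_sec)  # 72.0
```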
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Overall RankModel NameGPU Util Timegpu_util_time_rawfull_time_from_gpu_logParametersparameters_rawTotal Timetotal_time_rawbatch_sizebatch_sizesanli_r1(acc,none)anli_r2(acc,none)anli_r3(acc,none)arc_challenge(acc_norm,none)bbh(exact_match,get-answer)boolq(acc,none)drop(f1,none)gpqa_main_zeroshot(acc_norm,none)gsm8k(exact_match,strict-match)hellaswag(acc_norm,none)mmlu(acc,none)nq_open(exact_match,remove_whitespace)openbookqa(acc_norm,none)piqa(acc_norm,none)qnli(acc,none)sciq(acc_norm,none)triviaqa(exact_match,remove_whitespace)truthfulqa_mc1(acc,none)truthfulqa_mc2(acc,none)winogrande(acc,none)gsm8k(exact_match,strict-match)_rankbbh(exact_match,get-answer)_rankarc_challenge(acc_norm,none)_rankanli_r1(acc,none)_rankanli_r2(acc,none)_rankanli_r3(acc,none)_rankgpqa_main_zeroshot(acc_norm,none)_rankhellaswag(acc_norm,none)_rankpiqa(acc_norm,none)_rankwinogrande(acc,none)_rankboolq(acc,none)_rankopenbookqa(acc_norm,none)_ranksciq(acc_norm,none)_rankqnli(acc,none)_rankmmlu(acc,none)_ranknq_open(exact_match,remove_whitespace)_rankdrop(f1,none)_ranktruthfulqa_mc1(acc,none)_ranktruthfulqa_mc2(acc,none)_ranktriviaqa(exact_match,remove_whitespace)_rankReasoning & Math Mean ScoreReasoning & Math Avg. RankCommonsense & NLI Mean ScoreCommonsense & NLI Avg. RankKnowledge & Reading Mean ScoreKnowledge & Reading Avg. RankMean Score
01google_gemma-3-12b-it14h 8m50906.415h 47m12.2B1218732504015h 45m56750.865892auto[2]0.6030.5600.5958330.6109220.8018740.8746180.1395660.3370540.8771800.8187610.7161370.1570640.4980.7807400.7457440.9540.2752450.4051410.5811830.7442783.01.02.05.03.02.09.03.019.04.03.01.06.08.06.08.08.08.08.023.00.626610.773730.3791100.6038
12Qwen_Qwen3-14B (8bit)17h 29m62956.229h 46m14.8B1476830720029h 45m107151.8020651[]0.6460.5700.5566670.6006830.4329600.8917430.0904100.3973210.8984080.7876920.7694770.0922440.4600.7948860.8442250.9660.4074900.4063650.5894040.7206001.029.04.04.02.03.01.012.014.011.01.08.01.02.02.015.018.07.06.013.00.586030.780720.392670.5961
23openchat_openchat-3.6-8b-202405226h 59m25150.87h 52m8.0B80302612487h 51m28278.8594703[]0.5560.5130.4800000.6032420.6178770.8727830.2515690.3325890.7505690.7978490.6430710.1706370.4620.8182810.7300020.9640.5659270.3525090.4976010.76322016.010.03.09.08.011.011.06.03.01.04.07.02.09.014.05.03.015.019.03.00.550560.772650.413620.5871
34Qwen_Qwen3-8B13h 44m49497.015h 33m8.2B819073536015h 31m55918.467860auto[1]0.6690.5420.5558330.5622870.7975730.8657490.1098770.3504460.8726310.7486560.7289560.0736840.4180.7752990.7818050.9580.3206090.3635250.5431400.6803474.02.011.03.05.04.05.024.021.022.06.020.04.07.03.016.013.013.013.018.00.621420.746880.3566140.5859
45Qwen_Qwen2.5-7B-Instruct8h 33m30831.69h 38m7.6B76156165129h 36m34616.6042483[]0.6850.5490.5525000.5529010.4487790.8633030.0710890.3281250.7626990.8049190.7180600.0457060.4860.8030470.8045030.9370.3254010.4773560.6484830.71191812.027.012.02.04.05.012.05.010.014.07.02.012.04.04.023.027.02.02.017.00.554150.773040.381090.5788
56Qwen_Qwen2.5-14B-Instruct (8bit)29h 32m106374.652h 45m14.8B1477003366452h 44m189869.4094041[]0.7210.6340.6175000.6151880.1068960.8862390.0712760.3549110.7922670.8419640.7830790.0614960.4760.8171930.8539260.9290.0392890.5104040.6830150.7545389.041.01.01.01.01.04.01.04.02.02.04.017.01.01.019.025.01.01.035.00.548870.794110.3581130.5775
6701-ai_Yi-1.5-9B10h 26m37569.611h 44m8.8B882940723211h 43m42212.1126222[]0.5320.4800.4391670.5469280.7120260.8581040.4456860.2946430.6391210.7789290.6892890.1531860.4560.8063110.5086950.9520.5438030.3219090.4675720.72612525.05.015.012.012.018.020.014.06.09.010.09.07.028.08.010.01.019.026.04.00.5206160.7266150.436910.5676
78Qwen_Qwen2.5-7B-Instruct-1M10h 10m36621.011h 18m7.6B761561651211h 17m40632.813397auto[1]0.5850.5330.5566670.5853240.2772230.8525990.0570470.3392860.7952990.7899820.7166360.1576180.4800.8161040.6781990.9500.4205310.4259490.6000720.7277038.038.07.06.06.03.08.011.05.08.012.03.08.011.05.07.029.03.03.012.00.5245150.756470.396360.5672
89meta-llama_Llama-3.1-8B-Instruct10h 52m39147.612h 20m8.0B803026124812h 19m44363.249360auto[1]0.4820.4670.4433330.5503410.7155580.8415900.1937290.3437500.7543590.7920730.6793190.1775620.4320.8063110.5013730.9620.5181680.3659730.5411540.73875315.04.013.018.014.017.06.09.06.06.016.016.03.030.012.04.04.012.014.07.00.5366120.7249170.412730.5653
91001-ai_Yi-1.5-9B-Chat12h 15m44120.413h 55m8.8B882940723213h 54m50056.3313452[]0.5350.5090.5258330.5870310.6106590.8681960.1253260.3035710.7081120.7872930.6840910.0094180.4360.8035910.7876620.9540.3386650.3745410.5479340.74664618.012.06.011.09.06.018.013.09.03.05.014.06.06.09.036.09.010.011.015.00.539990.769160.3467150.5621
1011mistralai_Ministral-8B-Instruct-24109h 27m34053.610h 47m8.0B801980825610h 46m38770.339256auto[1]0.4880.4870.4658330.5622870.6925200.8602450.0714130.3415180.7748290.7910770.6407210.1576180.4660.8231770.4949660.9560.5278090.3255810.4866700.73796411.07.011.016.011.014.07.010.02.07.08.06.05.039.015.07.024.018.022.05.00.544680.7328120.3683120.5576
1112meta-llama_Meta-Llama-3-8B-Instruct5h 46m20809.86h 31m8.0B80302612486h 30m23440.2344213[]0.4840.4580.4483330.5639930.6790050.8311930.1639770.3102680.7566340.7592110.6387270.1590030.4300.7872690.5464030.9320.5112020.3635250.5171420.71665414.08.010.017.018.016.016.019.018.013.019.017.015.020.016.06.05.013.017.08.00.5286130.7147220.392380.5528
1213Qwen_Qwen3-4B5h 3m18234.65h 52m4.0B40224680965h 51m21077.9436466[]0.5500.4610.5133330.5392490.7522650.8504590.0977070.3258930.8567100.6833300.6835920.0146810.4020.7513600.8087130.9320.2250330.3671970.5475750.6582485.03.017.010.017.08.013.029.027.027.013.023.015.03.010.034.016.011.012.026.00.571240.7266160.3226210.5510
1314NousResearch_Hermes-2-Pro-Mistral-7B7h 28m26916.08h 28m7.2B72419942408h 27m30434.3290213[]0.5310.4960.5000000.5657000.5737980.8681960.1097540.2767860.6853680.8049190.6051130.0404430.4340.7986940.5564710.9170.4711320.4137090.5911560.71981121.017.09.013.010.09.025.05.012.012.05.015.020.018.020.025.014.06.05.011.00.5184170.7284130.3719110.5480
1415mistralai_Mistral-7B-Instruct-v0.37h 41m27676.88h 39m7.2B72480235528h 38m31084.8383243[]0.4760.4430.4483330.5895900.5625860.8584100.0899720.2834820.4897650.8289190.5971370.1537400.4700.8269860.5145520.9430.5683240.4210530.5968130.74033128.018.05.020.020.016.024.02.01.05.09.05.09.026.022.09.019.04.04.02.00.4704220.740390.404550.5451
1516google_gemma-3-4b-it3h 50m13811.44h 52m4.3B43000794724h 51m17460.233507auto[4]0.4920.4710.4683330.5708190.7094150.8397550.0892840.2879460.7619410.7413860.5755590.1094180.4660.7720350.5659890.9310.3148130.3488370.5188210.70086813.06.08.015.013.013.023.026.022.017.017.06.016.016.024.013.020.016.016.019.00.5374110.7167190.3261200.5368
161701-ai_Yi-1.5-6B-Chat7h 1m25318.88h 5m6.1B60610355208h 4m29040.4298022[]0.4770.4530.4600000.5392490.5478420.8474010.1160810.3571430.6702050.7674770.6178610.0271470.4360.7878130.6794800.9340.3309740.3769890.5343710.70955022.021.017.019.019.015.03.016.017.015.015.014.014.010.018.029.012.09.015.016.00.5006190.7374100.3339190.5335
171801-ai_Yi-1.5-6B3h 54m14091.64h 29m6.1B60610355204h 28m16094.199661auto[8]0.4480.4070.4066670.4965870.5754880.8015290.3994620.2901790.5223650.7541330.6242700.1781160.4220.8014150.5985720.9410.4952070.2998780.4407500.72060027.016.020.021.026.027.022.022.011.011.023.019.010.013.017.03.02.023.032.010.00.4495240.7199180.406340.5312
1819Qwen_Qwen2-7B-Instruct10h 11m36684.611h 31m7.6B761561651211h 30m41431.857967auto[1]0.5730.5250.5225000.5401020.5774840.8562690.0520280.3147320.6467020.8060150.6994020.0132960.4620.8057670.5471350.9160.0081360.4051410.5734370.69850023.015.016.07.07.07.015.04.07.018.011.07.021.019.07.035.031.08.09.039.00.5285140.7274140.2919240.5271
1920deepseek-ai_DeepSeek-R1-0528-Qwen3-8B15h 30m55855.217h 59m8.2B819073536017h 57m64675.539163auto[1]0.5110.4640.4766670.5494880.5840880.8483180.0532790.3727680.8127370.7564230.6829510.0182830.4300.7568010.5577520.9410.0294810.3574050.5590130.6756127.014.014.014.016.012.02.020.025.024.014.017.010.017.011.033.030.014.010.036.00.5387100.7094230.2834280.5219
2021meta-llama_Llama-3.2-3B-Instruct5h 57m21477.07h 13m3.2B32127498247h 12m25939.885959auto[2]0.4470.4180.4308330.4590440.5564430.7847090.1553940.3281250.6421530.7054370.6051840.1390580.3580.7551690.5451220.9320.3389430.3268050.4975790.67087624.020.025.022.023.021.012.027.026.025.026.028.015.021.019.012.07.017.020.014.00.4688230.6788300.3438160.5048
2122Qwen_Qwen2.5-3B-Instruct6h 30m23452.27h 49m3.1B30859386887h 48m28089.516568auto:4[2, 64, 64, 64, 64]0.5620.4660.4941670.4820820.2491170.8012230.0773330.3214290.1015920.7490540.6549640.0083100.4220.7807400.7979130.9130.3009920.4161570.5860550.69297641.039.022.08.015.010.014.023.019.020.024.019.024.05.013.037.021.05.07.021.00.3823320.7367110.3406170.4939
2223Qwen_Qwen2.5-Math-7B24h 38m88696.227h 23m7.6B761561651227h 21m98517.403245auto[4]0.3870.4070.3825000.5025600.6724010.7455660.0432350.3080360.8476120.6528580.5799030.0509700.3920.7453750.4980780.9290.2183460.3206850.4832190.6479876.09.018.030.026.031.017.031.029.029.033.024.017.033.023.022.033.020.023.027.00.5010180.6587320.2827290.4907
2324deepseek-ai_deepseek-llm-7b-chat9h 8m32906.410h 8m6.9B691036569610h 6m36412.9692443[]0.4230.4190.4208330.4965870.4547690.8330280.1030480.2924110.4639880.7772360.4987890.0634350.4600.8014150.4969800.8930.3111900.3488370.4789330.70165729.026.020.026.022.023.021.015.011.016.018.08.029.035.032.018.015.016.024.020.00.4244270.7090240.3007230.4869
2425deepseek-ai_DeepSeek-R1-Distill-Llama-8B10h 36m38179.211h 47m8.0B803026124811h 46m42405.489811auto:5[1, 64, 64, 64, 64, 64]0.4040.4100.3883330.4232080.6037480.8287460.0712250.2745540.6247160.7429790.5326880.0584490.4100.7758430.5147350.8990.1940480.3219090.5044600.67797926.013.031.029.025.029.026.025.020.023.020.021.027.025.027.020.026.019.018.028.00.4469260.6928270.2805300.4830
2526meta-llama_Llama-2-13b-hf17h 38m63506.419h 22m13.0B1301586432019h 21m69687.765642auto[1]0.3770.3900.3850000.4897610.4776530.8064220.0301320.2544640.2297190.7938660.5209370.2362880.4520.8052230.4953320.9350.6088390.2594860.3689920.72217836.025.021.031.029.030.032.08.08.010.022.010.013.038.031.01.038.033.040.01.00.3719330.7157200.3374180.4819
2627meta-llama_Llama-2-13b-chat-hf15h 37m56271.617h 9m13.0B1301586432017h 8m61732.053618auto[1]0.4300.4300.4141670.5017060.4779600.8165140.0915090.2991070.3472330.7966540.5312630.1030470.4400.7932540.5438400.9050.2724590.2802940.4396240.71191831.024.019.025.021.024.019.07.015.014.021.012.026.022.028.014.017.030.033.024.00.4143280.7153210.2864260.4813
2728deepseek-ai_DeepSeek-R1-Distill-Qwen-7B5h 43m20637.06h 29m7.6B76156165126h 28m23311.0229413[]0.4450.4180.4100000.4377130.5569040.7782870.0411980.3348210.7862020.6025690.5263500.0321330.3600.7165400.5209590.9180.0592400.2888620.4563190.59905310.019.028.023.023.025.010.034.033.033.028.027.019.024.029.028.035.028.029.034.00.4841210.6422340.2340350.4644
2829Qwen_Qwen2.5-1.5B-Instruct2h 36m9398.43h 21m1.5B15437143043h 20m12036.5651956[]0.4480.3920.4316670.4684300.3692210.7813460.0390520.2834820.3191810.6829320.6005550.0415510.4060.7584330.5667220.9390.2826010.3121180.4657480.62746633.037.023.021.028.020.024.030.024.030.027.022.011.015.021.024.036.021.027.022.00.3874310.6803290.2903250.4608
2930Qwen_Qwen3-1.7B3h 36m13010.44h 26m1.7B17205749764h 25m15915.2685756[]0.4100.4040.4341670.4343000.4825680.7764530.0752600.2901790.6899170.6037640.5537670.0221610.3760.7203480.5105250.9140.1349750.2949820.4588120.60852420.023.029.028.027.019.022.033.032.031.030.026.023.027.025.030.023.025.028.032.00.4493250.6442330.2567340.4597
3031Qwen_Qwen2.5-Math-7B-Instruct4h 57m17861.45h 38m7.6B76156165125h 37m20230.489569auto[4]0.4310.4150.4291670.4308870.6140380.6061160.0272990.2879460.8900680.5881300.5372450.0199450.3340.6855280.6774670.8580.0074680.2986540.4750350.5793212.011.030.024.024.022.023.035.035.035.039.030.032.012.026.032.040.024.025.040.00.4997200.6184370.2276360.4596
3132meta-llama_Llama-2-7b-chat-hf6h 7m22072.86h 59m6.7B67384156166h 57m25079.294749auto[4]0.4170.4100.4075000.4428330.4013210.7978590.1174970.2611610.2319940.7548300.4636090.0667590.4380.7714910.5800840.8780.1903700.3023260.4532170.66456235.033.027.027.025.026.031.021.023.026.025.013.031.014.033.017.011.022.030.029.00.3674350.6978250.2656320.4525
3233meta-llama_Llama-2-7b-hf4h 59m17980.25h 43m6.7B67384156165h 42m20539.258032auto[4]0.3640.3720.3758330.4624570.3990170.7773700.0363350.2410710.1379830.7600080.4185300.1889200.4420.7905330.4991760.9100.5250780.2521420.3897160.68981840.034.024.033.031.033.034.018.016.021.029.011.025.031.037.02.037.034.038.06.00.3361390.6956260.3018220.4516
3334deepseek-ai_deepseek-llm-7b-base6h 26m23180.47h 12m6.9B69103656967h 11m25877.1867203[]0.3400.3630.3775000.4453920.4237440.7235470.0421810.2522320.1622440.7606050.4428140.1509700.4340.7976060.4958810.9150.5003900.2325580.3492140.69376538.030.026.037.032.032.033.017.013.019.034.015.022.037.036.011.034.036.041.09.00.3377380.6886280.2864270.4451
3435deepseek-ai_deepseek-math-7b-rl7h 12m25973.48h 3m6.9B69103656968h 2m28925.1107833[]0.3680.3890.4050000.4897610.5246510.7559630.1190270.2723210.1425320.6896040.5249960.0393350.4240.7502720.4989930.9280.1746540.2876380.4028840.65114439.022.021.032.030.028.027.028.028.028.032.018.018.032.030.026.010.029.037.031.00.3702340.6711310.2581330.4419
3536meta-llama_Llama-3.2-1B-Instruct2h 35m9307.83h 32m1.2B12358144003h 30m12653.736082auto[2]0.3380.3340.3725000.3805460.3781290.6948010.1634840.2745540.3373770.6088430.4589090.0565100.3460.7421110.4946000.8970.2499440.2717260.4383000.60142132.036.032.038.037.034.026.032.030.032.035.029.028.040.034.021.06.031.034.025.00.3450370.6264360.2731310.4219
3637google_gemma-3-1b-it4h 52m17533.86h 51m999.9M9998859526h 50m24641.929494auto[1]0.3320.3540.3566670.3805460.3822760.7581040.0761570.2656250.2471570.5782710.3859140.0357340.3880.7208920.4940510.8580.1897010.2460220.3874630.58958234.035.032.039.034.036.030.036.031.034.031.025.032.041.039.027.022.035.039.030.00.3312400.6267350.2202380.4013
3738deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B2h 52m10353.63h 42m1.8B17770880003h 40m13254.9130526[]0.3560.3620.3625000.3464160.4059280.6801220.0506860.2723210.7012890.4467240.3606320.0063710.3080.6577800.5054000.8450.0090280.2937580.4517420.54932919.032.034.034.033.035.027.039.037.038.036.032.033.029.041.038.032.026.031.038.00.4009300.5703390.1954410.3986
3839Qwen_Qwen2.5-Math-1.5B-Instruct2h 39m9542.43h 26m1.5B15437143043h 25m12324.098490auto:4[6, 64, 64, 64, 64]0.3420.3410.3533330.3651880.4372600.5694190.0230860.2834820.7369220.4165500.3787920.0038780.2860.6137110.4973460.7180.0042910.2900860.4895010.52565117.028.033.036.036.037.024.040.038.039.040.033.035.034.040.039.041.027.021.041.00.4085290.5181410.1983400.3838
3940Qwen_Qwen3-0.6B2h 53m10404.63h 46m596.0M5960499203h 45m13547.4461416[]0.3430.3190.3441670.3421500.4148360.6391440.0605440.2700890.4124340.4719180.4012960.0204990.3200.6751900.4960640.8330.0192820.2705020.4277420.55169730.031.035.035.038.039.028.038.036.037.038.031.034.036.038.031.028.032.035.037.00.3494360.5696400.2000390.3816
4041Qwen_Qwen2.5-0.5B-Instruct1h 48m6532.82h 35m494.0M4940327682h 34m9253.0747696[]0.3240.3420.3475000.3370310.2137920.6767580.0286440.2678570.2077330.5240990.4575560.0204990.3460.7040260.5368840.8830.1341950.2717260.4183870.55643337.040.036.040.035.038.029.037.034.036.037.029.030.023.035.031.039.031.036.033.00.2914410.6039380.2218370.3799
\n", + "
" + ], + "text/plain": [ + " Overall Rank Model Name GPU Util Time \\\n", + "0 1 google_gemma-3-12b-it 14h 8m \n", + "1 2 Qwen_Qwen3-14B (8bit) 17h 29m \n", + "2 3 openchat_openchat-3.6-8b-20240522 6h 59m \n", + "3 4 Qwen_Qwen3-8B 13h 44m \n", + "4 5 Qwen_Qwen2.5-7B-Instruct 8h 33m \n", + "5 6 Qwen_Qwen2.5-14B-Instruct (8bit) 29h 32m \n", + "6 7 01-ai_Yi-1.5-9B 10h 26m \n", + "7 8 Qwen_Qwen2.5-7B-Instruct-1M 10h 10m \n", + "8 9 meta-llama_Llama-3.1-8B-Instruct 10h 52m \n", + "9 10 01-ai_Yi-1.5-9B-Chat 12h 15m \n", + "10 11 mistralai_Ministral-8B-Instruct-2410 9h 27m \n", + "11 12 meta-llama_Meta-Llama-3-8B-Instruct 5h 46m \n", + "12 13 Qwen_Qwen3-4B 5h 3m \n", + "13 14 NousResearch_Hermes-2-Pro-Mistral-7B 7h 28m \n", + "14 15 mistralai_Mistral-7B-Instruct-v0.3 7h 41m \n", + "15 16 google_gemma-3-4b-it 3h 50m \n", + "16 17 01-ai_Yi-1.5-6B-Chat 7h 1m \n", + "17 18 01-ai_Yi-1.5-6B 3h 54m \n", + "18 19 Qwen_Qwen2-7B-Instruct 10h 11m \n", + "19 20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 15h 30m \n", + "20 21 meta-llama_Llama-3.2-3B-Instruct 5h 57m \n", + "21 22 Qwen_Qwen2.5-3B-Instruct 6h 30m \n", + "22 23 Qwen_Qwen2.5-Math-7B 24h 38m \n", + "23 24 deepseek-ai_deepseek-llm-7b-chat 9h 8m \n", + "24 25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 10h 36m \n", + "25 26 meta-llama_Llama-2-13b-hf 17h 38m \n", + "26 27 meta-llama_Llama-2-13b-chat-hf 15h 37m \n", + "27 28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 5h 43m \n", + "28 29 Qwen_Qwen2.5-1.5B-Instruct 2h 36m \n", + "29 30 Qwen_Qwen3-1.7B 3h 36m \n", + "30 31 Qwen_Qwen2.5-Math-7B-Instruct 4h 57m \n", + "31 32 meta-llama_Llama-2-7b-chat-hf 6h 7m \n", + "32 33 meta-llama_Llama-2-7b-hf 4h 59m \n", + "33 34 deepseek-ai_deepseek-llm-7b-base 6h 26m \n", + "34 35 deepseek-ai_deepseek-math-7b-rl 7h 12m \n", + "35 36 meta-llama_Llama-3.2-1B-Instruct 2h 35m \n", + "36 37 google_gemma-3-1b-it 4h 52m \n", + "37 38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 2h 52m \n", + "38 39 Qwen_Qwen2.5-Math-1.5B-Instruct 2h 39m \n", + "39 40 Qwen_Qwen3-0.6B 2h 53m \n", + "40 41 Qwen_Qwen2.5-0.5B-Instruct 1h 48m \n", + "\n", + " gpu_util_time_raw full_time_from_gpu_log Parameters parameters_raw \\\n", + "0 50906.4 15h 47m 12.2B 12187325040 \n", + "1 62956.2 29h 46m 14.8B 14768307200 \n", + "2 25150.8 7h 52m 8.0B 8030261248 \n", + "3 49497.0 15h 33m 8.2B 8190735360 \n", + "4 30831.6 9h 38m 7.6B 7615616512 \n", + "5 106374.6 52h 45m 14.8B 14770033664 \n", + "6 37569.6 11h 44m 8.8B 8829407232 \n", + "7 36621.0 11h 18m 7.6B 7615616512 \n", + "8 39147.6 12h 20m 8.0B 8030261248 \n", + "9 44120.4 13h 55m 8.8B 8829407232 \n", + "10 34053.6 10h 47m 8.0B 8019808256 \n", + "11 20809.8 6h 31m 8.0B 8030261248 \n", + "12 18234.6 5h 52m 4.0B 4022468096 \n", + "13 26916.0 8h 28m 7.2B 7241994240 \n", + "14 27676.8 8h 39m 7.2B 7248023552 \n", + "15 13811.4 4h 52m 4.3B 4300079472 \n", + "16 25318.8 8h 5m 6.1B 6061035520 \n", + "17 14091.6 4h 29m 6.1B 6061035520 \n", + "18 36684.6 11h 31m 7.6B 7615616512 \n", + "19 55855.2 17h 59m 8.2B 8190735360 \n", + "20 21477.0 7h 13m 3.2B 3212749824 \n", + "21 23452.2 7h 49m 3.1B 3085938688 \n", + "22 88696.2 27h 23m 7.6B 7615616512 \n", + "23 32906.4 10h 8m 6.9B 6910365696 \n", + "24 38179.2 11h 47m 8.0B 8030261248 \n", + "25 63506.4 19h 22m 13.0B 13015864320 \n", + "26 56271.6 17h 9m 13.0B 13015864320 \n", + "27 20637.0 6h 29m 7.6B 7615616512 \n", + "28 9398.4 3h 21m 1.5B 1543714304 \n", + "29 13010.4 4h 26m 1.7B 1720574976 \n", + "30 17861.4 5h 38m 7.6B 7615616512 \n", + "31 22072.8 6h 59m 6.7B 6738415616 \n", + "32 17980.2 5h 43m 6.7B 6738415616 \n", + 
"33 23180.4 7h 12m 6.9B 6910365696 \n", + "34 25973.4 8h 3m 6.9B 6910365696 \n", + "35 9307.8 3h 32m 1.2B 1235814400 \n", + "36 17533.8 6h 51m 999.9M 999885952 \n", + "37 10353.6 3h 42m 1.8B 1777088000 \n", + "38 9542.4 3h 26m 1.5B 1543714304 \n", + "39 10404.6 3h 46m 596.0M 596049920 \n", + "40 6532.8 2h 35m 494.0M 494032768 \n", + "\n", + " Total Time total_time_raw batch_size batch_sizes \\\n", + "0 15h 45m 56750.865892 auto [2] \n", + "1 29h 45m 107151.802065 1 [] \n", + "2 7h 51m 28278.859470 3 [] \n", + "3 15h 31m 55918.467860 auto [1] \n", + "4 9h 36m 34616.604248 3 [] \n", + "5 52h 44m 189869.409404 1 [] \n", + "6 11h 43m 42212.112622 2 [] \n", + "7 11h 17m 40632.813397 auto [1] \n", + "8 12h 19m 44363.249360 auto [1] \n", + "9 13h 54m 50056.331345 2 [] \n", + "10 10h 46m 38770.339256 auto [1] \n", + "11 6h 30m 23440.234421 3 [] \n", + "12 5h 51m 21077.943646 6 [] \n", + "13 8h 27m 30434.329021 3 [] \n", + "14 8h 38m 31084.838324 3 [] \n", + "15 4h 51m 17460.233507 auto [4] \n", + "16 8h 4m 29040.429802 2 [] \n", + "17 4h 28m 16094.199661 auto [8] \n", + "18 11h 30m 41431.857967 auto [1] \n", + "19 17h 57m 64675.539163 auto [1] \n", + "20 7h 12m 25939.885959 auto [2] \n", + "21 7h 48m 28089.516568 auto:4 [2, 64, 64, 64, 64] \n", + "22 27h 21m 98517.403245 auto [4] \n", + "23 10h 6m 36412.969244 3 [] \n", + "24 11h 46m 42405.489811 auto:5 [1, 64, 64, 64, 64, 64] \n", + "25 19h 21m 69687.765642 auto [1] \n", + "26 17h 8m 61732.053618 auto [1] \n", + "27 6h 28m 23311.022941 3 [] \n", + "28 3h 20m 12036.565195 6 [] \n", + "29 4h 25m 15915.268575 6 [] \n", + "30 5h 37m 20230.489569 auto [4] \n", + "31 6h 57m 25079.294749 auto [4] \n", + "32 5h 42m 20539.258032 auto [4] \n", + "33 7h 11m 25877.186720 3 [] \n", + "34 8h 2m 28925.110783 3 [] \n", + "35 3h 30m 12653.736082 auto [2] \n", + "36 6h 50m 24641.929494 auto [1] \n", + "37 3h 40m 13254.913052 6 [] \n", + "38 3h 25m 12324.098490 auto:4 [6, 64, 64, 64, 64] \n", + "39 3h 45m 13547.446141 6 [] \n", + "40 2h 34m 9253.074769 6 [] \n", + "\n", + " anli_r1(acc,none) anli_r2(acc,none) anli_r3(acc,none) \\\n", + "0 0.603 0.560 0.595833 \n", + "1 0.646 0.570 0.556667 \n", + "2 0.556 0.513 0.480000 \n", + "3 0.669 0.542 0.555833 \n", + "4 0.685 0.549 0.552500 \n", + "5 0.721 0.634 0.617500 \n", + "6 0.532 0.480 0.439167 \n", + "7 0.585 0.533 0.556667 \n", + "8 0.482 0.467 0.443333 \n", + "9 0.535 0.509 0.525833 \n", + "10 0.488 0.487 0.465833 \n", + "11 0.484 0.458 0.448333 \n", + "12 0.550 0.461 0.513333 \n", + "13 0.531 0.496 0.500000 \n", + "14 0.476 0.443 0.448333 \n", + "15 0.492 0.471 0.468333 \n", + "16 0.477 0.453 0.460000 \n", + "17 0.448 0.407 0.406667 \n", + "18 0.573 0.525 0.522500 \n", + "19 0.511 0.464 0.476667 \n", + "20 0.447 0.418 0.430833 \n", + "21 0.562 0.466 0.494167 \n", + "22 0.387 0.407 0.382500 \n", + "23 0.423 0.419 0.420833 \n", + "24 0.404 0.410 0.388333 \n", + "25 0.377 0.390 0.385000 \n", + "26 0.430 0.430 0.414167 \n", + "27 0.445 0.418 0.410000 \n", + "28 0.448 0.392 0.431667 \n", + "29 0.410 0.404 0.434167 \n", + "30 0.431 0.415 0.429167 \n", + "31 0.417 0.410 0.407500 \n", + "32 0.364 0.372 0.375833 \n", + "33 0.340 0.363 0.377500 \n", + "34 0.368 0.389 0.405000 \n", + "35 0.338 0.334 0.372500 \n", + "36 0.332 0.354 0.356667 \n", + "37 0.356 0.362 0.362500 \n", + "38 0.342 0.341 0.353333 \n", + "39 0.343 0.319 0.344167 \n", + "40 0.324 0.342 0.347500 \n", + "\n", + " arc_challenge(acc_norm,none) bbh(exact_match,get-answer) \\\n", + "0 0.610922 0.801874 \n", + "1 0.600683 0.432960 \n", + "2 0.603242 0.617877 
\n", + "3 0.562287 0.797573 \n", + "4 0.552901 0.448779 \n", + "5 0.615188 0.106896 \n", + "6 0.546928 0.712026 \n", + "7 0.585324 0.277223 \n", + "8 0.550341 0.715558 \n", + "9 0.587031 0.610659 \n", + "10 0.562287 0.692520 \n", + "11 0.563993 0.679005 \n", + "12 0.539249 0.752265 \n", + "13 0.565700 0.573798 \n", + "14 0.589590 0.562586 \n", + "15 0.570819 0.709415 \n", + "16 0.539249 0.547842 \n", + "17 0.496587 0.575488 \n", + "18 0.540102 0.577484 \n", + "19 0.549488 0.584088 \n", + "20 0.459044 0.556443 \n", + "21 0.482082 0.249117 \n", + "22 0.502560 0.672401 \n", + "23 0.496587 0.454769 \n", + "24 0.423208 0.603748 \n", + "25 0.489761 0.477653 \n", + "26 0.501706 0.477960 \n", + "27 0.437713 0.556904 \n", + "28 0.468430 0.369221 \n", + "29 0.434300 0.482568 \n", + "30 0.430887 0.614038 \n", + "31 0.442833 0.401321 \n", + "32 0.462457 0.399017 \n", + "33 0.445392 0.423744 \n", + "34 0.489761 0.524651 \n", + "35 0.380546 0.378129 \n", + "36 0.380546 0.382276 \n", + "37 0.346416 0.405928 \n", + "38 0.365188 0.437260 \n", + "39 0.342150 0.414836 \n", + "40 0.337031 0.213792 \n", + "\n", + " boolq(acc,none) drop(f1,none) gpqa_main_zeroshot(acc_norm,none) \\\n", + "0 0.874618 0.139566 0.337054 \n", + "1 0.891743 0.090410 0.397321 \n", + "2 0.872783 0.251569 0.332589 \n", + "3 0.865749 0.109877 0.350446 \n", + "4 0.863303 0.071089 0.328125 \n", + "5 0.886239 0.071276 0.354911 \n", + "6 0.858104 0.445686 0.294643 \n", + "7 0.852599 0.057047 0.339286 \n", + "8 0.841590 0.193729 0.343750 \n", + "9 0.868196 0.125326 0.303571 \n", + "10 0.860245 0.071413 0.341518 \n", + "11 0.831193 0.163977 0.310268 \n", + "12 0.850459 0.097707 0.325893 \n", + "13 0.868196 0.109754 0.276786 \n", + "14 0.858410 0.089972 0.283482 \n", + "15 0.839755 0.089284 0.287946 \n", + "16 0.847401 0.116081 0.357143 \n", + "17 0.801529 0.399462 0.290179 \n", + "18 0.856269 0.052028 0.314732 \n", + "19 0.848318 0.053279 0.372768 \n", + "20 0.784709 0.155394 0.328125 \n", + "21 0.801223 0.077333 0.321429 \n", + "22 0.745566 0.043235 0.308036 \n", + "23 0.833028 0.103048 0.292411 \n", + "24 0.828746 0.071225 0.274554 \n", + "25 0.806422 0.030132 0.254464 \n", + "26 0.816514 0.091509 0.299107 \n", + "27 0.778287 0.041198 0.334821 \n", + "28 0.781346 0.039052 0.283482 \n", + "29 0.776453 0.075260 0.290179 \n", + "30 0.606116 0.027299 0.287946 \n", + "31 0.797859 0.117497 0.261161 \n", + "32 0.777370 0.036335 0.241071 \n", + "33 0.723547 0.042181 0.252232 \n", + "34 0.755963 0.119027 0.272321 \n", + "35 0.694801 0.163484 0.274554 \n", + "36 0.758104 0.076157 0.265625 \n", + "37 0.680122 0.050686 0.272321 \n", + "38 0.569419 0.023086 0.283482 \n", + "39 0.639144 0.060544 0.270089 \n", + "40 0.676758 0.028644 0.267857 \n", + "\n", + " gsm8k(exact_match,strict-match) hellaswag(acc_norm,none) mmlu(acc,none) \\\n", + "0 0.877180 0.818761 0.716137 \n", + "1 0.898408 0.787692 0.769477 \n", + "2 0.750569 0.797849 0.643071 \n", + "3 0.872631 0.748656 0.728956 \n", + "4 0.762699 0.804919 0.718060 \n", + "5 0.792267 0.841964 0.783079 \n", + "6 0.639121 0.778929 0.689289 \n", + "7 0.795299 0.789982 0.716636 \n", + "8 0.754359 0.792073 0.679319 \n", + "9 0.708112 0.787293 0.684091 \n", + "10 0.774829 0.791077 0.640721 \n", + "11 0.756634 0.759211 0.638727 \n", + "12 0.856710 0.683330 0.683592 \n", + "13 0.685368 0.804919 0.605113 \n", + "14 0.489765 0.828919 0.597137 \n", + "15 0.761941 0.741386 0.575559 \n", + "16 0.670205 0.767477 0.617861 \n", + "17 0.522365 0.754133 0.624270 \n", + "18 0.646702 0.806015 0.699402 \n", + "19 0.812737 
0.756423 0.682951 \n", + "20 0.642153 0.705437 0.605184 \n", + "21 0.101592 0.749054 0.654964 \n", + "22 0.847612 0.652858 0.579903 \n", + "23 0.463988 0.777236 0.498789 \n", + "24 0.624716 0.742979 0.532688 \n", + "25 0.229719 0.793866 0.520937 \n", + "26 0.347233 0.796654 0.531263 \n", + "27 0.786202 0.602569 0.526350 \n", + "28 0.319181 0.682932 0.600555 \n", + "29 0.689917 0.603764 0.553767 \n", + "30 0.890068 0.588130 0.537245 \n", + "31 0.231994 0.754830 0.463609 \n", + "32 0.137983 0.760008 0.418530 \n", + "33 0.162244 0.760605 0.442814 \n", + "34 0.142532 0.689604 0.524996 \n", + "35 0.337377 0.608843 0.458909 \n", + "36 0.247157 0.578271 0.385914 \n", + "37 0.701289 0.446724 0.360632 \n", + "38 0.736922 0.416550 0.378792 \n", + "39 0.412434 0.471918 0.401296 \n", + "40 0.207733 0.524099 0.457556 \n", + "\n", + " nq_open(exact_match,remove_whitespace) openbookqa(acc_norm,none) \\\n", + "0 0.157064 0.498 \n", + "1 0.092244 0.460 \n", + "2 0.170637 0.462 \n", + "3 0.073684 0.418 \n", + "4 0.045706 0.486 \n", + "5 0.061496 0.476 \n", + "6 0.153186 0.456 \n", + "7 0.157618 0.480 \n", + "8 0.177562 0.432 \n", + "9 0.009418 0.436 \n", + "10 0.157618 0.466 \n", + "11 0.159003 0.430 \n", + "12 0.014681 0.402 \n", + "13 0.040443 0.434 \n", + "14 0.153740 0.470 \n", + "15 0.109418 0.466 \n", + "16 0.027147 0.436 \n", + "17 0.178116 0.422 \n", + "18 0.013296 0.462 \n", + "19 0.018283 0.430 \n", + "20 0.139058 0.358 \n", + "21 0.008310 0.422 \n", + "22 0.050970 0.392 \n", + "23 0.063435 0.460 \n", + "24 0.058449 0.410 \n", + "25 0.236288 0.452 \n", + "26 0.103047 0.440 \n", + "27 0.032133 0.360 \n", + "28 0.041551 0.406 \n", + "29 0.022161 0.376 \n", + "30 0.019945 0.334 \n", + "31 0.066759 0.438 \n", + "32 0.188920 0.442 \n", + "33 0.150970 0.434 \n", + "34 0.039335 0.424 \n", + "35 0.056510 0.346 \n", + "36 0.035734 0.388 \n", + "37 0.006371 0.308 \n", + "38 0.003878 0.286 \n", + "39 0.020499 0.320 \n", + "40 0.020499 0.346 \n", + "\n", + " piqa(acc_norm,none) qnli(acc,none) sciq(acc_norm,none) \\\n", + "0 0.780740 0.745744 0.954 \n", + "1 0.794886 0.844225 0.966 \n", + "2 0.818281 0.730002 0.964 \n", + "3 0.775299 0.781805 0.958 \n", + "4 0.803047 0.804503 0.937 \n", + "5 0.817193 0.853926 0.929 \n", + "6 0.806311 0.508695 0.952 \n", + "7 0.816104 0.678199 0.950 \n", + "8 0.806311 0.501373 0.962 \n", + "9 0.803591 0.787662 0.954 \n", + "10 0.823177 0.494966 0.956 \n", + "11 0.787269 0.546403 0.932 \n", + "12 0.751360 0.808713 0.932 \n", + "13 0.798694 0.556471 0.917 \n", + "14 0.826986 0.514552 0.943 \n", + "15 0.772035 0.565989 0.931 \n", + "16 0.787813 0.679480 0.934 \n", + "17 0.801415 0.598572 0.941 \n", + "18 0.805767 0.547135 0.916 \n", + "19 0.756801 0.557752 0.941 \n", + "20 0.755169 0.545122 0.932 \n", + "21 0.780740 0.797913 0.913 \n", + "22 0.745375 0.498078 0.929 \n", + "23 0.801415 0.496980 0.893 \n", + "24 0.775843 0.514735 0.899 \n", + "25 0.805223 0.495332 0.935 \n", + "26 0.793254 0.543840 0.905 \n", + "27 0.716540 0.520959 0.918 \n", + "28 0.758433 0.566722 0.939 \n", + "29 0.720348 0.510525 0.914 \n", + "30 0.685528 0.677467 0.858 \n", + "31 0.771491 0.580084 0.878 \n", + "32 0.790533 0.499176 0.910 \n", + "33 0.797606 0.495881 0.915 \n", + "34 0.750272 0.498993 0.928 \n", + "35 0.742111 0.494600 0.897 \n", + "36 0.720892 0.494051 0.858 \n", + "37 0.657780 0.505400 0.845 \n", + "38 0.613711 0.497346 0.718 \n", + "39 0.675190 0.496064 0.833 \n", + "40 0.704026 0.536884 0.883 \n", + "\n", + " triviaqa(exact_match,remove_whitespace) truthfulqa_mc1(acc,none) \\\n", + "0 
0.275245 0.405141 \n", + "1 0.407490 0.406365 \n", + "2 0.565927 0.352509 \n", + "3 0.320609 0.363525 \n", + "4 0.325401 0.477356 \n", + "5 0.039289 0.510404 \n", + "6 0.543803 0.321909 \n", + "7 0.420531 0.425949 \n", + "8 0.518168 0.365973 \n", + "9 0.338665 0.374541 \n", + "10 0.527809 0.325581 \n", + "11 0.511202 0.363525 \n", + "12 0.225033 0.367197 \n", + "13 0.471132 0.413709 \n", + "14 0.568324 0.421053 \n", + "15 0.314813 0.348837 \n", + "16 0.330974 0.376989 \n", + "17 0.495207 0.299878 \n", + "18 0.008136 0.405141 \n", + "19 0.029481 0.357405 \n", + "20 0.338943 0.326805 \n", + "21 0.300992 0.416157 \n", + "22 0.218346 0.320685 \n", + "23 0.311190 0.348837 \n", + "24 0.194048 0.321909 \n", + "25 0.608839 0.259486 \n", + "26 0.272459 0.280294 \n", + "27 0.059240 0.288862 \n", + "28 0.282601 0.312118 \n", + "29 0.134975 0.294982 \n", + "30 0.007468 0.298654 \n", + "31 0.190370 0.302326 \n", + "32 0.525078 0.252142 \n", + "33 0.500390 0.232558 \n", + "34 0.174654 0.287638 \n", + "35 0.249944 0.271726 \n", + "36 0.189701 0.246022 \n", + "37 0.009028 0.293758 \n", + "38 0.004291 0.290086 \n", + "39 0.019282 0.270502 \n", + "40 0.134195 0.271726 \n", + "\n", + " truthfulqa_mc2(acc,none) winogrande(acc,none) \\\n", + "0 0.581183 0.744278 \n", + "1 0.589404 0.720600 \n", + "2 0.497601 0.763220 \n", + "3 0.543140 0.680347 \n", + "4 0.648483 0.711918 \n", + "5 0.683015 0.754538 \n", + "6 0.467572 0.726125 \n", + "7 0.600072 0.727703 \n", + "8 0.541154 0.738753 \n", + "9 0.547934 0.746646 \n", + "10 0.486670 0.737964 \n", + "11 0.517142 0.716654 \n", + "12 0.547575 0.658248 \n", + "13 0.591156 0.719811 \n", + "14 0.596813 0.740331 \n", + "15 0.518821 0.700868 \n", + "16 0.534371 0.709550 \n", + "17 0.440750 0.720600 \n", + "18 0.573437 0.698500 \n", + "19 0.559013 0.675612 \n", + "20 0.497579 0.670876 \n", + "21 0.586055 0.692976 \n", + "22 0.483219 0.647987 \n", + "23 0.478933 0.701657 \n", + "24 0.504460 0.677979 \n", + "25 0.368992 0.722178 \n", + "26 0.439624 0.711918 \n", + "27 0.456319 0.599053 \n", + "28 0.465748 0.627466 \n", + "29 0.458812 0.608524 \n", + "30 0.475035 0.579321 \n", + "31 0.453217 0.664562 \n", + "32 0.389716 0.689818 \n", + "33 0.349214 0.693765 \n", + "34 0.402884 0.651144 \n", + "35 0.438300 0.601421 \n", + "36 0.387463 0.589582 \n", + "37 0.451742 0.549329 \n", + "38 0.489501 0.525651 \n", + "39 0.427742 0.551697 \n", + "40 0.418387 0.556433 \n", + "\n", + " gsm8k(exact_match,strict-match)_rank bbh(exact_match,get-answer)_rank \\\n", + "0 3.0 1.0 \n", + "1 1.0 29.0 \n", + "2 16.0 10.0 \n", + "3 4.0 2.0 \n", + "4 12.0 27.0 \n", + "5 9.0 41.0 \n", + "6 25.0 5.0 \n", + "7 8.0 38.0 \n", + "8 15.0 4.0 \n", + "9 18.0 12.0 \n", + "10 11.0 7.0 \n", + "11 14.0 8.0 \n", + "12 5.0 3.0 \n", + "13 21.0 17.0 \n", + "14 28.0 18.0 \n", + "15 13.0 6.0 \n", + "16 22.0 21.0 \n", + "17 27.0 16.0 \n", + "18 23.0 15.0 \n", + "19 7.0 14.0 \n", + "20 24.0 20.0 \n", + "21 41.0 39.0 \n", + "22 6.0 9.0 \n", + "23 29.0 26.0 \n", + "24 26.0 13.0 \n", + "25 36.0 25.0 \n", + "26 31.0 24.0 \n", + "27 10.0 19.0 \n", + "28 33.0 37.0 \n", + "29 20.0 23.0 \n", + "30 2.0 11.0 \n", + "31 35.0 33.0 \n", + "32 40.0 34.0 \n", + "33 38.0 30.0 \n", + "34 39.0 22.0 \n", + "35 32.0 36.0 \n", + "36 34.0 35.0 \n", + "37 19.0 32.0 \n", + "38 17.0 28.0 \n", + "39 30.0 31.0 \n", + "40 37.0 40.0 \n", + "\n", + " arc_challenge(acc_norm,none)_rank anli_r1(acc,none)_rank \\\n", + "0 2.0 5.0 \n", + "1 4.0 4.0 \n", + "2 3.0 9.0 \n", + "3 11.0 3.0 \n", + "4 12.0 2.0 \n", + "5 1.0 1.0 \n", + "6 15.0 12.0 \n", + "7 7.0 
6.0 \n", + "8 13.0 18.0 \n", + "9 6.0 11.0 \n", + "10 11.0 16.0 \n", + "11 10.0 17.0 \n", + "12 17.0 10.0 \n", + "13 9.0 13.0 \n", + "14 5.0 20.0 \n", + "15 8.0 15.0 \n", + "16 17.0 19.0 \n", + "17 20.0 21.0 \n", + "18 16.0 7.0 \n", + "19 14.0 14.0 \n", + "20 25.0 22.0 \n", + "21 22.0 8.0 \n", + "22 18.0 30.0 \n", + "23 20.0 26.0 \n", + "24 31.0 29.0 \n", + "25 21.0 31.0 \n", + "26 19.0 25.0 \n", + "27 28.0 23.0 \n", + "28 23.0 21.0 \n", + "29 29.0 28.0 \n", + "30 30.0 24.0 \n", + "31 27.0 27.0 \n", + "32 24.0 33.0 \n", + "33 26.0 37.0 \n", + "34 21.0 32.0 \n", + "35 32.0 38.0 \n", + "36 32.0 39.0 \n", + "37 34.0 34.0 \n", + "38 33.0 36.0 \n", + "39 35.0 35.0 \n", + "40 36.0 40.0 \n", + "\n", + " anli_r2(acc,none)_rank anli_r3(acc,none)_rank \\\n", + "0 3.0 2.0 \n", + "1 2.0 3.0 \n", + "2 8.0 11.0 \n", + "3 5.0 4.0 \n", + "4 4.0 5.0 \n", + "5 1.0 1.0 \n", + "6 12.0 18.0 \n", + "7 6.0 3.0 \n", + "8 14.0 17.0 \n", + "9 9.0 6.0 \n", + "10 11.0 14.0 \n", + "11 18.0 16.0 \n", + "12 17.0 8.0 \n", + "13 10.0 9.0 \n", + "14 20.0 16.0 \n", + "15 13.0 13.0 \n", + "16 19.0 15.0 \n", + "17 26.0 27.0 \n", + "18 7.0 7.0 \n", + "19 16.0 12.0 \n", + "20 23.0 21.0 \n", + "21 15.0 10.0 \n", + "22 26.0 31.0 \n", + "23 22.0 23.0 \n", + "24 25.0 29.0 \n", + "25 29.0 30.0 \n", + "26 21.0 24.0 \n", + "27 23.0 25.0 \n", + "28 28.0 20.0 \n", + "29 27.0 19.0 \n", + "30 24.0 22.0 \n", + "31 25.0 26.0 \n", + "32 31.0 33.0 \n", + "33 32.0 32.0 \n", + "34 30.0 28.0 \n", + "35 37.0 34.0 \n", + "36 34.0 36.0 \n", + "37 33.0 35.0 \n", + "38 36.0 37.0 \n", + "39 38.0 39.0 \n", + "40 35.0 38.0 \n", + "\n", + " gpqa_main_zeroshot(acc_norm,none)_rank hellaswag(acc_norm,none)_rank \\\n", + "0 9.0 3.0 \n", + "1 1.0 12.0 \n", + "2 11.0 6.0 \n", + "3 5.0 24.0 \n", + "4 12.0 5.0 \n", + "5 4.0 1.0 \n", + "6 20.0 14.0 \n", + "7 8.0 11.0 \n", + "8 6.0 9.0 \n", + "9 18.0 13.0 \n", + "10 7.0 10.0 \n", + "11 16.0 19.0 \n", + "12 13.0 29.0 \n", + "13 25.0 5.0 \n", + "14 24.0 2.0 \n", + "15 23.0 26.0 \n", + "16 3.0 16.0 \n", + "17 22.0 22.0 \n", + "18 15.0 4.0 \n", + "19 2.0 20.0 \n", + "20 12.0 27.0 \n", + "21 14.0 23.0 \n", + "22 17.0 31.0 \n", + "23 21.0 15.0 \n", + "24 26.0 25.0 \n", + "25 32.0 8.0 \n", + "26 19.0 7.0 \n", + "27 10.0 34.0 \n", + "28 24.0 30.0 \n", + "29 22.0 33.0 \n", + "30 23.0 35.0 \n", + "31 31.0 21.0 \n", + "32 34.0 18.0 \n", + "33 33.0 17.0 \n", + "34 27.0 28.0 \n", + "35 26.0 32.0 \n", + "36 30.0 36.0 \n", + "37 27.0 39.0 \n", + "38 24.0 40.0 \n", + "39 28.0 38.0 \n", + "40 29.0 37.0 \n", + "\n", + " piqa(acc_norm,none)_rank winogrande(acc,none)_rank boolq(acc,none)_rank \\\n", + "0 19.0 4.0 3.0 \n", + "1 14.0 11.0 1.0 \n", + "2 3.0 1.0 4.0 \n", + "3 21.0 22.0 6.0 \n", + "4 10.0 14.0 7.0 \n", + "5 4.0 2.0 2.0 \n", + "6 6.0 9.0 10.0 \n", + "7 5.0 8.0 12.0 \n", + "8 6.0 6.0 16.0 \n", + "9 9.0 3.0 5.0 \n", + "10 2.0 7.0 8.0 \n", + "11 18.0 13.0 19.0 \n", + "12 27.0 27.0 13.0 \n", + "13 12.0 12.0 5.0 \n", + "14 1.0 5.0 9.0 \n", + "15 22.0 17.0 17.0 \n", + "16 17.0 15.0 15.0 \n", + "17 11.0 11.0 23.0 \n", + "18 7.0 18.0 11.0 \n", + "19 25.0 24.0 14.0 \n", + "20 26.0 25.0 26.0 \n", + "21 19.0 20.0 24.0 \n", + "22 29.0 29.0 33.0 \n", + "23 11.0 16.0 18.0 \n", + "24 20.0 23.0 20.0 \n", + "25 8.0 10.0 22.0 \n", + "26 15.0 14.0 21.0 \n", + "27 33.0 33.0 28.0 \n", + "28 24.0 30.0 27.0 \n", + "29 32.0 31.0 30.0 \n", + "30 35.0 35.0 39.0 \n", + "31 23.0 26.0 25.0 \n", + "32 16.0 21.0 29.0 \n", + "33 13.0 19.0 34.0 \n", + "34 28.0 28.0 32.0 \n", + "35 30.0 32.0 35.0 \n", + "36 31.0 34.0 31.0 \n", + "37 37.0 38.0 36.0 \n", + 
"38 38.0 39.0 40.0 \n", + "39 36.0 37.0 38.0 \n", + "40 34.0 36.0 37.0 \n", + "\n", + " openbookqa(acc_norm,none)_rank sciq(acc_norm,none)_rank \\\n", + "0 1.0 6.0 \n", + "1 8.0 1.0 \n", + "2 7.0 2.0 \n", + "3 20.0 4.0 \n", + "4 2.0 12.0 \n", + "5 4.0 17.0 \n", + "6 9.0 7.0 \n", + "7 3.0 8.0 \n", + "8 16.0 3.0 \n", + "9 14.0 6.0 \n", + "10 6.0 5.0 \n", + "11 17.0 15.0 \n", + "12 23.0 15.0 \n", + "13 15.0 20.0 \n", + "14 5.0 9.0 \n", + "15 6.0 16.0 \n", + "16 14.0 14.0 \n", + "17 19.0 10.0 \n", + "18 7.0 21.0 \n", + "19 17.0 10.0 \n", + "20 28.0 15.0 \n", + "21 19.0 24.0 \n", + "22 24.0 17.0 \n", + "23 8.0 29.0 \n", + "24 21.0 27.0 \n", + "25 10.0 13.0 \n", + "26 12.0 26.0 \n", + "27 27.0 19.0 \n", + "28 22.0 11.0 \n", + "29 26.0 23.0 \n", + "30 30.0 32.0 \n", + "31 13.0 31.0 \n", + "32 11.0 25.0 \n", + "33 15.0 22.0 \n", + "34 18.0 18.0 \n", + "35 29.0 28.0 \n", + "36 25.0 32.0 \n", + "37 32.0 33.0 \n", + "38 33.0 35.0 \n", + "39 31.0 34.0 \n", + "40 29.0 30.0 \n", + "\n", + " qnli(acc,none)_rank mmlu(acc,none)_rank \\\n", + "0 8.0 6.0 \n", + "1 2.0 2.0 \n", + "2 9.0 14.0 \n", + "3 7.0 3.0 \n", + "4 4.0 4.0 \n", + "5 1.0 1.0 \n", + "6 28.0 8.0 \n", + "7 11.0 5.0 \n", + "8 30.0 12.0 \n", + "9 6.0 9.0 \n", + "10 39.0 15.0 \n", + "11 20.0 16.0 \n", + "12 3.0 10.0 \n", + "13 18.0 20.0 \n", + "14 26.0 22.0 \n", + "15 16.0 24.0 \n", + "16 10.0 18.0 \n", + "17 13.0 17.0 \n", + "18 19.0 7.0 \n", + "19 17.0 11.0 \n", + "20 21.0 19.0 \n", + "21 5.0 13.0 \n", + "22 33.0 23.0 \n", + "23 35.0 32.0 \n", + "24 25.0 27.0 \n", + "25 38.0 31.0 \n", + "26 22.0 28.0 \n", + "27 24.0 29.0 \n", + "28 15.0 21.0 \n", + "29 27.0 25.0 \n", + "30 12.0 26.0 \n", + "31 14.0 33.0 \n", + "32 31.0 37.0 \n", + "33 37.0 36.0 \n", + "34 32.0 30.0 \n", + "35 40.0 34.0 \n", + "36 41.0 39.0 \n", + "37 29.0 41.0 \n", + "38 34.0 40.0 \n", + "39 36.0 38.0 \n", + "40 23.0 35.0 \n", + "\n", + " nq_open(exact_match,remove_whitespace)_rank drop(f1,none)_rank \\\n", + "0 8.0 8.0 \n", + "1 15.0 18.0 \n", + "2 5.0 3.0 \n", + "3 16.0 13.0 \n", + "4 23.0 27.0 \n", + "5 19.0 25.0 \n", + "6 10.0 1.0 \n", + "7 7.0 29.0 \n", + "8 4.0 4.0 \n", + "9 36.0 9.0 \n", + "10 7.0 24.0 \n", + "11 6.0 5.0 \n", + "12 34.0 16.0 \n", + "13 25.0 14.0 \n", + "14 9.0 19.0 \n", + "15 13.0 20.0 \n", + "16 29.0 12.0 \n", + "17 3.0 2.0 \n", + "18 35.0 31.0 \n", + "19 33.0 30.0 \n", + "20 12.0 7.0 \n", + "21 37.0 21.0 \n", + "22 22.0 33.0 \n", + "23 18.0 15.0 \n", + "24 20.0 26.0 \n", + "25 1.0 38.0 \n", + "26 14.0 17.0 \n", + "27 28.0 35.0 \n", + "28 24.0 36.0 \n", + "29 30.0 23.0 \n", + "30 32.0 40.0 \n", + "31 17.0 11.0 \n", + "32 2.0 37.0 \n", + "33 11.0 34.0 \n", + "34 26.0 10.0 \n", + "35 21.0 6.0 \n", + "36 27.0 22.0 \n", + "37 38.0 32.0 \n", + "38 39.0 41.0 \n", + "39 31.0 28.0 \n", + "40 31.0 39.0 \n", + "\n", + " truthfulqa_mc1(acc,none)_rank truthfulqa_mc2(acc,none)_rank \\\n", + "0 8.0 8.0 \n", + "1 7.0 6.0 \n", + "2 15.0 19.0 \n", + "3 13.0 13.0 \n", + "4 2.0 2.0 \n", + "5 1.0 1.0 \n", + "6 19.0 26.0 \n", + "7 3.0 3.0 \n", + "8 12.0 14.0 \n", + "9 10.0 11.0 \n", + "10 18.0 22.0 \n", + "11 13.0 17.0 \n", + "12 11.0 12.0 \n", + "13 6.0 5.0 \n", + "14 4.0 4.0 \n", + "15 16.0 16.0 \n", + "16 9.0 15.0 \n", + "17 23.0 32.0 \n", + "18 8.0 9.0 \n", + "19 14.0 10.0 \n", + "20 17.0 20.0 \n", + "21 5.0 7.0 \n", + "22 20.0 23.0 \n", + "23 16.0 24.0 \n", + "24 19.0 18.0 \n", + "25 33.0 40.0 \n", + "26 30.0 33.0 \n", + "27 28.0 29.0 \n", + "28 21.0 27.0 \n", + "29 25.0 28.0 \n", + "30 24.0 25.0 \n", + "31 22.0 30.0 \n", + "32 34.0 38.0 \n", + "33 36.0 41.0 \n", + 
"34 29.0 37.0 \n", + "35 31.0 34.0 \n", + "36 35.0 39.0 \n", + "37 26.0 31.0 \n", + "38 27.0 21.0 \n", + "39 32.0 35.0 \n", + "40 31.0 36.0 \n", + "\n", + " triviaqa(exact_match,remove_whitespace)_rank Reasoning & Math Mean Score \\\n", + "0 23.0 0.6266 \n", + "1 13.0 0.5860 \n", + "2 3.0 0.5505 \n", + "3 18.0 0.6214 \n", + "4 17.0 0.5541 \n", + "5 35.0 0.5488 \n", + "6 4.0 0.5206 \n", + "7 12.0 0.5245 \n", + "8 7.0 0.5366 \n", + "9 15.0 0.5399 \n", + "10 5.0 0.5446 \n", + "11 8.0 0.5286 \n", + "12 26.0 0.5712 \n", + "13 11.0 0.5184 \n", + "14 2.0 0.4704 \n", + "15 19.0 0.5374 \n", + "16 16.0 0.5006 \n", + "17 10.0 0.4495 \n", + "18 39.0 0.5285 \n", + "19 36.0 0.5387 \n", + "20 14.0 0.4688 \n", + "21 21.0 0.3823 \n", + "22 27.0 0.5010 \n", + "23 20.0 0.4244 \n", + "24 28.0 0.4469 \n", + "25 1.0 0.3719 \n", + "26 24.0 0.4143 \n", + "27 34.0 0.4841 \n", + "28 22.0 0.3874 \n", + "29 32.0 0.4493 \n", + "30 40.0 0.4997 \n", + "31 29.0 0.3674 \n", + "32 6.0 0.3361 \n", + "33 9.0 0.3377 \n", + "34 31.0 0.3702 \n", + "35 25.0 0.3450 \n", + "36 30.0 0.3312 \n", + "37 38.0 0.4009 \n", + "38 41.0 0.4085 \n", + "39 37.0 0.3494 \n", + "40 33.0 0.2914 \n", + "\n", + " Reasoning & Math Avg. Rank Commonsense & NLI Mean Score \\\n", + "0 1 0.7737 \n", + "1 3 0.7807 \n", + "2 6 0.7726 \n", + "3 2 0.7468 \n", + "4 5 0.7730 \n", + "5 7 0.7941 \n", + "6 16 0.7266 \n", + "7 15 0.7564 \n", + "8 12 0.7249 \n", + "9 9 0.7691 \n", + "10 8 0.7328 \n", + "11 13 0.7147 \n", + "12 4 0.7266 \n", + "13 17 0.7284 \n", + "14 22 0.7403 \n", + "15 11 0.7167 \n", + "16 19 0.7374 \n", + "17 24 0.7199 \n", + "18 14 0.7274 \n", + "19 10 0.7094 \n", + "20 23 0.6788 \n", + "21 32 0.7367 \n", + "22 18 0.6587 \n", + "23 27 0.7090 \n", + "24 26 0.6928 \n", + "25 33 0.7157 \n", + "26 28 0.7153 \n", + "27 21 0.6422 \n", + "28 31 0.6803 \n", + "29 25 0.6442 \n", + "30 20 0.6184 \n", + "31 35 0.6978 \n", + "32 39 0.6956 \n", + "33 38 0.6886 \n", + "34 34 0.6711 \n", + "35 37 0.6264 \n", + "36 40 0.6267 \n", + "37 30 0.5703 \n", + "38 29 0.5181 \n", + "39 36 0.5696 \n", + "40 41 0.6039 \n", + "\n", + " Commonsense & NLI Avg. Rank Knowledge & Reading Mean Score \\\n", + "0 3 0.3791 \n", + "1 2 0.3926 \n", + "2 5 0.4136 \n", + "3 8 0.3566 \n", + "4 4 0.3810 \n", + "5 1 0.3581 \n", + "6 15 0.4369 \n", + "7 7 0.3963 \n", + "8 17 0.4127 \n", + "9 6 0.3467 \n", + "10 12 0.3683 \n", + "11 22 0.3923 \n", + "12 16 0.3226 \n", + "13 13 0.3719 \n", + "14 9 0.4045 \n", + "15 19 0.3261 \n", + "16 10 0.3339 \n", + "17 18 0.4063 \n", + "18 14 0.2919 \n", + "19 23 0.2834 \n", + "20 30 0.3438 \n", + "21 11 0.3406 \n", + "22 32 0.2827 \n", + "23 24 0.3007 \n", + "24 27 0.2805 \n", + "25 20 0.3374 \n", + "26 21 0.2864 \n", + "27 34 0.2340 \n", + "28 29 0.2903 \n", + "29 33 0.2567 \n", + "30 37 0.2276 \n", + "31 25 0.2656 \n", + "32 26 0.3018 \n", + "33 28 0.2864 \n", + "34 31 0.2581 \n", + "35 36 0.2731 \n", + "36 35 0.2202 \n", + "37 39 0.1954 \n", + "38 41 0.1983 \n", + "39 40 0.2000 \n", + "40 38 0.2218 \n", + "\n", + " Knowledge & Reading Avg. 
Rank Mean Score \n", + "0 10 0.6038 \n", + "1 7 0.5961 \n", + "2 2 0.5871 \n", + "3 14 0.5859 \n", + "4 9 0.5788 \n", + "5 13 0.5775 \n", + "6 1 0.5676 \n", + "7 6 0.5672 \n", + "8 3 0.5653 \n", + "9 15 0.5621 \n", + "10 12 0.5576 \n", + "11 8 0.5528 \n", + "12 21 0.5510 \n", + "13 11 0.5480 \n", + "14 5 0.5451 \n", + "15 20 0.5368 \n", + "16 19 0.5335 \n", + "17 4 0.5312 \n", + "18 24 0.5271 \n", + "19 28 0.5219 \n", + "20 16 0.5048 \n", + "21 17 0.4939 \n", + "22 29 0.4907 \n", + "23 23 0.4869 \n", + "24 30 0.4830 \n", + "25 18 0.4819 \n", + "26 26 0.4813 \n", + "27 35 0.4644 \n", + "28 25 0.4608 \n", + "29 34 0.4597 \n", + "30 36 0.4596 \n", + "31 32 0.4525 \n", + "32 22 0.4516 \n", + "33 27 0.4451 \n", + "34 33 0.4419 \n", + "35 31 0.4219 \n", + "36 38 0.4013 \n", + "37 41 0.3986 \n", + "38 40 0.3838 \n", + "39 39 0.3816 \n", + "40 37 0.3799 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'total_time_raw': '18d 7h 55m', 'gpu_util_time_raw': '14d 23h 41m'}\n" + ] + } + ], + "source": [ + "\n", + "GROUPS = {\n", + " \"Reasoning & Math\": [\n", + " \"gsm8k(exact_match,strict-match)\", \n", + " \"bbh(exact_match,get-answer)\", \n", + " \"arc_challenge(acc_norm,none)\", 'anli_r1(acc,none)',\n", + " 'anli_r2(acc,none)', 'anli_r3(acc,none)',\n", + " \"gpqa_main_zeroshot(acc_norm,none)\",\n", + " ],\n", + " \"Commonsense & NLI\": [\n", + " \"hellaswag(acc_norm,none)\",\n", + " \"piqa(acc_norm,none)\", \"winogrande(acc,none)\", \"boolq(acc,none)\",\n", + " \"openbookqa(acc_norm,none)\", \"sciq(acc_norm,none)\", \"qnli(acc,none)\",\n", + " ],\n", + " \"Knowledge & Reading\": [\n", + " \"mmlu(acc,none)\", \"nq_open(exact_match,remove_whitespace)\", \"drop(f1,none)\",\n", + " \"truthfulqa_mc1(acc,none)\", 'truthfulqa_mc2(acc,none)','triviaqa(exact_match,remove_whitespace)',\n", + " ],\n", + "}\n", + "\n", + "\n", + "\n", + "def add_task_ranks(df, task_cols):\n", + " df = df.copy()\n", + " for col in task_cols:\n", + " if col not in df.columns: \n", + " raise ValueError(f\"No task: {col}\")\n", + " # rank: 1 = best; NaN scores get ranked at the bottom\n", + " df[f\"{col}_rank\"] = df[col].rank(ascending=False, method=\"dense\", na_option=\"bottom\")\n", + " return df\n", + "\n", + "def add_group_ranks(df, groups):\n", + " df = df.copy()\n", + " for gname, cols in groups.items():\n", + " # strip task name before \"(\" if any\n", + " tasks = [c for c in cols]\n", + " mean_col = f\"{gname}_mean\"\n", + " rank_col = f\"{gname}_rank\"\n", + " df[mean_col] = df[tasks].mean(axis=1)\n", + " df[rank_col] = df[mean_col].rank(ascending=False, method=\"dense\", na_option=\"bottom\").astype(int)\n", + " return df\n", + "\n", + "\n", + "def add_overall_rank(df, groups):\n", + " df = df.copy()\n", + " all_tasks = [c for cols in groups.values() for c in cols]\n", + "\n", + " # overall mean score across all tasks\n", + " df[\"overall_mean\"] = df[all_tasks].mean(axis=1, skipna=True)\n", + "\n", + " # higher = better → rank descending\n", + " df[\"overall_rank\"] = df[\"overall_mean\"].rank(\n", + " ascending=False, method=\"dense\", na_option=\"bottom\"\n", + " ).astype(int)\n", + " return df\n", + "\n", + "\n", + "all_task_cols = [c for cols in GROUPS.values() for c in cols]\n", + "\n", + "df_task_ranked = add_task_ranks(result_gpu_merged, all_task_cols)\n", + "df_group_ranked = add_group_ranks(df_task_ranked, GROUPS)\n", + "leaderboard = add_overall_rank(df_group_ranked, GROUPS)\n", + "\n", + "\n", + "col = \"overall_rank\" # the 
one you want first\n", + "cols = [col] + [c for c in leaderboard.columns if c != col]\n", + "df = leaderboard[cols]\n", + "df = df.sort_values(by=col, ascending=True).reset_index(drop=True)\n", + "\n", + "# Add quantization marker\n", + "targets = ['Qwen_Qwen3-14B', 'Qwen_Qwen2.5-14B-Instruct'] # use hyphen\n", + "mask = df['model_name'].isin(targets)\n", + "df.loc[mask, 'model_name'] = df.loc[mask, 'model_name'] + ' (8bit)'\n", + "\n", + "# display(df)\n", + "\n", + "df_display = df.rename(columns={\n", + " \"overall_rank\": \"Overall Rank\",\n", + " \"model_name\": \"Model Name\",\n", + " \"gpu_util_time\": \"GPU Util Time\",\n", + " \"total_time\": \"Total Time\",\n", + " \"parameters\": \"Parameters\",\n", + " 'Reasoning & Math_rank': 'Reasoning & Math Avg. Rank',\n", + " 'Commonsense & NLI_rank': 'Commonsense & NLI Avg. Rank',\n", + " 'Knowledge & Reading_rank': 'Knowledge & Reading Avg. Rank',\n", + " 'overall_mean': 'Mean Score',\n", + " 'Reasoning & Math_mean': 'Reasoning & Math Mean Score',\n", + " 'Commonsense & NLI_mean': 'Commonsense & NLI Mean Score',\n", + " 'Knowledge & Reading_mean': 'Knowledge & Reading Mean Score',\n", + "})\n", + "\n", + "cols_to_round = [\"Mean Score\", \"Reasoning & Math Mean Score\", \"Commonsense & NLI Mean Score\", \"Knowledge & Reading Mean Score\"] \n", + "df_display[cols_to_round] = df_display[cols_to_round].round(4)\n", + "\n", + "display(df_display)\n", + "df.to_csv(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_master.csv\")\n", + "\n", + "\n", + " \n", + "# Total time calculation\n", + "def format_seconds(secs: int) -> str:\n", + " days, rem = divmod(int(secs), 86400) # 86400 sec = 1 day\n", + " hours, rem = divmod(rem, 3600) # 3600 sec = 1 hour\n", + " minutes, _ = divmod(rem, 60)\n", + " return f\"{days}d {hours}h {minutes}m\"\n", + "\n", + "# Example usage with df_display\n", + "totals = {}\n", + "for col in [\"total_time_raw\", \"gpu_util_time_raw\"]:\n", + " total_secs = df_display[col].sum()\n", + " totals[col] = format_seconds(total_secs)\n", + "\n", + "print(totals)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "b3ce5953-3a36-436a-ba4c-46bedd2b4c56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "overall\n" + ] + }, + { + "data": { + "text/html": [ + "
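As a sanity check on the ranking logic above, here is a minimal, self-contained sketch of the same three steps on a toy frame (model and task names are made up): per-task dense ranks, a group mean, and an overall rank derived from that mean.

```python
import pandas as pd

# Toy scores for three hypothetical models on two hypothetical tasks.
toy = pd.DataFrame({
    "model_name": ["model_a", "model_b", "model_c"],
    "task_x": [0.90, 0.75, 0.75],
    "task_y": [0.60, 0.80, 0.70],
})

# Per-task dense rank, 1 = best; ties share a rank (both 0.75s rank 2)
# and NaN scores would sink to the bottom, as in add_task_ranks above.
for col in ["task_x", "task_y"]:
    toy[f"{col}_rank"] = toy[col].rank(
        ascending=False, method="dense", na_option="bottom"
    )

# Mean over the group's tasks, then a dense rank on that mean,
# the same shape as add_group_ranks / add_overall_rank above.
toy["overall_mean"] = toy[["task_x", "task_y"]].mean(axis=1, skipna=True)
toy["overall_rank"] = (
    toy["overall_mean"].rank(ascending=False, method="dense").astype(int)
)

print(toy[["model_name", "task_x_rank", "task_y_rank", "overall_rank"]])
```

One consequence of `method="dense"` visible in the tables above: tied scores collapse onto a single rank value, so a rank column can repeat a number without skipping any.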
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotal TimeGPU Util TimeMean ScoreOverall Rank
1google_gemma-3-12b-it15h 45m14h 8m0.60381
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.59612
3openchat_openchat-3.6-8b-202405227h 51m6h 59m0.58713
4Qwen_Qwen3-8B15h 31m13h 44m0.58594
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.57885
6Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.57756
701-ai_Yi-1.5-9B11h 43m10h 26m0.56767
8Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.56728
9meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.56539
1001-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.562110
11mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.557611
12meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.552812
13Qwen_Qwen3-4B5h 51m5h 3m0.551013
14NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.548014
15mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.545115
16google_gemma-3-4b-it4h 51m3h 50m0.536816
1701-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.533517
1801-ai_Yi-1.5-6B4h 28m3h 54m0.531218
19Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.527119
20deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.521920
21meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.504821
22Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.493922
23Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.490723
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.486924
25deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.483025
26meta-llama_Llama-2-13b-hf19h 21m17h 38m0.481926
27meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.481327
28deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.464428
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.460829
30Qwen_Qwen3-1.7B4h 25m3h 36m0.459730
31Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.459631
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.452532
33meta-llama_Llama-2-7b-hf5h 42m4h 59m0.451633
34deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.445134
35deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.441935
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.421936
37google_gemma-3-1b-it6h 50m4h 52m0.401337
38deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.398638
39Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.383839
40Qwen_Qwen3-0.6B3h 45m2h 53m0.381640
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.379941
\n", + "
" + ], + "text/plain": [ + " Model Name Total Time GPU Util Time \\\n", + "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", + "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", + "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", + "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n", + "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", + "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", + "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", + "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", + "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", + "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", + "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", + "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", + "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n", + "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", + "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", + "16 google_gemma-3-4b-it 4h 51m 3h 50m \n", + "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", + "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", + "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", + "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", + "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", + "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", + "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", + "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", + "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", + "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", + "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", + "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", + "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", + "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", + "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", + "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", + "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", + "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", + "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", + "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", + "37 google_gemma-3-1b-it 6h 50m 4h 52m \n", + "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", + "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", + "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", + "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", + "\n", + " Mean Score Overall Rank \n", + "1 0.6038 1 \n", + "2 0.5961 2 \n", + "3 0.5871 3 \n", + "4 0.5859 4 \n", + "5 0.5788 5 \n", + "6 0.5775 6 \n", + "7 0.5676 7 \n", + "8 0.5672 8 \n", + "9 0.5653 9 \n", + "10 0.5621 10 \n", + "11 0.5576 11 \n", + "12 0.5528 12 \n", + "13 0.5510 13 \n", + "14 0.5480 14 \n", + "15 0.5451 15 \n", + "16 0.5368 16 \n", + "17 0.5335 17 \n", + "18 0.5312 18 \n", + "19 0.5271 19 \n", + "20 0.5219 20 \n", + "21 0.5048 21 \n", + "22 0.4939 22 \n", + "23 0.4907 23 \n", + "24 0.4869 24 \n", + "25 0.4830 25 \n", + "26 0.4819 26 \n", + "27 0.4813 27 \n", + "28 0.4644 28 \n", + "29 0.4608 29 \n", + "30 0.4597 30 \n", + "31 0.4596 31 \n", + "32 0.4525 32 \n", + "33 0.4516 33 \n", + "34 0.4451 34 \n", + "35 0.4419 35 \n", + "36 0.4219 36 \n", + "37 0.4013 37 \n", + "38 0.3986 38 \n", + "39 0.3838 39 \n", + "40 0.3816 40 \n", + "41 0.3799 41 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| Model Name | Total Time | GPU Util Time | Mean Score | Overall Rank |\n", + "|:------------------------------------------|:-------------|:----------------|-------------:|---------------:|\n", + "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6038 | 1 |\n", + "| Qwen_Qwen3-14B (8bit) | 29h 
45m | 17h 29m | 0.5961 | 2 |\n", + "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5871 | 3 |\n", + "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.5859 | 4 |\n", + "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5788 | 5 |\n", + "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5775 | 6 |\n", + "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5676 | 7 |\n", + "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5672 | 8 |\n", + "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5653 | 9 |\n", + "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5621 | 10 |\n", + "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5576 | 11 |\n", + "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5528 | 12 |\n", + "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.551 | 13 |\n", + "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.548 | 14 |\n", + "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.5451 | 15 |\n", + "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5368 | 16 |\n", + "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5335 | 17 |\n", + "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.5312 | 18 |\n", + "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5271 | 19 |\n", + "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5219 | 20 |\n", + "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.5048 | 21 |\n", + "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.4939 | 22 |\n", + "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.4907 | 23 |\n", + "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4869 | 24 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.483 | 25 |\n", + "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.4819 | 26 |\n", + "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4813 | 27 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4644 | 28 |\n", + "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.4608 | 29 |\n", + "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4597 | 30 |\n", + "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4596 | 31 |\n", + "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.4525 | 32 |\n", + "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.4516 | 33 |\n", + "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.4451 | 34 |\n", + "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.4419 | 35 |\n", + "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.4219 | 36 |\n", + "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.4013 | 37 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.3986 | 38 |\n", + "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.3838 | 39 |\n", + "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3816 | 40 |\n", + "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.3799 | 41 |\n", + "\n", + "\n", + "reasoning_and_math\n" + ] + }, + { + "data": { + "text/html": [ + "
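The pipe-style tables printed above look like `DataFrame.to_markdown` output (pandas delegates this to the optional `tabulate` package). The generating cell is not shown here, but a minimal sketch of that call, using a stand-in slice of the display frame, would be:

```python
import pandas as pd

# Stand-in slice; values copied from the overall table above.
demo = pd.DataFrame({
    "Model Name": ["google_gemma-3-12b-it", "Qwen_Qwen3-14B (8bit)"],
    "Mean Score": [0.6038, 0.5961],
    "Overall Rank": [1, 2],
})

# index=False drops the integer index, matching the printed layout.
# Requires `pip install tabulate`.
print(demo.to_markdown(index=False))
```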
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotal TimeGPU Util TimeReasoning & Math Mean ScoreReasoning & Math Avg. Rank
1google_gemma-3-12b-it15h 45m14h 8m0.62661
2Qwen_Qwen3-8B15h 31m13h 44m0.62142
3Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.58603
4Qwen_Qwen3-4B5h 51m5h 3m0.57124
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.55415
6openchat_openchat-3.6-8b-202405227h 51m6h 59m0.55056
7Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.54887
8mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.54468
901-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.53999
10deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.538710
11google_gemma-3-4b-it4h 51m3h 50m0.537411
12meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.536612
13meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.528613
14Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.528514
15Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.524515
1601-ai_Yi-1.5-9B11h 43m10h 26m0.520616
17NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.518417
18Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.501018
1901-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.500619
20Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.499720
21deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.484121
22mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.470422
23meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.468823
2401-ai_Yi-1.5-6B4h 28m3h 54m0.449524
25Qwen_Qwen3-1.7B4h 25m3h 36m0.449325
26deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.446926
27deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.424427
28meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.414328
29Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.408529
30deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.400930
31Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.387431
32Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.382332
33meta-llama_Llama-2-13b-hf19h 21m17h 38m0.371933
34deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.370234
35meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.367435
36Qwen_Qwen3-0.6B3h 45m2h 53m0.349436
37meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.345037
38deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.337738
39meta-llama_Llama-2-7b-hf5h 42m4h 59m0.336139
40google_gemma-3-1b-it6h 50m4h 52m0.331240
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.291441
\n", + "
" + ], + "text/plain": [ + " Model Name Total Time GPU Util Time \\\n", + "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", + "2 Qwen_Qwen3-8B 15h 31m 13h 44m \n", + "3 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", + "4 Qwen_Qwen3-4B 5h 51m 5h 3m \n", + "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", + "6 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", + "7 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", + "8 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", + "9 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", + "10 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", + "11 google_gemma-3-4b-it 4h 51m 3h 50m \n", + "12 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", + "13 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", + "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", + "15 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", + "16 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", + "17 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", + "18 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", + "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", + "20 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", + "21 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", + "22 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", + "23 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", + "24 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", + "25 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", + "26 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", + "27 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", + "28 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", + "29 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", + "30 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", + "31 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", + "32 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", + "33 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", + "34 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", + "35 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", + "36 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", + "37 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", + "38 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", + "39 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", + "40 google_gemma-3-1b-it 6h 50m 4h 52m \n", + "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", + "\n", + " Reasoning & Math Mean Score Reasoning & Math Avg. Rank \n", + "1 0.6266 1 \n", + "2 0.6214 2 \n", + "3 0.5860 3 \n", + "4 0.5712 4 \n", + "5 0.5541 5 \n", + "6 0.5505 6 \n", + "7 0.5488 7 \n", + "8 0.5446 8 \n", + "9 0.5399 9 \n", + "10 0.5387 10 \n", + "11 0.5374 11 \n", + "12 0.5366 12 \n", + "13 0.5286 13 \n", + "14 0.5285 14 \n", + "15 0.5245 15 \n", + "16 0.5206 16 \n", + "17 0.5184 17 \n", + "18 0.5010 18 \n", + "19 0.5006 19 \n", + "20 0.4997 20 \n", + "21 0.4841 21 \n", + "22 0.4704 22 \n", + "23 0.4688 23 \n", + "24 0.4495 24 \n", + "25 0.4493 25 \n", + "26 0.4469 26 \n", + "27 0.4244 27 \n", + "28 0.4143 28 \n", + "29 0.4085 29 \n", + "30 0.4009 30 \n", + "31 0.3874 31 \n", + "32 0.3823 32 \n", + "33 0.3719 33 \n", + "34 0.3702 34 \n", + "35 0.3674 35 \n", + "36 0.3494 36 \n", + "37 0.3450 37 \n", + "38 0.3377 38 \n", + "39 0.3361 39 \n", + "40 0.3312 40 \n", + "41 0.2914 41 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| Model Name | Total Time | GPU Util Time | Reasoning & Math Mean Score | Reasoning & Math Avg. 
Rank |\n", + "|:------------------------------------------|:-------------|:----------------|------------------------------:|-----------------------------:|\n", + "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.6266 | 1 |\n", + "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.6214 | 2 |\n", + "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.586 | 3 |\n", + "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.5712 | 4 |\n", + "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.5541 | 5 |\n", + "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.5505 | 6 |\n", + "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.5488 | 7 |\n", + "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.5446 | 8 |\n", + "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.5399 | 9 |\n", + "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.5387 | 10 |\n", + "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.5374 | 11 |\n", + "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.5366 | 12 |\n", + "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.5286 | 13 |\n", + "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.5285 | 14 |\n", + "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.5245 | 15 |\n", + "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.5206 | 16 |\n", + "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.5184 | 17 |\n", + "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.501 | 18 |\n", + "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.5006 | 19 |\n", + "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.4997 | 20 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.4841 | 21 |\n", + "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4704 | 22 |\n", + "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.4688 | 23 |\n", + "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4495 | 24 |\n", + "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.4493 | 25 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.4469 | 26 |\n", + "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.4244 | 27 |\n", + "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.4143 | 28 |\n", + "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.4085 | 29 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.4009 | 30 |\n", + "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.3874 | 31 |\n", + "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3823 | 32 |\n", + "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3719 | 33 |\n", + "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.3702 | 34 |\n", + "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.3674 | 35 |\n", + "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.3494 | 36 |\n", + "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.345 | 37 |\n", + "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.3377 | 38 |\n", + "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3361 | 39 |\n", + "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.3312 | 40 |\n", + "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2914 | 41 |\n", + "\n", + "\n", + "commonsense_and_nli\n" + ] + }, + { + "data": { + "text/html": [ + "
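Each group section repeats the same metadata columns next to that group's mean score and average rank. A sketch of how such views could be sliced from the full `leaderboard` frame (the helper name is hypothetical; column names are inferred from the displays above):

```python
import pandas as pd

def group_view(leaderboard: pd.DataFrame, group_name: str) -> pd.DataFrame:
    """Return one group's leaderboard view, best model first.

    Assumes the '<group>_mean' / '<group>_rank' columns produced by
    add_group_ranks above; 'gpu_util_time' is inferred from the displays.
    """
    cols = [
        "model_name", "total_time", "gpu_util_time",
        f"{group_name}_mean", f"{group_name}_rank",
    ]
    return (
        leaderboard[cols]
        .sort_values(f"{group_name}_rank")
        .reset_index(drop=True)
    )

# Example: group_view(leaderboard, "Commonsense & NLI")
```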
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotal TimeGPU Util TimeCommonsense & NLI Mean ScoreCommonsense & NLI Avg. Rank
1Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.79411
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.78072
3google_gemma-3-12b-it15h 45m14h 8m0.77373
4Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.77304
5openchat_openchat-3.6-8b-202405227h 51m6h 59m0.77265
601-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.76916
7Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.75647
8Qwen_Qwen3-8B15h 31m13h 44m0.74688
9mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.74039
1001-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.737410
11Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.736711
12mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.732812
13NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.728413
14Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.727414
1501-ai_Yi-1.5-9B11h 43m10h 26m0.726615
16Qwen_Qwen3-4B5h 51m5h 3m0.726616
17meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.724917
1801-ai_Yi-1.5-6B4h 28m3h 54m0.719918
19google_gemma-3-4b-it4h 51m3h 50m0.716719
20meta-llama_Llama-2-13b-hf19h 21m17h 38m0.715720
21meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.715321
22meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.714722
23deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.709423
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.709024
25meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.697825
26meta-llama_Llama-2-7b-hf5h 42m4h 59m0.695626
27deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.692827
28deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.688628
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.680329
30meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.678830
31deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.671131
32Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.658732
33Qwen_Qwen3-1.7B4h 25m3h 36m0.644233
34deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.642234
35google_gemma-3-1b-it6h 50m4h 52m0.626735
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.626436
37Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.618437
38Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.603938
39deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.570339
40Qwen_Qwen3-0.6B3h 45m2h 53m0.569640
41Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.518141
\n", + "
" + ], + "text/plain": [ + " Model Name Total Time GPU Util Time \\\n", + "1 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", + "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", + "3 google_gemma-3-12b-it 15h 45m 14h 8m \n", + "4 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", + "5 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", + "6 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", + "7 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", + "8 Qwen_Qwen3-8B 15h 31m 13h 44m \n", + "9 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", + "10 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", + "11 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", + "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", + "13 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", + "14 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", + "15 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", + "16 Qwen_Qwen3-4B 5h 51m 5h 3m \n", + "17 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", + "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", + "19 google_gemma-3-4b-it 4h 51m 3h 50m \n", + "20 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", + "21 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", + "22 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", + "23 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", + "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", + "25 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", + "26 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", + "27 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", + "28 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", + "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", + "30 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", + "31 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", + "32 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", + "33 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", + "34 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", + "35 google_gemma-3-1b-it 6h 50m 4h 52m \n", + "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", + "37 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", + "38 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", + "39 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", + "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", + "41 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", + "\n", + " Commonsense & NLI Mean Score Commonsense & NLI Avg. Rank \n", + "1 0.7941 1 \n", + "2 0.7807 2 \n", + "3 0.7737 3 \n", + "4 0.7730 4 \n", + "5 0.7726 5 \n", + "6 0.7691 6 \n", + "7 0.7564 7 \n", + "8 0.7468 8 \n", + "9 0.7403 9 \n", + "10 0.7374 10 \n", + "11 0.7367 11 \n", + "12 0.7328 12 \n", + "13 0.7284 13 \n", + "14 0.7274 14 \n", + "15 0.7266 15 \n", + "16 0.7266 16 \n", + "17 0.7249 17 \n", + "18 0.7199 18 \n", + "19 0.7167 19 \n", + "20 0.7157 20 \n", + "21 0.7153 21 \n", + "22 0.7147 22 \n", + "23 0.7094 23 \n", + "24 0.7090 24 \n", + "25 0.6978 25 \n", + "26 0.6956 26 \n", + "27 0.6928 27 \n", + "28 0.6886 28 \n", + "29 0.6803 29 \n", + "30 0.6788 30 \n", + "31 0.6711 31 \n", + "32 0.6587 32 \n", + "33 0.6442 33 \n", + "34 0.6422 34 \n", + "35 0.6267 35 \n", + "36 0.6264 36 \n", + "37 0.6184 37 \n", + "38 0.6039 38 \n", + "39 0.5703 39 \n", + "40 0.5696 40 \n", + "41 0.5181 41 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| Model Name | Total Time | GPU Util Time | Commonsense & NLI Mean Score | Commonsense & NLI Avg. 
Rank |\n", + "|:------------------------------------------|:-------------|:----------------|-------------------------------:|------------------------------:|\n", + "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.7941 | 1 |\n", + "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.7807 | 2 |\n", + "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.7737 | 3 |\n", + "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.773 | 4 |\n", + "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.7726 | 5 |\n", + "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.7691 | 6 |\n", + "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.7564 | 7 |\n", + "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.7468 | 8 |\n", + "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.7403 | 9 |\n", + "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.7374 | 10 |\n", + "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.7367 | 11 |\n", + "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.7328 | 12 |\n", + "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.7284 | 13 |\n", + "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.7274 | 14 |\n", + "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.7266 | 15 |\n", + "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.7266 | 16 |\n", + "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.7249 | 17 |\n", + "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.7199 | 18 |\n", + "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.7167 | 19 |\n", + "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.7157 | 20 |\n", + "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.7153 | 21 |\n", + "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.7147 | 22 |\n", + "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.7094 | 23 |\n", + "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.709 | 24 |\n", + "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.6978 | 25 |\n", + "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.6956 | 26 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.6928 | 27 |\n", + "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.6886 | 28 |\n", + "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.6803 | 29 |\n", + "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.6788 | 30 |\n", + "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.6711 | 31 |\n", + "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.6587 | 32 |\n", + "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.6442 | 33 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.6422 | 34 |\n", + "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.6267 | 35 |\n", + "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.6264 | 36 |\n", + "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.6184 | 37 |\n", + "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.6039 | 38 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.5703 | 39 |\n", + "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.5696 | 40 |\n", + "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.5181 | 41 |\n", + "\n", + "\n", + "knowledge_and_reading\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotal TimeGPU Util TimeKnowledge & Reading Mean ScoreKnowledge & Reading Avg. Rank
101-ai_Yi-1.5-9B11h 43m10h 26m0.43691
2openchat_openchat-3.6-8b-202405227h 51m6h 59m0.41362
3meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.41273
401-ai_Yi-1.5-6B4h 28m3h 54m0.40634
5mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.40455
6Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.39636
7Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.39267
8meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.39238
9Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.38109
10google_gemma-3-12b-it15h 45m14h 8m0.379110
11NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.371911
12mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.368312
13Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.358113
14Qwen_Qwen3-8B15h 31m13h 44m0.356614
1501-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.346715
16meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.343816
17Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.340617
18meta-llama_Llama-2-13b-hf19h 21m17h 38m0.337418
1901-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.333919
20google_gemma-3-4b-it4h 51m3h 50m0.326120
21Qwen_Qwen3-4B5h 51m5h 3m0.322621
22meta-llama_Llama-2-7b-hf5h 42m4h 59m0.301822
23deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.300723
24Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.291924
25Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.290325
26meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.286426
27deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.286427
28deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.283428
29Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.282729
30deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.280530
31meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.273131
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.265632
33deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.258133
34Qwen_Qwen3-1.7B4h 25m3h 36m0.256734
35deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.234035
36Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.227636
37Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.221837
38google_gemma-3-1b-it6h 50m4h 52m0.220238
39Qwen_Qwen3-0.6B3h 45m2h 53m0.200039
40Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.198340
41deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.195441
\n", + "
" + ], + "text/plain": [ + " Model Name Total Time GPU Util Time \\\n", + "1 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", + "2 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", + "3 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", + "4 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", + "5 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", + "6 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", + "7 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", + "8 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", + "9 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", + "10 google_gemma-3-12b-it 15h 45m 14h 8m \n", + "11 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", + "12 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", + "13 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", + "14 Qwen_Qwen3-8B 15h 31m 13h 44m \n", + "15 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", + "16 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", + "17 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", + "18 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", + "19 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", + "20 google_gemma-3-4b-it 4h 51m 3h 50m \n", + "21 Qwen_Qwen3-4B 5h 51m 5h 3m \n", + "22 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", + "23 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", + "24 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", + "25 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", + "26 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", + "27 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", + "28 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", + "29 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", + "30 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", + "31 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", + "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", + "33 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", + "34 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", + "35 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", + "36 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", + "37 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", + "38 google_gemma-3-1b-it 6h 50m 4h 52m \n", + "39 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", + "40 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", + "41 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", + "\n", + " Knowledge & Reading Mean Score Knowledge & Reading Avg. Rank \n", + "1 0.4369 1 \n", + "2 0.4136 2 \n", + "3 0.4127 3 \n", + "4 0.4063 4 \n", + "5 0.4045 5 \n", + "6 0.3963 6 \n", + "7 0.3926 7 \n", + "8 0.3923 8 \n", + "9 0.3810 9 \n", + "10 0.3791 10 \n", + "11 0.3719 11 \n", + "12 0.3683 12 \n", + "13 0.3581 13 \n", + "14 0.3566 14 \n", + "15 0.3467 15 \n", + "16 0.3438 16 \n", + "17 0.3406 17 \n", + "18 0.3374 18 \n", + "19 0.3339 19 \n", + "20 0.3261 20 \n", + "21 0.3226 21 \n", + "22 0.3018 22 \n", + "23 0.3007 23 \n", + "24 0.2919 24 \n", + "25 0.2903 25 \n", + "26 0.2864 26 \n", + "27 0.2864 27 \n", + "28 0.2834 28 \n", + "29 0.2827 29 \n", + "30 0.2805 30 \n", + "31 0.2731 31 \n", + "32 0.2656 32 \n", + "33 0.2581 33 \n", + "34 0.2567 34 \n", + "35 0.2340 35 \n", + "36 0.2276 36 \n", + "37 0.2218 37 \n", + "38 0.2202 38 \n", + "39 0.2000 39 \n", + "40 0.1983 40 \n", + "41 0.1954 41 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "| Model Name | Total Time | GPU Util Time | Knowledge & Reading Mean Score | Knowledge & Reading Avg. 
Rank |\n", + "|:------------------------------------------|:-------------|:----------------|---------------------------------:|--------------------------------:|\n", + "| 01-ai_Yi-1.5-9B | 11h 43m | 10h 26m | 0.4369 | 1 |\n", + "| openchat_openchat-3.6-8b-20240522 | 7h 51m | 6h 59m | 0.4136 | 2 |\n", + "| meta-llama_Llama-3.1-8B-Instruct | 12h 19m | 10h 52m | 0.4127 | 3 |\n", + "| 01-ai_Yi-1.5-6B | 4h 28m | 3h 54m | 0.4063 | 4 |\n", + "| mistralai_Mistral-7B-Instruct-v0.3 | 8h 38m | 7h 41m | 0.4045 | 5 |\n", + "| Qwen_Qwen2.5-7B-Instruct-1M | 11h 17m | 10h 10m | 0.3963 | 6 |\n", + "| Qwen_Qwen3-14B (8bit) | 29h 45m | 17h 29m | 0.3926 | 7 |\n", + "| meta-llama_Meta-Llama-3-8B-Instruct | 6h 30m | 5h 46m | 0.3923 | 8 |\n", + "| Qwen_Qwen2.5-7B-Instruct | 9h 36m | 8h 33m | 0.381 | 9 |\n", + "| google_gemma-3-12b-it | 15h 45m | 14h 8m | 0.3791 | 10 |\n", + "| NousResearch_Hermes-2-Pro-Mistral-7B | 8h 27m | 7h 28m | 0.3719 | 11 |\n", + "| mistralai_Ministral-8B-Instruct-2410 | 10h 46m | 9h 27m | 0.3683 | 12 |\n", + "| Qwen_Qwen2.5-14B-Instruct (8bit) | 52h 44m | 29h 32m | 0.3581 | 13 |\n", + "| Qwen_Qwen3-8B | 15h 31m | 13h 44m | 0.3566 | 14 |\n", + "| 01-ai_Yi-1.5-9B-Chat | 13h 54m | 12h 15m | 0.3467 | 15 |\n", + "| meta-llama_Llama-3.2-3B-Instruct | 7h 12m | 5h 57m | 0.3438 | 16 |\n", + "| Qwen_Qwen2.5-3B-Instruct | 7h 48m | 6h 30m | 0.3406 | 17 |\n", + "| meta-llama_Llama-2-13b-hf | 19h 21m | 17h 38m | 0.3374 | 18 |\n", + "| 01-ai_Yi-1.5-6B-Chat | 8h 4m | 7h 1m | 0.3339 | 19 |\n", + "| google_gemma-3-4b-it | 4h 51m | 3h 50m | 0.3261 | 20 |\n", + "| Qwen_Qwen3-4B | 5h 51m | 5h 3m | 0.3226 | 21 |\n", + "| meta-llama_Llama-2-7b-hf | 5h 42m | 4h 59m | 0.3018 | 22 |\n", + "| deepseek-ai_deepseek-llm-7b-chat | 10h 6m | 9h 8m | 0.3007 | 23 |\n", + "| Qwen_Qwen2-7B-Instruct | 11h 30m | 10h 11m | 0.2919 | 24 |\n", + "| Qwen_Qwen2.5-1.5B-Instruct | 3h 20m | 2h 36m | 0.2903 | 25 |\n", + "| meta-llama_Llama-2-13b-chat-hf | 17h 8m | 15h 37m | 0.2864 | 26 |\n", + "| deepseek-ai_deepseek-llm-7b-base | 7h 11m | 6h 26m | 0.2864 | 27 |\n", + "| deepseek-ai_DeepSeek-R1-0528-Qwen3-8B | 17h 57m | 15h 30m | 0.2834 | 28 |\n", + "| Qwen_Qwen2.5-Math-7B | 27h 21m | 24h 38m | 0.2827 | 29 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Llama-8B | 11h 46m | 10h 36m | 0.2805 | 30 |\n", + "| meta-llama_Llama-3.2-1B-Instruct | 3h 30m | 2h 35m | 0.2731 | 31 |\n", + "| meta-llama_Llama-2-7b-chat-hf | 6h 57m | 6h 7m | 0.2656 | 32 |\n", + "| deepseek-ai_deepseek-math-7b-rl | 8h 2m | 7h 12m | 0.2581 | 33 |\n", + "| Qwen_Qwen3-1.7B | 4h 25m | 3h 36m | 0.2567 | 34 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-7B | 6h 28m | 5h 43m | 0.234 | 35 |\n", + "| Qwen_Qwen2.5-Math-7B-Instruct | 5h 37m | 4h 57m | 0.2276 | 36 |\n", + "| Qwen_Qwen2.5-0.5B-Instruct | 2h 34m | 1h 48m | 0.2218 | 37 |\n", + "| google_gemma-3-1b-it | 6h 50m | 4h 52m | 0.2202 | 38 |\n", + "| Qwen_Qwen3-0.6B | 3h 45m | 2h 53m | 0.2 | 39 |\n", + "| Qwen_Qwen2.5-Math-1.5B-Instruct | 3h 25m | 2h 39m | 0.1983 | 40 |\n", + "| deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B | 3h 40m | 2h 52m | 0.1954 | 41 |\n", + "\n", + "\n" + ] + } + ], + "source": [ + "column_map = {\n", + " \"overall\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Mean Score', \"Overall Rank\"],\n", + " \"reasoning_and_math\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Reasoning & Math Mean Score', \"Reasoning & Math Avg. Rank\"],\n", + " \"commonsense_and_nli\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Commonsense & NLI Mean Score', \"Commonsense & NLI Avg. 
Rank\"],\n", + " \"knowledge_and_reading\": [\"Model Name\", \"Total Time\", \"GPU Util Time\", 'Knowledge & Reading Mean Score', \"Knowledge & Reading Avg. Rank\"]\n", + "}\n", + "\n", + "\n", + "\n", + "# Produce sub-dataframes and export them to csv and excel file.\n", + "with pd.ExcelWriter(\"/mnt/data8tb/Documents/project/benchmark_project/llm_benchmarks_all_results.xlsx\") as writer:\n", + " df_display.to_excel(writer, sheet_name=\"Master\", index=False)\n", + " \n", + " for name, cols in column_map.items():\n", + " sub_df = df_display[cols].copy()\n", + " rank_col = [c for c in sub_df.columns if 'Rank' in c][0]\n", + " sub_df = sub_df.sort_values(by=rank_col, ascending=True).reset_index(drop=True)\n", + " sub_df.index = sub_df.index + 1\n", + " print(name)\n", + " if name == 'overall':\n", + " overall_df = sub_df\n", + " display(sub_df)\n", + " \n", + " # sub_df.to_csv(f\"/mnt/data8tb/Documents/project/benchmark_project/{name}_rank.csv\")\n", + " # sub_df.to_excel(writer, sheet_name=name, index=False)\n", + "\n", + " table_md = sub_df.to_markdown(index=False)\n", + " print(table_md)\n", + "\n", + " sub_df.to_html(f\"{name}.html\", index=False)\n", + " print()\n", + " print()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "5642b72a-e416-482b-b45b-8376fd2571b7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Model NameTotal TimeGPU Util TimeMean ScoreOverall Rank
1google_gemma-3-12b-it15h 45m14h 8m0.60381
2Qwen_Qwen3-14B (8bit)29h 45m17h 29m0.59612
3openchat_openchat-3.6-8b-202405227h 51m6h 59m0.58713
4Qwen_Qwen3-8B15h 31m13h 44m0.58594
5Qwen_Qwen2.5-7B-Instruct9h 36m8h 33m0.57885
6Qwen_Qwen2.5-14B-Instruct (8bit)52h 44m29h 32m0.57756
701-ai_Yi-1.5-9B11h 43m10h 26m0.56767
8Qwen_Qwen2.5-7B-Instruct-1M11h 17m10h 10m0.56728
9meta-llama_Llama-3.1-8B-Instruct12h 19m10h 52m0.56539
1001-ai_Yi-1.5-9B-Chat13h 54m12h 15m0.562110
11mistralai_Ministral-8B-Instruct-241010h 46m9h 27m0.557611
12meta-llama_Meta-Llama-3-8B-Instruct6h 30m5h 46m0.552812
13Qwen_Qwen3-4B5h 51m5h 3m0.551013
14NousResearch_Hermes-2-Pro-Mistral-7B8h 27m7h 28m0.548014
15mistralai_Mistral-7B-Instruct-v0.38h 38m7h 41m0.545115
16google_gemma-3-4b-it4h 51m3h 50m0.536816
1701-ai_Yi-1.5-6B-Chat8h 4m7h 1m0.533517
1801-ai_Yi-1.5-6B4h 28m3h 54m0.531218
19Qwen_Qwen2-7B-Instruct11h 30m10h 11m0.527119
20deepseek-ai_DeepSeek-R1-0528-Qwen3-8B17h 57m15h 30m0.521920
21meta-llama_Llama-3.2-3B-Instruct7h 12m5h 57m0.504821
22Qwen_Qwen2.5-3B-Instruct7h 48m6h 30m0.493922
23Qwen_Qwen2.5-Math-7B27h 21m24h 38m0.490723
24deepseek-ai_deepseek-llm-7b-chat10h 6m9h 8m0.486924
25deepseek-ai_DeepSeek-R1-Distill-Llama-8B11h 46m10h 36m0.483025
26meta-llama_Llama-2-13b-hf19h 21m17h 38m0.481926
27meta-llama_Llama-2-13b-chat-hf17h 8m15h 37m0.481327
28deepseek-ai_DeepSeek-R1-Distill-Qwen-7B6h 28m5h 43m0.464428
29Qwen_Qwen2.5-1.5B-Instruct3h 20m2h 36m0.460829
30Qwen_Qwen3-1.7B4h 25m3h 36m0.459730
31Qwen_Qwen2.5-Math-7B-Instruct5h 37m4h 57m0.459631
32meta-llama_Llama-2-7b-chat-hf6h 57m6h 7m0.452532
33meta-llama_Llama-2-7b-hf5h 42m4h 59m0.451633
34deepseek-ai_deepseek-llm-7b-base7h 11m6h 26m0.445134
35deepseek-ai_deepseek-math-7b-rl8h 2m7h 12m0.441935
36meta-llama_Llama-3.2-1B-Instruct3h 30m2h 35m0.421936
37google_gemma-3-1b-it6h 50m4h 52m0.401337
38deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B3h 40m2h 52m0.398638
39Qwen_Qwen2.5-Math-1.5B-Instruct3h 25m2h 39m0.383839
40Qwen_Qwen3-0.6B3h 45m2h 53m0.381640
41Qwen_Qwen2.5-0.5B-Instruct2h 34m1h 48m0.379941
\n", + "
" + ], + "text/plain": [ + " Model Name Total Time GPU Util Time \\\n", + "1 google_gemma-3-12b-it 15h 45m 14h 8m \n", + "2 Qwen_Qwen3-14B (8bit) 29h 45m 17h 29m \n", + "3 openchat_openchat-3.6-8b-20240522 7h 51m 6h 59m \n", + "4 Qwen_Qwen3-8B 15h 31m 13h 44m \n", + "5 Qwen_Qwen2.5-7B-Instruct 9h 36m 8h 33m \n", + "6 Qwen_Qwen2.5-14B-Instruct (8bit) 52h 44m 29h 32m \n", + "7 01-ai_Yi-1.5-9B 11h 43m 10h 26m \n", + "8 Qwen_Qwen2.5-7B-Instruct-1M 11h 17m 10h 10m \n", + "9 meta-llama_Llama-3.1-8B-Instruct 12h 19m 10h 52m \n", + "10 01-ai_Yi-1.5-9B-Chat 13h 54m 12h 15m \n", + "11 mistralai_Ministral-8B-Instruct-2410 10h 46m 9h 27m \n", + "12 meta-llama_Meta-Llama-3-8B-Instruct 6h 30m 5h 46m \n", + "13 Qwen_Qwen3-4B 5h 51m 5h 3m \n", + "14 NousResearch_Hermes-2-Pro-Mistral-7B 8h 27m 7h 28m \n", + "15 mistralai_Mistral-7B-Instruct-v0.3 8h 38m 7h 41m \n", + "16 google_gemma-3-4b-it 4h 51m 3h 50m \n", + "17 01-ai_Yi-1.5-6B-Chat 8h 4m 7h 1m \n", + "18 01-ai_Yi-1.5-6B 4h 28m 3h 54m \n", + "19 Qwen_Qwen2-7B-Instruct 11h 30m 10h 11m \n", + "20 deepseek-ai_DeepSeek-R1-0528-Qwen3-8B 17h 57m 15h 30m \n", + "21 meta-llama_Llama-3.2-3B-Instruct 7h 12m 5h 57m \n", + "22 Qwen_Qwen2.5-3B-Instruct 7h 48m 6h 30m \n", + "23 Qwen_Qwen2.5-Math-7B 27h 21m 24h 38m \n", + "24 deepseek-ai_deepseek-llm-7b-chat 10h 6m 9h 8m \n", + "25 deepseek-ai_DeepSeek-R1-Distill-Llama-8B 11h 46m 10h 36m \n", + "26 meta-llama_Llama-2-13b-hf 19h 21m 17h 38m \n", + "27 meta-llama_Llama-2-13b-chat-hf 17h 8m 15h 37m \n", + "28 deepseek-ai_DeepSeek-R1-Distill-Qwen-7B 6h 28m 5h 43m \n", + "29 Qwen_Qwen2.5-1.5B-Instruct 3h 20m 2h 36m \n", + "30 Qwen_Qwen3-1.7B 4h 25m 3h 36m \n", + "31 Qwen_Qwen2.5-Math-7B-Instruct 5h 37m 4h 57m \n", + "32 meta-llama_Llama-2-7b-chat-hf 6h 57m 6h 7m \n", + "33 meta-llama_Llama-2-7b-hf 5h 42m 4h 59m \n", + "34 deepseek-ai_deepseek-llm-7b-base 7h 11m 6h 26m \n", + "35 deepseek-ai_deepseek-math-7b-rl 8h 2m 7h 12m \n", + "36 meta-llama_Llama-3.2-1B-Instruct 3h 30m 2h 35m \n", + "37 google_gemma-3-1b-it 6h 50m 4h 52m \n", + "38 deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B 3h 40m 2h 52m \n", + "39 Qwen_Qwen2.5-Math-1.5B-Instruct 3h 25m 2h 39m \n", + "40 Qwen_Qwen3-0.6B 3h 45m 2h 53m \n", + "41 Qwen_Qwen2.5-0.5B-Instruct 2h 34m 1h 48m \n", + "\n", + " Mean Score Overall Rank \n", + "1 0.6038 1 \n", + "2 0.5961 2 \n", + "3 0.5871 3 \n", + "4 0.5859 4 \n", + "5 0.5788 5 \n", + "6 0.5775 6 \n", + "7 0.5676 7 \n", + "8 0.5672 8 \n", + "9 0.5653 9 \n", + "10 0.5621 10 \n", + "11 0.5576 11 \n", + "12 0.5528 12 \n", + "13 0.5510 13 \n", + "14 0.5480 14 \n", + "15 0.5451 15 \n", + "16 0.5368 16 \n", + "17 0.5335 17 \n", + "18 0.5312 18 \n", + "19 0.5271 19 \n", + "20 0.5219 20 \n", + "21 0.5048 21 \n", + "22 0.4939 22 \n", + "23 0.4907 23 \n", + "24 0.4869 24 \n", + "25 0.4830 25 \n", + "26 0.4819 26 \n", + "27 0.4813 27 \n", + "28 0.4644 28 \n", + "29 0.4608 29 \n", + "30 0.4597 30 \n", + "31 0.4596 31 \n", + "32 0.4525 32 \n", + "33 0.4516 33 \n", + "34 0.4451 34 \n", + "35 0.4419 35 \n", + "36 0.4219 36 \n", + "37 0.4013 37 \n", + "38 0.3986 38 \n", + "39 0.3838 39 \n", + "40 0.3816 40 \n", + "41 0.3799 41 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(overall_df)\n", + "overall_df.to_html(\"overall.html\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a04411e-c749-428f-89bd-2c23ac74af71", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"7368bca2-dd44-4393-be0e-320f737af82b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}