simplifying evaluations
Files changed:
- benchmarking/cli.py  +4 -77
- benchmarking/evaluation.py  +0 -114 (deleted)
benchmarking/cli.py (CHANGED)

@@ -2,13 +2,10 @@

 import argparse
 import sys
-from pathlib import Path
-from typing import Dict, Any, Optional

 from .llm_providers import *
 from .benchmarks import *
 from .runner import BenchmarkRunner, BenchmarkRunConfig
-from .evaluation import BenchmarkEvaluator


 def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMProvider:

@@ -106,86 +103,16 @@ def run_benchmark_command(args) -> None:
         print(f"Error: {summary['error']}")
         return

-
+    # Print results
+    print(f"Model: {args.model}")
+    print(f"Benchmark: {args.benchmark}")
     print(f"Total Questions: {summary['results']['total_questions']}")
     print(f"Correct Answers: {summary['results']['correct_answers']}")
+    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
     print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
     print(f"Results saved to: {summary['results_file']}")


-def evaluate_results_command(args) -> None:
-    """Evaluate benchmark results."""
-    print(f"Evaluating results: {args.results_files}")
-
-    evaluator = BenchmarkEvaluator(args.output_dir)
-
-    if len(args.results_files) == 1:
-        # Single model evaluation
-        evaluation = evaluator.evaluate_single_run(args.results_files[0])
-        print("\n" + "="*50)
-        print("SINGLE MODEL EVALUATION")
-        print("="*50)
-        print(f"Model: {evaluation.model_name}")
-        print(f"Benchmark: {evaluation.benchmark_name}")
-        print(f"Overall Accuracy: {evaluation.overall_accuracy:.2f}%")
-        print(f"Total Questions: {evaluation.total_questions}")
-        print(f"Error Rate: {evaluation.error_rate:.2f}%")
-        print(f"Total Duration: {evaluation.total_duration:.2f}s")
-
-        if evaluation.category_accuracies:
-            print("\nCategory Accuracies:")
-            for category, accuracy in evaluation.category_accuracies.items():
-                print(f"  {category}: {accuracy:.2f}%")
-
-    else:
-        # Multiple model comparison
-        comparison = evaluator.compare_models(args.results_files)
-
-        if "error" in comparison:
-            print(f"Error: {comparison['error']}")
-            return
-
-        print("\n" + "="*50)
-        print("MODEL COMPARISON")
-        print("="*50)
-
-        summary = comparison["summary"]
-        print(f"Models Compared: {summary['models_compared']}")
-        print(f"Best Overall Accuracy: {summary['best_overall_accuracy']:.2f}%")
-        print(f"Accuracy Range: {summary['accuracy_range'][0]:.2f}% - {summary['accuracy_range'][1]:.2f}%")
-
-        best_model = comparison["best_model"]
-        print(f"\nBest Model: {best_model['Model']} ({best_model['Accuracy (%)']:.2f}%)")
-
-        # Generate comprehensive report
-        report_path = evaluator.generate_report(args.results_files, args.report_name)
-        print(f"\nDetailed report saved to: {report_path}")
-
-        # Statistical significance test
-        if args.statistical_test:
-            print("\nRunning statistical significance tests...")
-            sig_results = evaluator.statistical_significance_test(args.results_files)
-            print(f"Found {len(sig_results['comparisons'])} pairwise comparisons")
-
-            for comp in sig_results["comparisons"]:
-                significance = "significant" if comp["significant"] else "not significant"
-                print(f"{comp['model1']} vs {comp['model2']}: {significance} (p={comp['p_value']:.4f})")
-
-
-def list_providers_command(args) -> None:
-    """List available LLM providers."""
-    print("Available LLM Providers:")
-    print("- openai: OpenAI GPT models")
-    print("- google: Google Gemini models")
-    print("- medrax: MedRAX agent system")
-
-
-def list_benchmarks_command(args) -> None:
-    """List available benchmarks."""
-    print("Available Benchmarks:")
-    print("- rexvqa: ReXVQA (large-scale chest radiology VQA)")
-
-
 def main():
     """Main CLI entry point."""
     parser = argparse.ArgumentParser(description="MedRAX Benchmarking Pipeline")
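With the evaluator removed, run_benchmark_command now prints directly from the summary returned by BenchmarkRunner. Below is a minimal sketch of the dictionary shape those print statements assume; the field names come from the diff above, while the concrete values and the results_file path are illustrative only.

# Sketch only: the real dict is produced by BenchmarkRunner. Values and the
# results_file path are illustrative, not taken from the repository.
summary = {
    "results": {
        "total_questions": 100,
        "correct_answers": 80,
        "accuracy": 80.0,          # percentage, printed with :.2f
        "total_duration": 123.4,   # seconds
    },
    "results_file": "benchmark_results/example_run.json",  # hypothetical path
}
# On failure the runner instead returns a dict with an "error" key
# (see the error branch above).

print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
print(f"Results saved to: {summary['results_file']}")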
benchmarking/evaluation.py (DELETED)

@@ -1,114 +0,0 @@
-"""Evaluation code for analyzing benchmark results."""
-
-import json
-import pandas as pd
-import numpy as np
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass
-from collections import defaultdict
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-
-@dataclass
-class EvaluationResult:
-    """Results of evaluating a benchmark run."""
-    model_name: str
-    benchmark_name: str
-    overall_accuracy: float
-    total_questions: int
-    correct_answers: int
-    total_duration: float
-    category_accuracies: Dict[str, float]
-    category_counts: Dict[str, int]
-    error_rate: float
-    avg_duration_per_question: float
-
-
-class BenchmarkEvaluator:
-    """Class for evaluating and comparing benchmark results."""
-
-    def __init__(self, output_dir: str = "evaluation_results"):
-        """Initialize the evaluator.
-
-        Args:
-            output_dir (str): Directory to save evaluation results
-        """
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-
-    def load_results(self, results_file: str) -> Dict[str, Any]:
-        """Load benchmark results from file.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            Dict[str, Any]: Loaded results data
-        """
-        with open(results_file, 'r') as f:
-            return json.load(f)
-
-    def evaluate_single_run(self, results_file: str) -> EvaluationResult:
-        """Evaluate a single benchmark run.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            EvaluationResult: Evaluation results
-        """
-        results = self.load_results(results_file)
-
-        # Calculate basic metrics
-        total_questions = len(results)
-        correct_answers = sum(1 for r in results if r.get("is_correct", False))
-        total_duration = sum(r.get("duration", 0) for r in results)
-        errors = sum(1 for r in results if r.get("error") is not None)
-
-        overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
-        error_rate = (errors / total_questions) * 100 if total_questions > 0 else 0
-
-        # Calculate per-category metrics
-        category_stats = defaultdict(lambda: {"correct": 0, "total": 0})
-
-        for result in results:
-            metadata = result.get("metadata", {})
-            category = metadata.get("category")
-
-            if category:
-                category_stats[category]["total"] += 1
-                if result.get("is_correct", False):
-                    category_stats[category]["correct"] += 1
-
-        # Calculate category accuracies
-        category_accuracies = {}
-        category_counts = {}
-        for category, stats in category_stats.items():
-            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
-            category_counts[category] = stats["total"]
-
-        # Extract model and benchmark names (assuming they're in the filename or metadata)
-        results_path = Path(results_file)
-        filename_parts = results_path.stem.split("_")
-
-        model_name = "unknown"
-        benchmark_name = "unknown"
-
-        if len(filename_parts) >= 2:
-            benchmark_name = filename_parts[0]
-            model_name = filename_parts[1]
-
-        return EvaluationResult(
-            model_name=model_name,
-            benchmark_name=benchmark_name,
-            overall_accuracy=overall_accuracy,
-            total_questions=total_questions,
-            correct_answers=correct_answers,
-            total_duration=total_duration,
-            category_accuracies=category_accuracies,
-            category_counts=category_counts,
-            error_rate=error_rate,
-            avg_duration_per_question=total_duration / total_questions if total_questions > 0 else 0,
-        )