VictorLJZ committed
Commit a8f2960 · 1 Parent(s): e4dfc3a

simplifying evaluations

Files changed (2):
  1. benchmarking/cli.py +4 -77
  2. benchmarking/evaluation.py +0 -114
benchmarking/cli.py CHANGED
@@ -2,13 +2,10 @@
 
 import argparse
 import sys
-from pathlib import Path
-from typing import Dict, Any, Optional
 
 from .llm_providers import *
 from .benchmarks import *
 from .runner import BenchmarkRunner, BenchmarkRunConfig
-from .evaluation import BenchmarkEvaluator
 
 
 def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMProvider:
@@ -106,86 +103,16 @@ def run_benchmark_command(args) -> None:
         print(f"Error: {summary['error']}")
         return
 
-    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
+    # Print results
+    print(f"Model: {args.model}")
+    print(f"Benchmark: {args.benchmark}")
     print(f"Total Questions: {summary['results']['total_questions']}")
     print(f"Correct Answers: {summary['results']['correct_answers']}")
+    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
     print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
     print(f"Results saved to: {summary['results_file']}")
 
 
-def evaluate_results_command(args) -> None:
-    """Evaluate benchmark results."""
-    print(f"Evaluating results: {args.results_files}")
-
-    evaluator = BenchmarkEvaluator(args.output_dir)
-
-    if len(args.results_files) == 1:
-        # Single model evaluation
-        evaluation = evaluator.evaluate_single_run(args.results_files[0])
-        print("\n" + "="*50)
-        print("SINGLE MODEL EVALUATION")
-        print("="*50)
-        print(f"Model: {evaluation.model_name}")
-        print(f"Benchmark: {evaluation.benchmark_name}")
-        print(f"Overall Accuracy: {evaluation.overall_accuracy:.2f}%")
-        print(f"Total Questions: {evaluation.total_questions}")
-        print(f"Error Rate: {evaluation.error_rate:.2f}%")
-        print(f"Total Duration: {evaluation.total_duration:.2f}s")
-
-        if evaluation.category_accuracies:
-            print("\nCategory Accuracies:")
-            for category, accuracy in evaluation.category_accuracies.items():
-                print(f" {category}: {accuracy:.2f}%")
-
-    else:
-        # Multiple model comparison
-        comparison = evaluator.compare_models(args.results_files)
-
-        if "error" in comparison:
-            print(f"Error: {comparison['error']}")
-            return
-
-        print("\n" + "="*50)
-        print("MODEL COMPARISON")
-        print("="*50)
-
-        summary = comparison["summary"]
-        print(f"Models Compared: {summary['models_compared']}")
-        print(f"Best Overall Accuracy: {summary['best_overall_accuracy']:.2f}%")
-        print(f"Accuracy Range: {summary['accuracy_range'][0]:.2f}% - {summary['accuracy_range'][1]:.2f}%")
-
-        best_model = comparison["best_model"]
-        print(f"\nBest Model: {best_model['Model']} ({best_model['Accuracy (%)']:.2f}%)")
-
-        # Generate comprehensive report
-        report_path = evaluator.generate_report(args.results_files, args.report_name)
-        print(f"\nDetailed report saved to: {report_path}")
-
-        # Statistical significance test
-        if args.statistical_test:
-            print("\nRunning statistical significance tests...")
-            sig_results = evaluator.statistical_significance_test(args.results_files)
-            print(f"Found {len(sig_results['comparisons'])} pairwise comparisons")
-
-            for comp in sig_results["comparisons"]:
-                significance = "significant" if comp["significant"] else "not significant"
-                print(f"{comp['model1']} vs {comp['model2']}: {significance} (p={comp['p_value']:.4f})")
-
-
-def list_providers_command(args) -> None:
-    """List available LLM providers."""
-    print("Available LLM Providers:")
-    print("- openai: OpenAI GPT models")
-    print("- google: Google Gemini models")
-    print("- medrax: MedRAX agent system")
-
-
-def list_benchmarks_command(args) -> None:
-    """List available benchmarks."""
-    print("Available Benchmarks:")
-    print("- rexvqa: ReXVQA (large-scale chest radiology VQA)")
-
-
 def main():
     """Main CLI entry point."""
     parser = argparse.ArgumentParser(description="MedRAX Benchmarking Pipeline")
benchmarking/evaluation.py DELETED
@@ -1,114 +0,0 @@
-"""Evaluation code for analyzing benchmark results."""
-
-import json
-import pandas as pd
-import numpy as np
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass
-from collections import defaultdict
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-
-@dataclass
-class EvaluationResult:
-    """Results of evaluating a benchmark run."""
-    model_name: str
-    benchmark_name: str
-    overall_accuracy: float
-    total_questions: int
-    correct_answers: int
-    total_duration: float
-    category_accuracies: Dict[str, float]
-    category_counts: Dict[str, int]
-    error_rate: float
-    avg_duration_per_question: float
-
-
-class BenchmarkEvaluator:
-    """Class for evaluating and comparing benchmark results."""
-
-    def __init__(self, output_dir: str = "evaluation_results"):
-        """Initialize the evaluator.
-
-        Args:
-            output_dir (str): Directory to save evaluation results
-        """
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-
-    def load_results(self, results_file: str) -> Dict[str, Any]:
-        """Load benchmark results from file.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            Dict[str, Any]: Loaded results data
-        """
-        with open(results_file, 'r') as f:
-            return json.load(f)
-
-    def evaluate_single_run(self, results_file: str) -> EvaluationResult:
-        """Evaluate a single benchmark run.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            EvaluationResult: Evaluation results
-        """
-        results = self.load_results(results_file)
-
-        # Calculate basic metrics
-        total_questions = len(results)
-        correct_answers = sum(1 for r in results if r.get("is_correct", False))
-        total_duration = sum(r.get("duration", 0) for r in results)
-        errors = sum(1 for r in results if r.get("error") is not None)
-
-        overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
-        error_rate = (errors / total_questions) * 100 if total_questions > 0 else 0
-
-        # Calculate per-category metrics
-        category_stats = defaultdict(lambda: {"correct": 0, "total": 0})
-
-        for result in results:
-            metadata = result.get("metadata", {})
-            category = metadata.get("category")
-
-            if category:
-                category_stats[category]["total"] += 1
-                if result.get("is_correct", False):
-                    category_stats[category]["correct"] += 1
-
-        # Calculate category accuracies
-        category_accuracies = {}
-        category_counts = {}
-        for category, stats in category_stats.items():
-            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
-            category_counts[category] = stats["total"]
-
-        # Extract model and benchmark names (assuming they're in the filename or metadata)
-        results_path = Path(results_file)
-        filename_parts = results_path.stem.split("_")
-
-        model_name = "unknown"
-        benchmark_name = "unknown"
-
-        if len(filename_parts) >= 2:
-            benchmark_name = filename_parts[0]
-            model_name = filename_parts[1]
-
-        return EvaluationResult(
-            model_name=model_name,
-            benchmark_name=benchmark_name,
-            overall_accuracy=overall_accuracy,
-            total_questions=total_questions,
-            correct_answers=correct_answers,
-            total_duration=total_duration,
-            category_accuracies=category_accuracies,
-            category_counts=category_counts,
-            error_rate=error_rate,
-            avg_duration_per_question=total_duration / total_questions if total_questions > 0 else 0,
-        )
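
Note: with BenchmarkEvaluator deleted, the CLI only prints the overall counts returned by the runner. For anyone who still wants the per-category breakdown that the removed evaluate_single_run() computed, the following is a minimal standalone sketch, not part of this commit. It assumes the results file is a JSON list of per-question records with the same "is_correct", "duration", "error", and metadata["category"] fields the deleted code read; the script name summarize_results.py is hypothetical.

# summarize_results.py (hypothetical): standalone replacement for the deleted
# per-file summary. Assumes the results JSON is a list of records with
# "is_correct", "duration", "error", and an optional metadata["category"],
# as read by the removed evaluate_single_run().
import json
import sys
from collections import defaultdict


def summarize(results_file: str) -> None:
    # Load the list of per-question records written by the benchmark runner.
    with open(results_file, "r") as f:
        results = json.load(f)

    total = len(results)
    correct = sum(1 for r in results if r.get("is_correct", False))
    errors = sum(1 for r in results if r.get("error") is not None)
    duration = sum(r.get("duration", 0) for r in results)

    accuracy = (correct / total) * 100 if total else 0.0
    error_rate = (errors / total) * 100 if total else 0.0

    print(f"Total Questions: {total}")
    print(f"Correct Answers: {correct}")
    print(f"Overall Accuracy: {accuracy:.2f}%")
    print(f"Error Rate: {error_rate:.2f}%")
    print(f"Total Duration: {duration:.2f}s")

    # Per-category tallies, mirroring the deleted category_stats logic.
    stats = defaultdict(lambda: {"correct": 0, "total": 0})
    for r in results:
        category = r.get("metadata", {}).get("category")
        if category:
            stats[category]["total"] += 1
            if r.get("is_correct", False):
                stats[category]["correct"] += 1

    if stats:
        print("\nCategory Accuracies:")
        for category, s in sorted(stats.items()):
            print(f" {category}: {(s['correct'] / s['total']) * 100:.2f}% ({s['total']} questions)")


if __name__ == "__main__":
    summarize(sys.argv[1])

Usage would be something like "python summarize_results.py path/to/results.json", where the path is whatever the runner reported as results_file.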