simplifying evaluations
Files changed:
- benchmarking/cli.py  +4 -77
- benchmarking/evaluation.py  +0 -114 (deleted)
benchmarking/cli.py (CHANGED)

@@ -2,13 +2,10 @@

 import argparse
 import sys
-from pathlib import Path
-from typing import Dict, Any, Optional

 from .llm_providers import *
 from .benchmarks import *
 from .runner import BenchmarkRunner, BenchmarkRunConfig
-from .evaluation import BenchmarkEvaluator


 def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMProvider:

@@ -106,86 +103,16 @@ def run_benchmark_command(args) -> None:
         print(f"Error: {summary['error']}")
         return

-
+    # Print results
+    print(f"Model: {args.model}")
+    print(f"Benchmark: {args.benchmark}")
     print(f"Total Questions: {summary['results']['total_questions']}")
     print(f"Correct Answers: {summary['results']['correct_answers']}")
+    print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
     print(f"Total Duration: {summary['results']['total_duration']:.2f}s")
     print(f"Results saved to: {summary['results_file']}")


-def evaluate_results_command(args) -> None:
-    """Evaluate benchmark results."""
-    print(f"Evaluating results: {args.results_files}")
-
-    evaluator = BenchmarkEvaluator(args.output_dir)
-
-    if len(args.results_files) == 1:
-        # Single model evaluation
-        evaluation = evaluator.evaluate_single_run(args.results_files[0])
-        print("\n" + "="*50)
-        print("SINGLE MODEL EVALUATION")
-        print("="*50)
-        print(f"Model: {evaluation.model_name}")
-        print(f"Benchmark: {evaluation.benchmark_name}")
-        print(f"Overall Accuracy: {evaluation.overall_accuracy:.2f}%")
-        print(f"Total Questions: {evaluation.total_questions}")
-        print(f"Error Rate: {evaluation.error_rate:.2f}%")
-        print(f"Total Duration: {evaluation.total_duration:.2f}s")
-
-        if evaluation.category_accuracies:
-            print("\nCategory Accuracies:")
-            for category, accuracy in evaluation.category_accuracies.items():
-                print(f"  {category}: {accuracy:.2f}%")
-
-    else:
-        # Multiple model comparison
-        comparison = evaluator.compare_models(args.results_files)
-
-        if "error" in comparison:
-            print(f"Error: {comparison['error']}")
-            return
-
-        print("\n" + "="*50)
-        print("MODEL COMPARISON")
-        print("="*50)
-
-        summary = comparison["summary"]
-        print(f"Models Compared: {summary['models_compared']}")
-        print(f"Best Overall Accuracy: {summary['best_overall_accuracy']:.2f}%")
-        print(f"Accuracy Range: {summary['accuracy_range'][0]:.2f}% - {summary['accuracy_range'][1]:.2f}%")
-
-        best_model = comparison["best_model"]
-        print(f"\nBest Model: {best_model['Model']} ({best_model['Accuracy (%)']:.2f}%)")
-
-        # Generate comprehensive report
-        report_path = evaluator.generate_report(args.results_files, args.report_name)
-        print(f"\nDetailed report saved to: {report_path}")
-
-        # Statistical significance test
-        if args.statistical_test:
-            print("\nRunning statistical significance tests...")
-            sig_results = evaluator.statistical_significance_test(args.results_files)
-            print(f"Found {len(sig_results['comparisons'])} pairwise comparisons")
-
-            for comp in sig_results["comparisons"]:
-                significance = "significant" if comp["significant"] else "not significant"
-                print(f"{comp['model1']} vs {comp['model2']}: {significance} (p={comp['p_value']:.4f})")
-
-
-def list_providers_command(args) -> None:
-    """List available LLM providers."""
-    print("Available LLM Providers:")
-    print("- openai: OpenAI GPT models")
-    print("- google: Google Gemini models")
-    print("- medrax: MedRAX agent system")
-
-
-def list_benchmarks_command(args) -> None:
-    """List available benchmarks."""
-    print("Available Benchmarks:")
-    print("- rexvqa: ReXVQA (large-scale chest radiology VQA)")
-
-
 def main():
     """Main CLI entry point."""
     parser = argparse.ArgumentParser(description="MedRAX Benchmarking Pipeline")
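With the evaluator removed, run_benchmark_command now prints directly from the summary returned by BenchmarkRunner. Below is a minimal sketch of the dictionary shape those print statements assume; the field names come from the diff above, while the concrete values and the results_file path are illustrative only.

# Sketch only: the real dict is produced by BenchmarkRunner. Values and the
# results_file path are illustrative, not taken from the repository.
summary = {
    "results": {
        "total_questions": 100,
        "correct_answers": 80,
        "accuracy": 80.0,          # percentage, printed with :.2f
        "total_duration": 123.4,   # seconds
    },
    "results_file": "benchmark_results/example_run.json",  # hypothetical path
}
# On failure the runner instead returns a dict with an "error" key
# (see the error branch above).

print(f"Overall Accuracy: {summary['results']['accuracy']:.2f}%")
print(f"Results saved to: {summary['results_file']}")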
benchmarking/evaluation.py (DELETED)

@@ -1,114 +0,0 @@
-"""Evaluation code for analyzing benchmark results."""
-
-import json
-import pandas as pd
-import numpy as np
-from pathlib import Path
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass
-from collections import defaultdict
-import matplotlib.pyplot as plt
-import seaborn as sns
-
-
-@dataclass
-class EvaluationResult:
-    """Results of evaluating a benchmark run."""
-    model_name: str
-    benchmark_name: str
-    overall_accuracy: float
-    total_questions: int
-    correct_answers: int
-    total_duration: float
-    category_accuracies: Dict[str, float]
-    category_counts: Dict[str, int]
-    error_rate: float
-    avg_duration_per_question: float
-
-
-class BenchmarkEvaluator:
-    """Class for evaluating and comparing benchmark results."""
-
-    def __init__(self, output_dir: str = "evaluation_results"):
-        """Initialize the evaluator.
-
-        Args:
-            output_dir (str): Directory to save evaluation results
-        """
-        self.output_dir = Path(output_dir)
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-
-    def load_results(self, results_file: str) -> Dict[str, Any]:
-        """Load benchmark results from file.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            Dict[str, Any]: Loaded results data
-        """
-        with open(results_file, 'r') as f:
-            return json.load(f)
-
-    def evaluate_single_run(self, results_file: str) -> EvaluationResult:
-        """Evaluate a single benchmark run.
-
-        Args:
-            results_file (str): Path to the results file
-
-        Returns:
-            EvaluationResult: Evaluation results
-        """
-        results = self.load_results(results_file)
-
-        # Calculate basic metrics
-        total_questions = len(results)
-        correct_answers = sum(1 for r in results if r.get("is_correct", False))
-        total_duration = sum(r.get("duration", 0) for r in results)
-        errors = sum(1 for r in results if r.get("error") is not None)
-
-        overall_accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
-        error_rate = (errors / total_questions) * 100 if total_questions > 0 else 0
-
-        # Calculate per-category metrics
-        category_stats = defaultdict(lambda: {"correct": 0, "total": 0})
-
-        for result in results:
-            metadata = result.get("metadata", {})
-            category = metadata.get("category")
-
-            if category:
-                category_stats[category]["total"] += 1
-                if result.get("is_correct", False):
-                    category_stats[category]["correct"] += 1
-
-        # Calculate category accuracies
-        category_accuracies = {}
-        category_counts = {}
-        for category, stats in category_stats.items():
-            category_accuracies[category] = (stats["correct"] / stats["total"]) * 100
-            category_counts[category] = stats["total"]
-
-        # Extract model and benchmark names (assuming they're in the filename or metadata)
-        results_path = Path(results_file)
-        filename_parts = results_path.stem.split("_")
-
-        model_name = "unknown"
-        benchmark_name = "unknown"
-
-        if len(filename_parts) >= 2:
-            benchmark_name = filename_parts[0]
-            model_name = filename_parts[1]
-
-        return EvaluationResult(
-            model_name=model_name,
-            benchmark_name=benchmark_name,
-            overall_accuracy=overall_accuracy,
-            total_questions=total_questions,
-            correct_answers=correct_answers,
-            total_duration=total_duration,
-            category_accuracies=category_accuracies,
-            category_counts=category_counts,
-            error_rate=error_rate,
-            avg_duration_per_question=total_duration / total_questions if total_questions > 0 else 0,
-        )