Spaces:

samwell
/

medrax2

Sleeping

App Files Community

VictorLJZ committed on Jul 22

Commit

d07a267

1 Parent(s): 2b26ed4

updates

Browse files

Files changed (10) hide show

benchmarking/benchmarks/chestagentbench_benchmark.py +1 -2
benchmarking/cli.py +4 -3
benchmarking/llm_providers/__init__.py +2 -0
benchmarking/llm_providers/base.py +3 -2
benchmarking/llm_providers/google_provider.py +3 -4
benchmarking/llm_providers/medrax_provider.py +2 -19
benchmarking/llm_providers/openai_provider.py +1 -4
benchmarking/runner.py +9 -11
medrax/docs/system_prompts.txt +10 -5
medrax/models/model_factory.py +2 -2

benchmarking/benchmarks/chestagentbench_benchmark.py CHANGED Viewed

@@ -1,7 +1,6 @@
-import os
 import json
 from pathlib import Path
-from typing import Dict, List, Optional, Any
 from .base import Benchmark, BenchmarkDataPoint
 class ChestAgentBenchBenchmark(Benchmark):

 import json
 from pathlib import Path
+from typing import Dict, Optional, Any
 from .base import Benchmark, BenchmarkDataPoint
 class ChestAgentBenchBenchmark(Benchmark):

benchmarking/cli.py CHANGED Viewed

@@ -13,7 +13,7 @@ def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMPro
     Args:
         model_name (str): Name of the model
-        provider_type (str): Type of provider (openai, google, openrouter, medrax)
         **kwargs: Additional configuration parameters
     Returns:
@@ -22,6 +22,7 @@ def create_llm_provider(model_name: str, provider_type: str, **kwargs) -> LLMPro
     provider_map = {
         "openai": OpenAIProvider,
         "google": GoogleProvider,
         "medrax": MedRAXProvider,
     }
@@ -111,13 +112,13 @@ def main():
     # Run benchmark command
     run_parser = subparsers.add_parser("run", help="Run a benchmark")
     run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
-    run_parser.add_argument("--provider", required=True, choices=["openai", "google", "medrax"], help="LLM provider")
     run_parser.add_argument("--benchmark", required=True, choices=["rexvqa", "chestagentbench"], help="Benchmark to run")
     run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
     run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
     run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
     run_parser.add_argument("--temperature", type=float, default=0.7, help="Model temperature")
-    run_parser.add_argument("--max-tokens", type=int, default=1500, help="Maximum tokens per response")
     run_parser.set_defaults(func=run_benchmark_command)

     Args:
         model_name (str): Name of the model
+        provider_type (str): Type of provider (openai, google, xai, medrax)
         **kwargs: Additional configuration parameters
     Returns:
     provider_map = {
         "openai": OpenAIProvider,
         "google": GoogleProvider,
+        "xai": XAIProvider,
         "medrax": MedRAXProvider,
     }
     # Run benchmark command
     run_parser = subparsers.add_parser("run", help="Run a benchmark")
     run_parser.add_argument("--model", required=True, help="Model name (e.g., gpt-4o, gemini-2.5-pro)")
+    run_parser.add_argument("--provider", required=True, choices=["openai", "google", "xai", "medrax"], help="LLM provider")
     run_parser.add_argument("--benchmark", required=True, choices=["rexvqa", "chestagentbench"], help="Benchmark to run")
     run_parser.add_argument("--data-dir", required=True, help="Directory containing benchmark data")
     run_parser.add_argument("--output-dir", default="benchmark_results", help="Output directory for results")
     run_parser.add_argument("--max-questions", type=int, help="Maximum number of questions to process")
     run_parser.add_argument("--temperature", type=float, default=0.7, help="Model temperature")
+    run_parser.add_argument("--max-tokens", type=int, default=5000, help="Maximum tokens per response")
     run_parser.set_defaults(func=run_benchmark_command)

benchmarking/llm_providers/__init__.py CHANGED Viewed

@@ -4,6 +4,7 @@ from .base import LLMProvider, LLMRequest, LLMResponse
 from .openai_provider import OpenAIProvider
 from .google_provider import GoogleProvider
 from .medrax_provider import MedRAXProvider
 __all__ = [
     "LLMProvider",
@@ -12,4 +13,5 @@ __all__ = [
     "OpenAIProvider",
     "GoogleProvider",
     "MedRAXProvider",
 ]

 from .openai_provider import OpenAIProvider
 from .google_provider import GoogleProvider
 from .medrax_provider import MedRAXProvider
+from .xai_provider import XAIProvider
 __all__ = [
     "LLMProvider",
     "OpenAIProvider",
     "GoogleProvider",
     "MedRAXProvider",
+    "XAIProvider",
 ]

benchmarking/llm_providers/base.py CHANGED Viewed

@@ -14,7 +14,8 @@ class LLMRequest:
     text: str
     images: Optional[List[str]] = None  # List of image paths
     temperature: float = 0.7
-    max_tokens: int = 1500
     additional_params: Optional[Dict[str, Any]] = None
@@ -47,7 +48,7 @@ class LLMProvider(ABC):
         # Always load system prompt from file
         try:
             prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
-            self.system_prompt = prompts.get("MEDICAL_ASSISTANT", None)
             if self.system_prompt is None:
                 print(f"Warning: System prompt type 'MEDICAL_ASSISTANT' not found in medrax/docs/system_prompts.txt.")
         except Exception as e:

     text: str
     images: Optional[List[str]] = None  # List of image paths
     temperature: float = 0.7
+    top_p: float = 0.95
+    max_tokens: int = 5000
     additional_params: Optional[Dict[str, Any]] = None
         # Always load system prompt from file
         try:
             prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
+            self.system_prompt = prompts.get("CHESTAGENTBENCH_PROMPT", None)
             if self.system_prompt is None:
                 print(f"Warning: System prompt type 'MEDICAL_ASSISTANT' not found in medrax/docs/system_prompts.txt.")
         except Exception as e:

benchmarking/llm_providers/google_provider.py CHANGED Viewed

@@ -71,12 +71,11 @@ class GoogleProvider(LLMProvider):
             # Update client parameters for this request
             self.client.temperature = request.temperature
             self.client.max_output_tokens = request.max_tokens
-            if request.additional_params and "top_p" in request.additional_params:
-                self.client.top_p = request.additional_params["top_p"]
             response = self.client.invoke(messages)
             duration = time.time() - start_time
             # Extract response content

             # Update client parameters for this request
             self.client.temperature = request.temperature
             self.client.max_output_tokens = request.max_tokens
+            self.client.top_p = request.top_p
             response = self.client.invoke(messages)
+            print(response)
             duration = time.time() - start_time
             # Extract response content

benchmarking/llm_providers/medrax_provider.py CHANGED Viewed

@@ -1,7 +1,6 @@
 """MedRAX LLM provider implementation."""
 import time
-import tempfile
 import shutil
 from pathlib import Path
@@ -65,14 +64,11 @@ class MedRAXProvider(LLMProvider):
             # Prepare any additional model-specific kwargs
             model_kwargs = {}
-            # Create temporary directory for this session
-            self.session_temp_dir = Path(tempfile.mkdtemp(prefix="medrax_bench_"))
             agent, tools_dict = initialize_agent(
                 prompt_file="medrax/docs/system_prompts.txt",
                 tools_to_use=selected_tools,
                 model_dir="/model-weights",
-                temp_dir=self.session_temp_dir,  # Change this to the path of the temporary directory
                 device="cpu",
                 model=self.model_name,  # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
                 temperature=0.7,
@@ -122,7 +118,7 @@ class MedRAXProvider(LLMProvider):
                 for i, image_path in enumerate(valid_images):
                     print(f"Original image path: {image_path}")
                     # Copy image to session temp directory
-                    dest_path = self.session_temp_dir / f"image_{i}_{Path(image_path).name}"
                     print(f"Destination path: {dest_path}")
                     shutil.copy2(image_path, dest_path)
                     image_paths.append(str(dest_path))
@@ -189,16 +185,3 @@ class MedRAXProvider(LLMProvider):
                 duration=time.time() - start_time,
                 raw_response=None
             )
-    def _cleanup_temp_files(self) -> None:
-        """Clean up temporary files."""
-        try:
-            if hasattr(self, 'session_temp_dir') and self.session_temp_dir.exists():
-                shutil.rmtree(self.session_temp_dir)
-                print(f"Cleaned up temporary directory: {self.session_temp_dir}")
-        except Exception as e:
-            print(f"Warning: Failed to cleanup temp files: {e}")
-    def cleanup(self) -> None:
-        """Clean up resources when done with the provider."""
-        self._cleanup_temp_files()

 """MedRAX LLM provider implementation."""
 import time
 import shutil
 from pathlib import Path
             # Prepare any additional model-specific kwargs
             model_kwargs = {}
             agent, tools_dict = initialize_agent(
                 prompt_file="medrax/docs/system_prompts.txt",
                 tools_to_use=selected_tools,
                 model_dir="/model-weights",
+                temp_dir="temp",  # Change this to the path of the temporary directory
                 device="cpu",
                 model=self.model_name,  # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
                 temperature=0.7,
                 for i, image_path in enumerate(valid_images):
                     print(f"Original image path: {image_path}")
                     # Copy image to session temp directory
+                    dest_path = Path("temp") / f"image_{i}_{Path(image_path).name}"
                     print(f"Destination path: {dest_path}")
                     shutil.copy2(image_path, dest_path)
                     image_paths.append(str(dest_path))
                 duration=time.time() - start_time,
                 raw_response=None
             )

benchmarking/llm_providers/openai_provider.py CHANGED Viewed

@@ -2,7 +2,6 @@
 import os
 import time
-from typing import Dict, Any
 from tenacity import retry, wait_exponential, stop_after_attempt
 from langchain_openai import ChatOpenAI
 from langchain_core.messages import HumanMessage, SystemMessage
@@ -81,9 +80,7 @@ class OpenAIProvider(LLMProvider):
             # Update client parameters for this request
             self.client.temperature = request.temperature
             self.client.max_tokens = request.max_tokens
-            if request.additional_params and "top_p" in request.additional_params:
-                self.client.model_kwargs = {"top_p": request.additional_params["top_p"]}
             response = self.client.invoke(messages)

 import os
 import time
 from tenacity import retry, wait_exponential, stop_after_attempt
 from langchain_openai import ChatOpenAI
 from langchain_core.messages import HumanMessage, SystemMessage
             # Update client parameters for this request
             self.client.temperature = request.temperature
             self.client.max_tokens = request.max_tokens
+            self.client.top_p = request.top_p
             response = self.client.invoke(messages)

benchmarking/runner.py CHANGED Viewed

@@ -36,7 +36,8 @@ class BenchmarkRunConfig:
     output_dir: str
     max_questions: Optional[int] = None
     temperature: float = 0.7
-    max_tokens: int = 1500
     additional_params: Optional[Dict[str, Any]] = None
@@ -167,14 +168,6 @@ class BenchmarkRunner:
         # Save final results
         summary = self._save_final_results(benchmark)
-        # Clean up provider resources
-        if hasattr(llm_provider, 'cleanup'):
-            try:
-                llm_provider.cleanup()
-                self.logger.info("Provider cleanup completed")
-            except Exception as e:
-                self.logger.warning(f"Provider cleanup failed: {e}")
         self.logger.info(f"Benchmark run completed: {self.run_id}")
         self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
         self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
@@ -203,6 +196,7 @@ class BenchmarkRunner:
                 text=data_point.text,
                 images=data_point.images,
                 temperature=self.config.temperature,
                 max_tokens=self.config.max_tokens,
                 additional_params=self.config.additional_params
             )
@@ -260,10 +254,14 @@ class BenchmarkRunner:
         Returns:
             str: The extracted answer
         """
         # This is a simple implementation - may need customization per benchmark
         # For multiple choice, look for single letters A, B, C, D, E, F
-        # Look for patterns like "A", "B)", "(C)", "Answer: D", etc.
         patterns = [
             r'\b([A-F])\b',  # Single letter
             r'\b([A-F])\)',  # Letter with closing parenthesis

     output_dir: str
     max_questions: Optional[int] = None
     temperature: float = 0.7
+    top_p: float = 0.95
+    max_tokens: int = 5000
     additional_params: Optional[Dict[str, Any]] = None
         # Save final results
         summary = self._save_final_results(benchmark)
         self.logger.info(f"Benchmark run completed: {self.run_id}")
         self.logger.info(f"Final accuracy: {summary['results']['accuracy']:.2f}%")
         self.logger.info(f"Total duration: {summary['results']['total_duration']:.2f}s")
                 text=data_point.text,
                 images=data_point.images,
                 temperature=self.config.temperature,
+                top_p=self.config.top_p,
                 max_tokens=self.config.max_tokens,
                 additional_params=self.config.additional_params
             )
         Returns:
             str: The extracted answer
         """
+        # First, look for the 'Final answer: <|A|>' format
+        final_answer_pattern = r'Final answer:\s*<\|([A-F])\|>'
+        match = re.search(final_answer_pattern, response_text)
+        if match:
+            return match.group(1).upper()
         # This is a simple implementation - may need customization per benchmark
         # For multiple choice, look for single letters A, B, C, D, E, F
         patterns = [
             r'\b([A-F])\b',  # Single letter
             r'\b([A-F])\)',  # Letter with closing parenthesis

medrax/docs/system_prompts.txt CHANGED Viewed

@@ -1,10 +1,9 @@
 [MEDICAL_ASSISTANT]
 You are an expert medical AI assistant who can answer any medical questions and analyze medical images similar to a doctor.
 Solve using your own vision and reasoning and use tools to complement your reasoning.
-Make multiple tool calls in parallel or sequence as needed for comprehensive answers.
-Critically think about and criticize the tool outputs.
 If you need to look up some information before asking a follow up question, you are allowed to do that.
-When encountering a multiple-choice question, give the final answer in closed parentheses without further elaborations; give a definitive answer even if you're not sure.
 CITATION REQUIREMENTS:
 - When referencing information from RAG and/or web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
@@ -17,5 +16,11 @@ Examples:
 - "The medical literature indicates [2] that this condition typically presents with..."
 - "Based on clinical guidelines [3], the recommended treatment approach is..."
-[GENERAL_ASSISTANT]
-You are a helpful AI assistant. Your role is to assist users with a wide range of tasks and questions, providing accurate and useful information on various topics.

 [MEDICAL_ASSISTANT]
 You are an expert medical AI assistant who can answer any medical questions and analyze medical images similar to a doctor.
 Solve using your own vision and reasoning and use tools to complement your reasoning.
+You can make multiple tool calls in parallel or in sequence as needed for comprehensive answers.
+Think critically about and criticize the tool outputs.
 If you need to look up some information before asking a follow up question, you are allowed to do that.
 CITATION REQUIREMENTS:
 - When referencing information from RAG and/or web search tools, ALWAYS include numbered citations [1], [2], [3], etc.
 - "The medical literature indicates [2] that this condition typically presents with..."
 - "Based on clinical guidelines [3], the recommended treatment approach is..."
+[CHESTAGENTBENCH_PROMPT]
+You are an expert medical AI assistant who can answer any medical questions and analyze medical images similar to a doctor.
+Solve using your own vision and reasoning and use tools (if available) to complement your reasoning.
+You can make multiple tool calls in parallel or in sequence as needed for comprehensive answers.
+Think critically about and criticize the tool outputs.
+If you need to look up some information before asking a follow up question, you are allowed to do that.
+When encountering a multiple-choice question, your final response should end with a line in the format "Final answer: <|A|>", with the letter chosen from the list of possible choices A, B, C, D, E, F.
+It is extremely important that you strictly answer in the format mentioned above.

medrax/models/model_factory.py CHANGED Viewed

@@ -36,8 +36,8 @@ class ModelFactory:
             "default_base_url": "https://openrouter.ai/api/v1",
         },
         "grok": {
-        "class": ChatXAI,
-        "env_key": "XAI_API_KEY",
         }
         # Add more providers with default configurations here
     }

             "default_base_url": "https://openrouter.ai/api/v1",
         },
         "grok": {
+            "class": ChatXAI,
+            "env_key": "XAI_API_KEY",
         }
         # Add more providers with default configurations here
     }