Junzhe Li committed on
Commit dba3d2e · Parent: e4e9fae
2rexvqa.sh ADDED
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#SBATCH --job-name=medrax
+#SBATCH -c 4
+#SBATCH --gres=gpu:l40s:1
+#SBATCH --time=16:00:00
+#SBATCH --mem=50G
+#SBATCH --output=rexvqa-%j.out
+#SBATCH --error=rexvqa-%j.err
+
+module load arrow clang/18.1.8 scipy-stack
+
+source venv/bin/activate
+
+/scratch/lijunzh3/MedRAX2/venv/bin/python -m benchmarking.cli run --benchmark rexvqa --provider medrax --model gemini-2.5-pro --system-prompt CHESTAGENTBENCH_PROMPT --data-dir benchmarking/data/rexvqa --output-dir temp --max-questions 200 --temperature 0.7 --top-p 0.95 --max-tokens 10000 --concurrency 4 --random-seed 42
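Assuming a standard SLURM setup, this new script would be submitted with sbatch 2rexvqa.sh; it requests one L40S GPU, 4 CPU cores, and 50 GB of memory for up to 16 hours, then runs a 200-question ReXVQA pass through the medrax provider.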
analyze.py ADDED
@@ -0,0 +1,147 @@
+import json
+import argparse
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+def process_single_file(json_file_path):
+    """
+    Processes a single JSON results file and returns its accuracy counts.
+
+    Args:
+        json_file_path (Path): Path to the ...results.json file.
+
+    Returns:
+        defaultdict: A dictionary with the aggregated counts for this file.
+        Returns None if the file cannot be processed.
+    """
+
+    # These counts are *only* for the file being processed
+    counts = defaultdict(lambda: defaultdict(lambda: {"total": 0, "correct": 0}))
+    keys_to_track = ["reasoning_type", "category", "class", "subcategory"]
+
+    try:
+        with open(json_file_path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except json.JSONDecodeError:
+        print(f" - WARNING: Could not decode JSON from '{json_file_path}'. Skipping.")
+        return None
+    except Exception as e:
+        print(f" - ERROR: Unexpected error loading '{json_file_path}': {e}. Skipping.")
+        return None
+
+    # Iterate through each record in the JSON array
+    for record in data:
+        try:
+            is_correct = record.get("is_correct", False)
+            metadata = record["metadata"]["data_point_metadata"]
+
+            for key in keys_to_track:
+                value = metadata.get(key)
+
+                if value is not None:
+                    counts[key][value]["total"] += 1
+                    if is_correct:
+                        counts[key][value]["correct"] += 1
+
+        except KeyError as e:
+            print(f" - WARNING: Record {record.get('data_point_id')} is missing expected key: {e}. Skipping record.")
+        except TypeError:
+            print(f" - WARNING: Record {record.get('data_point_id')} has unexpected data structure. Skipping record.")
+
+    return counts
+
+def generate_report_dict(counts):
+    """
+    Converts a counts dictionary into the final, formatted report dictionary.
+
+    Args:
+        counts (defaultdict): The aggregated counts from process_single_file.
+
+    Returns:
+        dict: A dictionary formatted with percentages and absolute numbers.
+    """
+    accuracy_report = defaultdict(dict)
+
+    for key, values in counts.items():
+        # Sort by the sub-category name (e.g., "Negation Assessment")
+        sorted_values = sorted(values.items(), key=lambda item: item[0])
+
+        for value, tally in sorted_values:
+            total = tally["total"]
+            correct = tally["correct"]
+
+            if total > 0:
+                accuracy = (correct / total) * 100
+            else:
+                accuracy = 0.0
+
+            # Store the full results in our report dictionary
+            accuracy_report[key][value] = {
+                "accuracy_percent": round(accuracy, 2),
+                "correct": correct,
+                "total": total
+            }
+    return accuracy_report
+
+def main():
+    """
+    Main function to find, process, and save individual reports.
+    """
+
+    parser = argparse.ArgumentParser(
+        description="Finds and processes individual benchmarking runs, saving "
+                    "a separate accuracy report for each run."
+    )
+    parser.add_argument(
+        "directory",
+        type=str,
+        help="The top-level directory to search within (e.g., 'my_experiments')."
+    )
+    args = parser.parse_args()
+
+    top_dir = Path(args.directory)
+    if not top_dir.is_dir():
+        print(f"Error: Path '{args.directory}' is not a valid directory.")
+        sys.exit(1)
+
+    # Glob pattern to find all target files
+    search_pattern = '*/final_results/*results.json'
+    json_files_to_process = list(top_dir.glob(search_pattern))
+
+    if not json_files_to_process:
+        print(f"No files matching the pattern '{search_pattern}' were found in '{top_dir}'.")
+        sys.exit(0)
+
+    print(f"Found {len(json_files_to_process)} result file(s) to process individually.")
+
+    # --- Loop and process each file ---
+    for file_path in json_files_to_process:
+        # Use relative path for cleaner logging
+        print(f"\n--- Processing: {file_path.relative_to(top_dir.parent)} ---")
+
+        # 1. Get counts for this file
+        counts = process_single_file(file_path)
+
+        if counts is None or not counts:
+            print(" - No data processed. Skipping report generation.")
+            continue
+
+        # 2. Generate the report dictionary
+        report = generate_report_dict(counts)
+
+        # 3. Determine the output path and save the file
+        # The output is saved in the *same directory* as the input file
+        output_filename = file_path.parent / "accuracy_report.json"
+
+        try:
+            with open(output_filename, 'w', encoding='utf-8') as f:
+                json.dump(report, f, indent=2, sort_keys=True)
+            print(f" > Successfully saved report to: {output_filename.relative_to(top_dir.parent)}")
+        except Exception as e:
+            print(f" > ERROR: Could not save report to '{output_filename}': {e}")
+
+    print("\nAll processing complete.")
+
+if __name__ == "__main__":
+    main()
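A usage note for the new script: python analyze.py <directory> globs for */final_results/*results.json under the given directory and writes an accuracy_report.json next to each match. Each results file is expected to be a JSON array of records; the sketch below shows only the fields the script actually reads, and the concrete values are hypothetical, not taken from a real run:

# Hypothetical input record for analyze.py; only these fields are consumed.
example_record = {
    "data_point_id": "rexvqa-0001",  # used in warning messages
    "is_correct": True,              # tallied into the "correct" count
    "metadata": {
        "data_point_metadata": {     # grouped by these four keys
            "reasoning_type": "Negation Assessment",
            "category": "presence",
            "class": "single_image",
            "subcategory": "pneumothorax",
        }
    },
}

The resulting report maps each tracked key to per-value entries of the form {"accuracy_percent": ..., "correct": ..., "total": ...}.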
benchmarking/cli.py CHANGED
@@ -118,7 +118,7 @@ def main():
     run_parser.add_argument("--model", required=True,
                             help="Model name (e.g., gpt-4o, gpt-4.1-2025-04-14, gemini-2.5-pro)")
     run_parser.add_argument("--system-prompt", required=True,
-                            choices=["MEDICAL_ASSISTANT", "CHESTAGENTBENCH_PROMPT"],
+                            choices=["MEDICAL_ASSISTANT", "CHESTAGENTBENCH_PROMPT", "MEDGEMMA_PROMPT"],
                             help="System prompt: MEDICAL_ASSISTANT (general) or CHESTAGENTBENCH_PROMPT (benchmarks)")
     run_parser.add_argument("--data-dir", required=True,
                             help="Directory containing benchmark data files")
benchmarking/llm_providers/medgemma_provider.py CHANGED
@@ -3,8 +3,6 @@
 import os
 import time
 import httpx
-from typing import Optional
-from pathlib import Path
 from tenacity import retry, wait_exponential, stop_after_attempt
 
 from .base import LLMProvider, LLMRequest, LLMResponse
@@ -36,9 +34,8 @@ class MedGemmaProvider(LLMProvider):
             - api_url: URL of the MedGemma FastAPI service
             - max_new_tokens: Maximum tokens to generate (default: 300)
         """
-        # Extract MedGemma-specific config before calling super().__init__
-        self.api_url = os.getenv('MEDGEMMA_API_URL', 'http://localhost:8002')
-        self.max_new_tokens = kwargs.pop('max_new_tokens', 300)
+        self.provider_name = "medgemma"
+        self.api_url = "http://kn132.paice.vectorinstitute.ai:8002"
         self.client = None
 
         # Call parent constructor
@@ -52,16 +49,6 @@ class MedGemmaProvider(LLMProvider):
             connect=10.0  # 10 seconds to establish connection
         )
         self.client = httpx.Client(timeout=timeout_config)
-
-        # Test connection to MedGemma service
-        try:
-            response = self.client.get(f"{self.api_url}/docs")
-            if response.status_code != 200:
-                print(f"Warning: MedGemma API at {self.api_url} may not be running (status: {response.status_code})")
-        except httpx.ConnectError:
-            print(f"Warning: Could not connect to MedGemma API at {self.api_url}")
-            print("Please ensure the MedGemma FastAPI service is running:")
-            print(f"  python medrax/tools/vqa/medgemma/medgemma.py")
 
     @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
     def generate_response(self, request: LLMRequest) -> LLMResponse:
@@ -100,14 +87,13 @@
         files_to_send = []
         for image_path in valid_images:
             try:
-                # Detect correct MIME type based on file extension
-                ext = Path(image_path).suffix.lower()
-                mime_type = "image/png" if ext == ".png" else "image/jpeg"
-
                 # Read image file
                 with open(image_path, "rb") as f:
                     image_data = f.read()
 
+                # Detect correct MIME type based on file extension
+                mime_type = self._get_image_mime_type(image_path)
+
                 # Add to files list
                 files_to_send.append(
                     ("images", (os.path.basename(image_path), image_data, mime_type))
@@ -122,17 +108,14 @@
                 duration=time.time() - start_time
             )
 
-        # Prepare form data
         # Use system_prompt if provided, otherwise use default
         system_prompt_text = self.system_prompt if self.system_prompt else "You are an expert radiologist who is able to analyze radiological images at any resolution."
 
-        # Override max_new_tokens if provided in request
-        max_tokens = getattr(request, 'max_tokens', self.max_new_tokens)
-
+        # Prepare form data
         data = {
             "prompt": request.text,
             "system_prompt": system_prompt_text,
-            "max_new_tokens": max_tokens,
+            "max_new_tokens": self.max_tokens,
         }
 
         # Make API request
@@ -148,19 +131,14 @@
         # Parse response
         response_data = response.json()
         content = response_data.get("response", "")
-        metadata = response_data.get("metadata", {})
 
+        # record duration
        duration = time.time() - start_time
-
-        # MedGemma doesn't provide token usage, but we can include request info
-        usage = {
-            "num_images": len(valid_images),
-            "max_new_tokens": max_tokens,
-        }
-
+
+        # return response object
         return LLMResponse(
             content=content,
-            usage=usage,
+            usage=None,
             duration=duration
         )
 
@@ -199,7 +177,7 @@
             content=f"Error: {error_msg}",
             duration=duration
         )
-    
+
     def test_connection(self) -> bool:
        """Test the connection to the MedGemma API service.
 
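The rewritten image loop calls self._get_image_mime_type, whose definition is not part of this diff, and the form data now uses self.max_tokens, presumably populated by the LLMProvider base constructor. A minimal sketch of what such a helper could look like on the provider class (an assumption, not the repo's actual implementation), using only the stdlib:

import mimetypes

def _get_image_mime_type(self, image_path: str) -> str:
    # Hypothetical helper: guess the MIME type from the file extension,
    # falling back to JPEG, matching the ext-based logic this diff removes.
    mime_type, _ = mimetypes.guess_type(image_path)
    return mime_type or "image/jpeg"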
benchmarking/llm_providers/medrax_provider.py CHANGED
@@ -37,14 +37,14 @@ class MedRAXProvider(LLMProvider):
         print("Starting server...")
 
         selected_tools = [
-            "TorchXRayVisionClassifierTool",  # For classifying chest X-ray images using TorchXRayVision
-            "ArcPlusClassifierTool",  # For advanced chest X-ray classification using ArcPlus
-            "ChestXRayReportGeneratorTool",  # For generating medical reports from X-rays
+            # "TorchXRayVisionClassifierTool",  # For classifying chest X-ray images using TorchXRayVision
+            # "ArcPlusClassifierTool",  # For advanced chest X-ray classification using ArcPlus
+            # "ChestXRayReportGeneratorTool",  # For generating medical reports from X-rays
             # "XRayPhraseGroundingTool",  # For locating described features in X-rays
             "MedGemmaVQATool",  # Google MedGemma VQA tool
-            "MedicalRAGTool",  # For retrieval-augmented generation with medical knowledge
-            "WebBrowserTool",  # For web browsing and search capabilities
-            "DuckDuckGoSearchTool",  # For privacy-focused web search using DuckDuckGo
+            # "MedicalRAGTool",  # For retrieval-augmented generation with medical knowledge
+            # "WebBrowserTool",  # For web browsing and search capabilities
+            # "DuckDuckGoSearchTool",  # For privacy-focused web search using DuckDuckGo
         ]
 
         rag_config = RAGConfig(
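With this change every tool except MedGemmaVQATool is commented out, so the MedRAX agent can only answer through MedGemma; this effectively isolates MedGemma's contribution when comparing against the direct medgemma provider runs below.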
benchmarking/system_prompts.txt CHANGED
@@ -33,4 +33,9 @@ Your final response for a multiple-choice question must strictly follow this format:
 3. **Critical Thinking & Tool Use:** [Show your reasoning, including how you used tools and evaluated their output]
 4. **Final Answer:** \boxed{A}
 
-Do not provide a definitive diagnosis or treatment plan for a patient. Your purpose is to assist medical professionals with your analysis, not to replace them. You must maintain this persona and adhere to all instructions.
+Do not provide a definitive diagnosis or treatment plan for a patient. Your purpose is to assist medical professionals with your analysis, not to replace them. You must maintain this persona and adhere to all instructions.
+
+[MEDGEMMA_PROMPT]
+You are an expert in interpreting medical images and able to analyze medical images of any resolution, specifically chest X-rays, CT scans, and MRIs, with world-class accuracy and precision.
+
+Your final response for a multiple-choice question must strictly follow this boxed format for providing the final answer: **Final Answer:** \boxed{A}
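The file appears to use bracketed [SECTION] headers to delimit prompts, which the CLI's --system-prompt flag selects by name. A minimal sketch of how such a file could be parsed (an assumption; the repo's actual loader is not shown in this commit):

def load_prompts(path: str) -> dict[str, str]:
    """Parse a [SECTION]-delimited prompt file into {name: prompt_text}."""
    prompts: dict[str, str] = {}
    current = None
    with open(path, encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith("[") and stripped.endswith("]"):
                current = stripped[1:-1]   # e.g. "MEDGEMMA_PROMPT"
                prompts[current] = ""
            elif current is not None:
                prompts[current] += line
    return {name: text.strip() for name, text in prompts.items()}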
chestagentbench_script.sh CHANGED
@@ -12,4 +12,4 @@ module load arrow clang/18.1.8 scipy-stack
 
 source venv/bin/activate
 
-/scratch/lijunzh3/MedRAX2/venv/bin/python -m benchmarking.cli run --benchmark chestagentbench --provider google --model gemini-2.5-pro --system-prompt CHESTAGENTBENCH_PROMPT --data-dir benchmarking/data/chestagentbench --output-dir temp --max-questions 500 --temperature 0.7 --top-p 0.95 --max-tokens 10000 --concurrency 4 --random-seed 42
+/scratch/lijunzh3/MedRAX2/venv/bin/python -m benchmarking.cli run --benchmark chestagentbench --provider medrax --model gemini-2.5-pro --system-prompt CHESTAGENTBENCH_PROMPT --data-dir benchmarking/data/chestagentbench --output-dir temp --max-questions 500 --temperature 0.7 --top-p 0.95 --max-tokens 10000 --concurrency 4 --random-seed 42
medgemma_script.sh CHANGED
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#SBATCH --job-name=medgemma
+#SBATCH --job-name=medgemma3
 #SBATCH -c 4
 #SBATCH --gres=gpu:l40s:1
 #SBATCH --time=16:00:00
@@ -8,6 +8,8 @@
 #SBATCH --output=medgemma-%j.out
 #SBATCH --error=medgemma-%j.err
 
+export MEDGEMMA_DEVICE=cuda
+
 cd medrax/tools/vqa/medgemma
 
 source medgemma/bin/activate
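The exported MEDGEMMA_DEVICE variable is presumably read by the MedGemma FastAPI service to choose its compute device; a one-line sketch of that pattern (hypothetical, the service code is not in this diff):

import os

# Hypothetical device selection inside the MedGemma service.
device = os.getenv("MEDGEMMA_DEVICE", "cpu")  # this script exports "cuda"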
rexvqa_script.sh CHANGED
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-#SBATCH --job-name=rexvqa
+#SBATCH --job-name=medgemma_run2
 #SBATCH -c 4
 #SBATCH --gres=gpu:l40s:1
 #SBATCH --time=16:00:00
@@ -12,4 +12,4 @@ module load arrow clang/18.1.8 scipy-stack
 
 source venv/bin/activate
 
-/scratch/lijunzh3/MedRAX2/venv/bin/python -m benchmarking.cli run --benchmark rexvqa --provider medrax --model gemini-2.5-pro --system-prompt CHESTAGENTBENCH_PROMPT --data-dir benchmarking/data/rexvqa --output-dir temp --max-questions 500 --temperature 0.7 --top-p 0.95 --max-tokens 10000 --concurrency 4 --random-seed 42
+/scratch/lijunzh3/MedRAX2/venv/bin/python -m benchmarking.cli run --benchmark rexvqa --provider medgemma --model medgemma-4b --system-prompt MEDGEMMA_PROMPT --data-dir benchmarking/data/rexvqa --output-dir temp --max-questions 200 --temperature 0.7 --top-p 0.95 --max-tokens 10000 --concurrency 4 --random-seed 100
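With --provider medgemma and --system-prompt MEDGEMMA_PROMPT, this run bypasses the MedRAX agent entirely and sends each ReXVQA question straight to the MedGemma FastAPI service configured in medgemma_provider.py.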