Adibvafa committed on
Commit eaca108 · 0 Parent(s)

Initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +7 -0
  2. .gitignore +174 -0
  3. .vscode/launch.json +15 -0
  4. LICENSE +201 -0
  5. README.md +83 -0
  6. assets/medrax_logo.jpg +3 -0
  7. assets/medrax_logo.png +3 -0
  8. benchmark/__init__.py +0 -0
  9. benchmark/create_benchmark.py +352 -0
  10. benchmark/llm.py +42 -0
  11. benchmark/utils.py +78 -0
  12. data/eurorad_metadata.json +0 -0
  13. data/figures.py +74 -0
  14. data/get_cases.py +51 -0
  15. data/stats/age_distribution.png +3 -0
  16. data/stats/area_of_interest_distribution.png +3 -0
  17. data/stats/gender_distribution.png +3 -0
  18. demo/chest/LIDC.dcm +3 -0
  19. demo/chest/Pseudo.dcm +3 -0
  20. demo/chest/RIDER.dcm +3 -0
  21. demo/chest/TCGAA.dcm +3 -0
  22. demo/chest/__init__.py +0 -0
  23. demo/chest/effusion1.png +3 -0
  24. demo/chest/normal1.jpg +3 -0
  25. demo/chest/normal2.jpg +3 -0
  26. demo/chest/normal3.jpg +3 -0
  27. demo/chest/normal4.jpg +3 -0
  28. demo/chest/normal5.jpg +3 -0
  29. demo/chest/normal6.jpg +3 -0
  30. demo/chest/pneumonia1.jpg +3 -0
  31. demo/chest/pneumonia2.jpg +3 -0
  32. demo/chest/pneumonia3.jpg +3 -0
  33. demo/chest/pneumonia4.jpg +3 -0
  34. demo/chest/pneumonia5.jpg +3 -0
  35. experiments/README.md +63 -0
  36. experiments/analyze_axes.py +385 -0
  37. experiments/benchmark_chexagent.py +316 -0
  38. experiments/benchmark_gpt4o.py +331 -0
  39. experiments/benchmark_llama.py +443 -0
  40. experiments/benchmark_llavamed.py +541 -0
  41. experiments/benchmark_medrax.ipynb +374 -0
  42. experiments/chexbench_gpt4.py +405 -0
  43. experiments/compare_runs.py +290 -0
  44. experiments/inspect_logs.py +210 -0
  45. experiments/validate_logs.py +162 -0
  46. interface.py +259 -0
  47. main.py +63 -0
  48. medrax/__init__.py +0 -0
  49. medrax/agent/__init__.py +1 -0
  50. medrax/agent/agent.py +193 -0
.gitattributes ADDED
@@ -0,0 +1,7 @@
+ *.gif filter=lfs diff=lfs merge=lfs -text
+ *.dcm filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,174 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ # ruff
+ ruff-cache/
+ .ruff_cache/
+
+ afallah/
+
+ logs/
+
+ temp/
+
+ .gradio/
.vscode/launch.json ADDED
@@ -0,0 +1,15 @@
+ {
+     // Use IntelliSense to learn about possible attributes.
+     // Hover to view descriptions of existing attributes.
+     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+     "version": "0.2.0",
+     "configurations": [
+         {
+             "name": "Python Debugger: main.py",
+             "type": "debugpy",
+             "request": "launch",
+             "program": "main.py",
+             "console": "integratedTerminal"
+         }
+     ]
+ }
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,83 @@
+ <h1 align="center">
+ 🤖 MedRAX: Medical Reasoning Agent for Chest X-ray 🏥
+ </h1>
+ <br>
+
+ ## Problem
+ Medical professionals face significant challenges when using traditional Large Language Models (LLMs) for X-ray analysis. Standard LLMs often hallucinate, lack specialized medical imaging capabilities, and can miss critical diagnostic details. While separate tools exist for various aspects of X-ray analysis, the current fragmented approach requires doctors to juggle multiple systems, leading to inefficient workflows and potential oversights in patient care.
+ <br>
+ <br>
+
+ ## Our Solution
+ MedRAX is an intelligent medical assistant that seamlessly integrates an LLM with specialized X-ray analysis tools, providing a unified interface for comprehensive X-ray analysis. Through natural conversation, medical professionals can leverage powerful tools while the system intelligently coordinates their usage behind the scenes.
+
+ Our comprehensive toolset includes:
+ - **ChestXRayReportGenerator**: Generates detailed, accurate medical reports from X-ray images
+ - **ChestXRayClassifier**: Analyzes images for 18 different pathologies, providing probability scores for each condition
+ - **ChestXRaySegmentation**: Precisely segments anatomical structures
+ - **MedicalVisualQA**: Answers complex visual medical queries
+ - **XRayPhraseGrounding**: Locates and visualizes specific medical findings in X-rays with bounding box precision
+ - **ImageVisualizer**: Enhances and displays X-ray images for optimal viewing
+ - **ChestXRayGenerator**: Generates synthetic chest X-rays for educational purposes
+ - **DicomProcessor**: Handles DICOM file processing and analysis
+ <br>
+
+ ## Technical Implementation
+ MedRAX is built on a robust technical foundation:
+ - **Core Architecture**: Leverages LangChain and LangGraph for sophisticated agent orchestration
+ - **Language Model**: Powered by OpenAI's API for natural language understanding and generation
+ - **Specialized Tools**: Integrates medical-domain fine-tuned models for various analysis tasks
+ - **Interface**: Built with Gradio for an intuitive, chat-based user experience
+ - **Modular Design**: Allows easy integration of additional specialized medical tools
+ <br>
+
+ ## Potential Impact
+ - Accelerates X-ray analysis while maintaining high accuracy
+ - Reduces the likelihood of missed diagnoses through multi-tool verification
+ - Provides valuable educational support for medical students and residents
+ - Offers a scalable solution for facilities with limited specialist availability
+ - Improves patient outcomes through comprehensive analysis
+ - Streamlines workflow for medical professionals
+ <br>
+
+ ## Setup and Usage
+
+ ### Prerequisites
+ - GPU required for optimal performance
+ - Python 3.8+
+ - OpenAI API key
+
+ ### Installation
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/MedRAX.git
+ cd MedRAX
+ ```
+
+ 2. Install dependencies:
+ ```bash
+ pip install -e .
+ ```
+
+ 3. Set up environment variables:
+ ```bash
+ echo "OPENAI_API_KEY=your_key_here" > .env
+ ```
+
+ ### Running the Application
+ Start the application:
+ ```bash
+ python main.py
+ ```
+ <br>
+
+ ## Developers
+ - Adibvafa Fallahpour
+ - Jun Ma
+ - Hongwei Lyu
+ <br>
+
+ ---
+ <p align="center">
+ Made with ❤️ in Toronto
+ </p>
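To make the README's Technical Implementation section concrete, here is a minimal, illustrative sketch of how an LLM and one specialized X-ray tool can be wired together with LangChain and LangGraph. The tool body, model name, and image path are placeholders for illustration only; MedRAX's actual agent lives in `medrax/agent/agent.py`, added later in this commit.

```python
# Illustrative sketch only: an LLM orchestrating a specialized tool via LangGraph.
# The tool below is a stub, not MedRAX's real ChestXRayClassifier.
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent


@tool
def classify_chest_xray(image_path: str) -> dict:
    """Return pathology probabilities for a chest X-ray image."""
    # A real tool would load the image and run a fine-tuned classifier here.
    return {"effusion": 0.07, "pneumonia": 0.81}


llm = ChatOpenAI(model="gpt-4o")  # model choice is an assumption
agent = create_react_agent(llm, tools=[classify_chest_xray])

result = agent.invoke(
    {"messages": [("user", "Does demo/chest/pneumonia1.jpg show signs of pneumonia?")]}
)
print(result["messages"][-1].content)
```

The point of the pattern is that the LLM decides when to call the tool and how to fold its output back into the conversation, which is the coordination behavior the README describes.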
assets/medrax_logo.jpg ADDED

Git LFS Details

  • SHA256: 306aa20d47067df102e4ba26d637f22a7d95f449a5969d320ceeca03b71da1d1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.45 MB
assets/medrax_logo.png ADDED

Git LFS Details

  • SHA256: 5af3f42308022abe028b670e6716152e714c1f25ebbe6375532775a557b66b2c
  • Pointer size: 131 Bytes
  • Size of remote file: 148 kB
benchmark/__init__.py ADDED
File without changes
benchmark/create_benchmark.py ADDED
@@ -0,0 +1,352 @@
+ #!/usr/bin/env python3
+ """
+ Medical X-ray Question Generation Benchmark aka ChestAgentBench
+
+ This script generates clinical questions from X-ray case data of the Eurorad dataset using GPT-4o.
+ It structures questions across different analytical categories and saves them as JSON.
+ """
+
+ import os
+ import re
+ import json
+ from typing import *
+ from pprint import pprint
+
+ import openai
+ import numpy as np
+ from scipy import stats
+ import plotly.graph_objects as go
+ from tqdm import tqdm
+
+ from benchmark.utils import load_eurorad_dataset
+ from benchmark.llm import get_llm_response
+
+ # Constants
+ DATA_DIR = "set your data directory here, e.g. /home/MedRAX/data"
+ DATASET_PATH = os.path.join(DATA_DIR, "eurorad_metadata.json")
+
+ SYSTEM_PROMPT = """
+ You are an expert medical benchmark creation assistant.
+ Your goal is to generate questions that evaluate a multimodal medical AI agent's ability to interpret and reason about chest X-rays.
+ """.strip()
+
+ CATEGORIES_META = {
+     "detection": "Identify and locate specific findings in the chest X-ray.",
+     "classification": "Determine whether specific findings are present or absent in the chest X-ray.",
+     "enumeration": "Count the number of target findings in the chest X-ray.",
+     "localization": "Locate a given finding in the chest X-ray.",
+     "comparison": "Compare the size or position of a specific finding in the chest X-ray.",
+     "relationship": "Determine the relationship between two or more findings in the chest X-ray.",
+     "diagnosis": "Make a diagnosis or determine a treatment plan by interpreting the chest X-ray.",
+     "characterization": "Describe specific attributes (shape, density, margins, etc.) of findings.",
+     "reasoning": "Explain the medical rationale and thought process behind findings and conclusions.",
+ }
+ CATEGORIES = list(CATEGORIES_META.keys())
+
+ CATEGORY_COMBINATIONS = [
+     ["detection", "localization", "characterization", "reasoning"],  # Detailed Finding Analysis
+     ["detection", "classification", "relationship", "reasoning"],  # Pattern Recognition & Relations
+     ["localization", "comparison", "relationship", "reasoning"],  # Spatial Understanding
+     ["classification", "comparison", "diagnosis", "reasoning"],  # Clinical Decision Making
+     ["classification", "characterization", "diagnosis", "reasoning"],  # Diagnostic Characterization
+ ]
+
+ DEFAULT_SECTIONS = [
+     "history",
+     "image_finding",
+     "discussion",
+     "differential_diagnosis",
+     "diagnosis",
+     "figures",
+ ]
+
+
+ class Question:
+     """A class to generate clinical questions from case data.
+
+     This class handles creating structured clinical questions by combining case data with
+     specified categories and difficulty levels.
+
+     Attributes:
+         type (str): The type of question (e.g. multiple choice)
+         difficulty (str): Difficulty level of the question
+         case_data (Dict[str, Any]): Dictionary containing the clinical case data
+         case_content (str): Formatted case data from selected sections
+         case_id (str): Unique identifier for the case
+         categories (List[str]): List of analytical categories this question tests
+         sections (List[str]): Case sections to include in question
+         raw_content (Optional[str]): Raw LLM response to the question prompt
+         content (Optional[Dict[str, str]]): Extracted content from the raw LLM response
+     """
+
+     def __init__(
+         self,
+         type: str,
+         difficulty: str,
+         case_data: Dict[str, Any],
+         categories: List[str],
+         sections: List[str] = [
+             "history",
+             "image_finding",
+             "discussion",
+             "differential_diagnosis",
+             "diagnosis",
+             "figures",
+         ],
+         system_prompt: str = "You are an expert medical benchmark creation assistant.",
+     ) -> None:
+         self.type = type
+         self.difficulty = difficulty
+         self.case_data = case_data
+         self.case_id = case_data["case_id"]
+         self.categories = categories
+         self.sections = sections
+         self.system_prompt = system_prompt
+         self.case_content = self.select_case_sections()
+         self.raw_content: Optional[str] = None
+         self.content: Optional[Dict[str, str]] = None
+
+     def create_question_prompt(self) -> str:
+         """Creates a formatted prompt for generating a clinical question.
+
+         Returns:
+             str: A structured prompt containing the question parameters and clinical data
+         """
+         category_descriptions = "\n".join(
+             f"{category}: {desc}"
+             for category, desc in CATEGORIES_META.items()
+             if category in self.categories
+         )
+
+         return f"""
+         You must follow these guidelines:
+         1. Questions must be answerable using only context and chest X-rays.
+             - Questions must explicitly mention the referenced figures
+             - Questions can only reference the chest X-ray figures
+
+         2. Questions must have unambiguous, verifiable answers, and should:
+             - Challenge the agent's analytical capabilities
+             - Require multi-step reasoning
+             - Test ability to make precise observations
+             - Evaluate capability to derive insights and findings from the chest X-ray
+
+         3. The agent has access to tools like classification, report generation, segmentation, grounding, visual question answering, etc. Your question should be complex to require the use of such tools.
+
+
+         Create a {self.difficulty} {self.type} clinical question that integrates the following:
+
+         {category_descriptions}
+
+         based on the following clinical case:
+
+         {self.case_content}
+
+         Do not use any information derived from the CT and MRI images. Do not provide any information and findings about the chest X-rays.
+         Your question should require the agent to derive insights and findings from the chest X-ray by itself.
+         Your answer should be verifiable directly in the context of the case.
+         You can only use the image findings that come from the chest X-ray figures.
+
+         Your response must follow this exact format:
+         THOUGHTS: [Think about different reasoning steps and tools the agent should use to answer the question]
+         QUESTION: [complete question with relevant context. Incorrect choices should be very close to the correct answer.]
+         FIGURES: [list of required figures, e.g. ["Figure 1", "Figure 2a"]]
+         EXPLANATION: [short explanation of why your answer is verifiable in the case]
+         ANSWER: [correct answer e.g. "A"]
+         """.strip().replace(
+             "    ", ""
+         )  # strip the prompt's leading indentation
+
+     def select_case_sections(self) -> str:
+         """Extract and format selected sections from case data into paragraphs.
+
+         Returns:
+             str: Formatted string with case sections and content
+         """
+         section_mapping = {
+             "history": ("history", "No history provided."),
+             "image_finding": ("image_finding", "No findings provided."),
+             "discussion": ("discussion", "No discussion provided."),
+             "differential_diagnosis": (
+                 "differential_diagnosis",
+                 "No differential diagnosis provided.",
+             ),
+             "diagnosis": ("diagnosis", "No diagnosis provided."),
+             "figures": ("figures", "No figures provided."),
+         }
+
+         formatted = []
+         for section in self.sections:
+             if section in section_mapping:
+                 key, default = section_mapping[section]
+                 content = self.case_data.get(key, default)
+
+                 if key == "figures":
+                     figures_text = []
+                     for figure in content:
+                         for subfig in figure["subfigures"]:
+                             figures_text.append(f"{subfig['number']}: {subfig['caption']}")
+                     content = "\n".join(figures_text)
+
+                 formatted.append(f"{section}:\n{content}")
+
+         return "\n\n".join(formatted)
+
+     def create_question(
+         self,
+         client: openai.OpenAI,
+         temperature: float = 0.7,
+         top_p: float = 0.95,
+         max_tokens: int = 500,
+         model: str = "gpt-4o",
+     ) -> str:
+         """Create a clinical question using LLM.
+
+         Args:
+             client (openai.OpenAI): OpenAI client instance
+             temperature (float): Controls randomness in responses. Defaults to 0.7.
+             top_p (float): Controls diversity via nucleus sampling. Defaults to 0.95.
+             max_tokens (int): Max tokens in model response. Defaults to 500.
+             model (str): OpenAI model to use. Defaults to "gpt-4o".
+
+         Returns:
+             str: LLM response containing formatted question components
+         """
+         self.raw_content = get_llm_response(
+             client=client,
+             prompt=self.create_question_prompt(),
+             system_prompt=self.system_prompt,
+             temperature=temperature,
+             top_p=top_p,
+             max_tokens=max_tokens,
+             model=model,
+         )
+         self.content = self.extract_content()
+
+         return self.raw_content
+
+     def extract_content(self) -> Dict[str, str]:
+         """Extract sections from raw LLM response using regex patterns.
+
+         Returns:
+             Dict[str, str]: Extracted sections including thoughts, question, figures, explanation, and answer
+         """
+         keywords = ["THOUGHTS", "QUESTION", "FIGURES", "EXPLANATION", "ANSWER"]
+
+         content = {}
+         for kw in keywords:
+             pattern = rf"{kw}:\s*(.*?)(?=\n[A-Z]+:|$)"
+             match = re.search(pattern, self.raw_content, re.DOTALL)
+             content[kw.lower()] = match.group(1).strip() if match else None
+
+         return content
+
+     def save(self, output_path: str) -> Dict[str, Any]:
+         """Save question content and metadata as a JSON file.
+
+         Args:
+             output_path (str): Directory path where the JSON file will be saved
+
+         Returns:
+             Dict[str, Any]: Question data including content (thoughts, question, figures, options,
+                 explanation, answer) and metadata (type, difficulty, categories, etc.)
+         """
+         question_metadata = self.content.copy()
+
+         # Add metadata
+         question_metadata["metadata"] = {
+             "case_id": self.case_id,
+             "type": self.type,
+             "difficulty": self.difficulty,
+             "categories": self.categories,
+             "sections": self.sections,
+         }
+
+         # Create a directory for the case
+         case_dir = os.path.join(output_path, str(self.case_id))
+         os.makedirs(case_dir, exist_ok=True)
+
+         # Save the question metadata to a JSON file
+         output_file = os.path.join(case_dir, f"{self.case_id}_{self.__hash__()}.json")
+         with open(output_file, "w") as f:
+             json.dump(question_metadata, f, indent=2)
+
+         return question_metadata
+
+
+ def generate_questions(
+     dataset: Dict[str, Any],
+     client: openai.OpenAI,
+     output_dir: str,
+     skip_first: int = 100,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     max_tokens: int = 1200,
+     model: str = "gpt-4o",
+ ) -> None:
+     """Generate questions for each case and category combination.
+
+     Args:
+         dataset: Dictionary of case data
+         client: OpenAI client instance
+         output_dir: Directory to save generated questions
+         skip_first: Number of initial cases to skip
+         temperature: LLM temperature parameter
+         top_p: LLM top_p parameter
+         max_tokens: Maximum tokens for LLM response
+         model: LLM model name
+     """
+     target_cases = sorted(list(dataset.keys()), key=int)[-len(dataset) : -skip_first]
+
+     for case_id in tqdm(target_cases, desc="Processing cases"):
+         case_data = dataset[case_id]
+
+         for category in tqdm(CATEGORY_COMBINATIONS, desc=f"Categories for case {case_id}"):
+             question = Question(
+                 type="multiple choice (A/B/C/D/E/F)",
+                 difficulty="complex",
+                 case_data=case_data,
+                 categories=category,
+                 sections=DEFAULT_SECTIONS,
+                 system_prompt=SYSTEM_PROMPT,
+             )
+
+             response = question.create_question(
+                 client=client,
+                 temperature=temperature,
+                 top_p=top_p,
+                 max_tokens=max_tokens,
+                 model=model,
+             )
+             question.save(output_dir)
+
+
+ def main():
+     """Main execution function."""
+     client = openai.OpenAI()
+
+     # Load and verify dataset
+     dataset = load_eurorad_dataset(
+         DATASET_PATH,
+         section="Chest Imaging",
+         as_dict=True,
+         filter_by_caption=[
+             "xray",
+             "x-ray",
+             "x ray",
+             "ray",
+             "xr",
+             "radiograph",
+         ],
+     )
+     print(f"\n---\nFound {len(dataset)} cases with X-ray mentions\n---\n")
+
+     # Optional: Print sample case for verification
+     case_data = dataset["16798"]
+     pprint(case_data, sort_dicts=False)
+
+     # Generate questions
+     generate_questions(dataset=dataset, client=client, output_dir="benchmark/questions")
+
+
+ if __name__ == "__main__":
+     main()
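For orientation, a single question can also be generated outside of `generate_questions`. The sketch below uses only names defined in the file above; the dataset path is illustrative and an `OPENAI_API_KEY` must be available in the environment.

```python
import openai

from benchmark.create_benchmark import DEFAULT_SECTIONS, SYSTEM_PROMPT, Question
from benchmark.utils import load_eurorad_dataset

client = openai.OpenAI()
dataset = load_eurorad_dataset(
    "data/eurorad_metadata.json", section="Chest Imaging", as_dict=True  # illustrative path
)
case_data = next(iter(dataset.values()))  # pick any chest imaging case

question = Question(
    type="multiple choice (A/B/C/D/E/F)",
    difficulty="complex",
    case_data=case_data,
    categories=["detection", "localization", "characterization", "reasoning"],
    sections=DEFAULT_SECTIONS,
    system_prompt=SYSTEM_PROMPT,
)
question.create_question(client=client)  # calls the LLM and parses its response
print(question.content["question"])      # the extracted QUESTION section
question.save("benchmark/questions")     # writes benchmark/questions/<case_id>/<case_id>_<hash>.json
```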
benchmark/llm.py ADDED
@@ -0,0 +1,42 @@
+ import openai
+ from typing import List
+
+
+ def get_llm_response(
+     client: openai.OpenAI,
+     prompt: str,
+     system_prompt: str = "You are a helpful assistant.",
+     model: str = "gpt-4o-mini",
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     max_tokens: int = 500,
+ ) -> str:
+     """
+     Get response from OpenAI language model.
+
+     Args:
+         client (openai.OpenAI): OpenAI client
+         prompt (str): The user prompt/question to send to the model
+         system_prompt (str, optional): System prompt to set model behavior.
+         model (str, optional): OpenAI model to use. Defaults to "gpt-4o-mini".
+         temperature (float, optional): Controls randomness in responses. Defaults to 0.7.
+         top_p (float, optional): Controls diversity via nucleus sampling. Defaults to 0.95.
+         max_tokens (int, optional): Max tokens in model response. Defaults to 500.
+
+     Returns:
+         str: The model's response text
+     """
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt},
+     ]
+
+     response = client.chat.completions.create(
+         model=model,
+         messages=messages,
+         temperature=temperature,
+         top_p=top_p,
+         max_tokens=max_tokens,
+     )
+
+     return response.choices[0].message.content
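A minimal usage sketch of the helper above (the prompt is illustrative; `OPENAI_API_KEY` must be set for the client to authenticate):

```python
import openai

from benchmark.llm import get_llm_response

client = openai.OpenAI()  # reads OPENAI_API_KEY from the environment
reply = get_llm_response(
    client=client,
    prompt="List three common causes of a unilateral pleural effusion.",
    model="gpt-4o-mini",
    max_tokens=200,
)
print(reply)
```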
benchmark/utils.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ import json
+ from typing import Dict, List
+
+
+ def load_eurorad_dataset(
+     dataset_path: str,
+     section: str = "any",
+     as_dict: bool = False,
+     filter_by_caption: List[str] = [
+         "xray",
+         "x-ray",
+         "x ray",
+         "ray",
+         "xr",
+         "radiograph",
+         "radiogram",
+         "plain film",
+     ],
+ ) -> List[Dict] | Dict[str, Dict]:
+     """
+     Load a dataset from a JSON file.
+
+     Args:
+         dataset_path (str): Path to the JSON dataset file.
+         section (str, optional): Section of the dataset to load. Defaults to "any".
+         as_dict (bool, optional): Whether to return data as dict. Defaults to False.
+         filter_by_caption (List[str], optional): List of strings to filter cases by caption content.
+             Defaults to common X-ray related terms.
+
+     Returns:
+         List[Dict] | Dict[str, Dict]: The loaded dataset as a list of dictionaries or dict if as_dict=True.
+
+     Raises:
+         FileNotFoundError: If dataset_path does not exist
+         json.JSONDecodeError: If file is not valid JSON
+     """
+
+     with open(dataset_path, "r", encoding="utf-8") as file:
+         data = json.load(file)
+
+     if filter_by_caption:
+         filtered_data = {}
+         for case_id, case in data.items():
+             if any(
+                 any(x in subfig["caption"].lower() for x in filter_by_caption)
+                 for figure in case["figures"]
+                 for subfig in figure["subfigures"]
+             ) or any(x in case["image_finding"].lower() for x in filter_by_caption):
+                 filtered_data[case_id] = case
+         data = filtered_data
+
+     if section != "any":
+         section = section.strip().lower()
+         if not as_dict:
+             data = [
+                 item for item in data.values() if item.get("section", "").strip().lower() == section
+             ]
+         else:
+             data = {
+                 k: v for k, v in data.items() if v.get("section", "").strip().lower() == section
+             }
+
+     elif not as_dict:
+         data = list(data.values())
+
+     return data
+
+
+ def save_dataset(dataset: Dict | List[Dict], dataset_path: str):
+     """
+     Save a dataset to a JSON file.
+
+     Args:
+         dataset (Dict | List[Dict]): The dataset to save as a dictionary or list of dictionaries.
+         dataset_path (str): Path where the JSON dataset file will be saved.
+     """
+     with open(dataset_path, "w", encoding="utf-8") as file:
+         json.dump(dataset, file)
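A short sketch of how these two helpers fit together (file paths are illustrative):

```python
from benchmark.utils import load_eurorad_dataset, save_dataset

# Keep only "Chest Imaging" cases whose figure captions or image findings mention an X-ray.
cases = load_eurorad_dataset(
    "data/eurorad_metadata.json",  # illustrative path
    section="Chest Imaging",
    as_dict=True,
)
print(f"{len(cases)} chest X-ray cases loaded")

# Persist the filtered subset for later question generation.
save_dataset(cases, "data/eurorad_chest_xray_cases.json")  # illustrative output path
```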
data/eurorad_metadata.json ADDED
The diff for this file is too large to render.
 
data/figures.py ADDED
@@ -0,0 +1,74 @@
+ import json
+ import os
+ from pathlib import Path
+ import requests
+ from tqdm import tqdm
+
+
+ def download_eurorad_figures(metadata_path: str, output_dir: str) -> None:
+     """
+     Download figures from the Eurorad dataset and save them organized by case_id.
+
+     Args:
+         metadata_path: Path to the eurorad_metadata.json file
+         output_dir: Base directory where figures will be saved
+
+     The figures will be saved as:
+         {output_dir}/{case_id}/{figure_number}.jpg
+     Example:
+         figures/189/figure_1a.jpg
+     """
+     # Create output directory if it doesn't exist
+     output_path = Path(output_dir)
+     output_path.mkdir(exist_ok=True)
+
+     # Load metadata
+     with open(metadata_path) as f:
+         metadata = json.load(f)
+
+     # Iterate through all cases with progress bar
+     for case_id in tqdm(metadata, desc="Downloading cases", unit="case"):
+         case = metadata[case_id]
+         case_dir = output_path / str(case["case_id"])
+         case_dir.mkdir(exist_ok=True)
+
+         # Process all figures and their subfigures
+         for figure in case["figures"]:
+             for subfig in figure["subfigures"]:
+
+                 # Remove leading and trailing whitespace and convert to lowercase
+                 subfig_name = f"{subfig['number'].strip().replace(' ', '_').lower()}.jpg"
+                 subfig_path = Path(case_dir) / subfig_name
+
+                 save_figure(
+                     url=subfig["url"],
+                     output_path=subfig_path,
+                 )
+
+
+ def save_figure(url: str, output_path: Path) -> None:
+     """
+     Download and save a single figure.
+
+     Args:
+         url: URL of the figure to download
+         output_path: Path where the figure should be saved
+     """
+     if output_path.exists():
+         return
+
+     try:
+         response = requests.get(url, timeout=10)
+         response.raise_for_status()
+         with open(output_path, "wb") as f:
+             f.write(response.content)
+     except Exception as e:
+         print(f"Error downloading {url}: {e}")
+
+
+ if __name__ == "__main__":
+     root = os.path.dirname(os.path.abspath(__file__))
+     download_eurorad_figures(
+         metadata_path=os.path.join(root, "eurorad_metadata.json"),
+         output_dir=os.path.join(root, "figures"),
+     )
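The downloader can also be called from other code rather than run as a script; a minimal sketch (paths are illustrative, and it assumes the `data/` directory is importable from the project root):

```python
from data.figures import download_eurorad_figures  # import path assumes repo root on sys.path

# Saves each subfigure to <output_dir>/<case_id>/<figure_number>.jpg and
# skips files that already exist, so the call is safe to re-run.
download_eurorad_figures(
    metadata_path="data/eurorad_metadata.json",
    output_dir="data/figures",
)
```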
data/get_cases.py ADDED
@@ -0,0 +1,51 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import time
+ import json
+ from tqdm import tqdm
+
+
+ def get_response(url):
+     headers = {
+         "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54"
+     }
+     return requests.get(url, headers=headers)
+
+ def get_case_numbers_from_page(page):
+     url = f"https://www.eurorad.org/advanced-search?sort_by=published_at&sort_order=ASC&page={page}&filter%5B0%5D=section%3A40"
+
+     # Request the page directly (going through a proxy tends to trigger the site's bot protection)
+     response = get_response(url)
+     print(response.text)
+
+     soup = BeautifulSoup(response.text, "html.parser")
+     spans = soup.find_all("span", class_="case__number small")
+
+     # Remove '#' from the span text and strip extra whitespace
+     numbers = [span.text.strip().replace("#", "").strip() for span in spans]
+     return numbers
+
+
+ def main():
+     total_pages = 107  # Pages 0 through 106
+     all_numbers = []
+
+     for page in tqdm(range(total_pages)):
+         numbers = get_case_numbers_from_page(page)
+         all_numbers.extend(numbers)
+
+         if page != total_pages - 1 and len(numbers) != 9:
+             print(f"Warning: Page {page} returned {len(numbers)} cases instead of 9")
+
+         # Be kind to the server – avoid hitting it too fast
+         time.sleep(1)
+
+     with open('case_numbers.json', 'w') as f:
+         json.dump(all_numbers, f)
+
+     print(f"Saved {len(all_numbers)} case numbers to case_numbers.json")
+
+
+ if __name__ == "__main__":
+     main()
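A small sketch of using the scraper for a couple of result pages without writing the JSON file (page numbers are illustrative, and the import assumes the repo root is on the import path):

```python
from data.get_cases import get_case_numbers_from_page  # assumes repo root on sys.path

# Each Eurorad advanced-search page in the chest section lists up to 9 cases.
for page in range(2):
    case_numbers = get_case_numbers_from_page(page)
    print(f"page {page}: {case_numbers}")
```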
data/stats/age_distribution.png ADDED

Git LFS Details

  • SHA256: 0409ec03f305ccd8fdee1c097dede52b7cf0f84f05b99fbd18727fb8e67238ad
  • Pointer size: 132 Bytes
  • Size of remote file: 2.71 MB
data/stats/area_of_interest_distribution.png ADDED

Git LFS Details

  • SHA256: 2a80d9aa1bf9b025b8aaa2b1c0d4807e36afc175747ba71b500ef1ceaf542081
  • Pointer size: 132 Bytes
  • Size of remote file: 2.91 MB
data/stats/gender_distribution.png ADDED

Git LFS Details

  • SHA256: a4cfd37f71fc91a848d990f6e2ff6c9611f555e09e435885680ffbbb85458838
  • Pointer size: 132 Bytes
  • Size of remote file: 1.96 MB
demo/chest/LIDC.dcm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11d25b1d34dff083057de994fef7da3dcef75bd7b334823ec6cb9c16b3ba0338
+ size 17071804
demo/chest/Pseudo.dcm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b35ae460fb5f62eb6d6c4c5117f6683100ad92c5fb6ba1a3c36da39703c4652
+ size 7535280
demo/chest/RIDER.dcm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dc15f7afa5434991e1359f596433870ad611b42227db87d484d31976545de7fd
+ size 7534066
demo/chest/TCGAA.dcm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e8137290ac823d3da3c00ce3e18120123eaa62a786934c7afc52a989b0b64cf
+ size 7535274
demo/chest/__init__.py ADDED
File without changes
demo/chest/effusion1.png ADDED

Git LFS Details

  • SHA256: ba5af84601f11ab44142e5dfaf578b49d76de45633470e606c7edc4b1c77ba07
  • Pointer size: 131 Bytes
  • Size of remote file: 233 kB
demo/chest/normal1.jpg ADDED

Git LFS Details

  • SHA256: 785419c9ec7d0235fe056c254cd3be785d6052b558ae32c595ad558be57062dd
  • Pointer size: 132 Bytes
  • Size of remote file: 1.05 MB
demo/chest/normal2.jpg ADDED

Git LFS Details

  • SHA256: cecf56a8b90e9ccb3c54641beb40652e72a2bdcb311efc696a331fe4de7efbf0
  • Pointer size: 131 Bytes
  • Size of remote file: 798 kB
demo/chest/normal3.jpg ADDED

Git LFS Details

  • SHA256: 3f721831529e9604c99e3bd999483321e0e0648c5987351570fe45e48c190948
  • Pointer size: 132 Bytes
  • Size of remote file: 1.43 MB
demo/chest/normal4.jpg ADDED

Git LFS Details

  • SHA256: ed84d75328f1eb80c6554e3c6ba8dcd573e733914b2934bfce399ae6e8f38ec4
  • Pointer size: 131 Bytes
  • Size of remote file: 566 kB
demo/chest/normal5.jpg ADDED

Git LFS Details

  • SHA256: 9e7c4251d9b300f9256c6fe72ef1c3167beeecca747e6b9c8b80ee3260ea9ac8
  • Pointer size: 131 Bytes
  • Size of remote file: 353 kB
demo/chest/normal6.jpg ADDED

Git LFS Details

  • SHA256: 4b47dd1665b828ab3610d1a60ec08c37083579f834b2dd5891570c8a105825a5
  • Pointer size: 131 Bytes
  • Size of remote file: 387 kB
demo/chest/pneumonia1.jpg ADDED

Git LFS Details

  • SHA256: 92d1c1e3334b1dd8f1d5eea56681adeb38dc5b7c8dd17536fb0e47fc701c5ae1
  • Pointer size: 130 Bytes
  • Size of remote file: 35.6 kB
demo/chest/pneumonia2.jpg ADDED

Git LFS Details

  • SHA256: eb17ab7b6f63d0f0078c378a1cc1debbffffd6331cb2723f7169410a738287fa
  • Pointer size: 130 Bytes
  • Size of remote file: 56.7 kB
demo/chest/pneumonia3.jpg ADDED

Git LFS Details

  • SHA256: dd41c787362b60e03037b6658f3824068ea268d83915904efb09aae95e10bd72
  • Pointer size: 130 Bytes
  • Size of remote file: 81.7 kB
demo/chest/pneumonia4.jpg ADDED

Git LFS Details

  • SHA256: 8223cf57d33d1528782f83b62d3d62d2f41fe9bf34053553a86e609c2b2ba94b
  • Pointer size: 131 Bytes
  • Size of remote file: 109 kB
demo/chest/pneumonia5.jpg ADDED

Git LFS Details

  • SHA256: 59bee7e6a36e7629a320e1c74d65dd0683c8310dbbb2489f5d32054419a3a667
  • Pointer size: 131 Bytes
  • Size of remote file: 153 kB
experiments/README.md ADDED
@@ -0,0 +1,63 @@
+ # Experiments
+ Below are the instructions for running experiments using our novel ChestAgentBench and the previous SoTA CheXbench. ChestAgentBench is a comprehensive benchmark containing over 2,500 complex medical queries across 8 diverse categories.
+
+ ### ChestAgentBench
+
+ To run GPT-4o on ChestAgentBench, enter the `experiments` directory and run the following script:
+ ```bash
+ python benchmark_gpt4o.py
+ ```
+
+ To run Llama 3.2 Vision 90B on ChestAgentBench, run the following:
+ ```bash
+ python benchmark_llama.py
+ ```
+
+ To run CheXagent on ChestAgentBench, run the following:
+ ```bash
+ python benchmark_chexagent.py
+ ```
+
+ To run LLaVA-Med on ChestAgentBench, you'll need to clone their repo, follow their setup instructions, and then copy the following script into it:
+ ```bash
+ mv benchmark_llavamed.py ~/LLaVA-Med/llava/serve
+ python -m llava.serve.benchmark_llavamed --model-name llava-med-v1.5-mistral-7b --controller http://localhost:10000
+ ```
+
+ If you want to inspect the logs, you can run the following. It will select the most recent log file by default.
+ ```bash
+ python inspect_logs.py [optional: log-file] -n [num-logs]
+ ```
+
+ Finally, to analyze results, run:
+ ```bash
+ python analyze_axes.py results/[logfile].json ../benchmark/questions/ --model [gpt4|llama|chexagent|llava-med] --max-questions [optional:int]
+ ```
+
+ ### CheXbench
+
+ To run the models on CheXbench, you can use `chexbench_gpt4.py` as a reference. You'll need to download the dataset files locally and upload them for each request. Rad-ReStruct and Open-I use the same set of images, so you can download the `NLMCXR.zip` file just once and copy the images to both directories.
+
+ You can find the datasets here:
+ 1. [SLAKE: A Semantically-Labeled Knowledge-Enhanced Dataset for Medical Visual Question Answering](https://www.med-vqa.com/slake/). Save this to `MedMAX/data/slake`.
+ 2. [Rad-ReStruct: A Novel VQA Benchmark and Method for Structured Radiology Reporting](https://github.com/ChantalMP/Rad-ReStruct). Save the images to `MedMAX/data/rad-restruct/images`.
+ 3. [Open-I Service of the National Library of Medicine](https://openi.nlm.nih.gov/faq). Save the images to `MedMAX/data/openi/images`.
+
+ Once you're finished, you'll want to fix the paths in the `chexbench.json` file to your local paths using the `MedMax/data/fix_chexbench.py` script.
+
+ ### Compare Runs
+ Analyze a single file based on overall accuracy and along different axes:
+ ```
+ python compare_runs.py results/medmax.json
+ ```
+
+ For a direct evaluation comparing **2** models on the exact same questions:
+ ```
+ python compare_runs.py results/medmax.json results/gpt4o.json
+ ```
+
+ For a direct evaluation comparing **ALL** models on the exact same questions (add as many model log files as you want):
+ ```
+ python compare_runs.py results/medmax.json results/gpt4o.json results/llama.json results/chexagent.json results/llavamed.json
+ ```
experiments/analyze_axes.py ADDED
@@ -0,0 +1,385 @@
+ from typing import Dict, List, Optional, Tuple, Union, Any
+ import json
+ import os
+ import sys
+ import argparse
+ from collections import defaultdict
+ from tqdm import tqdm
+
+ QUESTION_TYPES = {
+     "Detailed Finding Analysis": ["detection", "localization", "characterization"],
+     "Pattern Recognition & Relations": ["detection", "classification", "relationship"],
+     "Spatial Understanding": ["localization", "comparison", "relationship"],
+     "Clinical Decision Making": ["classification", "comparison", "diagnosis"],
+     "Diagnostic Classification": ["classification", "characterization", "diagnosis"],
+ }
+
+
+ def extract_answer_letter(answer: Optional[Union[str, Any]]) -> Optional[str]:
+     """
+     Extract just the letter from various answer formats.
+
+     Args:
+         answer: The answer text to extract letter from
+
+     Returns:
+         Optional[str]: The extracted letter in uppercase, or None if no letter found
+     """
+     if not answer:
+         return None
+
+     # Convert to string and clean
+     answer = str(answer).strip()
+
+     # If it's just a single letter, return it
+     if len(answer) == 1 and answer.isalpha():
+         return answer.upper()
+
+     # Try to extract letter from format like "A)" or "A."
+     if len(answer) >= 2 and answer[0].isalpha() and answer[1] in ").:- ":
+         return answer[0].upper()
+
+     # Try to extract letter from format like "A) Some text"
+     if answer.startswith(("A)", "B)", "C)", "D)", "E)", "F)")):
+         return answer[0].upper()
+
+     return None
+
+
+ def analyze_gpt4_results(
+     results_file: str, max_questions: Optional[int] = None
+ ) -> Tuple[float, Dict, Dict, List[str], List[str]]:
+     """
+     Analyze results in GPT-4 format.
+
+     Args:
+         results_file: Path to results file
+         max_questions: Maximum number of questions to analyze
+
+     Returns:
+         Tuple containing:
+             - overall_accuracy (float)
+             - category_accuracies (Dict)
+             - question_type_stats (Dict)
+             - correct_ids (List[str])
+             - incorrect_ids (List[str])
+     """
+     category_performance = defaultdict(lambda: {"total": 0, "correct": 0})
+     all_questions = 0
+     all_correct = 0
+     correct_ids = []
+     incorrect_ids = []
+
+     with open(results_file, "r") as f:
+         lines = f.readlines()
+
+     processed_questions = 0
+
+     for line in tqdm(lines, desc="Analyzing Benchmark Results"):
+         # Check if we've hit the maximum questions
+         if max_questions is not None and processed_questions >= max_questions:
+             break
+         if line.startswith("HTTP Request:"):
+             continue
+
+         try:
+             entry = json.loads(line)
+             metadata = entry.get("input", {}).get("question_data", {}).get("metadata", {})
+             question_id = entry.get("question_id")
+
+             model_letter = extract_answer_letter(entry.get("model_answer"))
+             correct_letter = extract_answer_letter(entry.get("correct_answer"))
+
+             if model_letter and correct_letter:
+                 all_questions += 1
+                 processed_questions += 1
+                 is_correct = model_letter == correct_letter
+
+                 if is_correct:
+                     all_correct += 1
+                     correct_ids.append(question_id)
+                 else:
+                     incorrect_ids.append(question_id)
+
+                 for category in metadata.get("categories", []):
+                     category_performance[category]["total"] += 1
+                     if is_correct:
+                         category_performance[category]["correct"] += 1
+
+         except json.JSONDecodeError:
+             continue
+
+     return process_results(
+         category_performance, all_questions, all_correct, correct_ids, incorrect_ids
+     )
+
+
+ def analyze_llama_results(
+     results_file: str, max_questions: Optional[int] = None
+ ) -> Tuple[float, Dict, Dict, List[str], List[str]]:
+     """
+     Analyze results in Llama format.
+
+     Args:
+         results_file: Path to results file
+         max_questions: Maximum number of questions to analyze
+
+     Returns:
+         Tuple containing:
+             - overall_accuracy (float)
+             - category_accuracies (Dict)
+             - question_type_stats (Dict)
+             - correct_ids (List[str])
+             - incorrect_ids (List[str])
+     """
+     category_performance = defaultdict(lambda: {"total": 0, "correct": 0})
+     all_questions = 0
+     all_correct = 0
+     correct_ids = []
+     incorrect_ids = []
+
+     with open(results_file, "r") as f:
+         lines = f.readlines()
+
+     # If max_questions is set, limit the number of lines processed
+     if max_questions is not None:
+         lines = lines[:max_questions]
+
+     for line in tqdm(lines, desc="Analyzing Benchmark Results"):
+         if line.startswith("HTTP Request:"):
+             continue
+
+         try:
+             entry = json.loads(line)
+             metadata = entry.get("input", {}).get("question_data", {}).get("metadata", {})
+             question_id = entry.get("question_id")
+
+             model_letter = extract_answer_letter(entry.get("model_answer"))
+             correct_letter = extract_answer_letter(entry.get("correct_answer"))
+
+             if model_letter and correct_letter:
+                 all_questions += 1
+                 is_correct = model_letter == correct_letter
+
+                 if is_correct:
+                     all_correct += 1
+                     correct_ids.append(question_id)
+                 else:
+                     incorrect_ids.append(question_id)
+
+                 for category in metadata.get("categories", []):
+                     category_performance[category]["total"] += 1
+                     if is_correct:
+                         category_performance[category]["correct"] += 1
+
+         except json.JSONDecodeError:
+             continue
+
+     return process_results(
+         category_performance, all_questions, all_correct, correct_ids, incorrect_ids
+     )
+
+
+ def analyze_chexagent_results(
+     results_file: str, max_questions: Optional[int] = None
+ ) -> Tuple[float, Dict, Dict, List[str], List[str]]:
+     """
+     Analyze results in CheXagent format.
+
+     Args:
+         results_file: Path to results file
+         max_questions: Maximum number of questions to analyze
+
+     Returns:
+         Tuple containing:
+             - overall_accuracy (float)
+             - category_accuracies (Dict)
+             - question_type_stats (Dict)
+             - correct_ids (List[str])
+             - incorrect_ids (List[str])
+     """
+     category_performance = defaultdict(lambda: {"total": 0, "correct": 0})
+     all_questions = 0
+     all_correct = 0
+     correct_ids = []
+     incorrect_ids = []
+
+     with open(results_file, "r") as f:
+         lines = f.readlines()
+
+     # If max_questions is set, limit the number of lines processed
+     if max_questions is not None:
+         lines = lines[:max_questions]
+
+     for line in tqdm(lines, desc="Analyzing Benchmark Results"):
+         try:
+             entry = json.loads(line)
+             metadata = entry.get("input", {}).get("question_data", {}).get("metadata", {})
+             question_id = entry.get("question_id")
+
+             model_letter = extract_answer_letter(entry.get("model_answer"))
+             correct_letter = extract_answer_letter(entry.get("correct_answer"))
+
+             if model_letter and correct_letter:
+                 all_questions += 1
+                 is_correct = model_letter == correct_letter
+
+                 if is_correct:
+                     all_correct += 1
+                     correct_ids.append(question_id)
+                 else:
+                     incorrect_ids.append(question_id)
+
+                 for category in metadata.get("categories", []):
+                     category_performance[category]["total"] += 1
+                     if is_correct:
+                         category_performance[category]["correct"] += 1
237
+
238
+ except json.JSONDecodeError:
239
+ continue
240
+
241
+ return process_results(
242
+ category_performance, all_questions, all_correct, correct_ids, incorrect_ids
243
+ )
244
+
245
+
246
+ def process_results(
247
+ category_performance: Dict,
248
+ all_questions: int,
249
+ all_correct: int,
250
+ correct_ids: Optional[List[str]] = None,
251
+ incorrect_ids: Optional[List[str]] = None,
252
+ ) -> Tuple[float, Dict, Dict, List[str], List[str]]:
253
+ """
254
+ Process raw results into final statistics.
255
+
256
+ Args:
257
+ category_performance: Dict containing performance by category
258
+ all_questions: Total number of questions
259
+ all_correct: Total number of correct answers
260
+ correct_ids: List of IDs for correctly answered questions
261
+ incorrect_ids: List of IDs for incorrectly answered questions
262
+
263
+ Returns:
264
+ Tuple containing:
265
+ - overall_accuracy (float)
266
+ - category_accuracies (Dict)
267
+ - question_type_stats (Dict)
268
+ - correct_ids (List[str])
269
+ - incorrect_ids (List[str])
270
+ """
271
+ category_accuracies = {
272
+ category: {
273
+ "accuracy": stats["correct"] / stats["total"] * 100 if stats["total"] > 0 else 0,
274
+ "total": stats["total"],
275
+ "correct": stats["correct"],
276
+ }
277
+ for category, stats in category_performance.items()
278
+ }
279
+
280
+ question_type_stats = {}
281
+ for qtype, categories in QUESTION_TYPES.items():
282
+ total = sum(
283
+ category_performance[cat]["total"] for cat in categories if cat in category_performance
284
+ )
285
+ correct = sum(
286
+ category_performance[cat]["correct"]
287
+ for cat in categories
288
+ if cat in category_performance
289
+ )
290
+
291
+ question_type_stats[qtype] = {
292
+ "accuracy": (correct / total * 100) if total > 0 else 0,
293
+ "total": total,
294
+ "correct": correct,
295
+ }
296
+
297
+ overall_accuracy = (all_correct / all_questions * 100) if all_questions > 0 else 0
298
+
299
+ return (
300
+ overall_accuracy,
301
+ category_accuracies,
302
+ question_type_stats,
303
+ correct_ids or [],
304
+ incorrect_ids or [],
305
+ )
306
+
307
+
308
+ def print_analysis(
309
+ overall_accuracy: float,
310
+ category_accuracies: Dict,
311
+ question_type_stats: Dict,
312
+ correct_ids: List[str],
313
+ incorrect_ids: List[str],
314
+ model_name: str,
315
+ ) -> None:
316
+ """
317
+ Print analysis results.
318
+
319
+ Args:
320
+ overall_accuracy: Overall accuracy percentage
321
+ category_accuracies: Dict containing accuracy metrics by category
322
+ question_type_stats: Dict containing stats by question type
323
+ correct_ids: List of IDs for correctly answered questions
324
+ incorrect_ids: List of IDs for incorrectly answered questions
325
+ model_name: Name of the model being analyzed
326
+ """
327
+ total_questions = len(correct_ids) + len(incorrect_ids)
328
+ print(
329
+ f"\nOverall Accuracy: {overall_accuracy:.2f}% ({len(correct_ids)} correct out of {total_questions} questions)"
330
+ )
331
+
332
+ print("\nCategory Performance:")
333
+ sorted_categories = sorted(
334
+ category_accuracies.items(), key=lambda x: x[1]["accuracy"], reverse=True
335
+ )
336
+ for category, metrics in sorted_categories:
337
+ print(f"{category}:")
338
+ print(f" Accuracy: {metrics['accuracy']:.2f}%")
339
+ print(f" Total Questions: {metrics['total']}")
340
+ print(f" Correct Questions: {metrics['correct']}")
341
+
342
+ print("\nQuestion Type Performance:")
343
+ sorted_types = sorted(question_type_stats.items(), key=lambda x: x[1]["accuracy"], reverse=True)
344
+ for qtype, metrics in sorted_types:
345
+ print(f"\n{qtype}:")
346
+ print(f" Accuracy: {metrics['accuracy']:.2f}%")
347
+ print(f" Total Questions: {metrics['total']}")
348
+ print(f" Correct Questions: {metrics['correct']}")
349
+ print(f" Categories: {', '.join(QUESTION_TYPES[qtype])}")
350
+
351
+ # Save question IDs to JSON
352
+ question_ids = {"correct_ids": correct_ids, "incorrect_ids": incorrect_ids}
353
+
354
+ output_filename = f"{model_name}_question_ids.json"
355
+ with open(output_filename, "w") as f:
356
+ json.dump(question_ids, f, indent=2)
357
+
358
+ print(f"\nQuestion IDs have been saved to {output_filename}")
359
+
360
+
361
+ if __name__ == "__main__":
362
+ parser = argparse.ArgumentParser(description="Analyze benchmark results")
363
+ parser.add_argument("results_file", help="Path to results file")
364
+ parser.add_argument("benchmark_dir", nargs="?", help="Path to benchmark questions directory")
365
+ parser.add_argument(
366
+ "--model",
367
+ choices=["llava-med", "chexagent", "llama", "gpt4", "medrax"],
368
+ default="gpt4",
369
+ help="Specify model format (default: gpt4)",
370
+ )
371
+ parser.add_argument("--max-questions", type=int, help="Maximum number of questions to analyze")
372
+ args = parser.parse_args()
373
+
374
+ if args.model == "gpt4":
375
+ results = analyze_gpt4_results(args.results_file, args.max_questions)
376
+ elif args.model == "llama":
377
+ results = analyze_llama_results(args.results_file, args.max_questions)
378
+ elif args.model == "chexagent":
379
+ results = analyze_chexagent_results(args.results_file, args.max_questions)
380
+ elif args.model == "medrax":
381
+ results = analyze_gpt4_results(args.results_file, args.max_questions)
382
+ else:
383
+ parser.error(f"Unsupported model: {args.model}")
384
+
385
+ print_analysis(*results, args.model)
experiments/benchmark_chexagent.py ADDED
@@ -0,0 +1,316 @@
1
+ import re
2
+ import json
3
+ import os
4
+ import glob
5
+ import time
6
+ import logging
7
+ from datetime import datetime
8
+ import torch
9
+ from PIL import Image
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+ from tqdm import tqdm
12
+
13
+ # Configure model settings
14
+ MODEL_NAME = "StanfordAIMI/CheXagent-2-3b"
15
+ DTYPE = torch.bfloat16
16
+ DEVICE = "cuda"
17
+
18
+ # Configure logging
19
+ log_filename = f"model_inference_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
20
+ logging.basicConfig(filename=log_filename, level=logging.INFO, format="%(message)s")
21
+
22
+
23
+ def initialize_model() -> tuple[AutoModelForCausalLM, AutoTokenizer]:
24
+ """Initialize the CheXagent model and tokenizer.
25
+
26
+ Returns:
27
+ tuple containing:
28
+ - AutoModelForCausalLM: The initialized CheXagent model
29
+ - AutoTokenizer: The initialized tokenizer
30
+ """
31
+ print("Loading model and tokenizer...")
32
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
33
+ model = AutoModelForCausalLM.from_pretrained(
34
+ MODEL_NAME, device_map="auto", trust_remote_code=True
35
+ )
36
+ model = model.to(DTYPE)
37
+ model.eval()
38
+ return model, tokenizer
39
+
40
+
41
+ def create_inference_request(
42
+ question_data: dict,
43
+ case_details: dict,
44
+ case_id: str,
45
+ question_id: str,
46
+ model: AutoModelForCausalLM,
47
+ tokenizer: AutoTokenizer,
48
+ ) -> str | None:
49
+ """Create and execute an inference request for the CheXagent model.
50
+
51
+ Args:
52
+ question_data: Dictionary containing question details and metadata
53
+ case_details: Dictionary containing case information and image paths
54
+ case_id: Unique identifier for the medical case
55
+ question_id: Unique identifier for the question
56
+ model: The initialized CheXagent model
57
+ tokenizer: The initialized tokenizer
58
+
59
+ Returns:
60
+ str | None: Single letter answer (A-F) if successful, None if failed
61
+ """
62
+ system_prompt = """You are a medical imaging expert. Your task is to provide ONLY a single letter answer.
63
+ Rules:
64
+ 1. Respond with exactly one uppercase letter (A/B/C/D/E/F)
65
+ 2. Do not add periods, explanations, or any other text
66
+ 3. Do not use markdown or formatting
67
+ 4. Do not restate the question
68
+ 5. Do not explain your reasoning
69
+
70
+ Examples of valid responses:
71
+ A
72
+ B
73
+ C
74
+
75
+ Examples of invalid responses:
76
+ "A."
77
+ "Answer: B"
78
+ "C) This shows..."
79
+ "The answer is D"
80
+ """
81
+
82
+ prompt = f"""Given the following medical case:
83
+ Please answer this multiple choice question:
84
+ {question_data['question']}
85
+ Base your answer only on the provided images and case information."""
86
+
87
+ # Parse required figures
88
+ try:
89
+ if isinstance(question_data["figures"], str):
90
+ try:
91
+ required_figures = json.loads(question_data["figures"])
92
+ except json.JSONDecodeError:
93
+ required_figures = [question_data["figures"]]
94
+ elif isinstance(question_data["figures"], list):
95
+ required_figures = question_data["figures"]
96
+ else:
97
+ required_figures = [str(question_data["figures"])]
98
+ except Exception as e:
99
+ print(f"Error parsing figures: {e}")
100
+ required_figures = []
101
+
102
+ required_figures = [
103
+ fig if fig.startswith("Figure ") else f"Figure {fig}" for fig in required_figures
104
+ ]
105
+
106
+ # Get image paths
107
+ image_paths = []
108
+ for figure in required_figures:
109
+ base_figure_num = "".join(filter(str.isdigit, figure))
110
+ figure_letter = "".join(filter(str.isalpha, figure.split()[-1])) or None
111
+
112
+ matching_figures = [
113
+ case_figure
114
+ for case_figure in case_details.get("figures", [])
115
+ if case_figure["number"] == f"Figure {base_figure_num}"
116
+ ]
117
+
118
+ for case_figure in matching_figures:
119
+ subfigures = []
120
+ if figure_letter:
121
+ subfigures = [
122
+ subfig
123
+ for subfig in case_figure.get("subfigures", [])
124
+ if subfig.get("number", "").lower().endswith(figure_letter.lower())
125
+ or subfig.get("label", "").lower() == figure_letter.lower()
126
+ ]
127
+ else:
128
+ subfigures = case_figure.get("subfigures", [])
129
+
130
+ for subfig in subfigures:
131
+ if "local_path" in subfig:
132
+ image_paths.append("medrax/data/" + subfig["local_path"])
133
+
134
+ if not image_paths:
135
+ print(f"No local images found for case {case_id}, question {question_id}")
136
+ return None
137
+
138
+ try:
139
+ start_time = time.time()
140
+
141
+ # Prepare input for the model
142
+ query = tokenizer.from_list_format(
143
+ [*[{"image": path} for path in image_paths], {"text": prompt}]
144
+ )
145
+ conv = [{"from": "system", "value": system_prompt}, {"from": "human", "value": query}]
146
+ input_ids = tokenizer.apply_chat_template(
147
+ conv, add_generation_prompt=True, return_tensors="pt"
148
+ )
149
+
150
+ # Generate response
151
+ with torch.no_grad():
152
+ output = model.generate(
153
+ input_ids.to(DEVICE),
154
+ do_sample=False,
155
+ num_beams=1,
156
+ temperature=1.0,
157
+ top_p=1.0,
158
+ use_cache=True,
159
+ max_new_tokens=512,
160
+ )[0]
161
+
162
+ response = tokenizer.decode(output[input_ids.size(1) : -1])
163
+ duration = time.time() - start_time
164
+
165
+ # Clean response
166
+ clean_answer = validate_answer(response)
167
+
168
+ # Log response
169
+ log_entry = {
170
+ "case_id": case_id,
171
+ "question_id": question_id,
172
+ "timestamp": datetime.now().isoformat(),
173
+ "model": MODEL_NAME,
174
+ "duration": round(duration, 2),
175
+ "model_answer": clean_answer,
176
+ "correct_answer": question_data["answer"],
177
+ "input": {
178
+ "question_data": {
179
+ "question": question_data["question"],
180
+ "explanation": question_data["explanation"],
181
+ "metadata": question_data.get("metadata", {}),
182
+ "figures": question_data["figures"],
183
+ },
184
+ "image_paths": image_paths,
185
+ },
186
+ }
187
+ logging.info(json.dumps(log_entry))
188
+ return clean_answer
189
+
190
+ except Exception as e:
191
+ print(f"Error processing case {case_id}, question {question_id}: {str(e)}")
192
+ log_entry = {
193
+ "case_id": case_id,
194
+ "question_id": question_id,
195
+ "timestamp": datetime.now().isoformat(),
196
+ "model": MODEL_NAME,
197
+ "status": "error",
198
+ "error": str(e),
199
+ "input": {
200
+ "question_data": {
201
+ "question": question_data["question"],
202
+ "explanation": question_data["explanation"],
203
+ "metadata": question_data.get("metadata", {}),
204
+ "figures": question_data["figures"],
205
+ },
206
+ "image_paths": image_paths,
207
+ },
208
+ }
209
+ logging.info(json.dumps(log_entry))
210
+ return None
211
+
212
+
213
+ def validate_answer(response_text: str) -> str | None:
214
+ """Enforce strict single-letter response format.
215
+
216
+ Args:
217
+ response_text: Raw response text from the model
218
+
219
+ Returns:
220
+ str | None: Single uppercase letter (A-F) if valid, None if invalid
221
+ """
222
+ if not response_text:
223
+ return None
224
+
225
+ # Remove all whitespace and convert to uppercase
226
+ cleaned = response_text.strip().upper()
227
+
228
+ # Check if it's exactly one valid letter
229
+ if len(cleaned) == 1 and cleaned in "ABCDEF":
230
+ return cleaned
231
+
232
+ # If not, try to extract just the letter
233
+ match = re.search(r"([A-F])", cleaned)
234
+ return match.group(1) if match else None
235
+
236
+
237
+ def load_benchmark_questions(case_id: str) -> list[str]:
238
+ """Find all question files for a given case ID.
239
+
240
+ Args:
241
+ case_id: Unique identifier for the medical case
242
+
243
+ Returns:
244
+ list[str]: List of paths to question JSON files
245
+ """
246
+ benchmark_dir = "../benchmark/questions"
247
+ return glob.glob(f"{benchmark_dir}/{case_id}/{case_id}_*.json")
248
+
249
+
250
+ def count_total_questions() -> tuple[int, int]:
251
+ """Count total number of cases and questions in benchmark.
252
+
253
+ Returns:
254
+ tuple containing:
255
+ - int: Total number of cases
256
+ - int: Total number of questions
257
+ """
258
+ total_cases = len(glob.glob("../benchmark/questions/*"))
259
+ total_questions = sum(
260
+ len(glob.glob(f"../benchmark/questions/{case_id}/*.json"))
261
+ for case_id in os.listdir("../benchmark/questions")
262
+ )
263
+ return total_cases, total_questions
264
+
265
+
266
+ def main():
267
+ # Load the cases with local paths
268
+ with open("medrax/data/updated_cases.json", "r") as file:
269
+ data = json.load(file)
270
+
271
+ # Initialize model and tokenizer
272
+ model, tokenizer = initialize_model()
273
+
274
+ total_cases, total_questions = count_total_questions()
275
+ cases_processed = 0
276
+ questions_processed = 0
277
+ skipped_questions = 0
278
+
279
+ print(f"\nBeginning inference with {MODEL_NAME}")
280
+ print(f"Found {total_cases} cases with {total_questions} total questions")
281
+
282
+ # Process each case with progress bar
283
+ for case_id, case_details in tqdm(data.items(), desc="Processing cases"):
284
+ question_files = load_benchmark_questions(case_id)
285
+ if not question_files:
286
+ continue
287
+
288
+ cases_processed += 1
289
+ for question_file in tqdm(
290
+ question_files, desc=f"Processing questions for case {case_id}", leave=False
291
+ ):
292
+ with open(question_file, "r") as file:
293
+ question_data = json.load(file)
294
+ question_id = os.path.basename(question_file).split(".")[0]
295
+
296
+ questions_processed += 1
297
+ answer = create_inference_request(
298
+ question_data, case_details, case_id, question_id, model, tokenizer
299
+ )
300
+
301
+ if answer is None:
302
+ skipped_questions += 1
303
+ continue
304
+
305
+ print(f"\nCase {case_id}, Question {question_id}")
306
+ print(f"Model Answer: {answer}")
307
+ print(f"Correct Answer: {question_data['answer']}")
308
+
309
+ print(f"\nInference Summary:")
310
+ print(f"Total Cases Processed: {cases_processed}")
311
+ print(f"Total Questions Processed: {questions_processed}")
312
+ print(f"Total Questions Skipped: {skipped_questions}")
313
+
314
+
315
+ if __name__ == "__main__":
316
+ main()
experiments/benchmark_gpt4o.py ADDED
@@ -0,0 +1,331 @@
1
+ import json
2
+ import openai
3
+ import os
4
+ import glob
5
+ import time
6
+ import logging
7
+ from datetime import datetime
8
+ from tenacity import retry, wait_exponential, stop_after_attempt
9
+
10
+ model_name = "chatgpt-4o-latest"
11
+ temperature = 0.2
12
+ log_filename = f"api_usage_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
13
+ logging.basicConfig(filename=log_filename, level=logging.INFO, format="%(message)s")
14
+
15
+
16
+ def calculate_cost(
17
+ prompt_tokens: int, completion_tokens: int, model: str = "chatgpt-4o-latest"
18
+ ) -> float:
19
+ """Calculate the cost of API usage based on token counts.
20
+
21
+ Args:
22
+ prompt_tokens: Number of tokens in the prompt
23
+ completion_tokens: Number of tokens in the completion
24
+ model: Model name to use for pricing, defaults to chatgpt-4o-latest
25
+
26
+ Returns:
27
+ float: Cost in USD
28
+ """
29
+ pricing = {"chatgpt-4o-latest": {"prompt": 5.0, "completion": 15.0}}
30
+ rates = pricing.get(model, {"prompt": 5.0, "completion": 15.0})
31
+ return (prompt_tokens * rates["prompt"] + completion_tokens * rates["completion"]) / 1000000
32
+
33
+
34
+ @retry(wait=wait_exponential(multiplier=1, min=4, max=10), stop=stop_after_attempt(3))
35
+ def create_multimodal_request(
36
+ question_data: dict, case_details: dict, case_id: str, question_id: str, client: openai.OpenAI
37
+ ) -> openai.types.chat.ChatCompletion:
38
+ """Create and send a multimodal request to the OpenAI API.
39
+
40
+ Args:
41
+ question_data: Dictionary containing question details and figures
42
+ case_details: Dictionary containing case information and figures
43
+ case_id: Identifier for the medical case
44
+ question_id: Identifier for the specific question
45
+ client: OpenAI client instance
46
+
47
+ Returns:
48
+ openai.types.chat.ChatCompletion: API response object, or None if request fails
49
+ """
50
+ prompt = f"""Given the following medical case:
51
+ Please answer this multiple choice question:
52
+ {question_data['question']}
53
+ Base your answer only on the provided images and case information."""
54
+
55
+ content = [{"type": "text", "text": prompt}]
56
+
57
+ # Parse required figures
58
+ try:
59
+ # Try multiple ways of parsing figures
60
+ if isinstance(question_data["figures"], str):
61
+ try:
62
+ required_figures = json.loads(question_data["figures"])
63
+ except json.JSONDecodeError:
64
+ required_figures = [question_data["figures"]]
65
+ elif isinstance(question_data["figures"], list):
66
+ required_figures = question_data["figures"]
67
+ else:
68
+ required_figures = [str(question_data["figures"])]
69
+ except Exception as e:
70
+ print(f"Error parsing figures: {e}")
71
+ required_figures = []
72
+
73
+ # Ensure each figure starts with "Figure "
74
+ required_figures = [
75
+ fig if fig.startswith("Figure ") else f"Figure {fig}" for fig in required_figures
76
+ ]
77
+
78
+ subfigures = []
79
+ for figure in required_figures:
80
+ # Handle both regular figures and those with letter suffixes
81
+ base_figure_num = "".join(filter(str.isdigit, figure))
82
+ figure_letter = "".join(filter(str.isalpha, figure.split()[-1])) or None
83
+
84
+ # Find matching figures in case details
85
+ matching_figures = [
86
+ case_figure
87
+ for case_figure in case_details.get("figures", [])
88
+ if case_figure["number"] == f"Figure {base_figure_num}"
89
+ ]
90
+
91
+ if not matching_figures:
92
+ print(f"No matching figure found for {figure} in case {case_id}")
93
+ continue
94
+
95
+ for case_figure in matching_figures:
96
+ # If a specific letter is specified, filter subfigures
97
+ if figure_letter:
98
+ matching_subfigures = [
99
+ subfig
100
+ for subfig in case_figure.get("subfigures", [])
101
+ if subfig.get("number", "").lower().endswith(figure_letter.lower())
102
+ or subfig.get("label", "").lower() == figure_letter.lower()
103
+ ]
104
+ subfigures.extend(matching_subfigures)
105
+ else:
106
+ # If no letter specified, add all subfigures
107
+ subfigures.extend(case_figure.get("subfigures", []))
108
+
109
+ # Add images to content
110
+ for subfig in subfigures:
111
+ if "url" in subfig:
112
+ content.append({"type": "image_url", "image_url": {"url": subfig["url"]}})
113
+ else:
114
+ print(f"Subfigure missing URL: {subfig}")
115
+
116
+ # If no images were found, fall through to the logged skip check below so the
117
+ # missing-image case is recorded in the log file before returning None.
118
+
119
+
120
+
121
+ messages = [
122
+ {
123
+ "role": "system",
124
+ "content": "You are a medical imaging expert. Provide only the letter corresponding to your answer choice (A/B/C/D/E/F).",
125
+ },
126
+ {"role": "user", "content": content},
127
+ ]
128
+
129
+ if len(content) == 1: # Only the text prompt exists
130
+ print(f"No images found for case {case_id}, question {question_id}")
131
+ log_entry = {
132
+ "case_id": case_id,
133
+ "question_id": question_id,
134
+ "timestamp": datetime.now().isoformat(),
135
+ "model": model_name,
136
+ "temperature": temperature,
137
+ "status": "skipped",
138
+ "reason": "no_images",
139
+ "cost": 0,
140
+ "input": {
141
+ "messages": messages,
142
+ "question_data": {
143
+ "question": question_data["question"],
144
+ "explanation": question_data["explanation"],
145
+ "metadata": question_data.get("metadata", {}),
146
+ "figures": question_data["figures"],
147
+ },
148
+ "image_urls": [subfig["url"] for subfig in subfigures if "url" in subfig],
149
+ "image_captions": [subfig.get("caption", "") for subfig in subfigures],
150
+ },
151
+ }
152
+ logging.info(json.dumps(log_entry))
153
+ return None
154
+
155
+ try:
156
+ start_time = time.time()
157
+
158
+ response = client.chat.completions.create(
159
+ model=model_name, messages=messages, max_tokens=50, temperature=temperature
160
+ )
161
+ duration = time.time() - start_time
162
+
163
+ log_entry = {
164
+ "case_id": case_id,
165
+ "question_id": question_id,
166
+ "timestamp": datetime.now().isoformat(),
167
+ "model": model_name,
168
+ "temperature": temperature,
169
+ "duration": round(duration, 2),
170
+ "usage": {
171
+ "prompt_tokens": response.usage.prompt_tokens,
172
+ "completion_tokens": response.usage.completion_tokens,
173
+ "total_tokens": response.usage.total_tokens,
174
+ },
175
+ "cost": calculate_cost(response.usage.prompt_tokens, response.usage.completion_tokens),
176
+ "model_answer": response.choices[0].message.content,
177
+ "correct_answer": question_data["answer"],
178
+ "input": {
179
+ "messages": messages,
180
+ "question_data": {
181
+ "question": question_data["question"],
182
+ "explanation": question_data["explanation"],
183
+ "metadata": question_data.get("metadata", {}),
184
+ "figures": question_data["figures"],
185
+ },
186
+ "image_urls": [subfig["url"] for subfig in subfigures if "url" in subfig],
187
+ "image_captions": [subfig.get("caption", "") for subfig in subfigures],
188
+ },
189
+ }
190
+ logging.info(json.dumps(log_entry))
191
+ return response
192
+
193
+ except openai.RateLimitError:
194
+ log_entry = {
195
+ "case_id": case_id,
196
+ "question_id": question_id,
197
+ "timestamp": datetime.now().isoformat(),
198
+ "model": model_name,
199
+ "temperature": temperature,
200
+ "status": "error",
201
+ "reason": "rate_limit",
202
+ "cost": 0,
203
+ "input": {
204
+ "messages": messages,
205
+ "question_data": {
206
+ "question": question_data["question"],
207
+ "explanation": question_data["explanation"],
208
+ "metadata": question_data.get("metadata", {}),
209
+ "figures": question_data["figures"],
210
+ },
211
+ "image_urls": [subfig["url"] for subfig in subfigures if "url" in subfig],
212
+ "image_captions": [subfig.get("caption", "") for subfig in subfigures],
213
+ },
214
+ }
215
+ logging.info(json.dumps(log_entry))
216
+ print(
217
+ f"\nRate limit hit for case {case_id}, question {question_id}. Waiting 20s...",
218
+ flush=True,
219
+ )
220
+ time.sleep(20)
221
+ raise
222
+ except Exception as e:
223
+ log_entry = {
224
+ "case_id": case_id,
225
+ "question_id": question_id,
226
+ "timestamp": datetime.now().isoformat(),
227
+ "model": model_name,
228
+ "temperature": temperature,
229
+ "status": "error",
230
+ "error": str(e),
231
+ "cost": 0,
232
+ "input": {
233
+ "messages": messages,
234
+ "question_data": {
235
+ "question": question_data["question"],
236
+ "explanation": question_data["explanation"],
237
+ "metadata": question_data.get("metadata", {}),
238
+ "figures": question_data["figures"],
239
+ },
240
+ "image_urls": [subfig["url"] for subfig in subfigures if "url" in subfig],
241
+ "image_captions": [subfig.get("caption", "") for subfig in subfigures],
242
+ },
243
+ }
244
+ logging.info(json.dumps(log_entry))
245
+ print(f"Error processing case {case_id}, question {question_id}: {str(e)}")
246
+ raise
247
+
248
+
249
+ def load_benchmark_questions(case_id: str) -> list:
250
+ """Load benchmark questions for a given case.
251
+
252
+ Args:
253
+ case_id: Identifier for the medical case
254
+
255
+ Returns:
256
+ list: List of paths to question files
257
+ """
258
+ benchmark_dir = "../benchmark/questions"
259
+ return glob.glob(f"{benchmark_dir}/{case_id}/{case_id}_*.json")
260
+
261
+
262
+ def count_total_questions() -> tuple[int, int]:
263
+ """Count total number of cases and questions in benchmark.
264
+
265
+ Returns:
266
+ tuple: (total_cases, total_questions)
267
+ """
268
+ total_cases = len(glob.glob("../benchmark/questions/*"))
269
+ total_questions = sum(
270
+ len(glob.glob(f"../benchmark/questions/{case_id}/*.json"))
271
+ for case_id in os.listdir("../benchmark/questions")
272
+ )
273
+ return total_cases, total_questions
274
+
275
+
276
+ def main() -> None:
277
+ """Main function to run the benchmark evaluation."""
278
+ with open("../data/eurorad_metadata.json", "r") as file:
279
+ data = json.load(file)
280
+
281
+ api_key = os.getenv("OPENAI_API_KEY")
282
+ if not api_key:
283
+ raise ValueError("OPENAI_API_KEY environment variable is not set.")
284
+ global client
285
+ client = openai.OpenAI(api_key=api_key)
286
+
287
+ total_cases, total_questions = count_total_questions()
288
+ cases_processed = 0
289
+ questions_processed = 0
290
+ skipped_questions = 0
291
+
292
+ print(f"Beginning benchmark evaluation for model {model_name} with temperature {temperature}")
293
+
294
+ for case_id, case_details in data.items():
295
+ question_files = load_benchmark_questions(case_id)
296
+ if not question_files:
297
+ continue
298
+
299
+ cases_processed += 1
300
+ for question_file in question_files:
301
+ with open(question_file, "r") as file:
302
+ question_data = json.load(file)
303
+ question_id = os.path.basename(question_file).split(".")[0]
304
+
305
+ questions_processed += 1
306
+ response = create_multimodal_request(
307
+ question_data, case_details, case_id, question_id, client
308
+ )
309
+
310
+ # Handle cases where response is None
311
+ if response is None:
312
+ skipped_questions += 1
313
+ print(f"Skipped question: Case ID {case_id}, Question ID {question_id}")
314
+ continue
315
+
316
+ print(
317
+ f"Progress: Case {cases_processed}/{total_cases}, Question {questions_processed}/{total_questions}"
318
+ )
319
+ print(f"Case ID: {case_id}")
320
+ print(f"Question ID: {question_id}")
321
+ print(f"Model Answer: {response.choices[0].message.content}")
322
+ print(f"Correct Answer: {question_data['answer']}\n")
323
+
324
+ print(f"\nBenchmark Summary:")
325
+ print(f"Total Cases Processed: {cases_processed}")
326
+ print(f"Total Questions Processed: {questions_processed}")
327
+ print(f"Total Questions Skipped: {skipped_questions}")
328
+
329
+
330
+ if __name__ == "__main__":
331
+ main()
experiments/benchmark_llama.py ADDED
@@ -0,0 +1,443 @@
1
+ from typing import Dict, List, Optional, Any, Union, Tuple
2
+ import re
3
+ import json
4
+ import os
5
+ import glob
6
+ import time
7
+ import logging
8
+ import socket
9
+ import requests
10
+ import httpx
11
+ import backoff
12
+ from datetime import datetime
13
+ from tenacity import retry, wait_exponential, stop_after_attempt
14
+ from openai import OpenAI
15
+
16
+ # Configure model settings
17
+ MODEL_NAME = "meta-llama/llama-3.2-90b-vision-instruct"
18
+ temperature = 0.2
19
+
20
+ # Configure logging
21
+ log_filename = f"api_usage_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
22
+ logging.basicConfig(filename=log_filename, level=logging.INFO, format="%(message)s")
23
+
24
+
25
+ def verify_dns() -> bool:
26
+ """Verify DNS resolution and connectivity.
27
+
28
+ Returns:
29
+ bool: True if DNS resolution succeeds, False otherwise
30
+ """
31
+ try:
32
+ # Try to resolve openrouter.ai
33
+ socket.gethostbyname("openrouter.ai")
34
+ return True
35
+ except socket.gaierror:
36
+ print("DNS resolution failed. Trying to use Google DNS (8.8.8.8)...")
37
+ # Modify resolv.conf to use Google DNS
38
+ try:
39
+ with open("/etc/resolv.conf", "w") as f:
40
+ f.write("nameserver 8.8.8.8\n")
41
+ return True
42
+ except Exception as e:
43
+ print(f"Failed to update DNS settings: {e}")
44
+ return False
45
+
46
+
47
+ def verify_connection() -> bool:
48
+ """Verify connection to OpenRouter API.
49
+
50
+ Returns:
51
+ bool: True if connection succeeds, False otherwise
52
+ """
53
+ try:
54
+ response = requests.get("https://openrouter.ai/api/v1/status", timeout=10)
55
+ return response.status_code == 200
56
+ except Exception as e:
57
+ print(f"Connection test failed: {e}")
58
+ return False
59
+
60
+
61
+ def initialize_client() -> OpenAI:
62
+ """Initialize the OpenRouter client with proper timeout settings and connection verification.
63
+
64
+ Returns:
65
+ OpenAI: Configured OpenAI client for OpenRouter
66
+
67
+ Raises:
68
+ ValueError: If OPENROUTER_API_KEY environment variable is not set
69
+ ConnectionError: If DNS verification or connection test fails
70
+ """
71
+ api_key = os.getenv("OPENROUTER_API_KEY")
72
+ if not api_key:
73
+ raise ValueError("OPENROUTER_API_KEY environment variable is not set.")
74
+
75
+ # Configure timeout settings for the client
76
+ timeout_settings = 120 # Increased timeout for large images/responses
77
+
78
+ # Verify DNS and connection
79
+ if not verify_dns():
80
+ raise ConnectionError("DNS verification failed. Please check your network settings.")
81
+
82
+ if not verify_connection():
83
+ raise ConnectionError(
84
+ "Cannot connect to OpenRouter. Please check your internet connection."
85
+ )
86
+
87
+ # Set up client with retry and timeout settings
88
+ return OpenAI(
89
+ base_url="https://openrouter.ai/api/v1",
90
+ api_key=api_key,
91
+ timeout=timeout_settings,
92
+ http_client=httpx.Client(
93
+ timeout=timeout_settings, transport=httpx.HTTPTransport(retries=3)
94
+ ),
95
+ )
96
+
97
+
98
+ @backoff.on_exception(
99
+ backoff.expo,
100
+ (ConnectionError, TimeoutError, socket.gaierror, httpx.ConnectError),
101
+ max_tries=5,
102
+ max_time=300, # Maximum total time to try in seconds
103
+ )
104
+ def create_multimodal_request(
105
+ question_data: Dict[str, Any],
106
+ case_details: Dict[str, Any],
107
+ case_id: str,
108
+ question_id: str,
109
+ client: OpenAI,
110
+ ) -> Optional[Any]:
111
+ """Create and send a multimodal request to the model.
112
+
113
+ Args:
114
+ question_data: Dictionary containing question details
115
+ case_details: Dictionary containing case information
116
+ case_id: ID of the medical case
117
+ question_id: ID of the specific question
118
+ client: OpenAI client instance
119
+
120
+ Returns:
121
+ Optional[Any]: Model response if successful, None if skipped
122
+
123
+ Raises:
124
+ ConnectionError: If connection fails
125
+ TimeoutError: If request times out
126
+ Exception: For other errors
127
+ """
128
+
129
+ system_prompt = """You are a medical imaging expert. Your task is to provide ONLY a single letter answer.
130
+ Rules:
131
+ 1. Respond with exactly one uppercase letter (A/B/C/D/E/F)
132
+ 2. Do not add periods, explanations, or any other text
133
+ 3. Do not use markdown or formatting
134
+ 4. Do not restate the question
135
+ 5. Do not explain your reasoning
136
+
137
+ Examples of valid responses:
138
+ A
139
+ B
140
+ C
141
+
142
+ Examples of invalid responses:
143
+ "A."
144
+ "Answer: B"
145
+ "C) This shows..."
146
+ "The answer is D"
147
+ """
148
+
149
+ prompt = f"""Given the following medical case:
150
+ Please answer this multiple choice question:
151
+ {question_data['question']}
152
+ Base your answer only on the provided images and case information."""
153
+
154
+ # Parse required figures
155
+ try:
156
+ if isinstance(question_data["figures"], str):
157
+ try:
158
+ required_figures = json.loads(question_data["figures"])
159
+ except json.JSONDecodeError:
160
+ required_figures = [question_data["figures"]]
161
+ elif isinstance(question_data["figures"], list):
162
+ required_figures = question_data["figures"]
163
+ else:
164
+ required_figures = [str(question_data["figures"])]
165
+ except Exception as e:
166
+ print(f"Error parsing figures: {e}")
167
+ required_figures = []
168
+
169
+ required_figures = [
170
+ fig if fig.startswith("Figure ") else f"Figure {fig}" for fig in required_figures
171
+ ]
172
+
173
+ # Process subfigures and prepare content
174
+ content = [{"type": "text", "text": prompt}]
175
+ image_urls = []
176
+ image_captions = []
177
+
178
+ for figure in required_figures:
179
+ base_figure_num = "".join(filter(str.isdigit, figure))
180
+ figure_letter = "".join(filter(str.isalpha, figure.split()[-1])) or None
181
+
182
+ matching_figures = [
183
+ case_figure
184
+ for case_figure in case_details.get("figures", [])
185
+ if case_figure["number"] == f"Figure {base_figure_num}"
186
+ ]
187
+
188
+ for case_figure in matching_figures:
189
+ subfigures = []
190
+ if figure_letter:
191
+ subfigures = [
192
+ subfig
193
+ for subfig in case_figure.get("subfigures", [])
194
+ if subfig.get("number", "").lower().endswith(figure_letter.lower())
195
+ or subfig.get("label", "").lower() == figure_letter.lower()
196
+ ]
197
+ else:
198
+ subfigures = case_figure.get("subfigures", [])
199
+
200
+ for subfig in subfigures:
201
+ if "url" in subfig:
202
+ content.append({"type": "image_url", "image_url": {"url": subfig["url"]}})
203
+ image_urls.append(subfig["url"])
204
+ image_captions.append(subfig.get("caption", ""))
205
+
206
+ if len(content) == 1: # Only the text prompt exists
207
+ print(f"No images found for case {case_id}, question {question_id}")
208
+ # Log the skipped question
209
+ log_entry = {
210
+ "case_id": case_id,
211
+ "question_id": question_id,
212
+ "timestamp": datetime.now().isoformat(),
213
+ "model": MODEL_NAME,
214
+ "status": "skipped",
215
+ "reason": "no_images",
216
+ "input": {
217
+ "question_data": {
218
+ "question": question_data["question"],
219
+ "explanation": question_data["explanation"],
220
+ "metadata": question_data.get("metadata", {}),
221
+ "figures": question_data["figures"],
222
+ },
223
+ "image_urls": image_urls,
224
+ },
225
+ }
226
+ logging.info(json.dumps(log_entry))
227
+ return None
228
+
229
+ try:
230
+ start_time = time.time()
231
+
232
+ response = client.chat.completions.create(
233
+ model=MODEL_NAME,
234
+ temperature=temperature,
235
+ messages=[
236
+ {"role": "system", "content": system_prompt},
237
+ {"role": "user", "content": content},
238
+ ],
239
+ )
240
+ duration = time.time() - start_time
241
+
242
+ # Get raw response
243
+ raw_answer = response.choices[0].message.content
244
+
245
+ # Validate and clean
246
+ clean_answer = validate_answer(raw_answer)
247
+
248
+ if not clean_answer:
249
+ print(f"Warning: Invalid response format for case {case_id}, question {question_id}")
250
+ print(f"Raw response: {raw_answer}")
251
+
252
+ # Update response object with cleaned answer
253
+ response.choices[0].message.content = clean_answer
254
+
255
+ # Log response
256
+ log_entry = {
257
+ "case_id": case_id,
258
+ "question_id": question_id,
259
+ "timestamp": datetime.now().isoformat(),
260
+ "model": MODEL_NAME,
261
+ "temperature": temperature,
262
+ "duration": round(duration, 2),
263
+ "usage": {
264
+ "prompt_tokens": response.usage.prompt_tokens,
265
+ "completion_tokens": response.usage.completion_tokens,
266
+ "total_tokens": response.usage.total_tokens,
267
+ },
268
+ "model_answer": response.choices[0].message.content,
269
+ "correct_answer": question_data["answer"],
270
+ "input": {
271
+ "question_data": {
272
+ "question": question_data["question"],
273
+ "explanation": question_data["explanation"],
274
+ "metadata": question_data.get("metadata", {}),
275
+ "figures": question_data["figures"],
276
+ },
277
+ "image_urls": image_urls,
278
+ },
279
+ }
280
+ logging.info(json.dumps(log_entry))
281
+ return response
282
+
283
+ except ConnectionError as e:
284
+ print(f"Connection error for case {case_id}, question {question_id}: {str(e)}")
285
+ print("Retrying after a longer delay...")
286
+ time.sleep(30) # Add a longer delay before retry
287
+ raise
288
+ except TimeoutError as e:
289
+ print(f"Timeout error for case {case_id}, question {question_id}: {str(e)}")
290
+ print("Retrying with increased timeout...")
291
+ raise
292
+ except Exception as e:
293
+ # Log failed requests too
294
+ log_entry = {
295
+ "case_id": case_id,
296
+ "question_id": question_id,
297
+ "timestamp": datetime.now().isoformat(),
298
+ "model": MODEL_NAME,
299
+ "temperature": temperature,
300
+ "status": "error",
301
+ "error": str(e),
302
+ "input": {
303
+ "question_data": {
304
+ "question": question_data["question"],
305
+ "explanation": question_data["explanation"],
306
+ "metadata": question_data.get("metadata", {}),
307
+ "figures": question_data["figures"],
308
+ },
309
+ "image_urls": image_urls,
310
+ },
311
+ }
312
+ logging.info(json.dumps(log_entry))
313
+ raise
314
+
315
+
316
+ def extract_answer(response_text: str) -> Optional[str]:
317
+ """Extract single letter answer from model response.
318
+
319
+ Args:
320
+ response_text: Raw text response from model
321
+
322
+ Returns:
323
+ Optional[str]: Single letter answer if found, None otherwise
324
+ """
325
+ # Convert to uppercase and remove periods
326
+ text = response_text.upper().replace(".", "")
327
+
328
+ # Look for common patterns
329
+ patterns = [
330
+ r"ANSWER:\s*([A-F])", # Matches "ANSWER: X"
331
+ r"OPTION\s*([A-F])", # Matches "OPTION X"
332
+ r"([A-F])\)", # Matches "X)"
333
+ r"\b([A-F])\b", # Matches single letter
334
+ ]
335
+
336
+ for pattern in patterns:
337
+ matches = re.findall(pattern, text)
338
+ if matches:
339
+ return matches[0]
340
+
341
+ return None
342
+
343
+
344
+ def validate_answer(response_text: str) -> Optional[str]:
345
+ """Enforce strict single-letter response format.
346
+
347
+ Args:
348
+ response_text: Raw text response from model
349
+
350
+ Returns:
351
+ Optional[str]: Valid single letter answer if found, None otherwise
352
+ """
353
+ if not response_text:
354
+ return None
355
+
356
+ # Remove all whitespace and convert to uppercase
357
+ cleaned = response_text.strip().upper()
358
+
359
+ # Check if it's exactly one valid letter
360
+ if len(cleaned) == 1 and cleaned in "ABCDEF":
361
+ return cleaned
362
+
363
+ # If not, try to extract just the letter
364
+ match = re.search(r"([A-F])", cleaned)
365
+ return match.group(1) if match else None
366
+
367
+
368
+ def load_benchmark_questions(case_id: str) -> List[str]:
369
+ """Find all question files for a given case ID.
370
+
371
+ Args:
372
+ case_id: ID of the medical case
373
+
374
+ Returns:
375
+ List[str]: List of paths to question files
376
+ """
377
+ benchmark_dir = "../benchmark/questions"
378
+ return glob.glob(f"{benchmark_dir}/{case_id}/{case_id}_*.json")
379
+
380
+
381
+ def count_total_questions() -> Tuple[int, int]:
382
+ """Count total number of cases and questions.
383
+
384
+ Returns:
385
+ Tuple[int, int]: (total_cases, total_questions)
386
+ """
387
+ total_cases = len(glob.glob("../benchmark/questions/*"))
388
+ total_questions = sum(
389
+ len(glob.glob(f"../benchmark/questions/{case_id}/*.json"))
390
+ for case_id in os.listdir("../benchmark/questions")
391
+ )
392
+ return total_cases, total_questions
393
+
394
+
395
+ def main():
396
+ with open("../data/eurorad_metadata.json", "r") as file:
397
+ data = json.load(file)
398
+
399
+ client = initialize_client()
400
+ total_cases, total_questions = count_total_questions()
401
+ cases_processed = 0
402
+ questions_processed = 0
403
+ skipped_questions = 0
404
+
405
+ print(f"Beginning benchmark evaluation for {MODEL_NAME} with temperature {temperature}")
406
+
407
+ for case_id, case_details in data.items():
408
+ question_files = load_benchmark_questions(case_id)
409
+ if not question_files:
410
+ continue
411
+
412
+ cases_processed += 1
413
+ for question_file in question_files:
414
+ with open(question_file, "r") as file:
415
+ question_data = json.load(file)
416
+ question_id = os.path.basename(question_file).split(".")[0]
417
+
418
+ questions_processed += 1
419
+ response = create_multimodal_request(
420
+ question_data, case_details, case_id, question_id, client
421
+ )
422
+
423
+ if response is None:
424
+ skipped_questions += 1
425
+ print(f"Skipped question: Case ID {case_id}, Question ID {question_id}")
426
+ continue
427
+
428
+ print(
429
+ f"Progress: Case {cases_processed}/{total_cases}, Question {questions_processed}/{total_questions}"
430
+ )
431
+ print(f"Case ID: {case_id}")
432
+ print(f"Question ID: {question_id}")
433
+ print(f"Model Answer: {response.choices[0].message.content}")
434
+ print(f"Correct Answer: {question_data['answer']}\n")
435
+
436
+ print(f"\nBenchmark Summary:")
437
+ print(f"Total Cases Processed: {cases_processed}")
438
+ print(f"Total Questions Processed: {questions_processed}")
439
+ print(f"Total Questions Skipped: {skipped_questions}")
440
+
441
+
442
+ if __name__ == "__main__":
443
+ main()
experiments/benchmark_llavamed.py ADDED
@@ -0,0 +1,541 @@
1
+ import argparse
2
+ import json
3
+ import requests
4
+ import base64
5
+ from PIL import Image
6
+ from io import BytesIO
7
+ from llava.conversation import conv_templates
8
+ import time
9
+ import os
10
+ import glob
11
+ import logging
12
+ from datetime import datetime
13
+ from tqdm import tqdm
14
+ import re
15
+ from typing import Dict, List, Optional, Union, Any, Tuple
16
+
17
+
18
+ def process_image(image_path: str, target_size: int = 640) -> Image.Image:
19
+ """Process and resize an image to match model requirements.
20
+
21
+ Args:
22
+ image_path: Path to the input image file
23
+ target_size: Target size for both width and height in pixels
24
+
25
+ Returns:
26
+ PIL.Image: Processed and padded image with dimensions (target_size, target_size)
27
+ """
28
+ image = Image.open(image_path)
29
+ if image.mode != "RGB":
30
+ image = image.convert("RGB")
31
+
32
+ # Calculate scaling to maintain aspect ratio
33
+ ratio = min(target_size / image.width, target_size / image.height)
34
+ new_size = (int(image.width * ratio), int(image.height * ratio))
35
+
36
+ # Resize image
37
+ image = image.resize(new_size, Image.LANCZOS)
38
+
39
+ # Create new image with padding
40
+ new_image = Image.new("RGB", (target_size, target_size), (0, 0, 0))
41
+ # Paste resized image in center
42
+ offset = ((target_size - new_size[0]) // 2, (target_size - new_size[1]) // 2)
43
+ new_image.paste(image, offset)
44
+
45
+ return new_image
46
+
47
+
48
+ def validate_answer(response_text: str) -> Optional[str]:
49
+ """Extract and validate a single-letter response from the model's output.
50
+ Handles multiple response formats and edge cases.
51
+
52
+ Args:
53
+ response_text: The full text output from the model
54
+
55
+ Returns:
56
+ A single letter answer (A-F) or None if no valid answer found
57
+ """
58
+ if not response_text:
59
+ return None
60
+
61
+ # Clean the response text
62
+ cleaned = response_text.strip()
63
+
64
+ # Comprehensive set of patterns to extract the answer
65
+ extraction_patterns = [
66
+ # Strict format with explicit letter answer
67
+ r"(?:THE\s*)?(?:SINGLE\s*)?LETTER\s*(?:ANSWER\s*)?(?:IS:?)\s*([A-F])\b",
68
+ # Patterns for extracting from longer descriptions
69
+ r"(?:correct\s+)?(?:answer|option)\s*(?:is\s*)?([A-F])\b",
70
+ r"\b(?:answer|option)\s*([A-F])[):]\s*",
71
+ # Patterns for extracting from descriptive sentences
72
+ r"(?:most\s+likely\s+)?(?:answer|option)\s*(?:is\s*)?([A-F])\b",
73
+ r"suggest[s]?\s+(?:that\s+)?(?:the\s+)?(?:answer\s+)?(?:is\s*)?([A-F])\b",
74
+ # Patterns with contextual words
75
+ r"characteriz[e]?d?\s+by\s+([A-F])\b",
76
+ r"indicat[e]?s?\s+([A-F])\b",
77
+ # Fallback to "Option X" or "Letter X" formats
78
+ r"Option\s*([A-F])\b",
79
+ r"\b([A-F])\)\s*",
80
+ # Fallback to standalone letter
81
+ r"^\s*([A-F])\s*$",
82
+ ]
83
+
84
+ # Try each pattern
85
+ for pattern in extraction_patterns:
86
+ matches = re.findall(pattern, cleaned, re.IGNORECASE)
87
+ for match in matches:
88
+ # Ensure match is a single valid letter
89
+ if isinstance(match, tuple):
90
+ match = match[0] if match[0] in "ABCDEF" else None
91
+ if match and match.upper() in "ABCDEF":
92
+ return match.upper()
93
+
94
+ # Final fallback: look for standalone letters in context
95
+ context_matches = re.findall(r"\b([A-F])\b", cleaned.upper())
96
+ context_letters = [m for m in context_matches if m in "ABCDEF"]
97
+ if context_letters:
98
+ return context_letters[0]
99
+
100
+ # No valid answer found
101
+ return None
102
+
103
+
104
+ def load_benchmark_questions(case_id: str) -> List[str]:
105
+ """Find all question files for a given case ID.
106
+
107
+ Args:
108
+ case_id: The ID of the medical case
109
+
110
+ Returns:
111
+ List of paths to question JSON files
112
+ """
113
+ benchmark_dir = "MedMAX/benchmark/questions"
114
+ return glob.glob(f"{benchmark_dir}/{case_id}/{case_id}_*.json")
115
+
116
+
117
+ def count_total_questions() -> Tuple[int, int]:
118
+ """Count total number of cases and questions in benchmark.
119
+
120
+ Returns:
121
+ Tuple containing (total_cases, total_questions)
122
+ """
123
+ total_cases = len(glob.glob("MedMAX/benchmark/questions/*"))
124
+ total_questions = sum(
125
+ len(glob.glob(f"MedMAX/benchmark/questions/{case_id}/*.json"))
126
+ for case_id in os.listdir("MedMAX/benchmark/questions")
127
+ )
128
+ return total_cases, total_questions
129
+
130
+
131
+ def create_inference_request(
132
+ question_data: Dict[str, Any],
133
+ case_details: Dict[str, Any],
134
+ case_id: str,
135
+ question_id: str,
136
+ worker_addr: str,
137
+ model_name: str,
138
+ raw_output: bool = False,
139
+ ) -> Union[Tuple[Optional[str], Optional[float]], Dict[str, Any]]:
140
+ """Create and send inference request to worker.
141
+
142
+ Args:
143
+ question_data: Dictionary containing question details and figures
144
+ case_details: Dictionary containing case information and figures
145
+ case_id: Identifier for the medical case
146
+ question_id: Identifier for the specific question
147
+ worker_addr: Address of the worker endpoint
148
+ model_name: Name of the model to use
149
+ raw_output: Whether to return raw model output
150
+
151
+ Returns:
152
+ If raw_output is False: Tuple of (validated_answer, duration)
153
+ If raw_output is True: Dictionary with full inference details
154
+ """
155
+ system_prompt = """You are a medical imaging expert. Your answer MUST be a SINGLE LETTER (A/B/C/D/E/F), provided in this format: 'The SINGLE LETTER answer is: X'.
156
+ """
157
+
158
+ prompt = f"""Given the following medical case:
159
+ Please answer this multiple choice question:
160
+ {question_data['question']}
161
+ Base your answer only on the provided images and case information. Respond with your SINGLE LETTER answer: """
162
+
163
+ try:
164
+ # Parse required figures
165
+ if isinstance(question_data["figures"], str):
166
+ try:
167
+ required_figures = json.loads(question_data["figures"])
168
+ except json.JSONDecodeError:
169
+ required_figures = [question_data["figures"]]
170
+ elif isinstance(question_data["figures"], list):
171
+ required_figures = question_data["figures"]
172
+ else:
173
+ required_figures = [str(question_data["figures"])]
174
+ except Exception as e:
175
+ print(f"Error parsing figures: {e}")
176
+ required_figures = []
177
+
178
+ required_figures = [
179
+ fig if fig.startswith("Figure ") else f"Figure {fig}" for fig in required_figures
180
+ ]
181
+
182
+ # Get image paths
183
+ image_paths = []
184
+ for figure in required_figures:
185
+ base_figure_num = "".join(filter(str.isdigit, figure))
186
+ figure_letter = "".join(filter(str.isalpha, figure.split()[-1])) or None
187
+
188
+ matching_figures = [
189
+ case_figure
190
+ for case_figure in case_details.get("figures", [])
191
+ if case_figure["number"] == f"Figure {base_figure_num}"
192
+ ]
193
+
194
+ for case_figure in matching_figures:
195
+ subfigures = []
196
+ if figure_letter:
197
+ subfigures = [
198
+ subfig
199
+ for subfig in case_figure.get("subfigures", [])
200
+ if subfig.get("number", "").lower().endswith(figure_letter.lower())
201
+ or subfig.get("label", "").lower() == figure_letter.lower()
202
+ ]
203
+ else:
204
+ subfigures = case_figure.get("subfigures", [])
205
+
206
+ for subfig in subfigures:
207
+ if "local_path" in subfig:
208
+ image_paths.append("MedMAX/data/" + subfig["local_path"])
209
+
210
+ if not image_paths:
211
+ print(f"No local images found for case {case_id}, question {question_id}")
212
+ return "skipped", 0.0 # Return a special 'skipped' marker
213
+
214
+ try:
215
+ start_time = time.time()
216
+
217
+ # Process each image
218
+ processed_images = [process_image(path) for path in image_paths]
219
+
220
+ # Create conversation
221
+ conv = conv_templates["mistral_instruct"].copy()
222
+
223
+ # Add image and message
224
+ if "<image>" not in prompt:
225
+ text = prompt + "\n<image>"
226
+ else:
227
+ text = prompt
228
+
229
+ message = (text, processed_images[0], "Default") # Currently handling first image
230
+ conv.append_message(conv.roles[0], message)
231
+ conv.append_message(conv.roles[1], None)
232
+
233
+ prompt = conv.get_prompt()
234
+ headers = {"User-Agent": "LLaVA-Med Client"}
235
+ pload = {
236
+ "model": model_name,
237
+ "prompt": prompt,
238
+ "max_new_tokens": 150, # Reduce this since we only need one letter
239
+ "temperature": 0.5, # Lower temperature for more focused responses
240
+ "stop": conv.sep2,
241
+ "images": conv.get_images(),
242
+ "top_p": 1, # Lower top_p for more focused sampling
243
+ "frequency_penalty": 0.0,
244
+ "presence_penalty": 0.0,
245
+ }
246
+
247
+ max_retries = 3
248
+ retry_delay = 5
249
+ response_text = None
250
+
251
+ for attempt in range(max_retries):
252
+ try:
253
+ response = requests.post(
254
+ worker_addr + "/worker_generate_stream",
255
+ headers=headers,
256
+ json=pload,
257
+ stream=True,
258
+ timeout=30,
259
+ )
260
+
261
+ complete_output = ""
262
+ for chunk in response.iter_lines(
263
+ chunk_size=8192, decode_unicode=False, delimiter=b"\0"
264
+ ):
265
+ if chunk:
266
+ data = json.loads(chunk.decode("utf-8"))
267
+ if data["error_code"] == 0:
268
+ output = data["text"].split("[/INST]")[-1]
269
+ complete_output = output
270
+ else:
271
+ print(f"\nError: {data['text']} (error_code: {data['error_code']})")
272
+ if attempt < max_retries - 1:
273
+ time.sleep(retry_delay)
274
+ break
275
+ return None, None
276
+
277
+ if complete_output:
278
+ response_text = complete_output
279
+ break
280
+
281
+ except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
282
+ if attempt < max_retries - 1:
283
+ print(f"\nNetwork error: {str(e)}. Retrying in {retry_delay} seconds...")
284
+ time.sleep(retry_delay)
285
+ else:
286
+ print(f"\nFailed after {max_retries} attempts: {str(e)}")
287
+ return None, None
288
+
289
+ duration = time.time() - start_time
290
+
291
+ if raw_output:
292
+ inference_details = {
293
+ "raw_output": response_text,
294
+ "validated_answer": validate_answer(response_text),
295
+ "duration": duration,
296
+ "prompt": prompt,
297
+ "system_prompt": system_prompt,
298
+ "image_paths": image_paths,
299
+ "payload": pload,
300
+ }
301
+ return inference_details
302
+
303
+ return validate_answer(response_text), duration
304
+
305
+ except Exception as e:
306
+ print(f"Error in inference request: {str(e)}")
307
+ return None, None
308
+
309
+
310
+ def clean_payload(payload: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
311
+ """Remove image-related and large data from the payload to keep the log lean.
312
+
313
+ Args:
314
+ payload: Original request payload dictionary
315
+
316
+ Returns:
317
+ Cleaned payload dictionary with large data removed
318
+ """
319
+ if not payload:
320
+ return None
321
+
322
+ # Create a copy of the payload to avoid modifying the original
323
+ cleaned_payload = payload.copy()
324
+
325
+ # Remove large or sensitive data
326
+ if "images" in cleaned_payload:
327
+ del cleaned_payload["images"]
328
+
329
+ return cleaned_payload
330
+
331
+
332
+ def main():
333
+ parser = argparse.ArgumentParser()
334
+ parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
335
+ parser.add_argument("--worker-address", type=str)
336
+ parser.add_argument("--model-name", type=str, default="llava-med-v1.5-mistral-7b")
337
+ parser.add_argument("--output-dir", type=str, default="benchmark_results")
338
+ parser.add_argument(
339
+ "--raw-output", action="store_true", help="Return raw model output without validation"
340
+ )
341
+ parser.add_argument(
342
+ "--num-cases",
343
+ type=int,
344
+ help="Number of cases to process if looking at raw outputs",
345
+ default=2,
346
+ )
347
+ args = parser.parse_args()
348
+
349
+ # Setup output directory
350
+ os.makedirs(args.output_dir, exist_ok=True)
351
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
352
+
353
+ # Setup live logging files
354
+ live_log_filename = os.path.join(args.output_dir, f"live_benchmark_log_{timestamp}.json")
355
+ final_results_filename = os.path.join(args.output_dir, f"final_results_{timestamp}.json")
356
+
357
+ # Initialize live log file
358
+ with open(live_log_filename, "w") as live_log_file:
359
+ live_log_file.write("[\n") # Start of JSON array
360
+
361
+ # Setup logging
362
+ logging.basicConfig(
363
+ filename=os.path.join(args.output_dir, f"benchmark_{timestamp}.log"),
364
+ level=logging.INFO,
365
+ format="%(message)s",
366
+ )
367
+
368
+ # Get worker address
369
+ if args.worker_address:
370
+ worker_addr = args.worker_address
371
+ else:
372
+ try:
373
+ requests.post(args.controller_address + "/refresh_all_workers")
374
+ ret = requests.post(args.controller_address + "/list_models")
375
+ models = ret.json()["models"]
376
+ ret = requests.post(
377
+ args.controller_address + "/get_worker_address", json={"model": args.model_name}
378
+ )
379
+ worker_addr = ret.json()["address"]
380
+ print(f"Worker address: {worker_addr}")
381
+ except requests.exceptions.RequestException as e:
382
+ print(f"Failed to connect to controller: {e}")
383
+ return
384
+
385
+ if worker_addr == "":
386
+ print("No available worker")
387
+ return
388
+
389
+ # Load cases with local paths
390
+ with open("MedMAX/data/updated_cases.json", "r") as file:
391
+ data = json.load(file)
392
+
393
+ total_cases, total_questions = count_total_questions()
394
+ print(f"\nStarting benchmark with {args.model_name}")
395
+ print(f"Found {total_cases} cases with {total_questions} total questions")
396
+
397
+ results = {
398
+ "model": args.model_name,
399
+ "timestamp": datetime.now().isoformat(),
400
+ "total_cases": total_cases,
401
+ "total_questions": total_questions,
402
+ "results": [],
403
+ }
404
+
405
+ cases_processed = 0
406
+ questions_processed = 0
407
+ correct_answers = 0
408
+ skipped_questions = 0
409
+ total_processed_entries = 0
410
+
411
+ # Process each case
412
+ for case_id, case_details in tqdm(data.items(), desc="Processing cases"):
413
+ question_files = load_benchmark_questions(case_id)
414
+ if not question_files:
415
+ continue
416
+
417
+ cases_processed += 1
418
+ for question_file in tqdm(
419
+ question_files, desc=f"Processing questions for case {case_id}", leave=False
420
+ ):
421
+ with open(question_file, "r") as file:
422
+ question_data = json.load(file)
423
+ question_id = os.path.basename(question_file).split(".")[0]
424
+
425
+ questions_processed += 1
426
+
427
+ # Get model's answer
428
+ inference_result = create_inference_request(
429
+ question_data,
430
+ case_details,
431
+ case_id,
432
+ question_id,
433
+ worker_addr,
434
+ args.model_name,
435
+ raw_output=True, # Always use raw output for detailed logging
436
+ )
437
+
438
+ # Handle skipped questions
439
+ if inference_result == ("skipped", 0.0):
440
+ skipped_questions += 1
441
+ print(f"\nCase {case_id}, Question {question_id}: Skipped (No images)")
442
+
443
+ # Log skipped question
444
+ skipped_entry = {
445
+ "case_id": case_id,
446
+ "question_id": question_id,
447
+ "status": "skipped",
448
+ "reason": "No images found",
449
+ }
450
+ with open(live_log_filename, "a") as live_log_file:
451
+ json.dump(skipped_entry, live_log_file, indent=2)
452
+ live_log_file.write(",\n") # Add comma for next entry
453
+
454
+ continue
455
+
456
+ # Extract information
457
+ answer = inference_result["validated_answer"]
458
+ duration = inference_result["duration"]
459
+
460
+ # Prepare detailed logging entry
461
+ log_entry = {
462
+ "case_id": case_id,
463
+ "question_id": question_id,
464
+ "question": question_data["question"],
465
+ "correct_answer": question_data["answer"],
466
+ "raw_output": inference_result["raw_output"],
467
+ "validated_answer": answer,
468
+ "model_answer": answer,
469
+ "is_correct": answer == question_data["answer"] if answer else False,
470
+ "duration": duration,
471
+ "system_prompt": inference_result["system_prompt"],
472
+ "input_prompt": inference_result["prompt"],
473
+ "image_paths": inference_result["image_paths"],
474
+ "payload": clean_payload(inference_result["payload"]),
475
+ }
476
+
477
+ # Write to live log file
478
+ with open(live_log_filename, "a") as live_log_file:
479
+ json.dump(log_entry, live_log_file, indent=2)
480
+ live_log_file.write(",\n") # Add comma for next entry
481
+
482
+ # Print to console
483
+ print(f"\nCase {case_id}, Question {question_id}")
484
+ print(f"Model Answer: {answer}")
485
+ print(f"Correct Answer: {question_data['answer']}")
486
+ print(f"Time taken: {duration:.2f}s")
487
+
488
+ # Track correct answers
489
+ if answer == question_data["answer"]:
490
+ correct_answers += 1
491
+
492
+ # Append to results
493
+ results["results"].append(log_entry)
494
+ total_processed_entries += 1
495
+
496
+ # Optional: break if reached specified number of cases
497
+ if args.raw_output and cases_processed == args.num_cases:
498
+ break
499
+
500
+ # Optional: break if reached specified number of cases
501
+ if args.raw_output and cases_processed == args.num_cases:
502
+ break
503
+
504
+ # Close live log file
505
+ with open(live_log_filename, "rb+") as live_log_file:
506
+ # Overwrite the trailing ",\n" with "\n]" to close the JSON array ("a" mode ignores seek for writes)
507
+ live_log_file.seek(-2, os.SEEK_END)
508
+ live_log_file.write(b"\n]")
509
+
510
+ # Calculate final statistics
511
+ results["summary"] = {
512
+ "cases_processed": cases_processed,
513
+ "questions_processed": questions_processed,
514
+ "total_processed_entries": total_processed_entries,
515
+ "correct_answers": correct_answers,
516
+ "skipped_questions": skipped_questions,
517
+ "accuracy": (
518
+ correct_answers / (questions_processed - skipped_questions)
519
+ if (questions_processed - skipped_questions) > 0
520
+ else 0
521
+ ),
522
+ }
523
+
524
+ # Save final results
525
+ with open(final_results_filename, "w") as f:
526
+ json.dump(results, f, indent=2)
527
+
528
+ print(f"\nBenchmark Summary:")
529
+ print(f"Total Cases Processed: {cases_processed}")
530
+ print(f"Total Questions Processed: {questions_processed}")
531
+ print(f"Total Processed Entries: {total_processed_entries}")
532
+ print(f"Correct Answers: {correct_answers}")
533
+ print(f"Skipped Questions: {skipped_questions}")
534
+ print(f"Accuracy: {(correct_answers / (questions_processed - skipped_questions) * 100):.2f}%")
535
+ print(f"\nResults saved to {args.output_dir}")
536
+ print(f"Live log: {live_log_filename}")
537
+ print(f"Final results: {final_results_filename}")
538
+
539
+
540
+ if __name__ == "__main__":
541
+ main()
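
The live log written by main() grows as a JSON array, so a finished run can be sanity-checked offline. A minimal sketch (the filename is a placeholder for the path the script prints at the end):

    import json

    with open("benchmark_results/live_benchmark_log_20250101_000000.json") as f:
        entries = json.load(f)

    # Skipped questions carry a "status" field and no "is_correct" flag
    answered = [e for e in entries if e.get("status") != "skipped"]
    correct = sum(1 for e in answered if e.get("is_correct"))
    print(f"{correct}/{len(answered)} correct "
          f"({100 * correct / max(len(answered), 1):.2f}%)")
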
experiments/benchmark_medrax.ipynb ADDED
@@ -0,0 +1,374 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import operator\n",
10
+ "import warnings\n",
11
+ "from typing import *\n",
12
+ "import traceback\n",
13
+ "\n",
14
+ "import os\n",
15
+ "import torch\n",
16
+ "from dotenv import load_dotenv\n",
17
+ "from IPython.display import Image\n",
18
+ "from langgraph.checkpoint.memory import MemorySaver\n",
19
+ "from langgraph.graph import END, StateGraph\n",
20
+ "from langchain_core.messages import AnyMessage, HumanMessage, SystemMessage, ToolMessage\n",
21
+ "from langchain_openai import ChatOpenAI\n",
22
+ "from transformers import logging\n",
23
+ "import matplotlib.pyplot as plt\n",
24
+ "import numpy as np\n",
25
+ "import re\n",
26
+ "\n",
27
+ "from medrax.agent import *\n",
28
+ "from medrax.tools import *\n",
29
+ "from medrax.utils import *\n",
30
+ "\n",
31
+ "import json\n",
32
+ "import openai\n",
33
+ "import os\n",
34
+ "import glob\n",
35
+ "import time\n",
36
+ "import logging\n",
37
+ "from datetime import datetime\n",
38
+ "from tenacity import retry, wait_exponential, stop_after_attempt\n",
39
+ "\n",
40
+ "warnings.filterwarnings(\"ignore\")\n",
41
+ "_ = load_dotenv()\n",
42
+ "\n",
43
+ "\n",
44
+ "# Setup directory paths\n",
45
+ "ROOT = \"set this directory to where MedRAX is, e.g. /home/MedRAX\"\n",
46
+ "PROMPT_FILE = f\"{ROOT}/medrax/docs/system_prompts.txt\"\n",
47
+ "BENCHMARK_FILE = f\"{ROOT}/benchmark/questions\"\n",
48
+ "MODEL_DIR = f\"set this to where the tool models are, e.g /home/models\"\n",
49
+ "FIGURES_DIR = f\"{ROOT}/benchmark/figures\"\n",
50
+ "\n",
51
+ "model_name = \"medrax\"\n",
52
+ "temperature = 0.2\n",
53
+ "medrax_logs = f\"{ROOT}/experiments/medrax_logs\"\n",
54
+ "log_filename = f\"{medrax_logs}/{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json\"\n",
55
+ "logging.basicConfig(filename=log_filename, level=logging.INFO, format=\"%(message)s\", force=True)\n",
56
+ "device = \"cuda\""
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 2,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "def get_tools():\n",
66
+ " report_tool = ChestXRayReportGeneratorTool(cache_dir=MODEL_DIR, device=device)\n",
67
+ " xray_classification_tool = ChestXRayClassifierTool(device=device)\n",
68
+ " segmentation_tool = ChestXRaySegmentationTool(device=device)\n",
69
+ " grounding_tool = XRayPhraseGroundingTool(\n",
70
+ " cache_dir=MODEL_DIR, temp_dir=\"temp\", device=device, load_in_8bit=True\n",
71
+ " )\n",
72
+ " xray_vqa_tool = XRayVQATool(cache_dir=MODEL_DIR, device=device)\n",
73
+ " llava_med_tool = LlavaMedTool(cache_dir=MODEL_DIR, device=device, load_in_8bit=True)\n",
74
+ "\n",
75
+ " return [\n",
76
+ " report_tool,\n",
77
+ " xray_classification_tool,\n",
78
+ " segmentation_tool,\n",
79
+ " grounding_tool,\n",
80
+ " xray_vqa_tool,\n",
81
+ " llava_med_tool,\n",
82
+ " ]\n",
83
+ "\n",
84
+ "\n",
85
+ "def get_agent(tools):\n",
86
+ " prompts = load_prompts_from_file(PROMPT_FILE)\n",
87
+ " prompt = prompts[\"MEDICAL_ASSISTANT\"]\n",
88
+ "\n",
89
+ " checkpointer = MemorySaver()\n",
90
+ " model = ChatOpenAI(model=\"gpt-4o\", temperature=temperature, top_p=0.95)\n",
91
+ " agent = Agent(\n",
92
+ " model,\n",
93
+ " tools=tools,\n",
94
+ " log_tools=True,\n",
95
+ " log_dir=\"logs\",\n",
96
+ " system_prompt=prompt,\n",
97
+ " checkpointer=checkpointer,\n",
98
+ " )\n",
99
+ " thread = {\"configurable\": {\"thread_id\": \"1\"}}\n",
100
+ " return agent, thread\n",
101
+ "\n",
102
+ "\n",
103
+ "def run_medrax(agent, thread, prompt, image_urls=[]):\n",
104
+ " messages = [\n",
105
+ " HumanMessage(\n",
106
+ " content=[\n",
107
+ " {\"type\": \"text\", \"text\": prompt},\n",
108
+ " ]\n",
109
+ " + [{\"type\": \"image_url\", \"image_url\": {\"url\": image_url}} for image_url in image_urls]\n",
110
+ " )\n",
111
+ " ]\n",
112
+ "\n",
113
+ " final_response = None\n",
114
+ " for event in agent.workflow.stream({\"messages\": messages}, thread):\n",
115
+ " for v in event.values():\n",
116
+ " final_response = v\n",
117
+ "\n",
118
+ " final_response = final_response[\"messages\"][-1].content.strip()\n",
119
+ " agent_state = agent.workflow.get_state(thread)\n",
120
+ "\n",
121
+ " return final_response, str(agent_state)"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 3,
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "def create_multimodal_request(question_data, case_details, case_id, question_id, agent, thread):\n",
131
+ " # Parse required figures\n",
132
+ " try:\n",
133
+ " # Try multiple ways of parsing figures\n",
134
+ " if isinstance(question_data[\"figures\"], str):\n",
135
+ " try:\n",
136
+ " required_figures = json.loads(question_data[\"figures\"])\n",
137
+ " except json.JSONDecodeError:\n",
138
+ " required_figures = [question_data[\"figures\"]]\n",
139
+ " elif isinstance(question_data[\"figures\"], list):\n",
140
+ " required_figures = question_data[\"figures\"]\n",
141
+ " else:\n",
142
+ " required_figures = [str(question_data[\"figures\"])]\n",
143
+ " except Exception as e:\n",
144
+ " print(f\"Error parsing figures: {e}\")\n",
145
+ " required_figures = []\n",
146
+ "\n",
147
+ " # Ensure each figure starts with \"Figure \"\n",
148
+ " required_figures = [\n",
149
+ " fig if fig.startswith(\"Figure \") else f\"Figure {fig}\" for fig in required_figures\n",
150
+ " ]\n",
151
+ "\n",
152
+ " subfigures = []\n",
153
+ " for figure in required_figures:\n",
154
+ " # Handle both regular figures and those with letter suffixes\n",
155
+ " base_figure_num = \"\".join(filter(str.isdigit, figure))\n",
156
+ " figure_letter = \"\".join(filter(str.isalpha, figure.split()[-1])) or None\n",
157
+ "\n",
158
+ " # Find matching figures in case details\n",
159
+ " matching_figures = [\n",
160
+ " case_figure\n",
161
+ " for case_figure in case_details.get(\"figures\", [])\n",
162
+ " if case_figure[\"number\"] == f\"Figure {base_figure_num}\"\n",
163
+ " ]\n",
164
+ "\n",
165
+ " if not matching_figures:\n",
166
+ " print(f\"No matching figure found for {figure} in case {case_id}\")\n",
167
+ " continue\n",
168
+ "\n",
169
+ " for case_figure in matching_figures:\n",
170
+ " # If a specific letter is specified, filter subfigures\n",
171
+ " if figure_letter:\n",
172
+ " matching_subfigures = [\n",
173
+ " subfig\n",
174
+ " for subfig in case_figure.get(\"subfigures\", [])\n",
175
+ " if subfig.get(\"number\", \"\").lower().endswith(figure_letter.lower())\n",
176
+ " or subfig.get(\"label\", \"\").lower() == figure_letter.lower()\n",
177
+ " ]\n",
178
+ " subfigures.extend(matching_subfigures)\n",
179
+ " else:\n",
180
+ " # If no letter specified, add all subfigures\n",
181
+ " subfigures.extend(case_figure.get(\"subfigures\", []))\n",
182
+ "\n",
183
+ " # Add images to content\n",
184
+ " figure_prompt = \"\"\n",
185
+ " image_urls = []\n",
186
+ "\n",
187
+ " for subfig in subfigures:\n",
188
+ " if \"number\" in subfig:\n",
189
+ " subfig_number = subfig[\"number\"].lower().strip().replace(\" \", \"_\") + \".jpg\"\n",
190
+ " subfig_path = os.path.join(FIGURES_DIR, case_id, subfig_number)\n",
191
+ " figure_prompt += f\"{subfig_number} located at {subfig_path}\\n\"\n",
192
+ " if \"url\" in subfig:\n",
193
+ " image_urls.append(subfig[\"url\"])\n",
194
+ " else:\n",
195
+ " print(f\"Subfigure missing URL: {subfig}\")\n",
196
+ "\n",
197
+ " prompt = (\n",
198
+ " f\"Answer this question correctly using chain of thought reasoning and \"\n",
199
+ " \"carefully evaluating choices. Solve using your own vision and reasoning and then \"\n",
200
+ " \"use tools to complement your reasoning. Trust your own judgement over any tools.\\n\"\n",
201
+ " f\"{question_data['question']}\\n{figure_prompt}\"\n",
202
+ " )\n",
203
+ "\n",
204
+ " try:\n",
205
+ " start_time = time.time()\n",
206
+ "\n",
207
+ " final_response, agent_state = run_medrax(\n",
208
+ " agent=agent, thread=thread, prompt=prompt, image_urls=image_urls\n",
209
+ " )\n",
210
+ " model_answer, agent_state = run_medrax(\n",
211
+ " agent=agent,\n",
212
+ " thread=thread,\n",
213
+ " prompt=\"If you had to choose the best option, only respond with the letter of choice (only one of A, B, C, D, E, F)\",\n",
214
+ " )\n",
215
+ " duration = time.time() - start_time\n",
216
+ "\n",
217
+ " log_entry = {\n",
218
+ " \"case_id\": case_id,\n",
219
+ " \"question_id\": question_id,\n",
220
+ " \"timestamp\": datetime.now().isoformat(),\n",
221
+ " \"model\": model_name,\n",
222
+ " \"temperature\": temperature,\n",
223
+ " \"duration\": round(duration, 2),\n",
224
+ " \"usage\": \"\",\n",
225
+ " \"cost\": 0,\n",
226
+ " \"raw_response\": final_response,\n",
227
+ " \"model_answer\": model_answer.strip(),\n",
228
+ " \"correct_answer\": question_data[\"answer\"][0],\n",
229
+ " \"input\": {\n",
230
+ " \"messages\": prompt,\n",
231
+ " \"question_data\": {\n",
232
+ " \"question\": question_data[\"question\"],\n",
233
+ " \"explanation\": question_data[\"explanation\"],\n",
234
+ " \"metadata\": question_data.get(\"metadata\", {}),\n",
235
+ " \"figures\": question_data[\"figures\"],\n",
236
+ " },\n",
237
+ " \"image_urls\": [subfig[\"url\"] for subfig in subfigures if \"url\" in subfig],\n",
238
+ " \"image_captions\": [subfig.get(\"caption\", \"\") for subfig in subfigures],\n",
239
+ " },\n",
240
+ " \"agent_state\": agent_state,\n",
241
+ " }\n",
242
+ " logging.info(json.dumps(log_entry))\n",
243
+ " return final_response, model_answer.strip()\n",
244
+ "\n",
245
+ " except Exception as e:\n",
246
+ " log_entry = {\n",
247
+ " \"case_id\": case_id,\n",
248
+ " \"question_id\": question_id,\n",
249
+ " \"timestamp\": datetime.now().isoformat(),\n",
250
+ " \"model\": model_name,\n",
251
+ " \"temperature\": temperature,\n",
252
+ " \"status\": \"error\",\n",
253
+ " \"error\": str(e),\n",
254
+ " \"cost\": 0,\n",
255
+ " \"input\": {\n",
256
+ " \"messages\": prompt,\n",
257
+ " \"question_data\": {\n",
258
+ " \"question\": question_data[\"question\"],\n",
259
+ " \"explanation\": question_data[\"explanation\"],\n",
260
+ " \"metadata\": question_data.get(\"metadata\", {}),\n",
261
+ " \"figures\": question_data[\"figures\"],\n",
262
+ " },\n",
263
+ " \"image_urls\": [subfig[\"url\"] for subfig in subfigures if \"url\" in subfig],\n",
264
+ " \"image_captions\": [subfig.get(\"caption\", \"\") for subfig in subfigures],\n",
265
+ " },\n",
266
+ " }\n",
267
+ " logging.info(json.dumps(log_entry))\n",
268
+ " print(f\"Error processing case {case_id}, question {question_id}: {str(e)}\")\n",
269
+ " return \"\", \"\"\n",
270
+ "\n",
271
+ "\n",
272
+ "def load_benchmark_questions(case_id):\n",
273
+ " benchmark_dir = \"../benchmark/questions\"\n",
274
+ " return glob.glob(f\"{benchmark_dir}/{case_id}/{case_id}_*.json\")\n",
275
+ "\n",
276
+ "\n",
277
+ "def count_total_questions():\n",
278
+ " total_cases = len(glob.glob(\"../benchmark/questions/*\"))\n",
279
+ " total_questions = sum(\n",
280
+ " len(glob.glob(f\"../benchmark/questions/{case_id}/*.json\"))\n",
281
+ " for case_id in os.listdir(\"../benchmark/questions\")\n",
282
+ " )\n",
283
+ " return total_cases, total_questions\n",
284
+ "\n",
285
+ "\n",
286
+ "def main(tools):\n",
287
+ " with open(\"../data/eurorad_metadata.json\", \"r\") as file:\n",
288
+ " data = json.load(file)\n",
289
+ "\n",
290
+ " total_cases, total_questions = count_total_questions()\n",
291
+ " cases_processed = 0\n",
292
+ " questions_processed = 0\n",
293
+ " skipped_questions = 0\n",
294
+ "\n",
295
+ " print(f\"Beginning benchmark evaluation for model {model_name} with temperature {temperature}\\n\")\n",
296
+ "\n",
297
+ " for case_id, case_details in data.items():\n",
298
+ " if int(case_details[\"case_id\"]) <= 17158:\n",
299
+ " continue\n",
300
+ "\n",
301
+ " print(f\"----------------------------------------------------------------\")\n",
302
+ " agent, thread = get_agent(tools)\n",
303
+ "\n",
304
+ " question_files = load_benchmark_questions(case_id)\n",
305
+ " if not question_files:\n",
306
+ " continue\n",
307
+ "\n",
308
+ " cases_processed += 1\n",
309
+ " for question_file in question_files:\n",
310
+ " with open(question_file, \"r\") as file:\n",
311
+ " question_data = json.load(file)\n",
312
+ " question_id = os.path.basename(question_file).split(\".\")[0]\n",
313
+ "\n",
314
+ " # agent, thread = get_agent(tools)\n",
315
+ " questions_processed += 1\n",
316
+ " final_response, model_answer = create_multimodal_request(\n",
317
+ " question_data, case_details, case_id, question_id, agent, thread\n",
318
+ " )\n",
319
+ "\n",
320
+ " # Handle failed requests (create_multimodal_request returns \"\" on error)\n",
321
+ " if not final_response:\n",
322
+ " skipped_questions += 1\n",
323
+ " print(f\"Skipped question: Case ID {case_id}, Question ID {question_id}\")\n",
324
+ " continue\n",
325
+ "\n",
326
+ " print(\n",
327
+ " f\"Progress: Case {cases_processed}/{total_cases}, Question {questions_processed}/{total_questions}\"\n",
328
+ " )\n",
329
+ " print(f\"Case ID: {case_id}\")\n",
330
+ " print(f\"Question ID: {question_id}\")\n",
331
+ " print(f\"Final Response: {final_response}\")\n",
332
+ " print(f\"Model Answer: {model_answer}\")\n",
333
+ " print(f\"Correct Answer: {question_data['answer']}\")\n",
334
+ " print(f\"----------------------------------------------------------------\\n\")\n",
335
+ "\n",
336
+ " print(f\"\\nBenchmark Summary:\")\n",
337
+ " print(f\"Total Cases Processed: {cases_processed}\")\n",
338
+ " print(f\"Total Questions Processed: {questions_processed}\")\n",
339
+ " print(f\"Total Questions Skipped: {skipped_questions}\")"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "tools = get_tools()\n",
349
+ "main(tools)"
350
+ ]
351
+ }
352
+ ],
353
+ "metadata": {
354
+ "kernelspec": {
355
+ "display_name": "medmax",
356
+ "language": "python",
357
+ "name": "python3"
358
+ },
359
+ "language_info": {
360
+ "codemirror_mode": {
361
+ "name": "ipython",
362
+ "version": 3
363
+ },
364
+ "file_extension": ".py",
365
+ "mimetype": "text/x-python",
366
+ "name": "python",
367
+ "nbconvert_exporter": "python",
368
+ "pygments_lexer": "ipython3",
369
+ "version": "3.10.16"
370
+ }
371
+ },
372
+ "nbformat": 4,
373
+ "nbformat_minor": 2
374
+ }
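
The notebook prompts MedRAX twice: once for a chain-of-thought answer and once more for the bare letter. If that second reply still carries extra words, a small normalizer along these lines (a hypothetical helper, mirroring the regex fallback used in experiments/compare_runs.py) could be applied to model_answer before logging:

    import re

    def to_choice(answer: str) -> str:
        """Coerce a free-text reply into a single A-F choice letter, or '' if none is found."""
        answer = answer.strip().upper()
        if len(answer) == 1 and answer in "ABCDEF":
            return answer
        # Fall back to the first standalone A-F token in the text
        match = re.search(r"\b([A-F])\b", answer)
        return match.group(1) if match else ""
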
experiments/chexbench_gpt4.py ADDED
@@ -0,0 +1,405 @@
1
+ import json
2
+ import openai
3
+ import os
4
+ from datetime import datetime
5
+ import base64
6
+ import logging
7
+ from pathlib import Path
8
+ import time
9
+ from tqdm import tqdm
10
+ from typing import Dict, List, Optional, Union, Any
11
+
12
+ # Configuration constants
13
+ DEBUG_MODE = False
14
+ OUTPUT_DIR = "results"
15
+ MODEL_NAME = "gpt-4o-2024-05-13"
16
+ TEMPERATURE = 0.2
17
+ SUBSET = "Visual Question Answering"
18
+
19
+ # Set up logging configuration
20
+ logging_level = logging.DEBUG if DEBUG_MODE else logging.INFO
21
+ logging.basicConfig(level=logging_level, format="%(asctime)s - %(levelname)s - %(message)s")
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ def get_mime_type(file_path: str) -> str:
26
+ """
27
+ Determine MIME type based on file extension.
28
+
29
+ Args:
30
+ file_path (str): Path to the file
31
+
32
+ Returns:
33
+ str: MIME type string for the file
34
+ """
35
+ extension = os.path.splitext(file_path)[1].lower()
36
+ mime_types = {
37
+ ".png": "image/png",
38
+ ".jpg": "image/jpeg",
39
+ ".jpeg": "image/jpeg",
40
+ ".gif": "image/gif",
41
+ }
42
+ return mime_types.get(extension, "application/octet-stream")
43
+
44
+
45
+ def encode_image(image_path: str) -> str:
46
+ """
47
+ Encode image to base64 with extensive error checking.
48
+
49
+ Args:
50
+ image_path (str): Path to the image file
51
+
52
+ Returns:
53
+ str: Base64 encoded image string
54
+
55
+ Raises:
56
+ FileNotFoundError: If image file does not exist
57
+ ValueError: If image file is empty or too large
58
+ Exception: For other image processing errors
59
+ """
60
+ logger.debug(f"Attempting to read image from: {image_path}")
61
+ if not os.path.exists(image_path):
62
+ raise FileNotFoundError(f"Image file not found: {image_path}")
63
+
64
+ # Add check for file size
65
+ file_size = os.path.getsize(image_path)
66
+ if file_size > 20 * 1024 * 1024: # 20MB limit
67
+ raise ValueError("Image file size exceeds 20MB limit")
68
+ if file_size == 0:
69
+ raise ValueError("Image file is empty")
70
+ logger.debug(f"Image file size: {file_size / 1024:.2f} KB")
71
+
72
+ try:
73
+ from PIL import Image
74
+
75
+ # Try to open and verify the image
76
+ with Image.open(image_path) as img:
77
+ # Get image details
78
+ width, height = img.size
79
+ format = img.format
80
+ mode = img.mode
81
+ logger.debug(
82
+ f"Image verification - Format: {format}, Size: {width}x{height}, Mode: {mode}"
83
+ )
84
+
85
+ if format not in ["PNG", "JPEG", "GIF"]:
86
+ raise ValueError(f"Unsupported image format: {format}")
87
+
88
+ with open(image_path, "rb") as image_file:
89
+ # Read the first few bytes to verify it's a valid PNG
90
+ header = image_file.read(8)
91
+ # if header != b'\x89PNG\r\n\x1a\n':
92
+ # logger.warning("File does not have a valid PNG signature")
93
+
94
+ # Reset file pointer and read entire file
95
+ image_file.seek(0)
96
+ encoded = base64.b64encode(image_file.read()).decode("utf-8")
97
+ encoded_length = len(encoded)
98
+ logger.debug(f"Base64 encoded length: {encoded_length} characters")
99
+
100
+ # Verify the encoded string is not empty and starts correctly
101
+ if encoded_length == 0:
102
+ raise ValueError("Base64 encoding produced empty string")
103
+ if not encoded.startswith("/9j/") and not encoded.startswith("iVBOR"):
104
+ logger.warning("Base64 string doesn't start with expected JPEG or PNG header")
105
+
106
+ return encoded
107
+ except Exception as e:
108
+ logger.error(f"Error reading/encoding image: {str(e)}")
109
+ raise
110
+
111
+
112
+ def create_single_request(
113
+ image_path: str, question: str, options: Dict[str, str]
114
+ ) -> List[Dict[str, Any]]:
115
+ """
116
+ Create a single API request with image and question.
117
+
118
+ Args:
119
+ image_path (str): Path to the image file
120
+ question (str): Question text
121
+ options (Dict[str, str]): Dictionary containing options with keys 'option_0' and 'option_1'
122
+
123
+ Returns:
124
+ List[Dict[str, Any]]: List of message dictionaries for the API request
125
+
126
+ Raises:
127
+ Exception: For errors in request creation
128
+ """
129
+ if DEBUG_MODE:
130
+ logger.debug("Creating API request...")
131
+
132
+ prompt = f"""Given the following medical examination question:
133
+ Please answer this multiple choice question:
134
+
135
+ Question: {question}
136
+
137
+ Options:
138
+ A) {options['option_0']}
139
+ B) {options['option_1']}
140
+
141
+ Base your answer only on the provided image and select either A or B."""
142
+
143
+ try:
144
+ encoded_image = encode_image(image_path)
145
+ mime_type = get_mime_type(image_path)
146
+
147
+ if DEBUG_MODE:
148
+ logger.debug(f"Image encoded with MIME type: {mime_type}")
149
+
150
+ messages = [
151
+ {
152
+ "role": "system",
153
+ "content": "You are taking a medical exam. Answer ONLY with the letter (A/B) corresponding to your answer.",
154
+ },
155
+ {
156
+ "role": "user",
157
+ "content": [
158
+ {"type": "text", "text": prompt},
159
+ {
160
+ "type": "image_url",
161
+ "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
162
+ },
163
+ ],
164
+ },
165
+ ]
166
+
167
+ if DEBUG_MODE:
168
+ log_messages = json.loads(json.dumps(messages))
169
+ log_messages[1]["content"][1]["image_url"][
170
+ "url"
171
+ ] = f"data:{mime_type};base64,[BASE64_IMAGE_TRUNCATED]"
172
+ logger.debug(f"Complete API request payload:\n{json.dumps(log_messages, indent=2)}")
173
+
174
+ return messages
175
+
176
+ except Exception as e:
177
+ logger.error(f"Error creating request: {str(e)}")
178
+ raise
179
+
180
+
181
+ def check_answer(model_answer: str, correct_answer: int) -> bool:
182
+ """
183
+ Check if the model's answer matches the correct answer.
184
+
185
+ Args:
186
+ model_answer (str): The model's answer (A or B)
187
+ correct_answer (int): The correct answer index (0 for A, 1 for B)
188
+
189
+ Returns:
190
+ bool: True if answer is correct, False otherwise
191
+ """
192
+ if not isinstance(model_answer, str):
193
+ return False
194
+
195
+ # Clean the model answer to get just the letter
196
+ model_letter = model_answer.strip().upper()
197
+ if model_letter.startswith("A"):
198
+ model_index = 0
199
+ elif model_letter.startswith("B"):
200
+ model_index = 1
201
+ else:
202
+ return False
203
+
204
+ return model_index == correct_answer
205
+
206
+
207
+ def save_results_to_json(results: List[Dict[str, Any]], output_dir: str) -> str:
208
+ """
209
+ Save results to a JSON file with timestamp.
210
+
211
+ Args:
212
+ results (List[Dict[str, Any]]): List of result dictionaries
213
+ output_dir (str): Directory to save results
214
+
215
+ Returns:
216
+ str: Path to the saved file
217
+ """
218
+ Path(output_dir).mkdir(parents=True, exist_ok=True)
219
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
220
+ output_file = os.path.join(output_dir, f"batch_results_{timestamp}.json")
221
+
222
+ with open(output_file, "w") as f:
223
+ json.dump(results, f, indent=2)
224
+
225
+ logger.info(f"Batch results saved to {output_file}")
226
+ return output_file
227
+
228
+
229
+ def calculate_accuracy(results: List[Dict[str, Any]]) -> tuple[float, int, int]:
230
+ """
231
+ Calculate accuracy from results, handling error cases.
232
+
233
+ Args:
234
+ results (List[Dict[str, Any]]): List of result dictionaries
235
+
236
+ Returns:
237
+ tuple[float, int, int]: Tuple containing (accuracy percentage, number correct, total)
238
+ """
239
+ if not results:
240
+ return 0.0, 0, 0
241
+
242
+ total = len(results)
243
+ valid_results = [r for r in results if "output" in r]
244
+ correct = sum(
245
+ 1 for result in valid_results if result.get("output", {}).get("is_correct", False)
246
+ )
247
+
248
+ accuracy = (correct / total * 100) if total > 0 else 0
249
+ return accuracy, correct, total
250
+
251
+
252
+ def calculate_batch_accuracy(results: List[Dict[str, Any]]) -> float:
253
+ """
254
+ Calculate accuracy for the current batch.
255
+
256
+ Args:
257
+ results (List[Dict[str, Any]]): List of result dictionaries
258
+
259
+ Returns:
260
+ float: Accuracy percentage for the batch
261
+ """
262
+ valid_results = [r for r in results if "output" in r]
263
+ if not valid_results:
264
+ return 0.0
265
+ return sum(1 for r in valid_results if r["output"]["is_correct"]) / len(valid_results) * 100
266
+
267
+
268
+ def process_batch(
269
+ data: List[Dict[str, Any]], client: openai.OpenAI, start_idx: int = 0, batch_size: int = 50
270
+ ) -> List[Dict[str, Any]]:
271
+ """
272
+ Process a batch of examples and return results.
273
+
274
+ Args:
275
+ data (List[Dict[str, Any]]): List of data items to process
276
+ client (openai.OpenAI): OpenAI client instance
277
+ start_idx (int, optional): Starting index for batch. Defaults to 0
278
+ batch_size (int, optional): Size of batch to process. Defaults to 50
279
+
280
+ Returns:
281
+ List[Dict[str, Any]]: List of processed results
282
+ """
283
+ batch_results = []
284
+ end_idx = min(start_idx + batch_size, len(data))
285
+
286
+ pbar = tqdm(
287
+ range(start_idx, end_idx),
288
+ desc=f"Processing batch {start_idx//batch_size + 1}",
289
+ unit="example",
290
+ )
291
+
292
+ for index in pbar:
293
+ vqa_item = data[index]
294
+ options = {"option_0": vqa_item["option_0"], "option_1": vqa_item["option_1"]}
295
+
296
+ try:
297
+ messages = create_single_request(
298
+ image_path=vqa_item["image_path"], question=vqa_item["question"], options=options
299
+ )
300
+
301
+ response = client.chat.completions.create(
302
+ model=MODEL_NAME, messages=messages, max_tokens=50, temperature=TEMPERATURE
303
+ )
304
+
305
+ model_answer = response.choices[0].message.content.strip()
306
+ is_correct = check_answer(model_answer, vqa_item["answer"])
307
+
308
+ result = {
309
+ "timestamp": datetime.now().isoformat(),
310
+ "example_index": index,
311
+ "input": {
312
+ "question": vqa_item["question"],
313
+ "options": {"A": vqa_item["option_0"], "B": vqa_item["option_1"]},
314
+ "image_path": vqa_item["image_path"],
315
+ },
316
+ "output": {
317
+ "model_answer": model_answer,
318
+ "correct_answer": "A" if vqa_item["answer"] == 0 else "B",
319
+ "is_correct": is_correct,
320
+ "usage": {
321
+ "prompt_tokens": response.usage.prompt_tokens,
322
+ "completion_tokens": response.usage.completion_tokens,
323
+ "total_tokens": response.usage.total_tokens,
324
+ },
325
+ },
326
+ }
327
+ batch_results.append(result)
328
+
329
+ # Update progress bar with current accuracy
330
+ current_accuracy = calculate_batch_accuracy(batch_results)
331
+ pbar.set_description(
332
+ f"Batch {start_idx//batch_size + 1} - Accuracy: {current_accuracy:.2f}% "
333
+ f"({len(batch_results)}/{index-start_idx+1} examples)"
334
+ )
335
+
336
+ except Exception as e:
337
+ error_result = {
338
+ "timestamp": datetime.now().isoformat(),
339
+ "example_index": index,
340
+ "error": str(e),
341
+ "input": {
342
+ "question": vqa_item["question"],
343
+ "options": {"A": vqa_item["option_0"], "B": vqa_item["option_1"]},
344
+ "image_path": vqa_item["image_path"],
345
+ },
346
+ }
347
+ batch_results.append(error_result)
348
+ if DEBUG_MODE:
349
+ pbar.write(f"Error processing example {index}: {str(e)}")
350
+
351
+ time.sleep(1) # Rate limiting
352
+
353
+ return batch_results
354
+
355
+
356
+ def main() -> None:
357
+ """
358
+ Main function to process the entire dataset.
359
+
360
+ Raises:
361
+ ValueError: If OPENAI_API_KEY is not set
362
+ Exception: For other processing errors
363
+ """
364
+ logger.info("Starting full dataset processing...")
365
+ json_path = "../data/chexbench_updated.json"
366
+
367
+ try:
368
+ api_key = os.getenv("OPENAI_API_KEY")
369
+ if not api_key:
370
+ raise ValueError("OPENAI_API_KEY environment variable is not set.")
371
+ client = openai.OpenAI(api_key=api_key)
372
+
373
+ with open(json_path, "r") as f:
374
+ data = json.load(f)
375
+
376
+ subset_data = data[SUBSET]
377
+ total_examples = len(subset_data)
378
+ logger.info(f"Found {total_examples} examples in {SUBSET} subset")
379
+
380
+ all_results = []
381
+ batch_size = 50 # Process in batches of 50 examples
382
+
383
+ # Process all examples in batches
384
+ for start_idx in range(0, total_examples, batch_size):
385
+ batch_results = process_batch(subset_data, client, start_idx, batch_size)
386
+ all_results.extend(batch_results)
387
+
388
+ # Save intermediate results after each batch
389
+ output_file = save_results_to_json(all_results, OUTPUT_DIR)
390
+
391
+ # Calculate and log overall progress
392
+ overall_accuracy, correct, total = calculate_accuracy(all_results)
393
+ logger.info(f"Overall Progress: {len(all_results)}/{total_examples} examples processed")
394
+ logger.info(f"Current Accuracy: {overall_accuracy:.2f}% ({correct}/{total} correct)")
395
+
396
+ logger.info("Processing completed!")
397
+ logger.info(f"Final results saved to: {output_file}")
398
+
399
+ except Exception as e:
400
+ logger.error(f"Fatal error: {str(e)}")
401
+ raise
402
+
403
+
404
+ if __name__ == "__main__":
405
+ main()
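
For spot-checking a single example outside process_batch, a short sketch that reuses the functions defined in this script works; the image path, question, and options below are placeholders:

    import os
    import openai

    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    messages = create_single_request(
        image_path="images/example_cxr.png",
        question="Is a pleural effusion present?",
        options={"option_0": "Yes", "option_1": "No"},
    )
    response = client.chat.completions.create(
        model=MODEL_NAME, messages=messages, max_tokens=50, temperature=TEMPERATURE
    )
    print(response.choices[0].message.content.strip())
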
experiments/compare_runs.py ADDED
@@ -0,0 +1,290 @@
1
+ import json
2
+ import argparse
3
+ import random
4
+ from typing import List, Dict, Any, Tuple
5
+ import re
6
+ from collections import defaultdict
7
+
8
+ # Define category order
9
+ CATEGORY_ORDER = [
10
+ "detection",
11
+ "classification",
12
+ "localization",
13
+ "comparison",
14
+ "relationship",
15
+ "diagnosis",
16
+ "characterization",
17
+ ]
18
+
19
+
20
+ def extract_letter_answer(answer: str) -> str:
21
+ """Extract just the letter answer from various answer formats.
22
+
23
+ Args:
24
+ answer: The answer string to extract a letter from
25
+
26
+ Returns:
27
+ str: The extracted letter in uppercase, or empty string if no letter found
28
+ """
29
+ if not answer:
30
+ return ""
31
+
32
+ # Convert to string and clean
33
+ answer = str(answer).strip()
34
+
35
+ # If it's just a single letter A-F, return it
36
+ if len(answer) == 1 and answer.upper() in "ABCDEF":
37
+ return answer.upper()
38
+
39
+ # Try to match patterns like "A)", "A.", "A ", etc.
40
+ match = re.match(r"^([A-F])[).\s]", answer, re.IGNORECASE)
41
+ if match:
42
+ return match.group(1).upper()
43
+
44
+ # Try to find any standalone A-F letters preceded by space or start of string
45
+ # and followed by space, period, parenthesis or end of string
46
+ matches = re.findall(r"(?:^|\s)([A-F])(?:[).\s]|$)", answer, re.IGNORECASE)
47
+ if matches:
48
+ return matches[0].upper()
49
+
50
+ # Last resort: just find any A-F letter
51
+ letters = re.findall(r"[A-F]", answer, re.IGNORECASE)
52
+ if letters:
53
+ return letters[0].upper()
54
+
55
+ # If no letter found, return original (cleaned)
56
+ return answer.strip().upper()
57
+
58
+
59
+ def parse_json_lines(file_path: str) -> Tuple[str, List[Dict[str, Any]]]:
60
+ """Parse JSON Lines file and extract valid predictions.
61
+
62
+ Args:
63
+ file_path: Path to the JSON Lines file to parse
64
+
65
+ Returns:
66
+ Tuple containing:
67
+ - str: Model name or file path if model name not found
68
+ - List[Dict[str, Any]]: List of valid prediction entries
69
+ """
70
+ valid_predictions = []
71
+ model_name = None
72
+
73
+ # First try to parse as LLaVA format
74
+ try:
75
+ with open(file_path, "r", encoding="utf-8") as f:
76
+ data = json.load(f)
77
+ if data.get("model") == "llava-med-v1.5-mistral-7b":
78
+ model_name = data["model"]
79
+ for result in data.get("results", []):
80
+ if all(k in result for k in ["case_id", "question_id", "correct_answer"]):
81
+ # Extract answer with priority: model_answer > validated_answer > raw_output
82
+ model_answer = (
83
+ result.get("model_answer")
84
+ or result.get("validated_answer")
85
+ or result.get("raw_output", "")
86
+ )
87
+
88
+ # Add default categories for LLaVA results
89
+ prediction = {
90
+ "case_id": result["case_id"],
91
+ "question_id": result["question_id"],
92
+ "model_answer": model_answer,
93
+ "correct_answer": result["correct_answer"],
94
+ "input": {
95
+ "question_data": {
96
+ "metadata": {
97
+ "categories": [
98
+ "detection",
99
+ "classification",
100
+ "localization",
101
+ "comparison",
102
+ "relationship",
103
+ "diagnosis",
104
+ "characterization",
105
+ ]
106
+ }
107
+ }
108
+ },
109
+ }
110
+ valid_predictions.append(prediction)
111
+ return model_name, valid_predictions
112
+ except (json.JSONDecodeError, KeyError):
113
+ pass
114
+
115
+ # If not LLaVA format, process as original format
116
+ with open(file_path, "r", encoding="utf-8") as f:
117
+ for line in f:
118
+ if line.startswith("HTTP Request:"):
119
+ continue
120
+ try:
121
+ data = json.loads(line.strip())
122
+ if "model" in data:
123
+ model_name = data["model"]
124
+ if all(
125
+ k in data for k in ["model_answer", "correct_answer", "case_id", "question_id"]
126
+ ):
127
+ valid_predictions.append(data)
128
+ except json.JSONDecodeError:
129
+ continue
130
+
131
+ return model_name if model_name else file_path, valid_predictions
132
+
133
+
134
+ def filter_common_questions(
135
+ predictions_list: List[List[Dict[str, Any]]]
136
+ ) -> List[List[Dict[str, Any]]]:
137
+ """Ensure only questions that exist across all models are evaluated.
138
+
139
+ Args:
140
+ predictions_list: List of prediction lists from different models
141
+
142
+ Returns:
143
+ List[List[Dict[str, Any]]]: Filtered predictions containing only common questions
144
+ """
145
+ question_sets = [
146
+ set((p["case_id"], p["question_id"]) for p in preds) for preds in predictions_list
147
+ ]
148
+ common_questions = set.intersection(*question_sets)
149
+
150
+ return [
151
+ [p for p in preds if (p["case_id"], p["question_id"]) in common_questions]
152
+ for preds in predictions_list
153
+ ]
154
+
155
+
156
+ def calculate_accuracy(
157
+ predictions: List[Dict[str, Any]]
158
+ ) -> Tuple[float, int, int, Dict[str, Dict[str, float]]]:
159
+ """Compute overall and category-level accuracy.
160
+
161
+ Args:
162
+ predictions: List of prediction entries to analyze
163
+
164
+ Returns:
165
+ Tuple containing:
166
+ - float: Overall accuracy percentage
167
+ - int: Number of correct predictions
168
+ - int: Total number of predictions
169
+ - Dict[str, Dict[str, float]]: Category-level accuracy statistics
170
+ """
171
+ if not predictions:
172
+ return 0.0, 0, 0, {}
173
+
174
+ category_performance = defaultdict(lambda: {"total": 0, "correct": 0})
175
+ correct = 0
176
+ total = 0
177
+ sample_size = min(5, len(predictions))
178
+ sampled_indices = random.sample(range(len(predictions)), sample_size)
179
+
180
+ print("\nSample extracted answers:")
181
+ for i in sampled_indices:
182
+ pred = predictions[i]
183
+ model_ans = extract_letter_answer(pred["model_answer"])
184
+ correct_ans = extract_letter_answer(pred["correct_answer"])
185
+ print(f"QID: {pred['question_id']}")
186
+ print(f" Raw Model Answer: {pred['model_answer']}")
187
+ print(f" Extracted Model Answer: {model_ans}")
188
+ print(f" Raw Correct Answer: {pred['correct_answer']}")
189
+ print(f" Extracted Correct Answer: {correct_ans}")
190
+ print("-" * 80)
191
+
192
+ for pred in predictions:
193
+ try:
194
+ model_ans = extract_letter_answer(pred["model_answer"])
195
+ correct_ans = extract_letter_answer(pred["correct_answer"])
196
+ categories = (
197
+ pred.get("input", {})
198
+ .get("question_data", {})
199
+ .get("metadata", {})
200
+ .get("categories", [])
201
+ )
202
+
203
+ if model_ans and correct_ans:
204
+ total += 1
205
+ is_correct = model_ans == correct_ans
206
+ if is_correct:
207
+ correct += 1
208
+
209
+ for category in categories:
210
+ category_performance[category]["total"] += 1
211
+ if is_correct:
212
+ category_performance[category]["correct"] += 1
213
+
214
+ except KeyError:
215
+ continue
216
+
217
+ category_accuracies = {
218
+ category: {
219
+ "accuracy": (stats["correct"] / stats["total"]) * 100 if stats["total"] > 0 else 0,
220
+ "total": stats["total"],
221
+ "correct": stats["correct"],
222
+ }
223
+ for category, stats in category_performance.items()
224
+ }
225
+
226
+ return (correct / total * 100 if total > 0 else 0.0, correct, total, category_accuracies)
227
+
228
+
229
+ def compare_models(file_paths: List[str]) -> None:
230
+ """Compare accuracy between multiple model prediction files.
231
+
232
+ Args:
233
+ file_paths: List of paths to model prediction files to compare
234
+ """
235
+ # Parse all files
236
+ parsed_results = [parse_json_lines(file_path) for file_path in file_paths]
237
+ model_names, predictions_list = zip(*parsed_results)
238
+
239
+ # Get initial stats
240
+ print(f"\n📊 **Initial Accuracy**:")
241
+ results = []
242
+ category_results = []
243
+
244
+ for preds, name in zip(predictions_list, model_names):
245
+ acc, correct, total, category_acc = calculate_accuracy(preds)
246
+ results.append((acc, correct, total, name))
247
+ category_results.append(category_acc)
248
+ print(f"{name}: Accuracy = {acc:.2f}% ({correct}/{total} correct)")
249
+
250
+ # Get common questions across all models
251
+ filtered_predictions = filter_common_questions(predictions_list)
252
+ print(
253
+ f"\nQuestions per model after ensuring common questions: {[len(p) for p in filtered_predictions]}"
254
+ )
255
+
256
+ # Compute accuracy on common questions
257
+ print(f"\n📊 **Accuracy on Common Questions**:")
258
+ filtered_results = []
259
+ filtered_category_results = []
260
+
261
+ for preds, name in zip(filtered_predictions, model_names):
262
+ acc, correct, total, category_acc = calculate_accuracy(preds)
263
+ filtered_results.append((acc, correct, total, name))
264
+ filtered_category_results.append(category_acc)
265
+ print(f"{name}: Accuracy = {acc:.2f}% ({correct}/{total} correct)")
266
+
267
+ # Print category-wise accuracy
268
+ print("\nCategory Performance (Common Questions):")
269
+ for category in CATEGORY_ORDER:
270
+ print(f"\n{category.capitalize()}:")
271
+ for model_name, category_acc in zip(model_names, filtered_category_results):
272
+ stats = category_acc.get(category, {"accuracy": 0, "total": 0, "correct": 0})
273
+ print(f" {model_name}: {stats['accuracy']:.2f}% ({stats['correct']}/{stats['total']})")
274
+
275
+
276
+ def main():
277
+ parser = argparse.ArgumentParser(
278
+ description="Compare accuracy across multiple model prediction files"
279
+ )
280
+ parser.add_argument("files", nargs="+", help="Paths to model prediction files")
281
+ parser.add_argument("--seed", type=int, default=42, help="Random seed for sampling")
282
+
283
+ args = parser.parse_args()
284
+ random.seed(args.seed)
285
+
286
+ compare_models(args.files)
287
+
288
+
289
+ if __name__ == "__main__":
290
+ main()
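
The comparison can also be driven programmatically instead of via the CLI; the file names below are placeholders for real run outputs (one JSON-lines log and one final-results file):

    import random

    random.seed(42)  # mirrors the --seed default so the sampled debug prints are reproducible
    compare_models([
        "medrax_logs/medrax_20250101_000000.json",
        "benchmark_results/final_results_20250101_000000.json",
    ])
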
experiments/inspect_logs.py ADDED
@@ -0,0 +1,210 @@
1
+ from typing import Optional, List
2
+ import argparse
3
+ import json
4
+ import glob
5
+ from pathlib import Path
6
+ from datetime import datetime
7
+
8
+
9
+ def get_latest_log() -> str:
10
+ """Find the most recently modified log file in the current directory.
11
+
12
+ Returns:
13
+ str: Path to the most recently modified log file
14
+
15
+ Raises:
16
+ FileNotFoundError: If no log files are found in the current directory
17
+ """
18
+ logs = list(Path(".").glob("api_usage_*.json"))
19
+ if not logs:
20
+ raise FileNotFoundError("No log files found in the current directory.")
21
+ return str(max(logs, key=lambda p: p.stat().st_mtime))
22
+
23
+
24
+ def format_cost(entry: dict) -> str:
25
+ """Format cost if available, otherwise return 'N/A'
26
+
27
+ Args:
28
+ entry: Log entry dictionary containing cost information
29
+
30
+ Returns:
31
+ str: Formatted cost string with $ and 4 decimal places, or 'N/A' if cost not found
32
+ """
33
+ return f"${entry.get('cost', 'N/A'):.4f}" if "cost" in entry else "N/A"
34
+
35
+
36
+ def print_gpt4_entry(entry: dict) -> None:
37
+ """Print entry for GPT-4 format
38
+
39
+ Args:
40
+ entry: Log entry dictionary in GPT-4 format containing model info, inputs and outputs
41
+ """
42
+ print("\n=== Log Entry ===")
43
+ print(f"Model: {entry['model']}")
44
+ print(f"Case ID: {entry['case_id']}")
45
+ print(f"Question ID: {entry['question_id']}")
46
+
47
+ print("\n=== Model Input ===")
48
+ messages = entry["input"]["messages"]
49
+ print("System message:", messages[0]["content"])
50
+ user_content = messages[1]["content"]
51
+ print("\nUser prompt:", user_content[0]["text"])
52
+ print("\nImages provided:")
53
+ for content in user_content[1:]:
54
+ print(f" - {content['image_url']['url']}")
55
+
56
+ print("\n=== Model Output ===")
57
+ print(f"Answer: {entry['model_answer']}")
58
+ print(f"Correct: {entry['correct_answer']}")
59
+
60
+ print("\n=== Usage Stats ===")
61
+ print(f"Duration: {entry['duration']}s")
62
+ print(f"Cost: {format_cost(entry)}")
63
+ print(
64
+ f"Tokens: {entry['usage']['total_tokens']}",
65
+ f"(prompt: {entry['usage']['prompt_tokens']},",
66
+ f"completion: {entry['usage']['completion_tokens']})",
67
+ )
68
+
69
+
70
+ def print_llama_entry(entry: dict) -> None:
71
+ """Print entry for Llama-3.2 format
72
+
73
+ Args:
74
+ entry: Log entry dictionary in Llama format containing model info, inputs and outputs
75
+ """
76
+ print("\n=== Log Entry ===")
77
+ print(f"Model: {entry['model']}")
78
+ print(f"Case ID: {entry['case_id']}")
79
+ print(f"Question ID: {entry['question_id']}")
80
+
81
+ print("\n=== Model Input ===")
82
+ print(f"Question: {entry['input']['question_data']['question']}")
83
+ print("\nImages provided:")
84
+ for url in entry["input"]["image_urls"]:
85
+ print(f" - {url}")
86
+ if entry["input"]["image_captions"]:
87
+ print("\nImage captions:")
88
+ for caption in entry["input"]["image_captions"]:
89
+ if caption:
90
+ print(f" - {caption}")
91
+
92
+ print("\n=== Model Output ===")
93
+ print(f"Answer: {entry['model_answer']}")
94
+ print(f"Correct: {entry['correct_answer']}")
95
+
96
+ print("\n=== Usage Stats ===")
97
+ print(f"Duration: {entry['duration']}s")
98
+ if "usage" in entry:
99
+ print(
100
+ f"Tokens: {entry['usage']['total_tokens']}",
101
+ f"(prompt: {entry['usage']['prompt_tokens']},",
102
+ f"completion: {entry['usage']['completion_tokens']})",
103
+ )
104
+
105
+
106
+ def determine_model_type(entry: dict) -> str:
107
+ """Determine the model type from the entry
108
+
109
+ Args:
110
+ entry: Log entry dictionary containing model information
111
+
112
+ Returns:
113
+ str: Model type - 'gpt4', 'llama', or 'unknown'
114
+ """
115
+ model = entry.get("model", "").lower()
116
+ if "gpt-4" in model:
117
+ return "gpt4"
118
+ elif "llama" in model:
119
+ return "llama"
120
+ elif "chexagent" in model:
121
+ return "chexagent"
122
+ elif "medrax" in model:
123
+ return "medrax"
124
+ else:
125
+ return "unknown"
126
+
127
+
128
+ def print_log_entry(
129
+ log_file: Optional[str] = None,
130
+ num_entries: Optional[int] = None,
131
+ model_filter: Optional[str] = None,
132
+ ) -> None:
133
+ """Print log entries from the specified log file or the latest log file.
134
+
135
+ Args:
136
+ log_file: Path to the log file. If None, uses the latest log file.
137
+ num_entries: Number of entries to print. If None, prints all entries.
138
+ model_filter: Filter entries by model type ('gpt4' or 'llama'). If None, prints all.
139
+ """
140
+ if log_file is None:
141
+ log_file = get_latest_log()
142
+ print(f"Using latest log file: {log_file}")
143
+
144
+ entries_printed = 0
145
+ total_entries = 0
146
+ filtered_entries = 0
147
+
148
+ with open(log_file, "r") as f:
149
+ for line in f:
150
+ if line.startswith("HTTP"):
151
+ continue
152
+ try:
153
+ total_entries += 1
154
+ entry = json.loads(line)
155
+
156
+ # Apply model filter if specified
157
+ model_type = determine_model_type(entry)
158
+ if model_filter and model_type != model_filter:
159
+ filtered_entries += 1
160
+ continue
161
+
162
+ if model_type == "gpt4":
163
+ print_gpt4_entry(entry)
164
+ elif model_type == "llama":
165
+ print_llama_entry(entry)
166
+ else:
167
+ print(f"Unknown model type in entry: {entry['model']}")
168
+ continue
169
+
170
+ print("=" * 50)
171
+ entries_printed += 1
172
+ if num_entries and entries_printed >= num_entries:
173
+ break
174
+
175
+ except (json.JSONDecodeError, KeyError) as e:
176
+ print(f"Error processing entry: {e}")
177
+ continue
178
+
179
+ print(f"\nSummary:")
180
+ print(f"Total entries: {total_entries}")
181
+ print(f"Entries printed: {entries_printed}")
182
+ if model_filter:
183
+ print(f"Entries filtered: {filtered_entries}")
184
+
185
+
186
+ def main() -> None:
187
+ """Main entry point for the script"""
188
+ parser = argparse.ArgumentParser(
189
+ description="Parse and display log entries from API usage logs."
190
+ )
191
+ parser.add_argument("-l", "--log_file", nargs="?", help="Path to the log file (optional)")
192
+ parser.add_argument("-n", "--num_entries", type=int, help="Number of entries to display")
193
+ parser.add_argument(
194
+ "-m",
195
+ "--model",
196
+ choices=["gpt4", "llama"],
197
+ default="gpt4",
198
+ help="Model type to display (default: gpt4)",
199
+ )
200
+ args = parser.parse_args()
201
+
202
+ try:
203
+ print_log_entry(args.log_file, args.num_entries, args.model)
204
+ except FileNotFoundError as e:
205
+ print(f"Error: {e}")
206
+ exit(1)
207
+
208
+
209
+ if __name__ == "__main__":
210
+ main()
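
The entry point can also be called directly, equivalent to running the script with -n 3 -m llama; the log filename is a placeholder and can be omitted to fall back to the latest api_usage_*.json file:

    print_log_entry(
        log_file="api_usage_20250101_000000.json",
        num_entries=3,
        model_filter="llama",
    )
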
experiments/validate_logs.py ADDED
@@ -0,0 +1,162 @@
1
+ from typing import Dict, List, Tuple, Optional
2
+ import json
3
+ import sys
4
+ import glob
5
+ from pathlib import Path
6
+ from collections import defaultdict
7
+
8
+
9
+ def get_latest_log() -> str:
10
+ """Find the most recently modified log file in the current directory.
11
+
12
+ Returns:
13
+ str: Path to the most recently modified log file
14
+
15
+ Raises:
16
+ SystemExit: If no log files are found in current directory
17
+ """
18
+ log_pattern = "api_usage_*.json"
19
+ logs = list(Path(".").glob(log_pattern))
20
+ if not logs:
21
+ print(f"No files matching pattern '{log_pattern}' found in current directory")
22
+ sys.exit(1)
23
+ return str(max(logs, key=lambda p: p.stat().st_mtime))
24
+
25
+
26
+ def analyze_log_file(filename: str) -> Tuple[List[Dict], List[Dict], Dict[str, List[str]]]:
27
+ """Analyze a log file for entries missing images and errors.
28
+
29
+ Args:
30
+ filename: Path to the log file to analyze
31
+
32
+ Returns:
33
+ Tuple containing:
34
+ - List of entries with no images
35
+ - List of skipped/error entries
36
+ - Dict of processing errors by type
37
+
38
+ Raises:
39
+ SystemExit: If file cannot be found or read
40
+ """
41
+ no_images = []
42
+ errors = defaultdict(list)
43
+ skipped = []
44
+
45
+ try:
46
+ with open(filename, "r") as f:
47
+ for line_num, line in enumerate(f, 1):
48
+ # Skip HTTP request logs
49
+ if line.startswith("HTTP Request:") or line.strip() == "":
50
+ continue
51
+ try:
52
+ # Try to parse the JSON line
53
+ if not line.strip().startswith("{"):
54
+ continue
55
+ entry = json.loads(line.strip())
56
+ case_id = entry.get("case_id")
57
+ question_id = entry.get("question_id")
58
+
59
+ # Skip if we can't identify the question
60
+ if not case_id or not question_id:
61
+ continue
62
+
63
+ # Check for explicit skip/error status
64
+ if entry.get("status") in ["skipped", "error"]:
65
+ skipped.append(
66
+ {
67
+ "case_id": case_id,
68
+ "question_id": question_id,
69
+ "reason": entry.get("reason"),
70
+ "status": entry.get("status"),
71
+ }
72
+ )
73
+ continue
74
+
75
+ # Check user content for images
76
+ messages = entry.get("input", {}).get("messages", [])
77
+ has_image = False
78
+ for msg in messages:
79
+ content = msg.get("content", [])
80
+ if isinstance(content, list):
81
+ for item in content:
82
+ if isinstance(item, dict) and item.get("type") == "image_url":
83
+ has_image = True
84
+ break
85
+ if not has_image:
86
+ no_images.append(
87
+ {
88
+ "case_id": case_id,
89
+ "question_id": question_id,
90
+ "question": entry.get("input", {})
91
+ .get("question_data", {})
92
+ .get("question", "")[:100]
93
+ + "...", # First 100 chars of question
94
+ }
95
+ )
96
+ except json.JSONDecodeError:
97
+ errors["json_decode"].append(f"Line {line_num}: Invalid JSON")
98
+ continue
99
+ except Exception as e:
100
+ errors["other"].append(f"Line {line_num}: Error processing entry: {str(e)}")
101
+ except FileNotFoundError:
102
+ print(f"Error: Could not find log file: {filename}")
103
+ sys.exit(1)
104
+ except Exception as e:
105
+ print(f"Error reading file {filename}: {str(e)}")
106
+ sys.exit(1)
107
+
108
+ return no_images, skipped, errors
109
+
110
+
111
+ def print_results(
112
+ filename: str, no_images: List[Dict], skipped: List[Dict], errors: Dict[str, List[str]]
113
+ ) -> None:
114
+ """Print analysis results.
115
+
116
+ Args:
117
+ filename: Name of the analyzed log file
118
+ no_images: List of entries with no images
119
+ skipped: List of skipped/error entries
120
+ errors: Dict of processing errors by type
121
+ """
122
+ print(f"\nAnalyzing log file: {filename}")
123
+ print("\n=== Questions with No Images ===")
124
+ if no_images:
125
+ for entry in no_images:
126
+ print(f"\nCase ID: {entry['case_id']}")
127
+ print(f"Question ID: {entry['question_id']}")
128
+ print(f"Question Preview: {entry['question']}")
129
+ print(f"\nTotal questions without images: {len(no_images)}")
130
+
131
+ print("\n=== Skipped/Error Questions ===")
132
+ if skipped:
133
+ for entry in skipped:
134
+ print(f"\nCase ID: {entry['case_id']}")
135
+ print(f"Question ID: {entry['question_id']}")
136
+ print(f"Status: {entry['status']}")
137
+ print(f"Reason: {entry.get('reason', 'unknown')}")
138
+ print(f"\nTotal skipped/error questions: {len(skipped)}")
139
+
140
+ if errors:
141
+ print("\n=== Processing Errors ===")
142
+ for error_type, messages in errors.items():
143
+ if messages:
144
+ print(f"\n{error_type}:")
145
+ for msg in messages:
146
+ print(f" {msg}")
147
+
148
+
149
+ def main() -> None:
150
+ """Main entry point for log validation script."""
151
+ # If a file is specified as an argument, use it; otherwise find the latest log
152
+ if len(sys.argv) > 1:
153
+ log_file = sys.argv[1]
154
+ else:
155
+ log_file = get_latest_log()
156
+
157
+ no_images, skipped, errors = analyze_log_file(log_file)
158
+ print_results(log_file, no_images, skipped, errors)
159
+
160
+
161
+ if __name__ == "__main__":
162
+ main()
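
Note on usage: the validator can be reused programmatically in the same way. A minimal sketch with an illustrative filename, printing a one-line summary instead of the full report:

    # Sketch: reusing the validator from another script (filename is hypothetical).
    from validate_logs import analyze_log_file, print_results

    log_path = "api_usage_example.json"
    no_images, skipped, errors = analyze_log_file(log_path)
    print(f"{len(no_images)} entries without images, {len(skipped)} skipped/error entries")
    # print_results(log_path, no_images, skipped, errors)  # full report, as in main()
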
interface.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import gradio as gr
3
+ from pathlib import Path
4
+ import time
5
+ import shutil
6
+ from typing import AsyncGenerator, List, Optional, Tuple
7
+ from gradio import ChatMessage
8
+
9
+
10
+ class ChatInterface:
11
+ """
12
+ A chat interface for interacting with a medical AI agent through Gradio.
13
+
14
+ Handles file uploads, message processing, and chat history management.
15
+ Supports both regular image files and DICOM medical imaging files.
16
+ """
17
+
18
+ def __init__(self, agent, tools_dict):
19
+ """
20
+ Initialize the chat interface.
21
+
22
+ Args:
23
+ agent: The medical AI agent to handle requests
24
+ tools_dict (dict): Dictionary of available tools for image processing
25
+ """
26
+ self.agent = agent
27
+ self.tools_dict = tools_dict
28
+ self.upload_dir = Path("temp")
29
+ self.upload_dir.mkdir(exist_ok=True)
30
+ self.current_thread_id = None
31
+ # Separate storage for original and display paths
32
+ self.original_file_path = None # For LLM (.dcm or other)
33
+ self.display_file_path = None # For UI (always viewable format)
34
+
35
+ def handle_upload(self, file_path: str) -> str:
36
+ """
37
+ Handle new file upload and set appropriate paths.
38
+
39
+ Args:
40
+ file_path (str): Path to the uploaded file
41
+
42
+ Returns:
43
+ str: Display path for UI, or None if no file uploaded
44
+ """
45
+ if not file_path:
46
+ return None
47
+
48
+ source = Path(file_path)
49
+ timestamp = int(time.time())
50
+
51
+ # Save original file with proper suffix
52
+ suffix = source.suffix.lower()
53
+ saved_path = self.upload_dir / f"upload_{timestamp}{suffix}"
54
+ shutil.copy2(file_path, saved_path) # Use file_path directly instead of source
55
+ self.original_file_path = str(saved_path)
56
+
57
+ # Handle DICOM conversion for display only
58
+ if suffix == ".dcm":
59
+ output, _ = self.tools_dict["DicomProcessorTool"]._run(str(saved_path))
60
+ self.display_file_path = output["image_path"]
61
+ else:
62
+ self.display_file_path = str(saved_path)
63
+
64
+ return self.display_file_path
65
+
66
+ def add_message(
67
+ self, message: str, display_image: str, history: List[dict]
68
+ ) -> Tuple[List[dict], gr.Textbox]:
69
+ """
70
+ Add a new message to the chat history.
71
+
72
+ Args:
73
+ message (str): Text message to add
74
+ display_image (str): Path to image being displayed
75
+ history (List[dict]): Current chat history
76
+
77
+ Returns:
78
+ Tuple[List[dict], gr.Textbox]: Updated history and textbox component
79
+ """
80
+ image_path = self.original_file_path or display_image
81
+ if image_path is not None:
82
+ history.append({"role": "user", "content": {"path": image_path}})
83
+ if message is not None:
84
+ history.append({"role": "user", "content": message})
85
+ return history, gr.Textbox(value=message, interactive=False)
86
+
87
+ async def process_message(
88
+ self, message: str, display_image: Optional[str], chat_history: List[ChatMessage]
89
+ ) -> AsyncGenerator[Tuple[List[ChatMessage], Optional[str], str], None]:
90
+ """
91
+ Process a message and generate responses.
92
+
93
+ Args:
94
+ message (str): User message to process
95
+ display_image (Optional[str]): Path to currently displayed image
96
+ chat_history (List[ChatMessage]): Current chat history
97
+
98
+ Yields:
99
+ Tuple[List[ChatMessage], Optional[str], str]: Updated chat history, display path, and empty string
100
+ """
101
+ chat_history = chat_history or []
102
+
103
+ # Initialize thread if needed
104
+ if not self.current_thread_id:
105
+ self.current_thread_id = str(time.time())
106
+
107
+ messages = []
108
+ image_path = self.original_file_path or display_image
109
+ if image_path is not None:
110
+ messages.append({"role": "user", "content": f"path: {image_path}"})
111
+ if message is not None:
112
+ messages.append({"role": "user", "content": message})
113
+
114
+ try:
115
+ for event in self.agent.workflow.stream(
116
+ {"messages": messages}, {"configurable": {"thread_id": self.current_thread_id}}
117
+ ):
118
+ if isinstance(event, dict):
119
+ if "process" in event:
120
+ content = event["process"]["messages"][-1].content
121
+ if content:
122
+ content = re.sub(r"temp/[^\s]*", "", content)
123
+ chat_history.append(ChatMessage(role="assistant", content=content))
124
+ yield chat_history, self.display_file_path, ""
125
+
126
+ elif "execute" in event:
127
+ for message in event["execute"]["messages"]:
128
+ tool_name = message.name
129
+ tool_result = eval(message.content)[0]
130
+
131
+ if tool_result:
132
+ metadata = {"title": f"🖼️ Image from tool: {tool_name}"}
133
+ formatted_result = " ".join(
134
+ line.strip() for line in str(tool_result).splitlines()
135
+ ).strip()
136
+ metadata["description"] = formatted_result
137
+ chat_history.append(
138
+ ChatMessage(
139
+ role="assistant",
140
+ content=formatted_result,
141
+ metadata=metadata,
142
+ )
143
+ )
144
+
145
+ # For image_visualizer, use display path
146
+ if tool_name == "image_visualizer":
147
+ self.display_file_path = tool_result["image_path"]
148
+ chat_history.append(
149
+ ChatMessage(
150
+ role="assistant",
151
+ # content=gr.Image(value=self.display_file_path),
152
+ content={"path": self.display_file_path},
153
+ )
154
+ )
155
+
156
+ yield chat_history, self.display_file_path, ""
157
+
158
+ except Exception as e:
159
+ chat_history.append(
160
+ ChatMessage(
161
+ role="assistant", content=f"❌ Error: {str(e)}", metadata={"title": "Error"}
162
+ )
163
+ )
164
+ yield chat_history, self.display_file_path, ""
165
+
166
+
167
+ def create_demo(agent, tools_dict):
168
+ """
169
+ Create a Gradio demo interface for the medical AI agent.
170
+
171
+ Args:
172
+ agent: The medical AI agent to handle requests
173
+ tools_dict (dict): Dictionary of available tools for image processing
174
+
175
+ Returns:
176
+ gr.Blocks: Gradio Blocks interface
177
+ """
178
+ interface = ChatInterface(agent, tools_dict)
179
+
180
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
181
+ with gr.Column():
182
+ gr.Markdown(
183
+ """
184
+ # 🏥 MedRAX
185
+ Medical Reasoning Agent for Chest X-ray
186
+ """
187
+ )
188
+
189
+ with gr.Row():
190
+ with gr.Column(scale=3):
191
+ chatbot = gr.Chatbot(
192
+ [],
193
+ height=800,
194
+ container=True,
195
+ show_label=True,
196
+ elem_classes="chat-box",
197
+ type="messages",
198
+ label="Agent",
199
+ avatar_images=(
200
+ None,
201
+ "assets/medrax_logo.jpg",
202
+ ),
203
+ )
204
+ with gr.Row():
205
+ with gr.Column(scale=3):
206
+ txt = gr.Textbox(
207
+ show_label=False,
208
+ placeholder="Ask about the X-ray...",
209
+ container=False,
210
+ )
211
+
212
+ with gr.Column(scale=3):
213
+ image_display = gr.Image(
214
+ label="Image", type="filepath", height=700, container=True
215
+ )
216
+ with gr.Row():
217
+ upload_button = gr.UploadButton(
218
+ "📎 Upload X-Ray",
219
+ file_types=["image"],
220
+ )
221
+ dicom_upload = gr.UploadButton(
222
+ "📄 Upload DICOM",
223
+ file_types=["file"],
224
+ )
225
+ with gr.Row():
226
+ clear_btn = gr.Button("Clear Chat")
227
+ new_thread_btn = gr.Button("New Thread")
228
+
229
+ # Event handlers
230
+ def clear_chat():
231
+ interface.original_file_path = None
232
+ interface.display_file_path = None
233
+ return [], None
234
+
235
+ def new_thread():
236
+ interface.current_thread_id = str(time.time())
237
+ return [], interface.display_file_path
238
+
239
+ def handle_file_upload(file):
240
+ return interface.handle_upload(file.name)
241
+
242
+ chat_msg = txt.submit(
243
+ interface.add_message, inputs=[txt, image_display, chatbot], outputs=[chatbot, txt]
244
+ )
245
+ bot_msg = chat_msg.then(
246
+ interface.process_message,
247
+ inputs=[txt, image_display, chatbot],
248
+ outputs=[chatbot, image_display, txt],
249
+ )
250
+ bot_msg.then(lambda: gr.Textbox(interactive=True), None, [txt])
251
+
252
+ upload_button.upload(handle_file_upload, inputs=upload_button, outputs=image_display)
253
+
254
+ dicom_upload.upload(handle_file_upload, inputs=dicom_upload, outputs=image_display)
255
+
256
+ clear_btn.click(clear_chat, outputs=[chatbot, image_display])
257
+ new_thread_btn.click(new_thread, outputs=[chatbot, image_display])
258
+
259
+ return demo
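
Note on the streaming contract: process_message assumes agent.workflow.stream yields dictionaries keyed by the graph node names, "process" for model text and "execute" for tool messages. A minimal stub of that contract, useful for exercising the UI without any models loaded; every name below is hypothetical:

    # Sketch: a fake agent that mimics the event shapes the interface consumes.
    from types import SimpleNamespace

    class FakeWorkflow:
        def stream(self, inputs, config):
            # One "process" event with a single model message and no tool calls.
            msg = SimpleNamespace(content="The X-ray appears normal.", name=None)
            yield {"process": {"messages": [msg]}}

    fake_agent = SimpleNamespace(workflow=FakeWorkflow())
    # demo = create_demo(fake_agent, tools_dict={})  # would serve the UI against the stub
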
main.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from typing import *
3
+ from dotenv import load_dotenv
4
+ from transformers import logging
5
+
6
+ from langgraph.checkpoint.memory import MemorySaver
7
+ from langchain_openai import ChatOpenAI
10
+
11
+ from interface import create_demo
12
+ from medrax.agent import *
13
+ from medrax.tools import *
14
+ from medrax.utils import *
15
+
16
+ warnings.filterwarnings("ignore")
17
+ logging.set_verbosity_error()
18
+ _ = load_dotenv()
19
+
20
+
21
+ def initialize_agent(prompt_file, model_dir="/model-weights", temp_dir="temp", device="cuda"):
22
+ prompts = load_prompts_from_file(prompt_file)
23
+ prompt = prompts["MEDICAL_ASSISTANT"]
24
+
25
+ tools_dict = {
26
+ "ChestXRayClassifierTool": ChestXRayClassifierTool(device=device),
27
+ "ChestXRayReportGeneratorTool": ChestXRayReportGeneratorTool(
28
+ cache_dir=model_dir, device=device
29
+ ),
30
+ "ChestXRaySegmentationTool": ChestXRaySegmentationTool(device=device),
31
+ "LlavaMedTool": LlavaMedTool(cache_dir=model_dir, device=device, load_in_8bit=True),
32
+ "XRayVQATool": XRayVQATool(cache_dir=model_dir, device=device),
33
+ "ImageVisualizerTool": ImageVisualizerTool(),
34
+ "XRayPhraseGroundingTool": XRayPhraseGroundingTool(
35
+ cache_dir=model_dir, temp_dir=temp_dir, load_in_8bit=True, device=device
36
+ ),
37
+ "ChestXRayGeneratorTool": ChestXRayGeneratorTool(
38
+ model_path=f"{model_dir}/roentgen", temp_dir=temp_dir, device=device
39
+ ),
40
+ "DicomProcessorTool": DicomProcessorTool(temp_dir=temp_dir),
41
+ }
42
+
43
+ checkpointer = MemorySaver()
44
+ model = ChatOpenAI(model="gpt-4o", temperature=0.7, top_p=0.95)
45
+ agent = Agent(
46
+ model,
47
+ tools=list(tools_dict.values()),
48
+ log_tools=True,
49
+ log_dir="logs",
50
+ system_prompt=prompt,
51
+ checkpointer=checkpointer,
52
+ )
53
+
54
+ print("Agent initialized")
55
+ return agent, tools_dict
56
+
57
+
58
+ if __name__ == "__main__":
59
+ print("Starting server...")
60
+ agent, tools_dict = initialize_agent("medrax/docs/system_prompts.txt")
61
+ demo = create_demo(agent, tools_dict)
62
+
63
+ demo.launch(server_name="0.0.0.0", server_port=8585, share=True)
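
Note on setup: initialize_agent assumes local model weights under /model-weights and a CUDA device. For a lightweight smoke test, a reduced configuration that keeps only the tools without heavy weights is one option; this is a hedged sketch that assumes ImageVisualizerTool, DicomProcessorTool, and load_prompts_from_file are exported as used above:

    # Sketch: a reduced agent with only the lightweight tools (no GPU models).
    from langgraph.checkpoint.memory import MemorySaver
    from langchain_openai import ChatOpenAI

    from medrax.agent import Agent
    from medrax.tools import ImageVisualizerTool, DicomProcessorTool
    from medrax.utils import load_prompts_from_file

    prompts = load_prompts_from_file("medrax/docs/system_prompts.txt")
    tools = [ImageVisualizerTool(), DicomProcessorTool(temp_dir="temp")]

    agent = Agent(
        ChatOpenAI(model="gpt-4o", temperature=0.7),
        tools=tools,
        system_prompt=prompts["MEDICAL_ASSISTANT"],
        checkpointer=MemorySaver(),
    )
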
medrax/__init__.py ADDED
File without changes
medrax/agent/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .agent import AgentState, Agent
medrax/agent/agent.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import operator
3
+ from pathlib import Path
4
+ from dotenv import load_dotenv
5
+ from datetime import datetime
6
+ from typing import List, Dict, Any, TypedDict, Annotated, Optional
7
+
8
+ from langgraph.graph import StateGraph, END
9
+ from langchain_core.messages import AnyMessage, SystemMessage, ToolMessage
10
+ from langchain_core.language_models import BaseLanguageModel
11
+ from langchain_core.tools import BaseTool
12
+
13
+ _ = load_dotenv()
14
+
15
+
16
+ class ToolCallLog(TypedDict):
17
+ """
18
+ A TypedDict representing a log entry for a tool call.
19
+
20
+ Attributes:
21
+ timestamp (str): The timestamp of when the tool call was made.
22
+ tool_call_id (str): The unique identifier for the tool call.
23
+ name (str): The name of the tool that was called.
24
+ args (Any): The arguments passed to the tool.
25
+ content (str): The content or result of the tool call.
26
+ """
27
+
28
+ timestamp: str
29
+ tool_call_id: str
30
+ name: str
31
+ args: Any
32
+ content: str
33
+
34
+
35
+ class AgentState(TypedDict):
36
+ """
37
+ A TypedDict representing the state of an agent.
38
+
39
+ Attributes:
40
+ messages (Annotated[List[AnyMessage], operator.add]): A list of messages
41
+ representing the conversation history. The operator.add annotation
42
+ indicates that new messages should be appended to this list.
43
+ """
44
+
45
+ messages: Annotated[List[AnyMessage], operator.add]
46
+
47
+
48
+ class Agent:
49
+ """
50
+ A class representing an agent that processes requests and executes tools based on
51
+ language model responses.
52
+
53
+ Attributes:
54
+ model (BaseLanguageModel): The language model used for processing.
55
+ tools (Dict[str, BaseTool]): A dictionary of available tools.
56
+ checkpointer (Any): Manages and persists the agent's state.
57
+ system_prompt (str): The system instructions for the agent.
58
+ workflow (StateGraph): The compiled workflow for the agent's processing.
59
+ log_tools (bool): Whether to log tool calls.
60
+ log_path (Path): Path to save tool call logs.
61
+ """
62
+
63
+ def __init__(
64
+ self,
65
+ model: BaseLanguageModel,
66
+ tools: List[BaseTool],
67
+ checkpointer: Any = None,
68
+ system_prompt: str = "",
69
+ log_tools: bool = True,
70
+ log_dir: Optional[str] = "logs",
71
+ ):
72
+ """
73
+ Initialize the Agent.
74
+
75
+ Args:
76
+ model (BaseLanguageModel): The language model to use.
77
+ tools (List[BaseTool]): A list of available tools.
78
+ checkpointer (Any, optional): State persistence manager. Defaults to None.
79
+ system_prompt (str, optional): System instructions. Defaults to "".
80
+ log_tools (bool, optional): Whether to log tool calls. Defaults to True.
81
+ log_dir (str, optional): Directory to save logs. Defaults to 'logs'.
82
+ """
83
+ self.system_prompt = system_prompt
84
+ self.log_tools = log_tools
85
+
86
+ if self.log_tools:
87
+ self.log_path = Path(log_dir or "logs")
88
+ self.log_path.mkdir(exist_ok=True)
89
+
90
+ # Define the agent workflow
91
+ workflow = StateGraph(AgentState)
92
+ workflow.add_node("process", self.process_request)
93
+ workflow.add_node("execute", self.execute_tools)
94
+ workflow.add_conditional_edges(
95
+ "process", self.has_tool_calls, {True: "execute", False: END}
96
+ )
97
+ workflow.add_edge("execute", "process")
98
+ workflow.set_entry_point("process")
99
+
100
+ self.workflow = workflow.compile(checkpointer=checkpointer)
101
+ self.tools = {t.name: t for t in tools}
102
+ self.model = model.bind_tools(tools)
103
+
104
+ def process_request(self, state: AgentState) -> Dict[str, List[AnyMessage]]:
105
+ """
106
+ Process the request using the language model.
107
+
108
+ Args:
109
+ state (AgentState): The current state of the agent.
110
+
111
+ Returns:
112
+ Dict[str, List[AnyMessage]]: A dictionary containing the model's response.
113
+ """
114
+ messages = state["messages"]
115
+ if self.system_prompt:
116
+ messages = [SystemMessage(content=self.system_prompt)] + messages
117
+ response = self.model.invoke(messages)
118
+ return {"messages": [response]}
119
+
120
+ def has_tool_calls(self, state: AgentState) -> bool:
121
+ """
122
+ Check if the response contains any tool calls.
123
+
124
+ Args:
125
+ state (AgentState): The current state of the agent.
126
+
127
+ Returns:
128
+ bool: True if tool calls exist, False otherwise.
129
+ """
130
+ response = state["messages"][-1]
131
+ return len(response.tool_calls) > 0
132
+
133
+ def execute_tools(self, state: AgentState) -> Dict[str, List[ToolMessage]]:
134
+ """
135
+ Execute tool calls from the model's response.
136
+
137
+ Args:
138
+ state (AgentState): The current state of the agent.
139
+
140
+ Returns:
141
+ Dict[str, List[ToolMessage]]: A dictionary containing tool execution results.
142
+ """
143
+ tool_calls = state["messages"][-1].tool_calls
144
+ results = []
145
+
146
+ for call in tool_calls:
147
+ print(f"Executing tool: {call}")
148
+ if call["name"] not in self.tools:
149
+ print("\n....invalid tool....")
150
+ result = "invalid tool, please retry"
151
+ else:
152
+ result = self.tools[call["name"]].invoke(call["args"])
153
+
154
+ results.append(
155
+ ToolMessage(
156
+ tool_call_id=call["id"],
157
+ name=call["name"],
158
+ args=call["args"],
159
+ content=str(result),
160
+ )
161
+ )
162
+
163
+ self._save_tool_calls(results)
164
+ print("Returning to model processing!")
165
+
166
+ return {"messages": results}
167
+
168
+ def _save_tool_calls(self, tool_calls: List[ToolMessage]) -> None:
169
+ """
170
+ Save tool calls to a JSON file with timestamp-based naming.
171
+
172
+ Args:
173
+ tool_calls (List[ToolMessage]): List of tool calls to save.
174
+ """
175
+ if not self.log_tools:
176
+ return
177
+
178
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
179
+ filename = self.log_path / f"tool_calls_{timestamp}.json"
180
+
181
+ logs: List[ToolCallLog] = []
182
+ for call in tool_calls:
183
+ log_entry = {
184
+ "tool_call_id": call.tool_call_id,
185
+ "name": call.name,
186
+ "args": call.args,
187
+ "content": call.content,
188
+ "timestamp": datetime.now().isoformat(),
189
+ }
190
+ logs.append(log_entry)
191
+
192
+ with open(filename, "w") as f:
193
+ json.dump(logs, f, indent=4)
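
Note on direct use: outside the Gradio app, the compiled workflow can be invoked without the UI. A minimal sketch, assuming an OpenAI key is configured in the environment; the ping tool is purely illustrative and exists only to exercise the execute node:

    # Sketch: driving the agent workflow directly (ping tool is hypothetical).
    from langchain_core.messages import HumanMessage
    from langchain_core.tools import tool
    from langchain_openai import ChatOpenAI
    from langgraph.checkpoint.memory import MemorySaver

    from medrax.agent import Agent

    @tool
    def ping(text: str) -> str:
        """Echo the input back; a stand-in for a real MedRAX tool."""
        return f"pong: {text}"

    agent = Agent(ChatOpenAI(model="gpt-4o"), tools=[ping], checkpointer=MemorySaver())
    config = {"configurable": {"thread_id": "demo-thread"}}  # required by the checkpointer
    result = agent.workflow.invoke(
        {"messages": [HumanMessage(content="Call the ping tool with 'hello'.")]}, config
    )
    print(result["messages"][-1].content)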