VictorLJZ committed on
Commit
e4e9fae
·
2 Parent(s): fd330d9 373615b

Merge branch 'main' into tool-changes

Files changed (48)
  1. api.py +342 -0
  2. benchmarking/llm_providers/medrax_provider.py +2 -1
  3. interface.py +46 -56
  4. main.py +374 -62
  5. medrax/agent/agent.py +13 -106
  6. medrax/docs/system_prompts.txt +3 -1
  7. medrax/llava/conversation.py +1 -3
  8. medrax/llava/eval/eval_multimodal_chat_gpt_score.py +3 -6
  9. medrax/llava/eval/llm.py +8 -23
  10. medrax/llava/eval/model_vqa.py +2 -8
  11. medrax/llava/eval/summarize_gpt_review.py +3 -7
  12. medrax/llava/mm_utils.py +4 -14
  13. medrax/llava/model/builder.py +4 -12
  14. medrax/llava/model/language_model/llava_mistral.py +1 -3
  15. medrax/llava/model/llava_arch.py +13 -39
  16. medrax/llava/model/multimodal_encoder/builder.py +2 -8
  17. medrax/llava/model/multimodal_projector/builder.py +1 -3
  18. medrax/llava/serve/cli.py +1 -3
  19. medrax/llava/serve/controller.py +3 -6
  20. medrax/llava/serve/gradio_web_server.py +4 -12
  21. medrax/llava/serve/model_worker.py +6 -14
  22. medrax/llava/serve/test_message.py +2 -6
  23. medrax/llava/utils.py +1 -3
  24. medrax/models/model_factory.py +5 -12
  25. medrax/rag/rag.py +3 -9
  26. medrax/tools/browsing/__init__.py +3 -3
  27. medrax/tools/browsing/duckduckgo.py +12 -33
  28. medrax/tools/browsing/web_browser.py +3 -9
  29. medrax/tools/classification/__init__.py +1 -6
  30. medrax/tools/classification/arcplus.py +5 -17
  31. medrax/tools/classification/torchxrayvision.py +1 -3
  32. medrax/tools/dicom.py +1 -3
  33. medrax/tools/grounding.py +4 -13
  34. medrax/tools/rag.py +1 -1
  35. medrax/tools/report_generation.py +4 -14
  36. medrax/tools/segmentation/__init__.py +1 -7
  37. medrax/tools/segmentation/medsam2.py +69 -79
  38. medrax/tools/segmentation/segmentation.py +10 -30
  39. medrax/tools/utils.py +5 -15
  40. medrax/tools/vqa/__init__.py +4 -4
  41. medrax/tools/vqa/llava_med.py +4 -12
  42. medrax/tools/vqa/medgemma/medgemma.py +51 -11
  43. medrax/tools/vqa/medgemma/medgemma_client.py +12 -4
  44. medrax/tools/vqa/medgemma/medgemma_requirements_standard.txt +1 -1
  45. medrax/tools/vqa/medgemma/medgemma_setup.py +91 -4
  46. medrax/tools/vqa/xray_vqa.py +6 -12
  47. medrax/tools/xray_generation.py +12 -23
  48. pyproject.toml +11 -9
api.py ADDED
@@ -0,0 +1,342 @@
1
+ """
2
+ MedRAX API Module
3
+
4
+ This module provides a FastAPI-based REST API for the MedRAX medical imaging AI assistant.
5
+ It offers endpoints for processing medical images with text queries using the same agent
6
+ architecture as the Gradio interface.
7
+
8
+ The API supports:
9
+ - Text-only queries
10
+ - Single or multiple image inputs
11
+ - Optional custom system prompts
12
+ - Automatic thread management for each request
13
+ - Tool execution and result aggregation
14
+ """
15
+
16
+ import uuid
17
+ import base64
18
+ from pathlib import Path
19
+ from typing import List, Optional, Dict, Any
20
+ import re
21
+ import time
22
+
23
+ from fastapi import FastAPI, HTTPException, UploadFile, File, Form
24
+ from fastapi.middleware.cors import CORSMiddleware
25
+ from pydantic import BaseModel, Field
26
+ from langchain_core.messages import AIMessage, ToolMessage
27
+
28
+ # Import MedRAX components
29
+ from medrax.agent import Agent
30
+
31
+
32
+ class QueryRequest(BaseModel):
33
+ """
34
+ Request model for text-only queries.
35
+
36
+ Attributes:
37
+ question (str): The question or query to ask the agent
38
+ system_prompt (Optional[str]): Custom system prompt to override default
39
+ thread_id (Optional[str]): Optional thread ID for conversation continuity
40
+ """
41
+
42
+ question: str = Field(..., description="The question or query to ask the agent")
43
+ system_prompt: Optional[str] = Field(None, description="Custom system prompt to override default")
44
+ thread_id: Optional[str] = Field(None, description="Optional thread ID for conversation continuity")
45
+
46
+
47
+ class QueryResponse(BaseModel):
48
+ """
49
+ Response model for API queries.
50
+
51
+ Attributes:
52
+ response (str): The agent's text response
53
+ thread_id (str): The thread ID used for this conversation
54
+ tools_used (List[str]): List of tools that were executed
55
+ processing_time (float): Time taken to process the request in seconds
56
+ """
57
+
58
+ response: str = Field(..., description="The agent's text response")
59
+ thread_id: str = Field(..., description="The thread ID used for this conversation")
60
+ tools_used: List[str] = Field(..., description="List of tools that were executed")
61
+ processing_time: float = Field(..., description="Time taken to process the request in seconds")
62
+
63
+
64
+ class MedRAXAPI:
65
+ """
66
+ FastAPI application wrapper for the MedRAX agent.
67
+
68
+ This class provides a clean interface for creating and managing the API endpoints
69
+ while maintaining separation of concerns from the core agent functionality.
70
+ """
71
+
72
+ def __init__(self, agent: Agent, tools_dict: Dict[str, Any], temp_dir: str = "temp_api"):
73
+ """
74
+ Initialize the MedRAX API.
75
+
76
+ Args:
77
+ agent (Agent): The initialized MedRAX agent
78
+ tools_dict (Dict[str, Any]): Dictionary of available tools
79
+ temp_dir (str): Directory for temporary file storage
80
+ """
81
+ self.agent = agent
82
+ self.tools_dict = tools_dict
83
+ self.temp_dir = Path(temp_dir)
84
+ self.temp_dir.mkdir(exist_ok=True)
85
+
86
+ # Create FastAPI app
87
+ self.app = FastAPI(
88
+ title="MedRAX API",
89
+ description="Medical Reasoning Agent for Chest X-ray Analysis",
90
+ version="2.0.0",
91
+ docs_url="/docs",
92
+ redoc_url="/redoc",
93
+ )
94
+
95
+ # Add CORS middleware
96
+ self.app.add_middleware(
97
+ CORSMiddleware,
98
+ allow_origins=["*"],
99
+ allow_credentials=True,
100
+ allow_methods=["*"],
101
+ allow_headers=["*"],
102
+ )
103
+
104
+ # Register routes
105
+ self._register_routes()
106
+
107
+ def _register_routes(self):
108
+ """Register all API routes."""
109
+
110
+ @self.app.get("/health")
111
+ async def health_check():
112
+ """Health check endpoint."""
113
+ return {"status": "healthy", "service": "MedRAX API"}
114
+
115
+ @self.app.get("/tools")
116
+ async def list_tools():
117
+ """List available tools."""
118
+ return {"available_tools": list(self.tools_dict.keys()), "total_count": len(self.tools_dict)}
119
+
120
+ @self.app.post("/query", response_model=QueryResponse)
121
+ async def query_text_only(request: QueryRequest):
122
+ """
123
+ Process a text-only query without images.
124
+
125
+ Args:
126
+ request (QueryRequest): The query request
127
+
128
+ Returns:
129
+ QueryResponse: The agent's response
130
+ """
131
+ return await self._process_query(
132
+ question=request.question, system_prompt=request.system_prompt, thread_id=request.thread_id, images=None
133
+ )
134
+
135
+ @self.app.post("/query-with-images", response_model=QueryResponse)
136
+ async def query_with_images(
137
+ question: str = Form(..., description="The question or query to ask the agent"),
138
+ system_prompt: Optional[str] = Form(None, description="Custom system prompt to override default"),
139
+ thread_id: Optional[str] = Form(None, description="Optional thread ID for conversation continuity"),
140
+ images: List[UploadFile] = File(..., description="One or more medical images to analyze"),
141
+ ):
142
+ """
143
+ Process a query with one or more images.
144
+
145
+ Args:
146
+ question (str): The question or query to ask the agent
147
+ system_prompt (Optional[str]): Custom system prompt to override default
148
+ thread_id (Optional[str]): Optional thread ID for conversation continuity
149
+ images (List[UploadFile]): List of uploaded image files
150
+
151
+ Returns:
152
+ QueryResponse: The agent's response
153
+ """
154
+ # Validate image files
155
+ if not images or len(images) == 0:
156
+ raise HTTPException(status_code=400, detail="At least one image is required")
157
+
158
+ # Validate file types
159
+ allowed_types = {"image/jpeg", "image/jpg", "image/png", "image/bmp", "image/tiff", "application/dicom"}
160
+ for image in images:
161
+ if image.content_type not in allowed_types:
162
+ raise HTTPException(
163
+ status_code=400,
164
+ detail=f"Unsupported file type: {image.content_type}. Allowed types: {allowed_types}",
165
+ )
166
+
167
+ return await self._process_query(
168
+ question=question, system_prompt=system_prompt, thread_id=thread_id, images=images
169
+ )
170
+
171
+ async def _process_query(
172
+ self,
173
+ question: str,
174
+ system_prompt: Optional[str] = None,
175
+ thread_id: Optional[str] = None,
176
+ images: Optional[List[UploadFile]] = None,
177
+ ) -> QueryResponse:
178
+ """
179
+ Internal method to process queries through the agent.
180
+
181
+ Args:
182
+ question (str): The question to ask
183
+ system_prompt (Optional[str]): Custom system prompt
184
+ thread_id (Optional[str]): Thread ID for conversation
185
+ images (Optional[List[UploadFile]]): List of images
186
+
187
+ Returns:
188
+ QueryResponse: The processed response
189
+ """
190
+ start_time = time.time()
191
+
192
+ # Generate thread ID if not provided
193
+ if not thread_id:
194
+ thread_id = str(uuid.uuid4())
195
+
196
+ try:
197
+ # Prepare messages
198
+ messages = []
199
+ image_paths = []
200
+
201
+ # Handle image uploads
202
+ if images:
203
+ for i, image in enumerate(images):
204
+ # Save uploaded file temporarily
205
+ temp_path = self.temp_dir / f"{thread_id}_{i}_{image.filename}"
206
+
207
+ with open(temp_path, "wb") as buffer:
208
+ content = await image.read()
209
+ buffer.write(content)
210
+
211
+ image_paths.append(str(temp_path))
212
+
213
+ # Add image path for tools
214
+ messages.append({"role": "user", "content": f"image_path: {temp_path}"})
215
+
216
+ # Add base64 encoded image for multimodal processing
217
+ image_base64 = base64.b64encode(content).decode("utf-8")
218
+
219
+ # Determine MIME type
220
+ mime_type = "image/jpeg" # Default
221
+ if image.content_type:
222
+ mime_type = image.content_type
223
+ elif temp_path.suffix.lower() in [".png"]:
224
+ mime_type = "image/png"
225
+
226
+ messages.append(
227
+ {
228
+ "role": "user",
229
+ "content": [
230
+ {
231
+ "type": "image_url",
232
+ "image_url": {"url": f"data:{mime_type};base64,{image_base64}"},
233
+ }
234
+ ],
235
+ }
236
+ )
237
+
238
+ # Add text question
239
+ messages.append({"role": "user", "content": [{"type": "text", "text": question}]})
240
+
241
+ # Process through agent workflow
242
+ response_text = ""
243
+ tools_used = []
244
+
245
+ # Temporarily update system prompt if provided
246
+ original_prompt = None
247
+ if system_prompt:
248
+ original_prompt = self.agent.system_prompt
249
+ self.agent.system_prompt = system_prompt
250
+
251
+ try:
252
+ async for chunk in self._stream_agent_response(messages, thread_id):
253
+ if chunk.get("type") == "text":
254
+ response_text += chunk.get("content", "")
255
+ elif chunk.get("type") == "tool":
256
+ tools_used.append(chunk.get("tool_name", ""))
257
+ finally:
258
+ # Restore original system prompt
259
+ if original_prompt is not None:
260
+ self.agent.system_prompt = original_prompt
261
+
262
+ # Clean up temporary files
263
+ for image_path in image_paths:
264
+ try:
265
+ Path(image_path).unlink(missing_ok=True)
266
+ except Exception:
267
+ pass # Ignore cleanup errors
268
+
269
+ processing_time = time.time() - start_time
270
+
271
+ return QueryResponse(
272
+ response=response_text.strip(),
273
+ thread_id=thread_id,
274
+ tools_used=list(set(tools_used)), # Remove duplicates
275
+ processing_time=processing_time,
276
+ )
277
+
278
+ except Exception as e:
279
+ # Clean up on error
280
+ for image_path in image_paths:
281
+ try:
282
+ Path(image_path).unlink(missing_ok=True)
283
+ except Exception:
284
+ pass
285
+
286
+ raise HTTPException(status_code=500, detail=f"Error processing query: {str(e)}")
287
+
288
+ async def _stream_agent_response(self, messages: List[Dict], thread_id: str):
289
+ """
290
+ Stream responses from the agent workflow.
291
+
292
+ Args:
293
+ messages (List[Dict]): Messages to process
294
+ thread_id (str): Thread ID for the conversation
295
+
296
+ Yields:
297
+ Dict: Response chunks with type and content
298
+ """
299
+ try:
300
+ for chunk in self.agent.workflow.stream(
301
+ {"messages": messages},
302
+ {"configurable": {"thread_id": thread_id}},
303
+ stream_mode="updates",
304
+ ):
305
+ if not isinstance(chunk, dict):
306
+ continue
307
+
308
+ for node_name, node_output in chunk.items():
309
+ if "messages" not in node_output:
310
+ continue
311
+
312
+ for msg in node_output["messages"]:
313
+ if isinstance(msg, AIMessage) and msg.content:
314
+ # Clean up temp paths from response
315
+ clean_content = re.sub(r"temp[^\s]*", "", msg.content).strip()
316
+ if clean_content:
317
+ yield {"type": "text", "content": clean_content}
318
+
319
+ elif isinstance(msg, ToolMessage):
320
+ # Extract tool name from the message
321
+ tool_call_id = msg.tool_call_id
322
+ # We'll track tool usage but not include detailed output in API response
323
+ yield {"type": "tool", "tool_name": "tool_executed"}
324
+
325
+ except Exception as e:
326
+ yield {"type": "error", "content": str(e)}
327
+
328
+
329
+ def create_api(agent: Agent, tools_dict: Dict[str, Any], temp_dir: str = "temp_api") -> FastAPI:
330
+ """
331
+ Create and configure the MedRAX FastAPI application.
332
+
333
+ Args:
334
+ agent (Agent): The initialized MedRAX agent
335
+ tools_dict (Dict[str, Any]): Dictionary of available tools
336
+ temp_dir (str): Directory for temporary file storage
337
+
338
+ Returns:
339
+ FastAPI: Configured FastAPI application
340
+ """
341
+ api = MedRAXAPI(agent, tools_dict, temp_dir)
342
+ return api.app
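
For orientation, a minimal client sketch for the two new endpoints (illustrative only, not part of this commit). It assumes the API server is reachable at http://localhost:8000 (the --api-port default added in main.py), that the requests package is installed, and that example_cxr.png is a local file; adjust these to your deployment.

import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# Text-only query against /query (QueryRequest model)
resp = requests.post(
    f"{BASE_URL}/query",
    json={"question": "What findings would suggest a pneumothorax on a chest X-ray?"},
)
resp.raise_for_status()
body = resp.json()
print(body["response"], body["thread_id"], body["tools_used"])

# Query with an image against /query-with-images (multipart form data)
with open("example_cxr.png", "rb") as f:  # hypothetical local file
    resp = requests.post(
        f"{BASE_URL}/query-with-images",
        data={"question": "Is there cardiomegaly in this image?"},
        files=[("images", ("example_cxr.png", f, "image/png"))],
    )
print(resp.json()["response"])

Reusing the returned thread_id in a follow-up request keeps the conversation on the same checkpointer thread.
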
benchmarking/llm_providers/medrax_provider.py CHANGED
@@ -1,5 +1,6 @@
1
  """MedRAX LLM provider implementation."""
2
 
 
3
  import time
4
  import re
5
  import uuid
@@ -68,7 +69,7 @@ class MedRAXProvider(LLMProvider):
68
  tools_to_use=selected_tools,
69
  model_dir="/home/lijunzh3/scratch/MedRAX2/model-weights",
70
  temp_dir="temp", # Change this to the path of the temporary directory
71
- device="cuda:0",
72
  model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
73
  temperature=self.temperature,
74
  top_p=self.top_p,
 
1
  """MedRAX LLM provider implementation."""
2
 
3
+ import os
4
  import time
5
  import re
6
  import uuid
 
69
  tools_to_use=selected_tools,
70
  model_dir="/home/lijunzh3/scratch/MedRAX2/model-weights",
71
  temp_dir="temp", # Change this to the path of the temporary directory
72
+ device=os.getenv("MEDRAX_DEVICE", "cuda:0"),
73
  model=self.model_name, # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
74
  temperature=self.temperature,
75
  top_p=self.top_p,
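
The provider now reads its device from the environment instead of hard-coding cuda:0, so the same script can target a different GPU (or the CPU) per job. A small sketch of the pattern, with an assumed value set from a job script or .env file:

import os

# Set MEDRAX_DEVICE before launching to retarget the provider; os.getenv
# falls back to cuda:0 when the variable is unset, matching the change above.
os.environ.setdefault("MEDRAX_DEVICE", "cuda:1")  # assumed example value
device = os.getenv("MEDRAX_DEVICE", "cuda:0")
print(f"MedRAX provider device: {device}")
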
interface.py CHANGED
@@ -29,7 +29,7 @@ class ChatInterface:
29
  """
30
  self.agent = agent
31
  self.tools_dict = tools_dict
32
- self.upload_dir = Path("temp")
33
  self.upload_dir.mkdir(exist_ok=True)
34
  self.current_thread_id = None
35
  # Separate storage for original and display paths
@@ -68,9 +68,7 @@ class ChatInterface:
68
 
69
  return self.display_file_path
70
 
71
- def add_message(
72
- self, message: str, display_image: str, history: List[dict]
73
- ) -> Tuple[List[dict], gr.Textbox]:
74
  """
75
  Add a new message to the chat history.
76
 
@@ -155,9 +153,7 @@ class ChatInterface:
155
  if isinstance(msg, AIMessageChunk) and msg.content:
156
  accumulated_content += msg.content
157
  if final_message is None:
158
- final_message = ChatMessage(
159
- role="assistant", content=accumulated_content
160
- )
161
  chat_history.append(final_message)
162
  else:
163
  final_message.content = accumulated_content
@@ -169,9 +165,7 @@ class ChatInterface:
169
  if final_message:
170
  final_message.content = final_content
171
  else:
172
- chat_history.append(
173
- ChatMessage(role="assistant", content=final_content)
174
- )
175
  yield chat_history, self.display_file_path, ""
176
 
177
  if msg.tool_calls:
@@ -190,21 +184,25 @@ class ChatInterface:
190
  pending_call = self.pending_tool_calls.pop(tool_call_id)
191
  tool_name = pending_call["name"]
192
  tool_args = pending_call["args"]
193
-
194
  try:
195
- # Handle case where tool returns tuple (output, metadata)
196
- content = msg.content
197
- content_tuple = ast.literal_eval(content)
198
- content = json.dumps(content_tuple[0])
199
- tool_output_json = json.loads(content)
200
- tool_output_str = json.dumps(tool_output_json, indent=2)
201
- except (json.JSONDecodeError, TypeError):
202
- tool_output_str = str(msg.content)
 
 
 
 
 
203
 
 
204
  tool_args_str = json.dumps(tool_args, indent=2)
205
-
206
  description = f"**Input:**\n```json\n{tool_args_str}\n```\n\n**Output:**\n```json\n{tool_output_str}\n```"
207
-
208
  metadata = {
209
  "title": f"⚒️ Tool: {tool_name}",
210
  "description": description,
@@ -217,32 +215,33 @@ class ChatInterface:
217
  metadata=metadata,
218
  )
219
  )
220
- yield chat_history, self.display_file_path, ""
221
 
 
222
  if tool_name == "image_visualizer":
 
223
  try:
224
- # Handle case where tool returns tuple (output, metadata)
225
- content = msg.content
226
- content_tuple = ast.literal_eval(content)
227
- result = content_tuple[0]
228
-
229
- if isinstance(result, dict) and "image_path" in result:
230
- self.display_file_path = result["image_path"]
231
- chat_history.append(
232
- ChatMessage(
233
- role="assistant",
234
- content={"path": self.display_file_path},
235
- )
 
236
  )
237
- yield chat_history, self.display_file_path, ""
238
- except (json.JSONDecodeError, TypeError):
239
- pass
 
240
 
241
  except Exception as e:
242
  chat_history.append(
243
- ChatMessage(
244
- role="assistant", content=f"❌ Error: {str(e)}", metadata={"title": "Error"}
245
- )
246
  )
247
  yield chat_history, self.display_file_path, ""
248
 
@@ -293,9 +292,7 @@ def create_demo(agent, tools_dict):
293
  )
294
 
295
  with gr.Column(scale=3):
296
- image_display = gr.Image(
297
- label="Image", type="filepath", height=600, container=True
298
- )
299
  with gr.Row():
300
  upload_button = gr.UploadButton(
301
  "📎 Upload X-Ray",
@@ -306,25 +303,19 @@ def create_demo(agent, tools_dict):
306
  file_types=["file"],
307
  )
308
  with gr.Row():
309
- clear_btn = gr.Button("Clear Chat")
310
- new_thread_btn = gr.Button("New Thread")
311
 
312
  # Event handlers
313
- def clear_chat():
314
  interface.original_file_path = None
315
  interface.display_file_path = None
316
- return [], None
317
-
318
- def new_thread():
319
  interface.current_thread_id = str(time.time())
320
- return [], interface.display_file_path
321
 
322
  def handle_file_upload(file):
323
  return interface.handle_upload(file.name)
324
 
325
- chat_msg = txt.submit(
326
- interface.add_message, inputs=[txt, image_display, chatbot], outputs=[chatbot, txt]
327
- )
328
  bot_msg = chat_msg.then(
329
  interface.process_message,
330
  inputs=[txt, image_display, chatbot],
@@ -336,7 +327,6 @@ def create_demo(agent, tools_dict):
336
 
337
  dicom_upload.upload(handle_file_upload, inputs=dicom_upload, outputs=image_display)
338
 
339
- clear_btn.click(clear_chat, outputs=[chatbot, image_display])
340
- new_thread_btn.click(new_thread, outputs=[chatbot, image_display])
341
 
342
- return demo
 
29
  """
30
  self.agent = agent
31
  self.tools_dict = tools_dict
32
+ self.upload_dir = Path(f"temp/{time.time()}")
33
  self.upload_dir.mkdir(exist_ok=True)
34
  self.current_thread_id = None
35
  # Separate storage for original and display paths
 
68
 
69
  return self.display_file_path
70
 
71
+ def add_message(self, message: str, display_image: str, history: List[dict]) -> Tuple[List[dict], gr.Textbox]:
 
 
72
  """
73
  Add a new message to the chat history.
74
 
 
153
  if isinstance(msg, AIMessageChunk) and msg.content:
154
  accumulated_content += msg.content
155
  if final_message is None:
156
+ final_message = ChatMessage(role="assistant", content=accumulated_content)
 
 
157
  chat_history.append(final_message)
158
  else:
159
  final_message.content = accumulated_content
 
165
  if final_message:
166
  final_message.content = final_content
167
  else:
168
+ chat_history.append(ChatMessage(role="assistant", content=final_content))
 
 
169
  yield chat_history, self.display_file_path, ""
170
 
171
  if msg.tool_calls:
 
184
  pending_call = self.pending_tool_calls.pop(tool_call_id)
185
  tool_name = pending_call["name"]
186
  tool_args = pending_call["args"]
187
+ # Parse content
188
  try:
189
+ # Try JSON parsing first
190
+ result = json.loads(msg.content)
191
+ tool_output_str = json.dumps(result, indent=2)
192
+ except json.JSONDecodeError:
193
+ try:
194
+ # Use ast.literal_eval as safe fallback for Python literals
195
+ content_tuple = ast.literal_eval(msg.content)
196
+ result = content_tuple[0]
197
+ tool_output_str = json.dumps(result, indent=2)
198
+ except (ValueError, SyntaxError):
199
+ # Fall back to treating as plain string
200
+ result = msg.content
201
+ tool_output_str = str(msg.content)
202
 
203
+ # Display tool usage card
204
  tool_args_str = json.dumps(tool_args, indent=2)
 
205
  description = f"**Input:**\n```json\n{tool_args_str}\n```\n\n**Output:**\n```json\n{tool_output_str}\n```"
 
206
  metadata = {
207
  "title": f"⚒️ Tool: {tool_name}",
208
  "description": description,
 
215
  metadata=metadata,
216
  )
217
  )
 
218
 
219
+ # Special handling for image_visualizer
220
  if tool_name == "image_visualizer":
221
+ image_path = None
222
  try:
223
+ image_path = result["image_path"]
224
+ except (TypeError, KeyError):
225
+ try:
226
+ image_path = result[0]["image_path"]
227
+ except (TypeError, KeyError, IndexError):
228
+ pass
229
+
230
+ if image_path:
231
+ self.display_file_path = image_path
232
+ chat_history.append(
233
+ ChatMessage(
234
+ role="assistant",
235
+ content={"path": self.display_file_path},
236
  )
237
+ )
238
+
239
+ # Yield a single update for this tool event
240
+ yield chat_history, self.display_file_path, ""
241
 
242
  except Exception as e:
243
  chat_history.append(
244
+ ChatMessage(role="assistant", content=f"❌ Error: {str(e)}", metadata={"title": "Error"})
 
 
245
  )
246
  yield chat_history, self.display_file_path, ""
247
 
 
292
  )
293
 
294
  with gr.Column(scale=3):
295
+ image_display = gr.Image(label="Image", type="filepath", height=600, container=True)
 
 
296
  with gr.Row():
297
  upload_button = gr.UploadButton(
298
  "📎 Upload X-Ray",
 
303
  file_types=["file"],
304
  )
305
  with gr.Row():
306
+ new_chat_btn = gr.Button("New Chat")
 
307
 
308
  # Event handlers
309
+ def new_chat():
310
  interface.original_file_path = None
311
  interface.display_file_path = None
 
 
 
312
  interface.current_thread_id = str(time.time())
313
+ return [], None
314
 
315
  def handle_file_upload(file):
316
  return interface.handle_upload(file.name)
317
 
318
+ chat_msg = txt.submit(interface.add_message, inputs=[txt, image_display, chatbot], outputs=[chatbot, txt])
 
 
319
  bot_msg = chat_msg.then(
320
  interface.process_message,
321
  inputs=[txt, image_display, chatbot],
 
327
 
328
  dicom_upload.upload(handle_file_upload, inputs=dicom_upload, outputs=image_display)
329
 
330
+ new_chat_btn.click(new_chat, outputs=[chatbot, image_display])
 
331
 
332
+ return demo
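
The tool-output handling above now tries JSON first and only falls back to ast.literal_eval for tools that return a Python (output, metadata) tuple, with plain text as the last resort. A standalone sketch of that parsing order, using assumed sample inputs:

import ast
import json

def render_tool_output(content: str) -> str:
    # Well-formed JSON: pretty-print it directly.
    try:
        return json.dumps(json.loads(content), indent=2)
    except json.JSONDecodeError:
        pass
    # Python literal, e.g. an (output, metadata) tuple returned by a tool.
    try:
        parsed = ast.literal_eval(content)
        if isinstance(parsed, tuple):
            parsed = parsed[0]
        return json.dumps(parsed, indent=2, default=str)
    except (ValueError, SyntaxError):
        # Anything else is shown as plain text.
        return str(content)

print(render_tool_output('{"finding": "effusion"}'))
print(render_tool_output("({'image_path': 'temp/seg.png'}, {'elapsed': 1.2})"))
print(render_tool_output("no structured output"))
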
main.py CHANGED
@@ -11,6 +11,10 @@ with different model weights, tools, and parameters.
11
 
12
  import warnings
13
  import os
 
 
 
 
14
  from typing import Dict, List, Optional, Any
15
  from dotenv import load_dotenv
16
  from transformers import logging
@@ -19,6 +23,7 @@ from langgraph.checkpoint.memory import MemorySaver
19
  from medrax.models import ModelFactory
20
 
21
  from interface import create_demo
 
22
  from medrax.agent import *
23
  from medrax.tools import *
24
  from medrax.utils import *
@@ -31,19 +36,93 @@ logging.set_verbosity_error()
31
  _ = load_dotenv()
32
 
33
 
34
  def initialize_agent(
35
  prompt_file: str,
36
  tools_to_use: Optional[List[str]] = None,
37
- model_dir: str = "model-weights",
38
  temp_dir: str = "temp",
39
- device: str = "cpu",
40
- model: str = "gemini-2.5-pro",
41
  temperature: float = 1.0,
42
  top_p: float = 0.95,
43
  max_tokens: int = 5000,
44
  rag_config: Optional[RAGConfig] = None,
45
  model_kwargs: Dict[str, Any] = {},
46
  system_prompt: str = "MEDICAL_ASSISTANT",
 
47
  ):
48
  """Initialize the MedRAX agent with specified tools and configuration.
49
 
@@ -55,7 +134,6 @@ def initialize_agent(
55
  device (str, optional): Device to run models on. Defaults to "cuda".
56
  model (str, optional): Model to use. Defaults to "gpt-4o".
57
  temperature (float, optional): Temperature for the model. Defaults to 0.7.
58
- top_p (float, optional): Top P for the model. Defaults to 0.95.
59
  rag_config (RAGConfig, optional): Configuration for the RAG tool. Defaults to None.
60
  model_kwargs (dict, optional): Additional keyword arguments for model.
61
  system_prompt (str, optional): System prompt to use. Defaults to "MEDICAL_ASSISTANT".
@@ -68,18 +146,13 @@ def initialize_agent(
68
  prompts = load_prompts_from_file(prompt_file)
69
  prompt = prompts[system_prompt]
70
 
71
- # Define the URL of the MedGemma FastAPI service.
72
- MEDGEMMA_API_URL = os.getenv("MEDGEMMA_API_URL", "http://localhost:8002")
73
-
74
  all_tools = {
75
  "TorchXRayVisionClassifierTool": lambda: TorchXRayVisionClassifierTool(device=device),
76
  "ArcPlusClassifierTool": lambda: ArcPlusClassifierTool(cache_dir=model_dir, device=device),
77
  "ChestXRaySegmentationTool": lambda: ChestXRaySegmentationTool(device=device),
78
  "LlavaMedTool": lambda: LlavaMedTool(cache_dir=model_dir, device=device, load_in_8bit=True),
79
  "CheXagentXRayVQATool": lambda: CheXagentXRayVQATool(cache_dir=model_dir, device=device),
80
- "ChestXRayReportGeneratorTool": lambda: ChestXRayReportGeneratorTool(
81
- cache_dir=model_dir, device=device
82
- ),
83
  "XRayPhraseGroundingTool": lambda: XRayPhraseGroundingTool(
84
  cache_dir=model_dir, temp_dir=temp_dir, load_in_8bit=True, device=device
85
  ),
@@ -91,18 +164,21 @@ def initialize_agent(
91
  "MedicalRAGTool": lambda: RAGTool(config=rag_config),
92
  "WebBrowserTool": lambda: WebBrowserTool(),
93
  "DuckDuckGoSearchTool": lambda: DuckDuckGoSearchTool(),
94
- "MedSAM2Tool": lambda: MedSAM2Tool(
95
- device=device, cache_dir=model_dir, temp_dir=temp_dir
 
 
 
 
96
  ),
97
- "MedGemmaVQATool": lambda: MedGemmaAPIClientTool(cache_dir=model_dir, device=device, api_url=MEDGEMMA_API_URL)
98
- }
99
 
100
  # Initialize only selected tools or all if none specified
101
  tools_dict: Dict[str, BaseTool] = {}
102
 
103
  if tools_to_use is None:
104
  tools_to_use = []
105
-
106
  for tool_name in tools_to_use:
107
  if tool_name == "PythonSandboxTool":
108
  try:
@@ -112,7 +188,6 @@ def initialize_agent(
112
  print("Skipping PythonSandboxTool")
113
  if tool_name in all_tools:
114
  tools_dict[tool_name] = all_tools[tool_name]()
115
-
116
 
117
  # Set up checkpointing for conversation state
118
  checkpointer = MemorySaver()
@@ -130,8 +205,6 @@ def initialize_agent(
130
  agent = Agent(
131
  llm,
132
  tools=list(tools_dict.values()),
133
- log_tools=True,
134
- log_dir="logs",
135
  system_prompt=prompt,
136
  checkpointer=checkpointer,
137
  )
@@ -140,50 +213,262 @@ def initialize_agent(
140
  return agent, tools_dict
141
 
142
 
143
  if __name__ == "__main__":
144
  """
145
  This is the main entry point for the MedRAX application.
146
- It initializes the agent with the selected tools and creates the demo.
147
  """
148
- print("Starting server...")
149
-
150
- # Example: initialize with only specific tools
151
- # Here three tools are commented out, you can uncomment them to use them
152
- selected_tools = [
153
- "ImageVisualizerTool", # For displaying images in the UI
154
- # "DicomProcessorTool", # For processing DICOM medical image files
155
- # "ChestXRayGeneratorTool", # For generating synthetic chest X-rays
156
- "ChestXRayReportGeneratorTool", # For generating medical reports from X-rays
157
- "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
158
- "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
159
- "MedGemmaVQATool" # Google MedGemma VQA tool
160
- "XRayVQATool", # For visual question answering on X-rays
161
- # "LlavaMedTool", # For multimodal medical image understanding
162
- "XRayPhraseGroundingTool", # For locating described features in X-rays
163
- "ChestXRaySegmentationTool", # For segmenting anatomical regions in chest X-rays
164
- # "MedSAM2Tool", # For advanced medical image segmentation using MedSAM2
165
- # "WebBrowserTool", # For web browsing and search capabilities
166
- "DuckDuckGoSearchTool", # For privacy-focused web search using DuckDuckGo
167
- # "MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
168
- # "PythonSandboxTool", # Add the Python sandbox tool
169
- ]
170
 
171
  # Setup the MedGemma environment if the MedGemmaVQATool is selected
 
 
172
  if "MedGemmaVQATool" in selected_tools:
173
- setup_medgemma_env()
174
 
175
  # Configure the Retrieval Augmented Generation (RAG) system
176
  # This allows the agent to access and use medical knowledge documents
177
  rag_config = RAGConfig(
178
- model="command-a-03-2025", # Chat model for generating responses
179
- embedding_model="embed-v4.0", # Embedding model for the RAG system
180
- rerank_model="rerank-v3.5", # Reranking model for the RAG system
181
- temperature=0.3,
182
- pinecone_index_name="medrax2", # Name for the Pinecone index
183
- chunk_size=1500,
184
- chunk_overlap=300,
185
- retriever_k=3,
186
- local_docs_dir="rag_docs", # Change this to the path of the documents for RAG
187
  huggingface_datasets=["VictorLJZ/medrax2"], # List of HuggingFace datasets to load
188
  dataset_split="train", # Which split of the datasets to use
189
  )
@@ -192,19 +477,46 @@ if __name__ == "__main__":
192
  model_kwargs = {}
193
 
194
  agent, tools_dict = initialize_agent(
195
- prompt_file="medrax/docs/system_prompts.txt",
196
  tools_to_use=selected_tools,
197
- model_dir="model-weights",
198
- temp_dir="temp", # Change this to the path of the temporary directory
199
- device="cpu",
200
- model="gemini-2.5-pro", # Change this to the model you want to use, e.g. gpt-4.1-2025-04-14, gemini-2.5-pro
201
- temperature=1.0,
202
- top_p=0.95,
203
  model_kwargs=model_kwargs,
204
  rag_config=rag_config,
205
- system_prompt="MEDICAL_ASSISTANT",
 
206
  )
207
 
208
- # Create and launch the web interface
209
- demo = create_demo(agent, tools_dict)
210
- demo.launch(server_name="0.0.0.0", server_port=8585, share=True)
 
11
 
12
  import warnings
13
  import os
14
+ import argparse
15
+ from pyngrok import ngrok
16
+ import threading
17
+ import uvicorn
18
  from typing import Dict, List, Optional, Any
19
  from dotenv import load_dotenv
20
  from transformers import logging
 
23
  from medrax.models import ModelFactory
24
 
25
  from interface import create_demo
26
+ from api import create_api
27
  from medrax.agent import *
28
  from medrax.tools import *
29
  from medrax.utils import *
 
36
  _ = load_dotenv()
37
 
38
 
39
+ def resolve_medgemma_api_url_from_value(value: Optional[str]) -> str:
40
+ """Resolve the MedGemma API base URL using CLI value, env var, and SLURM-aware fallback.
41
+
42
+ Resolution order:
43
+ 1) Explicit provided value (e.g., CLI flag)
44
+ 2) MEDGEMMA_API_URL environment variable
45
+ 3) If on SLURM, require explicit URL (raise)
46
+ 4) Otherwise, default to localhost for single-box setups
47
+ """
48
+ if value:
49
+ return value
50
+
51
+ env_url = os.getenv("MEDGEMMA_API_URL")
52
+ if env_url:
53
+ return env_url
54
+
55
+ if os.getenv("SLURM_JOB_ID") or os.getenv("SLURM_NODEID"):
56
+ raise RuntimeError(
57
+ "MEDGEMMA_API_URL not set and --medgemma-api-url not provided. "
58
+ "On SLURM, the client usually runs on a different node, "
59
+ "so you must point to the server’s reachable IP, e.g. http://<node-ip>:8002"
60
+ )
61
+
62
+ return "http://127.0.0.1:8002"
63
+
64
+
65
+ def resolve_medgemma_api_url(args) -> str:
66
+ """Helper that reads from an argparse Namespace if available."""
67
+ return resolve_medgemma_api_url_from_value(getattr(args, "medgemma_api_url", None))
68
+
69
+
70
+ def resolve_auth_credentials(args) -> Optional[tuple]:
71
+ """Resolve authentication credentials from CLI args or environment variables.
72
+
73
+ Resolution order:
74
+ 1) Explicit --no-auth flag (returns None, no warnings)
75
+ 2) Explicit --auth USERNAME PASSWORD (returns credentials tuple)
76
+ 3) MEDRAX_AUTH_USERNAME and MEDRAX_AUTH_PASSWORD environment variables
77
+ 4) Default to None with warning messages
78
+
79
+ Args:
80
+ args: Parsed command-line arguments
81
+
82
+ Returns:
83
+ Optional[tuple]: (username, password) tuple if auth is enabled, None otherwise
84
+ """
85
+ if args.no_auth:
86
+ print("⚠️ Authentication disabled (public access)")
87
+ return None
88
+
89
+ if args.auth:
90
+ username, password = args.auth
91
+ print(f"✅ Authentication enabled for user: {username}")
92
+ return (username, password)
93
+
94
+ # Try to read from environment variables
95
+ auth_username = os.getenv("MEDRAX_AUTH_USERNAME")
96
+ auth_password = os.getenv("MEDRAX_AUTH_PASSWORD")
97
+
98
+ if auth_username and auth_password:
99
+ print(f"✅ Authentication enabled from environment for user: {auth_username}")
100
+ return (auth_username, auth_password)
101
+
102
+ # No auth specified anywhere - default to no auth with warning
103
+ print("⚠️ No authentication configured!")
104
+ print("⚠️ Running without authentication (public access)")
105
+ print("⚠️ To enable auth, either:")
106
+ print(" - Use --auth USERNAME PASSWORD")
107
+ print(" - Set MEDRAX_AUTH_USERNAME and MEDRAX_AUTH_PASSWORD in .env")
108
+ print(" - Or explicitly use --no-auth to suppress this warning")
109
+ return None
110
+
111
+
112
  def initialize_agent(
113
  prompt_file: str,
114
  tools_to_use: Optional[List[str]] = None,
115
+ model_dir: str = "/model-weights",
116
  temp_dir: str = "temp",
117
+ device: str = "cuda",
118
+ model: str = "gpt-4.1",
119
  temperature: float = 1.0,
120
  top_p: float = 0.95,
121
  max_tokens: int = 5000,
122
  rag_config: Optional[RAGConfig] = None,
123
  model_kwargs: Dict[str, Any] = {},
124
  system_prompt: str = "MEDICAL_ASSISTANT",
125
+ medgemma_api_url: Optional[str] = None,
126
  ):
127
  """Initialize the MedRAX agent with specified tools and configuration.
128
 
 
134
  device (str, optional): Device to run models on. Defaults to "cuda".
135
  model (str, optional): Model to use. Defaults to "gpt-4o".
136
  temperature (float, optional): Temperature for the model. Defaults to 0.7.
 
137
  rag_config (RAGConfig, optional): Configuration for the RAG tool. Defaults to None.
138
  model_kwargs (dict, optional): Additional keyword arguments for model.
139
  system_prompt (str, optional): System prompt to use. Defaults to "MEDICAL_ASSISTANT".
 
146
  prompts = load_prompts_from_file(prompt_file)
147
  prompt = prompts[system_prompt]
148
 
 
 
 
149
  all_tools = {
150
  "TorchXRayVisionClassifierTool": lambda: TorchXRayVisionClassifierTool(device=device),
151
  "ArcPlusClassifierTool": lambda: ArcPlusClassifierTool(cache_dir=model_dir, device=device),
152
  "ChestXRaySegmentationTool": lambda: ChestXRaySegmentationTool(device=device),
153
  "LlavaMedTool": lambda: LlavaMedTool(cache_dir=model_dir, device=device, load_in_8bit=True),
154
  "CheXagentXRayVQATool": lambda: CheXagentXRayVQATool(cache_dir=model_dir, device=device),
155
+ "ChestXRayReportGeneratorTool": lambda: ChestXRayReportGeneratorTool(cache_dir=model_dir, device=device),
 
 
156
  "XRayPhraseGroundingTool": lambda: XRayPhraseGroundingTool(
157
  cache_dir=model_dir, temp_dir=temp_dir, load_in_8bit=True, device=device
158
  ),
 
164
  "MedicalRAGTool": lambda: RAGTool(config=rag_config),
165
  "WebBrowserTool": lambda: WebBrowserTool(),
166
  "DuckDuckGoSearchTool": lambda: DuckDuckGoSearchTool(),
167
+ "MedSAM2Tool": lambda: MedSAM2Tool(device=device, cache_dir=model_dir, temp_dir=temp_dir),
168
+ "MedGemmaVQATool": lambda: MedGemmaAPIClientTool(
169
+ cache_dir=model_dir,
170
+ device=device,
171
+ load_in_8bit=True,
172
+ api_url=resolve_medgemma_api_url_from_value(medgemma_api_url),
173
  ),
174
+ }
 
175
 
176
  # Initialize only selected tools or all if none specified
177
  tools_dict: Dict[str, BaseTool] = {}
178
 
179
  if tools_to_use is None:
180
  tools_to_use = []
181
+
182
  for tool_name in tools_to_use:
183
  if tool_name == "PythonSandboxTool":
184
  try:
 
188
  print("Skipping PythonSandboxTool")
189
  if tool_name in all_tools:
190
  tools_dict[tool_name] = all_tools[tool_name]()
 
191
 
192
  # Set up checkpointing for conversation state
193
  checkpointer = MemorySaver()
 
205
  agent = Agent(
206
  llm,
207
  tools=list(tools_dict.values()),
 
 
208
  system_prompt=prompt,
209
  checkpointer=checkpointer,
210
  )
 
213
  return agent, tools_dict
214
 
215
 
216
+ def run_gradio_interface(agent, tools_dict, host="0.0.0.0", port=8686,
217
+ auth=None, share=False):
218
+ """
219
+ Run the Gradio web interface.
220
+
221
+ Args:
222
+ agent: The initialized MedRAX agent
223
+ tools_dict: Dictionary of available tools
224
+ host (str): Host to bind the server to
225
+ port (int): Port to run the server on
226
+ auth: Authentication credentials (tuple)
227
+ share (bool): Whether to create a shareable public link
228
+ """
229
+ print(f"Starting Gradio interface on {host}:{port}")
230
+
231
+ if auth:
232
+ print(f"🔐 Authentication enabled for user: {auth[0]}")
233
+ else:
234
+ print("⚠️ Running without authentication (public access)")
235
+
236
+ if share:
237
+ print("🌍 Creating shareable public link (expires in 1 week)...")
238
+
239
+ demo = create_demo(agent, tools_dict)
240
+
241
+ # Prepare launch parameters
242
+ launch_kwargs = {
243
+ "server_name": host,
244
+ "server_port": port,
245
+ "share": share
246
+ }
247
+
248
+ if auth:
249
+ launch_kwargs["auth"] = auth
250
+
251
+ demo.launch(**launch_kwargs)
252
+
253
+
254
+ def run_api_server(agent, tools_dict, host="0.0.0.0", port=8585, public=False):
255
+ """
256
+ Run the FastAPI server.
257
+
258
+ Args:
259
+ agent: The initialized MedRAX agent
260
+ tools_dict: Dictionary of available tools
261
+ host (str): Host to bind the server to
262
+ port (int): Port to run the server on
263
+ public (bool): Whether to expose via ngrok tunnel
264
+ """
265
+ print(f"Starting API server on {host}:{port}")
266
+
267
+ if public:
268
+ try:
269
+ public_tunnel = ngrok.connect(port)
270
+ public_url = public_tunnel.public_url
271
+ print(
272
+ f"🌍 Public URL: {public_url}\n🌍 API Documentation: {public_url}/docs\n🌍 Share this URL with your friend!\n{'=' * 60}"
273
+ )
274
+ except ImportError:
275
+ print("⚠️ pyngrok not installed. Install with: pip install pyngrok\nRunning locally only...")
276
+ public = False
277
+ except Exception as e:
278
+ print(f"⚠️ Failed to create public tunnel: {e}\nRunning locally only...")
279
+ public = False
280
+
281
+ app = create_api(agent, tools_dict)
282
+
283
+ try:
284
+ uvicorn.run(app, host=host, port=port)
285
+ finally:
286
+ if public:
287
+ try:
288
+ ngrok.disconnect(public_tunnel.public_url)
289
+ ngrok.kill()
290
+ except:
291
+ pass
292
+
293
+
294
+ def parse_arguments():
295
+ """Parse command line arguments."""
296
+ parser = argparse.ArgumentParser(description="MedRAX - Medical Reasoning Agent for Chest X-ray")
297
+
298
+ # Run mode
299
+ parser.add_argument(
300
+ "--mode",
301
+ choices=["gradio", "api", "both"],
302
+ default="gradio",
303
+ help="Run mode: 'gradio' for web interface, 'api' for REST API, 'both' for both services",
304
+ )
305
+
306
+ # Gradio interface options
307
+ parser.add_argument("--gradio-host", default="0.0.0.0", help="Gradio host address")
308
+ parser.add_argument("--gradio-port", type=int, default=8686, help="Gradio port")
309
+ parser.add_argument("--auth", nargs=2, metavar=("USERNAME", "PASSWORD"),
310
+ default=None,
311
+ help="Enable password authentication with specified username and password")
312
+ parser.add_argument("--no-auth", action="store_true",
313
+ help="Disable authentication (public access)")
314
+ parser.add_argument("--share", action="store_true",
315
+ help="Create a temporary shareable link (expires in 1 week)")
316
+
317
+ # API server options
318
+ parser.add_argument("--api-host", default="0.0.0.0", help="API host address")
319
+ parser.add_argument("--api-port", type=int, default=8000, help="API port")
320
+ parser.add_argument("--public", action="store_true", help="Make API publicly accessible via ngrok tunnel")
321
+
322
+ # Model and system configuration
323
+ parser.add_argument(
324
+ "--model-dir",
325
+ default="/model-weights",
326
+ help="Directory containing model weights (default: uses MODEL_WEIGHTS_DIR env var or '/model-weights')",
327
+ )
328
+ parser.add_argument(
329
+ "--device", default="cuda", help="Device to run models on (default: uses MEDRAX_DEVICE env var or 'cuda:1')"
330
+ )
331
+ parser.add_argument(
332
+ "--model",
333
+ default="gpt-4.1",
334
+ help="Model to use (default: gpt-4.1). Examples: gpt-4.1-2025-04-14, gemini-2.5-pro, gpt-5",
335
+ )
336
+ parser.add_argument("--temperature", type=float, default=1.0, help="Temperature for the model (default: 1.0)")
337
+ parser.add_argument("--temp-dir", default="temp2", help="Directory for temporary files (default: temp2)")
338
+ parser.add_argument(
339
+ "--prompt-file",
340
+ default="medrax/docs/system_prompts.txt",
341
+ help="Path to file containing system prompts (default: medrax/docs/system_prompts.txt)",
342
+ )
343
+ parser.add_argument(
344
+ "--system-prompt", default="MEDICAL_ASSISTANT", help="System prompt to use (default: MEDICAL_ASSISTANT)"
345
+ )
346
+
347
+ # RAG configuration
348
+ parser.add_argument(
349
+ "--rag-model", default="command-a-03-2025", help="Chat model for RAG responses (default: command-a-03-2025)"
350
+ )
351
+ parser.add_argument(
352
+ "--rag-embedding-model", default="embed-v4.0", help="Embedding model for RAG system (default: embed-v4.0)"
353
+ )
354
+ parser.add_argument(
355
+ "--rag-rerank-model", default="rerank-v3.5", help="Reranking model for RAG system (default: rerank-v3.5)"
356
+ )
357
+ parser.add_argument("--rag-temperature", type=float, default=0.3, help="Temperature for RAG model (default: 0.3)")
358
+ parser.add_argument("--pinecone-index", default="medrax2", help="Pinecone index name (default: medrax2)")
359
+ parser.add_argument("--chunk-size", type=int, default=1500, help="RAG chunk size (default: 1500)")
360
+ parser.add_argument("--chunk-overlap", type=int, default=300, help="RAG chunk overlap (default: 300)")
361
+ parser.add_argument("--retriever-k", type=int, default=3, help="Number of documents to retrieve (default: 3)")
362
+ parser.add_argument("--rag-docs-dir", default="rag_docs", help="Directory for RAG documents (default: rag_docs)")
363
+
364
+ # Tools configuration
365
+ parser.add_argument(
366
+ "--tools",
367
+ nargs="*",
368
+ help="Specific tools to enable (if not provided, uses default set). Available tools: "
369
+ + "ImageVisualizerTool, DicomProcessorTool, MedSAM2Tool, ChestXRaySegmentationTool, "
370
+ + "ChestXRayGeneratorTool, TorchXRayVisionClassifierTool, ArcPlusClassifierTool, "
371
+ + "ChestXRayReportGeneratorTool, XRayPhraseGroundingTool, MedGemmaVQATool, "
372
+ + "XRayVQATool, LlavaMedTool, MedicalRAGTool, WebBrowserTool, DuckDuckGoSearchTool, "
373
+ + "PythonSandboxTool",
374
+ )
375
+
376
+ # MedGemma API configuration
377
+ parser.add_argument(
378
+ "--medgemma-api-url",
379
+ default=None,
380
+ help="MedGemma API base URL, e.g. http://127.0.0.1:8002 or http://<node-ip>:8002"
381
+ )
382
+
383
+ return parser.parse_args()
384
+
385
+
386
  if __name__ == "__main__":
387
  """
388
  This is the main entry point for the MedRAX application.
389
+ It initializes the agent with the selected tools and creates the demo/API.
390
  """
391
+ args = parse_arguments()
392
+ print(f"Starting MedRAX in {args.mode} mode...")
393
+
394
+ # Configure tools based on arguments
395
+ if args.tools is not None:
396
+ # Use tools specified via command line
397
+ selected_tools = args.tools
398
+ else:
399
+ # Use default tools selection
400
+ selected_tools = [
401
+ # Image Processing Tools
402
+ "ImageVisualizerTool", # For displaying images in the UI
403
+ # "DicomProcessorTool", # For processing DICOM medical image files
404
+ # Segmentation Tools
405
+ "MedSAM2Tool", # For advanced medical image segmentation using MedSAM2
406
+ "ChestXRaySegmentationTool", # For segmenting anatomical regions in chest X-rays
407
+ # Generation Tools
408
+ # "ChestXRayGeneratorTool", # For generating synthetic chest X-rays
409
+ # Classification Tools
410
+ "TorchXRayVisionClassifierTool", # For classifying chest X-ray images using TorchXRayVision
411
+ "ArcPlusClassifierTool", # For advanced chest X-ray classification using ArcPlus
412
+ # Report Generation Tools
413
+ "ChestXRayReportGeneratorTool", # For generating medical reports from X-rays
414
+ # Grounding Tools
415
+ "XRayPhraseGroundingTool", # For locating described features in X-rays
416
+ # VQA Tools
417
+ # "MedGemmaVQATool", # Google MedGemma VQA tool
418
+ "XRayVQATool", # For visual question answering on X-rays
419
+ # "LlavaMedTool", # For multimodal medical image understanding
420
+ # RAG Tools
421
+ "MedicalRAGTool", # For retrieval-augmented generation with medical knowledge
422
+ # Search Tools
423
+ # "WebBrowserTool", # For web browsing and search capabilities
424
+ "DuckDuckGoSearchTool", # For privacy-focused web search using DuckDuckGo
425
+ # Development Tools
426
+ # "PythonSandboxTool", # Add the Python sandbox tool
427
+ ]
428
+
429
+ # Configure model directory and device
430
+ model_dir = args.model_dir or os.getenv("MODEL_WEIGHTS_DIR", "/model-weights")
431
+ device = args.device or os.getenv("MEDRAX_DEVICE", "cuda:0")
432
+
433
+ print(f"Using model directory: {model_dir}")
434
+ print(f"Using device: {device}")
435
+ print(f"Using model: {args.model}")
436
+ print(f"Selected tools: {selected_tools}")
437
+ print(f"Using system prompt: {args.system_prompt}")
438
+
439
+ # Set up authentication (reads from CLI, env vars, or requires explicit choice)
440
+ auth_credentials = resolve_auth_credentials(args)
441
 
442
  # Setup the MedGemma environment if the MedGemmaVQATool is selected
443
+ medgemma_base_url_from_setup: Optional[str] = None
444
+ medgemma_api_url_effective: Optional[str] = args.medgemma_api_url
445
  if "MedGemmaVQATool" in selected_tools:
446
+ # Launch server and capture its URL if no explicit URL/ENV provided
447
+ try:
448
+ if medgemma_api_url_effective is None and os.getenv("MEDGEMMA_API_URL") is None:
449
+ medgemma_base_url_from_setup = setup_medgemma_env(cache_dir=model_dir, device=device)
450
+ # If we auto-launched, use this URL unless overridden later
451
+ if medgemma_base_url_from_setup:
452
+ medgemma_api_url_effective = medgemma_base_url_from_setup
453
+ print(f"MedGemma API auto-launched at {medgemma_api_url_effective}")
454
+ else:
455
+ # Still ensure environment is set up; it will bind to provided host/port
456
+ setup_medgemma_env(cache_dir=model_dir, device=device)
457
+ except Exception as e:
458
+ print(f"Warning: Failed to launch MedGemma service automatically: {e}")
459
 
460
  # Configure the Retrieval Augmented Generation (RAG) system
461
  # This allows the agent to access and use medical knowledge documents
462
  rag_config = RAGConfig(
463
+ model=args.rag_model,
464
+ embedding_model=args.rag_embedding_model,
465
+ rerank_model=args.rag_rerank_model,
466
+ temperature=args.rag_temperature,
467
+ pinecone_index_name=args.pinecone_index,
468
+ chunk_size=args.chunk_size,
469
+ chunk_overlap=args.chunk_overlap,
470
+ retriever_k=args.retriever_k,
471
+ local_docs_dir=args.rag_docs_dir,
472
  huggingface_datasets=["VictorLJZ/medrax2"], # List of HuggingFace datasets to load
473
  dataset_split="train", # Which split of the datasets to use
474
  )
 
477
  model_kwargs = {}
478
 
479
  agent, tools_dict = initialize_agent(
480
+ prompt_file=args.prompt_file,
481
  tools_to_use=selected_tools,
482
+ model_dir=model_dir,
483
+ temp_dir=args.temp_dir,
484
+ device=device,
485
+ model=args.model,
486
+ temperature=args.temperature,
 
487
  model_kwargs=model_kwargs,
488
  rag_config=rag_config,
489
+ system_prompt=args.system_prompt,
490
+ medgemma_api_url=medgemma_api_url_effective,
491
  )
492
 
493
+ # Launch based on selected mode
494
+ if args.mode == "gradio":
495
+ run_gradio_interface(
496
+ agent, tools_dict,
497
+ host=args.gradio_host,
498
+ port=args.gradio_port,
499
+ auth=auth_credentials,
500
+ share=args.share
501
+ )
502
+
503
+ elif args.mode == "api":
504
+ run_api_server(agent, tools_dict, args.api_host, args.api_port, args.public)
505
+
506
+ elif args.mode == "both":
507
+ # Run both services in separate threads
508
+ api_thread = threading.Thread(
509
+ target=run_api_server,
510
+ args=(agent, tools_dict, args.api_host, args.api_port, args.public)
511
+ )
512
+ api_thread.daemon = True
513
+ api_thread.start()
514
+
515
+ # Run Gradio in main thread with authentication and sharing
516
+ run_gradio_interface(
517
+ agent, tools_dict,
518
+ host=args.gradio_host,
519
+ port=args.gradio_port,
520
+ auth=auth_credentials,
521
+ share=args.share
522
+ )
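
The new entry point is driven entirely by CLI flags, for example python main.py --mode both --auth admin secret --share, or python main.py --mode api --public --tools ImageVisualizerTool DuckDuckGoSearchTool. The two resolution helpers can also be exercised on their own; a small sketch, assuming it is run from the repository root with MedRAX's dependencies installed:

import os
from argparse import Namespace

from main import resolve_auth_credentials, resolve_medgemma_api_url_from_value

# MedGemma URL: an explicit value wins over MEDGEMMA_API_URL, which wins over
# the localhost default (SLURM jobs must pass an explicit URL instead).
print(resolve_medgemma_api_url_from_value("http://10.0.0.5:8002"))  # explicit value
os.environ["MEDGEMMA_API_URL"] = "http://gpu-node:8002"             # assumed host name
print(resolve_medgemma_api_url_from_value(None))                    # env var fallback

# Auth: --auth beats the MEDRAX_AUTH_* env vars; --no-auth disables it silently.
print(resolve_auth_credentials(Namespace(no_auth=False, auth=("admin", "secret"))))
print(resolve_auth_credentials(Namespace(no_auth=True, auth=None)))  # -> None
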
medrax/agent/agent.py CHANGED
@@ -1,37 +1,17 @@
1
- import json
2
  import operator
3
- from pathlib import Path
4
- from dotenv import load_dotenv
5
- from datetime import datetime
6
  from typing import List, Dict, Any, TypedDict, Annotated, Optional
 
7
 
8
  from langgraph.graph import StateGraph, END
9
  from langchain_core.messages import AnyMessage, SystemMessage, ToolMessage, HumanMessage
 
 
10
  from langchain_core.language_models import BaseLanguageModel
11
  from langchain_core.tools import BaseTool
12
 
13
  _ = load_dotenv()
14
 
15
 
16
- class ToolCallLog(TypedDict):
17
- """
18
- A TypedDict representing a log entry for a tool call.
19
-
20
- Attributes:
21
- timestamp (str): The timestamp of when the tool call was made.
22
- tool_call_id (str): The unique identifier for the tool call.
23
- name (str): The name of the tool that was called.
24
- args (Any): The arguments passed to the tool.
25
- content (str): The content or result of the tool call.
26
- """
27
-
28
- timestamp: str
29
- tool_call_id: str
30
- name: str
31
- args: Any
32
- content: str
33
-
34
-
35
  class AgentState(TypedDict):
36
  """
37
  A TypedDict representing the state of an agent.
@@ -48,16 +28,14 @@ class AgentState(TypedDict):
48
  class Agent:
49
  """
50
  A class representing an agent that processes requests and executes tools based on
51
- language model responses.
52
 
53
  Attributes:
54
  model (BaseLanguageModel): The language model used for processing.
55
- tools (Dict[str, BaseTool]): A dictionary of available tools.
56
  checkpointer (Any): Manages and persists the agent's state.
57
  system_prompt (str): The system instructions for the agent.
58
  workflow (StateGraph): The compiled workflow for the agent's processing.
59
- log_tools (bool): Whether to log tool calls.
60
- log_path (Path): Path to save tool call logs.
61
  """
62
 
63
  def __init__(
@@ -66,8 +44,6 @@ class Agent:
66
  tools: List[BaseTool],
67
  checkpointer: Any = None,
68
  system_prompt: str = "",
69
- log_tools: bool = True,
70
- log_dir: Optional[str] = "logs",
71
  ):
72
  """
73
  Initialize the Agent.
@@ -77,28 +53,21 @@ class Agent:
77
  tools (List[BaseTool]): A list of available tools.
78
  checkpointer (Any, optional): State persistence manager. Defaults to None.
79
  system_prompt (str, optional): System instructions. Defaults to "".
80
- log_tools (bool, optional): Whether to log tool calls. Defaults to True.
81
- log_dir (str, optional): Directory to save logs. Defaults to 'logs'.
82
  """
83
  self.system_prompt = system_prompt
84
- self.log_tools = log_tools
85
 
86
- if self.log_tools:
87
- self.log_path = Path(log_dir or "logs")
88
- self.log_path.mkdir(exist_ok=True)
89
 
90
- # Define the agent workflow
91
  workflow = StateGraph(AgentState)
92
- workflow.add_node("process", self.process_request)
93
- workflow.add_node("execute", self.execute_tools)
94
- workflow.add_conditional_edges(
95
- "process", self.has_tool_calls, {True: "execute", False: END}
96
- )
97
- workflow.add_edge("execute", "process")
98
- workflow.set_entry_point("process")
99
 
100
  self.workflow = workflow.compile(checkpointer=checkpointer)
101
- self.tools = {t.name: t for t in tools}
102
  self.model = model.bind_tools(tools)
103
 
104
  def process_request(self, state: AgentState) -> Dict[str, List[AnyMessage]]:
@@ -148,65 +117,3 @@ class Agent:
148
  """
149
  response = state["messages"][-1]
150
  return len(response.tool_calls) > 0
151
-
152
- def execute_tools(self, state: AgentState) -> Dict[str, List[ToolMessage]]:
153
- """
154
- Execute tool calls from the model's response.
155
-
156
- Args:
157
- state (AgentState): The current state of the agent.
158
-
159
- Returns:
160
- Dict[str, List[ToolMessage]]: A dictionary containing tool execution results.
161
- """
162
- tool_calls = state["messages"][-1].tool_calls
163
- results = []
164
-
165
- for call in tool_calls:
166
- print(f"Executing tool: {call}")
167
- if call["name"] not in self.tools:
168
- print("\n....invalid tool....")
169
- result = "invalid tool, please retry"
170
- else:
171
- result = self.tools[call["name"]].invoke(call["args"])
172
-
173
- results.append(
174
- ToolMessage(
175
- tool_call_id=call["id"],
176
- name=call["name"],
177
- args=call["args"],
178
- content=str(result),
179
- )
180
- )
181
-
182
- self._save_tool_calls(results)
183
- print("Returning to model processing!")
184
-
185
- return {"messages": results}
186
-
187
- def _save_tool_calls(self, tool_calls: List[ToolMessage]) -> None:
188
- """
189
- Save tool calls to a JSON file with timestamp-based naming.
190
-
191
- Args:
192
- tool_calls (List[ToolMessage]): List of tool calls to save.
193
- """
194
- if not self.log_tools:
195
- return
196
-
197
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
198
- filename = self.log_path / f"tool_calls_{timestamp}.json"
199
-
200
- logs: List[ToolCallLog] = []
201
- for call in tool_calls:
202
- log_entry = {
203
- "tool_call_id": call.tool_call_id,
204
- "name": call.name,
205
- "args": call.args,
206
- "content": call.content,
207
- "timestamp": datetime.now().isoformat(),
208
- }
209
- logs.append(log_entry)
210
-
211
- with open(filename, "w") as f:
212
- json.dump(logs, f, indent=4)
 
 
1
  import operator
 
 
 
2
  from typing import List, Dict, Any, TypedDict, Annotated, Optional
3
+ from dotenv import load_dotenv
4
 
5
  from langgraph.graph import StateGraph, END
6
  from langchain_core.messages import AnyMessage, SystemMessage, ToolMessage, HumanMessage
7
+ from langgraph.prebuilt import ToolNode
8
+ from langchain_core.messages import AnyMessage, SystemMessage
9
  from langchain_core.language_models import BaseLanguageModel
10
  from langchain_core.tools import BaseTool
11
 
12
  _ = load_dotenv()
13
 
14
 
 
15
  class AgentState(TypedDict):
16
  """
17
  A TypedDict representing the state of an agent.
 
28
  class Agent:
29
  """
30
  A class representing an agent that processes requests and executes tools based on
31
+ language model responses with parallel tool execution capabilities.
32
 
33
  Attributes:
34
  model (BaseLanguageModel): The language model used for processing.
35
+ tool_node (ToolNode): The parallel tool execution node.
36
  checkpointer (Any): Manages and persists the agent's state.
37
  system_prompt (str): The system instructions for the agent.
38
  workflow (StateGraph): The compiled workflow for the agent's processing.
 
 
39
  """
40
 
41
  def __init__(
 
44
  tools: List[BaseTool],
45
  checkpointer: Any = None,
46
  system_prompt: str = "",
 
 
47
  ):
48
  """
49
  Initialize the Agent.
 
53
  tools (List[BaseTool]): A list of available tools.
54
  checkpointer (Any, optional): State persistence manager. Defaults to None.
55
  system_prompt (str, optional): System instructions. Defaults to "".
 
 
56
  """
57
  self.system_prompt = system_prompt
 
58
 
59
+ # Create the parallel tool execution node
60
+ self.tool_node = ToolNode(tools)
 
61
 
62
+ # Define the agent workflow with parallel tool execution
63
  workflow = StateGraph(AgentState)
64
+ workflow.add_node("agent", self.process_request)
65
+ workflow.add_node("tools", self.tool_node)
66
+ workflow.add_conditional_edges("agent", self.has_tool_calls, {True: "tools", False: END})
67
+ workflow.add_edge("tools", "agent")
68
+ workflow.set_entry_point("agent")
 
 
69
 
70
  self.workflow = workflow.compile(checkpointer=checkpointer)
 
71
  self.model = model.bind_tools(tools)
72
 
73
  def process_request(self, state: AgentState) -> Dict[str, List[AnyMessage]]:
 
117
  """
118
  response = state["messages"][-1]
119
  return len(response.tool_calls) > 0
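
# --- Hypothetical usage sketch (not part of this commit) ---------------------
# A minimal example of how the rewired, ToolNode-based Agent above might be
# instantiated and invoked. ChatOpenAI, MemorySaver, the tool list, and the
# import path are illustrative assumptions, not confirmed by the diff.
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import HumanMessage

from medrax.agent.agent import Agent  # assumed import path for the class above

agent = Agent(
    model=ChatOpenAI(model="gpt-4o"),   # any chat model that supports bind_tools
    tools=[],                           # plug in MedRAX tools here
    checkpointer=MemorySaver(),         # persists AgentState between turns
    system_prompt="You are a medical imaging assistant.",
)

# Each thread_id keeps its own conversation history in the checkpointer.
result = agent.workflow.invoke(
    {"messages": [HumanMessage(content="Summarize the findings on this chest X-ray.")]},
    config={"configurable": {"thread_id": "demo-thread"}},
)
print(result["messages"][-1].content)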
medrax/docs/system_prompts.txt CHANGED
@@ -33,4 +33,6 @@ Your final response for a multiple-choice question must strictly follow this for
33
  3. **Critical Thinking & Tool Use:** [Show your reasoning, including how you used tools and evaluated their output]
34
  4. **Final Answer:** \boxed{A}
35
 
36
- Do not provide a definitive diagnosis or treatment plan for a patient. Your purpose is to assist medical professionals with your analysis, not to replace them. You must maintain this persona and adhere to all instructions.
 
 
 
33
  3. **Critical Thinking & Tool Use:** [Show your reasoning, including how you used tools and evaluated their output]
34
  4. **Final Answer:** \boxed{A}
35
 
36
+ Do not provide a definitive diagnosis or treatment plan for a patient. Your purpose is to assist medical professionals with your analysis, not to replace them. You must maintain this persona and adhere to all instructions.
37
+
38
+
medrax/llava/conversation.py CHANGED
@@ -230,9 +230,7 @@ class Conversation:
230
  buffered = BytesIO()
231
  image.save(buffered, format="JPEG")
232
  img_b64_str = base64.b64encode(buffered.getvalue()).decode()
233
- img_str = (
234
- f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
235
- )
236
  msg = img_str + msg.replace("<image>", "").strip()
237
  ret.append([msg, None])
238
  else:
 
230
  buffered = BytesIO()
231
  image.save(buffered, format="JPEG")
232
  img_b64_str = base64.b64encode(buffered.getvalue()).decode()
233
+ img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="user upload image" />'
 
 
234
  msg = img_str + msg.replace("<image>", "").strip()
235
  ret.append([msg, None])
236
  else:
medrax/llava/eval/eval_multimodal_chat_gpt_score.py CHANGED
@@ -14,6 +14,7 @@ INSTRUCT_PROMPT = """We would like to request your feedback on the performance o
14
  Please first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."""
15
  ROLE = "Assistant"
16
 
 
17
  # Generate instruction for GPT-4 to score the two answers.
18
  def conv_to_str(fig_label, fig_caption, fig_context, question, ans1, ans2):
19
  return (
@@ -127,17 +128,13 @@ def main(args):
127
 
128
  if __name__ == "__main__":
129
  parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Scoring", add_help=True)
130
- parser.add_argument(
131
- "--answers-file", default="", metavar="FILE", help="path to model answer file"
132
- )
133
  parser.add_argument(
134
  "--question-file",
135
  default="data/questions/llava_med_eval_qa50_qa.jsonl",
136
  metavar="FILE",
137
  help="path to multichat questions file",
138
  )
139
- parser.add_argument(
140
- "--scores-file", default="", metavar="FILE", help="path to save gpt-4 score file"
141
- )
142
  args = parser.parse_args()
143
  main(args)
 
14
  Please first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."""
15
  ROLE = "Assistant"
16
 
17
+
18
  # Generate instruction for GPT-4 to score the two answers.
19
  def conv_to_str(fig_label, fig_caption, fig_context, question, ans1, ans2):
20
  return (
 
128
 
129
  if __name__ == "__main__":
130
  parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Scoring", add_help=True)
131
+ parser.add_argument("--answers-file", default="", metavar="FILE", help="path to model answer file")
 
 
132
  parser.add_argument(
133
  "--question-file",
134
  default="data/questions/llava_med_eval_qa50_qa.jsonl",
135
  metavar="FILE",
136
  help="path to multichat questions file",
137
  )
138
+ parser.add_argument("--scores-file", default="", metavar="FILE", help="path to save gpt-4 score file")
 
 
139
  args = parser.parse_args()
140
  main(args)
medrax/llava/eval/llm.py CHANGED
@@ -21,9 +21,7 @@ class LLM(abc.ABC):
21
  raise NotImplementedError("Subclasses should implement this!")
22
 
23
  @abstractmethod
24
- def split_input(
25
- self, fixed_instruction, few_shot_examples, splittable_input, input_header, output_header
26
- ):
27
  raise NotImplementedError("Subclasses should implement this!")
28
 
29
 
@@ -49,9 +47,7 @@ class GPT(LLM):
49
  def __init__(self, model_id):
50
  self.temperature = 0.0
51
  self.top_k = 1
52
- self.encoding = tiktoken.encoding_for_model(
53
- "-".join(model_id.split("-", 2)[:2]).replace("5", ".5")
54
- )
55
  self.openai_api = "default"
56
  self.model_id = model_id
57
  self.max_length = self.deployment_max_length_dict[model_id]
@@ -61,9 +57,7 @@ class GPT(LLM):
61
  azure_endpoint=self.openai_cxn_dict[self.openai_api]["endpoint"],
62
  )
63
 
64
- def gen_messages(
65
- self, fixed_instruction, few_shot_examples, input, input_header, output_header
66
- ):
67
  messages = [
68
  {
69
  "role": "system",
@@ -120,18 +114,13 @@ class GPT(LLM):
120
  ):
121
  return asyncio.run(self.dispatch_openai_requests(messages_list))
122
 
123
- def split_input(
124
- self, fixed_instruction, few_shot_examples, splittable_input, input_header, output_header
125
- ):
126
  # Tokenize fixed_prompt
127
  fixed_token_ids = self.encoding.encode(
128
- fixed_instruction
129
- + " ".join([x["user"] + " " + x["assistant"] for x in few_shot_examples])
130
  )
131
  # Calculate remaining token length
132
- remaining_token_len = math.ceil(
133
- (self.prompt_percent * self.max_length) - len(fixed_token_ids)
134
- )
135
 
136
  # Tokenize splittable_input
137
  split_token_ids = self.encoding.encode(splittable_input)
@@ -141,14 +130,10 @@ class GPT(LLM):
141
  split_token_ids[i : i + remaining_token_len + 10]
142
  for i in range(0, len(split_token_ids), remaining_token_len)
143
  ]
144
- split_input_list = [
145
- self.encoding.decode(split_token_ids) for split_token_ids in split_token_ids_list
146
- ]
147
 
148
  # Take the fixed_prompt, few_shot_examples, splitted inputs, and input/output headers and generate list of prompt strings.
149
  return [
150
- self.gen_messages(
151
- fixed_instruction, few_shot_examples, split_input, input_header, output_header
152
- )
153
  for split_input in split_input_list
154
  ]
 
21
  raise NotImplementedError("Subclasses should implement this!")
22
 
23
  @abstractmethod
24
+ def split_input(self, fixed_instruction, few_shot_examples, splittable_input, input_header, output_header):
 
 
25
  raise NotImplementedError("Subclasses should implement this!")
26
 
27
 
 
47
  def __init__(self, model_id):
48
  self.temperature = 0.0
49
  self.top_k = 1
50
+ self.encoding = tiktoken.encoding_for_model("-".join(model_id.split("-", 2)[:2]).replace("5", ".5"))
 
 
51
  self.openai_api = "default"
52
  self.model_id = model_id
53
  self.max_length = self.deployment_max_length_dict[model_id]
 
57
  azure_endpoint=self.openai_cxn_dict[self.openai_api]["endpoint"],
58
  )
59
 
60
+ def gen_messages(self, fixed_instruction, few_shot_examples, input, input_header, output_header):
 
 
61
  messages = [
62
  {
63
  "role": "system",
 
114
  ):
115
  return asyncio.run(self.dispatch_openai_requests(messages_list))
116
 
117
+ def split_input(self, fixed_instruction, few_shot_examples, splittable_input, input_header, output_header):
 
 
118
  # Tokenize fixed_prompt
119
  fixed_token_ids = self.encoding.encode(
120
+ fixed_instruction + " ".join([x["user"] + " " + x["assistant"] for x in few_shot_examples])
 
121
  )
122
  # Calculate remaining token length
123
+ remaining_token_len = math.ceil((self.prompt_percent * self.max_length) - len(fixed_token_ids))
 
 
124
 
125
  # Tokenize splittable_input
126
  split_token_ids = self.encoding.encode(splittable_input)
 
130
  split_token_ids[i : i + remaining_token_len + 10]
131
  for i in range(0, len(split_token_ids), remaining_token_len)
132
  ]
133
+ split_input_list = [self.encoding.decode(split_token_ids) for split_token_ids in split_token_ids_list]
 
 
134
 
135
  # Take the fixed_prompt, few_shot_examples, splitted inputs, and input/output headers and generate list of prompt strings.
136
  return [
137
+ self.gen_messages(fixed_instruction, few_shot_examples, split_input, input_header, output_header)
 
 
138
  for split_input in split_input_list
139
  ]
medrax/llava/eval/model_vqa.py CHANGED
@@ -45,9 +45,7 @@ def eval_model(args):
45
  disable_torch_init()
46
  model_path = os.path.expanduser(args.model_path)
47
  model_name = get_model_name_from_path(model_path)
48
- tokenizer, model, image_processor, context_len = load_pretrained_model(
49
- model_path, args.model_base, model_name
50
- )
51
 
52
  questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
53
  questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
@@ -69,11 +67,7 @@ def eval_model(args):
69
  conv.append_message(conv.roles[1], None)
70
  prompt = conv.get_prompt()
71
 
72
- input_ids = (
73
- tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
74
- .unsqueeze(0)
75
- .cuda()
76
- )
77
 
78
  image = Image.open(os.path.join(args.image_folder, image_file))
79
  image_tensor = process_images([image], image_processor, model.config)[0]
 
45
  disable_torch_init()
46
  model_path = os.path.expanduser(args.model_path)
47
  model_name = get_model_name_from_path(model_path)
48
+ tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
 
 
49
 
50
  questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
51
  questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
 
67
  conv.append_message(conv.roles[1], None)
68
  prompt = conv.get_prompt()
69
 
70
+ input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
 
 
 
 
71
 
72
  image = Image.open(os.path.join(args.image_folder, image_file))
73
  image_tensor = process_images([image], image_processor, model.config)[0]
medrax/llava/eval/summarize_gpt_review.py CHANGED
@@ -14,8 +14,7 @@ def get_domain(x):
14
  def main(args):
15
  scores_data = util.load_file_jsonl(args.scores_file)
16
  predictions = [
17
- (x["question_id"], x["type"], get_domain(x), x["gpt_eval"].split("\n")[0].split(" "))
18
- for x in scores_data
19
  ]
20
 
21
  score_type_dict = defaultdict(lambda: defaultdict(list))
@@ -33,8 +32,7 @@ def main(args):
33
  result[q_type]["gpt4_score"] = util.get_avg(score_dict[1])
34
  result[q_type]["pred_score"] = util.get_avg(score_dict[2])
35
  result[q_type]["pred_relative_score"] = (
36
- util.get_avg([float(s2) / float(s1) for s1, s2 in zip(score_dict[1], score_dict[2])])
37
- * 100
38
  )
39
  result[q_type]["data_size"] = len(score_dict[1])
40
 
@@ -55,8 +53,6 @@ def main(args):
55
 
56
  if __name__ == "__main__":
57
  parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Eval Postprocessing", add_help=True)
58
- parser.add_argument(
59
- "--scores-file", default="", metavar="FILE", help="input path to gpt-4 score file"
60
- )
61
  args = parser.parse_args()
62
  main(args)
 
14
  def main(args):
15
  scores_data = util.load_file_jsonl(args.scores_file)
16
  predictions = [
17
+ (x["question_id"], x["type"], get_domain(x), x["gpt_eval"].split("\n")[0].split(" ")) for x in scores_data
 
18
  ]
19
 
20
  score_type_dict = defaultdict(lambda: defaultdict(list))
 
32
  result[q_type]["gpt4_score"] = util.get_avg(score_dict[1])
33
  result[q_type]["pred_score"] = util.get_avg(score_dict[2])
34
  result[q_type]["pred_relative_score"] = (
35
+ util.get_avg([float(s2) / float(s1) for s1, s2 in zip(score_dict[1], score_dict[2])]) * 100
 
36
  )
37
  result[q_type]["data_size"] = len(score_dict[1])
38
 
 
53
 
54
  if __name__ == "__main__":
55
  parser = argparse.ArgumentParser("GPT-4 Multimodal Chat Eval Postprocessing", add_help=True)
56
+ parser.add_argument("--scores-file", default="", metavar="FILE", help="input path to gpt-4 score file")
 
 
57
  args = parser.parse_args()
58
  main(args)
medrax/llava/mm_utils.py CHANGED
@@ -35,9 +35,7 @@ def process_images(images, image_processor, model_cfg):
35
  for image in images:
36
  if image_aspect_ratio == "pad":
37
  if image.mode == "L":
38
- background_color = int(
39
- 255 * sum(image_processor.image_mean) / len(image_processor.image_mean)
40
- )
41
  else:
42
  background_color = tuple(int(x * 255) for x in image_processor.image_mean)
43
  image = expand2square(image, background_color)
@@ -48,9 +46,7 @@ def process_images(images, image_processor, model_cfg):
48
  return new_images
49
 
50
 
51
- def tokenizer_image_token(
52
- prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
53
- ):
54
  prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
55
 
56
  def insert_separator(X, sep):
@@ -58,11 +54,7 @@ def tokenizer_image_token(
58
 
59
  input_ids = []
60
  offset = 0
61
- if (
62
- len(prompt_chunks) > 0
63
- and len(prompt_chunks[0]) > 0
64
- and prompt_chunks[0][0] == tokenizer.bos_token_id
65
- ):
66
  offset = 1
67
  input_ids.append(prompt_chunks[0][0])
68
 
@@ -100,9 +92,7 @@ class KeywordsStoppingCriteria(StoppingCriteria):
100
  self.tokenizer = tokenizer
101
  self.start_len = input_ids.shape[1]
102
 
103
- def call_for_batch(
104
- self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
105
- ) -> bool:
106
  offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
107
  self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
108
  for keyword_id in self.keyword_ids:
 
35
  for image in images:
36
  if image_aspect_ratio == "pad":
37
  if image.mode == "L":
38
+ background_color = int(255 * sum(image_processor.image_mean) / len(image_processor.image_mean))
 
 
39
  else:
40
  background_color = tuple(int(x * 255) for x in image_processor.image_mean)
41
  image = expand2square(image, background_color)
 
46
  return new_images
47
 
48
 
49
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
 
 
50
  prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
51
 
52
  def insert_separator(X, sep):
 
54
 
55
  input_ids = []
56
  offset = 0
57
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
 
 
 
 
58
  offset = 1
59
  input_ids.append(prompt_chunks[0][0])
60
 
 
92
  self.tokenizer = tokenizer
93
  self.start_len = input_ids.shape[1]
94
 
95
+ def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
 
 
96
  offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
97
  self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
98
  for keyword_id in self.keyword_ids:
medrax/llava/model/builder.py CHANGED
@@ -59,9 +59,7 @@ def load_pretrained_model(
59
  # PEFT model
60
  from peft import PeftModel
61
 
62
- tokenizer = AutoTokenizer.from_pretrained(
63
- model_base, use_fast=False, cache_dir=cache_dir
64
- )
65
  model = AutoModelForCausalLM.from_pretrained(
66
  model_base,
67
  low_cpu_mem_usage=True,
@@ -78,9 +76,7 @@ def load_pretrained_model(
78
  else:
79
  use_fast = False
80
  if "mpt" in model_name.lower():
81
- tokenizer = AutoTokenizer.from_pretrained(
82
- model_path, use_fast=True, cache_dir=cache_dir
83
- )
84
  model = AutoModelForCausalLM.from_pretrained(
85
  model_path,
86
  low_cpu_mem_usage=True,
@@ -90,9 +86,7 @@ def load_pretrained_model(
90
  **kwargs,
91
  )
92
  else:
93
- tokenizer = AutoTokenizer.from_pretrained(
94
- model_path, use_fast=False, cache_dir=cache_dir
95
- )
96
  model = AutoModelForCausalLM.from_pretrained(
97
  model_path,
98
  low_cpu_mem_usage=True,
@@ -109,9 +103,7 @@ def load_pretrained_model(
109
  if mm_use_im_patch_token:
110
  tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
111
  if mm_use_im_start_end:
112
- tokenizer.add_tokens(
113
- [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
114
- )
115
  model.resize_token_embeddings(len(tokenizer))
116
 
117
  vision_tower = model.get_vision_tower()
 
59
  # PEFT model
60
  from peft import PeftModel
61
 
62
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, cache_dir=cache_dir)
 
 
63
  model = AutoModelForCausalLM.from_pretrained(
64
  model_base,
65
  low_cpu_mem_usage=True,
 
76
  else:
77
  use_fast = False
78
  if "mpt" in model_name.lower():
79
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, cache_dir=cache_dir)
 
 
80
  model = AutoModelForCausalLM.from_pretrained(
81
  model_path,
82
  low_cpu_mem_usage=True,
 
86
  **kwargs,
87
  )
88
  else:
89
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, cache_dir=cache_dir)
 
 
90
  model = AutoModelForCausalLM.from_pretrained(
91
  model_path,
92
  low_cpu_mem_usage=True,
 
103
  if mm_use_im_patch_token:
104
  tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
105
  if mm_use_im_start_end:
106
+ tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
 
 
107
  model.resize_token_embeddings(len(tokenizer))
108
 
109
  vision_tower = model.get_vision_tower()
medrax/llava/model/language_model/llava_mistral.py CHANGED
@@ -125,9 +125,7 @@ class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
125
  **kwargs,
126
  )
127
 
128
- def prepare_inputs_for_generation(
129
- self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
130
- ):
131
  images = kwargs.pop("images", None)
132
  image_sizes = kwargs.pop("image_sizes", None)
133
  inputs = super().prepare_inputs_for_generation(
 
125
  **kwargs,
126
  )
127
 
128
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
 
 
129
  images = kwargs.pop("images", None)
130
  image_sizes = kwargs.pop("image_sizes", None)
131
  inputs = super().prepare_inputs_for_generation(
medrax/llava/model/llava_arch.py CHANGED
@@ -104,9 +104,7 @@ class LlavaMetaModel:
104
  checkpoint_folder = os.path.dirname(pretrain_mm_mlp_adapter)
105
  ckpts = glob(f"{checkpoint_folder}/checkpoint-*", recursive=False)
106
  if len(ckpts) > 0:
107
- vision_module_weights = torch.load(
108
- f"{ckpts[-1]}/mm_projector.bin", map_location="cpu"
109
- )
110
  model_dict = get_w(vision_module_weights, "vision_tower")
111
  print(f"Loading vision module weights from {ckpts[-1]}/mm_projector.bin")
112
  # print keys in model_dict
@@ -170,9 +168,7 @@ class LlavaMetaForCausalLM(ABC):
170
  image_features = self.encode_images(images).to(self.device)
171
 
172
  # TODO: image start / end is not implemented here to support pretraining.
173
- if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
174
- self.config, "mm_use_im_start_end", False
175
- ):
176
  raise NotImplementedError
177
 
178
  # Let's just add dummy tensors if they do not exist,
@@ -188,21 +184,15 @@ class LlavaMetaForCausalLM(ABC):
188
  else:
189
  attention_mask = attention_mask.bool()
190
  if position_ids is None:
191
- position_ids = torch.arange(
192
- 0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
193
- )
194
 
195
  if labels is None:
196
  labels = torch.full_like(input_ids, IGNORE_INDEX)
197
 
198
  input_ids = [
199
- cur_input_ids[cur_attention_mask]
200
- for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
201
- ]
202
- labels = [
203
- cur_labels[cur_attention_mask]
204
- for cur_labels, cur_attention_mask in zip(labels, attention_mask)
205
  ]
 
206
 
207
  new_input_embeds = []
208
  new_labels = []
@@ -219,20 +209,14 @@ class LlavaMetaForCausalLM(ABC):
219
  continue
220
 
221
  image_token_indices = (
222
- [-1]
223
- + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist()
224
- + [cur_input_ids.shape[0]]
225
  )
226
  cur_input_ids_noim = []
227
  cur_labels = labels[batch_idx]
228
  cur_labels_noim = []
229
  for i in range(len(image_token_indices) - 1):
230
- cur_input_ids_noim.append(
231
- cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]]
232
- )
233
- cur_labels_noim.append(
234
- cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]]
235
- )
236
 
237
  split_sizes = [x.shape[0] for x in cur_labels_noim]
238
  cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
@@ -279,12 +263,8 @@ class LlavaMetaForCausalLM(ABC):
279
  dtype=new_labels[0].dtype,
280
  device=new_labels[0].device,
281
  )
282
- attention_mask = torch.zeros(
283
- (batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device
284
- )
285
- position_ids = torch.zeros(
286
- (batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device
287
- )
288
 
289
  for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
290
  cur_len = cur_new_embed.shape[0]
@@ -351,9 +331,7 @@ class LlavaMetaForCausalLM(ABC):
351
  self.resize_token_embeddings(len(tokenizer))
352
 
353
  if model_args.mm_use_im_start_end:
354
- num_new_tokens = tokenizer.add_tokens(
355
- [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
356
- )
357
  self.resize_token_embeddings(len(tokenizer))
358
 
359
  if num_new_tokens > 0:
@@ -361,9 +339,7 @@ class LlavaMetaForCausalLM(ABC):
361
  output_embeddings = self.get_output_embeddings().weight.data
362
 
363
  input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
364
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
365
- dim=0, keepdim=True
366
- )
367
 
368
  input_embeddings[-num_new_tokens:] = input_embeddings_avg
369
  output_embeddings[-num_new_tokens:] = output_embeddings_avg
@@ -375,9 +351,7 @@ class LlavaMetaForCausalLM(ABC):
375
  p.requires_grad = False
376
 
377
  if model_args.pretrain_mm_mlp_adapter:
378
- mm_projector_weights = torch.load(
379
- model_args.pretrain_mm_mlp_adapter, map_location="cpu"
380
- )
381
  embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
382
  assert num_new_tokens == 2
383
  if input_embeddings.shape == embed_tokens_weight.shape:
 
104
  checkpoint_folder = os.path.dirname(pretrain_mm_mlp_adapter)
105
  ckpts = glob(f"{checkpoint_folder}/checkpoint-*", recursive=False)
106
  if len(ckpts) > 0:
107
+ vision_module_weights = torch.load(f"{ckpts[-1]}/mm_projector.bin", map_location="cpu")
 
 
108
  model_dict = get_w(vision_module_weights, "vision_tower")
109
  print(f"Loading vision module weights from {ckpts[-1]}/mm_projector.bin")
110
  # print keys in model_dict
 
168
  image_features = self.encode_images(images).to(self.device)
169
 
170
  # TODO: image start / end is not implemented here to support pretraining.
171
+ if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(self.config, "mm_use_im_start_end", False):
 
 
172
  raise NotImplementedError
173
 
174
  # Let's just add dummy tensors if they do not exist,
 
184
  else:
185
  attention_mask = attention_mask.bool()
186
  if position_ids is None:
187
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
 
 
188
 
189
  if labels is None:
190
  labels = torch.full_like(input_ids, IGNORE_INDEX)
191
 
192
  input_ids = [
193
+ cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
 
 
 
 
 
194
  ]
195
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
196
 
197
  new_input_embeds = []
198
  new_labels = []
 
209
  continue
210
 
211
  image_token_indices = (
212
+ [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
 
 
213
  )
214
  cur_input_ids_noim = []
215
  cur_labels = labels[batch_idx]
216
  cur_labels_noim = []
217
  for i in range(len(image_token_indices) - 1):
218
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1 : image_token_indices[i + 1]])
219
+ cur_labels_noim.append(cur_labels[image_token_indices[i] + 1 : image_token_indices[i + 1]])
 
 
 
 
220
 
221
  split_sizes = [x.shape[0] for x in cur_labels_noim]
222
  cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
 
263
  dtype=new_labels[0].dtype,
264
  device=new_labels[0].device,
265
  )
266
+ attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
267
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
 
 
 
 
268
 
269
  for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
270
  cur_len = cur_new_embed.shape[0]
 
331
  self.resize_token_embeddings(len(tokenizer))
332
 
333
  if model_args.mm_use_im_start_end:
334
+ num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
 
 
335
  self.resize_token_embeddings(len(tokenizer))
336
 
337
  if num_new_tokens > 0:
 
339
  output_embeddings = self.get_output_embeddings().weight.data
340
 
341
  input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
342
+ output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
 
 
343
 
344
  input_embeddings[-num_new_tokens:] = input_embeddings_avg
345
  output_embeddings[-num_new_tokens:] = output_embeddings_avg
 
351
  p.requires_grad = False
352
 
353
  if model_args.pretrain_mm_mlp_adapter:
354
+ mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location="cpu")
 
 
355
  embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
356
  assert num_new_tokens == 2
357
  if input_embeddings.shape == embed_tokens_weight.shape:
medrax/llava/model/multimodal_encoder/builder.py CHANGED
@@ -3,13 +3,7 @@ from .clip_encoder import CLIPVisionTower
3
 
4
 
5
  def build_vision_tower(vision_tower_cfg, **kwargs):
6
- vision_tower = getattr(
7
- vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)
8
- )
9
  is_absolute_path_exists = os.path.exists(vision_tower)
10
- if (
11
- is_absolute_path_exists
12
- or vision_tower.startswith("openai")
13
- or vision_tower.startswith("laion")
14
- ):
15
  return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
 
3
 
4
 
5
  def build_vision_tower(vision_tower_cfg, **kwargs):
6
+ vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
 
 
7
  is_absolute_path_exists = os.path.exists(vision_tower)
8
+ if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
 
 
 
 
9
  return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
medrax/llava/model/multimodal_projector/builder.py CHANGED
@@ -19,9 +19,7 @@ class SimpleResBlock(nn.Module):
19
  super().__init__()
20
  self.pre_norm = nn.LayerNorm(channels)
21
 
22
- self.proj = nn.Sequential(
23
- nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels)
24
- )
25
 
26
  def forward(self, x):
27
  x = self.pre_norm(x)
 
19
  super().__init__()
20
  self.pre_norm = nn.LayerNorm(channels)
21
 
22
+ self.proj = nn.Sequential(nn.Linear(channels, channels), nn.GELU(), nn.Linear(channels, channels))
 
 
23
 
24
  def forward(self, x):
25
  x = self.pre_norm(x)
medrax/llava/serve/cli.py CHANGED
@@ -94,9 +94,7 @@ def main(args):
94
  if image is not None:
95
  # first message
96
  if model.config.mm_use_im_start_end:
97
- inp = (
98
- DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + inp
99
- )
100
  else:
101
  inp = DEFAULT_IMAGE_TOKEN + "\n" + inp
102
  conv.append_message(conv.roles[0], inp)
 
94
  if image is not None:
95
  # first message
96
  if model.config.mm_use_im_start_end:
97
+ inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + inp
 
 
98
  else:
99
  inp = DEFAULT_IMAGE_TOKEN + "\n" + inp
100
  conv.append_message(conv.roles[0], inp)
medrax/llava/serve/controller.py CHANGED
@@ -2,6 +2,7 @@
2
  A controller manages distributed workers.
3
  It sends worker addresses to clients.
4
  """
 
5
  import argparse
6
  import dataclasses
7
  from enum import Enum, auto
@@ -199,9 +200,7 @@ class Controller:
199
  yield json.dumps(ret).encode() + b"\0"
200
 
201
  try:
202
- response = requests.post(
203
- worker_addr + "/worker_generate_stream", json=params, stream=True, timeout=5
204
- )
205
  for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
206
  if chunk:
207
  yield chunk + b"\0"
@@ -240,9 +239,7 @@ app = FastAPI()
240
  @app.post("/register_worker")
241
  async def register_worker(request: Request):
242
  data = await request.json()
243
- controller.register_worker(
244
- data["worker_name"], data["check_heart_beat"], data.get("worker_status", None)
245
- )
246
 
247
 
248
  @app.post("/refresh_all_workers")
 
2
  A controller manages distributed workers.
3
  It sends worker addresses to clients.
4
  """
5
+
6
  import argparse
7
  import dataclasses
8
  from enum import Enum, auto
 
200
  yield json.dumps(ret).encode() + b"\0"
201
 
202
  try:
203
+ response = requests.post(worker_addr + "/worker_generate_stream", json=params, stream=True, timeout=5)
 
 
204
  for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
205
  if chunk:
206
  yield chunk + b"\0"
 
239
  @app.post("/register_worker")
240
  async def register_worker(request: Request):
241
  data = await request.json()
242
+ controller.register_worker(data["worker_name"], data["check_heart_beat"], data.get("worker_status", None))
 
 
243
 
244
 
245
  @app.post("/refresh_all_workers")
medrax/llava/serve/gradio_web_server.py CHANGED
@@ -216,9 +216,7 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
216
  all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
217
  for image, hash in zip(all_images, all_image_hash):
218
  t = datetime.datetime.now()
219
- filename = os.path.join(
220
- LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg"
221
- )
222
  if not os.path.isfile(filename):
223
  os.makedirs(os.path.dirname(filename), exist_ok=True)
224
  image.save(filename)
@@ -230,9 +228,7 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
230
  "temperature": float(temperature),
231
  "top_p": float(top_p),
232
  "max_new_tokens": min(int(max_new_tokens), 1536),
233
- "stop": state.sep
234
- if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT]
235
- else state.sep2,
236
  "images": f"List of {len(state.get_images())} images: {all_image_hash}",
237
  }
238
  logger.info(f"==== request ====\n{pload}")
@@ -330,9 +326,7 @@ block_css = """
330
 
331
 
332
  def build_demo(embed_mode):
333
- textbox = gr.Textbox(
334
- show_label=False, placeholder="Enter text and press ENTER", container=False
335
- )
336
  with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo:
337
  state = gr.State()
338
 
@@ -468,9 +462,7 @@ def build_demo(embed_mode):
468
  [state, chatbot] + btn_list,
469
  )
470
 
471
- clear_btn.click(
472
- clear_history, None, [state, chatbot, textbox, imagebox] + btn_list, queue=False
473
- )
474
 
475
  textbox.submit(
476
  add_text,
 
216
  all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
217
  for image, hash in zip(all_images, all_image_hash):
218
  t = datetime.datetime.now()
219
+ filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
 
 
220
  if not os.path.isfile(filename):
221
  os.makedirs(os.path.dirname(filename), exist_ok=True)
222
  image.save(filename)
 
228
  "temperature": float(temperature),
229
  "top_p": float(top_p),
230
  "max_new_tokens": min(int(max_new_tokens), 1536),
231
+ "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
 
 
232
  "images": f"List of {len(state.get_images())} images: {all_image_hash}",
233
  }
234
  logger.info(f"==== request ====\n{pload}")
 
326
 
327
 
328
  def build_demo(embed_mode):
329
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
 
 
330
  with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo:
331
  state = gr.State()
332
 
 
462
  [state, chatbot] + btn_list,
463
  )
464
 
465
+ clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox] + btn_list, queue=False)
 
 
466
 
467
  textbox.submit(
468
  add_text,
medrax/llava/serve/model_worker.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  A model worker executes the model.
3
  """
 
4
  import argparse
5
  import asyncio
6
  import json
@@ -155,9 +156,7 @@ class ModelWorker:
155
  if images is not None and len(images) > 0 and self.is_multimodal:
156
  if len(images) > 0:
157
  if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
158
- raise ValueError(
159
- "Number of images does not match number of <image> tokens in prompt"
160
- )
161
 
162
  images = [load_image_from_base64(image) for image in images]
163
  images = process_images(images, image_processor, model.config)
@@ -172,9 +171,7 @@ class ModelWorker:
172
  replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
173
  prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
174
 
175
- num_image_tokens = (
176
- prompt.count(replace_token) * model.get_vision_tower().num_patches
177
- )
178
  else:
179
  images = None
180
  image_args = {"images": images}
@@ -196,19 +193,14 @@ class ModelWorker:
196
  )
197
  keywords = [stop_str]
198
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
199
- streamer = TextIteratorStreamer(
200
- tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15
201
- )
202
 
203
- max_new_tokens = min(
204
- max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens
205
- )
206
 
207
  if max_new_tokens < 1:
208
  yield json.dumps(
209
  {
210
- "text": ori_prompt
211
- + "Exceeds max token length. Please start a new conversation, thanks.",
212
  "error_code": 0,
213
  }
214
  ).encode() + b"\0"
 
1
  """
2
  A model worker executes the model.
3
  """
4
+
5
  import argparse
6
  import asyncio
7
  import json
 
156
  if images is not None and len(images) > 0 and self.is_multimodal:
157
  if len(images) > 0:
158
  if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
159
+ raise ValueError("Number of images does not match number of <image> tokens in prompt")
 
 
160
 
161
  images = [load_image_from_base64(image) for image in images]
162
  images = process_images(images, image_processor, model.config)
 
171
  replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
172
  prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
173
 
174
+ num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
 
 
175
  else:
176
  images = None
177
  image_args = {"images": images}
 
193
  )
194
  keywords = [stop_str]
195
  stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
196
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)
 
 
197
 
198
+ max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)
 
 
199
 
200
  if max_new_tokens < 1:
201
  yield json.dumps(
202
  {
203
+ "text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.",
 
204
  "error_code": 0,
205
  }
206
  ).encode() + b"\0"
medrax/llava/serve/test_message.py CHANGED
@@ -17,9 +17,7 @@ def main():
17
  models.sort()
18
  print(f"Models: {models}")
19
 
20
- ret = requests.post(
21
- controller_addr + "/get_worker_address", json={"model": args.model_name}
22
- )
23
  worker_addr = ret.json()["address"]
24
  print(f"worker_addr: {worker_addr}")
25
 
@@ -38,9 +36,7 @@ def main():
38
  "temperature": 0.7,
39
  "stop": conv.sep2,
40
  }
41
- response = requests.post(
42
- worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True
43
- )
44
 
45
  print(prompt, end="")
46
  for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
 
17
  models.sort()
18
  print(f"Models: {models}")
19
 
20
+ ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name})
 
 
21
  worker_addr = ret.json()["address"]
22
  print(f"worker_addr: {worker_addr}")
23
 
 
36
  "temperature": 0.7,
37
  "stop": conv.sep2,
38
  }
39
+ response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True)
 
 
40
 
41
  print(prompt, end="")
42
  for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
medrax/llava/utils.py CHANGED
@@ -45,9 +45,7 @@ def build_logger(logger_name, logger_filename):
45
  if handler is None:
46
  os.makedirs(LOGDIR, exist_ok=True)
47
  filename = os.path.join(LOGDIR, logger_filename)
48
- handler = logging.handlers.TimedRotatingFileHandler(
49
- filename, when="D", utc=True, encoding="UTF-8"
50
- )
51
  handler.setFormatter(formatter)
52
 
53
  for name, item in logging.root.manager.loggerDict.items():
 
45
  if handler is None:
46
  os.makedirs(LOGDIR, exist_ok=True)
47
  filename = os.path.join(LOGDIR, logger_filename)
48
+ handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True, encoding="UTF-8")
 
 
49
  handler.setFormatter(formatter)
50
 
51
  for name, item in logging.root.manager.loggerDict.items():
medrax/models/model_factory.py CHANGED
@@ -29,7 +29,7 @@ class ModelFactory:
29
  "base_url_key": "OPENAI_BASE_URL",
30
  },
31
  "gemini": {
32
- "class": ChatGoogleGenerativeAI,
33
  "env_key": "GOOGLE_API_KEY",
34
  "base_url_key": "GOOGLE_BASE_URL",
35
  },
@@ -42,14 +42,12 @@ class ModelFactory:
42
  "grok": {
43
  "class": ChatXAI,
44
  "env_key": "XAI_API_KEY",
45
- }
46
  # Add more providers with default configurations here
47
  }
48
 
49
  @classmethod
50
- def register_provider(
51
- cls, prefix: str, model_class: Type[BaseLanguageModel], env_key: str, **kwargs
52
- ) -> None:
53
  """Register a new model provider.
54
 
55
  Args:
@@ -81,9 +79,7 @@ class ModelFactory:
81
  ValueError: If the required API key is missing
82
  """
83
  # Find the matching provider based on model name prefix
84
- provider_prefix = next(
85
- (prefix for prefix in cls._model_providers if model_name.startswith(prefix)), None
86
- )
87
 
88
  if not provider_prefix:
89
  raise ValueError(
@@ -153,7 +149,4 @@ class ModelFactory:
153
  Dict[str, Dict[str, Any]]: Dictionary of registered providers and their configurations
154
  """
155
  # Return a copy to prevent accidental modification
156
- return {
157
- k: {kk: vv for kk, vv in v.items() if kk != "class"}
158
- for k, v in cls._model_providers.items()
159
- }
 
29
  "base_url_key": "OPENAI_BASE_URL",
30
  },
31
  "gemini": {
32
+ "class": ChatGoogleGenerativeAI,
33
  "env_key": "GOOGLE_API_KEY",
34
  "base_url_key": "GOOGLE_BASE_URL",
35
  },
 
42
  "grok": {
43
  "class": ChatXAI,
44
  "env_key": "XAI_API_KEY",
45
+ },
46
  # Add more providers with default configurations here
47
  }
48
 
49
  @classmethod
50
+ def register_provider(cls, prefix: str, model_class: Type[BaseLanguageModel], env_key: str, **kwargs) -> None:
 
 
51
  """Register a new model provider.
52
 
53
  Args:
 
79
  ValueError: If the required API key is missing
80
  """
81
  # Find the matching provider based on model name prefix
82
+ provider_prefix = next((prefix for prefix in cls._model_providers if model_name.startswith(prefix)), None)
 
 
83
 
84
  if not provider_prefix:
85
  raise ValueError(
 
149
  Dict[str, Dict[str, Any]]: Dictionary of registered providers and their configurations
150
  """
151
  # Return a copy to prevent accidental modification
152
+ return {k: {kk: vv for kk, vv in v.items() if kk != "class"} for k, v in cls._model_providers.items()}
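
# --- Hypothetical usage sketch (not part of this commit) ---------------------
# Registering an additional provider prefix with ModelFactory, following the
# register_provider signature shown above. The provider class and environment
# variable below are illustrative assumptions.
from langchain_anthropic import ChatAnthropic

from medrax.models.model_factory import ModelFactory  # assumed import path

ModelFactory.register_provider(
    prefix="claude",              # model names starting with "claude" resolve to this provider
    model_class=ChatAnthropic,    # LangChain chat model class to instantiate
    env_key="ANTHROPIC_API_KEY",  # environment variable holding the API key
)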
 
 
 
medrax/rag/rag.py CHANGED
@@ -107,9 +107,7 @@ class CohereRAG:
107
  # Initialize Pinecone
108
  self.pinecone_api_key = os.getenv("PINECONE_API_KEY")
109
  if not self.pinecone_api_key:
110
- raise ValueError(
111
- "PINECONE_API_KEY environment variable not set. Please get a key from app.pinecone.io"
112
- )
113
  self.pinecone = Pinecone(api_key=self.pinecone_api_key)
114
  self.index_name = self.config.pinecone_index_name
115
 
@@ -161,9 +159,7 @@ class CohereRAG:
161
  )
162
 
163
  print(f"Connecting to existing Pinecone index: {self.index_name}")
164
- vectorstore = PineconeVectorStore.from_existing_index(
165
- index_name=self.index_name, embedding=self.embeddings
166
- )
167
 
168
  # Check if the index is empty and needs to be populated
169
  try:
@@ -329,9 +325,7 @@ class CohereRAG:
329
  )
330
  documents.append(doc)
331
 
332
- print(
333
- f"Loaded {len(documents)} document chunks from HuggingFace dataset: {dataset_name}"
334
- )
335
  return documents
336
 
337
  except Exception as e:
 
107
  # Initialize Pinecone
108
  self.pinecone_api_key = os.getenv("PINECONE_API_KEY")
109
  if not self.pinecone_api_key:
110
+ raise ValueError("PINECONE_API_KEY environment variable not set. Please get a key from app.pinecone.io")
 
 
111
  self.pinecone = Pinecone(api_key=self.pinecone_api_key)
112
  self.index_name = self.config.pinecone_index_name
113
 
 
159
  )
160
 
161
  print(f"Connecting to existing Pinecone index: {self.index_name}")
162
+ vectorstore = PineconeVectorStore.from_existing_index(index_name=self.index_name, embedding=self.embeddings)
 
 
163
 
164
  # Check if the index is empty and needs to be populated
165
  try:
 
325
  )
326
  documents.append(doc)
327
 
328
+ print(f"Loaded {len(documents)} document chunks from HuggingFace dataset: {dataset_name}")
 
 
329
  return documents
330
 
331
  except Exception as e:
medrax/tools/browsing/__init__.py CHANGED
@@ -6,8 +6,8 @@ from .web_browser import WebBrowserTool, WebBrowserSchema, SearchQuerySchema, Vi
6
  __all__ = [
7
  "DuckDuckGoSearchTool",
8
  "WebSearchInput",
9
- "WebBrowserTool",
10
  "WebBrowserSchema",
11
  "SearchQuerySchema",
12
- "VisitUrlSchema"
13
- ]
 
6
  __all__ = [
7
  "DuckDuckGoSearchTool",
8
  "WebSearchInput",
9
+ "WebBrowserTool",
10
  "WebBrowserSchema",
11
  "SearchQuerySchema",
12
+ "VisitUrlSchema",
13
+ ]
medrax/tools/browsing/duckduckgo.py CHANGED
@@ -95,18 +95,12 @@ class DuckDuckGoSearchTool(BaseTool):
95
  super().__init__(**kwargs)
96
 
97
  if DDGS is None:
98
- logger.error(
99
- "duckduckgo-search package not installed. Install with: pip install duckduckgo-search"
100
- )
101
- raise ImportError(
102
- "duckduckgo-search package is required for web search functionality"
103
- )
104
 
105
  logger.info("DuckDuckGo search tool initialized successfully")
106
 
107
- def _perform_search_sync(
108
- self, query: str, max_results: int = 5, region: str = "us-en"
109
- ) -> Dict[str, Any]:
110
  """
111
  Perform the actual web search using DuckDuckGo synchronously.
112
 
@@ -118,9 +112,7 @@ class DuckDuckGoSearchTool(BaseTool):
118
  Returns:
119
  Dict[str, Any]: Structured search results.
120
  """
121
- logger.info(
122
- f"Performing web search: '{query}' (max_results={max_results}, region={region})"
123
- )
124
 
125
  try:
126
  # Initialize DDGS with error handling
@@ -158,9 +150,7 @@ class DuckDuckGoSearchTool(BaseTool):
158
  summary = f"No results found for '{query}'"
159
 
160
  # Log successful completion
161
- logger.info(
162
- f"Web search completed successfully: {len(formatted_results)} results"
163
- )
164
 
165
  return {
166
  "query": query,
@@ -217,7 +207,7 @@ class DuckDuckGoSearchTool(BaseTool):
217
 
218
  try:
219
  result = self._perform_search_sync(query, max_results, region)
220
-
221
  # Check if search was successful
222
  if "error" in result:
223
  metadata["analysis_status"] = "failed"
@@ -239,7 +229,7 @@ class DuckDuckGoSearchTool(BaseTool):
239
  }
240
  metadata["analysis_status"] = "failed"
241
  metadata["error_details"] = str(e)
242
-
243
  return error_result, metadata
244
 
245
  async def _arun(
@@ -296,9 +286,7 @@ class DuckDuckGoSearchTool(BaseTool):
296
 
297
  # Use asyncio to run sync search in executor
298
  loop = asyncio.get_event_loop()
299
- result, metadata = await loop.run_in_executor(
300
- None, self._run, query, max_results, region
301
- )
302
 
303
  if writer:
304
  # Parse result to get count for progress update
@@ -333,7 +321,7 @@ class DuckDuckGoSearchTool(BaseTool):
333
  "search_engine": "DuckDuckGo",
334
  "timestamp": datetime.now().isoformat(),
335
  }
336
-
337
  metadata = {
338
  "query": query,
339
  "max_results": max_results,
@@ -344,12 +332,10 @@ class DuckDuckGoSearchTool(BaseTool):
344
  "analysis_status": "failed",
345
  "error_details": str(e),
346
  }
347
-
348
  return error_result, metadata
349
 
350
- def get_search_summary(
351
- self, query: str, max_results: int = 3
352
- ) -> dict[str, str | list[str]]:
353
  """
354
  Get a quick summary of search results for a given query.
355
 
@@ -375,14 +361,7 @@ class DuckDuckGoSearchTool(BaseTool):
375
  results = result.get("results", [])
376
  titles = [r["title"] for r in results]
377
  urls = [r["url"] for r in results]
378
- snippets = [
379
- (
380
- r["snippet"][:100] + "..."
381
- if len(r["snippet"]) > 100
382
- else r["snippet"]
383
- )
384
- for r in results
385
- ]
386
 
387
  return {
388
  "query": query,
 
95
  super().__init__(**kwargs)
96
 
97
  if DDGS is None:
98
+ logger.error("duckduckgo-search package not installed. Install with: pip install duckduckgo-search")
99
+ raise ImportError("duckduckgo-search package is required for web search functionality")
 
 
 
 
100
 
101
  logger.info("DuckDuckGo search tool initialized successfully")
102
 
103
+ def _perform_search_sync(self, query: str, max_results: int = 5, region: str = "us-en") -> Dict[str, Any]:
 
 
104
  """
105
  Perform the actual web search using DuckDuckGo synchronously.
106
 
 
112
  Returns:
113
  Dict[str, Any]: Structured search results.
114
  """
115
+ logger.info(f"Performing web search: '{query}' (max_results={max_results}, region={region})")
 
 
116
 
117
  try:
118
  # Initialize DDGS with error handling
 
150
  summary = f"No results found for '{query}'"
151
 
152
  # Log successful completion
153
+ logger.info(f"Web search completed successfully: {len(formatted_results)} results")
 
 
154
 
155
  return {
156
  "query": query,
 
207
 
208
  try:
209
  result = self._perform_search_sync(query, max_results, region)
210
+
211
  # Check if search was successful
212
  if "error" in result:
213
  metadata["analysis_status"] = "failed"
 
229
  }
230
  metadata["analysis_status"] = "failed"
231
  metadata["error_details"] = str(e)
232
+
233
  return error_result, metadata
234
 
235
  async def _arun(
 
286
 
287
  # Use asyncio to run sync search in executor
288
  loop = asyncio.get_event_loop()
289
+ result, metadata = await loop.run_in_executor(None, self._run, query, max_results, region)
 
 
290
 
291
  if writer:
292
  # Parse result to get count for progress update
 
321
  "search_engine": "DuckDuckGo",
322
  "timestamp": datetime.now().isoformat(),
323
  }
324
+
325
  metadata = {
326
  "query": query,
327
  "max_results": max_results,
 
332
  "analysis_status": "failed",
333
  "error_details": str(e),
334
  }
335
+
336
  return error_result, metadata
337
 
338
+ def get_search_summary(self, query: str, max_results: int = 3) -> dict[str, str | list[str]]:
 
 
339
  """
340
  Get a quick summary of search results for a given query.
341
 
 
361
  results = result.get("results", [])
362
  titles = [r["title"] for r in results]
363
  urls = [r["url"] for r in results]
364
+ snippets = [(r["snippet"][:100] + "..." if len(r["snippet"]) > 100 else r["snippet"]) for r in results]
 
 
 
 
 
 
 
365
 
366
  return {
367
  "query": query,
medrax/tools/browsing/web_browser.py CHANGED
@@ -78,9 +78,7 @@ class WebBrowserTool(BaseTool):
78
  max_results: int = 5
79
  args_schema: Type[BaseModel] = WebBrowserSchema
80
 
81
- def __init__(
82
- self, search_api_key: Optional[str] = None, search_engine_id: Optional[str] = None, **kwargs
83
- ):
84
  """Initialize the web browser tool with optional search API credentials.
85
 
86
  Args:
@@ -145,9 +143,7 @@ class WebBrowserTool(BaseTool):
145
  except Exception as e:
146
  return {"error": f"Search failed: {str(e)}"}
147
 
148
- def visit_url(
149
- self, url: str, max_content_length: int = 5000, max_links: int = 5
150
- ) -> Dict[str, Any]:
151
  """Visit a URL and extract its content with comprehensive parsing.
152
 
153
  Args:
@@ -218,9 +214,7 @@ class WebBrowserTool(BaseTool):
218
  return {
219
  "title": title,
220
  "content": (
221
- text_content[:max_content_length]
222
- if len(text_content) > max_content_length
223
- else text_content
224
  ),
225
  "url": url,
226
  "links": links[:max_links], # Limit to max_links
 
78
  max_results: int = 5
79
  args_schema: Type[BaseModel] = WebBrowserSchema
80
 
81
+ def __init__(self, search_api_key: Optional[str] = None, search_engine_id: Optional[str] = None, **kwargs):
 
 
82
  """Initialize the web browser tool with optional search API credentials.
83
 
84
  Args:
 
143
  except Exception as e:
144
  return {"error": f"Search failed: {str(e)}"}
145
 
146
+ def visit_url(self, url: str, max_content_length: int = 5000, max_links: int = 5) -> Dict[str, Any]:
 
 
147
  """Visit a URL and extract its content with comprehensive parsing.
148
 
149
  Args:
 
214
  return {
215
  "title": title,
216
  "content": (
217
+ text_content[:max_content_length] if len(text_content) > max_content_length else text_content
 
 
218
  ),
219
  "url": url,
220
  "links": links[:max_links], # Limit to max_links
medrax/tools/classification/__init__.py CHANGED
@@ -3,9 +3,4 @@
3
  from .torchxrayvision import TorchXRayVisionClassifierTool, TorchXRayVisionInput
4
  from .arcplus import ArcPlusClassifierTool, ArcPlusInput
5
 
6
- __all__ = [
7
- "TorchXRayVisionClassifierTool",
8
- "TorchXRayVisionInput",
9
- "ArcPlusClassifierTool",
10
- "ArcPlusInput"
11
- ]
 
3
  from .torchxrayvision import TorchXRayVisionClassifierTool, TorchXRayVisionInput
4
  from .arcplus import ArcPlusClassifierTool, ArcPlusInput
5
 
6
+ __all__ = ["TorchXRayVisionClassifierTool", "TorchXRayVisionInput", "ArcPlusClassifierTool", "ArcPlusInput"]
 
 
 
 
 
medrax/tools/classification/arcplus.py CHANGED
@@ -38,9 +38,7 @@ class OmniSwinTransformer(SwinTransformer):
38
 
39
  self.omni_heads = []
40
  for num_classes in num_classes_list:
41
- self.omni_heads.append(
42
- nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()
43
- )
44
  self.omni_heads = nn.ModuleList(self.omni_heads)
45
 
46
  def forward(self, x, head_n=None):
@@ -62,9 +60,7 @@ class OmniSwinTransformer(SwinTransformer):
62
  class ArcPlusInput(BaseModel):
63
  """Input for ArcPlus chest X-ray analysis tool. Only supports JPG or PNG images."""
64
 
65
- image_path: str = Field(
66
- ..., description="Path to the radiology image file, only supports JPG or PNG images"
67
- )
68
 
69
 
70
  class ArcPlusClassifierTool(BaseTool):
@@ -249,11 +245,7 @@ class ArcPlusClassifierTool(BaseTool):
249
 
250
  # Remove "module." prefix if present (improved logic from example)
251
  if any([True if "module." in k else False for k in state_dict.keys()]):
252
- state_dict = {
253
- k.replace("module.", ""): v
254
- for k, v in state_dict.items()
255
- if k.startswith("module.")
256
- }
257
 
258
  # Load the model weights
259
  msg = self.model.load_state_dict(state_dict, strict=False)
@@ -342,14 +334,10 @@ class ArcPlusClassifierTool(BaseTool):
342
 
343
  # Map predictions to disease names
344
  if len(predictions) != len(self.disease_list):
345
- print(
346
- f"Warning: Expected {len(self.disease_list)} predictions, got {len(predictions)}"
347
- )
348
  # Pad or truncate as needed
349
  if len(predictions) < len(self.disease_list):
350
- predictions = np.pad(
351
- predictions, (0, len(self.disease_list) - len(predictions))
352
- )
353
  else:
354
  predictions = predictions[: len(self.disease_list)]
355
 
 
38
 
39
  self.omni_heads = []
40
  for num_classes in num_classes_list:
41
+ self.omni_heads.append(nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())
 
 
42
  self.omni_heads = nn.ModuleList(self.omni_heads)
43
 
44
  def forward(self, x, head_n=None):
 
60
  class ArcPlusInput(BaseModel):
61
  """Input for ArcPlus chest X-ray analysis tool. Only supports JPG or PNG images."""
62
 
63
+ image_path: str = Field(..., description="Path to the radiology image file, only supports JPG or PNG images")
 
 
64
 
65
 
66
  class ArcPlusClassifierTool(BaseTool):
 
245
 
246
  # Remove "module." prefix if present (improved logic from example)
247
  if any([True if "module." in k else False for k in state_dict.keys()]):
248
+ state_dict = {k.replace("module.", ""): v for k, v in state_dict.items() if k.startswith("module.")}
 
 
 
 
249
 
250
  # Load the model weights
251
  msg = self.model.load_state_dict(state_dict, strict=False)
 
334
 
335
  # Map predictions to disease names
336
  if len(predictions) != len(self.disease_list):
337
+ print(f"Warning: Expected {len(self.disease_list)} predictions, got {len(predictions)}")
 
 
338
  # Pad or truncate as needed
339
  if len(predictions) < len(self.disease_list):
340
+ predictions = np.pad(predictions, (0, len(self.disease_list) - len(predictions)))
 
 
341
  else:
342
  predictions = predictions[: len(self.disease_list)]
343
 
medrax/tools/classification/torchxrayvision.py CHANGED
@@ -19,9 +19,7 @@ from medrax.utils.utils import preprocess_medical_image
19
  class TorchXRayVisionInput(BaseModel):
20
  """Input for TorchXRayVision chest X-ray analysis tools. Only supports JPG or PNG images."""
21
 
22
- image_path: str = Field(
23
- ..., description="Path to the radiology image file, only supports JPG or PNG images"
24
- )
25
 
26
 
27
  class TorchXRayVisionClassifierTool(BaseTool):
 
19
  class TorchXRayVisionInput(BaseModel):
20
  """Input for TorchXRayVision chest X-ray analysis tools. Only supports JPG or PNG images."""
21
 
22
+ image_path: str = Field(..., description="Path to the radiology image file, only supports JPG or PNG images")
 
 
23
 
24
 
25
  class TorchXRayVisionClassifierTool(BaseTool):
medrax/tools/dicom.py CHANGED
@@ -14,9 +14,7 @@ class DicomProcessorInput(BaseModel):
     """Input schema for the DICOM Processor Tool."""
 
     dicom_path: str = Field(..., description="Path to the DICOM file")
-    window_center: Optional[float] = Field(
-        None, description="Window center for contrast adjustment"
-    )
+    window_center: Optional[float] = Field(None, description="Window center for contrast adjustment")
     window_width: Optional[float] = Field(None, description="Window width for contrast adjustment")
 
 
medrax/tools/grounding.py CHANGED
@@ -90,11 +90,8 @@ class XRayPhraseGroundingTool(BaseTool):
             trust_remote_code=True,
             quantization_config=quantization_config,
         )
-        self.processor = AutoProcessor.from_pretrained(
-            model_path, cache_dir=cache_dir, trust_remote_code=True
-        )
+        self.processor = AutoProcessor.from_pretrained(model_path, cache_dir=cache_dir, trust_remote_code=True)
 
-
         self.model = self.model.eval()
 
         self.temp_dir = Path(temp_dir if temp_dir else tempfile.mkdtemp())
@@ -176,12 +173,8 @@ class XRayPhraseGroundingTool(BaseTool):
         )
 
         prompt_length = inputs["input_ids"].shape[-1]
-        decoded_text = self.processor.decode(
-            output[0][prompt_length:], skip_special_tokens=True
-        )
-        predictions = self.processor.convert_output_to_plaintext_or_grounded_sequence(
-            decoded_text
-        )
+        decoded_text = self.processor.decode(output[0][prompt_length:], skip_special_tokens=True)
+        predictions = self.processor.convert_output_to_plaintext_or_grounded_sequence(decoded_text)
 
         metadata = {
             "image_path": image_path,
@@ -208,9 +201,7 @@ class XRayPhraseGroundingTool(BaseTool):
         # Convert model bboxes to list format and get original image bboxes
         model_bboxes = [list(bbox) for bbox in pred_bboxes]
         original_bboxes = [
-            self.processor.adjust_box_for_original_image_size(
-                bbox, width=image.size[0], height=image.size[1]
-            )
+            self.processor.adjust_box_for_original_image_size(bbox, width=image.size[0], height=image.size[1])
             for bbox in model_bboxes
         ]
 
medrax/tools/rag.py CHANGED
@@ -14,7 +14,7 @@ class RAGTool(BaseTool):
 
     The knowledge base includes:
     - Medical textbooks and reference materials
-    - Research papers and clinical studies
+    - Research papers and clinical studies
     - Medical manuals and guidelines
     - Specialized medical literature
 
medrax/tools/report_generation.py CHANGED
@@ -23,9 +23,7 @@ from transformers import (
 class ChestXRayInput(BaseModel):
     """Input for chest X-ray analysis tools. Only supports JPG or PNG images."""
 
-    image_path: str = Field(
-        ..., description="Path to the radiology image file, only supports JPG or PNG images"
-    )
+    image_path: str = Field(..., description="Path to the radiology image file, only supports JPG or PNG images")
 
 
 class ChestXRayReportGeneratorTool(BaseTool):
@@ -180,12 +178,8 @@ class ChestXRayReportGeneratorTool(BaseTool):
         """
         try:
             # Process image for both models
-            findings_pixels = self._process_image(
-                image_path, self.findings_processor, self.findings_model
-            )
-            impression_pixels = self._process_image(
-                image_path, self.impression_processor, self.impression_model
-            )
+            findings_pixels = self._process_image(image_path, self.findings_processor, self.findings_model)
+            impression_pixels = self._process_image(image_path, self.impression_processor, self.impression_model)
 
             # Generate both sections
             with torch.inference_mode():
@@ -197,11 +191,7 @@ class ChestXRayReportGeneratorTool(BaseTool):
             )
 
             # Combine into formatted report
-            report = (
-                "CHEST X-RAY REPORT\n\n"
-                f"FINDINGS:\n{findings_text}\n\n"
-                f"IMPRESSION:\n{impression_text}"
-            )
+            report = "CHEST X-RAY REPORT\n\n" f"FINDINGS:\n{findings_text}\n\n" f"IMPRESSION:\n{impression_text}"
 
             output = {
                 "report": report,
medrax/tools/segmentation/__init__.py CHANGED
@@ -3,10 +3,4 @@
 from .segmentation import ChestXRaySegmentationTool, ChestXRaySegmentationInput, OrganMetrics
 from .medsam2 import MedSAM2Tool, MedSAM2Input
 
-__all__ = [
-    "ChestXRaySegmentationTool",
-    "ChestXRaySegmentationInput",
-    "OrganMetrics",
-    "MedSAM2Tool",
-    "MedSAM2Input"
-]
+__all__ = ["ChestXRaySegmentationTool", "ChestXRaySegmentationInput", "OrganMetrics", "MedSAM2Tool", "MedSAM2Input"]
medrax/tools/segmentation/medsam2.py CHANGED
@@ -26,7 +26,6 @@ from hydra import initialize_config_dir
 from hydra.core.global_hydra import GlobalHydra
 
 
-
 class MedSAM2Input(BaseModel):
     """Input schema for the MedSAM2 Tool."""
 
@@ -47,7 +46,7 @@ class MedSAM2Input(BaseModel):
 
 class MedSAM2Tool(BaseTool):
     """Advanced medical image segmentation tool using MedSAM2.
-    
+
     This tool provides state-of-the-art medical image segmentation capabilities using
     the MedSAM2 model, which is specifically adapted for medical imaging from Meta's SAM2.
     Supports interactive prompting with boxes, points, or automatic segmentation.
@@ -92,22 +91,17 @@ class MedSAM2Tool(BaseTool):
             # This works around the issue with initialize_config_module in sam2
             if GlobalHydra.instance().is_initialized():
                 GlobalHydra.instance().clear()
-            
+
             config_dir = Path(__file__).parent.parent.parent.parent / "MedSAM2" / "sam2" / "configs"
             initialize_config_dir(config_dir=str(config_dir), version_base="1.2")
-            
+
             hf_hub_download(
-                repo_id=model_path,
-                filename=model_file,
-                local_dir=self.cache_dir,
-                local_dir_use_symlinks=False
+                repo_id=model_path, filename=model_file, local_dir=self.cache_dir, local_dir_use_symlinks=False
             )
 
-            config_path = model_cfg.replace('.yaml', '')
+            config_path = model_cfg.replace(".yaml", "")
             sam2_model = build_sam2(config_path, str(self.cache_dir / model_file), device=device)
             self.predictor = SAM2ImagePredictor(sam2_model)
-            
-            print(f"MedSAM2 model loaded successfully on {device}")
 
         except Exception as e:
             raise RuntimeError(f"Failed to initialize MedSAM2: {str(e)}")
@@ -116,10 +110,10 @@ class MedSAM2Tool(BaseTool):
         """Load and preprocess image for medical analysis."""
         try:
             # Handle different image formats
-            if image_path.lower().endswith('.dcm'):
+            if image_path.lower().endswith(".dcm"):
                 # DICOM files - would need DICOM processor
                 raise ValueError("DICOM files not directly supported. Please convert to standard image format first.")
-            
+
             # Load standard image formats
             image = Image.open(image_path)
 
@@ -131,29 +125,29 @@ class MedSAM2Tool(BaseTool):
                 image = Image.fromarray(img_normalized, mode='L')
 
             # For medical images, convert to grayscale first if needed, then to RGB
-            if image.mode == 'L':  # Grayscale
+            if image.mode == "L":  # Grayscale
                 # Convert grayscale to RGB for SAM2
-                image = image.convert('RGB')
-            elif image.mode != 'RGB':
-                if image.mode == 'RGBA':
+                image = image.convert("RGB")
+            elif image.mode != "RGB":
+                if image.mode == "RGBA":
                     # Create white background for RGBA
-                    background = Image.new('RGB', image.size, (255, 255, 255))
+                    background = Image.new("RGB", image.size, (255, 255, 255))
                     background.paste(image, mask=image.split()[-1])
                     image = background
                 else:
-                    image = image.convert('RGB')
-            
+                    image = image.convert("RGB")
+
             # Convert to numpy array
             image_np = np.array(image)
-            
+
             # Ensure image is in proper range [0, 255]
             if image_np.max() <= 1.0:
                 image_np = (image_np * 255).astype(np.uint8)
             else:
                 image_np = image_np.astype(np.uint8)
-            
+
             return image_np
-            
+
         except Exception as e:
             raise ValueError(f"Failed to load image {image_path}: {str(e)}")
 
@@ -161,55 +155,53 @@ class MedSAM2Tool(BaseTool):
         """Process and validate prompts."""
         if prompt_type == "auto":
             return None, None, None
-        
+
         if prompt_coords is None:
             if prompt_type != "auto":
                 raise ValueError(f"Prompt coordinates required for prompt type '{prompt_type}'")
             return None, None, None
-        
+
         if prompt_type == "box":
             if len(prompt_coords) != 4:
                 raise ValueError("Box prompt requires 4 coordinates: [x1,y1,x2,y2]")
-            
+
             x1, y1, x2, y2 = prompt_coords
             # Validate coordinates
             if x1 >= x2 or y1 >= y2:
                 raise ValueError("Invalid box coordinates: x1 < x2 and y1 < y2 required")
-            
+
             input_box = np.array([[x1, y1, x2, y2]])
             return input_box, None, None
-        
+
         elif prompt_type == "point":
             if len(prompt_coords) != 2:
                 raise ValueError("Point prompt requires 2 coordinates: [x,y]")
-            
+
             x, y = prompt_coords
             input_point = np.array([[x, y]])
             input_label = np.array([1])  # Positive point
             return None, input_point, input_label
-        
+
         else:
             raise ValueError(f"Unknown prompt type: {prompt_type}")
 
     def _create_visualization(self, image: np.ndarray, masks: np.ndarray, prompt_info: Dict) -> str:
         """Create visualization of segmentation results."""
         plt.figure(figsize=(10, 10))
-        
+
         # Convert RGB image to grayscale for background display
         if len(image.shape) == 3:
             # Convert RGB to grayscale using standard luminance formula
-            gray_image = 0.299 * image[:,:,0] + 0.587 * image[:,:,1] + 0.114 * image[:,:,2]
+            gray_image = 0.299 * image[:, :, 0] + 0.587 * image[:, :, 1] + 0.114 * image[:, :, 2]
         else:
             gray_image = image
-        
+
         # Display grayscale background
-        plt.imshow(
-            gray_image, cmap="gray", extent=[0, image.shape[1], image.shape[0], 0]
-        )
-        
+        plt.imshow(gray_image, cmap="gray", extent=[0, image.shape[1], image.shape[0], 0])
+
         # Generate color palette for multiple masks
         colors = plt.cm.rainbow(np.linspace(0, 1, len(masks)))
-        
+
         # Process and overlay each mask
         for idx, (mask, color) in enumerate(zip(masks, colors)):
             if mask.sum() > 0:
@@ -217,33 +209,31 @@ class MedSAM2Tool(BaseTool):
                 mask_bool = mask.astype(bool)
                 colored_mask = np.zeros((*mask_bool.shape, 4))
                 colored_mask[mask_bool] = (*color[:3], 0.3)  # 30% transparency like segmentation tool
-                plt.imshow(
-                    colored_mask, extent=[0, image.shape[1], image.shape[0], 0]
-                )
-                
+                plt.imshow(colored_mask, extent=[0, image.shape[1], image.shape[0], 0])
+
                 # Add legend entry for each mask
                 mask_label = f"Mask {idx + 1} (score: {prompt_info.get('scores', [0])[idx] if idx < len(prompt_info.get('scores', [])) else 0:.3f})"
                 plt.plot([], [], color=color, label=mask_label, linewidth=3)
-        
+
         # Add prompt visualization with consistent styling
-        if prompt_info.get('box') is not None:
-            box = prompt_info['box'][0]
+        if prompt_info.get("box") is not None:
+            box = prompt_info["box"][0]
             x1, y1, x2, y2 = box
-            plt.plot([x1, x2, x2, x1, x1], [y1, y1, y2, y2, y1], 'g-', linewidth=2, label='Box Prompt')
-        
-        if prompt_info.get('point') is not None:
-            point = prompt_info['point'][0]
-            plt.plot(point[0], point[1], 'go', markersize=10, label='Point Prompt')
-        
+            plt.plot([x1, x2, x2, x1, x1], [y1, y1, y2, y2, y1], "g-", linewidth=2, label="Box Prompt")
+
+        if prompt_info.get("point") is not None:
+            point = prompt_info["point"][0]
+            plt.plot(point[0], point[1], "go", markersize=10, label="Point Prompt")
+
         plt.title("Segmentation Overlay")
         plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
         plt.axis("off")
-        
+
         # Save visualization with higher DPI like segmentation tool
         viz_path = self.temp_dir / f"medsam2_result_{uuid.uuid4().hex[:8]}.png"
-        plt.savefig(viz_path, bbox_inches='tight', dpi=300)
+        plt.savefig(viz_path, bbox_inches="tight", dpi=300)
         plt.close()
-        
+
         return str(viz_path)
 
     def _run(
@@ -258,28 +248,28 @@ class MedSAM2Tool(BaseTool):
         try:
             # Load image
             image = self._load_image(image_path)
-            
+
             # Set image for predictor
            self.predictor.set_image(image)
-            
+
             # Process prompts
-            input_box, input_point, input_label = self._process_prompts(
-                prompt_type, prompt_coords, image.shape[:2]
-            )
-            
+            input_box, input_point, input_label = self._process_prompts(prompt_type, prompt_coords, image.shape[:2])
+
             # Run inference
             if prompt_type == "auto":
                 # For auto segmentation, try multiple approaches and select best result
                 h, w = image.shape[:2]
-                
+
                 # Try multiple points in key areas for medical images
-                sample_points = np.array([
-                    [w//3, h//3],      # Upper left lung area
-                    [2*w//3, h//3],    # Upper right lung area
-                    [w//2, 2*h//3],    # Lower center area
-                ])
+                sample_points = np.array(
+                    [
+                        [w // 3, h // 3],  # Upper left lung area
+                        [2 * w // 3, h // 3],  # Upper right lung area
+                        [w // 2, 2 * h // 3],  # Lower center area
+                    ]
+                )
                 sample_labels = np.array([1, 1, 1])  # All positive points
-                
+
                 masks, scores, logits = self.predictor.predict(
                     point_coords=sample_points,
                     point_labels=sample_labels,
@@ -292,29 +282,29 @@ class MedSAM2Tool(BaseTool):
                     box=input_box,
                     multimask_output=True,
                 )
-            
+
             # Create visualization
             prompt_info = {
-                'box': input_box,
-                'point': input_point,
-                'type': prompt_type,
-                'scores': scores  # Add scores for legend display
+                "box": input_box,
+                "point": input_point,
+                "type": prompt_type,
+                "scores": scores,  # Add scores for legend display
             }
             viz_path = self._create_visualization(image, masks, prompt_info)
-            
+
             # Create output dictionary (main results)
             output = {
                 "segmentation_image_path": viz_path,
-                "confidence_scores": scores.tolist() if hasattr(scores, 'tolist') else list(scores),
+                "confidence_scores": scores.tolist() if hasattr(scores, "tolist") else list(scores),
                 "num_masks": len(masks),
                 "best_mask_score": float(scores[0]) if len(scores) > 0 else 0.0,
                 "mask_summary": {
                     "total_masks": len(masks),
                     "mask_shapes": [list(mask.shape) for mask in masks],
-                    "segmented_area_pixels": [int(mask.sum()) for mask in masks]
+                    "segmented_area_pixels": [int(mask.sum()) for mask in masks],
                 },
             }
-            
+
             # Create metadata dictionary
             metadata = {
                 "image_path": image_path,
@@ -326,9 +316,9 @@ class MedSAM2Tool(BaseTool):
                 "num_masks_generated": len(masks),
                 "analysis_status": "completed",
             }
-            
+
             return output, metadata
-            
+
         except Exception as e:
             error_output = {"error": str(e)}
             error_metadata = {
@@ -347,4 +337,4 @@ class MedSAM2Tool(BaseTool):
         run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
     ) -> Tuple[Dict[str, Any], Dict]:
         """Async version of _run."""
-        return self._run(image_path, prompt_type, prompt_coords, slice_index, run_manager)
+        return self._run(image_path, prompt_type, prompt_coords, slice_index, run_manager)
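
For orientation, a minimal usage sketch of the refactored tool follows; the constructor arguments (cache_dir, temp_dir, device) are assumptions made for illustration, while the _run parameters and output keys come from the diff above.

    # Hypothetical driver for MedSAM2Tool with a box prompt (a sketch, not part of this commit)
    from medrax.tools.segmentation import MedSAM2Tool

    tool = MedSAM2Tool(cache_dir="model_weights", temp_dir="temp", device="cuda")  # constructor args assumed
    output, metadata = tool._run(
        image_path="example_cxr.png",
        prompt_type="box",
        prompt_coords=[100, 100, 400, 400],  # x1, y1, x2, y2 in pixel coordinates
    )
    print(output["segmentation_image_path"], output["best_mask_score"])
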
medrax/tools/segmentation/segmentation.py CHANGED
@@ -43,9 +43,7 @@ class OrganMetrics(BaseModel):
     area_pixels: int = Field(..., description="Area in pixels")
     area_cm2: float = Field(..., description="Approximate area in cm²")
     centroid: Tuple[float, float] = Field(..., description="(y, x) coordinates of centroid")
-    bbox: Tuple[int, int, int, int] = Field(
-        ..., description="Bounding box coordinates (min_y, min_x, max_y, max_x)"
-    )
+    bbox: Tuple[int, int, int, int] = Field(..., description="Bounding box coordinates (min_y, min_x, max_y, max_x)")
 
     # Size metrics
     width: int = Field(..., description="Width of the organ in pixels")
@@ -53,9 +51,7 @@ class OrganMetrics(BaseModel):
     aspect_ratio: float = Field(..., description="Height/width ratio")
 
     # Position metrics
-    relative_position: Dict[str, float] = Field(
-        ..., description="Position relative to image boundaries (0-1 scale)"
-    )
+    relative_position: Dict[str, float] = Field(..., description="Position relative to image boundaries (0-1 scale)")
 
     # Analysis metrics
     mean_intensity: float = Field(..., description="Mean pixel intensity in the organ region")
@@ -92,9 +88,7 @@ class ChestXRaySegmentationTool(BaseTool):
         self.model = self.model.to(self.device)
         self.model.eval()
 
-        self.transform = torchvision.transforms.Compose(
-            [xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(512)]
-        )
+        self.transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(512)])
 
         self.temp_dir = temp_dir if isinstance(temp_dir, Path) else Path(temp_dir)
         self.temp_dir.mkdir(exist_ok=True)
@@ -117,9 +111,7 @@ class ChestXRaySegmentationTool(BaseTool):
         "Spine": 13,
     }
 
-    def _align_mask_to_original(
-        self, mask: np.ndarray, original_shape: Tuple[int, int]
-    ) -> np.ndarray:
+    def _align_mask_to_original(self, mask: np.ndarray, original_shape: Tuple[int, int]) -> np.ndarray:
         """
         Align a mask from the transformed (cropped/resized) space back to the full original image.
         Assumes that the transform does a center crop to a square of side = min(original height, width)
@@ -172,23 +164,17 @@ class ChestXRaySegmentationTool(BaseTool):
             bbox=tuple(map(int, props.bbox)),
             width=int(props.bbox[3] - props.bbox[1]),
             height=int(props.bbox[2] - props.bbox[0]),
-            aspect_ratio=float(
-                (props.bbox[2] - props.bbox[0]) / max(1, props.bbox[3] - props.bbox[1])
-            ),
+            aspect_ratio=float((props.bbox[2] - props.bbox[0]) / max(1, props.bbox[3] - props.bbox[1])),
             relative_position=relative_pos,
             mean_intensity=float(mean_intensity),
             std_intensity=float(std_intensity),
            confidence_score=float(confidence),
         )
 
-    def _save_visualization(
-        self, original_img: np.ndarray, pred_masks: torch.Tensor, organ_indices: List[int]
-    ) -> str:
+    def _save_visualization(self, original_img: np.ndarray, pred_masks: torch.Tensor, organ_indices: List[int]) -> str:
         """Save visualization of original image with segmentation masks overlaid."""
         plt.figure(figsize=(10, 10))
-        plt.imshow(
-            original_img, cmap="gray", extent=[0, original_img.shape[1], original_img.shape[0], 0]
-        )
+        plt.imshow(original_img, cmap="gray", extent=[0, original_img.shape[1], original_img.shape[0], 0])
 
         # Generate color palette for organs
         colors = plt.cm.rainbow(np.linspace(0, 1, len(organ_indices)))
@@ -204,14 +190,10 @@ class ChestXRaySegmentationTool(BaseTool):
             # Create a colored overlay with transparency
             colored_mask = np.zeros((*original_img.shape, 4))
             colored_mask[mask > 0] = (*color[:3], 0.3)
-            plt.imshow(
-                colored_mask, extent=[0, original_img.shape[1], original_img.shape[0], 0]
-            )
+            plt.imshow(colored_mask, extent=[0, original_img.shape[1], original_img.shape[0], 0])
 
             # Add legend entry for the organ
-            organ_name = list(self.organ_map.keys())[
-                list(self.organ_map.values()).index(organ_idx)
-            ]
+            organ_name = list(self.organ_map.keys())[list(self.organ_map.values()).index(organ_idx)]
             plt.plot([], [], color=color, label=organ_name, linewidth=3)
 
         plt.title("Segmentation Overlay")
@@ -269,9 +251,7 @@ class ChestXRaySegmentationTool(BaseTool):
         for idx, organ_name in zip(organ_indices, organs):
             mask = pred_masks[0, idx].cpu().numpy()
             if mask.sum() > 0:
-                metrics = self._compute_organ_metrics(
-                    mask, original_img, float(pred_probs[0, idx].mean().cpu())
-                )
+                metrics = self._compute_organ_metrics(mask, original_img, float(pred_probs[0, idx].mean().cpu()))
                 if metrics:
                     results[organ_name] = metrics
 
medrax/tools/utils.py CHANGED
@@ -16,18 +16,10 @@ class ImageVisualizerInput(BaseModel):
 
     image_path: str = Field(..., description="Path to the image file to display, only supports JPG or PNG images")
     title: Optional[str] = Field(None, description="Optional title to display above the image")
-    description: Optional[str] = Field(
-        None, description="Optional description to display below the image"
-    )
-    width: Optional[int] = Field(
-        10, description="Optional figure width in inches"
-    )
-    height: Optional[int] = Field(
-        10, description="Optional figure height in inches"
-    )
-    cmap: Optional[str] = Field(
-        "rgb", description="Optional colormap to use for displaying the image"
-    )
+    description: Optional[str] = Field(None, description="Optional description to display below the image")
+    width: Optional[int] = Field(10, description="Optional figure width in inches")
+    height: Optional[int] = Field(10, description="Optional figure height in inches")
+    cmap: Optional[str] = Field("rgb", description="Optional colormap to use for displaying the image")
 
 
 class ImageVisualizerTool(BaseTool):
@@ -65,9 +57,7 @@ class ImageVisualizerTool(BaseTool):
 
         # Add description if provided
         if description:
-            plt.figtext(
-                0.5, 0.01, description, wrap=True, horizontalalignment="center", fontsize=10
-            )
+            plt.figtext(0.5, 0.01, description, wrap=True, horizontalalignment="center", fontsize=10)
 
         # Adjust margins to minimize whitespace while preventing overlap
         plt.subplots_adjust(top=0.95, bottom=0.05, left=0.05, right=0.95)
medrax/tools/vqa/__init__.py CHANGED
@@ -1,16 +1,16 @@
 """Visual Question Answering tools for medical images."""
 
 from .llava_med import LlavaMedTool, LlavaMedInput
-from .xray_vqa import CheXagentXRayVQATool, XRayVQAToolInput
+from .xray_vqa import CheXagentXRayVQATool, XRayVQAToolInput
 from .medgemma.medgemma_client import MedGemmaAPIClientTool, MedGemmaVQAInput
 from .medgemma.medgemma_setup import setup_medgemma_env
 
 __all__ = [
     "LlavaMedTool",
     "LlavaMedInput",
-    "CheXagentXRayVQATool",
+    "CheXagentXRayVQATool",
     "XRayVQAToolInput",
     "MedGemmaAPIClientTool",
     "MedGemmaVQAInput",
-    "setup_medgemma_env"
-]
+    "setup_medgemma_env",
+]
medrax/tools/vqa/llava_med.py CHANGED
@@ -84,13 +84,7 @@ class LlavaMedTool(BaseTool):
         self, question: str, image_path: Optional[str] = None
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         if self.model.config.mm_use_im_start_end:
-            question = (
-                DEFAULT_IM_START_TOKEN
-                + DEFAULT_IMAGE_TOKEN
-                + DEFAULT_IM_END_TOKEN
-                + "\n"
-                + question
-            )
+            question = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + question
         else:
             question = DEFAULT_IMAGE_TOKEN + "\n" + question
 
@@ -100,9 +94,7 @@ class LlavaMedTool(BaseTool):
         prompt = conv.get_prompt()
 
         input_ids = (
-            tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
-            .unsqueeze(0)
-            .cuda()
+            tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
         )
 
         image_tensor = None
@@ -156,11 +148,11 @@ class LlavaMedTool(BaseTool):
         )
 
         answer = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
-        
+
         output = {
             "answer": answer,
         }
-        
+
         metadata = {
             "question": question,
             "image_path": image_path,
medrax/tools/vqa/medgemma/medgemma.py CHANGED
@@ -1,7 +1,6 @@
 import asyncio
 import os
 from pathlib import Path
-import sys
 import traceback
 from typing import Any, Dict, List, Optional, Tuple
 import uuid
@@ -22,6 +21,7 @@ UPLOAD_DIR = "./medgemma_images"
 # Create directories if they don't exist
 os.makedirs(UPLOAD_DIR, exist_ok=True)
 
+
 # Pydantic Models for API
 class VQAInput(BaseModel):
     """Input schema for the MedGemma VQA API endpoint.
@@ -100,7 +100,7 @@ class MedGemmaModel:
         device: Optional[str] = "cuda",
         dtype: torch.dtype = torch.bfloat16,
         cache_dir: Optional[str] = None,
-        load_in_4bit: bool = True,
+        load_in_8bit: bool = True,
         **kwargs: Any,
     ) -> None:
         """Initialize the MedGemmaModel.
@@ -110,7 +110,7 @@ class MedGemmaModel:
             device: Device to run model on - "cuda" or "cpu" (default: "cuda")
             dtype: Data type for model weights - bfloat16 recommended for efficiency (default: torch.bfloat16)
             cache_dir: Directory to cache downloaded models (default: None)
-            load_in_4bit: Whether to load model in 4-bit quantization for memory efficiency (default: True)
+            load_in_8bit: Whether to load model in 8-bit quantization for memory efficiency (default: True)
            **kwargs: Additional arguments passed to the model pipeline
 
        Raises:
@@ -140,8 +140,8 @@ class MedGemmaModel:
             "use_cache": True,
         }
 
-        if load_in_4bit:
-            model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_4bit=True)
+        if load_in_8bit:
+            model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
             model_kwargs["device_map"] = {"": self.device}
 
         try:
@@ -298,6 +298,12 @@ app = FastAPI(
 )
 
 medgemma_model: Optional[MedGemmaModel] = None
+inference_semaphore: Optional[asyncio.Semaphore] = None
+
+@app.get("/health")
+async def health():
+    """Health check endpoint."""
+    return {"status": "ok"}
 
 @app.on_event("startup")
 async def startup_event():
@@ -316,7 +322,32 @@ async def startup_event():
     """
     global medgemma_model
     try:
-        medgemma_model = MedGemmaModel()
+        # Allow overriding Hugging Face cache directory and device via env vars
+        cache_dir_env = os.getenv("MEDGEMMA_CACHE_DIR")
+        device_env = os.getenv("MEDGEMMA_DEVICE")
+        max_concurrency_env = os.getenv("MEDGEMMA_MAX_CONCURRENCY", "1")
+
+        # Ensure the cache directory is writable; if not, fall back to a user cache
+        if cache_dir_env:
+            try:
+                os.makedirs(cache_dir_env, exist_ok=True)
+                if not os.access(cache_dir_env, os.W_OK):
+                    raise PermissionError("Cache dir not writable")
+            except Exception:
+                fallback = os.path.join(Path.home(), ".cache", "medrax", "medgemma")
+                os.makedirs(fallback, exist_ok=True)
+                print(f"Warning: MEDGEMMA_CACHE_DIR '{cache_dir_env}' not writable. Falling back to '{fallback}'.")
+                cache_dir_env = fallback
+
+        medgemma_model = MedGemmaModel(cache_dir=cache_dir_env, device=device_env)
+        # Initialize concurrency gate
+        try:
+            max_concurrency = int(max_concurrency_env)
+        except ValueError:
+            max_concurrency = 1
+        max_concurrency = max(1, max_concurrency)
+        global inference_semaphore
+        inference_semaphore = asyncio.Semaphore(max_concurrency)
         print("MedGemma model loaded successfully.")
     except RuntimeError as e:
         print(f"Error loading MedGemma model: {e}")
@@ -389,8 +420,12 @@ async def analyze_images(
         raise HTTPException(status_code=500, detail=f"Failed to save uploaded image: {str(e)}")
 
     try:
-        # Generate AI analysis
-        response_text = await medgemma_model.aget_response(image_paths, prompt, system_prompt, max_new_tokens)
+        # Generate AI analysis with concurrency gating to avoid GPU contention timeouts
+        global inference_semaphore
+        if inference_semaphore is None:
+            inference_semaphore = asyncio.Semaphore(1)
+        async with inference_semaphore:
+            response_text = await medgemma_model.aget_response(image_paths, prompt, system_prompt, max_new_tokens)
 
         # Prepare success response
         metadata = {
@@ -428,7 +463,12 @@ async def analyze_images(
 if __name__ == "__main__":
     """Launch the MedGemma VQA API server.
 
-    Starts the FastAPI application with uvicorn server, binding to all
-    network interfaces on port 8002.
+    Reads MEDGEMMA_HOST and MEDGEMMA_PORT if provided; otherwise defaults
+    to 0.0.0.0:8002.
     """
-    uvicorn.run(app, host="0.0.0.0", port=8002)
+    host = os.getenv("MEDGEMMA_HOST", "0.0.0.0")
+    try:
+        port = int(os.getenv("MEDGEMMA_PORT", "8002"))
+    except ValueError:
+        port = 8002
+    uvicorn.run(app, host=host, port=port)
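
With the server now reading MEDGEMMA_HOST, MEDGEMMA_PORT, MEDGEMMA_CACHE_DIR, MEDGEMMA_DEVICE, and MEDGEMMA_MAX_CONCURRENCY from the environment and exposing /health, a quick smoke test might look like the sketch below; the host and port values are assumptions and should be whatever the service was actually launched with.

    # Hypothetical smoke test against the reworked service (host/port values are assumptions)
    import os
    import httpx

    base_url = f"http://{os.getenv('MEDGEMMA_HOST', '127.0.0.1')}:{os.getenv('MEDGEMMA_PORT', '8002')}"
    resp = httpx.get(f"{base_url}/health", timeout=5.0)
    print(resp.json())  # expected: {"status": "ok"} once startup_event has finished loading the model
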
medrax/tools/vqa/medgemma/medgemma_client.py CHANGED
@@ -59,15 +59,21 @@ class MedGemmaAPIClientTool(BaseTool):
 
     # API configuration
     api_url: str  # The URL of the running FastAPI service
+    cache_dir: Optional[str] = None  # Not used by the client directly, but accepted to keep a uniform constructor
+    device: Optional[str] = None
 
-    def __init__(self, api_url: str, **kwargs: Any):
+    def __init__(self, api_url: str, cache_dir: Optional[str] = None, device: Optional[str] = None, timeout_seconds: Optional[float] = None, **kwargs: Any):
         """Initialize the MedGemmaAPIClientTool.
 
         Args:
             api_url: The URL of the running MedGemma FastAPI service
+            cache_dir: Optional local cache directory for model weights (accepted for interface consistency)
+            device: Optional device spec (accepted for interface consistency)
+            timeout_seconds: Optional request timeout override (seconds)
             **kwargs: Additional arguments passed to BaseTool
         """
-        super().__init__(api_url=api_url, **kwargs)
+        super().__init__(api_url=api_url, cache_dir=cache_dir, device=device, **kwargs)
+        self._timeout_seconds = timeout_seconds
 
     def _prepare_request_data(
         self, image_paths: List[str], prompt: str, system_prompt: str, max_new_tokens: int
@@ -154,7 +160,8 @@ class MedGemmaAPIClientTool(BaseTool):
             Tuple of output dictionary and metadata
         """
         # httpx is a modern HTTP client that supports sync and async
-        timeout_config = httpx.Timeout(300.0, connect=10.0)
+        timeout_value = self._timeout_seconds if self._timeout_seconds is not None else 600.0
+        timeout_config = httpx.Timeout(timeout_value, connect=10.0)
         client = httpx.Client(timeout=timeout_config)
 
         try:
@@ -238,11 +245,12 @@ class MedGemmaAPIClientTool(BaseTool):
                 image_paths, prompt, system_prompt, max_new_tokens
             )
 
+            timeout_value = self._timeout_seconds if self._timeout_seconds is not None else 600.0
             response = await client.post(
                 f"{self.api_url}/analyze-images/",
                 data=data,
                 files=files_to_send,
-                timeout=120.0
+                timeout=timeout_value
             )
             response.raise_for_status()
 
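The client's default request timeout is now 600 s and can be overridden per instance; a short construction sketch follows, assuming a service is already listening on the given URL.

    # Hypothetical client construction with a longer timeout for large multi-image prompts
    from medrax.tools.vqa import MedGemmaAPIClientTool

    client_tool = MedGemmaAPIClientTool(api_url="http://127.0.0.1:8002", timeout_seconds=900.0)
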
medrax/tools/vqa/medgemma/medgemma_requirements_standard.txt CHANGED
@@ -52,4 +52,4 @@ typing_inspection==0.4.1
 urllib3==2.5.0
 uvicorn==0.35.0
 wcwidth==0.2.13
-zstandard==0.23.0
+zstandard==0.23.0
medrax/tools/vqa/medgemma/medgemma_setup.py CHANGED
@@ -1,9 +1,61 @@
 import os
 from pathlib import Path
 import subprocess
+import socket
+from contextlib import closing
 import venv
 
-def setup_medgemma_env():
+def _resolve_writable_cache_dir(preferred: str | None) -> str:
+    """Return a writable cache directory, falling back to user cache if needed."""
+    # Preferred path first
+    if preferred:
+        try:
+            os.makedirs(preferred, exist_ok=True)
+            if os.access(preferred, os.W_OK):
+                return preferred
+        except Exception:
+            pass
+    # Fallback path under user's home
+    fallback = os.path.join(Path.home(), ".cache", "medrax", "medgemma")
+    os.makedirs(fallback, exist_ok=True)
+    return fallback
+
+
+def _is_port_free(host: str, port: int) -> bool:
+    """Return True if (host, port) is free to bind on this machine."""
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        try:
+            sock.bind((host, port))
+            return True
+        except OSError:
+            return False
+
+
+def _find_free_loopback_and_port(start_octet: int = 2, end_octet: int = 254, base_port: int = 8002, max_port_tries: int = 50) -> tuple[str, int]:
+    """Find a free 127.0.0.X address and port combination.
+
+    Tries 127.0.0.2..127.0.0.254 each with ports base_port..base_port+max_port_tries
+    until a free pair is found. Falls back to 127.0.0.1 if none found for other octets.
+    """
+    # Try alternate loopback IPs first
+    for last_octet in range(start_octet, end_octet + 1):
+        host = f"127.0.0.{last_octet}"
+        for port in range(base_port, base_port + max_port_tries):
+            if _is_port_free(host, port):
+                return host, port
+    # Fallback: use 127.0.0.1 with port scan
+    host = "127.0.0.1"
+    for port in range(base_port, base_port + max_port_tries):
+        if _is_port_free(host, port):
+            return host, port
+    # Last resort: system-chosen ephemeral on 127.0.0.1
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+        sock.bind((host, 0))
+        return host, sock.getsockname()[1]
+
+
+def setup_medgemma_env(cache_dir: str | None = None, device: str | None = None) -> str:
     """Set up MedGemma virtual environment and launch the FastAPI service.
 
     This function performs the following steps:
@@ -53,12 +105,47 @@ def setup_medgemma_env():
     if not env_dir.exists():
         raise RuntimeError("Failed to create MedGemma virtual environment")
 
-    # Launch MedGemma FastAPI service
-    print("Launching MedGemma FastAPI service...")
+    # Decide host/port to avoid collisions when multiple instances run
+    medgemma_host = os.getenv("MEDGEMMA_HOST")
+    medgemma_port_env = os.getenv("MEDGEMMA_PORT")
+    chosen_host: str
+    chosen_port: int
+    if medgemma_host and medgemma_port_env:
+        try:
+            port_val = int(medgemma_port_env)
+        except ValueError:
+            port_val = 8002
+        # If explicit host/port are provided, prefer them; if taken, try incrementing the port on the same host
+        chosen_host = medgemma_host
+        chosen_port = None
+        for p in range(port_val, port_val + 50):
+            if _is_port_free(medgemma_host, p):
+                chosen_port = p
+                break
+        if chosen_port is None:
+            print(f"No free ports in range {port_val}-{port_val+49} on {medgemma_host}; selecting a free loopback IP/port...")
+            chosen_host, chosen_port = _find_free_loopback_and_port()
+    else:
+        # Auto-pick a free loopback IP and port
+        chosen_host, chosen_port = _find_free_loopback_and_port()
+
+    print(f"Launching MedGemma FastAPI service on {chosen_host}:{chosen_port} ...")
+    env = os.environ.copy()
+    resolved_cache = _resolve_writable_cache_dir(cache_dir)
+    env["MEDGEMMA_CACHE_DIR"] = resolved_cache
+    if device:
+        env["MEDGEMMA_DEVICE"] = device
+    # Pass the chosen binding to the server via env
+    env["MEDGEMMA_HOST"] = chosen_host
+    env["MEDGEMMA_PORT"] = str(chosen_port)
     subprocess.Popen([
         str(python_executable),
         str(medgemma_path)
-    ])
+    ], env=env)
+
+    # Return the base URL so callers can use it. If bound to 0.0.0.0, use 127.0.0.1 for local client access.
+    chosen_client_host = "127.0.0.1" if chosen_host in ("0.0.0.0", "::") else chosen_host
+    return f"http://{chosen_client_host}:{chosen_port}"
     # Note: stdout and stderr redirection commented out for debugging
     # stdout=subprocess.DEVNULL,
     # stderr=subprocess.DEVNULL,
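
Because setup_medgemma_env now returns the base URL of whichever loopback address and port it bound, callers can pass that straight to the API client; a minimal sketch, assuming the service finishes loading its model before the first request arrives. The cache directory and device values below are placeholders, not values from this commit.

    # Hypothetical wiring of the returned base URL into the client (a sketch, not part of this commit)
    from medrax.tools.vqa import MedGemmaAPIClientTool, setup_medgemma_env

    api_url = setup_medgemma_env(cache_dir="/tmp/medgemma_cache", device="cuda")  # e.g. "http://127.0.0.2:8002"
    tool = MedGemmaAPIClientTool(api_url=api_url)
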
medrax/tools/vqa/xray_vqa.py CHANGED
@@ -15,13 +15,9 @@ from langchain_core.tools import BaseTool
 class XRayVQAToolInput(BaseModel):
     """Input schema for the CheXagent Tool."""
 
-    image_paths: List[str] = Field(
-        ..., description="List of paths to chest X-ray images to analyze"
-    )
+    image_paths: List[str] = Field(..., description="List of paths to chest X-ray images to analyze")
     prompt: str = Field(..., description="Question or instruction about the chest X-ray images")
-    max_new_tokens: int = Field(
-        512, description="Maximum number of tokens to generate in the response"
-    )
+    max_new_tokens: int = Field(512, description="Maximum number of tokens to generate in the response")
 
 
 class CheXagentXRayVQATool(BaseTool):
@@ -99,16 +95,14 @@ class CheXagentXRayVQATool(BaseTool):
         Returns:
             str: Model's response
         """
-        query = self.tokenizer.from_list_format(
-            [*[{"image": path} for path in image_paths], {"text": prompt}]
-        )
+        query = self.tokenizer.from_list_format([*[{"image": path} for path in image_paths], {"text": prompt}])
         conv = [
             {"from": "system", "value": "You are a helpful assistant."},
             {"from": "human", "value": query},
         ]
-        input_ids = self.tokenizer.apply_chat_template(
-            conv, add_generation_prompt=True, return_tensors="pt"
-        ).to(device=self.device)
+        input_ids = self.tokenizer.apply_chat_template(conv, add_generation_prompt=True, return_tensors="pt").to(
+            device=self.device
+        )
 
         # Run inference
         with torch.inference_mode():
medrax/tools/xray_generation.py CHANGED
@@ -11,26 +11,15 @@ from langchain_core.tools import BaseTool
 
 class ChestXRayGeneratorInput(BaseModel):
     """Input schema for the Chest X-Ray Generator Tool."""
-    
+
     prompt: str = Field(
-        ...,
-        description="Description of the medical condition to generate (e.g., 'big left-sided pleural effusion')"
-    )
-    height: int = Field(
-        512,
-        description="Height of generated image in pixels"
-    )
-    width: int = Field(
-        512,
-        description="Width of generated image in pixels"
-    )
-    num_inference_steps: int = Field(
-        75,
-        description="Number of denoising steps (higher = better quality but slower)"
+        ..., description="Description of the medical condition to generate (e.g., 'big left-sided pleural effusion')"
     )
+    height: int = Field(512, description="Height of generated image in pixels")
+    width: int = Field(512, description="Width of generated image in pixels")
+    num_inference_steps: int = Field(75, description="Number of denoising steps (higher = better quality but slower)")
     guidance_scale: float = Field(
-        4.0,
-        description="How closely to follow the prompt (higher = more faithful but less diverse)"
+        4.0, description="How closely to follow the prompt (higher = more faithful but less diverse)"
     )
 
 
@@ -60,11 +49,11 @@ class ChestXRayGeneratorTool(BaseTool):
     ):
         """Initialize the chest X-ray generator tool."""
         super().__init__()
-        
+
         self.device = torch.device(device) if device else "cuda"
         self.model = StableDiffusionPipeline.from_pretrained(model_path, cache_dir=cache_dir)
         self.model = self.model.to(torch.float32).to(self.device)
-        
+
         self.temp_dir = Path(temp_dir if temp_dir else tempfile.mkdtemp())
         self.temp_dir.mkdir(exist_ok=True)
 
@@ -97,7 +86,7 @@ class ChestXRayGeneratorTool(BaseTool):
                 num_inference_steps=num_inference_steps,
                 height=height,
                 width=width,
-                guidance_scale=guidance_scale
+                guidance_scale=guidance_scale,
             )
 
             # Save generated image
@@ -107,7 +96,7 @@ class ChestXRayGeneratorTool(BaseTool):
             output = {
                 "image_path": str(image_path),
             }
-            
+
             metadata = {
                 "prompt": prompt,
                 "num_inference_steps": num_inference_steps,
@@ -126,7 +115,7 @@ class ChestXRayGeneratorTool(BaseTool):
                     "prompt": prompt,
                     "analysis_status": "failed",
                     "error_details": str(e),
-                }
+                },
             )
 
     async def _arun(
@@ -139,4 +128,4 @@ class ChestXRayGeneratorTool(BaseTool):
         run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
     ) -> Tuple[Dict[str, str], Dict]:
         """Async version of _run."""
-        return self._run(prompt, num_inference_steps, guidance_scale, height, width)
+        return self._run(prompt, num_inference_steps, guidance_scale, height, width)
pyproject.toml CHANGED
@@ -14,15 +14,15 @@ requires-python = ">=3.12"
 dependencies = [
     "requests>=2.25.0",
     "numpy>=1.19.0",
-    "langchain>=0.1.0",
-    "langchain-core>=0.1.0",
+    "langchain>=0.3.26",
+    "langchain-core>=0.3.68",
     "langchain-community>=0.0.20",
-    "langchain-openai>=0.0.2",
-    "langchain-cohere>=0.3.0,<0.4.0",
-    "langchain-anthropic>=0.0.2",
-    "langchain-xai>=0.0.1",
-    "langchain-chroma>=0.0.10",
-    "langgraph>=0.0.10",
+    "langchain-openai>=0.3.27",
+    "langchain-cohere>=0.3.5",
+    "langchain-anthropic>=0.3.17",
+    "langchain-xai>=0.2.4",
+    "langchain-chroma>=0.2.4",
+    "langgraph>=0.5.1",
     "hydra-core>=1.1.0",
     "python-dotenv>=0.19.0",
     "pandas>=1.5.0",
@@ -46,8 +46,9 @@ dependencies = [
     "gradio>=3.0.0",
     "gradio_client>=0.2.0",
     "httpx>=0.23.0",
-    "uvicorn>=0.15.0",
+    "uvicorn[standard]>=0.15.0",
     "fastapi>=0.68.0",
+    "python-multipart>=0.0.6",
     "einops>=0.3.0",
     "einops-exts>=0.0.4",
     "timm==0.5.4",
@@ -73,6 +74,7 @@ dependencies = [
     "huggingface_hub>=0.17.0",
     "iopath>=0.1.10",
     "duckduckgo-search>=4.0.0",
+    "pyngrok>=7.0.0",
 ]
 
 [project.optional-dependencies]