Final_Assignment_Template

Sleeping

App Files Community

Abbasid committed on Jul 31

Commit

6a09b39

verified ·

1 Parent(s): 2da7120

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -63

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """
 app.py
 This script provides the Gradio web interface to run the evaluation.
-This version properly handles multimodal inputs including images, videos, and audio.
 """
 import os
@@ -10,6 +10,8 @@ import gradio as gr
 import requests
 import pandas as pd
 from urllib.parse import urlparse
 from agent import create_agent_executor
@@ -24,74 +26,130 @@ def parse_final_answer(agent_response: str) -> str:
     if lines: return lines[-1].strip()
     return "Could not parse a final answer."
-def detect_file_type(url: str) -> str:
-    """Detect the type of file from URL."""
-    if not url:
-        return "unknown"
-    url_lower = url.lower()
-    # Image extensions
-    if any(ext in url_lower for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg']):
-        return "image"
-    # Video extensions and YouTube
-    if any(domain in url_lower for domain in ['youtube.com', 'youtu.be', 'vimeo.com']):
-        return "youtube"
-    if any(ext in url_lower for ext in ['.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm']):
-        return "video"
-    # Audio extensions
-    if any(ext in url_lower for ext in ['.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a']):
-        return "audio"
-    # Try to detect from headers if possible
     try:
-        response = requests.head(url, timeout=5)
         content_type = response.headers.get('content-type', '').lower()
-        if 'image' in content_type:
-            return "image"
-        elif 'audio' in content_type:
-            return "audio"
-        elif 'video' in content_type:
-            return "video"
-    except:
         pass
-    return "unknown"
-def create_enhanced_prompt(question_text: str, file_url: str = None) -> str:
-    """Create an enhanced prompt that guides the agent to use appropriate tools."""
     if not file_url:
-        return question_text
-    file_type = detect_file_type(file_url)
     if file_type == "image":
-        return f"""{question_text}
-[IMAGE ATTACHMENT]: {file_url}
-INSTRUCTION: There is an image attached to this question. You MUST use the 'describe_image' tool to analyze this image before answering the question."""
-    elif file_type == "youtube":
-        return f"""{question_text}
-[YOUTUBE VIDEO]: {file_url}
-INSTRUCTION: There is a YouTube video attached to this question. You MUST use the 'process_youtube_video' tool to analyze this video before answering the question."""
-    elif file_type == "audio":
-        return f"""{question_text}
-[AUDIO FILE]: {file_url}
-INSTRUCTION: There is an audio file attached to this question. You MUST use the 'process_audio_file' tool to analyze this audio before answering the question."""
-    else:
-        return f"""{question_text}
-[ATTACHMENT]: {file_url}
-INSTRUCTION: There is a file attachment. Analyze the URL and use the appropriate tool to process this content before answering the question."""
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
@@ -112,7 +170,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
     # 1. Instantiate Agent
     print("Initializing your custom agent...")
     try:
-        agent_executor = create_agent_executor(provider="google")  # Using Google for better multimodal support
     except Exception as e:
         return f"Fatal Error: Could not initialize agent. Check logs. Details: {e}", None
@@ -141,18 +199,22 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         # Get file URL if it exists
         file_url = item.get("file_url")
-        # Create enhanced prompt that instructs the agent to use appropriate tools
-        full_question_text = create_enhanced_prompt(question_text, file_url)
         if file_url:
-            file_type = detect_file_type(file_url)
-            print(f"File detected: {file_url} (Type: {file_type})")
-        print(f"Enhanced Prompt for Agent:\n{full_question_text}")
         try:
-            # Pass the enhanced question to the agent
-            result = agent_executor.invoke({"messages": [("user", full_question_text)]})
             raw_answer = result['messages'][-1].content
             submitted_answer = parse_final_answer(raw_answer)
@@ -165,7 +227,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Task ID": task_id,
                 "Question": question_text,
                 "File URL": file_url or "None",
-                "File Type": detect_file_type(file_url) if file_url else "None",
                 "Submitted Answer": submitted_answer
             })
@@ -177,7 +240,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Task ID": task_id,
                 "Question": question_text,
                 "File URL": file_url or "None",
-                "File Type": detect_file_type(file_url) if file_url else "None",
                 "Submitted Answer": error_msg
             })
@@ -201,9 +265,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
         return status_message, pd.DataFrame(results_log)
 # --- Gradio UI ---
-with gr.Blocks(title="Multimodal Agent Evaluation") as demo:
-    gr.Markdown("# Multimodal Agent Evaluation Runner")
-    gr.Markdown("This agent can process images, YouTube videos, audio files, and perform web searches.")
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
@@ -212,11 +276,11 @@ with gr.Blocks(title="Multimodal Agent Evaluation") as demo:
         label="Questions and Agent Answers",
         wrap=True,
         row_count=10,
-        column_widths=[80, 200, 150, 80, 200]
     )
     run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " Multimodal App Starting " + "-"*30)
     demo.launch()

 """
 app.py
 This script provides the Gradio web interface to run the evaluation.
+This version focuses on robust image detection and processing.
 """
 import os
 import requests
 import pandas as pd
 from urllib.parse import urlparse
+import mimetypes
+from typing import Optional, Tuple
 from agent import create_agent_executor
     if lines: return lines[-1].strip()
     return "Could not parse a final answer."
+def detect_file_type_robust(url: str) -> Tuple[str, dict]:
+    """
+    Robust file type detection with multiple validation methods.
+    Returns (file_type, metadata_dict)
+    """
+    if not url or not url.strip():
+        return "unknown", {"error": "Empty URL"}
+    url = url.strip()
+    metadata = {"original_url": url}
+    # Normalize URL
+    if not url.startswith(('http://', 'https://')):
+        return "unknown", {"error": "Invalid URL format - must start with http/https"}
+    try:
+        parsed = urlparse(url)
+        metadata["domain"] = parsed.netloc
+        metadata["path"] = parsed.path
+    except Exception as e:
+        return "unknown", {"error": f"URL parsing failed: {e}"}
+    # Method 1: File extension analysis
+    url_lower = url.lower()
+    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg', '.tiff', '.ico'}
+    # Check for image extensions
+    for ext in image_extensions:
+        if url_lower.endswith(ext) or ext in url_lower.split('?')[0]:  # Handle query params
+            metadata["detection_method"] = "file_extension"
+            metadata["extension"] = ext
+            return "image", metadata
+    # Method 2: Content-Type header check
     try:
+        print(f"Checking content type for: {url}")
+        response = requests.head(url, timeout=10, allow_redirects=True)
         content_type = response.headers.get('content-type', '').lower()
+        metadata["content_type"] = content_type
+        metadata["status_code"] = response.status_code
+        if response.status_code == 200:
+            if any(img_type in content_type for img_type in ['image/', 'image/jpeg', 'image/png', 'image/gif', 'image/webp']):
+                metadata["detection_method"] = "content_type"
+                return "image", metadata
+        else:
+            metadata["error"] = f"HTTP {response.status_code}"
+    except requests.RequestException as e:
+        metadata["error"] = f"Network error: {e}"
+        print(f"Network error checking {url}: {e}")
+    # Method 3: Domain-based detection for common image hosts
+    image_domains = {
+        'imgur.com', 'i.imgur.com',
+        'cdn.discordapp.com', 'media.discordapp.net',
+        'pbs.twimg.com', 'abs.twimg.com',
+        'i.redd.it', 'preview.redd.it',
+        'images.unsplash.com',
+        'via.placeholder.com',
+        'picsum.photos'
+    }
+    domain_lower = metadata.get("domain", "").lower()
+    if any(img_domain in domain_lower for img_domain in image_domains):
+        metadata["detection_method"] = "domain_based"
+        return "image", metadata
+    # Method 4: Guess from MIME types
+    try:
+        mime_type, _ = mimetypes.guess_type(url)
+        if mime_type and mime_type.startswith('image/'):
+            metadata["detection_method"] = "mime_guess"
+            metadata["mime_type"] = mime_type
+            return "image", metadata
+    except Exception:
         pass
+    return "unknown", metadata
+def create_structured_prompt(question_text: str, file_url: str = None) -> str:
+    """
+    Create a structured prompt that provides clear task analysis for the agent.
+    """
     if not file_url:
+        return f"""TASK: {question_text}
+ANALYSIS: This is a text-only question with no attachments.
+APPROACH: Use available tools (web search, Wikipedia, etc.) as needed to answer accurately."""
+    file_type, metadata = detect_file_type_robust(file_url)
     if file_type == "image":
+        return f"""TASK: {question_text}
+ATTACHMENT ANALYSIS:
+- Type: Image file detected
+- URL: {file_url}
+- Detection method: {metadata.get('detection_method', 'unknown')}
+- Metadata: {metadata}
+REASONING REQUIRED:
+1. This question involves an image that needs to be analyzed
+2. You must examine the image content to answer the question
+3. The image URL should be processed directly by your vision capabilities
+APPROACH: Process the image URL directly with your vision model, then provide a comprehensive answer based on what you see."""
+    else:
+        error_info = metadata.get('error', 'Unknown file type')
+        return f"""TASK: {question_text}
+ATTACHMENT ANALYSIS:
+- URL: {file_url}
+- Type: Could not identify as supported file type
+- Error: {error_info}
+- Metadata: {metadata}
+REASONING REQUIRED:
+1. There is an attachment but it's not a recognized image format
+2. You should attempt to process it as a regular web resource
+3. Use web search or other tools to gather information about the URL content
+APPROACH: Use web search or other available tools to gather information about this resource."""
 def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     # 1. Instantiate Agent
     print("Initializing your custom agent...")
     try:
+        agent_executor = create_agent_executor(provider="groq")
     except Exception as e:
         return f"Fatal Error: Could not initialize agent. Check logs. Details: {e}", None
         # Get file URL if it exists
         file_url = item.get("file_url")
+        # Create structured prompt with robust file analysis
+        structured_prompt = create_structured_prompt(question_text, file_url)
         if file_url:
+            file_type, metadata = detect_file_type_robust(file_url)
+            print(f"File analysis: {file_url}")
+            print(f"  - Type: {file_type}")
+            print(f"  - Detection method: {metadata.get('detection_method', 'unknown')}")
+            if metadata.get('error'):
+                print(f"  - Error: {metadata['error']}")
+        print(f"Structured Prompt for Agent:\n{structured_prompt}")
         try:
+            # Pass the structured prompt to the agent
+            result = agent_executor.invoke({"messages": [("user", structured_prompt)]})
             raw_answer = result['messages'][-1].content
             submitted_answer = parse_final_answer(raw_answer)
                 "Task ID": task_id,
                 "Question": question_text,
                 "File URL": file_url or "None",
+                "File Type": detect_file_type_robust(file_url)[0] if file_url else "None",
+                "Detection Method": detect_file_type_robust(file_url)[1].get('detection_method', 'N/A') if file_url else "N/A",
                 "Submitted Answer": submitted_answer
             })
                 "Task ID": task_id,
                 "Question": question_text,
                 "File URL": file_url or "None",
+                "File Type": detect_file_type_robust(file_url)[0] if file_url else "None",
+                "Detection Method": "Error",
                 "Submitted Answer": error_msg
             })
         return status_message, pd.DataFrame(results_log)
 # --- Gradio UI ---
+with gr.Blocks(title="Image-Capable Agent Evaluation") as demo:
+    gr.Markdown("# Image-Capable Agent Evaluation Runner")
+    gr.Markdown("This agent can process images and perform web searches using Groq's vision-capable models.")
     gr.LoginButton()
     run_button = gr.Button("Run Evaluation & Submit All Answers", variant="primary")
         label="Questions and Agent Answers",
         wrap=True,
         row_count=10,
+        column_widths=[80, 200, 120, 100, 80, 200]
     )
     run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 if __name__ == "__main__":
+    print("\n" + "-"*30 + " Image Agent App Starting " + "-"*30)
     demo.launch()