Update app.py

app.py CHANGED
@@ -8,452 +8,660 @@ import fnmatch
 from pathlib import Path
 from pygments.lexers import guess_lexer_for_filename
 from pygments.util import ClassNotFound

 # --- Configuration ---
 DEFAULT_IGNORE_PATTERNS = [
-    …
 ]
 DEFAULT_MAX_FILE_SIZE_KB = 1024 # 1 MB limit for file content inclusion

 # --- Core Logic ---

-def should_ignore(…
-    """Checks if a path …
     try:
-        relative_path = …
-        …
         for pattern in ignore_patterns:
-            …
-            if …
                 return True
-            # …
-                #print(f"Ignoring directory '{relative_path}' due to pattern '{pattern}'")
-                return True
-            # Check if the path starts with the directory pattern
-            if str(relative_path).startswith(pattern_clean + os.sep):
-                #print(f"Ignoring path '{relative_path}' within ignored dir '{pattern}'")
-                return True
-            # Handle cases where pattern might match a parent directory implicitly
-            if pattern.endswith('/'):
-                # Check if any parent directory name matches the pattern
-                for parent in relative_path.parents:
-                    if parent.name == pattern_clean:
-                        #print(f"Ignoring '{relative_path}' due to parent match on '{pattern}'")
-                        return True
-                # This logic might be complex, reconsider if fnmatch covers enough
-                # Simplified: Check if the path starts with the pattern directory
-                if str(relative_path).startswith(pattern_clean + os.sep):
-                    #print(f"Ignoring '{relative_path}' due to prefix match on '{pattern}'")
-                    return True

-    except Exception as e:
-        print(f"Warning: Error during ignore pattern matching for '{relative_path}' with pattern '{pattern}': {e}")

-    …

-def is_likely_binary(…
     """Checks if a file is likely binary by reading a chunk."""
     try:
-        with open(…
         chunk = f.read(chunk_size)
-        return b'\0' in chunk
-    except …
-        …

-def get_file_content(file_path: …
-    """
-    Reads file content, detects language, handles size limits and encodings.
-    Returns (content, language, error_message)
-    """
     try:
-        file_size = …
         if file_size > max_size_bytes:
-            …

         if is_likely_binary(file_path):
-            return None, None, "[Content skipped: Detected as binary file]"

-        …
         try:
-            with open(…
             content = f.read()
         except UnicodeDecodeError:
             try:
-                with open(…
                 content = f.read()
             except Exception as e_read:
                 return None, None, f"[Content skipped: Error reading file - {e_read}]"

-        …
         try:
-            lexer = guess_lexer_for_filename(file_path, content)
             language = lexer.aliases[0] if lexer.aliases else lexer.name
         except ClassNotFound:
-            language = "" # …

         return content, language, None

     except Exception as e:
-        …

-# MODIFIED: Added include_content parameter
-def generate_markdown_for_repo(repo_path: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool) -> str:
-    """Generates Markdown content for the repository structure and optionally files."""
-    repo_root = Path(repo_path).resolve()
-    md_content = ["# Repository Structure and Content\n\n"]
-    file_contents_md = []
-    max_size_bytes = max_file_size_kb * 1024
-    …
-    md_content.append("## Directory Structure\n\n```\n")
     structure_lines = []
-    …
-        # --- Ignore directories based on patterns ---
-        original_dirs = list(dirs) # Copy because we modify dirs list
-        dirs[:] = [d for d in original_dirs if not should_ignore(str(root_path / d), ignore_patterns, repo_root)]
-        …

     # --- Pass 2: Process file contents (ONLY if requested) ---
-    if …
-        for root, dirs, files in os.walk(repo_path, topdown=True):
-            root_path = Path(root).resolve()
-            …
             try:
-                …

     repo_path = None
     output_md = ""
     output_file_path = None
     error_message = None
-    ignore_patterns = [p.strip() for p in ignore_patterns_str.split(',') if p.strip()]
-    # Ensure default patterns are added only if they aren't already covered by user patterns
-    # A simple way is just to combine and remove duplicates
-    combined_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS
-    ignore_patterns = sorted(list(set(combined_patterns))) # Keep unique and sort for consistency if needed

     try:
-        …
-        if input_type == "URL":
-            if not repo_url or not (repo_url.startswith("http://") or repo_url.startswith("https://")):
-                raise ValueError("Invalid Git URL provided. Must start with http:// or https://")
-            print(f"Cloning repository: {repo_url}")
-            try:
-                # Attempt sparse checkout if available (modern Git)
-                # This might fail on older Git versions, hence the fallback
-                subprocess.run(
-                    ["git", "clone", "--depth", "1", "--filter=blob:none", "--no-checkout", repo_url, temp_dir],
-                    check=True, capture_output=True, text=True, timeout=60
-                )
-                subprocess.run(["git", "sparse-checkout", "init", "--cone"], cwd=temp_dir, check=True, capture_output=True, text=True)
-                subprocess.run(["git", "checkout"], cwd=temp_dir, check=True, capture_output=True, text=True, timeout=120)
-                print("Cloning successful (sparse/filtered).")
-            except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
-                print(f"Sparse clone failed ({e_sparse}), attempting standard shallow clone...")
-                # Fallback to standard shallow clone
-                shutil.rmtree(temp_dir) # Clean up failed attempt
-                temp_dir = tempfile.mkdtemp() # Recreate temp dir
                 try:
-                    …
-                except subprocess.TimeoutExpired:
-                    …

             try:
-                …
             except Exception as e_extract:
                 raise RuntimeError(f"Failed to extract ZIP file: {e_extract}")

-            …
-            if ".DS_Store" in extracted_items:
-                extracted_items.remove(".DS_Store")
-            # Check for __MACOSX folder often created by macOS archiver
-            if "__MACOSX" in extracted_items and os.path.isdir(os.path.join(temp_dir, "__MACOSX")):
-                extracted_items.remove("__MACOSX")

-            if len(extracted_items) == 1 and os.path.isdir(os.path.join(temp_dir, extracted_items[0])):
-                repo_path = os.path.join(temp_dir, extracted_items[0])
-                print(f"Detected repo root inside ZIP: {extracted_items[0]}")
         else:
-            …

     except Exception as e:
         error_message = f"An error occurred: {e}"
-        …
-        output_file_path = None

     finally:
-        # …
-        # Gradio's gr.File component should manage its temp files when value is updated.
-        # If we created the temp file (output_file_path) AND there was NO error,
-        # we might need manual cleanup later if Gradio doesn't handle it, but usually it does.
-        # Let's assume Gradio handles the download file cleanup for now.
-        # Return values for Gradio outputs
-        # Always return a string for md_output (either the result or the error message)
-        # Return the file path for download only on success, otherwise None (or an invisible File update)
-        if output_file_path:
-            return output_md, gr.File(value=output_file_path, visible=True)
-        else:
-            # If there was an error, output_md contains the error message
-            # And we hide the download button
-            return output_md, gr.File(visible=False)

 # --- Gradio Interface ---

 css = """
 #md_output {
-    max-height: 70vh; /* Adjust max height …
-    overflow…
-    border: 1px solid #…
-    …
 }
-#…
-    margin-…
 }
-…
 """

     gr.Markdown(
         "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
-        "to generate a single Markdown file containing its structure and optionally file contents."
     )

     with gr.Row():
-        …
         input_type = gr.Radio(
             ["URL", "Upload ZIP"], label="Input Source", value="URL"
         )

-        # …
         url_input = gr.Textbox(
             label="Git Repository URL",
-            placeholder="e.g., https://github.com/gradio-app/gradio.git",
-            visible=True,
-            …
         )

-        # …
         zip_input = gr.File(
             label="Upload Local Folder (as .zip)",
             file_types=[".zip"],
-            visible=False,
-            interactive=True,
-            # Use file_count='single' explicitly if needed, though default
         )

-        # --- …
-        …

-        # --- ADDED: Checkbox for content inclusion ---
-        include_content_checkbox = gr.Checkbox(
-            label="Include File Content",
-            value=True, # Default to including content
-            info="Uncheck to generate only the directory structure."
-        )
-        # --- End Added Checkbox ---

-    with gr.Column(scale=3):
-        gr.Markdown("## Generated Output")
-        md_output = gr.Markdown(elem_id="md_output", value="*Markdown output will appear here...*")
-        download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output")

-    # …
     def update_input_visibility(choice):
-        …

     input_type.change(
         fn=update_input_visibility,
         inputs=input_type,
-        outputs=[url_input, zip_input]
     )

-    # MODIFIED: Added include_content_checkbox to inputs
     submit_btn.click(
-        fn=…
         inputs=[
-            input_type,
-            …
-            zip_input,
-            ignore_input,
-            max_size_input,
-            include_content_checkbox, # Pass the checkbox state
         ],
-        …
-        api_name="repo_to_md" # For API access if needed
     )

 # Launch the interface
 if __name__ == "__main__":
-    …
-    demo.launch()
 from pathlib import Path
 from pygments.lexers import guess_lexer_for_filename
 from pygments.util import ClassNotFound
+import logging
+import time
+import math
+
+# Try importing pyperclip, provide instructions if missing
+try:
+    import pyperclip
+    PYPERCLIP_AVAILABLE = True
+except ImportError:
+    PYPERCLIP_AVAILABLE = False
+    logging.warning("pyperclip library not found. 'Copy to Clipboard' functionality will be disabled. Install with: pip install pyperclip")
+

 # --- Configuration ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 DEFAULT_IGNORE_PATTERNS = [
+    # Version Control
+    ".git/", ".gitignore", ".gitattributes", ".gitmodules", ".svn/", ".hg/",
+    # IDE/Editor Files
+    ".idea/", ".vscode/", "*.sublime-project", "*.sublime-workspace", ".project", ".classpath", "*.tmproj", ".DS_Store", "Thumbs.db",
+    # Build Outputs & Dependencies
+    "build/", "dist/", "bin/", "obj/", "out/", "target/", "*.o", "*.so", "*.dll", "*.exe", "*.class", "*.jar", "*.war", "*.ear",
+    "node_modules/", "bower_components/", "vendor/", "*.egg-info/", "wheels/", "**/__pycache__/", "*.pyc",
+    # Python Virtual Environments
+    ".venv/", "venv/", "env/", ".env", "pip-cache/",
+    # Logs & Temporary Files
+    "*.log", "*.tmp", "*.temp", "*.swp", "*.swo", "*.bak",
+    # OS Generated Files
+    "._*",
+    # Secrets (important!)
+    "*.pem", "*.key", ".env*", "secrets.*", "credentials.*",
+    # Common Framework/Tool cache/temp files
+    ".pytest_cache/", ".tox/", ".mypy_cache/", ".ruff_cache/", "*.ipynb_checkpoints",
+    # macOS-specific ZIP artifact
+    "__MACOSX/",
 ]
 DEFAULT_MAX_FILE_SIZE_KB = 1024 # 1 MB limit for file content inclusion
+CLONE_TIMEOUT_SPARSE = 120 # seconds
+CLONE_TIMEOUT_STANDARD = 300 # seconds
+ZIP_EXTRACT_WARN_THRESHOLD = 1000 # Warn if ZIP contains more than this many files
+MAX_FILES_FOR_DETAILED_PROGRESS = 500 # Only show per-file progress if fewer than this many files

 # --- Core Logic ---

+def should_ignore(path_obj: Path, ignore_patterns: list[str], repo_root: Path) -> bool:
+    """Checks if a file or directory path should be ignored based on gitignore-style patterns."""
     try:
+        relative_path = path_obj.relative_to(repo_root)
+        # Use POSIX paths for consistent pattern matching regardless of OS
+        relative_path_str = relative_path.as_posix()
+    except ValueError:
+        logging.warning(f"Path {path_obj} not relative to root {repo_root}, ignoring.")
+        return True
+
+    # Optimization: Check direct name match first for common ignores like '.git'
+    if path_obj.name in ignore_patterns:
+        return True
+
     for pattern in ignore_patterns:
+        pattern = pattern.strip()
+        if not pattern or pattern.startswith('#'):
+            continue
+
+        # Ensure pattern uses POSIX separators
+        pattern_posix = pattern.replace(os.sep, '/')
+
+        # Case 1: Pattern specifies a directory (ends with '/')
+        if pattern_posix.endswith('/'):
+            # Match if the relative path *is* this directory or starts with it
+            # Example: pattern "build/", path "build" or "build/foo.txt"
+            dir_pattern = pattern_posix.rstrip('/')
+            if relative_path_str == dir_pattern or relative_path_str.startswith(dir_pattern + '/'):
+                return True
+            # Also match if a *directory component* matches the name (like ignoring 'node_modules' anywhere)
+            # Example: pattern "node_modules/", path "src/my_lib/node_modules/some_dep"
+            if path_obj.is_dir() and path_obj.name == dir_pattern:
                 return True
+            # Check parent directories as well
+            for parent in relative_path.parents:
+                if parent.name == dir_pattern:
+                    return True


+        # Case 2: Pattern is a file or general pattern (using fnmatch)
+        # Use fnmatchcase for potentially case-sensitive filesystems if needed,
+        # but fnmatch is often sufficient and more git-like on Win/Mac.
+        if fnmatch.fnmatch(relative_path_str, pattern_posix):
+            return True
+        # Also match just the filename part for patterns like "*.log"
+        if fnmatch.fnmatch(path_obj.name, pattern_posix):
+            return True


+    return False
+
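A quick smoke test of the rewritten matcher, as a reviewer sketch rather than part of the commit (the scratch directory and file names below are made up, and should_ignore from above is assumed to be in scope):

from pathlib import Path

repo = Path("/tmp/demo_repo")
(repo / "node_modules" / "pkg").mkdir(parents=True, exist_ok=True)
(repo / "src").mkdir(exist_ok=True)
(repo / "src" / "app.log").touch()
(repo / "src" / "main.py").touch()

patterns = ["node_modules/", "*.log"]
print(should_ignore(repo / "node_modules" / "pkg", patterns, repo))  # True: parent dir matches "node_modules/"
print(should_ignore(repo / "src" / "app.log", patterns, repo))       # True: "*.log" matches the filename
print(should_ignore(repo / "src" / "main.py", patterns, repo))       # False: no pattern matches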
+def is_likely_binary(file_path: Path, chunk_size=1024) -> bool:
     """Checks if a file is likely binary by reading a chunk."""
     try:
+        with file_path.open('rb') as f:
             chunk = f.read(chunk_size)
+            return b'\0' in chunk
+    except OSError as e:
+        logging.warning(f"Could not read file chunk for binary check {file_path}: {e}")
+        return True

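The NUL-byte heuristic is cheap but coarse; UTF-16 text, which contains NUL bytes, would be flagged as binary. A minimal illustration with throwaway temp files (a sketch, assuming is_likely_binary from above is in scope):

import tempfile
from pathlib import Path

with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"\x00\x01\x02")               # contains a NUL byte
print(is_likely_binary(Path(f.name)))      # True

with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
    f.write("plain text".encode("utf-8"))  # no NUL byte
print(is_likely_binary(Path(f.name)))      # False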
+def get_file_content(file_path: Path, max_size_bytes: int) -> tuple[str | None, str | None, str | None]:
+    """Reads file content, detects language, handles size limits and encodings."""
     try:
+        file_size = file_path.stat().st_size
         if file_size > max_size_bytes:
+            kb_limit = max_size_bytes / 1024
+            kb_actual = file_size / 1024
+            return None, None, f"[Content skipped: File size ({kb_actual:.1f} KB) exceeds limit ({kb_limit:.1f} KB)]"
+
+        if file_size == 0:
+            return "", "", None # Empty file

         if is_likely_binary(file_path):
+            return None, None, "[Content skipped: Detected as likely binary file]"

+        content = None
+        detected_encoding = 'utf-8'
         try:
+            with file_path.open('r', encoding='utf-8') as f:
                 content = f.read()
         except UnicodeDecodeError:
+            logging.warning(f"UTF-8 decoding failed for {file_path}, trying latin-1.")
+            detected_encoding = 'latin-1'
             try:
+                with file_path.open('r', encoding='latin-1') as f:
                     content = f.read()
             except Exception as e_read:
+                logging.error(f"Error reading file {file_path} even with latin-1: {e_read}")
                 return None, None, f"[Content skipped: Error reading file - {e_read}]"
+        except OSError as e_os:
+            logging.error(f"OS Error reading file {file_path}: {e_os}")
+            return None, None, f"[Content skipped: OS Error reading file - {e_os}]"

+        language = ""
         try:
+            lexer = guess_lexer_for_filename(file_path.name, content)
             language = lexer.aliases[0] if lexer.aliases else lexer.name
         except ClassNotFound:
+            language = "" # Plain text
+        except Exception as e_lexer:
+            logging.warning(f"Could not guess lexer for {file_path}: {e_lexer}")
+            language = "" # Fallback

         return content, language, None

+    except OSError as e_os:
+        logging.error(f"OS Error processing file {file_path}: {e_os}")
+        return None, None, f"[Content skipped: Error accessing file properties - {e_os}]"
     except Exception as e:
+        logging.error(f"Unexpected error processing file {file_path}: {e}", exc_info=True)
+        return None, None, f"[Content skipped: Unexpected error processing file - {e}]"

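Typical handling of the (content, language, error) triple returned above, sketched against a local file (the path and size limit are illustrative):

content, language, error_msg = get_file_content(Path("app.py"), max_size_bytes=1024 * 1024)
if error_msg:
    print(error_msg)  # skip reason, e.g. size limit, binary file, or read error
else:
    print(f"language={language!r}, {len(content)} characters read")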
+# --- MODIFIED: Function now uses yield for status updates ---
+def generate_markdown_for_repo(repo_path_str: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool):
+    """
+    Generates Markdown content for the repository structure and optionally files.
+    Yields status updates during processing.
+    """
+    repo_root = Path(repo_path_str).resolve()
+    yield f"Status: Analysing repository at {repo_root}..."
+    logging.info(f"Starting markdown generation for: {repo_root}")

+    md_lines = ["# Repository Analysis\n"]
     structure_lines = []
+    content_lines = []
+    max_size_bytes = max_file_size_kb * 1024
+    files_to_process = []

+    # --- Pre-computation: Collect all files to potentially process ---
+    yield "Status: Scanning file structure..."
+    all_paths = []
+    for root, dirs, files in os.walk(repo_path_str, topdown=True):
+        root_path = Path(root).resolve()

+        # --- Filter ignored directories before adding paths ---
+        # We need to check against the original dirs list before modifying it
+        original_dirs = list(dirs)
+        dirs[:] = [d for d in original_dirs if not should_ignore(root_path / d, ignore_patterns, repo_root)]
+
+        # Add directories that are *not* ignored
+        for d in dirs: # Add the non-ignored directory paths
+            all_paths.append(root_path / d)
+
+        # Add files that are *not* ignored
+        for f in files:
+            file_path = root_path / f
+            if not should_ignore(file_path, ignore_patterns, repo_root):
+                all_paths.append(file_path)
+
+    # --- Pass 1: Build the directory structure visualization ---
+    yield "Status: Generating directory structure..."
+    structure_lines.append("## Directory Structure")
+    structure_lines.append("```")
+    structure_tree = []
+    processed_dirs_for_structure = set()
+
+    def add_to_structure(path_obj: Path, depth: int):
+        indent = "    " * depth # 4-space indent per level
+        prefix = "└── "
+        if path_obj.is_dir():
+            # Add directory only if it hasn't been added via a parent walk already
+            if path_obj not in processed_dirs_for_structure:
+                structure_tree.append(f"{indent}{prefix}{path_obj.name}/")
+                processed_dirs_for_structure.add(path_obj)
+                # Recursively add children
+                try:
+                    for item in sorted(path_obj.iterdir(), key=lambda p: (p.is_file(), p.name.lower())):
+                        if not should_ignore(item, ignore_patterns, repo_root):
+                            add_to_structure(item, depth + 1)
+                except OSError as e:
+                    logging.warning(f"Could not access directory {path_obj}: {e}")
+                    structure_tree.append(f"{indent} └── [Error accessing directory: {e}]")
+
+        elif path_obj.is_file():
+            structure_tree.append(f"{indent}{prefix}{path_obj.name}")
+
+    # Start building the structure from the root
+    structure_tree.append(f"{repo_root.name}/")
+    processed_dirs_for_structure.add(repo_root)
+    try:
+        for item in sorted(repo_root.iterdir(), key=lambda p: (p.is_file(), p.name.lower())):
+            if not should_ignore(item, ignore_patterns, repo_root):
+                add_to_structure(item, 1)
+    except OSError as e:
+        logging.error(f"Could not access repository root {repo_root}: {e}")
+        structure_tree.append(f" └── [Error accessing repository root: {e}]")


+    structure_lines.extend(structure_tree)
+    structure_lines.append("```\n")
+    yield "Status: Directory structure generated."
+    logging.info("Directory structure built.")

     # --- Pass 2: Process file contents (ONLY if requested) ---
+    files_to_render = [p for p in all_paths if p.is_file()]
+    total_files = len(files_to_render)

+    if include_content and total_files > 0:
+        yield f"Status: Processing content of {total_files} file(s)..."
+        content_lines.append("## File Contents\n")
+        start_time = time.time()
+        show_detailed_progress = total_files <= MAX_FILES_FOR_DETAILED_PROGRESS

+        for i, file_path in enumerate(files_to_render):
+            if show_detailed_progress or (i % 50 == 0 and i > 0): # Update every 50 files if many files
+                progress_percent = (i + 1) / total_files
+                yield f"Status: Processing file {i+1}/{total_files}: {file_path.relative_to(repo_root).as_posix()} ({progress_percent:.0%})"

+            try:
+                relative_path_str = file_path.relative_to(repo_root).as_posix()
+                content_lines.append(f"### `{relative_path_str}`\n") # Use POSIX path in Markdown
+                content, language, error_msg = get_file_content(file_path, max_size_bytes)
+
+                if error_msg:
+                    content_lines.append(f"```\n{error_msg}\n```\n")
+                elif content is not None:
+                    lang_hint = language if language else ""
+                    content_lines.append(f"```{lang_hint}\n{content}\n```\n")
+                else:
+                    # Should generally be covered by error_msg cases, but as a fallback
+                    content_lines.append("```\n[Content not available or file is binary/empty]\n```\n")

+            except ValueError:
+                logging.warning(f"Path {file_path} not relative to {repo_root}, skipping content.")
+                continue
+            except Exception as e:
+                logging.error(f"Unexpected error processing content for {file_path}: {e}", exc_info=True)
+                relative_path_str = file_path.name # Fallback name
                 try:
+                    relative_path_str = file_path.relative_to(repo_root).as_posix()
+                except ValueError: pass
+                content_lines.append(f"### `{relative_path_str}`\n")
+                content_lines.append(f"```\n[ERROR processing file content: {e}]\n```\n")
+
+        end_time = time.time()
+        yield f"Status: File content processing complete ({total_files} files in {end_time - start_time:.2f}s)."
+        logging.info(f"File content processing complete. Processed {total_files} files in {end_time - start_time:.2f} seconds.")
+    elif not include_content:
+        yield "Status: Skipping file content inclusion as requested."
+        logging.info("Skipping file content inclusion as requested.")
+    else: # include_content is True but total_files is 0
+        yield "Status: No files found to include content for (after filtering)."
+        logging.info("No files found to include content for (after filtering).")
+
+    # Combine structure and content
+    md_lines.extend(structure_lines)
+    if include_content and content_lines: # Only add content section if requested and content exists
+        md_lines.extend(content_lines)
+
+    yield "Status: Markdown generation complete!"
+    yield "\n".join(md_lines) # Final yield is the complete markdown; join with newlines so headings and fences stay on their own lines
+
+
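Because generate_markdown_for_repo() interleaves "Status:" strings with a single final Markdown payload, it can also be driven from a plain script; a sketch, assuming a local checkout at ./my_repo:

from pathlib import Path

result = None
for update in generate_markdown_for_repo("./my_repo", DEFAULT_IGNORE_PATTERNS, 256, include_content=True):
    if update.startswith("Status:"):
        print(update)        # progress line
    else:
        result = update      # the final assembled Markdown

if result is not None:
    Path("my_repo.md").write_text(result, encoding="utf-8")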
+# --- MODIFIED: Function is now a generator, yielding status updates ---
+def repo_to_md_processor(input_type: str, repo_url: str | None, uploaded_zip: tempfile._TemporaryFileWrapper | None, git_branch: str | None, ignore_patterns_str: str, max_file_size_kb: int, include_content: bool):
+    """
+    Main processing generator function called by the Gradio interface.
+    Yields one (status, markdown, download update) triple per step, matching
+    the three output components wired up in submit_btn.click below.
+    """
+    def _status(msg):
+        # Status-only update: leave the markdown and download components unchanged.
+        return msg, gr.update(), gr.update()
+
     repo_path = None
     output_md = ""
     output_file_path = None
     error_message = None
+    start_time = time.time()

     try:
+        yield _status("Status: Initializing...")
+        # Combine user patterns with defaults
+        user_patterns = {p.strip() for p in ignore_patterns_str.split(',') if p.strip()}
+        default_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        combined_patterns = sorted(list(user_patterns.union(default_patterns)))
+        logging.info(f"Using ignore patterns: {combined_patterns}")
+        logging.info(f"Max file size for content: {max_file_size_kb} KB")
+        logging.info(f"Include file content: {include_content}")
+        if input_type == "URL" and git_branch:
+            logging.info(f"Requested Git branch/tag: {git_branch}")
+
+        with tempfile.TemporaryDirectory(prefix="repo_md_") as temp_dir:
+            logging.info(f"Created temporary directory: {temp_dir}")
+            temp_dir_path = Path(temp_dir)
+
+            if input_type == "URL":
+                if not repo_url or not (repo_url.startswith("http://") or repo_url.startswith("https://") or repo_url.startswith("git@")):
+                    raise ValueError("Invalid Git URL. Must start with http(s):// or git@")
+                yield _status(f"Status: Processing URL: {repo_url}" + (f" (branch/tag: {git_branch})" if git_branch else ""))
+
+                target_clone_path = temp_dir_path / "repo"
+                target_clone_path.mkdir()
+                repo_path_str = str(target_clone_path)
+
+                # --- Git Clone ---
+                branch_args = ["--branch", git_branch] if git_branch and git_branch.strip() else []
+                common_args = ["--depth", "1"] # Always shallow clone

                 try:
+                    # Try sparse checkout first
+                    yield _status("Status: Attempting efficient Git clone (sparse)...")
+                    clone_cmd_sparse = ["git", "clone"] + common_args + ["--filter=blob:none", "--no-checkout"] + branch_args + [repo_url, repo_path_str]
+                    logging.info(f"Running sparse clone command: {' '.join(clone_cmd_sparse)}")
+                    subprocess.run(clone_cmd_sparse, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_SPARSE)
+
+                    checkout_cmd_sparse = ["git", "sparse-checkout", "init", "--cone"]
+                    logging.info(f"Running sparse checkout init: {' '.join(checkout_cmd_sparse)}")
+                    subprocess.run(checkout_cmd_sparse, cwd=repo_path_str, check=True, capture_output=True, text=True)
+
+                    checkout_cmd = ["git", "checkout"]
+                    logging.info(f"Running final checkout: {' '.join(checkout_cmd)}")
+                    subprocess.run(checkout_cmd, cwd=repo_path_str, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_SPARSE)
+                    yield _status("Status: Efficient Git clone successful.")
+                except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
+                    yield _status(f"Status: Efficient clone failed ({type(e_sparse).__name__}), attempting standard clone...")
+                    logging.warning(f"Sparse clone failed: {e_sparse}. Output: {e_sparse.stderr if hasattr(e_sparse, 'stderr') else 'N/A'}")
+                    shutil.rmtree(target_clone_path, ignore_errors=True)
+                    target_clone_path.mkdir()
+
+                    try:
+                        # Fallback to standard shallow clone
+                        clone_cmd_std = ["git", "clone"] + common_args + branch_args + [repo_url, repo_path_str]
+                        logging.info(f"Running standard clone command: {' '.join(clone_cmd_std)}")
+                        subprocess.run(clone_cmd_std, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_STANDARD)
+                        yield _status("Status: Standard shallow clone successful.")
+                    except FileNotFoundError:
+                        logging.error("Git command not found.")
+                        raise RuntimeError("Git command not found. Please install Git and ensure it's in your PATH.")
+                    except subprocess.CalledProcessError as e_std:
+                        error_detail = e_std.stderr or e_std.stdout or "No output captured."
+                        logging.error(f"Standard Git clone failed: {error_detail.strip()}")
+                        raise RuntimeError(f"Git clone failed:\n{error_detail.strip()}")
+                    except subprocess.TimeoutExpired:
+                        logging.error(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD} seconds.")
+                        raise RuntimeError(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD // 60} minutes.")
+
+                repo_path = target_clone_path
+
+            elif input_type == "Upload ZIP":
+                if uploaded_zip is None or not hasattr(uploaded_zip, 'name'):
+                    raise ValueError("No ZIP file uploaded or invalid file object.")
+                yield _status(f"Status: Processing uploaded ZIP: {Path(uploaded_zip.name).name}")
+
+                target_extract_path = temp_dir_path / "extracted"
+                target_extract_path.mkdir()
+
                 try:
+                    with zipfile.ZipFile(uploaded_zip.name, 'r') as zip_ref:
+                        members = zip_ref.namelist()
+                        num_files = len(members)
+                        yield _status(f"Status: Extracting {num_files} entries from ZIP...")
+                        logging.info(f"ZIP contains {num_files} entries.")
+                        if num_files > ZIP_EXTRACT_WARN_THRESHOLD:
+                            logging.warning(f"ZIP contains a large number of files ({num_files}).")
+
+                        # Security checks: ZIP member names use '/' separators, so normalize before testing for traversal
+                        for member in members:
+                            if member.startswith('/') or member.startswith('\\') or '..' in member.replace('\\', '/').split('/'):
+                                raise ValueError(f"ZIP contains potentially unsafe path: '{member}'. Aborting.")
+                            if len(member) > 1024: # Limit path length
+                                raise ValueError(f"ZIP contains excessively long path: '{member[:100]}...'. Aborting.")
+
+                        zip_ref.extractall(target_extract_path)
+                    yield _status("Status: ZIP extraction complete.")
+                    logging.info("ZIP extraction complete.")
+
+                except zipfile.BadZipFile:
+                    logging.error("Invalid or corrupted ZIP file uploaded.")
+                    raise ValueError("Invalid or corrupted ZIP file.")
                 except Exception as e_extract:
+                    logging.error(f"Failed to extract ZIP file: {e_extract}", exc_info=True)
                     raise RuntimeError(f"Failed to extract ZIP file: {e_extract}")

+                # Determine repo root within extracted files
+                extracted_items = list(target_extract_path.iterdir())
+                filtered_items = [item for item in extracted_items if item.name not in (".DS_Store", "__MACOSX")]

+                if len(filtered_items) == 1 and filtered_items[0].is_dir():
+                    repo_path = filtered_items[0]
+                    logging.info(f"Detected single root directory in ZIP: {repo_path.name}")
+                else:
+                    repo_path = target_extract_path
+                    logging.info("Using root of extracted ZIP as repository root.")

             else:
+                raise ValueError("Invalid input type selected.")
+
+            if not repo_path or not repo_path.is_dir():
+                raise RuntimeError("Could not determine valid repository path.")
+
+            yield _status(f"Status: Repository path identified: {repo_path}")
+
+            # --- Generate Markdown ---
+            # The generator yields "Status:" strings and finally the Markdown itself
+            generator = generate_markdown_for_repo(str(repo_path), combined_patterns, max_file_size_kb, include_content)
+            while True:
+                try:
+                    status_or_result = next(generator)
+                    if status_or_result.startswith("Status:"):
+                        yield _status(status_or_result) # Forward status updates
+                    else:
+                        output_md = status_or_result # Final result
+                        break # Exit loop once markdown is generated
+                except StopIteration:
+                    # Should have received the final result before StopIteration
+                    logging.error("Markdown generator finished unexpectedly without yielding final result.")
+                    raise RuntimeError("Markdown generation failed internally.")
+
+            # Save markdown to a temporary file for download
+            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", encoding='utf-8', prefix="repo_analysis_") as f:
+                f.write(output_md)
+                output_file_path = f.name
+            # Final success triple: status, rendered markdown, visible download link
+            yield (f"Status: Analysis complete. Output saved to {Path(output_file_path).name}",
+                   output_md,
+                   gr.update(value=output_file_path, visible=True))

     except Exception as e:
+        logging.error(f"An error occurred during processing: {e}", exc_info=True)
         error_message = f"An error occurred: {e}"
+        # Final error triple: status, error markdown, hidden download link
+        yield (f"Status: Error - {error_message}",
+               f"### Operation Failed\n\n```\n{error_message}\n```",
+               gr.update(visible=False))
+        output_file_path = None

     finally:
+        # Temp directory is cleaned up automatically by the 'with' statement;
+        # the download value was already delivered by the final yield above.
+        end_time = time.time()
+        logging.info(f"Total processing time: {end_time - start_time:.2f} seconds.")

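One Gradio detail worth calling out here: when a generator drives an event with several outputs, each yield must supply one value per output component, and gr.update() leaves a component unchanged; that is why the function above yields triples. A stripped-down sketch of the pattern (component names are illustrative):

import gradio as gr

def slow_task():
    # outputs=[status_box, result_box]: every yield is a 2-tuple
    yield "working...", gr.update()   # update only the status
    yield "done", "final result"      # final values for both outputs

with gr.Blocks() as sketch:
    status_box = gr.Textbox(label="Status")
    result_box = gr.Textbox(label="Result")
    gr.Button("Run").click(fn=slow_task, inputs=None, outputs=[status_box, result_box])

sketch.queue()  # queueing is required for generator-backed events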
 # --- Gradio Interface ---

 css = """
+body { font-family: sans-serif; }
+#md_output_panel { /* Style the output panel */
+    max-height: 80vh;
+}
 #md_output {
+    max-height: 70vh; /* Adjust max height for content */
+    overflow: auto;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    padding: 15px;
+    background-color: #f9f9f9;
+}
+#md_output h1 { font-size: 1.6em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 0;}
+#md_output h2 { font-size: 1.3em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 20px; }
+#md_output h3 { font-size: 1.1em; margin-top: 15px; margin-bottom: 5px; color: #333; }
+#md_output code { background-color: #eee; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
+#md_output pre { background-color: #fff; padding: 10px; border-radius: 4px; border: 1px solid #ddd; white-space: pre-wrap; word-wrap: break-word; }
+#md_output pre > code { display: block; padding: 0; background-color: transparent; border: none; font-size: 0.9em;} /* Better code block styling */
+
+#status_box {
+    font-size: 0.9em;
+    color: #555;
+    padding: 8px;
+    border: 1px dashed #ddd;
+    background-color: #fafafa;
+    border-radius: 4px;
+    min-height: 3em; /* Keep it visible even with short messages */
+    margin-top: 10px;
 }
+#copy_button { /* Style the copy button */
+    margin-left: 10px;
+    min-width: 100px; /* Give it a bit more width */
 }
+#download_output { margin-top: 15px; }
+footer { display: none !important; }
+.gradio-container { max-width: 1360px !important; margin: auto !important; }
 """

+# --- Helper function for Copy Button ---
+def copy_to_clipboard(text):
+    if PYPERCLIP_AVAILABLE and text:
+        try:
+            pyperclip.copy(text)
+            logging.info("Copied output to clipboard.")
+            return gr.update(value="Copied!", variant="secondary") # Temporary feedback
+        except Exception as e:
+            logging.error(f"Failed to copy to clipboard: {e}")
+            return gr.update(value="Copy Failed", variant="stop")
+    elif not PYPERCLIP_AVAILABLE:
+        logging.warning("Copy attempt failed: pyperclip not installed.")
+        return gr.update(value="Install Pyperclip", variant="stop")
+    else: # No text to copy
+        return gr.update(value="Nothing to Copy", variant="secondary")
+
+def reset_copy_button():
+    # Short delay before resetting button appearance
+    time.sleep(1.5)
+    return gr.update(value="Copy Markdown", variant="secondary")
+
+
+with gr.Blocks(css=css, title="Repo Analyzer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Repository Analyzer")
     gr.Markdown(
         "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
+        "to generate a single Markdown file containing its structure and optionally file contents. "
+        "Provides real-time status updates."
     )

     with gr.Row():
+        # --- Input Column ---
+        with gr.Column(scale=1):
+            gr.Markdown("### Input Source & Options")
             input_type = gr.Radio(
                 ["URL", "Upload ZIP"], label="Input Source", value="URL"
             )

+            # URL-specific inputs (conditionally visible)
             url_input = gr.Textbox(
                 label="Git Repository URL",
+                placeholder="e.g., https://github.com/gradio-app/gradio.git or git@github.com:user/repo.git",
+                visible=True, interactive=True, elem_id="url-input"
+            )
+            git_branch_input = gr.Textbox(
+                label="Branch / Tag (Optional)",
+                placeholder="e.g., main, develop, v1.2.3 (leave empty for default)",
+                visible=True, interactive=True, elem_id="git-branch-input"
             )

+            # ZIP-specific inputs (conditionally visible)
             zip_input = gr.File(
                 label="Upload Local Folder (as .zip)",
                 file_types=[".zip"],
+                visible=False, interactive=True, elem_id="zip-input"
             )

+            # --- Common Options in Accordion ---
+            with gr.Accordion("Configuration Options", open=False):
+                include_content_checkbox = gr.Checkbox(
+                    label="Include File Content in Output",
+                    value=True,
+                    info="Generate structure only if unchecked."
+                )
+                max_size_input = gr.Number(
+                    label="Max File Size for Content (KB)",
+                    value=DEFAULT_MAX_FILE_SIZE_KB, minimum=0, step=64, precision=0,
+                    info="Files larger than this won't have content included (if enabled). 0 disables content.",
+                )
+                ignore_input = gr.Textbox(
+                    label="Ignore Patterns (comma-separated, gitignore style)",
+                    value=", ".join(DEFAULT_IGNORE_PATTERNS),
+                    placeholder="e.g., .git/, *.log, node_modules/",
+                    info="Uses gitignore syntax. Add `/` for directories. Defaults provided.",
+                    lines=5, max_lines=15
+                )
+
+            submit_btn = gr.Button("Analyze Repository", variant="primary")
+
+            gr.Markdown("### Status Updates")
+            status_output = gr.Textbox(label="Current Status", value="Idle.", interactive=False, lines=3, elem_id="status_box")


+        # --- Output Column ---
+        with gr.Column(scale=2):
+            gr.Markdown("### Generated Output")
+            with gr.Row(elem_id="output_header_row"):
+                copy_button = gr.Button("Copy Markdown", variant="secondary", elem_id="copy_button", visible=PYPERCLIP_AVAILABLE) # Hide if pyperclip missing
+                download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output", scale=1) # Take less space initially
+
+            md_output = gr.Markdown(value="*Awaiting analysis results...*", elem_id="md_output", visible=True)


+    # --- Event Handlers ---

+    # Update visibility based on input type choice
     def update_input_visibility(choice):
+        is_url = choice == "URL"
+        return {
+            url_input: gr.update(visible=is_url),
+            git_branch_input: gr.update(visible=is_url),
+            zip_input: gr.update(visible=not is_url)
+        }

     input_type.change(
         fn=update_input_visibility,
         inputs=input_type,
+        outputs=[url_input, git_branch_input, zip_input],
+        queue=False # UI-only change
     )

+    # Main processing logic on submit
     submit_btn.click(
+        fn=repo_to_md_processor, # The generator function
         inputs=[
+            input_type, url_input, zip_input, git_branch_input,
+            ignore_input, max_size_input, include_content_checkbox,
         ],
+        # Each yield supplies one value per output: (status, markdown, download file)
+        outputs=[status_output, md_output, download_output],
+        api_name="repo_to_md"
     )

+    # Copy button functionality
+    if PYPERCLIP_AVAILABLE:
+        copy_button.click(
+            fn=copy_to_clipboard,
+            inputs=[md_output], # Takes the current markdown content
+            outputs=[copy_button], # Updates its own text/appearance
+            queue=False
+        ).then(
+            fn=reset_copy_button, # Function to reset button after a delay
+            inputs=None,
+            outputs=[copy_button],
+            queue=False # Don't queue the reset visual change
+        )
+
 # Launch the interface
 if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", show_error=True, debug=True) # Enable queue & debug for better testing
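With api_name="repo_to_md" exposed, the app can also be called programmatically; a sketch using the gradio_client package (the URL and argument values are assumptions, and the positional arguments follow the inputs list above):

from gradio_client import Client

client = Client("http://127.0.0.1:7860")
job = client.submit(
    "URL",                                       # input_type
    "https://github.com/gradio-app/gradio.git",  # repo_url
    None,                                        # uploaded_zip
    "main",                                      # git_branch
    "",                                          # extra ignore patterns
    256,                                         # max file size in KB
    True,                                        # include file content
    api_name="/repo_to_md",
)
status, markdown, download_path = job.result()   # final (status, markdown, file) triple
print(status)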