Spaces:
Running
Running
File size: 21,532 Bytes
40184db 3c5f9e7 29606bb 6085d45 29606bb 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 5c9292e 3c5f9e7 29606bb 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 f19e8d1 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 29606bb 3c5f9e7 6085d45 29606bb 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 40184db 24226a4 3c5f9e7 40184db 3c5f9e7 40184db 29606bb 24226a4 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 40184db 29606bb 3c5f9e7 40184db 3c5f9e7 40184db 3c5f9e7 40184db 6085d45 40184db 6085d45 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 |
import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback # Import traceback for better error logging
# --- Configuration ---
# Default ignore patterns (gitignore syntax). These are always merged with any
# user-supplied patterns before matching; lines starting with '#' are stripped.
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""
MAX_OUTPUT_LINES = 10000  # Cap on content lines shown per file in the Markdown preview
INDENT_CHAR = "    "  # 4 spaces per nesting level (was a single space, contradicting this comment)
# FIX: both icons were the mojibake character "π"; restore distinct emoji.
FOLDER_ICON = "\U0001F4C1"  # 📁
FILE_ICON = "\U0001F4C4"    # 📄
# --- Core Logic --- (Keep get_repo_path and generate_markdown_structure as before)
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Materialize the repository locally and return its root path.

    Args:
        source_type: "URL" to shallow-clone with git, "Upload ZIP" to extract
            an uploaded archive.
        repo_url: Git repository URL (required when source_type == "URL").
        branch_tag: Optional branch or tag name; on failure the default
            branch is retried.
        zip_file_obj: Gradio file object exposing a ``.name`` temp-file path
            (required when source_type == "Upload ZIP").
        progress: Gradio progress callback.

    Returns:
        (repo_path, temp_dir): the repository content root (pathlib.Path) and
        the parent temporary directory the CALLER must eventually delete.

    Raises:
        ValueError: On missing inputs, clone failure, invalid source type, or
            when no repository root can be located after extraction.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # Retry without the branch flag: the requested ref may simply
                # not exist and the default branch may still be cloneable.
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log detailed error
                    # Surface a user-friendly message instead of raw git output.
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging
        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # FIX: consider *all* root entries (files AND directories).
                # Previously only names containing '/' were inspected, so an
                # archive holding one folder plus loose root-level files was
                # wrongly treated as single-rooted and the loose files were
                # excluded from the chosen repository root.
                root_entries = {p.split('/')[0] for p in zip_ref.namelist() if p.strip('/')}
                extract_target = temp_dir
                potential_repo_root = temp_dir
                zip_ref.extractall(extract_target)
                if len(root_entries) == 1:
                    # Typical GitHub "repo-main/" wrapper: descend into it.
                    sole_entry = next(iter(root_entries))
                    candidate = os.path.join(temp_dir, sole_entry)
                    if os.path.isdir(candidate):
                        potential_repo_root = candidate
                        print(f"ZIP has single top-level dir: {sole_entry}. Potential root: {potential_repo_root}")
                else:
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Basic sanity check before committing to the candidate root.
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                # Fallback if single-dir logic failed or wasn't applicable.
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging
        else:
            raise ValueError("Invalid source type selected.")
        if not repo_path or not repo_path.is_dir():
            # Dump extra debugging info before failing.
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
        return repo_path, temp_dir  # Repo content path + parent temp dir for cleanup
    except Exception as e:
        # Clean up the temporary directory on error before re-raising.
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")  # Log error
        traceback.print_exc()  # Full traceback for debugging
        # FIX: bare `raise` preserves the original traceback exactly
        # (avoids the extra re-raise frame added by `raise e`).
        raise
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Render a repository tree (and optionally file contents) as Markdown.

    Args:
        repo_root_path: Root directory of the cloned/extracted repository.
        include_content: If True, embed file contents in fenced code blocks.
        max_size_kb: Per-file size cap (KB) for embedded content; 0 disables
            content inclusion entirely even when ``include_content`` is True.
        ignore_patterns_str: User-supplied gitignore-style patterns, merged
            with DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress callback.

    Returns:
        The complete Markdown document as a single string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Tolerate a str argument
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0
    # --- Prepare ignore patterns: defaults + user input, comments stripped ---
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    # De-duplicate while preserving order (later patterns can override earlier ones).
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging
    # --- Header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")
    # --- Walk the tree ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Iterate paths relative to the root, as pathspec matching expects.
    all_items = sorted(list(repo_root_path.rglob('*')))
    total_items_estimate = len(all_items)
    items_scanned = 0
    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Throttle progress updates
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
        relative_path = item_path.relative_to(repo_root_path)
        path_str_for_match = str(relative_path)
        # FIX: gitignore directory patterns (e.g. "/node_modules/") only match
        # paths ending in '/'. Without this suffix the directory's *contents*
        # were skipped but the directory heading itself was still emitted.
        if item_path.is_dir():
            path_str_for_match += "/"
        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every descendant individually, so skipping
            # this single path is sufficient — no recursion pruning needed.
            continue
        # Depth (0-based relative to root content) drives indentation.
        depth = len(relative_path.parts) - 1
        indent = INDENT_CHAR * depth
        if item_path.is_dir():
            # Empty-dir detection under ignores + rglob is complex; always
            # list the directory.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            # Embed content only when requested and a positive cap is set.
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristic binary check: a NUL byte in the first
                            # KB marks the file as binary.
                            is_binary = False
                            try:
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                if b'\x00' in chunk:
                                    is_binary = True
                            except Exception:
                                # Best-effort check; on error treat as text.
                                pass
                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                content = item_path.read_text(encoding='utf-8', errors='replace')  # Replace errors instead of failing
                                lang = item_path.suffix.lstrip('.')
                                # Simple language tag from the extension.
                                if not lang: lang = "text"
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                content_lines = content.splitlines()
                                # Truncate very long files in the preview; the
                                # downloaded file still has full content.
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Rare with errors='replace', but kept defensively.
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content requested but the size cap disables it.
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
        # Blank separator line between entries for readability.
        markdown_lines.append("")
    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Generator driving the whole pipeline from the Gradio button.

    Yields (status_text, markdown_preview, download_file_update) tuples that
    Gradio streams into the three output components. Fetches the repository,
    renders the Markdown, writes it to a temp file, and always cleans up the
    fetched repository directory in ``finally``.
    """
    status = ""
    output_markdown = ""
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None
    # Coerce max_size_kb to int up front so later comparisons are numeric
    # (gr.Number may hand back a float or None).
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return
    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs (clears any previous run).
        yield "Preparing...", "", gr.update(value=None, visible=False)
        # 1. Get Repository Path (clone or extract; returns dir to clean up).
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        # get_repo_path raises on failure; this is a defensive double-check.
        if not repo_root_path:
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)
        # 2. Generate Markdown from the repository tree.
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )
        # Truncate the preview so huge outputs don't overwhelm the UI; the
        # downloadable file keeps the full content.
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"
        # 3. Prepare the downloadable output file.
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename (replace anything but alnum/_/-/.) — low risk in
        # a temp-file context but keeps the download label tidy.
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # delete=False: Gradio serves this file after the context closes.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file
        # Final yield reveals the download component with the finished file.
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
    except ValueError as ve:
        # Expected, user-facing errors (bad URL, missing input, ...).
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        # Defensive: get_repo_path uses check=False, but keep this in case a
        # future change re-enables check=True.
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        # Last-resort handler so the UI reports rather than hangs.
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Full traceback to the logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup — always remove the cloned/extracted repository.
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI --- (Keep as before)
# Layout: two columns — inputs/configuration on the left, status/output on
# the right. `demo` is launched by the __main__ guard below.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            # Two mutually-exclusive input groups; visibility is toggled by
            # the input_source radio (see toggle_input_visibility below).
            url_input_group = gr.Group(visible=True)  # Show URL by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # Hide ZIP by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])
            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                                       info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above.")
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Textbox preview (not gr.Markdown): rendering large Markdown is slow.
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # gr.File holds the final download link; hidden until a file exists.
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)
    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        """Show the URL group or the ZIP group depending on the radio choice."""
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # ZIP
            return gr.update(visible=False), gr.update(visible=True)
    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )
    # process_repo is a generator, so status/preview/download update live.
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )
# --- Launch the App --- (Keep as before)
if __name__ == "__main__":
    # The queue is required for generator-based outputs (process_repo yields
    # incremental updates) and for Hugging Face Spaces deployment.
    app = demo.queue()
    # debug=True is handy for local testing; consider False for a production Space.
    app.launch(debug=True)