# Hugging Face Space: GitHub Repository -> Markdown Converter (Gradio app)
# Standard library
import os
import pathlib
import shutil
import subprocess
import tempfile
import traceback  # Full tracebacks make debugging clone/extract failures easier
import zipfile

# Third-party
import gradio as gr
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern

# --- Configuration ---
# Gitignore-syntax patterns always applied, in addition to user-supplied ones.
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""

MAX_OUTPUT_LINES = 10000  # Limit potential output size in display
INDENT_CHAR = "    "  # 4 spaces for indentation (restored: extraction collapsed it to one space)
# NOTE(review): both icons were mojibaked to "π" in the extracted source, making
# folders and files indistinguishable; restored to the obvious emoji — confirm intent.
FOLDER_ICON = "\U0001F4C1"  # folder emoji
FILE_ICON = "\U0001F4C4"    # page/document emoji
# --- Core Logic ---
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clone (URL mode) or extract (ZIP mode) a repository into a fresh temp dir.

    Args:
        source_type: "URL" or "Upload ZIP".
        repo_url: Git repository URL (required in URL mode).
        branch_tag: Optional branch/tag to clone; falls back to the default branch.
        zip_file_obj: Gradio file object for the uploaded ZIP (ZIP mode).
        progress: Gradio progress reporter.

    Returns:
        (repo_path, temp_dir): ``repo_path`` is the directory containing the
        repository content; ``temp_dir`` is the parent temporary directory the
        caller is responsible for deleting.

    Raises:
        ValueError: on missing input, clone failure, or an unresolvable root.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # Attempt clone without branch if the specific one failed
                # (the requested ref may simply be the default branch).
                # NOTE(review): a failed clone can leave temp_dir non-empty, which
                # would also make the retry fail — confirm git cleans up here.
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log detailed error
                    # Try to extract a user-friendly message
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging
        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            # SECURITY NOTE: extractall() on an untrusted ZIP is vulnerable to
            # path traversal ("zip slip") on Python < 3.12 — consider validating
            # member names or using a safe extraction filter.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Detect the common "single top-level directory" ZIP layout
                # (e.g. GitHub's repo-main/ wrapper).
                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p and p.split('/')[0]))
                extract_target = temp_dir
                potential_repo_root = temp_dir
                if len(top_level_dirs) == 1:
                    # Content lives inside temp_dir/<wrapper>/ after extraction
                    zip_ref.extractall(extract_target)
                    potential_repo_root = os.path.join(temp_dir, top_level_dirs[0])
                    print(f"ZIP has single top-level dir: {top_level_dirs[0]}. Potential root: {potential_repo_root}")
                else:
                    # Flat or multi-root layout: extract straight into temp_dir
                    zip_ref.extractall(extract_target)
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Fall back to the extraction target if the single-dir heuristic failed
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging
        else:
            raise ValueError("Invalid source type selected.")
        if not repo_path or not repo_path.is_dir():
            # Dump diagnostics before giving up
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
        # Return both the content path and the parent temp dir for cleanup
        return repo_path, temp_dir
    except Exception as e:
        # Remove the temp directory before propagating so failures don't leak disk
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")
        traceback.print_exc()
        raise  # FIX: bare raise preserves the original traceback (was `raise e`)
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Generate a Markdown rendering of the repository tree.

    Args:
        repo_root_path: Root directory of the cloned/extracted repository.
        include_content: When True, embed file contents in fenced code blocks.
        max_size_kb: Per-file content size limit in KB; 0 disables content.
        ignore_patterns_str: User gitignore-style patterns, merged after
            DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress reporter.

    Returns:
        The complete Markdown document as one string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0

    # --- Prepare ignore patterns: defaults first, then user patterns ---
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    # Drop blank lines and comments
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    # De-duplicate while preserving order (later patterns may override earlier ones)
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging

    # --- Header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")

    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Iterate paths relative to the root so pathspec matching mirrors .gitignore
    all_items = sorted(list(repo_root_path.rglob('*')))
    total_items_estimate = len(all_items)
    items_scanned = 0
    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
        relative_path = item_path.relative_to(repo_root_path)
        path_str_for_match = str(relative_path)
        is_dir = item_path.is_dir()
        # FIX: gitignore directory patterns such as "node_modules/" match only
        # *directory* paths; match_file("node_modules") (no trailing slash) does
        # not match them, so ignored directory entries were still being listed
        # even though their contents were skipped. Append "/" for directories.
        if spec.match_file(path_str_for_match + '/' if is_dir else path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every path, so no recursion pruning needed:
            # each ignored child is matched and skipped individually.
            continue
        # Depth/indentation relative to the repository root
        depth = len(relative_path.parts) - 1
        indent = INDENT_CHAR * depth
        if is_dir:
            # Empty-dir detection under ignores is complex with rglob; always list.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            # Include file content if requested and within limits
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristic binary detection: a NUL byte in the first KB
                            is_binary = False
                            try:
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                if b'\x00' in chunk:
                                    is_binary = True
                            except Exception:
                                pass  # On check failure, proceed as text
                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                # errors='replace' avoids hard failures on bad UTF-8
                                content = item_path.read_text(encoding='utf-8', errors='replace')
                                lang = item_path.suffix.lstrip('.')
                                if not lang:
                                    lang = "text"  # Simple language tag; can be expanded
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                content_lines = content.splitlines()
                                # Cap lines shown in the preview; the downloaded
                                # file built from this string is capped too.
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Rare now that errors='replace' is used above
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content box checked but limit is 0 KB: explain the omission
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
        # Blank separator line between entries for readability.
        # NOTE(review): original indentation was lost in extraction; placed at
        # loop-body level per the source comment "always add it for consistency
        # between file/dir entries" — confirm.
        markdown_lines.append("")

    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Generator driving the Generate button.

    Yields ``(status_text, markdown_preview, download_file_update)`` tuples so
    the UI updates incrementally as each stage completes. Always cleans up the
    temporary repository directory in ``finally``.
    """
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None
    # gr.Number may deliver a float or None; normalize to int up front
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return
    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs
        yield "Preparing...", "", gr.update(value=None, visible=False)

        # 1. Get Repository Path
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        if not repo_root_path:
            # get_repo_path raises on failure; this is a defensive double-check
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)

        # 2. Generate Markdown
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )
        # Truncate the on-screen preview; the downloaded file keeps full content
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"

        # 3. Prepare Output File
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename (keep alnum, '_', '-', '.')
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # FIX: gr.File uses the path *basename* as the download name, so the old
        # NamedTemporaryFile approach gave users a random tmpXXXX.md filename.
        # Write to a file actually named output_filename instead.
        out_dir = tempfile.mkdtemp(prefix="md_output_")
        output_file_path = os.path.join(out_dir, output_filename)
        with open(output_file_path, 'w', encoding='utf-8') as out_f:
            out_f.write(markdown_content)
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
    except ValueError as ve:
        print(f"Value Error: {ve}")
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        print(f"Unexpected Error: {e}")
        traceback.print_exc()  # Full traceback to server logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup of the cloned/extracted repository
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")

    with gr.Row():
        # Left column: source selection and configuration
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            url_input_group = gr.Group(visible=True)  # URL inputs shown by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # ZIP upload hidden by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])

            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(
                label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above."
            )
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")

        # Right column: status, preview, and download
        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Textbox preview: full Markdown rendering can be slow/heavy
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # gr.File provides the final download link once generation finishes
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)

    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        """Show URL inputs for "URL", otherwise show the ZIP upload group."""
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        return gr.update(visible=False), gr.update(visible=True)

    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )
# --- Launch the App ---
if __name__ == "__main__":
    # queue() is required for the generator (streaming yield) handler,
    # especially on HF Spaces. debug=True aids local testing; consider
    # disabling it for a production Space.
    demo.queue().launch(debug=True)