# Hugging Face Space: GitHub Repository -> Markdown Converter (Gradio app)
# Standard library
import os
import pathlib
import shutil
import subprocess
import tempfile
import traceback  # Full tracebacks make debugging clone/extract failures easier
import zipfile

# Third-party
import gradio as gr
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern

# --- Configuration ---
# Gitignore-syntax patterns always applied, in addition to user-supplied ones.
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""

MAX_OUTPUT_LINES = 10000  # Limit potential output size in display
INDENT_CHAR = "    "  # 4 spaces for indentation (restored: extraction collapsed it to one space)
# NOTE(review): both icons were mojibaked to "π" in the extracted source, making
# folders and files indistinguishable; restored to the obvious emoji — confirm intent.
FOLDER_ICON = "\U0001F4C1"  # folder emoji
FILE_ICON = "\U0001F4C4"    # page/document emoji
# --- Core Logic ---
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clone (URL mode) or extract (ZIP mode) a repository into a fresh temp dir.

    Args:
        source_type: "URL" or "Upload ZIP".
        repo_url: Git repository URL (required in URL mode).
        branch_tag: Optional branch/tag to clone; falls back to the default branch.
        zip_file_obj: Gradio file object for the uploaded ZIP (ZIP mode).
        progress: Gradio progress reporter.

    Returns:
        (repo_path, temp_dir): ``repo_path`` is the directory containing the
        repository content; ``temp_dir`` is the parent temporary directory the
        caller is responsible for deleting.

    Raises:
        ValueError: on missing input, clone failure, or an unresolvable root.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # Attempt clone without branch if the specific one failed
                # (the requested ref may simply be the default branch).
                # NOTE(review): a failed clone can leave temp_dir non-empty, which
                # would also make the retry fail — confirm git cleans up here.
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log detailed error
                    # Try to extract a user-friendly message
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging
        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            # SECURITY NOTE: extractall() on an untrusted ZIP is vulnerable to
            # path traversal ("zip slip") on Python < 3.12 — consider validating
            # member names or using a safe extraction filter.
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Detect the common "single top-level directory" ZIP layout
                # (e.g. GitHub's repo-main/ wrapper).
                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p and p.split('/')[0]))
                extract_target = temp_dir
                potential_repo_root = temp_dir
                if len(top_level_dirs) == 1:
                    # Content lives inside temp_dir/<wrapper>/ after extraction
                    zip_ref.extractall(extract_target)
                    potential_repo_root = os.path.join(temp_dir, top_level_dirs[0])
                    print(f"ZIP has single top-level dir: {top_level_dirs[0]}. Potential root: {potential_repo_root}")
                else:
                    # Flat or multi-root layout: extract straight into temp_dir
                    zip_ref.extractall(extract_target)
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Fall back to the extraction target if the single-dir heuristic failed
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging
        else:
            raise ValueError("Invalid source type selected.")
        if not repo_path or not repo_path.is_dir():
            # Dump diagnostics before giving up
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
        # Return both the content path and the parent temp dir for cleanup
        return repo_path, temp_dir
    except Exception as e:
        # Remove the temp directory before propagating so failures don't leak disk
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")
        traceback.print_exc()
        raise  # FIX: bare raise preserves the original traceback (was `raise e`)
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Generate a Markdown rendering of the repository tree.

    Args:
        repo_root_path: Root directory of the cloned/extracted repository.
        include_content: When True, embed file contents in fenced code blocks.
        max_size_kb: Per-file content size limit in KB; 0 disables content.
        ignore_patterns_str: User gitignore-style patterns, merged after
            DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress reporter.

    Returns:
        The complete Markdown document as one string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0

    # --- Prepare ignore patterns: defaults first, then user patterns ---
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    # Drop blank lines and comments
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    # De-duplicate while preserving order (later patterns may override earlier ones)
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging

    # --- Header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")

    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Iterate paths relative to the root so pathspec matching mirrors .gitignore
    all_items = sorted(list(repo_root_path.rglob('*')))
    total_items_estimate = len(all_items)
    items_scanned = 0
    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
        relative_path = item_path.relative_to(repo_root_path)
        path_str_for_match = str(relative_path)
        is_dir = item_path.is_dir()
        # FIX: gitignore directory patterns such as "node_modules/" match only
        # *directory* paths; match_file("node_modules") (no trailing slash) does
        # not match them, so ignored directory entries were still being listed
        # even though their contents were skipped. Append "/" for directories.
        if spec.match_file(path_str_for_match + '/' if is_dir else path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every path, so no recursion pruning needed:
            # each ignored child is matched and skipped individually.
            continue
        # Depth/indentation relative to the repository root
        depth = len(relative_path.parts) - 1
        indent = INDENT_CHAR * depth
        if is_dir:
            # Empty-dir detection under ignores is complex with rglob; always list.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            # Include file content if requested and within limits
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristic binary detection: a NUL byte in the first KB
                            is_binary = False
                            try:
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                if b'\x00' in chunk:
                                    is_binary = True
                            except Exception:
                                pass  # On check failure, proceed as text
                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                # errors='replace' avoids hard failures on bad UTF-8
                                content = item_path.read_text(encoding='utf-8', errors='replace')
                                lang = item_path.suffix.lstrip('.')
                                if not lang:
                                    lang = "text"  # Simple language tag; can be expanded
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                content_lines = content.splitlines()
                                # Cap lines shown in the preview; the downloaded
                                # file built from this string is capped too.
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Rare now that errors='replace' is used above
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content box checked but limit is 0 KB: explain the omission
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
        # Blank separator line between entries for readability.
        # NOTE(review): original indentation was lost in extraction; placed at
        # loop-body level per the source comment "always add it for consistency
        # between file/dir entries" — confirm.
        markdown_lines.append("")

    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Generator driving the Generate button.

    Yields ``(status_text, markdown_preview, download_file_update)`` tuples so
    the UI updates incrementally as each stage completes. Always cleans up the
    temporary repository directory in ``finally``.
    """
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None
    # gr.Number may deliver a float or None; normalize to int up front
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return
    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs
        yield "Preparing...", "", gr.update(value=None, visible=False)

        # 1. Get Repository Path
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        if not repo_root_path:
            # get_repo_path raises on failure; this is a defensive double-check
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)

        # 2. Generate Markdown
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )
        # Truncate the on-screen preview; the downloaded file keeps full content
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"

        # 3. Prepare Output File
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename (keep alnum, '_', '-', '.')
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # FIX: gr.File uses the path *basename* as the download name, so the old
        # NamedTemporaryFile approach gave users a random tmpXXXX.md filename.
        # Write to a file actually named output_filename instead.
        out_dir = tempfile.mkdtemp(prefix="md_output_")
        output_file_path = os.path.join(out_dir, output_filename)
        with open(output_file_path, 'w', encoding='utf-8') as out_f:
            out_f.write(markdown_content)
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
    except ValueError as ve:
        print(f"Value Error: {ve}")
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        print(f"Unexpected Error: {e}")
        traceback.print_exc()  # Full traceback to server logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup of the cloned/extracted repository
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")

    with gr.Row():
        # Left column: source selection and configuration
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            url_input_group = gr.Group(visible=True)  # URL inputs shown by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # ZIP upload hidden by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])

            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(
                label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above."
            )
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")

        # Right column: status, preview, and download
        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Textbox preview: full Markdown rendering can be slow/heavy
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # gr.File provides the final download link once generation finishes
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)

    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        """Show URL inputs for "URL", otherwise show the ZIP upload group."""
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        return gr.update(visible=False), gr.update(visible=True)

    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )
# --- Launch the App ---
if __name__ == "__main__":
    # queue() is required for the generator (streaming yield) handler,
    # especially on HF Spaces. debug=True aids local testing; consider
    # disabling it for a production Space.
    demo.queue().launch(debug=True)