Spaces:
Running
Running
File size: 21,532 Bytes
40184db 3c5f9e7 29606bb 6085d45 29606bb 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 5c9292e 3c5f9e7 29606bb 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 f19e8d1 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 29606bb 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 29606bb 6085d45 29606bb 3c5f9e7 6085d45 29606bb 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 40184db 3c5f9e7 6085d45 3c5f9e7 29606bb 3c5f9e7 40184db 24226a4 3c5f9e7 40184db 3c5f9e7 40184db 29606bb 24226a4 3c5f9e7 6085d45 3c5f9e7 6085d45 3c5f9e7 40184db 29606bb 3c5f9e7 40184db 3c5f9e7 40184db 3c5f9e7 40184db 6085d45 40184db 6085d45 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 |
import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback # Import traceback for better error logging
# --- Configuration ---
# Default ignore patterns (gitignore syntax). These are always merged with any
# user-supplied patterns before matching; lines starting with '#' are stripped.
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""
MAX_OUTPUT_LINES = 10000  # Cap on content lines shown per file in the Markdown preview
INDENT_CHAR = "    "  # 4 spaces per nesting level (was a single space, contradicting this comment)
# FIX: both icons were the mojibake character "π"; restore distinct emoji.
FOLDER_ICON = "\U0001F4C1"  # 📁
FILE_ICON = "\U0001F4C4"    # 📄
# --- Core Logic --- (Keep get_repo_path and generate_markdown_structure as before)
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Materialize the repository locally and return its root path.

    Args:
        source_type: "URL" to shallow-clone with git, "Upload ZIP" to extract
            an uploaded archive.
        repo_url: Git repository URL (required when source_type == "URL").
        branch_tag: Optional branch or tag name; on failure the default
            branch is retried.
        zip_file_obj: Gradio file object exposing a ``.name`` temp-file path
            (required when source_type == "Upload ZIP").
        progress: Gradio progress callback.

    Returns:
        (repo_path, temp_dir): the repository content root (pathlib.Path) and
        the parent temporary directory the CALLER must eventually delete.

    Raises:
        ValueError: On missing inputs, clone failure, invalid source type, or
            when no repository root can be located after extraction.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # Retry without the branch flag: the requested ref may simply
                # not exist and the default branch may still be cloneable.
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log detailed error
                    # Surface a user-friendly message instead of raw git output.
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging
        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # FIX: consider *all* root entries (files AND directories).
                # Previously only names containing '/' were inspected, so an
                # archive holding one folder plus loose root-level files was
                # wrongly treated as single-rooted and the loose files were
                # excluded from the chosen repository root.
                root_entries = {p.split('/')[0] for p in zip_ref.namelist() if p.strip('/')}
                extract_target = temp_dir
                potential_repo_root = temp_dir
                zip_ref.extractall(extract_target)
                if len(root_entries) == 1:
                    # Typical GitHub "repo-main/" wrapper: descend into it.
                    sole_entry = next(iter(root_entries))
                    candidate = os.path.join(temp_dir, sole_entry)
                    if os.path.isdir(candidate):
                        potential_repo_root = candidate
                        print(f"ZIP has single top-level dir: {sole_entry}. Potential root: {potential_repo_root}")
                else:
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Basic sanity check before committing to the candidate root.
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                # Fallback if single-dir logic failed or wasn't applicable.
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging
        else:
            raise ValueError("Invalid source type selected.")
        if not repo_path or not repo_path.is_dir():
            # Dump extra debugging info before failing.
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
        return repo_path, temp_dir  # Repo content path + parent temp dir for cleanup
    except Exception as e:
        # Clean up the temporary directory on error before re-raising.
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")  # Log error
        traceback.print_exc()  # Full traceback for debugging
        # FIX: bare `raise` preserves the original traceback exactly
        # (avoids the extra re-raise frame added by `raise e`).
        raise
def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Render a repository tree (and optionally file contents) as Markdown.

    Args:
        repo_root_path: Root directory of the cloned/extracted repository.
        include_content: If True, embed file contents in fenced code blocks.
        max_size_kb: Per-file size cap (KB) for embedded content; 0 disables
            content inclusion entirely even when ``include_content`` is True.
        ignore_patterns_str: User-supplied gitignore-style patterns, merged
            with DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress callback.

    Returns:
        The complete Markdown document as a single string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Tolerate a str argument
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0
    # --- Prepare ignore patterns: defaults + user input, comments stripped ---
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    # De-duplicate while preserving order (later patterns can override earlier ones).
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging
    # --- Header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")
    # --- Walk the tree ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Iterate paths relative to the root, as pathspec matching expects.
    all_items = sorted(list(repo_root_path.rglob('*')))
    total_items_estimate = len(all_items)
    items_scanned = 0
    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Throttle progress updates
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
        relative_path = item_path.relative_to(repo_root_path)
        path_str_for_match = str(relative_path)
        # FIX: gitignore directory patterns (e.g. "/node_modules/") only match
        # paths ending in '/'. Without this suffix the directory's *contents*
        # were skipped but the directory heading itself was still emitted.
        if item_path.is_dir():
            path_str_for_match += "/"
        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every descendant individually, so skipping
            # this single path is sufficient — no recursion pruning needed.
            continue
        # Depth (0-based relative to root content) drives indentation.
        depth = len(relative_path.parts) - 1
        indent = INDENT_CHAR * depth
        if item_path.is_dir():
            # Empty-dir detection under ignores + rglob is complex; always
            # list the directory.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            # Embed content only when requested and a positive cap is set.
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristic binary check: a NUL byte in the first
                            # KB marks the file as binary.
                            is_binary = False
                            try:
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                if b'\x00' in chunk:
                                    is_binary = True
                            except Exception:
                                # Best-effort check; on error treat as text.
                                pass
                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                content = item_path.read_text(encoding='utf-8', errors='replace')  # Replace errors instead of failing
                                lang = item_path.suffix.lstrip('.')
                                # Simple language tag from the extension.
                                if not lang: lang = "text"
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                content_lines = content.splitlines()
                                # Truncate very long files in the preview; the
                                # downloaded file still has full content.
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Rare with errors='replace', but kept defensively.
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content requested but the size cap disables it.
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
        # Blank separator line between entries for readability.
        markdown_lines.append("")
    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown
# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Generator driving the whole pipeline from the Gradio button.

    Yields (status_text, markdown_preview, download_file_update) tuples that
    Gradio streams into the three output components. Fetches the repository,
    renders the Markdown, writes it to a temp file, and always cleans up the
    fetched repository directory in ``finally``.
    """
    status = ""
    output_markdown = ""
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None
    # Coerce max_size_kb to int up front so later comparisons are numeric
    # (gr.Number may hand back a float or None).
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return
    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs (clears any previous run).
        yield "Preparing...", "", gr.update(value=None, visible=False)
        # 1. Get Repository Path (clone or extract; returns dir to clean up).
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        # get_repo_path raises on failure; this is a defensive double-check.
        if not repo_root_path:
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)
        # 2. Generate Markdown from the repository tree.
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )
        # Truncate the preview so huge outputs don't overwhelm the UI; the
        # downloadable file keeps the full content.
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"
        # 3. Prepare the downloadable output file.
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename (replace anything but alnum/_/-/.) — low risk in
        # a temp-file context but keeps the download label tidy.
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # delete=False: Gradio serves this file after the context closes.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file
        # Final yield reveals the download component with the finished file.
        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
    except ValueError as ve:
        # Expected, user-facing errors (bad URL, missing input, ...).
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        # Defensive: get_repo_path uses check=False, but keep this in case a
        # future change re-enables check=True.
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        # Last-resort handler so the UI reports rather than hangs.
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Full traceback to the logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup — always remove the cloned/extracted repository.
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")
# --- Build Gradio UI --- (Keep as before)
# Layout: two columns — inputs/configuration on the left, status/output on
# the right. `demo` is launched by the __main__ guard below.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            # Two mutually-exclusive input groups; visibility is toggled by
            # the input_source radio (see toggle_input_visibility below).
            url_input_group = gr.Group(visible=True)  # Show URL by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # Hide ZIP by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])
            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                                       info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above.")
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Textbox preview (not gr.Markdown): rendering large Markdown is slow.
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # gr.File holds the final download link; hidden until a file exists.
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)
    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        """Show the URL group or the ZIP group depending on the radio choice."""
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # ZIP
            return gr.update(visible=False), gr.update(visible=True)
    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )
    # process_repo is a generator, so status/preview/download update live.
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )
# --- Launch the App --- (Keep as before)
if __name__ == "__main__":
    # The queue is required for generator-based outputs (process_repo yields
    # incremental updates) and for Hugging Face Spaces deployment.
    app = demo.queue()
    # debug=True is handy for local testing; consider False for a production Space.
    app.launch(debug=True)