Update app.py

app.py CHANGED
@@ -8,452 +8,660 @@ import fnmatch
 from pathlib import Path
 from pygments.lexers import guess_lexer_for_filename
 from pygments.util import ClassNotFound

 # --- Configuration ---
 DEFAULT_IGNORE_PATTERNS = [
-    …
 ]
 DEFAULT_MAX_FILE_SIZE_KB = 1024 # 1 MB limit for file content inclusion

 # --- Core Logic ---

-def should_ignore(…
-    """Checks if a path …
     try:
-        relative_path = …
-        …
         for pattern in ignore_patterns:
-            …
-            if …
                 return True
-            # …
-                #print(f"Ignoring directory '{relative_path}' due to pattern '{pattern}'")
-                return True
-            # Check if the path starts with the directory pattern
-            if str(relative_path).startswith(pattern_clean + os.sep):
-                #print(f"Ignoring path '{relative_path}' within ignored dir '{pattern}'")
-                return True
-            # Handle cases where pattern might match a parent directory implicitly
-            if pattern.endswith('/'):
-                # Check if any parent directory name matches the pattern
-                for parent in relative_path.parents:
-                    if parent.name == pattern_clean:
-                        #print(f"Ignoring '{relative_path}' due to parent match on '{pattern}'")
-                        return True
-                # This logic might be complex, reconsider if fnmatch covers enough
-                # Simplified: Check if the path starts with the pattern directory
-                if str(relative_path).startswith(pattern_clean + os.sep):
-                    #print(f"Ignoring '{relative_path}' due to prefix match on '{pattern}'")
-                    return True

-    except Exception as e:
-        print(f"Warning: Error during ignore pattern matching for '{relative_path}' with pattern '{pattern}': {e}")

-    …

-def is_likely_binary(…
     """Checks if a file is likely binary by reading a chunk."""
     try:
-        with open(…
         chunk = f.read(chunk_size)
-        return b'\0' in chunk
-    except …
-        …

-def get_file_content(file_path: …
-    """
-    Reads file content, detects language, handles size limits and encodings.
-    Returns (content, language, error_message)
-    """
     try:
-        file_size = …
         if file_size > max_size_bytes:
-            …

         if is_likely_binary(file_path):
-            return None, None, "[Content skipped: Detected as binary file]"

-        …
         try:
-            with open(…
             content = f.read()
         except UnicodeDecodeError:
             try:
-                with open(…
                 content = f.read()
             except Exception as e_read:
                 return None, None, f"[Content skipped: Error reading file - {e_read}]"

-        …
         try:
-            lexer = guess_lexer_for_filename(file_path, content)
             language = lexer.aliases[0] if lexer.aliases else lexer.name
         except ClassNotFound:
-            language = "" # …

         return content, language, None

     except Exception as e:
-        …

-# MODIFIED: Added include_content parameter
-def generate_markdown_for_repo(repo_path: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool) -> str:
-    """Generates Markdown content for the repository structure and optionally files."""
-    repo_root = Path(repo_path).resolve()
-    md_content = ["# Repository Structure and Content\n\n"]
-    file_contents_md = []
-    max_size_bytes = max_file_size_kb * 1024
-    …
-    md_content.append("## Directory Structure\n\n```\n")
     structure_lines = []
-    …
-        # --- Ignore directories based on patterns ---
-        original_dirs = list(dirs) # Copy because we modify dirs list
-        dirs[:] = [d for d in original_dirs if not should_ignore(str(root_path / d), ignore_patterns, repo_root)]
-        …

     # --- Pass 2: Process file contents (ONLY if requested) ---
-    if …
-        for root, dirs, files in os.walk(repo_path, topdown=True):
-            root_path = Path(root).resolve()
-            …
             try:
-                …

     repo_path = None
     output_md = ""
     output_file_path = None
     error_message = None
-    ignore_patterns = [p.strip() for p in ignore_patterns_str.split(',') if p.strip()]
-    # Ensure default patterns are added only if they aren't already covered by user patterns
-    # A simple way is just to combine and remove duplicates
-    combined_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS
-    ignore_patterns = sorted(list(set(combined_patterns))) # Keep unique and sort for consistency if needed

     try:
-        …
-        if input_type == "URL":
-            if not repo_url or not (repo_url.startswith("http://") or repo_url.startswith("https://")):
-                raise ValueError("Invalid Git URL provided. Must start with http:// or https://")
-            print(f"Cloning repository: {repo_url}")
-            try:
-                # Attempt sparse checkout if available (modern Git)
-                # This might fail on older Git versions, hence the fallback
-                subprocess.run(
-                    ["git", "clone", "--depth", "1", "--filter=blob:none", "--no-checkout", repo_url, temp_dir],
-                    check=True, capture_output=True, text=True, timeout=60
-                )
-                subprocess.run(["git", "sparse-checkout", "init", "--cone"], cwd=temp_dir, check=True, capture_output=True, text=True)
-                subprocess.run(["git", "checkout"], cwd=temp_dir, check=True, capture_output=True, text=True, timeout=120)
-                print("Cloning successful (sparse/filtered).")
-            except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
-                print(f"Sparse clone failed ({e_sparse}), attempting standard shallow clone...")
-                # Fallback to standard shallow clone
-                shutil.rmtree(temp_dir) # Clean up failed attempt
-                temp_dir = tempfile.mkdtemp() # Recreate temp dir
                 try:
-                    …
-                except subprocess.TimeoutExpired:
-                    …

             try:
-                …
             except Exception as e_extract:
                 raise RuntimeError(f"Failed to extract ZIP file: {e_extract}")

-            …
-            if ".DS_Store" in extracted_items:
-                extracted_items.remove(".DS_Store")
-            # Check for __MACOSX folder often created by macOS archiver
-            if "__MACOSX" in extracted_items and os.path.isdir(os.path.join(temp_dir, "__MACOSX")):
-                extracted_items.remove("__MACOSX")

-            if len(extracted_items) == 1 and os.path.isdir(os.path.join(temp_dir, extracted_items[0])):
-                repo_path = os.path.join(temp_dir, extracted_items[0])
-                print(f"Detected repo root inside ZIP: {extracted_items[0]}")
         else:
-            …

     except Exception as e:
         error_message = f"An error occurred: {e}"
-        …
-        output_file_path = None

     finally:
-        # …
-        # Gradio's gr.File component should manage its temp files when value is updated.
-        # If we created the temp file (output_file_path) AND there was NO error,
-        # we might need manual cleanup later if Gradio doesn't handle it, but usually it does.
-        # Let's assume Gradio handles the download file cleanup for now.
-        # Return values for Gradio outputs
-        # Always return a string for md_output (either the result or the error message)
-        # Return the file path for download only on success, otherwise None (or an invisible File update)
-        if output_file_path:
-            return output_md, gr.File(value=output_file_path, visible=True)
-        else:
-            # If there was an error, output_md contains the error message
-            # And we hide the download button
-            return output_md, gr.File(visible=False)

 # --- Gradio Interface ---

 css = """
 #md_output {
-    max-height: 70vh; /* Adjust max height …
-    overflow…
-    border: 1px solid #…
-    …
 }
-#…
-    margin-…
 }
-…
 """

     gr.Markdown(
         "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
-        "to generate a single Markdown file containing its structure and optionally file contents."
     )

     with gr.Row():
-        …
         input_type = gr.Radio(
             ["URL", "Upload ZIP"], label="Input Source", value="URL"
         )

-        # …
         url_input = gr.Textbox(
             label="Git Repository URL",
-            placeholder="e.g., https://github.com/gradio-app/gradio.git",
-            visible=True,
-            …
         )

-        # …
         zip_input = gr.File(
             label="Upload Local Folder (as .zip)",
             file_types=[".zip"],
-            visible=False,
-            interactive=True,
-            # Use file_count='single' explicitly if needed, though default
         )

-        # --- …
-        …

-        # --- ADDED: Checkbox for content inclusion ---
-        include_content_checkbox = gr.Checkbox(
-            label="Include File Content",
-            value=True, # Default to including content
-            info="Uncheck to generate only the directory structure."
-        )
-        # --- End Added Checkbox ---

-    with gr.Column(scale=3):
-        gr.Markdown("## Generated Output")
-        md_output = gr.Markdown(elem_id="md_output", value="*Markdown output will appear here...*")
-        download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output")

-    # …
     def update_input_visibility(choice):
-        …

     input_type.change(
         fn=update_input_visibility,
         inputs=input_type,
-        outputs=[url_input, zip_input]
     )

-    # MODIFIED: Added include_content_checkbox to inputs
     submit_btn.click(
-        fn=…
         inputs=[
-            input_type,
-            …
-            zip_input,
-            ignore_input,
-            max_size_input,
-            include_content_checkbox, # Pass the checkbox state
         ],
-        …
-        api_name="repo_to_md" # For API access if needed
     )

 # Launch the interface
 if __name__ == "__main__":
-    …
-    demo.launch()
 from pathlib import Path
 from pygments.lexers import guess_lexer_for_filename
 from pygments.util import ClassNotFound
+import logging
+import time
+import math
+
+# Try importing pyperclip, provide instructions if missing
+try:
+    import pyperclip
+    PYPERCLIP_AVAILABLE = True
+except ImportError:
+    PYPERCLIP_AVAILABLE = False
+    logging.warning("pyperclip library not found. 'Copy to Clipboard' functionality will be disabled. Install with: pip install pyperclip")
+

 # --- Configuration ---
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
 DEFAULT_IGNORE_PATTERNS = [
+    # Version Control
+    ".git/", ".gitignore", ".gitattributes", ".gitmodules", ".svn/", ".hg/",
+    # IDE/Editor Files
+    ".idea/", ".vscode/", "*.sublime-project", "*.sublime-workspace", ".project", ".classpath", "*.tmproj", ".DS_Store", "Thumbs.db",
+    # Build Outputs & Dependencies
+    "build/", "dist/", "bin/", "obj/", "out/", "target/", "*.o", "*.so", "*.dll", "*.exe", "*.class", "*.jar", "*.war", "*.ear",
+    "node_modules/", "bower_components/", "vendor/", "*.egg-info/", "wheels/", "**/__pycache__/", "*.pyc",
+    # Python Virtual Environments
+    ".venv/", "venv/", "env/", ".env", "pip-cache/",
+    # Logs & Temporary Files
+    "*.log", "*.tmp", "*.temp", "*.swp", "*.swo", "*.bak",
+    # OS Generated Files
+    "._*",
+    # Secrets (important!)
+    "*.pem", "*.key", ".env*", "secrets.*", "credentials.*",
+    # Common Framework/Tool cache/temp files
+    ".pytest_cache/", ".tox/", ".mypy_cache/", ".ruff_cache/", "*.ipynb_checkpoints",
+    # macOS-specific ZIP artifact
+    "__MACOSX/",
 ]
 DEFAULT_MAX_FILE_SIZE_KB = 1024 # 1 MB limit for file content inclusion
+CLONE_TIMEOUT_SPARSE = 120 # seconds
+CLONE_TIMEOUT_STANDARD = 300 # seconds
+ZIP_EXTRACT_WARN_THRESHOLD = 1000 # Warn if ZIP contains more than this many files
+MAX_FILES_FOR_DETAILED_PROGRESS = 500 # Only show per-file progress if fewer than this many files

 # --- Core Logic ---

+def should_ignore(path_obj: Path, ignore_patterns: list[str], repo_root: Path) -> bool:
+    """Checks if a file or directory path should be ignored based on gitignore-style patterns."""
     try:
+        relative_path = path_obj.relative_to(repo_root)
+        # Use POSIX paths for consistent pattern matching regardless of OS
+        relative_path_str = relative_path.as_posix()
+    except ValueError:
+        logging.warning(f"Path {path_obj} not relative to root {repo_root}, ignoring.")
+        return True
+
+    # Optimization: Check direct name match first for common ignores like '.git'
+    if path_obj.name in ignore_patterns:
+        return True
+
     for pattern in ignore_patterns:
+        pattern = pattern.strip()
+        if not pattern or pattern.startswith('#'):
+            continue
+
+        # Ensure pattern uses POSIX separators
+        pattern_posix = pattern.replace(os.sep, '/')
+
+        # Case 1: Pattern specifies a directory (ends with '/')
+        if pattern_posix.endswith('/'):
+            # Match if the relative path *is* this directory or starts with it
+            # Example: pattern "build/", path "build" or "build/foo.txt"
+            dir_pattern = pattern_posix.rstrip('/')
+            if relative_path_str == dir_pattern or relative_path_str.startswith(dir_pattern + '/'):
+                return True
+            # Also match if a *directory component* matches the name (like ignoring 'node_modules' anywhere)
+            # Example: pattern "node_modules/", path "src/my_lib/node_modules/some_dep"
+            if path_obj.is_dir() and path_obj.name == dir_pattern:
                 return True
+            # Check parent directories as well
+            for parent in relative_path.parents:
+                if parent.name == dir_pattern:
+                    return True


+        # Case 2: Pattern is a file or general pattern (using fnmatch)
+        # Use fnmatchcase for potentially case-sensitive filesystems if needed,
+        # but fnmatch is often sufficient and more git-like on Win/Mac.
+        if fnmatch.fnmatch(relative_path_str, pattern_posix):
+            return True
+        # Also match just the filename part for patterns like "*.log"
+        if fnmatch.fnmatch(path_obj.name, pattern_posix):
+            return True


+    return False
+
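A quick smoke test of the rewritten matcher, as a reviewer sketch rather than part of the commit (the scratch directory and file names below are made up, and should_ignore from above is assumed to be in scope):

from pathlib import Path

repo = Path("/tmp/demo_repo")
(repo / "node_modules" / "pkg").mkdir(parents=True, exist_ok=True)
(repo / "src").mkdir(exist_ok=True)
(repo / "src" / "app.log").touch()
(repo / "src" / "main.py").touch()

patterns = ["node_modules/", "*.log"]
print(should_ignore(repo / "node_modules" / "pkg", patterns, repo))  # True: parent dir matches "node_modules/"
print(should_ignore(repo / "src" / "app.log", patterns, repo))       # True: "*.log" matches the filename
print(should_ignore(repo / "src" / "main.py", patterns, repo))       # False: no pattern matches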
+def is_likely_binary(file_path: Path, chunk_size=1024) -> bool:
     """Checks if a file is likely binary by reading a chunk."""
     try:
+        with file_path.open('rb') as f:
             chunk = f.read(chunk_size)
+            return b'\0' in chunk
+    except OSError as e:
+        logging.warning(f"Could not read file chunk for binary check {file_path}: {e}")
+        return True

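The NUL-byte heuristic is cheap but coarse; UTF-16 text, which contains NUL bytes, would be flagged as binary. A minimal illustration with throwaway temp files (a sketch, assuming is_likely_binary from above is in scope):

import tempfile
from pathlib import Path

with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"\x00\x01\x02")               # contains a NUL byte
print(is_likely_binary(Path(f.name)))      # True

with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as f:
    f.write("plain text".encode("utf-8"))  # no NUL byte
print(is_likely_binary(Path(f.name)))      # False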
+def get_file_content(file_path: Path, max_size_bytes: int) -> tuple[str | None, str | None, str | None]:
+    """Reads file content, detects language, handles size limits and encodings."""
     try:
+        file_size = file_path.stat().st_size
         if file_size > max_size_bytes:
+            kb_limit = max_size_bytes / 1024
+            kb_actual = file_size / 1024
+            return None, None, f"[Content skipped: File size ({kb_actual:.1f} KB) exceeds limit ({kb_limit:.1f} KB)]"
+
+        if file_size == 0:
+            return "", "", None # Empty file

         if is_likely_binary(file_path):
+            return None, None, "[Content skipped: Detected as likely binary file]"

+        content = None
+        detected_encoding = 'utf-8'
         try:
+            with file_path.open('r', encoding='utf-8') as f:
                 content = f.read()
         except UnicodeDecodeError:
+            logging.warning(f"UTF-8 decoding failed for {file_path}, trying latin-1.")
+            detected_encoding = 'latin-1'
             try:
+                with file_path.open('r', encoding='latin-1') as f:
                     content = f.read()
             except Exception as e_read:
+                logging.error(f"Error reading file {file_path} even with latin-1: {e_read}")
                 return None, None, f"[Content skipped: Error reading file - {e_read}]"
+        except OSError as e_os:
+            logging.error(f"OS Error reading file {file_path}: {e_os}")
+            return None, None, f"[Content skipped: OS Error reading file - {e_os}]"

+        language = ""
         try:
+            lexer = guess_lexer_for_filename(file_path.name, content)
             language = lexer.aliases[0] if lexer.aliases else lexer.name
         except ClassNotFound:
+            language = "" # Plain text
+        except Exception as e_lexer:
+            logging.warning(f"Could not guess lexer for {file_path}: {e_lexer}")
+            language = "" # Fallback

         return content, language, None

+    except OSError as e_os:
+        logging.error(f"OS Error processing file {file_path}: {e_os}")
+        return None, None, f"[Content skipped: Error accessing file properties - {e_os}]"
     except Exception as e:
+        logging.error(f"Unexpected error processing file {file_path}: {e}", exc_info=True)
+        return None, None, f"[Content skipped: Unexpected error processing file - {e}]"

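Typical handling of the (content, language, error) triple returned above, sketched against a local file (the path and size limit are illustrative):

content, language, error_msg = get_file_content(Path("app.py"), max_size_bytes=1024 * 1024)
if error_msg:
    print(error_msg)  # skip reason, e.g. size limit, binary file, or read error
else:
    print(f"language={language!r}, {len(content)} characters read")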
+# --- MODIFIED: Function now uses yield for status updates ---
+def generate_markdown_for_repo(repo_path_str: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool):
+    """
+    Generates Markdown content for the repository structure and optionally files.
+    Yields status updates during processing.
+    """
+    repo_root = Path(repo_path_str).resolve()
+    yield f"Status: Analysing repository at {repo_root}..."
+    logging.info(f"Starting markdown generation for: {repo_root}")

+    md_lines = ["# Repository Analysis\n"]
     structure_lines = []
+    content_lines = []
+    max_size_bytes = max_file_size_kb * 1024
+    files_to_process = []

+    # --- Pre-computation: Collect all files to potentially process ---
+    yield "Status: Scanning file structure..."
+    all_paths = []
+    for root, dirs, files in os.walk(repo_path_str, topdown=True):
+        root_path = Path(root).resolve()

+        # --- Filter ignored directories before adding paths ---
+        # We need to check against the original dirs list before modifying it
+        original_dirs = list(dirs)
+        dirs[:] = [d for d in original_dirs if not should_ignore(root_path / d, ignore_patterns, repo_root)]
+
+        # Add directories that are *not* ignored
+        for d in dirs: # Add the non-ignored directory paths
+            all_paths.append(root_path / d)
+
+        # Add files that are *not* ignored
+        for f in files:
+            file_path = root_path / f
+            if not should_ignore(file_path, ignore_patterns, repo_root):
+                all_paths.append(file_path)
+
+    # --- Pass 1: Build the directory structure visualization ---
+    yield "Status: Generating directory structure..."
+    structure_lines.append("## Directory Structure")
+    structure_lines.append("```")
+    structure_tree = []
+    processed_dirs_for_structure = set()
+
+    def add_to_structure(path_obj: Path, depth: int):
+        indent = "    " * depth # 4-space indent per level
+        prefix = "└── "
+        if path_obj.is_dir():
+            # Add directory only if it hasn't been added via a parent walk already
+            if path_obj not in processed_dirs_for_structure:
+                structure_tree.append(f"{indent}{prefix}{path_obj.name}/")
+                processed_dirs_for_structure.add(path_obj)
+                # Recursively add children
+                try:
+                    for item in sorted(path_obj.iterdir(), key=lambda p: (p.is_file(), p.name.lower())):
+                        if not should_ignore(item, ignore_patterns, repo_root):
+                            add_to_structure(item, depth + 1)
+                except OSError as e:
+                    logging.warning(f"Could not access directory {path_obj}: {e}")
+                    structure_tree.append(f"{indent} └── [Error accessing directory: {e}]")
+
+        elif path_obj.is_file():
+            structure_tree.append(f"{indent}{prefix}{path_obj.name}")
+
+    # Start building the structure from the root
+    structure_tree.append(f"{repo_root.name}/")
+    processed_dirs_for_structure.add(repo_root)
+    try:
+        for item in sorted(repo_root.iterdir(), key=lambda p: (p.is_file(), p.name.lower())):
+            if not should_ignore(item, ignore_patterns, repo_root):
+                add_to_structure(item, 1)
+    except OSError as e:
+        logging.error(f"Could not access repository root {repo_root}: {e}")
+        structure_tree.append(f" └── [Error accessing repository root: {e}]")


+    structure_lines.extend(structure_tree)
+    structure_lines.append("```\n")
+    yield "Status: Directory structure generated."
+    logging.info("Directory structure built.")

     # --- Pass 2: Process file contents (ONLY if requested) ---
+    files_to_render = [p for p in all_paths if p.is_file()]
+    total_files = len(files_to_render)

+    if include_content and total_files > 0:
+        yield f"Status: Processing content of {total_files} file(s)..."
+        content_lines.append("## File Contents\n")
+        start_time = time.time()
+        show_detailed_progress = total_files <= MAX_FILES_FOR_DETAILED_PROGRESS

+        for i, file_path in enumerate(files_to_render):
+            if show_detailed_progress or (i % 50 == 0 and i > 0): # Update every 50 files if many files
+                progress_percent = (i + 1) / total_files
+                yield f"Status: Processing file {i+1}/{total_files}: {file_path.relative_to(repo_root).as_posix()} ({progress_percent:.0%})"

+            try:
+                relative_path_str = file_path.relative_to(repo_root).as_posix()
+                content_lines.append(f"### `{relative_path_str}`\n") # Use POSIX path in Markdown
+                content, language, error_msg = get_file_content(file_path, max_size_bytes)
+
+                if error_msg:
+                    content_lines.append(f"```\n{error_msg}\n```\n")
+                elif content is not None:
+                    lang_hint = language if language else ""
+                    content_lines.append(f"```{lang_hint}\n{content}\n```\n")
+                else:
+                    # Should generally be covered by error_msg cases, but as a fallback
+                    content_lines.append("```\n[Content not available or file is binary/empty]\n```\n")

+            except ValueError:
+                logging.warning(f"Path {file_path} not relative to {repo_root}, skipping content.")
+                continue
+            except Exception as e:
+                logging.error(f"Unexpected error processing content for {file_path}: {e}", exc_info=True)
+                relative_path_str = file_path.name # Fallback name
                 try:
+                    relative_path_str = file_path.relative_to(repo_root).as_posix()
+                except ValueError: pass
+                content_lines.append(f"### `{relative_path_str}`\n")
+                content_lines.append(f"```\n[ERROR processing file content: {e}]\n```\n")
+
+        end_time = time.time()
+        yield f"Status: File content processing complete ({total_files} files in {end_time - start_time:.2f}s)."
+        logging.info(f"File content processing complete. Processed {total_files} files in {end_time - start_time:.2f} seconds.")
+    elif not include_content:
+        yield "Status: Skipping file content inclusion as requested."
+        logging.info("Skipping file content inclusion as requested.")
+    else: # include_content is True but total_files is 0
+        yield "Status: No files found to include content for (after filtering)."
+        logging.info("No files found to include content for (after filtering).")
+
+    # Combine structure and content
+    md_lines.extend(structure_lines)
+    if include_content and content_lines: # Only add content section if requested and content exists
+        md_lines.extend(content_lines)
+
+    yield "Status: Markdown generation complete!"
+    yield "\n".join(md_lines) # Final yield is the complete markdown; join with newlines so headings and fences stay on their own lines
+
+
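Because generate_markdown_for_repo() interleaves "Status:" strings with a single final Markdown payload, it can also be driven from a plain script; a sketch, assuming a local checkout at ./my_repo:

from pathlib import Path

result = None
for update in generate_markdown_for_repo("./my_repo", DEFAULT_IGNORE_PATTERNS, 256, include_content=True):
    if update.startswith("Status:"):
        print(update)        # progress line
    else:
        result = update      # the final assembled Markdown

if result is not None:
    Path("my_repo.md").write_text(result, encoding="utf-8")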
+# --- MODIFIED: Function is now a generator, yielding status updates ---
+def repo_to_md_processor(input_type: str, repo_url: str | None, uploaded_zip: tempfile._TemporaryFileWrapper | None, git_branch: str | None, ignore_patterns_str: str, max_file_size_kb: int, include_content: bool):
+    """
+    Main processing generator function called by the Gradio interface.
+    Yields one (status, markdown, download update) triple per step, matching
+    the three output components wired up in submit_btn.click below.
+    """
+    def _status(msg):
+        # Status-only update: leave the markdown and download components unchanged.
+        return msg, gr.update(), gr.update()
+
     repo_path = None
     output_md = ""
     output_file_path = None
     error_message = None
+    start_time = time.time()

     try:
+        yield _status("Status: Initializing...")
+        # Combine user patterns with defaults
+        user_patterns = {p.strip() for p in ignore_patterns_str.split(',') if p.strip()}
+        default_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        combined_patterns = sorted(list(user_patterns.union(default_patterns)))
+        logging.info(f"Using ignore patterns: {combined_patterns}")
+        logging.info(f"Max file size for content: {max_file_size_kb} KB")
+        logging.info(f"Include file content: {include_content}")
+        if input_type == "URL" and git_branch:
+            logging.info(f"Requested Git branch/tag: {git_branch}")
+
+        with tempfile.TemporaryDirectory(prefix="repo_md_") as temp_dir:
+            logging.info(f"Created temporary directory: {temp_dir}")
+            temp_dir_path = Path(temp_dir)
+
+            if input_type == "URL":
+                if not repo_url or not (repo_url.startswith("http://") or repo_url.startswith("https://") or repo_url.startswith("git@")):
+                    raise ValueError("Invalid Git URL. Must start with http(s):// or git@")
+                yield _status(f"Status: Processing URL: {repo_url}" + (f" (branch/tag: {git_branch})" if git_branch else ""))
+
+                target_clone_path = temp_dir_path / "repo"
+                target_clone_path.mkdir()
+                repo_path_str = str(target_clone_path)
+
+                # --- Git Clone ---
+                branch_args = ["--branch", git_branch] if git_branch and git_branch.strip() else []
+                common_args = ["--depth", "1"] # Always shallow clone

                 try:
+                    # Try sparse checkout first
+                    yield _status("Status: Attempting efficient Git clone (sparse)...")
+                    clone_cmd_sparse = ["git", "clone"] + common_args + ["--filter=blob:none", "--no-checkout"] + branch_args + [repo_url, repo_path_str]
+                    logging.info(f"Running sparse clone command: {' '.join(clone_cmd_sparse)}")
+                    subprocess.run(clone_cmd_sparse, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_SPARSE)
+
+                    checkout_cmd_sparse = ["git", "sparse-checkout", "init", "--cone"]
+                    logging.info(f"Running sparse checkout init: {' '.join(checkout_cmd_sparse)}")
+                    subprocess.run(checkout_cmd_sparse, cwd=repo_path_str, check=True, capture_output=True, text=True)
+
+                    checkout_cmd = ["git", "checkout"]
+                    logging.info(f"Running final checkout: {' '.join(checkout_cmd)}")
+                    subprocess.run(checkout_cmd, cwd=repo_path_str, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_SPARSE)
+                    yield _status("Status: Efficient Git clone successful.")
+                except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
+                    yield _status(f"Status: Efficient clone failed ({type(e_sparse).__name__}), attempting standard clone...")
+                    logging.warning(f"Sparse clone failed: {e_sparse}. Output: {e_sparse.stderr if hasattr(e_sparse, 'stderr') else 'N/A'}")
+                    shutil.rmtree(target_clone_path, ignore_errors=True)
+                    target_clone_path.mkdir()
+
+                    try:
+                        # Fallback to standard shallow clone
+                        clone_cmd_std = ["git", "clone"] + common_args + branch_args + [repo_url, repo_path_str]
+                        logging.info(f"Running standard clone command: {' '.join(clone_cmd_std)}")
+                        subprocess.run(clone_cmd_std, check=True, capture_output=True, text=True, encoding='utf-8', errors='replace', timeout=CLONE_TIMEOUT_STANDARD)
+                        yield _status("Status: Standard shallow clone successful.")
+                    except FileNotFoundError:
+                        logging.error("Git command not found.")
+                        raise RuntimeError("Git command not found. Please install Git and ensure it's in your PATH.")
+                    except subprocess.CalledProcessError as e_std:
+                        error_detail = e_std.stderr or e_std.stdout or "No output captured."
+                        logging.error(f"Standard Git clone failed: {error_detail.strip()}")
+                        raise RuntimeError(f"Git clone failed:\n{error_detail.strip()}")
+                    except subprocess.TimeoutExpired:
+                        logging.error(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD} seconds.")
+                        raise RuntimeError(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD // 60} minutes.")
+
+                repo_path = target_clone_path
+
+            elif input_type == "Upload ZIP":
+                if uploaded_zip is None or not hasattr(uploaded_zip, 'name'):
+                    raise ValueError("No ZIP file uploaded or invalid file object.")
+                yield _status(f"Status: Processing uploaded ZIP: {Path(uploaded_zip.name).name}")
+
+                target_extract_path = temp_dir_path / "extracted"
+                target_extract_path.mkdir()
+
                 try:
+                    with zipfile.ZipFile(uploaded_zip.name, 'r') as zip_ref:
+                        members = zip_ref.namelist()
+                        num_files = len(members)
+                        yield _status(f"Status: Extracting {num_files} entries from ZIP...")
+                        logging.info(f"ZIP contains {num_files} entries.")
+                        if num_files > ZIP_EXTRACT_WARN_THRESHOLD:
+                            logging.warning(f"ZIP contains a large number of files ({num_files}).")
+
+                        # Security checks: ZIP member names use '/' separators, so normalize before testing for traversal
+                        for member in members:
+                            if member.startswith('/') or member.startswith('\\') or '..' in member.replace('\\', '/').split('/'):
+                                raise ValueError(f"ZIP contains potentially unsafe path: '{member}'. Aborting.")
+                            if len(member) > 1024: # Limit path length
+                                raise ValueError(f"ZIP contains excessively long path: '{member[:100]}...'. Aborting.")
+
+                        zip_ref.extractall(target_extract_path)
+                    yield _status("Status: ZIP extraction complete.")
+                    logging.info("ZIP extraction complete.")
+
+                except zipfile.BadZipFile:
+                    logging.error("Invalid or corrupted ZIP file uploaded.")
+                    raise ValueError("Invalid or corrupted ZIP file.")
                 except Exception as e_extract:
+                    logging.error(f"Failed to extract ZIP file: {e_extract}", exc_info=True)
                     raise RuntimeError(f"Failed to extract ZIP file: {e_extract}")

+                # Determine repo root within extracted files
+                extracted_items = list(target_extract_path.iterdir())
+                filtered_items = [item for item in extracted_items if item.name not in (".DS_Store", "__MACOSX")]

+                if len(filtered_items) == 1 and filtered_items[0].is_dir():
+                    repo_path = filtered_items[0]
+                    logging.info(f"Detected single root directory in ZIP: {repo_path.name}")
+                else:
+                    repo_path = target_extract_path
+                    logging.info("Using root of extracted ZIP as repository root.")

             else:
+                raise ValueError("Invalid input type selected.")
+
+            if not repo_path or not repo_path.is_dir():
+                raise RuntimeError("Could not determine valid repository path.")
+
+            yield _status(f"Status: Repository path identified: {repo_path}")
+
+            # --- Generate Markdown ---
+            # The generator yields "Status:" strings and finally the Markdown itself
+            generator = generate_markdown_for_repo(str(repo_path), combined_patterns, max_file_size_kb, include_content)
+            while True:
+                try:
+                    status_or_result = next(generator)
+                    if status_or_result.startswith("Status:"):
+                        yield _status(status_or_result) # Forward status updates
+                    else:
+                        output_md = status_or_result # Final result
+                        break # Exit loop once markdown is generated
+                except StopIteration:
+                    # Should have received the final result before StopIteration
+                    logging.error("Markdown generator finished unexpectedly without yielding final result.")
+                    raise RuntimeError("Markdown generation failed internally.")
+
+            # Save markdown to a temporary file for download
+            with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", encoding='utf-8', prefix="repo_analysis_") as f:
+                f.write(output_md)
+                output_file_path = f.name
+            # Final success triple: status, rendered markdown, visible download link
+            yield (f"Status: Analysis complete. Output saved to {Path(output_file_path).name}",
+                   output_md,
+                   gr.update(value=output_file_path, visible=True))

     except Exception as e:
+        logging.error(f"An error occurred during processing: {e}", exc_info=True)
         error_message = f"An error occurred: {e}"
+        # Final error triple: status, error markdown, hidden download link
+        yield (f"Status: Error - {error_message}",
+               f"### Operation Failed\n\n```\n{error_message}\n```",
+               gr.update(visible=False))
+        output_file_path = None

     finally:
+        # Temp directory is cleaned up automatically by the 'with' statement;
+        # the download value was already delivered by the final yield above.
+        end_time = time.time()
+        logging.info(f"Total processing time: {end_time - start_time:.2f} seconds.")

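One Gradio detail worth calling out here: when a generator drives an event with several outputs, each yield must supply one value per output component, and gr.update() leaves a component unchanged; that is why the function above yields triples. A stripped-down sketch of the pattern (component names are illustrative):

import gradio as gr

def slow_task():
    # outputs=[status_box, result_box]: every yield is a 2-tuple
    yield "working...", gr.update()   # update only the status
    yield "done", "final result"      # final values for both outputs

with gr.Blocks() as sketch:
    status_box = gr.Textbox(label="Status")
    result_box = gr.Textbox(label="Result")
    gr.Button("Run").click(fn=slow_task, inputs=None, outputs=[status_box, result_box])

sketch.queue()  # queueing is required for generator-backed events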
 # --- Gradio Interface ---

 css = """
+body { font-family: sans-serif; }
+#md_output_panel { /* Style the output panel */
+    max-height: 80vh;
+}
 #md_output {
+    max-height: 70vh; /* Adjust max height for content */
+    overflow: auto;
+    border: 1px solid #ccc;
+    border-radius: 5px;
+    padding: 15px;
+    background-color: #f9f9f9;
+}
+#md_output h1 { font-size: 1.6em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 0;}
+#md_output h2 { font-size: 1.3em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 20px; }
+#md_output h3 { font-size: 1.1em; margin-top: 15px; margin-bottom: 5px; color: #333; }
+#md_output code { background-color: #eee; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
+#md_output pre { background-color: #fff; padding: 10px; border-radius: 4px; border: 1px solid #ddd; white-space: pre-wrap; word-wrap: break-word; }
+#md_output pre > code { display: block; padding: 0; background-color: transparent; border: none; font-size: 0.9em;} /* Better code block styling */
+
+#status_box {
+    font-size: 0.9em;
+    color: #555;
+    padding: 8px;
+    border: 1px dashed #ddd;
+    background-color: #fafafa;
+    border-radius: 4px;
+    min-height: 3em; /* Keep it visible even with short messages */
+    margin-top: 10px;
 }
+#copy_button { /* Style the copy button */
+    margin-left: 10px;
+    min-width: 100px; /* Give it a bit more width */
 }
+#download_output { margin-top: 15px; }
+footer { display: none !important; }
+.gradio-container { max-width: 1360px !important; margin: auto !important; }
 """

+# --- Helper function for Copy Button ---
+def copy_to_clipboard(text):
+    if PYPERCLIP_AVAILABLE and text:
+        try:
+            pyperclip.copy(text)
+            logging.info("Copied output to clipboard.")
+            return gr.update(value="Copied!", variant="secondary") # Temporary feedback
+        except Exception as e:
+            logging.error(f"Failed to copy to clipboard: {e}")
+            return gr.update(value="Copy Failed", variant="stop")
+    elif not PYPERCLIP_AVAILABLE:
+        logging.warning("Copy attempt failed: pyperclip not installed.")
+        return gr.update(value="Install Pyperclip", variant="stop")
+    else: # No text to copy
+        return gr.update(value="Nothing to Copy", variant="secondary")
+
+def reset_copy_button():
+    # Short delay before resetting button appearance
+    time.sleep(1.5)
+    return gr.update(value="Copy Markdown", variant="secondary")
+
+
+with gr.Blocks(css=css, title="Repo Analyzer", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# Repository Analyzer")
     gr.Markdown(
         "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
+        "to generate a single Markdown file containing its structure and optionally file contents. "
+        "Provides real-time status updates."
     )

     with gr.Row():
+        # --- Input Column ---
+        with gr.Column(scale=1):
+            gr.Markdown("### Input Source & Options")
             input_type = gr.Radio(
                 ["URL", "Upload ZIP"], label="Input Source", value="URL"
             )

+            # URL-specific inputs (conditionally visible)
             url_input = gr.Textbox(
                 label="Git Repository URL",
+                placeholder="e.g., https://github.com/gradio-app/gradio.git or git@github.com:user/repo.git",
+                visible=True, interactive=True, elem_id="url-input"
+            )
+            git_branch_input = gr.Textbox(
+                label="Branch / Tag (Optional)",
+                placeholder="e.g., main, develop, v1.2.3 (leave empty for default)",
+                visible=True, interactive=True, elem_id="git-branch-input"
             )

+            # ZIP-specific inputs (conditionally visible)
             zip_input = gr.File(
                 label="Upload Local Folder (as .zip)",
                 file_types=[".zip"],
+                visible=False, interactive=True, elem_id="zip-input"
             )

+            # --- Common Options in Accordion ---
+            with gr.Accordion("Configuration Options", open=False):
+                include_content_checkbox = gr.Checkbox(
+                    label="Include File Content in Output",
+                    value=True,
+                    info="Generate structure only if unchecked."
+                )
+                max_size_input = gr.Number(
+                    label="Max File Size for Content (KB)",
+                    value=DEFAULT_MAX_FILE_SIZE_KB, minimum=0, step=64, precision=0,
+                    info="Files larger than this won't have content included (if enabled). 0 disables content.",
+                )
+                ignore_input = gr.Textbox(
+                    label="Ignore Patterns (comma-separated, gitignore style)",
+                    value=", ".join(DEFAULT_IGNORE_PATTERNS),
+                    placeholder="e.g., .git/, *.log, node_modules/",
+                    info="Uses gitignore syntax. Add `/` for directories. Defaults provided.",
+                    lines=5, max_lines=15
+                )
+
+            submit_btn = gr.Button("Analyze Repository", variant="primary")
+
+            gr.Markdown("### Status Updates")
+            status_output = gr.Textbox(label="Current Status", value="Idle.", interactive=False, lines=3, elem_id="status_box")


+        # --- Output Column ---
+        with gr.Column(scale=2):
+            gr.Markdown("### Generated Output")
+            with gr.Row(elem_id="output_header_row"):
+                copy_button = gr.Button("Copy Markdown", variant="secondary", elem_id="copy_button", visible=PYPERCLIP_AVAILABLE) # Hide if pyperclip missing
+                download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output", scale=1) # Take less space initially
+
+            md_output = gr.Markdown(value="*Awaiting analysis results...*", elem_id="md_output", visible=True)


+    # --- Event Handlers ---

+    # Update visibility based on input type choice
     def update_input_visibility(choice):
+        is_url = choice == "URL"
+        return {
+            url_input: gr.update(visible=is_url),
+            git_branch_input: gr.update(visible=is_url),
+            zip_input: gr.update(visible=not is_url)
+        }

     input_type.change(
         fn=update_input_visibility,
         inputs=input_type,
+        outputs=[url_input, git_branch_input, zip_input],
+        queue=False # UI-only change
     )

+    # Main processing logic on submit
     submit_btn.click(
+        fn=repo_to_md_processor, # The generator function
         inputs=[
+            input_type, url_input, zip_input, git_branch_input,
+            ignore_input, max_size_input, include_content_checkbox,
         ],
+        # Each yield supplies one value per output: (status, markdown, download file)
+        outputs=[status_output, md_output, download_output],
+        api_name="repo_to_md"
     )

+    # Copy button functionality
+    if PYPERCLIP_AVAILABLE:
+        copy_button.click(
+            fn=copy_to_clipboard,
+            inputs=[md_output], # Takes the current markdown content
+            outputs=[copy_button], # Updates its own text/appearance
+            queue=False
+        ).then(
+            fn=reset_copy_button, # Function to reset button after a delay
+            inputs=None,
+            outputs=[copy_button],
+            queue=False # Don't queue the reset visual change
+        )
+
 # Launch the interface
 if __name__ == "__main__":
+    demo.queue().launch(server_name="0.0.0.0", show_error=True, debug=True) # Enable queue & debug for better testing
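With api_name="repo_to_md" exposed, the app can also be called programmatically; a sketch using the gradio_client package (the URL and argument values are assumptions, and the positional arguments follow the inputs list above):

from gradio_client import Client

client = Client("http://127.0.0.1:7860")
job = client.submit(
    "URL",                                       # input_type
    "https://github.com/gradio-app/gradio.git",  # repo_url
    None,                                        # uploaded_zip
    "main",                                      # git_branch
    "",                                          # extra ignore patterns
    256,                                         # max file size in KB
    True,                                        # include file content
    api_name="/repo_to_md",
)
status, markdown, download_path = job.result()   # final (status, markdown, file) triple
print(status)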