VOIDER committed on
Commit
24226a4
·
verified ·
1 Parent(s): 5c9292e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +537 -329
app.py CHANGED
@@ -8,452 +8,660 @@ import fnmatch
8
  from pathlib import Path
9
  from pygments.lexers import guess_lexer_for_filename
10
  from pygments.util import ClassNotFound
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # --- Configuration ---
 
 
13
  DEFAULT_IGNORE_PATTERNS = [
14
- ".git/", "*.pyc", "*__pycache__*", "node_modules/", ".DS_Store",
15
- "*.log", "*.tmp", "*.swp", ".env", ".venv/", "venv/", ".idea/", ".vscode/",
16
- "build/", "dist/", "*.egg-info/"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  ]
18
  DEFAULT_MAX_FILE_SIZE_KB = 1024 # 1 MB limit for file content inclusion
 
 
 
 
19
 
20
  # --- Core Logic ---
21
 
22
- def should_ignore(path_str: str, ignore_patterns: list[str], repo_root: Path) -> bool:
23
- """Checks if a path matches any of the ignore patterns."""
24
  try:
25
- relative_path = Path(path_str).relative_to(repo_root)
26
- except ValueError: # If path_str is not relative to repo_root (shouldn't happen with os.walk)
27
- return True # Treat as ignore if path resolution fails unexpectedly
28
-
29
- # Check against directory patterns (ending with /)
 
 
 
 
 
 
30
  for pattern in ignore_patterns:
31
- pattern_clean = pattern.rstrip('/')
32
- # Use Path.match for simpler pattern matching against relative paths
33
- # Convert pattern to POSIX-like format for consistency if needed, though fnmatch handles OS specifics
34
- try:
35
- # Match against the full relative path
36
- if fnmatch.fnmatch(str(relative_path), pattern):
37
- #print(f"Ignoring '{relative_path}' due to pattern '{pattern}'")
38
- return True
39
- # Match against just the name (for patterns like *.log)
40
- if fnmatch.fnmatch(relative_path.name, pattern):
41
- #print(f"Ignoring '{relative_path}' due to pattern '{pattern}'")
 
 
 
 
 
 
42
  return True
43
- # Special handling for directory patterns (e.g., "node_modules/")
44
- if pattern.endswith('/') and os.path.isdir(path_str):
45
- # Check if the directory name matches the pattern
46
- if relative_path.name == pattern_clean:
47
- #print(f"Ignoring directory '{relative_path}' due to pattern '{pattern}'")
48
- return True
49
- # Check if the path starts with the directory pattern
50
- if str(relative_path).startswith(pattern_clean + os.sep):
51
- #print(f"Ignoring path '{relative_path}' within ignored dir '{pattern}'")
52
- return True
53
- # Handle cases where pattern might match a parent directory implicitly
54
- if pattern.endswith('/'):
55
- # Check if any parent directory name matches the pattern
56
- for parent in relative_path.parents:
57
- if parent.name == pattern_clean:
58
- #print(f"Ignoring '{relative_path}' due to parent match on '{pattern}'")
59
- return True
60
- # This logic might be complex, reconsider if fnmatch covers enough
61
- # Simplified: Check if the path starts with the pattern directory
62
- if str(relative_path).startswith(pattern_clean + os.sep):
63
- #print(f"Ignoring '{relative_path}' due to prefix match on '{pattern}'")
64
- return True
65
 
66
- except Exception as e:
67
- print(f"Warning: Error during ignore pattern matching for '{relative_path}' with pattern '{pattern}': {e}")
68
 
69
- return False
 
 
 
 
 
 
 
70
 
71
 
72
- def is_likely_binary(file_path: str, chunk_size=1024) -> bool:
 
 
73
  """Checks if a file is likely binary by reading a chunk."""
74
  try:
75
- with open(file_path, 'rb') as f:
76
  chunk = f.read(chunk_size)
77
- return b'\0' in chunk # Null byte is a strong indicator of binary data
78
- except Exception:
79
- return True # Assume binary if reading fails
 
80
 
81
- def get_file_content(file_path: str, max_size_bytes: int) -> tuple[str | None, str | None, str | None]:
82
- """
83
- Reads file content, detects language, handles size limits and encodings.
84
- Returns (content, language, error_message)
85
- """
86
  try:
87
- file_size = os.path.getsize(file_path)
88
  if file_size > max_size_bytes:
89
- return None, None, f"[Content skipped: File size ({file_size / 1024:.1f} KB) exceeds limit ({max_size_bytes / 1024:.1f} KB)]"
 
 
 
 
 
90
 
91
  if is_likely_binary(file_path):
92
- return None, None, "[Content skipped: Detected as binary file]"
93
 
94
- # Try reading with UTF-8, fallback to latin-1 for robustness
 
95
  try:
96
- with open(file_path, 'r', encoding='utf-8') as f:
97
  content = f.read()
98
  except UnicodeDecodeError:
 
 
99
  try:
100
- with open(file_path, 'r', encoding='latin-1') as f:
101
  content = f.read()
102
  except Exception as e_read:
 
103
  return None, None, f"[Content skipped: Error reading file - {e_read}]"
 
 
 
104
 
105
- # Guess language for syntax highlighting
106
  try:
107
- lexer = guess_lexer_for_filename(file_path, content)
108
  language = lexer.aliases[0] if lexer.aliases else lexer.name
109
  except ClassNotFound:
110
- language = "" # No language detected
 
 
 
111
 
112
  return content, language, None
113
 
 
 
 
114
  except Exception as e:
115
- return None, None, f"[Content skipped: Error processing file - {e}]"
116
-
117
- # MODIFIED: Added include_content parameter
118
- def generate_markdown_for_repo(repo_path: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool) -> str:
119
- """Generates Markdown content for the repository structure and optionally files."""
120
- repo_root = Path(repo_path).resolve()
121
- md_content = ["# Repository Structure and Content\n\n"]
122
- file_contents_md = []
123
- max_size_bytes = max_file_size_kb * 1024
124
 
125
- processed_paths = set() # To avoid duplicate processing if walk yields dirs multiple times
 
 
 
 
 
 
 
 
126
 
127
- # --- Pass 1: Build the directory structure ---
128
- md_content.append("## Directory Structure\n\n```\n")
129
  structure_lines = []
130
- for root, dirs, files in os.walk(repo_path, topdown=True):
131
- root_path = Path(root).resolve()
132
-
133
- # --- Ignore directories based on patterns ---
134
- original_dirs = list(dirs) # Copy because we modify dirs list
135
- dirs[:] = [d for d in original_dirs if not should_ignore(str(root_path / d), ignore_patterns, repo_root)]
136
 
137
- # Check if the current root itself should be ignored
138
- if root_path != repo_root and should_ignore(str(root_path), ignore_patterns, repo_root):
139
- continue # Skip processing this directory and its contents further
 
 
140
 
141
- if root_path not in processed_paths:
142
- try:
143
- relative_root = root_path.relative_to(repo_root)
144
- depth = len(relative_root.parts)
145
- indent = " " * depth
146
- # Use '.' for the root directory itself if it's the starting point
147
- dir_name = relative_root.name if relative_root.parts else "."
148
- structure_lines.append(f"{indent}{dir_name}/")
149
- processed_paths.add(root_path)
150
-
151
- # Add files in this directory to the structure
152
- files.sort() # Sort files for consistent output
153
- for file in files:
154
- file_path = root_path / file
155
- if not should_ignore(str(file_path), ignore_patterns, repo_root):
156
- structure_lines.append(f"{indent} {file}")
157
- except ValueError:
158
- # Handle cases where root_path might somehow not be under repo_root
159
- print(f"Warning: Path {root_path} not relative to {repo_root}, skipping in structure.")
160
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
 
163
- md_content.append("\n".join(structure_lines))
164
- md_content.append("```\n\n")
 
 
165
 
166
  # --- Pass 2: Process file contents (ONLY if requested) ---
167
- if include_content:
168
- md_content.append("## File Contents\n") # Add header only if including content
169
-
170
- for root, dirs, files in os.walk(repo_path, topdown=True):
171
- root_path = Path(root).resolve()
172
 
173
- # --- Apply Ignore Patterns (again, for consistency and pruning) ---
174
- dirs[:] = [d for d in dirs if not should_ignore(str(root_path / d), ignore_patterns, repo_root)]
175
- if root_path != repo_root and should_ignore(str(root_path), ignore_patterns, repo_root):
176
- continue
 
177
 
178
- # --- Process Files ---
179
- files.sort() # Ensure consistent order
180
- for file in files:
181
- file_path = root_path / file
182
 
183
- if should_ignore(str(file_path), ignore_patterns, repo_root):
184
- continue
 
 
 
 
 
 
 
 
 
 
 
185
 
 
 
 
 
 
 
186
  try:
187
- relative_path = file_path.relative_to(repo_root)
188
- file_contents_md.append(f"\n### `{relative_path}`\n")
189
- content, language, error_msg = get_file_content(str(file_path), max_size_bytes)
190
-
191
- if error_msg:
192
- file_contents_md.append(f"```\n{error_msg}\n```\n")
193
- elif content is not None:
194
- file_contents_md.append(f"```{language}\n{content}\n```\n")
195
- else: # Should not happen if error_msg logic is correct, but as fallback
196
- file_contents_md.append("```\n[Content could not be retrieved]\n```\n")
197
- except ValueError:
198
- print(f"Warning: Path {file_path} not relative to {repo_root}, skipping content.")
199
- continue
200
-
201
-
202
- md_content.extend(file_contents_md) # Append collected file contents
203
-
204
- # If not include_content, the "## File Contents" header and the second loop are skipped.
205
-
206
- return "".join(md_content)
207
-
208
-
209
- # MODIFIED: Added include_content parameter
210
- def repo_to_md(input_type: str, repo_url: str | None, uploaded_zip: tempfile._TemporaryFileWrapper | None, ignore_patterns_str: str, max_file_size_kb: int, include_content: bool):
211
- """Main function called by Gradio interface."""
212
- temp_dir = None
 
 
 
 
 
213
  repo_path = None
214
  output_md = ""
215
  output_file_path = None
216
  error_message = None
217
-
218
- ignore_patterns = [p.strip() for p in ignore_patterns_str.split(',') if p.strip()]
219
- # Ensure default patterns are added only if they aren't already covered by user patterns
220
- # A simple way is just to combine and remove duplicates
221
- combined_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS
222
- ignore_patterns = sorted(list(set(combined_patterns))) # Keep unique and sort for consistency if needed
223
 
224
  try:
225
- temp_dir = tempfile.mkdtemp()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- if input_type == "URL":
228
- if not repo_url or not (repo_url.startswith("http://") or repo_url.startswith("https://")):
229
- raise ValueError("Invalid Git URL provided. Must start with http:// or https://")
230
- print(f"Cloning repository: {repo_url}")
231
- try:
232
- # Attempt sparse checkout if available (modern Git)
233
- # This might fail on older Git versions, hence the fallback
234
- subprocess.run(
235
- ["git", "clone", "--depth", "1", "--filter=blob:none", "--no-checkout", repo_url, temp_dir],
236
- check=True, capture_output=True, text=True, timeout=60
237
- )
238
- subprocess.run(["git", "sparse-checkout", "init", "--cone"], cwd=temp_dir, check=True, capture_output=True, text=True)
239
- subprocess.run(["git", "checkout"], cwd=temp_dir, check=True, capture_output=True, text=True, timeout=120)
240
- print("Cloning successful (sparse/filtered).")
241
- except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
242
- print(f"Sparse clone failed ({e_sparse}), attempting standard shallow clone...")
243
- # Fallback to standard shallow clone
244
- shutil.rmtree(temp_dir) # Clean up failed attempt
245
- temp_dir = tempfile.mkdtemp() # Recreate temp dir
246
  try:
247
- subprocess.run(
248
- ["git", "clone", "--depth", "1", repo_url, temp_dir],
249
- check=True,
250
- capture_output=True,
251
- text=True,
252
- timeout=180 # 3 min timeout for standard clone
253
- )
254
- print("Cloning successful (standard shallow).")
255
- except FileNotFoundError:
256
- raise RuntimeError("Git command not found. Ensure git is installed and in your PATH.")
257
- except subprocess.CalledProcessError as e_std:
258
- # Capture more detailed error from stderr
259
- error_detail = e_std.stderr or e_std.stdout or "No output captured."
260
- raise RuntimeError(f"Git clone failed:\n{error_detail.strip()}")
261
- except subprocess.TimeoutExpired:
262
- raise RuntimeError("Git clone timed out after 3 minutes.")
263
- repo_path = temp_dir
264
-
265
- elif input_type == "Upload ZIP":
266
- if uploaded_zip is None:
267
- raise ValueError("No ZIP file uploaded.")
268
- print(f"Extracting uploaded ZIP: {uploaded_zip.name}")
269
- with zipfile.ZipFile(uploaded_zip.name, 'r') as zip_ref:
270
- # Check for potentially malicious paths (e.g., ../..)
271
- for member in zip_ref.namelist():
272
- # Basic check for absolute paths or paths trying to go up the directory tree
273
- if member.startswith('/') or member.startswith('\\') or '..' in member.split(os.path.sep):
274
- raise ValueError(f"ZIP contains potentially unsafe path: {member}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  try:
276
- zip_ref.extractall(temp_dir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  except Exception as e_extract:
 
278
  raise RuntimeError(f"Failed to extract ZIP file: {e_extract}")
279
 
280
- # --- Find the actual repo root within the extracted files ---
281
- extracted_items = os.listdir(temp_dir)
282
- # Remove macOS specific hidden file if present
283
- if ".DS_Store" in extracted_items:
284
- extracted_items.remove(".DS_Store")
285
- # Check for __MACOSX folder often created by macOS archiver
286
- if "__MACOSX" in extracted_items and os.path.isdir(os.path.join(temp_dir, "__MACOSX")):
287
- extracted_items.remove("__MACOSX")
288
 
 
 
 
 
 
 
289
 
290
- if len(extracted_items) == 1 and os.path.isdir(os.path.join(temp_dir, extracted_items[0])):
291
- repo_path = os.path.join(temp_dir, extracted_items[0])
292
- print(f"Detected repo root inside ZIP: {extracted_items[0]}")
293
  else:
294
- # Assume the root of the zip is the repo root
295
- repo_path = temp_dir
296
- print("Using root of the extracted ZIP as repo root.")
297
-
298
- else:
299
- raise ValueError("Invalid input type selected.")
300
-
301
- if not repo_path or not os.path.isdir(repo_path):
302
- raise RuntimeError(f"Could not determine repository path after processing input.")
303
-
304
- print(f"Generating Markdown for path: {repo_path}")
305
- # MODIFIED: Pass include_content flag
306
- output_md = generate_markdown_for_repo(repo_path, ignore_patterns, max_file_size_kb, include_content)
307
- print("Markdown generation complete.")
308
-
309
- # Save markdown to a temporary file for download
310
- # Ensure temp file has '.md' extension for Gradio File component to handle it nicely
311
- with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", encoding='utf-8', prefix="repo_") as f:
312
- f.write(output_md)
313
- output_file_path = f.name
314
- print(f"Markdown saved to temporary file: {output_file_path}")
315
-
 
 
 
 
 
 
316
 
317
  except Exception as e:
 
318
  error_message = f"An error occurred: {e}"
319
- print(f"Error: {e}") # Log error to console for debugging
320
- # Ensure output is reset on error
321
- output_md = f"```\n{error_message}\n```" # Display error in the markdown area
322
- output_file_path = None # No file to download on error
323
 
324
  finally:
325
- # Clean up the temporary directory used for cloning/extraction
326
- if temp_dir and os.path.exists(temp_dir):
327
- try:
328
- shutil.rmtree(temp_dir, ignore_errors=True) # Be more robust on cleanup
329
- print(f"Attempted cleanup of temporary directory: {temp_dir}")
330
- except Exception as e_clean:
331
- print(f"Warning: Failed to cleanup temporary directory {temp_dir}: {e_clean}")
332
- # Gradio's gr.File component should manage its temp files when value is updated.
333
- # If we created the temp file (output_file_path) AND there was NO error,
334
- # we might need manual cleanup later if Gradio doesn't handle it, but usually it does.
335
- # Let's assume Gradio handles the download file cleanup for now.
336
-
337
- # Return values for Gradio outputs
338
- # Always return a string for md_output (either the result or the error message)
339
- # Return the file path for download only on success, otherwise None (or an invisible File update)
340
- if output_file_path:
341
- return output_md, gr.File(value=output_file_path, visible=True)
342
- else:
343
- # If there was an error, output_md contains the error message
344
- # And we hide the download button
345
- return output_md, gr.File(visible=False)
346
 
347
 
348
  # --- Gradio Interface ---
349
 
350
  css = """
 
 
 
 
351
  #md_output {
352
- max-height: 70vh; /* Adjust max height as needed */
353
- overflow-y: auto; /* Add scrollbar if content exceeds max height */
354
- border: 1px solid #e0e0e0; /* Optional: add a border */
355
- padding: 10px; /* Optional: add some padding */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  }
357
- #download_output { /* Style the download component if needed */
358
- margin-top: 10px; /* Add space above download button */
 
359
  }
360
- footer { display: none !important; } /* Hide Gradio footer */
 
 
361
  """
362
 
363
- with gr.Blocks(css=css, title="Repo to Markdown Converter") as demo:
364
- gr.Markdown("# Repository to Markdown Converter")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  gr.Markdown(
366
  "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
367
- "to generate a single Markdown file containing its structure and optionally file contents."
 
368
  )
369
 
370
  with gr.Row():
371
- with gr.Column(scale=2):
 
 
372
  input_type = gr.Radio(
373
  ["URL", "Upload ZIP"], label="Input Source", value="URL"
374
  )
375
 
376
- # --- URL Input ---
377
  url_input = gr.Textbox(
378
  label="Git Repository URL",
379
- placeholder="e.g., https://github.com/gradio-app/gradio.git",
380
- visible=True, # Initially visible
381
- interactive=True
 
 
 
 
382
  )
383
 
384
- # --- Upload Input ---
385
  zip_input = gr.File(
386
  label="Upload Local Folder (as .zip)",
387
  file_types=[".zip"],
388
- visible=False, # Initially hidden
389
- interactive=True,
390
- # Use file_count='single' explicitly if needed, though default
391
  )
392
 
393
- # --- Shared Options ---
394
- ignore_input = gr.Textbox(
395
- label="Ignore Patterns (comma-separated)",
396
- value=", ".join(DEFAULT_IGNORE_PATTERNS),
397
- placeholder="e.g., .git/, *.log, node_modules/",
398
- info="Uses standard gitignore patterns (fnmatch). Add `/` for directories. Defaults are included."
399
- )
400
- max_size_input = gr.Number(
401
- label="Max File Size to Include Content (KB)",
402
- value=DEFAULT_MAX_FILE_SIZE_KB,
403
- minimum=0,
404
- step=64,
405
- info="Files larger than this will have their content skipped (if content inclusion is enabled)."
406
- )
 
 
 
 
 
 
 
 
 
 
407
 
408
- # --- ADDED: Checkbox for content inclusion ---
409
- include_content_checkbox = gr.Checkbox(
410
- label="Include File Content",
411
- value=True, # Default to including content
412
- info="Uncheck to generate only the directory structure."
413
- )
414
- # --- End Added Checkbox ---
415
 
416
- submit_btn = gr.Button("Generate Markdown", variant="primary")
 
 
 
 
 
 
 
417
 
418
- with gr.Column(scale=3):
419
- gr.Markdown("## Generated Output")
420
- md_output = gr.Markdown(elem_id="md_output", value="*Markdown output will appear here...*")
421
- download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output")
422
 
 
423
 
424
- # --- Input Type Change Logic ---
425
  def update_input_visibility(choice):
426
- if choice == "URL":
427
- return {url_input: gr.update(visible=True), zip_input: gr.update(visible=False)}
428
- else: # Upload ZIP
429
- return {url_input: gr.update(visible=False), zip_input: gr.update(visible=True)}
 
 
430
 
431
  input_type.change(
432
  fn=update_input_visibility,
433
  inputs=input_type,
434
- outputs=[url_input, zip_input]
 
435
  )
436
 
437
- # --- Form Submission ---
438
- # MODIFIED: Added include_content_checkbox to inputs
439
  submit_btn.click(
440
- fn=repo_to_md,
441
  inputs=[
442
- input_type,
443
- url_input,
444
- zip_input,
445
- ignore_input,
446
- max_size_input,
447
- include_content_checkbox, # Pass the checkbox state
448
  ],
449
- outputs=[
450
- md_output,
451
- download_output,
452
- ],
453
- api_name="repo_to_md" # For API access if needed
454
  )
455
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
  # Launch the interface
457
  if __name__ == "__main__":
458
- # Consider adding share=True if you want to share it publicly via Gradio's service
459
- demo.launch()
 
8
  from pathlib import Path
9
  from pygments.lexers import guess_lexer_for_filename
10
  from pygments.util import ClassNotFound
11
+ import logging
12
+ import time
13
+ import math
14
+
15
# --- Configuration ---
# Logging must be configured before the first logging call. Calling
# logging.warning() on an unconfigured root logger installs a default handler,
# after which logging.basicConfig() is a no-op and INFO messages are dropped.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# pyperclip is an optional dependency; record whether it could be imported so
# the rest of the module can check PYPERCLIP_AVAILABLE before using it.
try:
    import pyperclip
    PYPERCLIP_AVAILABLE = True
except ImportError:
    PYPERCLIP_AVAILABLE = False
    logging.warning("pyperclip library not found. 'Copy to Clipboard' functionality will be disabled. Install with: pip install pyperclip")

# Patterns excluded by default. A trailing '/' marks a directory pattern;
# everything else is matched with fnmatch against the relative path and name.
DEFAULT_IGNORE_PATTERNS = [
    # Version Control
    ".git/", ".gitignore", ".gitattributes", ".gitmodules", ".svn/", ".hg/",
    # IDE/Editor Files
    ".idea/", ".vscode/", "*.sublime-project", "*.sublime-workspace", ".project", ".classpath", "*.tmproj", ".DS_Store", "Thumbs.db",
    # Build Outputs & Dependencies
    "build/", "dist/", "bin/", "obj/", "out/", "target/", "*.o", "*.so", "*.dll", "*.exe", "*.class", "*.jar", "*.war", "*.ear",
    "node_modules/", "bower_components/", "vendor/", "*.egg-info/", "wheels/", "**/__pycache__/", "*.pyc",
    # Python Virtual Environments
    ".venv/", "venv/", "env/", ".env", "pip-cache/",
    # Logs & Temporary Files
    "*.log", "*.tmp", "*.temp", "*.swp", "*.swo", "*.bak",
    # OS Generated Files
    "._*",
    # Secrets (important!)
    "*.pem", "*.key", ".env*", "secrets.*", "credentials.*",
    # Common Framework/Tool cache/temp files
    ".pytest_cache/", ".tox/", ".mypy_cache/", ".ruff_cache/", "*.ipynb_checkpoints",
    # MACOS specific zip artifact
    "__MACOSX/",
]
DEFAULT_MAX_FILE_SIZE_KB = 1024  # 1 MB limit for file content inclusion
CLONE_TIMEOUT_SPARSE = 120  # seconds
CLONE_TIMEOUT_STANDARD = 300  # seconds
ZIP_EXTRACT_WARN_THRESHOLD = 1000  # Warn if ZIP contains more than this many files
MAX_FILES_FOR_DETAILED_PROGRESS = 500  # Only show per-file progress if fewer than this many files
53
 
54
  # --- Core Logic ---
55
 
56
def should_ignore(path_obj: Path, ignore_patterns: list[str], repo_root: Path) -> bool:
    """Return True if *path_obj* should be excluded, using gitignore-style patterns.

    Args:
        path_obj: File or directory path to test; expected to live under
            *repo_root*.
        ignore_patterns: Patterns to test against. Blank entries and entries
            starting with '#' are skipped. A pattern ending in '/' is a
            directory pattern; any other pattern is matched with fnmatch
            against both the POSIX-style relative path and the bare name.
        repo_root: Root that patterns are evaluated relative to.

    Returns:
        True when the path matches a pattern, or when it is not located under
        *repo_root* (such paths are logged and treated as ignored).
    """
    try:
        relative_path = path_obj.relative_to(repo_root)
    except ValueError:
        logging.warning(f"Path {path_obj} not relative to root {repo_root}, ignoring.")
        return True

    # POSIX separators keep pattern matching consistent across operating systems.
    relative_path_str = relative_path.as_posix()
    name = path_obj.name
    # Names of every directory above the final path component.
    parent_names = relative_path.parts[:-1]

    # Fast path: the bare name is listed verbatim (e.g. '.DS_Store').
    if name in ignore_patterns:
        return True

    for raw_pattern in ignore_patterns:
        pattern = raw_pattern.strip()
        if not pattern or pattern.startswith('#'):
            continue

        pattern_posix = pattern.replace(os.sep, '/')

        # Case 1: directory pattern (ends with '/').
        if pattern_posix.endswith('/'):
            dir_pattern = pattern_posix.rstrip('/')

            # Anchored match: the path is this directory or lives beneath it.
            if relative_path_str == dir_pattern or relative_path_str.startswith(dir_pattern + '/'):
                return True

            # Unanchored match: a single-component pattern (optionally written
            # with a leading '**/') matches that directory name at any depth.
            # fnmatch is used instead of '==' so that wildcard directory
            # patterns such as '*.egg-info/' and '**/__pycache__/' take effect.
            name_pattern = dir_pattern[3:] if dir_pattern.startswith('**/') else dir_pattern
            if '/' not in name_pattern:
                if any(fnmatch.fnmatch(part, name_pattern) for part in parent_names):
                    return True
                # The final component only counts when it really is a directory.
                if fnmatch.fnmatch(name, name_pattern) and path_obj.is_dir():
                    return True

        # Case 2: general pattern, tested against the full relative path and
        # against the bare name (for patterns like '*.log').
        if fnmatch.fnmatch(relative_path_str, pattern_posix):
            return True
        if fnmatch.fnmatch(name, pattern_posix):
            return True

    return False
106
+
107
+ def is_likely_binary(file_path: Path, chunk_size=1024) -> bool:
108
  """Checks if a file is likely binary by reading a chunk."""
109
  try:
110
+ with file_path.open('rb') as f:
111
  chunk = f.read(chunk_size)
112
+ return b'\0' in chunk
113
+ except OSError as e:
114
+ logging.warning(f"Could not read file chunk for binary check {file_path}: {e}")
115
+ return True
116
 
117
+ def get_file_content(file_path: Path, max_size_bytes: int) -> tuple[str | None, str | None, str | None]:
118
+ """Reads file content, detects language, handles size limits and encodings."""
 
 
 
119
  try:
120
+ file_size = file_path.stat().st_size
121
  if file_size > max_size_bytes:
122
+ kb_limit = max_size_bytes / 1024
123
+ kb_actual = file_size / 1024
124
+ return None, None, f"[Content skipped: File size ({kb_actual:.1f} KB) exceeds limit ({kb_limit:.1f} KB)]"
125
+
126
+ if file_size == 0:
127
+ return "", "", None # Empty file
128
 
129
  if is_likely_binary(file_path):
130
+ return None, None, "[Content skipped: Detected as likely binary file]"
131
 
132
+ content = None
133
+ detected_encoding = 'utf-8'
134
  try:
135
+ with file_path.open('r', encoding='utf-8') as f:
136
  content = f.read()
137
  except UnicodeDecodeError:
138
+ logging.warning(f"UTF-8 decoding failed for {file_path}, trying latin-1.")
139
+ detected_encoding = 'latin-1'
140
  try:
141
+ with file_path.open('r', encoding='latin-1') as f:
142
  content = f.read()
143
  except Exception as e_read:
144
+ logging.error(f"Error reading file {file_path} even with latin-1: {e_read}")
145
  return None, None, f"[Content skipped: Error reading file - {e_read}]"
146
+ except OSError as e_os:
147
+ logging.error(f"OS Error reading file {file_path}: {e_os}")
148
+ return None, None, f"[Content skipped: OS Error reading file - {e_os}]"
149
 
150
+ language = ""
151
  try:
152
+ lexer = guess_lexer_for_filename(file_path.name, content)
153
  language = lexer.aliases[0] if lexer.aliases else lexer.name
154
  except ClassNotFound:
155
+ language = "" # Plain text
156
+ except Exception as e_lexer:
157
+ logging.warning(f"Could not guess lexer for {file_path}: {e_lexer}")
158
+ language = "" # Fallback
159
 
160
  return content, language, None
161
 
162
+ except OSError as e_os:
163
+ logging.error(f"OS Error processing file {file_path}: {e_os}")
164
+ return None, None, f"[Content skipped: Error accessing file properties - {e_os}]"
165
  except Exception as e:
166
+ logging.error(f"Unexpected error processing file {file_path}: {e}", exc_info=True)
167
+ return None, None, f"[Content skipped: Unexpected error processing file - {e}]"
 
 
 
 
 
 
 
168
 
169
+ # --- MODIFIED: Function now uses yield for status updates ---
170
def generate_markdown_for_repo(repo_path_str: str, ignore_patterns: list[str], max_file_size_kb: int, include_content: bool):
    """Generate a Markdown report of a repository, yielding progress as it goes.

    This is a generator. Every yielded value except the last is a
    human-readable ``"Status: ..."`` string; the final yielded value is the
    complete Markdown document.

    Args:
        repo_path_str: Path of the repository root on disk.
        ignore_patterns: Patterns passed to ``should_ignore`` to filter paths.
        max_file_size_kb: Per-file size limit (in KB) for content inclusion.
        include_content: When False, only the directory tree is produced.

    Yields:
        Status strings, followed by the final Markdown text.
    """
    repo_root = Path(repo_path_str).resolve()
    yield f"Status: Analysing repository at {repo_root}..."
    logging.info(f"Starting markdown generation for: {repo_root}")

    max_size_bytes = max_file_size_kb * 1024

    # --- Scan: collect every non-ignored regular file ---
    yield "Status: Scanning file structure..."
    files_to_render = []
    for root, dirs, files in os.walk(repo_path_str, topdown=True):
        root_path = Path(root).resolve()
        # Prune ignored directories in place so os.walk never descends into
        # them; sorting makes the traversal (and the output) deterministic.
        dirs[:] = sorted(
            d for d in dirs
            if not should_ignore(root_path / d, ignore_patterns, repo_root)
        )
        for file_name in sorted(files):
            file_path = root_path / file_name
            if file_path.is_file() and not should_ignore(file_path, ignore_patterns, repo_root):
                files_to_render.append(file_path)

    # --- Pass 1: Build the directory structure visualization ---
    yield "Status: Generating directory structure..."
    structure_tree = [f"{repo_root.name}/"]
    processed_dirs_for_structure = {repo_root}
    prefix = "└── "

    def sorted_children(directory: Path) -> list[Path]:
        # Directories first, then files, each group ordered case-insensitively.
        return sorted(directory.iterdir(), key=lambda p: (p.is_file(), p.name.lower()))

    def add_to_structure(path_obj: Path, depth: int) -> None:
        indent = "    " * depth  # 4 spaces per level
        if path_obj.is_dir():
            # Guard against listing the same directory twice.
            if path_obj in processed_dirs_for_structure:
                return
            structure_tree.append(f"{indent}{prefix}{path_obj.name}/")
            processed_dirs_for_structure.add(path_obj)
            try:
                for item in sorted_children(path_obj):
                    if not should_ignore(item, ignore_patterns, repo_root):
                        add_to_structure(item, depth + 1)
            except OSError as e:
                logging.warning(f"Could not access directory {path_obj}: {e}")
                structure_tree.append(f"{indent}    └── [Error accessing directory: {e}]")
        elif path_obj.is_file():
            structure_tree.append(f"{indent}{prefix}{path_obj.name}")

    try:
        for item in sorted_children(repo_root):
            if not should_ignore(item, ignore_patterns, repo_root):
                add_to_structure(item, 1)
    except OSError as e:
        logging.error(f"Could not access repository root {repo_root}: {e}")
        structure_tree.append(f"    └── [Error accessing repository root: {e}]")

    structure_lines = ["## Directory Structure", "```", *structure_tree, "```\n"]
    yield "Status: Directory structure generated."
    logging.info("Directory structure built.")

    # --- Pass 2: Process file contents (ONLY if requested) ---
    content_lines = []
    total_files = len(files_to_render)

    if include_content and total_files > 0:
        yield f"Status: Processing content of {total_files} file(s)..."
        content_lines.append("## File Contents\n")
        start_time = time.time()
        show_detailed_progress = total_files <= MAX_FILES_FOR_DETAILED_PROGRESS

        for i, file_path in enumerate(files_to_render):
            try:
                relative_path_str = file_path.relative_to(repo_root).as_posix()
            except ValueError:
                logging.warning(f"Path {file_path} not relative to {repo_root}, skipping content.")
                continue

            # Per-file updates for small repos, every 50th file otherwise.
            if show_detailed_progress or (i % 50 == 0 and i > 0):
                progress_percent = (i + 1) / total_files
                yield f"Status: Processing file {i+1}/{total_files}: {relative_path_str} ({progress_percent:.0%})"

            # The header is appended only after retrieval so that an unexpected
            # failure cannot produce two headers for the same file.
            try:
                content, language, error_msg = get_file_content(file_path, max_size_bytes)
            except Exception as e:
                logging.error(f"Unexpected error processing content for {file_path}: {e}", exc_info=True)
                content_lines.append(f"### `{relative_path_str}`\n")
                content_lines.append(f"```\n[ERROR processing file content: {e}]\n```\n")
                continue

            content_lines.append(f"### `{relative_path_str}`\n")  # POSIX path in Markdown
            if error_msg:
                content_lines.append(f"```\n{error_msg}\n```\n")
            elif content is not None:
                lang_hint = language if language else ""
                # Lengthen the fence until it no longer occurs in the content,
                # so backtick runs inside the file (e.g. a README with its own
                # code fences) cannot terminate the block early.
                fence = "```"
                while fence in content:
                    fence += "`"
                content_lines.append(f"{fence}{lang_hint}\n{content}\n{fence}\n")
            else:
                # Should generally be covered by error_msg cases, but as a fallback
                content_lines.append("```\n[Content not available or file is binary/empty]\n```\n")

        end_time = time.time()
        yield f"Status: File content processing complete ({total_files} files in {end_time - start_time:.2f}s)."
        logging.info(f"File content processing complete. Processed {total_files} files in {end_time - start_time:.2f} seconds.")
    elif not include_content:
        yield "Status: Skipping file content inclusion as requested."
        logging.info("Skipping file content inclusion as requested.")
    else:  # include_content is True but total_files is 0
        yield "Status: No files found to include content for (after filtering)."
        logging.info("No files found to include content for (after filtering).")

    # Combine structure and content. The structure entries carry no trailing
    # newline of their own, so they must be joined with "\n"; the content
    # entries already end in "\n".
    md_parts = ["# Repository Analysis\n", "\n".join(structure_lines), "\n"]
    if include_content and content_lines:
        md_parts.extend(content_lines)

    yield "Status: Markdown generation complete!"
    yield "".join(md_parts)  # Final yield is the complete markdown
308
+
309
+
310
+ # --- MODIFIED: Function is now a generator, yielding status updates ---
311
def repo_to_md_processor(input_type: str, repo_url: str | None, uploaded_zip: tempfile._TemporaryFileWrapper | None, git_branch: str | None, ignore_patterns_str: str, max_file_size_kb: int, include_content: bool):
    """
    Main processing generator called by the Gradio interface.

    Obtains a repository (shallow Git clone or uploaded ZIP) inside a
    temporary directory, renders it with ``generate_markdown_for_repo`` and
    writes the Markdown to a temporary ``.md`` file for download.

    The click handler binds this generator to three output components
    (status textbox, markdown panel, download file). Gradio needs one value
    per bound output on every yield, so each yield is a 3-tuple
    ``(status, markdown, file)``; ``gr.update()`` with no arguments leaves a
    component unchanged.

    Args:
        input_type: ``"URL"`` or ``"Upload ZIP"``.
        repo_url: Git URL starting with ``http://``, ``https://`` or ``git@``.
        uploaded_zip: Uploaded file object exposing ``.name``, or a path string.
        git_branch: Optional branch or tag to clone.
        ignore_patterns_str: Comma-separated ignore patterns; merged with
            ``DEFAULT_IGNORE_PATTERNS``.
        max_file_size_kb: Size limit forwarded to the Markdown generator.
        include_content: Whether file contents are included in the output.

    Yields:
        ``(status, markdown, file)`` tuples. Progress yields only change the
        status; the last yield carries either the final Markdown and the
        download path, or an error report with the download hidden.
        Exceptions are reported through that last yield, not raised.
    """
    def progress(message: str):
        # Status-only update: leave the markdown panel and download slot as they are.
        return message, gr.update(), gr.update()

    repo_path = None
    output_md = ""
    output_file_path = None
    start_time = time.time()
    # Normalise once so validation and the git command line see the same values.
    repo_url = (repo_url or "").strip()
    git_branch = (git_branch or "").strip()

    try:
        yield progress("Status: Initializing...")
        # Combine user patterns with defaults
        user_patterns = {p.strip() for p in (ignore_patterns_str or "").split(',') if p.strip()}
        combined_patterns = sorted(user_patterns.union(DEFAULT_IGNORE_PATTERNS))
        logging.info(f"Using ignore patterns: {combined_patterns}")
        logging.info(f"Max file size for content: {max_file_size_kb} KB")
        logging.info(f"Include file content: {include_content}")
        if input_type == "URL" and git_branch:
            logging.info(f"Requested Git branch/tag: {git_branch}")

        with tempfile.TemporaryDirectory(prefix="repo_md_") as temp_dir:
            logging.info(f"Created temporary directory: {temp_dir}")
            temp_dir_path = Path(temp_dir)

            if input_type == "URL":
                if not repo_url.startswith(("http://", "https://", "git@")):
                    raise ValueError("Invalid Git URL. Must start with http(s):// or git@")
                yield progress(f"Status: Processing URL: {repo_url}" + (f" (branch/tag: {git_branch})" if git_branch else ""))

                target_clone_path = temp_dir_path / "repo"
                target_clone_path.mkdir()
                repo_path_str = str(target_clone_path)

                # --- Git Clone ---
                branch_args = ["--branch", git_branch] if git_branch else []
                common_args = ["--depth", "1"]  # Always shallow clone
                run_opts = dict(check=True, capture_output=True, text=True, encoding='utf-8', errors='replace')

                try:
                    # Try a blob-less partial clone first.
                    yield progress("Status: Attempting efficient Git clone (sparse)...")
                    clone_cmd_sparse = ["git", "clone"] + common_args + ["--filter=blob:none", "--no-checkout"] + branch_args + [repo_url, repo_path_str]
                    logging.info(f"Running sparse clone command: {' '.join(clone_cmd_sparse)}")
                    subprocess.run(clone_cmd_sparse, timeout=CLONE_TIMEOUT_SPARSE, **run_opts)

                    # NOTE: no `git sparse-checkout init --cone` here. With no
                    # directories configured, cone mode restricts the working
                    # tree to root-level files, which would silently drop every
                    # subdirectory from the analysis.
                    checkout_cmd = ["git", "checkout"]
                    logging.info(f"Running final checkout: {' '.join(checkout_cmd)}")
                    subprocess.run(checkout_cmd, cwd=repo_path_str, timeout=CLONE_TIMEOUT_SPARSE, **run_opts)
                    yield progress("Status: Efficient Git clone successful.")
                except (subprocess.CalledProcessError, FileNotFoundError, subprocess.TimeoutExpired) as e_sparse:
                    yield progress(f"Status: Efficient clone failed ({type(e_sparse).__name__}), attempting standard clone...")
                    logging.warning(f"Sparse clone failed: {e_sparse}. Output: {getattr(e_sparse, 'stderr', None) or 'N/A'}")
                    # Start the fallback from an empty target directory.
                    shutil.rmtree(target_clone_path, ignore_errors=True)
                    target_clone_path.mkdir()

                    try:
                        # Fallback to standard shallow clone
                        clone_cmd_std = ["git", "clone"] + common_args + branch_args + [repo_url, repo_path_str]
                        logging.info(f"Running standard clone command: {' '.join(clone_cmd_std)}")
                        subprocess.run(clone_cmd_std, timeout=CLONE_TIMEOUT_STANDARD, **run_opts)
                        yield progress("Status: Standard shallow clone successful.")
                    except FileNotFoundError as e_git:
                        logging.error("Git command not found.")
                        raise RuntimeError("Git command not found. Please install Git and ensure it's in your PATH.") from e_git
                    except subprocess.CalledProcessError as e_std:
                        error_detail = e_std.stderr or e_std.stdout or "No output captured."
                        logging.error(f"Standard Git clone failed: {error_detail.strip()}")
                        raise RuntimeError(f"Git clone failed:\n{error_detail.strip()}") from e_std
                    except subprocess.TimeoutExpired as e_timeout:
                        logging.error(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD} seconds.")
                        raise RuntimeError(f"Git clone timed out after {CLONE_TIMEOUT_STANDARD // 60} minutes.") from e_timeout

                repo_path = target_clone_path

            elif input_type == "Upload ZIP":
                # Accept either a file object exposing `.name` or a plain path string.
                if isinstance(uploaded_zip, (str, os.PathLike)):
                    zip_path = os.fspath(uploaded_zip)
                else:
                    zip_path = getattr(uploaded_zip, 'name', None)
                if not zip_path:
                    raise ValueError("No ZIP file uploaded or invalid file object.")
                yield progress(f"Status: Processing uploaded ZIP: {Path(zip_path).name}")

                target_extract_path = temp_dir_path / "extracted"
                target_extract_path.mkdir()

                try:
                    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                        members = zip_ref.namelist()
                        num_files = len(members)
                        yield progress(f"Status: Extracting {num_files} entries from ZIP...")
                        logging.info(f"ZIP contains {num_files} entries.")
                        if num_files > ZIP_EXTRACT_WARN_THRESHOLD:
                            logging.warning(f"ZIP contains a large number of files ({num_files}).")

                        # Security checks. ZIP member names use '/' regardless of
                        # platform, so split on '/' (after normalising stray
                        # backslashes) instead of os.path.sep.
                        for member in members:
                            normalized = member.replace("\\", "/")
                            if normalized.startswith("/") or ".." in normalized.split("/"):
                                raise ValueError(f"ZIP contains potentially unsafe path: '{member}'. Aborting.")
                            if len(member) > 1024:  # Limit path length
                                raise ValueError(f"ZIP contains excessively long path: '{member[:100]}...'. Aborting.")

                        zip_ref.extractall(target_extract_path)
                        yield progress("Status: ZIP extraction complete.")
                        logging.info("ZIP extraction complete.")

                except zipfile.BadZipFile as e_bad:
                    logging.error("Invalid or corrupted ZIP file uploaded.")
                    raise ValueError("Invalid or corrupted ZIP file.") from e_bad
                except ValueError:
                    # Validation failures above already carry a precise message.
                    raise
                except Exception as e_extract:
                    logging.error(f"Failed to extract ZIP file: {e_extract}", exc_info=True)
                    raise RuntimeError(f"Failed to extract ZIP file: {e_extract}") from e_extract

                # Determine repo root within extracted files
                extracted_items = list(target_extract_path.iterdir())
                filtered_items = [item for item in extracted_items if item.name not in (".DS_Store", "__MACOSX")]

                if len(filtered_items) == 1 and filtered_items[0].is_dir():
                    repo_path = filtered_items[0]
                    logging.info(f"Detected single root directory in ZIP: {repo_path.name}")
                else:
                    repo_path = target_extract_path
                    logging.info("Using root of extracted ZIP as repository root.")

            else:
                raise ValueError("Invalid input type selected.")

            if not repo_path or not repo_path.is_dir():
                raise RuntimeError("Could not determine valid repository path.")

            yield progress(f"Status: Repository path identified: {repo_path}")

            # --- Generate Markdown ---
            # The inner generator yields "Status: ..." strings and then the
            # complete markdown as its last item.
            for status_or_result in generate_markdown_for_repo(str(repo_path), combined_patterns, max_file_size_kb, include_content):
                if status_or_result.startswith("Status:"):
                    yield progress(status_or_result)
                else:
                    output_md = status_or_result  # Final result
                    break
            else:
                # Exhausted without ever producing the markdown.
                logging.error("Markdown generator finished unexpectedly without yielding final result.")
                raise RuntimeError("Markdown generation failed internally.")

        # Save markdown to a temporary file for download. delete=False keeps
        # it on disk so the download component can serve it afterwards.
        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".md", encoding='utf-8', prefix="repo_analysis_") as f:
            f.write(output_md)
            output_file_path = f.name

        # Final update: status, rendered markdown, and the download file
        # (the download component starts hidden, so reveal it here).
        yield (
            f"Status: Analysis complete. Output saved to {Path(output_file_path).name}",
            output_md,
            gr.update(value=output_file_path, visible=True),
        )

    except Exception as e:
        logging.error(f"An error occurred during processing: {e}", exc_info=True)
        error_message = f"An error occurred: {e}"
        # Report the failure in both the status box and the output panel,
        # and hide any download left over from a previous run.
        yield (
            f"Status: Error - {error_message}",
            f"### Operation Failed\n\n```\n{error_message}\n```",
            gr.update(value=None, visible=False),
        )

    finally:
        # Temp directory is cleaned up automatically by the 'with' statement.
        # No yield here: yielding during generator shutdown raises RuntimeError.
        end_time = time.time()
        logging.info(f"Total processing time: {end_time - start_time:.2f} seconds.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
 
487
  # --- Gradio Interface ---
488
 
489
# Custom stylesheet handed to gr.Blocks(css=css). The selectors target the
# elem_id values assigned to components in the interface definition
# (md_output, status_box, copy_button, download_output); it also hides the
# page footer and caps the container width.
css = """
body { font-family: sans-serif; }
#md_output_panel { /* Style the output panel */
    max-height: 80vh;
}
#md_output {
    max-height: 70vh; /* Adjust max height for content */
    overflow: auto;
    border: 1px solid #ccc;
    border-radius: 5px;
    padding: 15px;
    background-color: #f9f9f9;
}
#md_output h1 { font-size: 1.6em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 0;}
#md_output h2 { font-size: 1.3em; border-bottom: 1px solid #eee; padding-bottom: 5px; margin-top: 20px; }
#md_output h3 { font-size: 1.1em; margin-top: 15px; margin-bottom: 5px; color: #333; }
#md_output code { background-color: #eee; padding: 2px 4px; border-radius: 3px; font-size: 0.9em; }
#md_output pre { background-color: #fff; padding: 10px; border-radius: 4px; border: 1px solid #ddd; white-space: pre-wrap; word-wrap: break-word; }
#md_output pre > code { display: block; padding: 0; background-color: transparent; border: none; font-size: 0.9em;} /* Better code block styling */

#status_box {
    font-size: 0.9em;
    color: #555;
    padding: 8px;
    border: 1px dashed #ddd;
    background-color: #fafafa;
    border-radius: 4px;
    min-height: 3em; /* Ensure it's visible even when short messages */
    margin-top: 10px;
}
#copy_button { /* Style the copy button */
    margin-left: 10px;
    min-width: 100px; /* Give it a bit more width */
}
#download_output { margin-top: 15px; }
footer { display: none !important; }
.gradio-container { max-width: 1360px !important; margin: auto !important; }
"""
527
 
528
+ # --- Helper function for Copy Button ---
529
def copy_to_clipboard(text):
    """Copy *text* to the clipboard via pyperclip and report the outcome.

    Returns a ``gr.update`` that relabels the copy button to reflect what
    happened: copied, copy failed, pyperclip missing, or nothing to copy.
    """
    # pyperclip missing: nothing can be copied regardless of the text.
    if not PYPERCLIP_AVAILABLE:
        logging.warning("Copy attempt failed: pyperclip not installed.")
        return gr.update(value="Install Pyperclip", variant="stop")

    # Empty or None output: nothing to put on the clipboard.
    if not text:
        return gr.update(value="Nothing to Copy", variant="secondary")

    try:
        pyperclip.copy(text)
        logging.info("Copied output to clipboard.")
        return gr.update(value="Copied!", variant="secondary")  # Temporary feedback
    except Exception as copy_error:
        logging.error(f"Failed to copy to clipboard: {copy_error}")
        return gr.update(value="Copy Failed", variant="stop")
543
+
544
def reset_copy_button():
    """Pause briefly, then return an update restoring the copy button's default look."""
    feedback_pause_seconds = 1.5
    # Keep the temporary feedback label visible for a moment before resetting.
    time.sleep(feedback_pause_seconds)
    default_appearance = gr.update(value="Copy Markdown", variant="secondary")
    return default_appearance
548
+
549
+
550
# Interface definition: an input column (source selection, options, status)
# next to an output column (copy/download controls and the rendered markdown),
# followed by the event wiring.
with gr.Blocks(css=css, title="Repo Analyzer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Repository Analyzer")
    gr.Markdown(
        "Enter a public Git repository URL or upload a local project folder (as a `.zip` archive) "
        "to generate a single Markdown file containing its structure and optionally file contents. "
        "Provides real-time status updates."
    )

    with gr.Row():
        # --- Input Column ---
        with gr.Column(scale=1):
            gr.Markdown("### Input Source & Options")
            input_type = gr.Radio(
                ["URL", "Upload ZIP"], label="Input Source", value="URL"
            )

            # URL Specific Inputs (conditionally visible)
            url_input = gr.Textbox(
                label="Git Repository URL",
                # SSH-style example restored: the text had been replaced by an
                # email-obfuscation placeholder.
                placeholder="e.g., https://github.com/gradio-app/gradio.git or git@github.com:user/repo.git",
                visible=True, interactive=True, elem_id="url-input"
            )
            git_branch_input = gr.Textbox(
                label="Branch / Tag (Optional)",
                placeholder="e.g., main, develop, v1.2.3 (leave empty for default)",
                visible=True, interactive=True, elem_id="git-branch-input"
            )

            # ZIP Specific Inputs (conditionally visible)
            zip_input = gr.File(
                label="Upload Local Folder (as .zip)",
                file_types=[".zip"],
                visible=False, interactive=True, elem_id="zip-input"
            )

            # --- Common Options in Accordion ---
            with gr.Accordion("Configuration Options", open=False):
                include_content_checkbox = gr.Checkbox(
                    label="Include File Content in Output",
                    value=True,
                    info="Generate structure only if unchecked."
                )
                max_size_input = gr.Number(
                    label="Max File Size for Content (KB)",
                    value=DEFAULT_MAX_FILE_SIZE_KB, minimum=0, step=64, precision=0,
                    info="Files larger than this won't have content included (if enabled). 0 disables content.",
                )
                ignore_input = gr.Textbox(
                    label="Ignore Patterns (comma-separated, gitignore style)",
                    value=", ".join(DEFAULT_IGNORE_PATTERNS),
                    placeholder="e.g., .git/, *.log, node_modules/",
                    info="Uses gitignore syntax. Add `/` for directories. Defaults provided.",
                    lines=5, max_lines=15
                )

            submit_btn = gr.Button("Analyze Repository", variant="primary")

            gr.Markdown("### Status Updates")
            status_output = gr.Textbox(label="Current Status", value="Idle.", interactive=False, lines=3, elem_id="status_box")

        # --- Output Column ---
        with gr.Column(scale=2):
            gr.Markdown("### Generated Output")
            with gr.Row(elem_id="output_header_row"):
                copy_button = gr.Button("Copy Markdown", variant="secondary", elem_id="copy_button", visible=PYPERCLIP_AVAILABLE)  # Hide if pyperclip missing
                download_output = gr.File(label="Download .md File", interactive=False, visible=False, elem_id="download_output", scale=1)  # Take less space initially

            md_output = gr.Markdown(value="*Awaiting analysis results...*", elem_id="md_output", visible=True)

    # --- Event Handlers ---

    def update_input_visibility(choice):
        """Show the URL and branch fields for "URL"; show the ZIP upload otherwise."""
        is_url = choice == "URL"
        return {
            url_input: gr.update(visible=is_url),
            git_branch_input: gr.update(visible=is_url),
            zip_input: gr.update(visible=not is_url)
        }

    # Update visibility based on input type choice
    input_type.change(
        fn=update_input_visibility,
        inputs=input_type,
        outputs=[url_input, git_branch_input, zip_input],
        queue=False  # UI only change
    )

    # Main processing logic on submit
    submit_btn.click(
        fn=repo_to_md_processor,  # The generator function
        inputs=[
            input_type, url_input, zip_input, git_branch_input,
            ignore_input, max_size_input, include_content_checkbox,
        ],
        # Three components are bound, so each value the generator yields must
        # supply one entry per component (status, markdown, download file).
        outputs=[status_output, md_output, download_output],
        api_name="repo_to_md"
    )

    # Copy button functionality
    if PYPERCLIP_AVAILABLE:
        copy_button.click(
            fn=copy_to_clipboard,
            inputs=[md_output],  # Takes the current markdown content
            outputs=[copy_button],  # Updates its own text/appearance
            queue=False
        ).then(
            fn=reset_copy_button,  # Function to reset button after a delay
            inputs=None,
            outputs=[copy_button],
            queue=False  # Don't queue the reset visual change
        )
664
+
665
  # Launch the interface
666
if __name__ == "__main__":
    # Turn on the request queue first, then start the server on all
    # interfaces with error display and debug mode enabled.
    queued_app = demo.queue()
    queued_app.launch(server_name="0.0.0.0", show_error=True, debug=True)