File size: 21,532 Bytes
40184db
 
 
 
 
3c5f9e7
29606bb
 
 
6085d45
29606bb
6085d45
3c5f9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6085d45
3c5f9e7
 
 
 
29606bb
5c9292e
3c5f9e7
 
 
 
 
 
 
 
29606bb
3c5f9e7
 
29606bb
3c5f9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6085d45
3c5f9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29606bb
3c5f9e7
6085d45
 
 
 
3c5f9e7
 
 
 
 
 
 
 
6085d45
3c5f9e7
 
6085d45
3c5f9e7
 
 
 
 
 
29606bb
3c5f9e7
 
29606bb
3c5f9e7
 
 
 
 
6085d45
3c5f9e7
6085d45
 
 
 
 
3c5f9e7
 
 
 
 
 
 
 
 
6085d45
 
 
3c5f9e7
 
6085d45
3c5f9e7
 
 
 
 
6085d45
 
 
3c5f9e7
6085d45
 
 
3c5f9e7
6085d45
 
 
3c5f9e7
 
 
 
 
 
 
 
6085d45
 
 
 
 
3c5f9e7
 
 
 
 
 
6085d45
3c5f9e7
 
 
 
 
 
 
 
6085d45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c5f9e7
6085d45
 
3c5f9e7
 
 
6085d45
3c5f9e7
6085d45
3c5f9e7
6085d45
3c5f9e7
 
 
6085d45
 
3c5f9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
29606bb
3c5f9e7
 
 
 
 
 
6085d45
 
 
 
 
 
 
40184db
 
3c5f9e7
6085d45
 
f19e8d1
3c5f9e7
6085d45
3c5f9e7
 
29606bb
6085d45
 
 
 
 
 
29606bb
3c5f9e7
6085d45
3c5f9e7
6085d45
3c5f9e7
29606bb
6085d45
 
 
 
 
 
3c5f9e7
6085d45
3c5f9e7
6085d45
 
 
3c5f9e7
 
 
 
29606bb
6085d45
 
29606bb
 
3c5f9e7
6085d45
 
 
29606bb
3c5f9e7
 
6085d45
 
 
40184db
3c5f9e7
 
6085d45
 
40184db
3c5f9e7
 
 
 
 
 
 
6085d45
3c5f9e7
 
29606bb
3c5f9e7
40184db
 
24226a4
3c5f9e7
 
 
40184db
 
3c5f9e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40184db
 
29606bb
 
24226a4
3c5f9e7
 
 
 
 
6085d45
3c5f9e7
 
6085d45
3c5f9e7
 
 
 
 
 
 
 
 
 
 
40184db
29606bb
3c5f9e7
40184db
3c5f9e7
 
40184db
3c5f9e7
 
40184db
 
6085d45
40184db
6085d45
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback # Import traceback for better error logging

# --- Configuration ---
# Baseline ignore list merged with the user's patterns in
# generate_markdown_structure(); uses .gitignore syntax (leading '/'
# anchors a pattern to the repository root).
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""
MAX_OUTPUT_LINES = 10000 # Limit potential output size in display
INDENT_CHAR = "    " # 4 spaces for indentation
# Fixed: these were mojibake ("πŸ“"/"πŸ“„") from the folder/page emoji
# being re-decoded through a legacy codepage.
FOLDER_ICON = "📁"
FILE_ICON = "📄"

# --- Core Logic --- (Keep get_repo_path and generate_markdown_structure as before)
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clone a Git repository or extract an uploaded ZIP into a fresh temp dir.

    Args:
        source_type: "URL" to shallow-clone with git, "Upload ZIP" to extract.
        repo_url: Git repository URL (required when source_type == "URL").
        branch_tag: Optional branch/tag to clone; on failure the default
            branch is retried before giving up.
        zip_file_obj: Gradio file object exposing a ``.name`` temp-file path
            (required when source_type == "Upload ZIP").
        progress: Gradio progress tracker.

    Returns:
        (repo_path, temp_dir): the directory holding the repository content,
        and the parent temp dir the CALLER must clean up on success.

    Raises:
        ValueError: on missing input, clone failure, or extraction failure.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None

    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            # Shallow clone: only the working tree is needed, not history.
            git_command = ["git", "clone", "--depth", "1"]
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])

            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)

            if result.returncode != 0:
                # The named branch may simply not exist; retry the default branch.
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)

                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log the full detail server-side
                    # Surface a user-friendly message for the common failure modes.
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")

            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging

        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # GitHub download ZIPs wrap everything in a single "<repo>-<ref>/"
                # directory. Only descend into it when *every* entry lives under
                # that one root; the previous check ignored root-level files
                # (names without '/'), which could silently drop them.
                roots = {name.split('/', 1)[0] for name in zip_ref.namelist() if name.strip('/')}
                zip_ref.extractall(temp_dir)
                potential_repo_root = temp_dir
                if len(roots) == 1:
                    sole_root = os.path.join(temp_dir, next(iter(roots)))
                    if os.path.isdir(sole_root):
                        potential_repo_root = sole_root
                        print(f"ZIP has single top-level dir: {next(iter(roots))}. Potential root: {potential_repo_root}")
                else:
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {temp_dir}")

                repo_path = pathlib.Path(potential_repo_root)

            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging
        else:
            raise ValueError("Invalid source type selected.")

        if not repo_path or not repo_path.is_dir():
            # Extra diagnostics before giving up.
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")

        # Return the content root plus the parent temp dir for caller cleanup.
        return repo_path, temp_dir

    except Exception as e:
        # On failure the caller never receives temp_dir, so remove it here.
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")  # Log error
        traceback.print_exc()
        raise  # bare raise preserves the original traceback


def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress()
):
    """Render a repository tree (and optionally file contents) as Markdown.

    Args:
        repo_root_path: Directory whose tree is rendered.
        include_content: When True, embed file contents in fenced code blocks.
        max_size_kb: Per-file content cap in KB; 0 disables content inclusion
            entirely even when include_content is True.
        ignore_patterns_str: User-supplied gitignore-style patterns, merged
            with DEFAULT_IGNORE_PATTERNS.
        progress: Gradio progress tracker.

    Returns:
        The complete Markdown document as a single string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0

    # --- Prepare ignore patterns ---
    # Combine default and user patterns, drop blanks/comments, and dedupe
    # while preserving order (later patterns can override earlier ones).
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
    unique_patterns = list(dict.fromkeys(patterns))
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging

    # --- Add header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")

    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Sorted rglob gives a stable listing; paths are matched relative to the root.
    all_items = sorted(repo_root_path.rglob('*'))
    total_items_estimate = len(all_items)

    items_scanned = 0
    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")

        relative_path = item_path.relative_to(repo_root_path)
        # pathspec expects gitignore-style forward-slash paths (str() would use
        # '\' on Windows), and directory-only patterns such as "build/" only
        # match paths ending in a slash — so normalize separators and append
        # '/' for directories. Without this, ignored directories were still
        # listed even though their contents were skipped.
        path_str_for_match = relative_path.as_posix()
        if item_path.is_dir():
            path_str_for_match += '/'

        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already yielded every descendant individually; children of
            # an ignored directory are matched by the same directory pattern on
            # their own iterations, so no manual recursion pruning is needed.
            continue

        # Indentation mirrors depth below the repo root (root children = depth 0).
        depth = len(relative_path.parts) - 1
        indent = INDENT_CHAR * depth

        if item_path.is_dir():
            # Empty-dir detection under ignores is complex with rglob, so
            # directories are always listed.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1

            # Include file content if requested and within limits.
            if include_content and max_size_kb > 0:
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Heuristic binary sniff: NUL byte within the first KB.
                            is_binary = False
                            try:
                                with open(item_path, 'rb') as bf:
                                    if b'\x00' in bf.read(1024):
                                        is_binary = True
                            except Exception:
                                # Ignore sniff errors; fall through as text.
                                pass

                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                content = item_path.read_text(encoding='utf-8', errors='replace')  # Replace errors instead of failing
                                lang = item_path.suffix.lstrip('.')
                                # Simple lang detection, can be expanded
                                if not lang: lang = "text"

                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                content_lines = content.splitlines()
                                # Cap preview length; the downloaded file keeps full content.
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Rare now that errors='replace' is used above.
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")

            elif include_content and max_size_kb == 0:  # Content requested but size limit is 0
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")

        # Blank line between entries for readability.
        markdown_lines.append("")

    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown

# --- Gradio Interface ---

def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True)
):
    """Gradio click handler: fetch a repo, render it to Markdown, offer a download.

    Yields (status_text, markdown_preview, download_update) tuples so the UI
    updates incrementally; the three values map onto the status / preview /
    download output components wired up in the Blocks layout.

    Args mirror the UI inputs: source selection, URL + branch, uploaded ZIP,
    content-inclusion flag, per-file size cap (KB), and ignore patterns.
    """
    temp_dir_to_clean = None  # parent temp dir; always removed in `finally`
    # max_size_kb arrives from a gr.Number and may be None or non-integer.
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return

    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs.
        yield "Preparing...", "", gr.update(value=None, visible=False)

        # 1. Get repository path (clone or unzip).
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        if not repo_root_path:
            # get_repo_path should have raised already; belt-and-braces check.
            raise ValueError("Failed to obtain repository path.")

        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)

        # 2. Generate the Markdown document.
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )

        # Truncate the UI preview; the downloaded file is complete.
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"

        # 3. Write the full document to a temp file Gradio can serve.
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize the display filename (spaces etc.).
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)

        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file

        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")

    except ValueError as ve:
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Full traceback to logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Clean up the cloned/extracted tree.
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")


# --- Build Gradio UI ---
# Two-column layout: inputs/configuration on the left, status + preview +
# download on the right. process_repo streams updates into the right column.
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            # Radio toggles which of the two input groups below is visible.
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )

            url_input_group = gr.Group(visible=True) # Show URL by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")

            zip_input_group = gr.Group(visible=False) # Hide ZIP by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])

            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                                       info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above.")
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )

            generate_button = gr.Button("Generate Markdown", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Textbox rather than gr.Markdown for the preview: rendering large
            # Markdown documents in the browser can be slow/heavy.
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # gr.File serves as the final download link; hidden until ready.
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False) # Set interactive=False


    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        # Show exactly one of the URL / ZIP input groups for the chosen source.
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else: # ZIP
            return gr.update(visible=False), gr.update(visible=True)

    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )

    # process_repo is a generator, so each yield updates all three outputs.
    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown" # Optional: for API access
    )

# --- Launch the App ---
if __name__ == "__main__":
    # Queueing is required for generator callbacks (streaming yields) and for
    # Hugging Face Spaces; debug=True aids local testing and may be disabled
    # for a production Space.
    app = demo.queue()
    app.launch(debug=True)