Spaces:

VOIDER
/

repo-to-md

Running

App Files Community

VOIDER committed on May 5

Commit

6085d45

verified ·

1 Parent(s): 5bfbe3f

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -66

app.py CHANGED Viewed

@@ -7,8 +7,9 @@ import pathlib
 import shutil
 from pathspec import PathSpec
 from pathspec.patterns import GitWildMatchPattern
-# --- Configuration ---
 DEFAULT_IGNORE_PATTERNS = """
 # Default Ignore Patterns (Gitignore Syntax)
 /.git/
@@ -33,8 +34,7 @@ INDENT_CHAR = "    " # 4 spaces for indentation
 FOLDER_ICON = "📁"
 FILE_ICON = "📄"
-# --- Core Logic ---
 def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
     """Clones or extracts the repository, returning the local path."""
     temp_dir = tempfile.mkdtemp()
@@ -84,7 +84,7 @@ def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.P
             zip_path = zip_file_obj.name # Gradio provides a temp file path
             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 # Check for common zip structure (single top-level dir)
-                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p))
                 extract_target = temp_dir
                 potential_repo_root = temp_dir
                 if len(top_level_dirs) == 1:
@@ -112,6 +112,10 @@ def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.P
             raise ValueError("Invalid source type selected.")
         if not repo_path or not repo_path.is_dir():
              raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
         return repo_path, temp_dir # Return both the repo content path and the parent temp dir for cleanup
@@ -120,8 +124,10 @@ def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.P
         # Clean up the temporary directory on error before re-raising
         shutil.rmtree(temp_dir, ignore_errors=True)
         print(f"Error in get_repo_path: {e}") # Log error
         raise e # Re-raise the exception to be caught by the main function
 def generate_markdown_structure(
     repo_root_path: pathlib.Path,
     include_content: bool,
@@ -137,10 +143,13 @@ def generate_markdown_structure(
     # --- Prepare ignore patterns ---
     # Combine default and user patterns
     full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
-    # Filter out empty lines
     patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
-    spec = PathSpec.from_lines(GitWildMatchPattern, patterns)
-    print(f"Using ignore patterns: {patterns}") # Debugging
     # --- Add header ---
     repo_name = repo_root_path.name
@@ -150,36 +159,28 @@ def generate_markdown_structure(
     # --- Walk through the directory ---
     progress(0.6, desc="Scanning repository structure...")
     files_processed = 0
-    total_items_estimate = sum(1 for _ in repo_root_path.rglob('*')) # Rough estimate for progress
     items_scanned = 0
-    for item_path in sorted(repo_root_path.rglob('*')):
         items_scanned += 1
         if items_scanned % 50 == 0: # Update progress periodically
              progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
         relative_path = item_path.relative_to(repo_root_path)
-        # Check if the path itself or any of its parent directories should be ignored
-        # Need to check components because pathspec matches relative paths
-        components = relative_path.parts
-        ignored = False
-        # Check root path first for patterns like '/node_modules/'
-        if spec.match_file(str(relative_path)):
-             ignored = True
-        # Check parent directories if a pattern like 'node_modules/' should match anywhere
-        current_check_path = ""
-        for part in components:
-            current_check_path = os.path.join(current_check_path, part)
-            if spec.match_file(current_check_path):
-                ignored = True
-                break
-        if ignored:
-            # If it's a directory, prevent os.walk from descending further if we were using it
-            # With rglob, we just skip the current item
             print(f"Ignoring: {relative_path}") # Debugging
             continue
         # Calculate depth and indentation
@@ -188,13 +189,18 @@ def generate_markdown_structure(
         # Add entry to Markdown
         if item_path.is_dir():
             markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
         elif item_path.is_file():
             markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
             files_processed += 1
             # Include file content if requested and within limits
-            if include_content and max_file_size_bytes > 0:
                 try:
                     file_size = item_path.stat().st_size
                     if file_size == 0:
@@ -203,36 +209,53 @@ def generate_markdown_structure(
                         markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                     elif file_size <= max_file_size_bytes:
                         try:
-                            content = item_path.read_text(encoding='utf-8')
-                            lang = item_path.suffix.lstrip('.')
-                            # Simple lang detection, can be expanded
-                            if not lang: lang = "text"
-                            markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
-                            # Indent content lines
-                            content_lines = content.splitlines()
-                            # Limit output lines displayed in Markdown preview if necessary
-                            # Note: The downloaded file will have full content
-                            display_lines = content_lines[:MAX_OUTPUT_LINES]
-                            for line in display_lines:
-                                markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
-                            if len(content_lines) > MAX_OUTPUT_LINES:
-                                markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
-                            markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                         except UnicodeDecodeError:
-                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Not a UTF-8 text file (Size: {file_size} bytes)]")
                         except Exception as read_err:
                              markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                     else:
-                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size} bytes) exceeds limit ({max_file_size_bytes} bytes)]")
                 except OSError as stat_err:
-                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file - {stat_err}]")
-            elif include_content and max_file_size_bytes == 0: # Content included, but 0 size limit means no content shown
                  markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
-        # Add a newline for separation, helps readability
         markdown_lines.append("")
@@ -254,46 +277,73 @@ def process_repo(
     output_file_path = None
     repo_root_path = None
     temp_dir_to_clean = None
     try:
         progress(0, desc="Starting...")
         # 1. Get Repository Path
-        yield "Fetching repository...", "", None # Update status, clear outputs
         repo_root_path, temp_dir_to_clean = get_repo_path(
             source_type, repo_url, branch_tag, zip_file_obj, progress=progress
         )
-        yield f"Repository ready at: {repo_root_path}", "", None
         # 2. Generate Markdown
-        yield "Generating Markdown structure...", "", None
         markdown_content = generate_markdown_structure(
-            repo_root_path, include_content, int(max_size_kb), ignore_patterns, progress=progress
         )
         # 3. Prepare Output File
-        yield "Saving Markdown to file...", markdown_content[:3000] + ("\n\n[... Output truncated in preview ...]" if len(markdown_content)>3000 else ""), None # Show preview
         output_filename = f"{repo_root_path.name}_structure.md"
         # Save the file in a place Gradio can access (it manages temp files)
-        # Create a temporary file for Gradio output
         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
              temp_file.write(markdown_content)
              output_file_path = temp_file.name # Gradio needs the path to this file
-        yield f"Done. Output file '{output_filename}' ready for download.", markdown_content[:3000] + ("\n\n[... Output truncated in preview ...]" if len(markdown_content)>3000 else ""), gr.File.update(value=output_file_path, visible=True, label=f"Download {output_filename}") # Make file downloadable
     except ValueError as ve:
         print(f"Value Error: {ve}") # Log error
-        yield f"Error: {ve}", "", gr.File.update(value=None, visible=False)
     except subprocess.CalledProcessError as cpe:
          error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
          print(f"Git Error: {error_detail}") # Log error
-         yield f"Git command failed: {error_detail}", "", gr.File.update(value=None, visible=False)
     except Exception as e:
         print(f"Unexpected Error: {e}") # Log error
-        import traceback
         traceback.print_exc() # Print full traceback to logs
-        yield f"An unexpected error occurred: {e}", "", gr.File.update(value=None, visible=False)
     finally:
         # 4. Cleanup
         if temp_dir_to_clean:
@@ -302,7 +352,7 @@ def process_repo(
              print("Cleanup complete.")
-# --- Build Gradio UI ---
 with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
     gr.Markdown("# GitHub Repository to Markdown Converter")
@@ -344,10 +394,10 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")
             # Use a Textbox for preview initially, as Markdown rendering can be slow/heavy
             markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
             # Use gr.File for the final download link
-            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)
-    # --- Event Handlers ---
     def toggle_input_visibility(choice):
         if choice == "URL":
             return gr.update(visible=True), gr.update(visible=False)
@@ -370,6 +420,8 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")
         # api_name="generate_markdown" # Optional: for API access
     )
-# --- Launch the App ---
 if __name__ == "__main__":
-    demo.queue().launch(debug=True) # Enable queue for handling multiple users, debug=True for local testing

 import shutil
 from pathspec import PathSpec
 from pathspec.patterns import GitWildMatchPattern
+import traceback # Import traceback for better error logging
+# --- Configuration --- (Keep as before)
 DEFAULT_IGNORE_PATTERNS = """
 # Default Ignore Patterns (Gitignore Syntax)
 /.git/
 FOLDER_ICON = "📁"
 FILE_ICON = "📄"
+# --- Core Logic --- (Keep get_repo_path and generate_markdown_structure as before)
 def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
     """Clones or extracts the repository, returning the local path."""
     temp_dir = tempfile.mkdtemp()
             zip_path = zip_file_obj.name # Gradio provides a temp file path
             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 # Check for common zip structure (single top-level dir)
+                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p and p.split('/')[0]))
                 extract_target = temp_dir
                 potential_repo_root = temp_dir
                 if len(top_level_dirs) == 1:
             raise ValueError("Invalid source type selected.")
         if not repo_path or not repo_path.is_dir():
+             # Add more specific debugging info here
+             print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
+             if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
+                  print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
              raise ValueError(f"Could not determine repository root directory within: {temp_dir}")
         return repo_path, temp_dir # Return both the repo content path and the parent temp dir for cleanup
         # Clean up the temporary directory on error before re-raising
         shutil.rmtree(temp_dir, ignore_errors=True)
         print(f"Error in get_repo_path: {e}") # Log error
+        traceback.print_exc() # Print full traceback for debugging get_repo_path issues
         raise e # Re-raise the exception to be caught by the main function
 def generate_markdown_structure(
     repo_root_path: pathlib.Path,
     include_content: bool,
     # --- Prepare ignore patterns ---
     # Combine default and user patterns
     full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
+    # Filter out empty lines and comments
     patterns = [line for line in full_ignore_patterns.splitlines() if line.strip() and not line.strip().startswith('#')]
+    # Create unique list while preserving order (important if later patterns override earlier ones)
+    seen = set()
+    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
+    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
+    print(f"Using unique ignore patterns: {unique_patterns}") # Debugging
     # --- Add header ---
     repo_name = repo_root_path.name
     # --- Walk through the directory ---
     progress(0.6, desc="Scanning repository structure...")
     files_processed = 0
+    # Need to iterate through items relative to the root for pathspec matching
+    all_items = sorted(list(repo_root_path.rglob('*')))
+    total_items_estimate = len(all_items) # More accurate estimate
     items_scanned = 0
+    for item_path in all_items:
         items_scanned += 1
         if items_scanned % 50 == 0: # Update progress periodically
              progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))), desc=f"Scanning: {item_path.name}")
         relative_path = item_path.relative_to(repo_root_path)
+        # Pathspec matches against the path string relative to the root where .gitignore would be
+        # Important: Add a leading '/' for patterns like '/node_modules/' to only match at the root
+        path_str_for_match = str(relative_path)
+        # Check if the path itself should be ignored
+        # Pathspec automatically handles directory patterns (e.g., node_modules/ matches files and dirs inside)
+        if spec.match_file(path_str_for_match):
             print(f"Ignoring: {relative_path}") # Debugging
+            # If it's a directory, we don't need to manually skip recursion because
+            # rglob already gave us all paths; we just skip processing this specific path.
+            # If we were using os.walk, we'd modify the dirs list here.
             continue
         # Calculate depth and indentation
         # Add entry to Markdown
         if item_path.is_dir():
+            # Check if dir is empty *after* considering ignores. This is tricky with rglob.
+            # A simple heuristic: check if any non-ignored children exist directly within it.
+            # This isn't perfect but avoids complex lookahead.
+            # has_children = any(p.relative_to(repo_root_path).parts[0] == relative_path.parts[0] and not spec.match_file(str(p.relative_to(repo_root_path))) for p in all_items if p != item_path and p.parent == item_path)
+            # Simpler: Just always list the dir for now. Empty dir check is complex with ignores + rglob.
             markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
         elif item_path.is_file():
             markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
             files_processed += 1
             # Include file content if requested and within limits
+            if include_content and max_size_kb > 0: # Check > 0 explicitly
                 try:
                     file_size = item_path.stat().st_size
                     if file_size == 0:
                         markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                     elif file_size <= max_file_size_bytes:
                         try:
+                            # Attempt to detect binary files heuristically before reading large ones
+                            is_binary = False
+                            try:
+                                # Read a small chunk to check for null bytes
+                                with open(item_path, 'rb') as bf:
+                                    chunk = bf.read(1024)
+                                    if b'\x00' in chunk:
+                                        is_binary = True
+                            except Exception:
+                                # Ignore errors during binary check, proceed as text
+                                pass
+                            if is_binary:
+                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
+                            else:
+                                content = item_path.read_text(encoding='utf-8', errors='replace') # Replace errors instead of failing
+                                lang = item_path.suffix.lstrip('.')
+                                # Simple lang detection, can be expanded
+                                if not lang: lang = "text"
+                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
+                                # Indent content lines
+                                content_lines = content.splitlines()
+                                # Limit output lines displayed in Markdown preview if necessary
+                                # Note: The downloaded file will have full content
+                                display_lines = content_lines[:MAX_OUTPUT_LINES]
+                                for line in display_lines:
+                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
+                                if len(content_lines) > MAX_OUTPUT_LINES:
+                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
+                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                         except UnicodeDecodeError:
+                             # Should be less common now with errors='replace'
+                             markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                         except Exception as read_err:
                              markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                     else:
+                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]") # Added commas
                 except OSError as stat_err:
+                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
+            elif include_content and max_size_kb == 0: # Content included checked, but 0 size limit
                  markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")
+        # Add a newline for separation, helps readability only if content wasn't added (which adds ```\n)
+        # Or maybe always add it for consistency between file/dir entries
         markdown_lines.append("")
     output_file_path = None
     repo_root_path = None
     temp_dir_to_clean = None
+    # Ensure max_size_kb is treated as a number
+    try:
+        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
+    except ValueError:
+        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
+        return
     try:
         progress(0, desc="Starting...")
+        # Initial state update for all outputs
+        yield "Preparing...", "", gr.update(value=None, visible=False)
         # 1. Get Repository Path
+        yield "Fetching repository...", "", gr.update(value=None, visible=False)
         repo_root_path, temp_dir_to_clean = get_repo_path(
             source_type, repo_url, branch_tag, zip_file_obj, progress=progress
         )
+        # Check if path finding was successful before proceeding
+        if not repo_root_path:
+             # Error should have been raised in get_repo_path, but double-check
+             raise ValueError("Failed to obtain repository path.")
+        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)
         # 2. Generate Markdown
+        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
         markdown_content = generate_markdown_structure(
+            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
         )
+        # Limit preview size robustly
+        preview_limit = 3000
+        markdown_preview = markdown_content[:preview_limit]
+        if len(markdown_content) > preview_limit:
+            markdown_preview += "\n\n[... Output truncated in preview ...]"
         # 3. Prepare Output File
+        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
         output_filename = f"{repo_root_path.name}_structure.md"
+        # Sanitize filename slightly (replace spaces, etc.) - less critical in temp file context
+        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
         # Save the file in a place Gradio can access (it manages temp files)
         with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
              temp_file.write(markdown_content)
              output_file_path = temp_file.name # Gradio needs the path to this file
+        # *** CORRECTED YIELD USING gr.update ***
+        yield f"Done. Output file '{output_filename}' ready for download.", markdown_preview, gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}")
     except ValueError as ve:
         print(f"Value Error: {ve}") # Log error
+        traceback.print_exc()
+        # *** CORRECTED YIELD USING gr.update ***
+        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
     except subprocess.CalledProcessError as cpe:
          error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
          print(f"Git Error: {error_detail}") # Log error
+         traceback.print_exc()
+         # *** CORRECTED YIELD USING gr.update ***
+         yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
     except Exception as e:
         print(f"Unexpected Error: {e}") # Log error
         traceback.print_exc() # Print full traceback to logs
+        # *** CORRECTED YIELD USING gr.update ***
+        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
     finally:
         # 4. Cleanup
         if temp_dir_to_clean:
              print("Cleanup complete.")
+# --- Build Gradio UI --- (Keep as before)
 with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
     gr.Markdown("# GitHub Repository to Markdown Converter")
             # Use a Textbox for preview initially, as Markdown rendering can be slow/heavy
             markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
             # Use gr.File for the final download link
+            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False) # Set interactive=False
+    # --- Event Handlers --- (Keep as before)
     def toggle_input_visibility(choice):
         if choice == "URL":
             return gr.update(visible=True), gr.update(visible=False)
         # api_name="generate_markdown" # Optional: for API access
     )
+# --- Launch the App --- (Keep as before)
 if __name__ == "__main__":
+    # Ensure queue is enabled for HF Spaces deployment
+    # debug=True is useful for local testing, might remove/set to False for production space
+    demo.queue().launch(debug=True)