pdf-to-markdown

Sleeping

App Files Files Community

Biifruu committed on Jun 2

Commit

75d0452

verified ·

1 Parent(s): 158a59d

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -14

app.py CHANGED Viewed

@@ -1,11 +1,21 @@
-from PIL import Image
-import pytesseract
 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
-import ocrmypdf
 import tempfile
 import os
 def extract_text_markdown(doc):
     markdown_output = ""
@@ -21,30 +31,36 @@ def extract_text_markdown(doc):
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
                     line_text = " ".join([span["text"] for span in line["spans"]]).strip()
                     if line_text:
-                        elements.append((line_y, line_text))
             elif b["type"] == 1:  # Imagen
-                elements.append((y, f"[imagen_{image_counter}]()"))
                 image_counter += 1
         elements.sort(key=lambda x: x[0])
         previous_y = None
-        for y, content in elements:
             if previous_y is not None and abs(y - previous_y) > 10:
                 markdown_output += "\n"
-            markdown_output += content + "\n"
             previous_y = y
         markdown_output += "\n---\n\n"
     return markdown_output.strip()
-def needs_ocr(doc):
-    text_length = sum(len(page.get_text().strip()) for page in doc)
-    image_count = sum(len(page.get_images(full=True)) for page in doc)
-    return text_length < 500 or image_count > 0
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
@@ -59,11 +75,11 @@ def convert(pdf_file):
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
-            # Página sin texto: usar OCR por imagen
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             ocr_text = pytesseract.image_to_string(img, lang="spa")
-            markdown_output += ocr_text.strip() + "\n"
         markdown_output += "\n---\n\n"

 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
 import tempfile
 import os
+from PIL import Image
+import pytesseract
def clean_ocr_text(text: str) -> str:
    """Return *text* with every line trimmed and blank lines dropped.

    Args:
        text: Multi-line string to clean; in this file it is the raw
            output of ``pytesseract.image_to_string``.

    Returns:
        The non-empty lines of *text*, each stripped of leading and
        trailing whitespace, joined with single newline characters.
        An empty or whitespace-only input yields an empty string.
    """
    # After strip() a line is either empty or holds at least one
    # non-whitespace character, so a plain truthiness test is enough;
    # the former extra ``isspace()`` check could never be true.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
 def extract_text_markdown(doc):
     markdown_output = ""
                 for line in b["lines"]:
                     line_y = line["bbox"][1]
                     line_text = " ".join([span["text"] for span in line["spans"]]).strip()
+                    max_font_size = max([span.get("size", 10) for span in line["spans"]])
                     if line_text:
+                        elements.append((line_y, line_text, max_font_size))
             elif b["type"] == 1:  # Imagen
+                elements.append((y, f"![imagen_{image_counter}](#)", 10))
                 image_counter += 1
         elements.sort(key=lambda x: x[0])
         previous_y = None
+        previous_font = None
+        for y, text, font_size in elements:
+            is_header = font_size >= 14
             if previous_y is not None and abs(y - previous_y) > 10:
                 markdown_output += "\n"
+            if is_header:
+                markdown_output += f"\n### {text.strip()}\n"
+            else:
+                markdown_output += text.strip() + "\n"
             previous_y = y
+            previous_font = font_size
         markdown_output += "\n---\n\n"
     return markdown_output.strip()
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
             # Página con texto normal
             markdown_output += extract_text_markdown([page]) + "\n"
         else:
+            # Página sin texto: usar OCR
             pix = page.get_pixmap(dpi=300)
             img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             ocr_text = pytesseract.image_to_string(img, lang="spa")
+            markdown_output += clean_ocr_text(ocr_text) + "\n"
         markdown_output += "\n---\n\n"