DPT2

Sleeping

App Files Files Community

Seth0330 committed 25 days ago

Commit

60a3137

verified ·

1 Parent(s): 99481de

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -7

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 # app.py
 # Invoice Extraction — Donut (public HF model) + Tesseract tables
-# Fixed version with Poppler PATH detection (Fix A)
 import os, io, re, json, shutil
 from typing import List
@@ -42,7 +44,7 @@ task_prompt = st.sidebar.text_input(
 det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
 show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
-# ----------------------------- PDF loader (Fix A) -----------------------------
 def _find_poppler_path():
     # Return a folder containing pdfinfo/pdftoppm if not on PATH
     if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
@@ -52,14 +54,33 @@ def _find_poppler_path():
             return p
     return None
 def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
     name = (name or "").lower()
     if name.endswith(".pdf"):
-        poppler_path = _find_poppler_path()
-        if poppler_path:
-            return convert_from_bytes(file_bytes, dpi=300, poppler_path=poppler_path)
-        else:
-            return convert_from_bytes(file_bytes, dpi=300)
     return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
 def preprocess_for_detection(img: Image.Image) -> Image.Image:

 # app.py
 # Invoice Extraction — Donut (public HF model) + Tesseract tables
+# Robust PDF handling:
+# 1) Try pdf2image with Poppler path detection (Fix A)
+# 2) If Poppler is missing, auto-fallback to PyMuPDF (no Poppler required)
 import os, io, re, json, shutil
 from typing import List
 det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
 show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
+# ----------------------------- PDF loader (Fix A + fallback) -----------------------------
 def _find_poppler_path():
     # Return a folder containing pdfinfo/pdftoppm if not on PATH
     if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
             return p
     return None
+def _pages_via_pdf2image(file_bytes: bytes) -> List[Image.Image]:
+    poppler_path = _find_poppler_path()
+    if poppler_path:
+        return convert_from_bytes(file_bytes, dpi=300, poppler_path=poppler_path)
+    else:
+        return convert_from_bytes(file_bytes, dpi=300)
+def _pages_via_pymupdf(file_bytes: bytes) -> List[Image.Image]:
+    import fitz  # PyMuPDF
+    doc = fitz.open(stream=file_bytes, filetype="pdf")
+    pages = []
+    for page in doc:
+        # Use a mild upscale for better OCR if you want: matrix = fitz.Matrix(2, 2)
+        pix = page.get_pixmap()  # or: page.get_pixmap(matrix=matrix)
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        pages.append(img)
+    return pages
 def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
+    # Turn an uploaded file into a list of page images.
+    # Names ending in ".pdf" (case-insensitive) are rendered as PDFs; any other
+    # name, including None or "", is opened as a single image and converted to RGB.
     name = (name or "").lower()
     if name.endswith(".pdf"):
+        # Try Poppler route first
+        try:
+            return _pages_via_pdf2image(file_bytes)
+        except Exception:
+            # Fallback: PyMuPDF (no Poppler required)
+            # NOTE: every Exception from the pdf2image route lands here, not only
+            # a missing Poppler install; the original error is discarded.
+            return _pages_via_pymupdf(file_bytes)
     return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
 def preprocess_for_detection(img: Image.Image) -> Image.Image: