Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
# app.py
|
| 2 |
# Invoice Extraction — Donut (public HF model) + Tesseract tables
|
| 3 |
-
#
|
|
|
|
|
|
|
| 4 |
|
| 5 |
import os, io, re, json, shutil
|
| 6 |
from typing import List
|
|
@@ -42,7 +44,7 @@ task_prompt = st.sidebar.text_input(
|
|
| 42 |
det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
|
| 43 |
show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
|
| 44 |
|
| 45 |
-
# ----------------------------- PDF loader (Fix A) -----------------------------
|
| 46 |
def _find_poppler_path():
|
| 47 |
# Return a folder containing pdfinfo/pdftoppm if not on PATH
|
| 48 |
if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
|
|
@@ -52,14 +54,33 @@ def _find_poppler_path():
|
|
| 52 |
return p
|
| 53 |
return None
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
|
| 56 |
name = (name or "").lower()
|
| 57 |
if name.endswith(".pdf"):
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
return
|
| 61 |
-
|
| 62 |
-
|
|
|
|
| 63 |
return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
|
| 64 |
|
| 65 |
def preprocess_for_detection(img: Image.Image) -> Image.Image:
|
|
|
|
| 1 |
# app.py
|
| 2 |
# Invoice Extraction — Donut (public HF model) + Tesseract tables
|
| 3 |
+
# Robust PDF handling:
|
| 4 |
+
# 1) Try pdf2image with Poppler path detection (Fix A)
|
| 5 |
+
# 2) If Poppler is missing, auto-fallback to PyMuPDF (no Poppler required)
|
| 6 |
|
| 7 |
import os, io, re, json, shutil
|
| 8 |
from typing import List
|
|
|
|
| 44 |
det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
|
| 45 |
show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
|
| 46 |
|
| 47 |
+
# ----------------------------- PDF loader (Fix A + fallback) -----------------------------
|
| 48 |
def _find_poppler_path():
|
| 49 |
# Return a folder containing pdfinfo/pdftoppm if not on PATH
|
| 50 |
if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
|
|
|
|
| 54 |
return p
|
| 55 |
return None
|
| 56 |
|
| 57 |
+
def _pages_via_pdf2image(file_bytes: bytes, dpi: int = 300) -> List[Image.Image]:
    """Rasterize a PDF with pdf2image (Poppler backend).

    Args:
        file_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
            Defaults to 300, the value that used to be hard-coded here.

    Returns:
        The list of page images produced by ``convert_from_bytes``.

    Raises:
        Whatever ``convert_from_bytes`` raises (for example when the Poppler
        binaries cannot be found); nothing is caught here so the caller can
        decide how to fall back.
    """
    # A falsy result from _find_poppler_path() is normalized to None, which is
    # pdf2image's own default for ``poppler_path`` and means "resolve the
    # Poppler tools via PATH" -- the same as omitting the argument entirely.
    poppler_path = _find_poppler_path() or None
    return convert_from_bytes(file_bytes, dpi=dpi, poppler_path=poppler_path)
|
| 63 |
+
|
| 64 |
+
def _pages_via_pymupdf(file_bytes: bytes, dpi: int = 300) -> List[Image.Image]:
    """Rasterize a PDF with PyMuPDF (no Poppler required).

    Args:
        file_bytes: Raw bytes of the PDF document.
        dpi: Target rendering resolution. Defaults to 300 so this route
            produces pages at the same resolution as the pdf2image route
            instead of PyMuPDF's native 72 dpi.

    Returns:
        One RGB ``PIL.Image`` per page, in document order.

    Raises:
        ImportError: If PyMuPDF (``fitz``) is not installed.
        Any exception PyMuPDF raises while opening or rendering the document.
    """
    # Imported lazily so the module can still be imported when PyMuPDF is absent.
    import fitz  # PyMuPDF

    # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        pages = []
        for page in doc:
            # Request RGB without alpha explicitly: the "RGB" decode below
            # needs exactly three bytes per pixel.
            pix = page.get_pixmap(matrix=matrix, colorspace=fitz.csRGB, alpha=False)
            pages.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
        return pages
    finally:
        # Release the document even if rendering a page fails.
        doc.close()
|
| 74 |
+
|
| 75 |
def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
    """Turn an uploaded file into a list of page images.

    Args:
        file_bytes: Raw bytes of the uploaded file.
        name: Original file name; may be empty or ``None``. Compared
            case-insensitively.

    Returns:
        For a PDF, the page images produced by the PDF backends; for anything
        else, a single-element list holding the file decoded as an RGB image.

    Raises:
        Whatever the PyMuPDF fallback raises if both PDF routes fail, or
        whatever ``Image.open`` raises for an unreadable image.
    """
    name = (name or "").lower()
    # Besides the extension, accept the "%PDF-" signature so a PDF uploaded
    # with a missing or misleading name is not handed to Image.open.
    looks_like_pdf = name.endswith(".pdf") or bytes(file_bytes[:5]) == b"%PDF-"
    if looks_like_pdf:
        # Try Poppler route first
        try:
            return _pages_via_pdf2image(file_bytes)
        except Exception:
            # Deliberately broad best-effort: any failure of the Poppler route
            # triggers the fallback. If the fallback fails too, its exception
            # propagates with the Poppler error attached as __context__.
            # Fallback: PyMuPDF (no Poppler required)
            return _pages_via_pymupdf(file_bytes)
    return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
|
| 85 |
|
| 86 |
def preprocess_for_detection(img: Image.Image) -> Image.Image:
|