Seth0330 committed on
Commit
60a3137
·
verified ·
1 Parent(s): 99481de

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -7
app.py CHANGED
@@ -1,6 +1,8 @@
1
  # app.py
2
  # Invoice Extraction — Donut (public HF model) + Tesseract tables
3
- # Fixed version with Poppler PATH detection (Fix A)
 
 
4
 
5
  import os, io, re, json, shutil
6
  from typing import List
@@ -42,7 +44,7 @@ task_prompt = st.sidebar.text_input(
42
  det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
43
  show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
44
 
45
- # ----------------------------- PDF loader (Fix A) -----------------------------
46
  def _find_poppler_path():
47
  # Return a folder containing pdfinfo/pdftoppm if not on PATH
48
  if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
@@ -52,14 +54,33 @@ def _find_poppler_path():
52
  return p
53
  return None
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
56
  name = (name or "").lower()
57
  if name.endswith(".pdf"):
58
- poppler_path = _find_poppler_path()
59
- if poppler_path:
60
- return convert_from_bytes(file_bytes, dpi=300, poppler_path=poppler_path)
61
- else:
62
- return convert_from_bytes(file_bytes, dpi=300)
 
63
  return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
64
 
65
  def preprocess_for_detection(img: Image.Image) -> Image.Image:
 
1
  # app.py
2
  # Invoice Extraction — Donut (public HF model) + Tesseract tables
3
+ # Robust PDF handling:
4
+ # 1) Try pdf2image with Poppler path detection (Fix A)
5
+ # 2) If Poppler is missing, auto-fallback to PyMuPDF (no Poppler required)
6
 
7
  import os, io, re, json, shutil
8
  from typing import List
 
44
  det_lang = st.sidebar.text_input("Tesseract language(s) — detection only", value="eng")
45
  show_boxes = st.sidebar.checkbox("Show word boxes (debug)", value=False)
46
 
47
+ # ----------------------------- PDF loader (Fix A + fallback) -----------------------------
48
  def _find_poppler_path():
49
  # Return a folder containing pdfinfo/pdftoppm if not on PATH
50
  if shutil.which("pdfinfo") and shutil.which("pdftoppm"):
 
54
  return p
55
  return None
56
 
57
def _pages_via_pdf2image(file_bytes: bytes) -> List[Image.Image]:
    """Rasterise a PDF at 300 DPI through ``convert_from_bytes``.

    The folder returned by ``_find_poppler_path()`` is passed along as
    ``poppler_path`` only when it is truthy; otherwise the converter is
    called without that argument.

    Args:
        file_bytes: Raw PDF content.

    Returns:
        Whatever ``convert_from_bytes`` returns (annotated as a list of
        page images).
    """
    extra_kwargs = {}
    located = _find_poppler_path()
    if located:
        extra_kwargs["poppler_path"] = located
    return convert_from_bytes(file_bytes, dpi=300, **extra_kwargs)
63
+
64
def _pages_via_pymupdf(file_bytes: bytes, dpi: int = 300) -> List[Image.Image]:
    """Rasterise a PDF with PyMuPDF (no Poppler required).

    Pages are rendered at ``dpi`` so the result has the same resolution as
    the Poppler route, which calls ``convert_from_bytes(..., dpi=300)``.
    The document is always closed, even if rendering a page fails.

    Args:
        file_bytes: Raw PDF content.
        dpi: Target resolution in dots per inch. Defaults to 300.

    Returns:
        One RGB image per page, in document order.

    Raises:
        ImportError: If PyMuPDF (``fitz``) is not installed.
    """
    import fitz  # PyMuPDF; imported lazily so it is only needed for the fallback

    # PDF user space is 72 units per inch, so this zoom yields `dpi` pixels/inch.
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        pages = []
        for page in doc:
            # alpha=False guarantees 3 bytes per pixel, matching the "RGB" mode below.
            pix = page.get_pixmap(matrix=matrix, alpha=False)
            pages.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return pages
    finally:
        # Release the document's native resources.
        doc.close()
74
+
75
  def load_pages(file_bytes: bytes, name: str) -> List[Image.Image]:
76
  name = (name or "").lower()
77
  if name.endswith(".pdf"):
78
+ # Try Poppler route first
79
+ try:
80
+ return _pages_via_pdf2image(file_bytes)
81
+ except Exception:
82
+ # Fallback: PyMuPDF (no Poppler required)
83
+ return _pages_via_pymupdf(file_bytes)
84
  return [Image.open(io.BytesIO(file_bytes)).convert("RGB")]
85
 
86
  def preprocess_for_detection(img: Image.Image) -> Image.Image: