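"""Batch-extract text and tables from every PDF in this script's folder using docling.

Each PDF is handled in its own subprocess: the StandardPdfPipeline with the pypdfium2
backend (OCR disabled, table structure recognition enabled) converts the document, and
the result is written to "<pdf name>_extracted.txt" next to the source file.
"""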
import json
import logging
import os
import sys
from pathlib import Path
from collections import defaultdict
from multiprocessing import get_context
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
_log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
def extract_clean_table_data(table):
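    """Turn a docling table item (from the dict export) into a header-keyed structure.

    The first row of the reconstructed cell matrix is used as the column headers; each
    following row becomes a dict mapping header -> cell text. Text of merged cells is
    only placed at the cell's start position. Returns None if the table has no cells.
    """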
    cells = table.get("data", {}).get("table_cells", [])
    if not cells:
        return None
    max_row = max(cell["end_row_offset_idx"] for cell in cells)
    max_col = max(cell["end_col_offset_idx"] for cell in cells)
    table_matrix = [["" for _ in range(max_col)] for _ in range(max_row)]
    for cell in cells:
        row = cell["start_row_offset_idx"]
        col = cell["start_col_offset_idx"]
        table_matrix[row][col] = cell.get("text", "").strip()
    column_headers = table_matrix[0]
    data_rows = table_matrix[1:]
    structured_rows = []
    for row in data_rows:
        row_data = {
            column_headers[i]: row[i] for i in range(len(column_headers)) if column_headers[i]
        }
        structured_rows.append(row_data)
    return {
        "num_rows": len(data_rows),
        "num_columns": len(column_headers),
        "columns": column_headers,
        "data": structured_rows,
    }
def process_single_pdf(pdf_path: Path, accelerator_options: AcceleratorOptions):
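    """Convert a single PDF with docling and write its text and tables, grouped by
    page, to a "<pdf name>_extracted.txt" file in the same directory."""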
logging.info(f"Verarbeite: {pdf_path.name}")
output_dir = pdf_path.parent
pipeline_options = PdfPipelineOptions()
pipeline_options.accelerator_options = accelerator_options
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfPipeline,
backend=PyPdfiumDocumentBackend,
pipeline_options=pipeline_options,
)
}
)
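    # Convert the document and group its text items and tables by page, using the
    # provenance ("prov") entries to determine the page number of each item.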
    doc = converter.convert(pdf_path).document
    doc_dict = doc.export_to_dict()
    page_texts = defaultdict(list)
    page_tables = defaultdict(list)
    for text_item in doc_dict.get("texts", []):
        if "text" in text_item and "prov" in text_item:
            for prov in text_item["prov"]:
                page = prov.get("page_no")
                if page is not None:
                    page_texts[page].append(text_item["text"])
    for table_item in doc_dict.get("tables", []):
        prov = table_item.get("prov", [])
        if not prov:
            continue
        page = prov[0].get("page_no")
        clean_table = extract_clean_table_data(table_item)
        if clean_table:
            page_tables[page].append(clean_table)
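    # Write one plain-text file next to the PDF: per page, first the text items,
    # then every table serialized as JSON.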
    output_txt_path = output_dir / f"{pdf_path.stem}_extracted.txt"
    with open(output_txt_path, "w", encoding="utf-8") as f:
        for page_no in sorted(set(page_texts.keys()).union(page_tables.keys())):
            f.write(f"=== Page {page_no} ===\n\n")
            texts = page_texts.get(page_no, [])
            if texts:
                f.write("\n")
                f.write("\n".join(texts))
                f.write("\n\n")
            tables = page_tables.get(page_no, [])
            if tables:
                f.write("Tables:\n")
                for i, table in enumerate(tables, 1):
                    table_entry = {
                        "table_index": i,
                        **table,
                    }
                    f.write(json.dumps(table_entry, ensure_ascii=False, indent=1))
                    f.write("\n\n")
    logging.info(f"Done: {pdf_path.name} → {output_txt_path.name}")
def main():
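    """Find all PDFs next to this script and process them in parallel subprocesses,
    sizing the pool by a manually set VRAM budget (roughly 1.3 GB per worker)."""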
    base_dir = Path(__file__).resolve().parent
    pdf_files = list(base_dir.glob("*.pdf"))
    if not pdf_files:
        print("No PDF files found in the current folder.")
        return
    print(f"{len(pdf_files)} PDF files found. Starting processing.")
    # Manually set VRAM budget in GB
    vram_gb = 16  # your GPU's dedicated VRAM
    # Number of parallel processes based on VRAM
    max_subprocesses = int(vram_gb / 1.3)
    print(f"Maximum number of parallel subprocesses: {max_subprocesses}")
    accelerator_options = AcceleratorOptions(num_threads=1, device=AcceleratorDevice.AUTO)
    ctx = get_context("spawn")
    # Distribute the PDFs across processes – one whole PDF per subprocess
    with ctx.Pool(processes=min(max_subprocesses, len(pdf_files))) as pool:
        pool.starmap(process_single_pdf, [(pdf_path, accelerator_options) for pdf_path in pdf_files])
    # Passing a string to sys.exit() prints it to stderr and exits with status 1.
    sys.exit(">>> STOP <<<")
if __name__ == "__main__":
    main()