# --- Repository page header captured with the file (not part of the program) ---
# mineru2 / runpod_handler.py
# marcosremar2's picture
# Add RunPod serverless configuration with GitHub integration
# 4112422
# raw
# history blame
# 3.5 kB
import runpod
import tempfile
import os
import sys
import json
import base64
from pathlib import Path
from loguru import logger
# Add current directory to path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# Import MinerU converter
from pdf_converter_mineru import PdfConverter
# Module-level converter instance shared across jobs. It starts as None and is
# only populated when initialize_converter() runs for the first time.
CONVERTER = None
def initialize_converter():
    """Create the shared MinerU ``PdfConverter`` if it does not exist yet.

    The instance is stored in the module-level ``CONVERTER`` global, so
    repeated calls after the first one return immediately without doing
    any work.

    The model directory is read from the ``MINERU_MODEL_PATH`` environment
    variable (default ``/app/models``). The device is ``"cuda"`` when
    ``/dev/nvidia0`` exists and ``"cpu"`` otherwise.
    """
    global CONVERTER

    # Guard clause: nothing to do once the converter has been built.
    if CONVERTER is not None:
        return

    logger.info("Initializing MinerU converter...")

    models_root = os.environ.get('MINERU_MODEL_PATH', '/app/models')
    gpu_present = os.path.exists('/dev/nvidia0')

    CONVERTER = PdfConverter({
        "model_dir": models_root,
        "output_dir": "/tmp/mineru_output",
        "device": "cuda" if gpu_present else "cpu",
        "parse_method": "auto",
        "debug": False,
    })

    logger.info("MinerU converter initialized successfully")
def _read_converted_text(output_dir):
    """Return the text of the converted document found under *output_dir*.

    Markdown files (``**/*.md``) are preferred; if there are none, text files
    matching ``**/txt/*.txt`` are used. When neither exists a placeholder
    heading is returned.
    """
    root = Path(output_dir)
    for pattern in ("**/*.md", "**/txt/*.txt"):
        # Sort so the chosen file does not depend on directory listing order.
        matches = sorted(root.glob(pattern))
        if matches:
            return matches[0].read_text(encoding="utf-8")
    return "# Conversion completed but no markdown found"


def _remove_file_quietly(path):
    """Delete *path*; log a warning instead of raising if removal fails."""
    try:
        os.unlink(path)
    except OSError as cleanup_error:
        logger.warning(f"Could not remove temporary file {path}: {cleanup_error}")


def handler(job):
    """
    RunPod serverless handler for PDF to Markdown conversion

    Args:
        job: Job dictionary. ``job["input"]`` is expected to be a dict with:
            ``pdf_base64`` (required) - the PDF file encoded as base64;
            ``filename`` (optional) - name echoed back in the response,
            defaults to ``"document.pdf"``.

    Returns:
        On success, a dict with ``markdown``, ``filename``,
        ``status == "success"`` and ``pages`` (a rough count obtained by
        splitting the text on ``\\n---\\n``).
        On failure, a dict with ``error`` and ``status == "failed"``
        (plus ``filename`` when the failure happened during conversion).
        Exceptions are never propagated to the caller.
    """
    pdf_path = None
    try:
        # Validate the request before the (expensive) converter is built, so
        # malformed jobs fail fast without loading any models.
        job_input = job.get("input") or {}
        pdf_base64 = job_input.get("pdf_base64")
        filename = job_input.get("filename", "document.pdf")

        if not pdf_base64:
            return {"error": "No PDF data provided", "status": "failed"}

        # Decode base64 PDF
        pdf_data = base64.b64decode(pdf_base64)

        # Initialize converter on first run
        initialize_converter()

        # Save to a temporary file. delete=False keeps it on disk after the
        # `with` block closes it; the `finally` clause below removes it.
        with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp_file:
            pdf_path = tmp_file.name
            tmp_file.write(pdf_data)

        logger.info(f"Processing PDF: {filename} ({len(pdf_data)} bytes)")

        # Convert PDF to Markdown using MinerU
        try:
            output_dir = CONVERTER.convert_single_pdf(pdf_path)
            markdown_content = _read_converted_text(output_dir)
        except Exception as conv_error:
            # logger.exception records the traceback along with the message.
            logger.exception(f"Conversion error: {conv_error}")
            return {
                "error": f"Conversion failed: {conv_error}",
                "filename": filename,
                "status": "failed",
            }

        return {
            "markdown": markdown_content,
            "filename": filename,
            "status": "success",
            "pages": len(markdown_content.split('\n---\n')),  # Rough page count
        }
    except Exception as e:
        logger.exception(f"Handler error: {e}")
        return {
            "error": str(e),
            "status": "failed",
        }
    finally:
        # Runs on every exit path, so the temp PDF is not leaked when the
        # conversion (or anything after the file was created) fails.
        if pdf_path is not None:
            _remove_file_quietly(pdf_path)
# RunPod serverless entrypoint: passes `handler` to runpod.serverless.start.
# This is a top-level statement with no `__main__` guard, so it executes
# whenever this module is run or imported.
runpod.serverless.start({"handler": handler})