"""
Enhanced Multimodal PDF Parser for PDFs with Text + Image URLs
Extracts text, detects image URLs, and links them together
"""
import pypdfium2 as pdfium
from typing import List, Dict, Optional, Tuple
import re
from dataclasses import dataclass, field
@dataclass
class MultimodalChunk:
"""Represents a chunk with text and associated images"""
text: str
page_number: int
chunk_index: int
image_urls: List[str] = field(default_factory=list)
metadata: Dict = field(default_factory=dict)
class MultimodalPDFParser:
"""
Enhanced PDF Parser that extracts text and image URLs
Perfect for user guides with screenshots and visual instructions
"""
def __init__(
self,
chunk_size: int = 500,
chunk_overlap: int = 50,
min_chunk_size: int = 50,
extract_images: bool = True
):
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.extract_images = extract_images
        # URL patterns. Note: re.findall returns the capture group when a
        # pattern defines one, and the full match otherwise, so every pattern
        # below yields bare URLs.
        self.url_patterns = [
            # Standard URLs
            r'https?://[^\s<>"{}|\\^`\[\]]+',
            # Markdown images: ![alt](url)
            r'!\[.*?\]\((https?://[^\s)]+)\)',
            # HTML images: <img src="url">
            r'<img[^>]+src=["\']([^"\']+)["\']',
            # Direct image extensions
            r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:jpg|jpeg|png|gif|bmp|svg|webp)',
        ]
def extract_image_urls(self, text: str) -> List[str]:
"""
Extract all image URLs from text
Args:
text: Text content
Returns:
List of image URLs found
"""
urls = []
for pattern in self.url_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
urls.extend(matches)
# Remove duplicates while preserving order
seen = set()
unique_urls = []
for url in urls:
if url not in seen:
seen.add(url)
unique_urls.append(url)
return unique_urls
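    # Illustrative example (made-up URL): the bare-URL pattern and the
    # image-extension pattern both match this link; order-preserving
    # deduplication collapses them to one entry.
    #
    #   MultimodalPDFParser().extract_image_urls(
    #       "See the screenshot at https://example.com/ui.png for details."
    #   )
    #   # -> ['https://example.com/ui.png']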
def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, Tuple[str, List[str]]]:
"""
Extract text and image URLs from PDF
Args:
pdf_path: Path to PDF file
Returns:
Dictionary mapping page number to (text, image_urls) tuple
"""
pdf_pages = {}
try:
pdf = pdfium.PdfDocument(pdf_path)
for page_num in range(len(pdf)):
page = pdf[page_num]
textpage = page.get_textpage()
text = textpage.get_text_range()
# Clean text
text = self._clean_text(text)
# Extract image URLs if enabled
image_urls = []
if self.extract_images:
image_urls = self.extract_image_urls(text)
pdf_pages[page_num + 1] = (text, image_urls)
            pdf.close()
            return pdf_pages
        except Exception as e:
            raise RuntimeError(f"Error reading PDF: {e}") from e
def _clean_text(self, text: str) -> str:
"""Clean extracted text"""
# Remove excessive whitespace
text = re.sub(r'\s+', ' ', text)
        # Remove null bytes left over from extraction
        text = text.replace('\x00', '')
return text.strip()
def chunk_text_with_images(
self,
text: str,
image_urls: List[str],
page_number: int
) -> List[MultimodalChunk]:
"""
Split text into chunks and associate images with relevant chunks
Args:
text: Text to chunk
image_urls: Image URLs from the page
page_number: Page number
Returns:
List of MultimodalChunk objects
"""
# Split into words
words = text.split()
if len(words) < self.min_chunk_size:
if len(words) > 0:
return [MultimodalChunk(
text=text,
page_number=page_number,
chunk_index=0,
image_urls=image_urls, # All images go to single chunk
metadata={'page': page_number, 'chunk': 0}
)]
return []
chunks = []
chunk_index = 0
start = 0
        # Fallback distribution applies only to URLs that never appear in the
        # extracted text. URLs pulled from this same text are whitespace-free
        # tokens, so they always match at least one chunk verbatim.
        unmatched_urls = [url for url in image_urls if url not in text]
        images_per_chunk = (
            len(unmatched_urls) // max(1, len(words) // self.chunk_size)
            if unmatched_urls else 0
        )
        image_index = 0
while start < len(words):
end = min(start + self.chunk_size, len(words))
chunk_words = words[start:end]
chunk_text = ' '.join(chunk_words)
            # Prefer URLs that appear verbatim in this chunk's text
            chunk_images = [url for url in image_urls if url in chunk_text]
            # Otherwise spread the never-matching URLs evenly across chunks,
            # so no URL is assigned to more than one chunk by the fallback
            if not chunk_images and image_index < len(unmatched_urls):
                num_imgs = min(images_per_chunk + 1, len(unmatched_urls) - image_index)
                chunk_images = unmatched_urls[image_index:image_index + num_imgs]
                image_index += num_imgs
chunks.append(MultimodalChunk(
text=chunk_text,
page_number=page_number,
chunk_index=chunk_index,
image_urls=chunk_images,
metadata={
'page': page_number,
'chunk': chunk_index,
'start_word': start,
'end_word': end,
'has_images': len(chunk_images) > 0,
'num_images': len(chunk_images)
}
))
            chunk_index += 1
            if end == len(words):
                break
            # Step back so consecutive chunks share `chunk_overlap` words
            start = end - self.chunk_overlap
return chunks
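    # Usage sketch for the chunker (illustrative; the word count and URL are
    # assumed inputs, not taken from a real PDF):
    #
    #   parser = MultimodalPDFParser(chunk_size=500, chunk_overlap=50)
    #   page_text = " ".join(["word"] * 600) + " https://example.com/step1.png"
    #   chunks = parser.chunk_text_with_images(
    #       page_text, ["https://example.com/step1.png"], page_number=1
    #   )
    #   # -> 2 overlapping chunks; the URL attaches to the second chunk,
    #   #    whose text contains it verbatim.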
def parse_pdf(
self,
pdf_path: str,
document_metadata: Optional[Dict] = None
) -> List[MultimodalChunk]:
"""
Parse PDF into multimodal chunks
Args:
pdf_path: Path to PDF file
document_metadata: Additional metadata
Returns:
List of MultimodalChunk objects
"""
pages_data = self.extract_text_from_pdf(pdf_path)
all_chunks = []
for page_num, (text, image_urls) in pages_data.items():
chunks = self.chunk_text_with_images(text, image_urls, page_num)
# Add document metadata
if document_metadata:
for chunk in chunks:
chunk.metadata.update(document_metadata)
all_chunks.extend(chunks)
return all_chunks
def parse_pdf_bytes(
self,
pdf_bytes: bytes,
document_metadata: Optional[Dict] = None
) -> List[MultimodalChunk]:
"""Parse PDF from bytes"""
import tempfile
import os
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
tmp.write(pdf_bytes)
tmp_path = tmp.name
try:
chunks = self.parse_pdf(tmp_path, document_metadata)
return chunks
finally:
if os.path.exists(tmp_path):
os.unlink(tmp_path)
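# Usage sketch for the parser (illustrative; "guide.pdf" is a placeholder path):
#
#   parser = MultimodalPDFParser(chunk_size=500, chunk_overlap=50)
#   chunks = parser.parse_pdf("guide.pdf", document_metadata={"product": "Acme"})
#   for chunk in chunks:
#       print(chunk.page_number, chunk.chunk_index, len(chunk.image_urls))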
class MultimodalPDFIndexer:
"""Index multimodal PDF chunks into RAG system"""
def __init__(self, embedding_service, qdrant_service, documents_collection):
self.embedding_service = embedding_service
self.qdrant_service = qdrant_service
self.documents_collection = documents_collection
self.parser = MultimodalPDFParser()
def index_pdf(
self,
pdf_path: str,
document_id: str,
document_metadata: Optional[Dict] = None
) -> Dict:
"""Index PDF with image URLs"""
chunks = self.parser.parse_pdf(pdf_path, document_metadata)
indexed_count = 0
chunk_ids = []
total_images = 0
for chunk in chunks:
chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"
# Generate embedding (text-based)
embedding = self.embedding_service.encode_text(chunk.text)
# Prepare metadata with image URLs
metadata = {
'text': chunk.text,
'document_id': document_id,
'page': chunk.page_number,
'chunk_index': chunk.chunk_index,
'source': 'pdf',
'has_images': len(chunk.image_urls) > 0,
'image_urls': chunk.image_urls, # Store image URLs!
'num_images': len(chunk.image_urls),
**chunk.metadata
}
# Index to Qdrant
self.qdrant_service.index_data(
doc_id=chunk_id,
embedding=embedding,
metadata=metadata
)
chunk_ids.append(chunk_id)
indexed_count += 1
total_images += len(chunk.image_urls)
# Save document info
doc_info = {
'document_id': document_id,
'type': 'multimodal_pdf',
'file_path': pdf_path,
'num_chunks': indexed_count,
'total_images': total_images,
'chunk_ids': chunk_ids,
'metadata': document_metadata or {}
}
self.documents_collection.insert_one(doc_info)
return {
'success': True,
'document_id': document_id,
'chunks_indexed': indexed_count,
'images_found': total_images,
            'chunk_ids': chunk_ids[:5]  # first few IDs, as a sample
}
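    # Example return shape for index_pdf (illustrative values only):
    #
    #   {'success': True, 'document_id': 'doc-1', 'chunks_indexed': 12,
    #    'images_found': 7, 'chunk_ids': ['doc-1_p1_c0', 'doc-1_p1_c1', ...]}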
def index_pdf_bytes(
self,
pdf_bytes: bytes,
document_id: str,
filename: str,
document_metadata: Optional[Dict] = None
) -> Dict:
"""Index PDF from bytes"""
        metadata = dict(document_metadata or {})  # copy, so the caller's dict stays untouched
        metadata['filename'] = filename
chunks = self.parser.parse_pdf_bytes(pdf_bytes, metadata)
indexed_count = 0
chunk_ids = []
total_images = 0
for chunk in chunks:
chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"
embedding = self.embedding_service.encode_text(chunk.text)
            # Use a separate name so the document-level `metadata` dict (which
            # carries the filename) is not clobbered by the per-chunk payload
            chunk_payload = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'multimodal_pdf',
                'filename': filename,
                'has_images': len(chunk.image_urls) > 0,
                'image_urls': chunk.image_urls,
                'num_images': len(chunk.image_urls),
                **chunk.metadata
            }
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=chunk_payload
            )
chunk_ids.append(chunk_id)
indexed_count += 1
total_images += len(chunk.image_urls)
doc_info = {
'document_id': document_id,
'type': 'multimodal_pdf',
'filename': filename,
'num_chunks': indexed_count,
'total_images': total_images,
'chunk_ids': chunk_ids,
'metadata': metadata
}
self.documents_collection.insert_one(doc_info)
return {
'success': True,
'document_id': document_id,
'filename': filename,
'chunks_indexed': indexed_count,
'images_found': total_images,
            'chunk_ids': chunk_ids[:5]  # first few IDs, as a sample
}
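# Minimal smoke test (illustrative only). The three classes below are
# hand-rolled stand-ins for the embedding service, Qdrant wrapper, and
# Mongo-style collection this module expects, not real clients.
if __name__ == "__main__":
    import sys

    class _StubEmbedding:
        def encode_text(self, text):
            return [0.0] * 8  # placeholder vector

    class _StubQdrant:
        def index_data(self, doc_id, embedding, metadata):
            print(f"indexed {doc_id} ({metadata['num_images']} image(s))")

    class _StubCollection:
        def insert_one(self, doc):
            print(f"saved record for {doc['document_id']}")

    if len(sys.argv) > 1:
        indexer = MultimodalPDFIndexer(
            _StubEmbedding(), _StubQdrant(), _StubCollection()
        )
        print(indexer.index_pdf(sys.argv[1], document_id="demo-doc"))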