""" Enhanced Multimodal PDF Parser for PDFs with Text + Image URLs Extracts text, detects image URLs, and links them together """ import pypdfium2 as pdfium from typing import List, Dict, Optional, Tuple import re from dataclasses import dataclass, field @dataclass class MultimodalChunk: """Represents a chunk with text and associated images""" text: str page_number: int chunk_index: int image_urls: List[str] = field(default_factory=list) metadata: Dict = field(default_factory=dict) class MultimodalPDFParser: """ Enhanced PDF Parser that extracts text and image URLs Perfect for user guides with screenshots and visual instructions """ def __init__( self, chunk_size: int = 500, chunk_overlap: int = 50, min_chunk_size: int = 50, extract_images: bool = True ): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.min_chunk_size = min_chunk_size self.extract_images = extract_images # URL patterns self.url_patterns = [ # Standard URLs r'https?://[^\s<>"{}|\\^`\[\]]+', # Markdown images: ![alt](url) r'!\[.*?\]\((https?://[^\s)]+)\)', # HTML images: r']+src=["\']([^"\']+)["\']', # Direct image extensions r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:jpg|jpeg|png|gif|bmp|svg|webp)', ] def extract_image_urls(self, text: str) -> List[str]: """ Extract all image URLs from text Args: text: Text content Returns: List of image URLs found """ urls = [] for pattern in self.url_patterns: matches = re.findall(pattern, text, re.IGNORECASE) urls.extend(matches) # Remove duplicates while preserving order seen = set() unique_urls = [] for url in urls: if url not in seen: seen.add(url) unique_urls.append(url) return unique_urls def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, Tuple[str, List[str]]]: """ Extract text and image URLs from PDF Args: pdf_path: Path to PDF file Returns: Dictionary mapping page number to (text, image_urls) tuple """ pdf_pages = {} try: pdf = pdfium.PdfDocument(pdf_path) for page_num in range(len(pdf)): page = pdf[page_num] textpage = page.get_textpage() text = textpage.get_text_range() # Clean text text = self._clean_text(text) # Extract image URLs if enabled image_urls = [] if self.extract_images: image_urls = self.extract_image_urls(text) pdf_pages[page_num + 1] = (text, image_urls) return pdf_pages except Exception as e: raise Exception(f"Error reading PDF: {str(e)}") def _clean_text(self, text: str) -> str: """Clean extracted text""" # Remove excessive whitespace text = re.sub(r'\s+', ' ', text) # Remove special characters text = text.replace('\x00', '') return text.strip() def chunk_text_with_images( self, text: str, image_urls: List[str], page_number: int ) -> List[MultimodalChunk]: """ Split text into chunks and associate images with relevant chunks Args: text: Text to chunk image_urls: Image URLs from the page page_number: Page number Returns: List of MultimodalChunk objects """ # Split into words words = text.split() if len(words) < self.min_chunk_size: if len(words) > 0: return [MultimodalChunk( text=text, page_number=page_number, chunk_index=0, image_urls=image_urls, # All images go to single chunk metadata={'page': page_number, 'chunk': 0} )] return [] chunks = [] chunk_index = 0 start = 0 # Calculate how to distribute images across chunks images_per_chunk = len(image_urls) // max(1, len(words) // self.chunk_size) if image_urls else 0 image_index = 0 while start < len(words): end = min(start + self.chunk_size, len(words)) chunk_words = words[start:end] chunk_text = ' '.join(chunk_words) # Assign images to this chunk chunk_images = [] if image_urls: # Simple 
strategy: distribute images evenly # or detect if URL appears in chunk text for url in image_urls: if url in chunk_text: chunk_images.append(url) # If no URLs found in text, distribute evenly if not chunk_images and image_index < len(image_urls): # Assign remaining images to chunks num_imgs = min(images_per_chunk + 1, len(image_urls) - image_index) chunk_images = image_urls[image_index:image_index + num_imgs] image_index += num_imgs chunks.append(MultimodalChunk( text=chunk_text, page_number=page_number, chunk_index=chunk_index, image_urls=chunk_images, metadata={ 'page': page_number, 'chunk': chunk_index, 'start_word': start, 'end_word': end, 'has_images': len(chunk_images) > 0, 'num_images': len(chunk_images) } )) chunk_index += 1 start = end - self.chunk_overlap if start >= len(words) - self.min_chunk_size: break return chunks def parse_pdf( self, pdf_path: str, document_metadata: Optional[Dict] = None ) -> List[MultimodalChunk]: """ Parse PDF into multimodal chunks Args: pdf_path: Path to PDF file document_metadata: Additional metadata Returns: List of MultimodalChunk objects """ pages_data = self.extract_text_from_pdf(pdf_path) all_chunks = [] for page_num, (text, image_urls) in pages_data.items(): chunks = self.chunk_text_with_images(text, image_urls, page_num) # Add document metadata if document_metadata: for chunk in chunks: chunk.metadata.update(document_metadata) all_chunks.extend(chunks) return all_chunks def parse_pdf_bytes( self, pdf_bytes: bytes, document_metadata: Optional[Dict] = None ) -> List[MultimodalChunk]: """Parse PDF from bytes""" import tempfile import os with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: tmp.write(pdf_bytes) tmp_path = tmp.name try: chunks = self.parse_pdf(tmp_path, document_metadata) return chunks finally: if os.path.exists(tmp_path): os.unlink(tmp_path) class MultimodalPDFIndexer: """Index multimodal PDF chunks into RAG system""" def __init__(self, embedding_service, qdrant_service, documents_collection): self.embedding_service = embedding_service self.qdrant_service = qdrant_service self.documents_collection = documents_collection self.parser = MultimodalPDFParser() def index_pdf( self, pdf_path: str, document_id: str, document_metadata: Optional[Dict] = None ) -> Dict: """Index PDF with image URLs""" chunks = self.parser.parse_pdf(pdf_path, document_metadata) indexed_count = 0 chunk_ids = [] total_images = 0 for chunk in chunks: chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}" # Generate embedding (text-based) embedding = self.embedding_service.encode_text(chunk.text) # Prepare metadata with image URLs metadata = { 'text': chunk.text, 'document_id': document_id, 'page': chunk.page_number, 'chunk_index': chunk.chunk_index, 'source': 'pdf', 'has_images': len(chunk.image_urls) > 0, 'image_urls': chunk.image_urls, # Store image URLs! 
'num_images': len(chunk.image_urls), **chunk.metadata } # Index to Qdrant self.qdrant_service.index_data( doc_id=chunk_id, embedding=embedding, metadata=metadata ) chunk_ids.append(chunk_id) indexed_count += 1 total_images += len(chunk.image_urls) # Save document info doc_info = { 'document_id': document_id, 'type': 'multimodal_pdf', 'file_path': pdf_path, 'num_chunks': indexed_count, 'total_images': total_images, 'chunk_ids': chunk_ids, 'metadata': document_metadata or {} } self.documents_collection.insert_one(doc_info) return { 'success': True, 'document_id': document_id, 'chunks_indexed': indexed_count, 'images_found': total_images, 'chunk_ids': chunk_ids[:5] } def index_pdf_bytes( self, pdf_bytes: bytes, document_id: str, filename: str, document_metadata: Optional[Dict] = None ) -> Dict: """Index PDF from bytes""" metadata = document_metadata or {} metadata['filename'] = filename chunks = self.parser.parse_pdf_bytes(pdf_bytes, metadata) indexed_count = 0 chunk_ids = [] total_images = 0 for chunk in chunks: chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}" embedding = self.embedding_service.encode_text(chunk.text) metadata = { 'text': chunk.text, 'document_id': document_id, 'page': chunk.page_number, 'chunk_index': chunk.chunk_index, 'source': 'multimodal_pdf', 'filename': filename, 'has_images': len(chunk.image_urls) > 0, 'image_urls': chunk.image_urls, 'num_images': len(chunk.image_urls), **chunk.metadata } self.qdrant_service.index_data( doc_id=chunk_id, embedding=embedding, metadata=metadata ) chunk_ids.append(chunk_id) indexed_count += 1 total_images += len(chunk.image_urls) doc_info = { 'document_id': document_id, 'type': 'multimodal_pdf', 'filename': filename, 'num_chunks': indexed_count, 'total_images': total_images, 'chunk_ids': chunk_ids, 'metadata': metadata } self.documents_collection.insert_one(doc_info) return { 'success': True, 'document_id': document_id, 'filename': filename, 'chunks_indexed': indexed_count, 'images_found': total_images, 'chunk_ids': chunk_ids[:5] }
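

# ---------------------------------------------------------------------------
# Minimal usage sketch. The parser runs standalone; the indexer needs real
# embedding/Qdrant/Mongo services, so it is only outlined in comments.
# "user_guide.pdf" is a placeholder path, not a file shipped with this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    parser = MultimodalPDFParser(chunk_size=300, chunk_overlap=30)

    # Parse a local PDF into text chunks with any image URLs linked to them
    chunks = parser.parse_pdf("user_guide.pdf", document_metadata={"product": "demo"})

    for chunk in chunks[:3]:
        print(f"page {chunk.page_number}, chunk {chunk.chunk_index}: "
              f"{len(chunk.image_urls)} image URL(s)")
        for url in chunk.image_urls:
            print(f"  -> {url}")

    # Indexing would look like this, given concrete service objects:
    # indexer = MultimodalPDFIndexer(embedding_service, qdrant_service,
    #                                documents_collection)
    # result = indexer.index_pdf("user_guide.pdf", document_id="guide-001")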