"""
Batch script to index PDF files into RAG knowledge base

Usage:
    python batch_index_pdfs.py <pdf_directory> [--category=<name>] [--force]

Environment variables:
    MONGODB_URI      MongoDB connection string (required)
    MONGODB_DB_NAME  MongoDB database name (default: "chatbot_rag")
    COLLECTION_NAME  Vector collection name (default: "event_social_media")
"""

import os
import sys
from pathlib import Path
from typing import Sequence, Tuple

from pymongo import MongoClient

from embedding_service import JinaClipEmbeddingService
from qdrant_service import QdrantVectorService
from pdf_parser import PDFIndexer


class ConfigurationError(RuntimeError):
    """Raised when a required environment setting is missing."""


def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
) -> None:
    """
    Index all PDF files in a directory

    Every ``*.pdf`` file directly inside ``pdf_dir`` (no recursion) is passed
    to ``PDFIndexer.index_pdf`` under the document ID ``pdf_<file stem>``.
    Progress and a final summary are printed to stdout. A failure on one
    file is reported and counted, and the batch continues with the next file.

    Args:
        pdf_dir: Directory containing PDF files
        category: Category for the PDFs (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        ConfigurationError: If the MONGODB_URI environment variable is unset
            or empty.
    """
    print("=" * 60)
    print("PDF Batch Indexer")
    print("=" * 60)

    # Fail fast, before any service is constructed, when the connection
    # string is missing.
    # NOTE(review): a connection string containing credentials used to be
    # hard-coded here as the fallback; that password should be rotated.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise ConfigurationError(
            "MONGODB_URI environment variable is not set"
        )

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")
    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("✓ Services initialized")

        # Find all PDF files; sorted so the processing order (and therefore
        # the [i/N] progress output) is the same on every run.
        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print("✗ No PDF files found in directory")
            return

        print(f"✓ Found {len(pdf_files)} PDF file(s)")

        # Index each PDF
        print("\n[3/5] Indexing PDFs...")
        indexed_count = 0
        skipped_count = 0
        error_count = 0

        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"\n--- [{i}/{len(pdf_files)}] Processing: {pdf_path.name} ---")

            # Generate document ID
            doc_id = f"pdf_{pdf_path.stem}"

            # Check if already indexed
            if not force:
                existing = documents_collection.find_one({"document_id": doc_id})
                if existing:
                    print("⊘ Already indexed (use --force to reindex)")
                    skipped_count += 1
                    continue

            try:
                # Index PDF
                metadata = {
                    'title': pdf_path.stem.replace('_', ' ').title(),
                    'category': category,
                    'source_file': str(pdf_path)
                }

                result = pdf_indexer.index_pdf(
                    pdf_path=str(pdf_path),
                    document_id=doc_id,
                    document_metadata=metadata
                )

                print(f"✓ Indexed: {result['chunks_indexed']} chunks")
                indexed_count += 1

            except Exception as e:
                # Deliberate best-effort: one bad PDF must not abort the batch.
                print(f"✗ Error: {e}")
                error_count += 1

        # Summary
        print("\n" + "=" * 60)
        print("SUMMARY")
        print("=" * 60)
        print(f"Total PDFs found: {len(pdf_files)}")
        print(f"✓ Successfully indexed: {indexed_count}")
        print(f"⊘ Skipped (already indexed): {skipped_count}")
        print(f"✗ Errors: {error_count}")

        if indexed_count > 0:
            print("\n✓ Knowledge base updated successfully!")
            print("You can now chat with your chatbot about the content in these PDFs.")
    finally:
        # Release the MongoDB connection on every exit path.
        mongo_client.close()


def _parse_options(args: Sequence[str]) -> Tuple[str, bool]:
    """Return ``(category, force)`` parsed from the arguments after the directory."""
    category = "user_guide"
    force = False

    for arg in args:
        if arg.startswith("--category="):
            # partition keeps any further '=' characters inside the value
            _, _, category = arg.partition("=")
        elif arg == "--force":
            force = True
        else:
            # Surface typos such as "--forse" instead of silently dropping them.
            print(f"Warning: ignoring unknown option: {arg}")

    return category, force


def main() -> None:
    """
    Main entry point

    Reads the PDF directory and options from ``sys.argv`` and runs the batch
    indexer. Exits with status 1 when the directory argument is missing, the
    directory does not exist, or required configuration is missing.
    """
    if len(sys.argv) < 2:
        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<name>] [--force]")
        print("\nExample:")
        print("  python batch_index_pdfs.py ./docs/guides")
        print("  python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]

    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category, force = _parse_options(sys.argv[2:])

    # Index PDFs
    try:
        index_pdf_directory(pdf_dir, category=category, force=force)
    except ConfigurationError as exc:
        print(f"Error: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()