# EBD_Fest / batch_index_pdfs.py
# minhvtt's picture
# Upload 20 files
# cb93402 verified
"""
Batch script to index PDF files into the RAG knowledge base.

Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]
"""
import os
import sys
from pathlib import Path
from pymongo import MongoClient
from embedding_service import JinaClipEmbeddingService
from qdrant_service import QdrantVectorService
from pdf_parser import PDFIndexer
def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
):
    """
    Index all PDF files in a directory into the RAG knowledge base.

    Every ``*.pdf`` file directly inside ``pdf_dir`` is passed to
    ``PDFIndexer.index_pdf`` under the document ID ``pdf_<file stem>``.
    Progress and a final summary are printed to stdout. A failure while
    indexing one file is reported and counted, and the batch continues with
    the next file.

    Args:
        pdf_dir: Directory containing PDF files
        category: Category stored in each document's metadata
            (default: "user_guide")
        force: Force reindex even if a document with the same ID is already
            present in the ``documents`` collection (default: False)

    Raises:
        RuntimeError: If the ``MONGODB_URI`` environment variable is not set.
    """
    print("=" * 60)
    print("PDF Batch Indexer")
    print("=" * 60)

    # Initialize services (same as main.py)
    print("\n[1/5] Initializing services...")

    # The connection string carries credentials, so it is taken from the
    # environment only; no default is embedded in the source. It is checked
    # before any service is constructed so a misconfiguration fails fast.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "set it to the MongoDB connection string before running the indexer"
        )

    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")
    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    # MongoDB
    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        # Initialize PDF indexer
        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("✓ Services initialized")

        _index_pdf_files(pdf_dir, category, force, pdf_indexer, documents_collection)
    finally:
        # Release the MongoDB connection on every exit path, including errors.
        mongo_client.close()


def _index_pdf_files(pdf_dir, category, force, pdf_indexer, documents_collection):
    """Scan ``pdf_dir``, index each PDF found, and print the batch summary."""
    # Find all PDF files
    print(f"\n[2/5] Scanning directory: {pdf_dir}")
    # Sorted so the processing order is reproducible between runs.
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        print("✗ No PDF files found in directory")
        return
    print(f"✓ Found {len(pdf_files)} PDF file(s)")

    # Index each PDF
    print("\n[3/5] Indexing PDFs...")
    indexed_count = 0
    skipped_count = 0
    error_count = 0
    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"\n--- [{i}/{len(pdf_files)}] Processing: {pdf_path.name} ---")

        # Generate document ID
        doc_id = f"pdf_{pdf_path.stem}"

        # Check if already indexed
        if not force and documents_collection.find_one({"document_id": doc_id}):
            print("⊘ Already indexed (use --force to reindex)")
            skipped_count += 1
            continue

        # Best-effort batch: one bad file must not stop the remaining ones,
        # so any per-file failure is reported and counted instead of raised.
        try:
            metadata = {
                'title': pdf_path.stem.replace('_', ' ').title(),
                'category': category,
                'source_file': str(pdf_path)
            }
            result = pdf_indexer.index_pdf(
                pdf_path=str(pdf_path),
                document_id=doc_id,
                document_metadata=metadata
            )
            print(f"✓ Indexed: {result['chunks_indexed']} chunks")
            indexed_count += 1
        except Exception as e:
            print(f"✗ Error: {e}")
            error_count += 1

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total PDFs found: {len(pdf_files)}")
    print(f"✓ Successfully indexed: {indexed_count}")
    print(f"⊘ Skipped (already indexed): {skipped_count}")
    print(f"✗ Errors: {error_count}")
    if indexed_count > 0:
        print("\n✓ Knowledge base updated successfully!")
        print("You can now chat with your chatbot about the content in these PDFs.")
def main():
    """Main entry point: parse the command line and run the batch indexer.

    ``sys.argv[1]`` must be an existing directory. The remaining arguments
    may be ``--category=<category>`` and/or ``--force``; anything else is
    reported on stderr and ignored. Prints a message and exits with status 1
    when the directory argument is missing or is not a directory.
    """
    if len(sys.argv) < 2:
        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]")
        print("\nExample:")
        print(" python batch_index_pdfs.py ./docs/guides")
        print(" python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]
    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    # Parse options
    category = "user_guide"
    force = False
    for arg in sys.argv[2:]:
        if arg.startswith("--category="):
            # Split on the first "=" only, so a value that itself contains
            # "=" is kept whole instead of being cut at the second "=".
            category = arg.partition("=")[2]
        elif arg == "--force":
            force = True
        else:
            # Surface typos such as "--forse" instead of silently dropping them.
            print(f"Warning: ignoring unrecognized option: {arg}", file=sys.stderr)

    # Index PDFs
    index_pdf_directory(pdf_dir, category=category, force=force)
# Run the indexer only when this file is executed as a script, so importing
# the module does not trigger a batch run.
if __name__ == "__main__":
    main()