Spaces:

minhvtt
/

EBD_Fest

Running

App Files Files Community

EBD_Fest / batch_index_pdfs.py

minhvtt

Upload 20 files

cb93402 verified 3 days ago

raw

history blame contribute delete

4.52 kB

	"""
	Batch script to index PDF files into the RAG knowledge base.

	Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]
	"""

	import os
	import sys
	from pathlib import Path
	from pymongo import MongoClient
	from embedding_service import JinaClipEmbeddingService
	from qdrant_service import QdrantVectorService
	from pdf_parser import PDFIndexer


	def index_pdf_directory(
	    pdf_dir: str,
	    category: str = "user_guide",
	    force: bool = False
	) -> None:
	    """
	    Index all PDF files in a directory into the RAG knowledge base.

	    Every ``*.pdf`` directly inside ``pdf_dir`` (no recursion) is processed in
	    sorted order under the document ID ``pdf_<file stem>``. Unless ``force``
	    is set, files whose ID already exists in the MongoDB ``documents``
	    collection are skipped. A failure on one file is reported and counted,
	    and the remaining files are still processed. Progress and a final
	    summary are printed to stdout.

	    Configuration read from the environment:
	        MONGODB_URI: MongoDB connection string (required).
	        MONGODB_DB_NAME: Database name (default: "chatbot_rag").
	        COLLECTION_NAME: Collection name passed to QdrantVectorService
	            (default: "event_social_media").

	    Args:
	        pdf_dir: Directory containing PDF files
	        category: Category stored in each PDF's metadata (default: "user_guide")
	        force: Force reindex even if already indexed (default: False)

	    Raises:
	        RuntimeError: If MONGODB_URI is unset or empty.
	    """
	    # The connection string carries credentials, so it must come from the
	    # environment and never from a default committed to source control.
	    # Checked first so a missing value fails before any service is built.
	    mongodb_uri = os.getenv("MONGODB_URI")
	    if not mongodb_uri:
	        raise RuntimeError(
	            "MONGODB_URI environment variable is not set; "
	            "export the MongoDB connection string before running the indexer"
	        )

	    print("=" * 60)
	    print("PDF Batch Indexer")
	    print("=" * 60)

	    # Initialize services (same as main.py)
	    print("\n[1/5] Initializing services...")
	    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

	    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
	    qdrant_service = QdrantVectorService(
	        collection_name=collection_name,
	        vector_size=embedding_service.get_embedding_dimension()
	    )

	    # MongoDB
	    mongo_client = MongoClient(mongodb_uri)
	    try:
	        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
	        documents_collection = db["documents"]

	        # Initialize PDF indexer
	        pdf_indexer = PDFIndexer(
	            embedding_service=embedding_service,
	            qdrant_service=qdrant_service,
	            documents_collection=documents_collection
	        )
	        print("✓ Services initialized")

	        # Find all PDF files. glob() yields entries in arbitrary order, so
	        # sort to make the processing order and the [i/N] labels repeatable.
	        print(f"\n[2/5] Scanning directory: {pdf_dir}")
	        pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))

	        if not pdf_files:
	            print("✗ No PDF files found in directory")
	            return

	        print(f"✓ Found {len(pdf_files)} PDF file(s)")

	        # Index each PDF
	        print("\n[3/5] Indexing PDFs...")
	        indexed_count, skipped_count, error_count = _index_pdf_files(
	            pdf_files, pdf_indexer, documents_collection, category, force
	        )

	        # Summary
	        print("\n" + "=" * 60)
	        print("SUMMARY")
	        print("=" * 60)
	        print(f"Total PDFs found: {len(pdf_files)}")
	        print(f"✓ Successfully indexed: {indexed_count}")
	        print(f"⊘ Skipped (already indexed): {skipped_count}")
	        print(f"✗ Errors: {error_count}")

	        if indexed_count > 0:
	            print("\n✓ Knowledge base updated successfully!")
	            print("You can now chat with your chatbot about the content in these PDFs.")
	    finally:
	        # Release the MongoDB connection on every exit path, including the
	        # early return above and any unexpected exception.
	        mongo_client.close()


	def _index_pdf_files(pdf_files, pdf_indexer, documents_collection, category, force):
	    """Index each PDF in turn and return (indexed, skipped, errors) counts."""
	    indexed_count = 0
	    skipped_count = 0
	    error_count = 0
	    total = len(pdf_files)

	    for i, pdf_path in enumerate(pdf_files, 1):
	        print(f"\n--- [{i}/{total}] Processing: {pdf_path.name} ---")

	        # Generate document ID
	        doc_id = f"pdf_{pdf_path.stem}"

	        # Check if already indexed
	        if not force and documents_collection.find_one({"document_id": doc_id}):
	            print("⊘ Already indexed (use --force to reindex)")
	            skipped_count += 1
	            continue

	        metadata = {
	            'title': pdf_path.stem.replace('_', ' ').title(),
	            'category': category,
	            'source_file': str(pdf_path)
	        }

	        # Batch boundary: one bad PDF must not abort the rest of the run,
	        # so any failure is reported, counted and the loop moves on.
	        try:
	            result = pdf_indexer.index_pdf(
	                pdf_path=str(pdf_path),
	                document_id=doc_id,
	                document_metadata=metadata
	            )
	            print(f"✓ Indexed: {result['chunks_indexed']} chunks")
	            indexed_count += 1
	        except Exception as e:
	            print(f"✗ Error: {e}")
	            error_count += 1

	    return indexed_count, skipped_count, error_count


	def main():
	    """Main entry point: parse the command line and run the batch indexer.

	    Expected arguments:
	        <pdf_directory> [--category=<category>] [--force]

	    Prints usage and exits with status 1 when no directory is given; prints
	    an error and exits with status 1 when the path is not an existing
	    directory. Unrecognized options are reported on stderr and ignored.
	    """
	    if len(sys.argv) < 2:
	        print("Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]")
	        print("\nExample:")
	        print(" python batch_index_pdfs.py ./docs/guides")
	        print(" python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
	        sys.exit(1)

	    pdf_dir = sys.argv[1]

	    if not os.path.isdir(pdf_dir):
	        print(f"Error: Directory not found: {pdf_dir}")
	        sys.exit(1)

	    # Parse options
	    category = "user_guide"
	    force = False

	    for arg in sys.argv[2:]:
	        if arg.startswith("--category="):
	            # Split on the first "=" only, so a value that itself contains
	            # "=" is kept whole instead of being truncated.
	            category = arg.partition("=")[2]
	        elif arg == "--force":
	            force = True
	        else:
	            # Surface typos such as "--Force" instead of dropping them silently.
	            print(f"Warning: ignoring unrecognized option: {arg}", file=sys.stderr)

	    # Index PDFs
	    index_pdf_directory(pdf_dir, category=category, force=force)


	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()