|
|
""" |
|
|
Batch script to index PDF files into RAG knowledge base |
|
|
Usage: python batch_index_pdfs.py <pdf_directory> [options] |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
from pathlib import Path |
|
|
from pymongo import MongoClient |
|
|
from embedding_service import JinaClipEmbeddingService |
|
|
from qdrant_service import QdrantVectorService |
|
|
from pdf_parser import PDFIndexer |
|
|
|
|
|
|
|
|
def index_pdf_directory(
    pdf_dir: str,
    category: str = "user_guide",
    force: bool = False
):
    """
    Index all PDF files in a directory into the RAG knowledge base.

    Every ``*.pdf`` file directly inside ``pdf_dir`` is indexed under the
    document id ``pdf_<file stem>``. Unless ``force`` is set, a file whose
    document id already exists in the MongoDB ``documents`` collection is
    skipped. A failure on one file is reported and counted, and the remaining
    files are still processed. Progress and a final summary are printed to
    stdout.

    Configuration is read from environment variables:
        MONGODB_URI: MongoDB connection string (required).
        MONGODB_DB_NAME: MongoDB database name (default: "chatbot_rag").
        COLLECTION_NAME: Collection name passed to the Qdrant service
            (default: "event_social_media").

    Args:
        pdf_dir: Directory containing PDF files
        category: Category for the PDFs (default: "user_guide")
        force: Force reindex even if already indexed (default: False)

    Raises:
        RuntimeError: If the MONGODB_URI environment variable is not set.
    """
    # NOTE(review): the "β" prefix on the status messages below looks like a
    # mis-encoded status glyph; it is kept as-is here -- confirm the intended
    # characters against the file's original encoding.
    print("="*60)
    print("PDF Batch Indexer")
    print("="*60)

    # A connection string carries credentials, so it must come from the
    # environment instead of a default baked into the source. Checked before
    # any service is constructed so a missing setting fails fast.
    mongodb_uri = os.getenv("MONGODB_URI")
    if not mongodb_uri:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "export the MongoDB connection string before running the indexer"
        )

    print("\n[1/5] Initializing services...")
    embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2")

    collection_name = os.getenv("COLLECTION_NAME", "event_social_media")
    qdrant_service = QdrantVectorService(
        collection_name=collection_name,
        vector_size=embedding_service.get_embedding_dimension()
    )

    mongo_client = MongoClient(mongodb_uri)
    try:
        db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")]
        documents_collection = db["documents"]

        pdf_indexer = PDFIndexer(
            embedding_service=embedding_service,
            qdrant_service=qdrant_service,
            documents_collection=documents_collection
        )
        print("β Services initialized")

        print(f"\n[2/5] Scanning directory: {pdf_dir}")
        # Sorted so the processing order (and the log) is reproducible;
        # glob() yields entries in arbitrary filesystem order.
        pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))

        if not pdf_files:
            print("β No PDF files found in directory")
            return

        print(f"β Found {len(pdf_files)} PDF file(s)")

        print("\n[3/5] Indexing PDFs...")
        indexed_count = 0
        skipped_count = 0
        error_count = 0

        for i, pdf_path in enumerate(pdf_files, 1):
            print(f"\n--- [{i}/{len(pdf_files)}] Processing: {pdf_path.name} ---")

            # The document id is derived from the file name only, so two
            # files with the same stem map to the same document.
            doc_id = f"pdf_{pdf_path.stem}"

            if not force:
                existing = documents_collection.find_one({"document_id": doc_id})
                if existing:
                    print("β Already indexed (use --force to reindex)")
                    skipped_count += 1
                    continue

            try:
                metadata = {
                    'title': pdf_path.stem.replace('_', ' ').title(),
                    'category': category,
                    'source_file': str(pdf_path)
                }

                result = pdf_indexer.index_pdf(
                    pdf_path=str(pdf_path),
                    document_id=doc_id,
                    document_metadata=metadata
                )

                print(f"β Indexed: {result['chunks_indexed']} chunks")
                indexed_count += 1

            except Exception as e:
                # Per-file boundary: one bad PDF must not abort the batch.
                print(f"β Error: {e}")
                error_count += 1

        print("\n" + "="*60)
        print("SUMMARY")
        print("="*60)
        print(f"Total PDFs found: {len(pdf_files)}")
        print(f"β Successfully indexed: {indexed_count}")
        print(f"β Skipped (already indexed): {skipped_count}")
        print(f"β Errors: {error_count}")

        if indexed_count > 0:
            print("\nβ Knowledge base updated successfully!")
            print("You can now chat with your chatbot about the content in these PDFs.")
    finally:
        # Release the MongoDB connection on every exit path, including the
        # early return when the directory holds no PDFs.
        mongo_client.close()
|
|
|
|
|
|
|
|
def main():
    """Main entry point: parse the command line and run the batch indexer.

    Expected invocation:
        python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]

    Exits with status 1 when the directory argument is missing, when it is
    not an existing directory, or when an unrecognized option is supplied.
    """
    usage = "Usage: python batch_index_pdfs.py <pdf_directory> [--category=<category>] [--force]"

    if len(sys.argv) < 2:
        print(usage)
        print("\nExample:")
        print("  python batch_index_pdfs.py ./docs/guides")
        print("  python batch_index_pdfs.py ./docs/guides --category=user_guide --force")
        sys.exit(1)

    pdf_dir = sys.argv[1]

    if not os.path.isdir(pdf_dir):
        print(f"Error: Directory not found: {pdf_dir}")
        sys.exit(1)

    category = "user_guide"
    force = False

    for arg in sys.argv[2:]:
        if arg.startswith("--category="):
            # Split on the first "=" only, so a value that itself contains
            # "=" is kept whole instead of being truncated.
            category = arg.partition("=")[2]
        elif arg == "--force":
            force = True
        else:
            # A misspelt option would otherwise be ignored silently and the
            # PDFs indexed with defaults the user did not ask for.
            print(f"Error: Unknown option: {arg}")
            print(usage)
            sys.exit(1)

    index_pdf_directory(pdf_dir, category=category, force=force)
|
|
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, so that importing the module does not start an indexing run.
if __name__ == "__main__":
    main()
|
|
|