import argparse import chromadb from tqdm import tqdm # Optional: For progress bar db_config = { "youtube_db": { "source_db_path": "../youtube_surfer_ai_agent/youtube_db", "source_collection_name": "yt_metadata", "destination_collection_name": "yt_metadata", }, "divya_prabandham": { "source_db_path": "../uveda_analyzer/chromadb_store", "source_collection_name": "divya_prabandham", "destination_collection_name": "divya_prabandham", }, } parser = argparse.ArgumentParser(description="My app with database parameter") parser.add_argument( "--db", type=str, required=True, choices=list(db_config.keys()), help=f"Id of the database to use. allowed_values : {', '.join(db_config.keys())}", ) args = parser.parse_args() db_id = args.db if db_id is None: raise Exception(f"No db provided!") if db_id not in db_config: raise Exception(f"db with id {db_id} not found!") # Connect to source and destination local persistent clients source_client = chromadb.PersistentClient(path=db_config[db_id]["source_db_path"]) destination_client = chromadb.PersistentClient(path="./chromadb-store") source_collection_name = db_config[db_id]["source_collection_name"] destination_collection_name = db_config[db_id]["destination_collection_name"] # Get the source collection source_collection = source_client.get_collection(source_collection_name) # Retrieve all data from the source collection source_data = source_collection.get(include=["documents", "metadatas", "embeddings"]) # Create or get the destination collection if destination_client.get_or_create_collection(destination_collection_name): print("Deleting existing collection", destination_collection_name) destination_client.delete_collection(destination_collection_name) destination_collection = destination_client.get_or_create_collection( destination_collection_name, metadata=source_collection.metadata, # Copy metadata if needed ) # Add data to the destination collection in batches BATCH_SIZE = 500 total_records = len(source_data["ids"]) print(f"Copying {total_records} records in batches of {BATCH_SIZE}...") for i in tqdm(range(0, total_records, BATCH_SIZE)): batch_ids = source_data["ids"][i : i + BATCH_SIZE] batch_docs = source_data["documents"][i : i + BATCH_SIZE] batch_metas = source_data["metadatas"][i : i + BATCH_SIZE] batch_embeds = ( source_data["embeddings"][i : i + BATCH_SIZE] if "embeddings" in source_data and source_data["embeddings"] is not None else None ) destination_collection.add( ids=batch_ids, documents=batch_docs, metadatas=batch_metas, embeddings=batch_embeds, ) print("✅ Collection copied successfully!") print("Total records in source collection = ", source_collection.count()) print("Total records in destination collection = ", destination_collection.count())