"""Load a directory of text files into a ChromaDB collection and query it."""

import os

import chromadb
from sentence_transformers import SentenceTransformer

# --- Constants ---
MODEL_NAME = "all-MiniLM-L6-v2"
COLLECTION_NAME = "aura_mind_knowledge"
KNOWLEDGE_BASE_DIR = "knowledge_base_data"

# --- Initialize ChromaDB and Model ---
# Created once at import time and shared by both functions below.
client = chromadb.PersistentClient(path="chroma_db")
model = SentenceTransformer(MODEL_NAME)
collection = client.get_or_create_collection(name=COLLECTION_NAME)


def embed_and_store_documents():
    """Embed the knowledge-base ``.txt`` files and store them in ChromaDB.

    Does nothing (apart from printing a notice) when the collection already
    holds at least one entry. Otherwise every regular ``.txt`` file directly
    inside ``KNOWLEDGE_BASE_DIR`` is read as UTF-8, encoded with ``model`` and
    added to ``collection`` with its file name as the document id.

    Raises:
        FileNotFoundError: If ``KNOWLEDGE_BASE_DIR`` does not exist.
    """
    if collection.count() > 0:
        print("✅ Knowledge base is already loaded into ChromaDB.")
        return

    print("Embedding and storing documents in ChromaDB...")
    documents = []
    ids = []
    # Sorted so the insertion order does not depend on the file system.
    for filename in sorted(os.listdir(KNOWLEDGE_BASE_DIR)):
        if not filename.endswith(".txt"):
            continue
        path = os.path.join(KNOWLEDGE_BASE_DIR, filename)
        # A directory that happens to be named "*.txt" cannot be opened.
        if not os.path.isfile(path):
            continue
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(path, "r", encoding="utf-8") as f:
            documents.append(f.read())
        ids.append(filename)

    if not documents:
        return

    embeddings = model.encode(documents).tolist()
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=ids,
    )
    print(f"✅ Successfully stored {len(documents)} documents in ChromaDB.")


def search_documents(query: str, n_results: int = 1) -> list:
    """Return the stored documents that best match ``query``.

    Args:
        query: The search query. An empty query yields an empty list.
        n_results: The maximum number of documents to return. It is capped
            at the number of documents currently in the collection.

    Returns:
        A list of document texts as returned by ``collection.query`` for the
        single query, or an empty list when the query is empty or the
        collection holds no documents.
    """
    if not query:
        return []

    stored = collection.count()
    if stored == 0:
        # Nothing to search; skip the embedding work and the query call.
        return []
    # Depending on the ChromaDB release, asking for more results than there
    # are stored documents either raises or only logs a warning, so cap it.
    n_results = min(n_results, stored)

    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
    )
    return results['documents'][0] if results['documents'] else []