# Page-header residue captured during extraction: "Spaces: Sleeping Sleeping"
import os
from pathlib import Path

import chromadb
from sentence_transformers import SentenceTransformer
# --- Constants ---
# Name of the sentence-transformers model used to embed documents and queries.
MODEL_NAME = "all-MiniLM-L6-v2"
# Name of the ChromaDB collection that holds the embedded knowledge base.
COLLECTION_NAME = "aura_mind_knowledge"
# Directory scanned for ``.txt`` source documents.
KNOWLEDGE_BASE_DIR = "knowledge_base_data"
# On-disk location handed to the persistent ChromaDB client.
CHROMA_DB_PATH = "chroma_db"

# --- Initialize ChromaDB and Model ---
# These run at import time: the client, the embedding model and the
# collection are shared module-level objects used by the functions below.
client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
model = SentenceTransformer(MODEL_NAME)
collection = client.get_or_create_collection(name=COLLECTION_NAME)
def embed_and_store_documents() -> None:
    """
    Embed every ``.txt`` file in the knowledge base directory into ChromaDB.

    The function is a no-op when the collection already contains entries.
    Otherwise each regular file in ``KNOWLEDGE_BASE_DIR`` whose name ends in
    ``.txt`` is read as UTF-8, embedded with the module-level model, and
    added to the collection using the file name as its id.

    Raises:
        OSError: If the knowledge base directory cannot be listed or a file
            cannot be read.
        UnicodeDecodeError: If a document is not valid UTF-8.
    """
    # NOTE(review): the leading "β " in the messages below looks like a
    # mis-decoded symbol; left untouched pending confirmation.
    if collection.count() > 0:
        print("β Knowledge base is already loaded into ChromaDB.")
        return

    print("Embedding and storing documents in ChromaDB...")

    # Sort so ids and documents are produced in a stable order, and skip
    # anything that is not a regular file (e.g. a directory named "x.txt").
    text_files = sorted(
        entry
        for entry in Path(KNOWLEDGE_BASE_DIR).iterdir()
        if entry.name.endswith(".txt") and entry.is_file()
    )
    if not text_files:
        return

    ids = [entry.name for entry in text_files]
    # Explicit encoding: the platform default is locale-dependent.
    documents = [entry.read_text(encoding="utf-8") for entry in text_files]

    embeddings = model.encode(documents).tolist()
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=ids,
    )
    print(f"β Successfully stored {len(documents)} documents in ChromaDB.")
def search_documents(query: str, n_results: int = 1) -> list:
    """
    Search ChromaDB for the documents most relevant to a query.

    Args:
        query: The search query. Empty or whitespace-only queries yield no
            results.
        n_results: The maximum number of documents to return. Values below 1
            yield no results.

    Returns:
        A list of matching documents, or an empty list when there is nothing
        to search for or nothing stored.
    """
    # A blank query or a non-positive result count cannot produce any hits,
    # so avoid the embedding and database round-trip entirely.
    if not query or not query.strip() or n_results < 1:
        return []

    # Never request more hits than the collection holds.
    # NOTE(review): some ChromaDB releases are believed to reject an
    # oversized n_results instead of truncating — confirm for the pinned version.
    available = collection.count()
    if available == 0:
        return []
    limit = min(n_results, available)

    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=limit,
    )
    # One inner list per query embedding; only one query is sent.
    documents = results["documents"]
    return documents[0] if documents else []