Spaces:
Sleeping
Sleeping
| import os | |
| from sentence_transformers import SentenceTransformer | |
| from database import init_db, check_if_indexed, delete_database_and_index, get_db_connection | |
| from create_index import create_initial_index as build_secure_index | |
| from search import search as secure_search | |
| from ingest_document import ingest_pdf | |
# Checkpoint name handed to SentenceTransformer when a KnowledgeBase is
# constructed. A CLIP model is used because it can handle both text and images.
MODEL_NAME = "clip-ViT-B-32"
class KnowledgeBase:
    """Facade over the database, indexing, ingestion and search helpers.

    Constructing an instance loads the embedding model named by
    ``MODEL_NAME``, calls ``init_db()``, and, when ``check_if_indexed()``
    reports that no index exists, builds one from the bundled default
    documents.
    """

    def __init__(self):
        """Load the model, initialise the database and ensure an index exists."""
        self.model = SentenceTransformer(MODEL_NAME)

        # The database must be initialised before the index check runs.
        init_db()

        if check_if_indexed():
            return

        # No index yet: seed it from the default knowledge base files.
        print("Local knowledge base not found. Building initial knowledge base...")
        self._build_initial_knowledge_base()

    def _build_initial_knowledge_base(self):
        """Read the default documents next to this module and index them.

        Files are looked up in the ``knowledge_base_data`` directory that sits
        beside this source file. A missing file is reported with a warning and
        skipped. If none of the files could be read, a message is printed and
        nothing is indexed; otherwise the collected ``{filename: text}``
        mapping is passed to ``build_secure_index``.
        """
        data_dir = os.path.join(os.path.dirname(__file__), "knowledge_base_data")
        default_files = (
            "healthy_maize_remedy.txt",
            "maize_phosphorus_deficiency_remedy.txt",
            "comic_relief.txt",
        )

        loaded = {}
        for name in default_files:
            path = os.path.join(data_dir, name)
            try:
                with open(path, 'r', encoding='utf-8') as handle:
                    loaded[name] = handle.read()
            except FileNotFoundError:
                print(f"Warning: Knowledge base file not found, skipping: {path}")

        if not loaded:
            print("No initial knowledge base documents found to index.")
            return

        build_secure_index(loaded)

    def create_initial_index(self, documents_dict):
        """Index ``documents_dict`` by delegating to ``build_secure_index``."""
        build_secure_index(documents_dict)

    def rebuild_from_default_files(self):
        """Re-run the default-document indexing step."""
        self._build_initial_knowledge_base()

    def ingest_pdf(self, file_path, file_name):
        """Delegate to the module-level ``ingest_pdf`` helper.

        Inside this method the bare name ``ingest_pdf`` resolves to the
        imported module-level function, not to this method.
        """
        ingest_pdf(file_path, file_name)

    def search(self, query, k=1):
        """Return whatever ``secure_search(query, k)`` returns."""
        return secure_search(query, k)
def get_retriever(k=1):
    """Build a KnowledgeBase and wrap it in a minimal retriever object.

    Args:
        k: Number of results requested from ``KnowledgeBase.search`` for each
            query. Defaults to 1, which matches the behaviour before this
            parameter existed. Because non-text results are filtered out
            afterwards, a larger ``k`` is the only way to get more than one
            document, or any document when the best hit is not text.

    Returns:
        An object exposing ``get_relevant_documents(query)``.
    """
    kb = KnowledgeBase()

    class Retriever:
        """Adapter exposing KnowledgeBase search as ``get_relevant_documents``."""

        def __init__(self, kb, k):
            self.kb = kb
            self.k = k

        def get_relevant_documents(self, query):
            """Return langchain ``Document`` objects for the text hits of ``query``.

            Each search result is read as a mapping with ``'type'`` and
            ``'content'`` keys; only results whose type is ``'text'`` are
            converted, so the returned list may be shorter than ``k`` or empty.
            NOTE(review): the result schema comes from the external ``search``
            module and is not visible here — confirm those keys are always
            present.
            """
            results = self.kb.search(query, k=self.k)

            # Imported lazily so langchain is only needed once a query is made.
            from langchain.schema import Document

            # Only text content is wrapped in a Document; other result types
            # (e.g. images) are dropped.
            return [
                Document(page_content=r['content'])
                for r in results
                if r['type'] == 'text'
            ]

    return Retriever(kb, k)