"""Configuration settings for the ingestion pipeline""" # Default ingestion settings DEFAULT_CHUNK_SIZE = 1000 DEFAULT_OVERLAP = 200 RANDOM_SEED = 42 # Supported file formats SUPPORTED_FORMATS = {".txt", ".md", ".markdown"} # Corpus directory CORPUS_DIRECTORY = "synthetic_policies" # Vector Database Settings VECTOR_DB_PERSIST_PATH = "data/chroma_db" COLLECTION_NAME = "policy_documents" EMBEDDING_DIMENSION = 768 # paraphrase-albert-small-v2 SIMILARITY_METRIC = "cosine" # ChromaDB Configuration for Memory Optimization CHROMA_SETTINGS = { "anonymized_telemetry": False, "allow_reset": False, "is_persistent": True, } # Embedding Model Settings EMBEDDING_MODEL_NAME = "paraphrase-albert-small-v2" EMBEDDING_BATCH_SIZE = 8 # Reduced for memory optimization on free tier EMBEDDING_DEVICE = "cpu" # Use CPU for free tier compatibility # Search Settings DEFAULT_TOP_K = 5 MAX_TOP_K = 20 MIN_SIMILARITY_THRESHOLD = 0.3