Tobias Pasquale
style: Fix code formatting and linting issues for CI/CD compliance
7793bb6
raw
history blame
755 Bytes
"""Configuration settings for the ingestion pipeline"""
# Ingestion defaults. The unit of the chunk size and overlap is decided by the
# chunking code that reads them (not visible in this module).
DEFAULT_CHUNK_SIZE: int = 1_000
DEFAULT_OVERLAP: int = 200
# Fixed seed value -- presumably for reproducible runs; confirm with consumers.
RANDOM_SEED: int = 42
# File extensions accepted as input; each entry includes the leading dot.
SUPPORTED_FORMATS = {
    ".txt",
    ".md",
    ".markdown",
}
# Name of the directory that holds the corpus.
CORPUS_DIRECTORY: str = "synthetic_policies"
# Vector database settings
VECTOR_DB_PERSIST_PATH: str = "data/chroma_db"
COLLECTION_NAME: str = "policy_documents"
# Vector width for sentence-transformers/all-MiniLM-L6-v2.
EMBEDDING_DIMENSION: int = 384
SIMILARITY_METRIC: str = "cosine"
# Embedding model settings
EMBEDDING_MODEL_NAME: str = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_BATCH_SIZE: int = 32
# CPU is used for free tier compatibility.
EMBEDDING_DEVICE: str = "cpu"
# Search settings: default and maximum result counts, plus the minimum
# similarity threshold. How these are enforced is up to the search code
# (not visible in this module).
DEFAULT_TOP_K: int = 5
MAX_TOP_K: int = 20
MIN_SIMILARITY_THRESHOLD: float = 0.3