Spaces:
Sleeping
Sleeping
File size: 938 Bytes
ffa0f3d 7793bb6 ffa0f3d 7793bb6 afecdc5 f88b1d2 afecdc5 f75da29 afecdc5 f88b1d2 32e4125 afecdc5 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
"""Configuration settings for the ingestion pipeline"""
# Default ingestion settings
DEFAULT_CHUNK_SIZE = 1000
DEFAULT_OVERLAP = 200
RANDOM_SEED = 42
# Supported file formats
SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}
# Corpus directory
CORPUS_DIRECTORY = "synthetic_policies"
# Vector Database Settings
VECTOR_DB_PERSIST_PATH = "data/chroma_db"
COLLECTION_NAME = "policy_documents"
EMBEDDING_DIMENSION = 768 # paraphrase-albert-small-v2
SIMILARITY_METRIC = "cosine"
# ChromaDB Configuration for Memory Optimization
CHROMA_SETTINGS = {
"anonymized_telemetry": False,
"allow_reset": False,
"is_persistent": True,
}
# Embedding Model Settings
EMBEDDING_MODEL_NAME = "paraphrase-albert-small-v2"
EMBEDDING_BATCH_SIZE = 8 # Reduced for memory optimization on free tier
EMBEDDING_DEVICE = "cpu" # Use CPU for free tier compatibility
# Search Settings
DEFAULT_TOP_K = 5
MAX_TOP_K = 20
MIN_SIMILARITY_THRESHOLD = 0.3
|