File size: 938 Bytes
ffa0f3d
 
 
 
 
 
 
 
7793bb6
ffa0f3d
 
7793bb6
afecdc5
 
 
 
f88b1d2
afecdc5
 
f75da29
 
 
 
 
 
 
afecdc5
f88b1d2
32e4125
afecdc5
 
 
 
 
7793bb6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
"""Configuration settings for the ingestion pipeline"""

# Ingestion defaults: seed value, chunk size, and overlap.
# NOTE(review): units of size/overlap (characters vs. tokens) are defined
# by the chunking code, not here -- confirm against the consumer.
RANDOM_SEED: int = 42
DEFAULT_CHUNK_SIZE: int = 1_000
DEFAULT_OVERLAP: int = 200

# Name of the corpus directory.
CORPUS_DIRECTORY: str = "synthetic_policies"

# File suffixes (leading dot included) treated as supported formats.
SUPPORTED_FORMATS: set = {
    ".markdown",
    ".md",
    ".txt",
}

# Vector database: collection identity and on-disk location.
COLLECTION_NAME: str = "policy_documents"
VECTOR_DB_PERSIST_PATH: str = "data/chroma_db"

# Vector database: embedding geometry.
# 768 is the vector size noted for the paraphrase-albert-small-v2 model.
SIMILARITY_METRIC: str = "cosine"
EMBEDDING_DIMENSION: int = 768

# ChromaDB configuration flags (described in this file as a memory
# optimization). Telemetry and reset are switched off; persistence is on.
# NOTE(review): exact effect of each flag is defined by ChromaDB -- confirm
# against the client version in use.
CHROMA_SETTINGS: dict = dict(
    anonymized_telemetry=False,
    allow_reset=False,
    is_persistent=True,
)

# Embedding model selection and runtime knobs.
EMBEDDING_MODEL_NAME: str = "paraphrase-albert-small-v2"
# CPU is used for free-tier compatibility.
EMBEDDING_DEVICE: str = "cpu"
# Batch size kept small to limit memory use on the free tier.
EMBEDDING_BATCH_SIZE: int = 8

# Search: lower bound on similarity, then the default and maximum top-k.
MIN_SIMILARITY_THRESHOLD: float = 0.3
DEFAULT_TOP_K: int = 5
MAX_TOP_K: int = 20