Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

Seth McKnight

Comprehensive memory optimizations and embedding service updates (#76)

f75da29 about 2 months ago

938 Bytes

	"""Configuration settings for the ingestion pipeline"""

	# Default ingestion settings
	# Fallback values for splitting documents into chunks.
	# NOTE(review): the unit (characters vs. tokens) is decided by the chunker,
	# which is not visible in this module -- confirm before tuning.
	DEFAULT_CHUNK_SIZE = 1000
	# Overlap between neighbouring chunks; presumably in the same unit as
	# DEFAULT_CHUNK_SIZE and expected to stay smaller than it -- verify in the chunker.
	DEFAULT_OVERLAP = 200
	# Fixed seed; presumably used to make ingestion reproducible -- the consumer
	# is not visible here.
	RANDOM_SEED = 42

	# Supported file formats
	# Extensions are stored lower-case and include the leading dot.
	# NOTE(review): presumably compared against file suffixes; confirm the caller
	# normalises case before the membership test.
	SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

	# Corpus directory
	# Relative path to the source documents.
	# NOTE(review): what it is resolved against (working directory or project
	# root) is decided by the caller -- confirm.
	CORPUS_DIRECTORY = "synthetic_policies"

	# Vector Database Settings
	# Relative on-disk location of the vector store and the name of the
	# collection that holds the document embeddings.
	VECTOR_DB_PERSIST_PATH = "data/chroma_db"
	COLLECTION_NAME = "policy_documents"
	# Must agree with the vector size produced by EMBEDDING_MODEL_NAME; update
	# both together when switching models.
	EMBEDDING_DIMENSION = 768 # paraphrase-albert-small-v2
	# NOTE(review): how this metric name is handed to the vector store is not
	# visible here -- confirm at the collection-creation call site.
	SIMILARITY_METRIC = "cosine"

	# ChromaDB Configuration for Memory Optimization
	# Telemetry and reset are switched off; persistence is switched on.
	# NOTE(review): presumably unpacked into ChromaDB's client settings -- confirm
	# the keys match the installed ChromaDB version.
	CHROMA_SETTINGS = {
	"anonymized_telemetry": False,
	"allow_reset": False,
	"is_persistent": True,
	}

	# Embedding Model Settings
	# Model identifier; EMBEDDING_DIMENSION is annotated with this same model
	# name, so the two must be changed together.
	# NOTE(review): presumably loaded by the embedding service -- the loader is
	# not visible in this module.
	EMBEDDING_MODEL_NAME = "paraphrase-albert-small-v2"
	# Number of texts embedded per batch.
	EMBEDDING_BATCH_SIZE = 8 # Reduced for memory optimization on free tier
	# Device string handed to the embedding backend.
	EMBEDDING_DEVICE = "cpu" # Use CPU for free tier compatibility

	# Search Settings
	# Number of results returned when the caller does not specify one.
	DEFAULT_TOP_K = 5
	# Upper bound for a caller-supplied result count; DEFAULT_TOP_K is within it.
	# NOTE(review): enforcement happens in the search code, not visible here.
	MAX_TOP_K = 20
	# Minimum similarity score for a result.
	# NOTE(review): presumably results scoring below this are dropped; the score
	# scale depends on SIMILARITY_METRIC -- confirm in the search code.
	MIN_SIMILARITY_THRESHOLD = 0.3