""" Configuration settings for the Corpus Collection Engine """ import os from pathlib import Path from typing import List, Dict # Project paths PROJECT_ROOT = Path(__file__).parent.parent DATA_DIR = PROJECT_ROOT / "data" MODELS_DIR = PROJECT_ROOT / "models" CACHE_DIR = PROJECT_ROOT / ".cache" # Supported Indic languages SUPPORTED_LANGUAGES: Dict[str, str] = { 'hi': 'Hindi', 'bn': 'Bengali', 'ta': 'Tamil', 'te': 'Telugu', 'ml': 'Malayalam', 'kn': 'Kannada', 'gu': 'Gujarati', 'mr': 'Marathi', 'pa': 'Punjabi', 'or': 'Odia', 'en': 'English' } # Activity types ACTIVITY_TYPES: List[str] = [ 'meme', 'recipe', 'folklore', 'landmark' ] # AI model configurations AI_CONFIG = { 'text_model': 'sarvamai/sarvam-1', 'vision_model': 'microsoft/DiT-base', 'max_tokens': 512, 'temperature': 0.7 } # Database configuration DATABASE_CONFIG = { 'local_db': 'sqlite:///corpus_collection.db', 'remote_db': os.getenv('DATABASE_URL', ''), 'batch_size': 100 } # PWA and offline configuration PWA_CONFIG = { 'cache_version': 'v1.0.0', 'offline_timeout': 5000, # milliseconds 'sync_interval': 300000, # 5 minutes in milliseconds 'max_offline_storage': 50 * 1024 * 1024 # 50MB } # Content validation settings VALIDATION_CONFIG = { 'min_text_length': 10, 'max_text_length': 5000, 'max_image_size': 10 * 1024 * 1024, # 10MB 'allowed_image_types': ['jpg', 'jpeg', 'png', 'webp'] } # Create necessary directories for directory in [DATA_DIR, MODELS_DIR, CACHE_DIR]: directory.mkdir(exist_ok=True)