""" Query Enhancement Module - Improve semantic search with query expansion and synonyms. This module helps bridge the gap between natural user language and document terminology by expanding queries with relevant synonyms and domain-specific terms. """ import re from typing import List class QueryExpander: """ Expands user queries with relevant synonyms and domain-specific terminology to improve semantic search results in corporate policy documents. """ def __init__(self): """Initialize the query expander with predefined synonym mappings.""" # Additional HR-specific synonyms self.hr_synonyms = { # Time off related - enhanced with policy document terms "personal time": [ "PTO", "paid time off", "time off", "vacation", "personal days", "leave", "accrual", "days off", ], "vacation": [ "PTO", "paid time off", "time off", "personal time", "vacation days", "holiday", "accrual", ], "sick leave": [ "sick time", "medical leave", "illness", "health days", "PTO", ], "time off": [ "PTO", "paid time off", "vacation", "leave", "personal time", "days off", "accrual", ], "PTO": [ "paid time off", "vacation", "personal time", "time off", "accrual", "days off", ], "leave": [ "time off", "absence", "PTO", "paid time off", "vacation", "accrual", ], "days off": [ "PTO", "paid time off", "vacation", "time off", "personal time", "leave", ], "accrual": [ "earn", "accumulate", "build up", "PTO", "vacation", "time off", ], "earn": ["accrue", "accumulate", "get", "receive", "build up"], "annual": ["yearly", "per year", "each year", "annually"], "allowance": [ "allocation", "entitlement", "amount", "accrual", "benefit", "limit", ], "allocation": ["allowance", "entitlement", "amount", "limit", "budget"], # Benefits related - enhanced with employee terminology "benefits": [ "perks", "compensation", "package", "coverage", "health insurance", "401k", "retirement", ], "insurance": [ "coverage", "health plan", "medical", "benefits", "healthcare", "dental", "vision", ], "retirement": [ "401k", "pension", "savings", "investment", "matching", "contribution", ], "healthcare": [ "medical", "health insurance", "coverage", "benefits", "health plan", "dental", "vision", ], "401k": ["retirement", "savings", "matching", "contribution", "pension"], "health plan": [ "healthcare", "medical", "insurance", "coverage", "benefits", ], "dental": ["dental coverage", "dental insurance", "benefits", "healthcare"], "vision": ["vision coverage", "eye care", "benefits", "healthcare"], "gym": ["fitness", "wellness", "health", "membership", "benefits"], "tuition": [ "education", "training", "reimbursement", "learning", "development", ], # Work arrangements "remote work": ["work from home", "telecommuting", "WFH", "telework"], "work from home": ["remote work", "telecommuting", "WFH", "telework"], "telecommuting": ["remote work", "work from home", "WFH", "telework"], "WFH": ["work from home", "remote work", "telecommuting", "telework"], "flexible schedule": ["flex time", "flexible hours", "work schedule"], # Performance and development "performance review": ["evaluation", "appraisal", "assessment", "feedback"], "training": ["development", "education", "learning", "courses"], "promotion": ["advancement", "career growth", "progression", "raise"], # HR processes "onboarding": ["orientation", "new hire", "getting started", "setup"], "offboarding": ["termination", "leaving", "exit", "departure"], "policy": ["procedure", "guidelines", "rules", "standards"], # Workplace issues and safety "harassment": [ "discrimination", "bullying", "hostile", "inappropriate behavior", ], "complaint": ["report", "grievance", "issue", "concern", "problem"], "discrimination": ["harassment", "bias", "unfair treatment", "prejudice"], "emergency": ["crisis", "urgent", "fire", "evacuation", "safety"], "safety": ["security", "hazard", "emergency", "protection", "guidelines"], # Expenses and travel "expenses": ["reimbursement", "costs", "spending", "business expenses"], "reimbursement": ["expenses", "refund", "repayment", "reimbursable"], "travel": ["business trip", "trip", "hotel", "flight", "transportation"], "meal allowance": ["food", "dining", "per diem", "meal budget"], # Technology and security "password": ["security", "login", "authentication", "access"], "VPN": ["remote access", "network", "connection", "security"], "security": ["password", "access", "protection", "privacy", "incident"], "device": ["computer", "laptop", "phone", "equipment", "technology"], "WiFi": ["network", "internet", "connection", "wireless"], } # Common question patterns and their expansions self.question_patterns = { r"how much.*time.*earn|accrue": [ "PTO accrual", "vacation days", "time off allocation", ], r"how many.*days.*get|receive": [ "PTO accrual", "vacation days", "annual leave", ], r"what.*my.*allowance": [ "PTO accrual", "vacation allowance", "time off allocation", ], r"time off.*balance": ["PTO balance", "vacation balance", "accrued time"], r"sick.*time": ["sick leave", "medical leave", "PTO for illness"], } def expand_query(self, query: str) -> str: """ Expand a user query with relevant synonyms and terminology. Args: query: Original user query Returns: Expanded query with additional relevant terms """ expanded_terms = set() original_words = self._extract_key_terms(query.lower()) # Add original query expanded_terms.add(query) # Pattern-based expansion for pattern, expansions in self.question_patterns.items(): if re.search(pattern, query.lower()): expanded_terms.update(expansions) # Synonym-based expansion for word in original_words: if word in self.hr_synonyms: expanded_terms.update(self.hr_synonyms[word]) # Multi-word phrase matching query_lower = query.lower() for phrase, synonyms in self.hr_synonyms.items(): if phrase in query_lower: expanded_terms.update(synonyms) # Create expanded query if len(expanded_terms) > 1: # Join with the original query for semantic search expanded_query = f"{query} " + " ".join(expanded_terms - {query}) return expanded_query[:500] # Limit length to prevent overly long queries return query def _extract_key_terms(self, text: str) -> List[str]: """Extract key terms from text, removing common stop words.""" stop_words = { "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "how", "what", "when", "where", "why", "is", "are", "do", "does", "can", "could", "should", "would", "will", "i", "me", "my", } # Simple word extraction (could be enhanced with NLP libraries) words = re.findall(r"\b\w+\b", text.lower()) return [word for word in words if word not in stop_words and len(word) > 2] def get_domain_suggestions(self, query: str) -> List[str]: """ Get domain-specific suggestions for improving the query. Args: query: User's original query Returns: List of suggested alternative phrasings """ suggestions = [] query_lower = query.lower() # Specific suggestions based on common user patterns if "personal time" in query_lower: suggestions.extend( [ "How much PTO do I accrue each year?", "What is my paid time off allocation?", "How many vacation days do I get annually?", ] ) if "time off" in query_lower and "how much" in query_lower: suggestions.extend( [ "What is my PTO accrual rate?", "How many paid time off days do I earn per year?", ] ) if "work from home" in query_lower or "remote" in query_lower: suggestions.extend( [ "What is the remote work policy?", "Can I work from home?", "What are the telecommuting guidelines?", ] ) return suggestions[:3] # Limit to top 3 suggestions