Spaces:
Sleeping
Sleeping
| """ | |
| Query Enhancement Module - Improve semantic search with query expansion and synonyms. | |
| This module helps bridge the gap between natural user language and document terminology | |
| by expanding queries with relevant synonyms and domain-specific terms. | |
| """ | |
| import re | |
| from typing import List | |
class QueryExpander:
    """
    Expands user queries with relevant synonyms and domain-specific terminology
    to improve semantic search results in corporate policy documents.

    Two sources of expansion terms are used:

    * ``question_patterns`` - regular expressions (searched against the
      lower-cased query) mapped to terms appended when the pattern matches.
    * ``hr_synonyms`` - words/phrases mapped to related terms appended when
      the word/phrase occurs in the query.

    Both attributes are plain dictionaries created in ``__init__`` and are
    consulted on every call, so entries added after construction take effect.
    """

    # Upper bound, in characters, on the string returned by expand_query()
    # when expansion terms are appended.
    MAX_EXPANDED_LENGTH = 500

    # Words dropped by _extract_key_terms(); built once instead of per call.
    _STOP_WORDS = frozenset(
        {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "how",
            "what",
            "when",
            "where",
            "why",
            "is",
            "are",
            "do",
            "does",
            "can",
            "could",
            "should",
            "would",
            "will",
            "i",
            "me",
            "my",
        }
    )

    def __init__(self):
        """Initialize the query expander with predefined synonym mappings."""
        # Additional HR-specific synonyms
        self.hr_synonyms = {
            # Time off related - enhanced with policy document terms
            "personal time": [
                "PTO",
                "paid time off",
                "time off",
                "vacation",
                "personal days",
                "leave",
                "accrual",
                "days off",
            ],
            "vacation": [
                "PTO",
                "paid time off",
                "time off",
                "personal time",
                "vacation days",
                "holiday",
                "accrual",
            ],
            "sick leave": [
                "sick time",
                "medical leave",
                "illness",
                "health days",
                "PTO",
            ],
            "time off": [
                "PTO",
                "paid time off",
                "vacation",
                "leave",
                "personal time",
                "days off",
                "accrual",
            ],
            "PTO": [
                "paid time off",
                "vacation",
                "personal time",
                "time off",
                "accrual",
                "days off",
            ],
            "leave": [
                "time off",
                "absence",
                "PTO",
                "paid time off",
                "vacation",
                "accrual",
            ],
            "days off": [
                "PTO",
                "paid time off",
                "vacation",
                "time off",
                "personal time",
                "leave",
            ],
            "accrual": [
                "earn",
                "accumulate",
                "build up",
                "PTO",
                "vacation",
                "time off",
            ],
            "earn": ["accrue", "accumulate", "get", "receive", "build up"],
            "annual": ["yearly", "per year", "each year", "annually"],
            "allowance": [
                "allocation",
                "entitlement",
                "amount",
                "accrual",
                "benefit",
                "limit",
            ],
            "allocation": ["allowance", "entitlement", "amount", "limit", "budget"],
            # Benefits related - enhanced with employee terminology
            "benefits": [
                "perks",
                "compensation",
                "package",
                "coverage",
                "health insurance",
                "401k",
                "retirement",
            ],
            "insurance": [
                "coverage",
                "health plan",
                "medical",
                "benefits",
                "healthcare",
                "dental",
                "vision",
            ],
            "retirement": [
                "401k",
                "pension",
                "savings",
                "investment",
                "matching",
                "contribution",
            ],
            "healthcare": [
                "medical",
                "health insurance",
                "coverage",
                "benefits",
                "health plan",
                "dental",
                "vision",
            ],
            "401k": ["retirement", "savings", "matching", "contribution", "pension"],
            "health plan": [
                "healthcare",
                "medical",
                "insurance",
                "coverage",
                "benefits",
            ],
            "dental": ["dental coverage", "dental insurance", "benefits", "healthcare"],
            "vision": ["vision coverage", "eye care", "benefits", "healthcare"],
            "gym": ["fitness", "wellness", "health", "membership", "benefits"],
            "tuition": [
                "education",
                "training",
                "reimbursement",
                "learning",
                "development",
            ],
            # Work arrangements
            "remote work": ["work from home", "telecommuting", "WFH", "telework"],
            "work from home": ["remote work", "telecommuting", "WFH", "telework"],
            "telecommuting": ["remote work", "work from home", "WFH", "telework"],
            "WFH": ["work from home", "remote work", "telecommuting", "telework"],
            "flexible schedule": ["flex time", "flexible hours", "work schedule"],
            # Performance and development
            "performance review": ["evaluation", "appraisal", "assessment", "feedback"],
            "training": ["development", "education", "learning", "courses"],
            "promotion": ["advancement", "career growth", "progression", "raise"],
            # HR processes
            "onboarding": ["orientation", "new hire", "getting started", "setup"],
            "offboarding": ["termination", "leaving", "exit", "departure"],
            "policy": ["procedure", "guidelines", "rules", "standards"],
            # Workplace issues and safety
            "harassment": [
                "discrimination",
                "bullying",
                "hostile",
                "inappropriate behavior",
            ],
            "complaint": ["report", "grievance", "issue", "concern", "problem"],
            "discrimination": ["harassment", "bias", "unfair treatment", "prejudice"],
            "emergency": ["crisis", "urgent", "fire", "evacuation", "safety"],
            "safety": ["security", "hazard", "emergency", "protection", "guidelines"],
            # Expenses and travel
            "expenses": ["reimbursement", "costs", "spending", "business expenses"],
            "reimbursement": ["expenses", "refund", "repayment", "reimbursable"],
            "travel": ["business trip", "trip", "hotel", "flight", "transportation"],
            "meal allowance": ["food", "dining", "per diem", "meal budget"],
            # Technology and security
            "password": ["security", "login", "authentication", "access"],
            "VPN": ["remote access", "network", "connection", "security"],
            "security": ["password", "access", "protection", "privacy", "incident"],
            "device": ["computer", "laptop", "phone", "equipment", "technology"],
            "WiFi": ["network", "internet", "connection", "wireless"],
        }
        # Common question patterns and their expansions.
        # The trailing alternatives are grouped explicitly: "|" has the lowest
        # precedence in a regex, so an ungrouped "...earn|accrue" would match
        # any query that merely contains "accrue".
        self.question_patterns = {
            r"how much.*time.*(?:earn|accrue)": [
                "PTO accrual",
                "vacation days",
                "time off allocation",
            ],
            r"how many.*days.*(?:get|receive)": [
                "PTO accrual",
                "vacation days",
                "annual leave",
            ],
            r"what.*my.*allowance": [
                "PTO accrual",
                "vacation allowance",
                "time off allocation",
            ],
            r"time off.*balance": ["PTO balance", "vacation balance", "accrued time"],
            r"sick.*time": ["sick leave", "medical leave", "PTO for illness"],
        }

    @staticmethod
    def _mentions(text_lower: str, term: str) -> bool:
        """
        Report whether ``term`` occurs in ``text_lower`` at the start of a word.

        Args:
            text_lower: Text to search; must already be lower-cased.
            term: Word or phrase to look for; compared case-insensitively.
        Returns:
            True if the lower-cased term appears and is not preceded by a word
            character. Trailing characters are unconstrained, so suffix
            inflections ("vacations", "earned") still match while mid-word
            hits ("learn" for "earn") do not.
        """
        needle = term.lower()
        if not needle:
            return False
        return re.search(r"(?<!\w)" + re.escape(needle), text_lower) is not None

    def expand_query(self, query: str) -> str:
        """
        Expand a user query with relevant synonyms and terminology.

        Expansion terms are collected in a fixed order - pattern-based terms
        first (in ``question_patterns`` order), then synonym terms (in
        ``hr_synonyms`` order) - and de-duplicated keeping the first
        occurrence, so the same query always yields the same result.

        Args:
            query: Original user query
        Returns:
            The original query followed by space-separated expansion terms,
            or the query unchanged when nothing matches. When terms are
            appended the result is at most ``MAX_EXPANDED_LENGTH`` characters;
            only whole terms are appended within that budget.
        """
        query_lower = query.lower()
        candidates: List[str] = []

        # Pattern-based expansion
        for pattern, expansions in self.question_patterns.items():
            if re.search(pattern, query_lower):
                candidates.extend(expansions)

        # Synonym-based expansion. One scan covers single words and
        # multi-word phrases; keys are compared case-insensitively so that
        # upper-case keys such as "PTO" or "VPN" can match.
        for phrase, synonyms in self.hr_synonyms.items():
            if self._mentions(query_lower, phrase):
                candidates.extend(synonyms)

        # Ordered de-duplication; a term identical to the query adds nothing.
        additions = [term for term in dict.fromkeys(candidates) if term != query]
        if not additions:
            return query

        limit = self.MAX_EXPANDED_LENGTH
        parts = [query]
        used = len(query)
        for term in additions:
            cost = 1 + len(term)  # separating space + term
            if used + cost > limit:
                break
            parts.append(term)
            used += cost

        # The slice only has an effect when the query alone exceeds the limit.
        return " ".join(parts)[:limit]

    def _extract_key_terms(self, text: str) -> List[str]:
        """
        Extract key terms from text, removing common stop words.

        Args:
            text: Arbitrary text; it is lower-cased before tokenizing.
        Returns:
            Lower-cased ``\\w+`` tokens in their original order, excluding
            stop words and tokens of two characters or fewer.
        """
        # Simple word extraction (could be enhanced with NLP libraries)
        tokens = re.findall(r"\b\w+\b", text.lower())
        return [
            token
            for token in tokens
            if token not in self._STOP_WORDS and len(token) > 2
        ]

    def get_domain_suggestions(self, query: str) -> List[str]:
        """
        Get domain-specific suggestions for improving the query.

        The checks are case-insensitive substring tests and are evaluated in
        a fixed order; suggestions from earlier checks take precedence when
        more than three are collected.

        Args:
            query: User's original query
        Returns:
            List of at most three suggested alternative phrasings (empty when
            no rule applies)
        """
        query_lower = query.lower()
        suggestions: List[str] = []

        # Specific suggestions based on common user patterns
        if "personal time" in query_lower:
            suggestions += [
                "How much PTO do I accrue each year?",
                "What is my paid time off allocation?",
                "How many vacation days do I get annually?",
            ]
        if "time off" in query_lower and "how much" in query_lower:
            suggestions += [
                "What is my PTO accrual rate?",
                "How many paid time off days do I earn per year?",
            ]
        if "work from home" in query_lower or "remote" in query_lower:
            suggestions += [
                "What is the remote work policy?",
                "Can I work from home?",
                "What are the telecommuting guidelines?",
            ]

        return suggestions[:3]  # Limit to top 3 suggestions