msse-ai-engineering / src /search /query_expander.py
Tobias Pasquale
fix: Remove unused imports and fix line length for flake8 compliance
923405c
raw
history blame
11.4 kB
"""
Query Enhancement Module - Improve semantic search with query expansion and synonyms.
This module helps bridge the gap between natural user language and document terminology
by expanding queries with relevant synonyms and domain-specific terms.
"""
import re
from typing import List
class QueryExpander:
    """
    Expands user queries with relevant synonyms and domain-specific terminology
    to improve semantic search results in corporate policy documents.

    Expansion is driven by two lookup tables built in ``__init__``:

    * ``hr_synonyms`` maps a word or phrase to related terms.
    * ``question_patterns`` maps a regular expression (matched against the
      lower-cased query) to terms that should be appended when it matches.
    """

    # Upper bound on the length of an expanded query, in characters.
    MAX_QUERY_LENGTH = 500

    # Words ignored by ``_extract_key_terms``. Built once at class level
    # instead of being re-created on every call.
    _STOP_WORDS = frozenset(
        {
            "the",
            "a",
            "an",
            "and",
            "or",
            "but",
            "in",
            "on",
            "at",
            "to",
            "for",
            "of",
            "with",
            "by",
            "how",
            "what",
            "when",
            "where",
            "why",
            "is",
            "are",
            "do",
            "does",
            "can",
            "could",
            "should",
            "would",
            "will",
            "i",
            "me",
            "my",
        }
    )

    def __init__(self) -> None:
        """Initialize the query expander with predefined synonym mappings."""
        # Additional HR-specific synonyms
        self.hr_synonyms = {
            # Time off related - enhanced with policy document terms
            "personal time": [
                "PTO",
                "paid time off",
                "time off",
                "vacation",
                "personal days",
                "leave",
                "accrual",
                "days off",
            ],
            "vacation": [
                "PTO",
                "paid time off",
                "time off",
                "personal time",
                "vacation days",
                "holiday",
                "accrual",
            ],
            "sick leave": [
                "sick time",
                "medical leave",
                "illness",
                "health days",
                "PTO",
            ],
            "time off": [
                "PTO",
                "paid time off",
                "vacation",
                "leave",
                "personal time",
                "days off",
                "accrual",
            ],
            "PTO": [
                "paid time off",
                "vacation",
                "personal time",
                "time off",
                "accrual",
                "days off",
            ],
            "leave": [
                "time off",
                "absence",
                "PTO",
                "paid time off",
                "vacation",
                "accrual",
            ],
            "days off": [
                "PTO",
                "paid time off",
                "vacation",
                "time off",
                "personal time",
                "leave",
            ],
            "accrual": [
                "earn",
                "accumulate",
                "build up",
                "PTO",
                "vacation",
                "time off",
            ],
            "earn": ["accrue", "accumulate", "get", "receive", "build up"],
            "annual": ["yearly", "per year", "each year", "annually"],
            "allowance": [
                "allocation",
                "entitlement",
                "amount",
                "accrual",
                "benefit",
                "limit",
            ],
            "allocation": ["allowance", "entitlement", "amount", "limit", "budget"],
            # Benefits related - enhanced with employee terminology
            "benefits": [
                "perks",
                "compensation",
                "package",
                "coverage",
                "health insurance",
                "401k",
                "retirement",
            ],
            "insurance": [
                "coverage",
                "health plan",
                "medical",
                "benefits",
                "healthcare",
                "dental",
                "vision",
            ],
            "retirement": [
                "401k",
                "pension",
                "savings",
                "investment",
                "matching",
                "contribution",
            ],
            "healthcare": [
                "medical",
                "health insurance",
                "coverage",
                "benefits",
                "health plan",
                "dental",
                "vision",
            ],
            "401k": ["retirement", "savings", "matching", "contribution", "pension"],
            "health plan": [
                "healthcare",
                "medical",
                "insurance",
                "coverage",
                "benefits",
            ],
            "dental": ["dental coverage", "dental insurance", "benefits", "healthcare"],
            "vision": ["vision coverage", "eye care", "benefits", "healthcare"],
            "gym": ["fitness", "wellness", "health", "membership", "benefits"],
            "tuition": [
                "education",
                "training",
                "reimbursement",
                "learning",
                "development",
            ],
            # Work arrangements
            "remote work": ["work from home", "telecommuting", "WFH", "telework"],
            "work from home": ["remote work", "telecommuting", "WFH", "telework"],
            "telecommuting": ["remote work", "work from home", "WFH", "telework"],
            "WFH": ["work from home", "remote work", "telecommuting", "telework"],
            "flexible schedule": ["flex time", "flexible hours", "work schedule"],
            # Performance and development
            "performance review": ["evaluation", "appraisal", "assessment", "feedback"],
            "training": ["development", "education", "learning", "courses"],
            "promotion": ["advancement", "career growth", "progression", "raise"],
            # HR processes
            "onboarding": ["orientation", "new hire", "getting started", "setup"],
            "offboarding": ["termination", "leaving", "exit", "departure"],
            "policy": ["procedure", "guidelines", "rules", "standards"],
            # Workplace issues and safety
            "harassment": [
                "discrimination",
                "bullying",
                "hostile",
                "inappropriate behavior",
            ],
            "complaint": ["report", "grievance", "issue", "concern", "problem"],
            "discrimination": ["harassment", "bias", "unfair treatment", "prejudice"],
            "emergency": ["crisis", "urgent", "fire", "evacuation", "safety"],
            "safety": ["security", "hazard", "emergency", "protection", "guidelines"],
            # Expenses and travel
            "expenses": ["reimbursement", "costs", "spending", "business expenses"],
            "reimbursement": ["expenses", "refund", "repayment", "reimbursable"],
            "travel": ["business trip", "trip", "hotel", "flight", "transportation"],
            "meal allowance": ["food", "dining", "per diem", "meal budget"],
            # Technology and security
            "password": ["security", "login", "authentication", "access"],
            "VPN": ["remote access", "network", "connection", "security"],
            "security": ["password", "access", "protection", "privacy", "incident"],
            "device": ["computer", "laptop", "phone", "equipment", "technology"],
            "WiFi": ["network", "internet", "connection", "wireless"],
        }
        # Common question patterns and their expansions.
        # The alternatives are grouped explicitly: without the group, "|" has
        # the lowest precedence and e.g. "receive" alone would match anywhere.
        self.question_patterns = {
            r"how much.*time.*(?:earn|accrue)": [
                "PTO accrual",
                "vacation days",
                "time off allocation",
            ],
            r"how many.*days.*(?:get|receive)": [
                "PTO accrual",
                "vacation days",
                "annual leave",
            ],
            r"what.*my.*allowance": [
                "PTO accrual",
                "vacation allowance",
                "time off allocation",
            ],
            r"time off.*balance": ["PTO balance", "vacation balance", "accrued time"],
            r"sick.*time": ["sick leave", "medical leave", "PTO for illness"],
        }

    def expand_query(self, query: str) -> str:
        """
        Expand a user query with relevant synonyms and terminology.

        Terms are appended in a fixed order (question-pattern expansions first,
        then synonyms in ``hr_synonyms`` table order) so the same query always
        yields the same expanded string.

        Args:
            query: Original user query

        Returns:
            The original query followed by space-separated expansion terms,
            at most ``MAX_QUERY_LENGTH`` characters long. If nothing matches,
            the query is returned unchanged.
        """
        query_lower = query.lower()
        candidates: List[str] = []

        # Pattern-based expansion
        for pattern, expansions in self.question_patterns.items():
            if re.search(pattern, query_lower):
                candidates.extend(expansions)

        # Synonym expansion for single words and multi-word phrases alike.
        # Keys are lower-cased so entries such as "PTO" or "VPN" can match the
        # lower-cased query. The lookbehind requires the phrase to start at a
        # word boundary, so "pto" does not fire inside "laptop" nor "earn"
        # inside "learning", while suffixed forms ("vacations") still match.
        for phrase, synonyms in self.hr_synonyms.items():
            if re.search(r"(?<!\w)" + re.escape(phrase.lower()), query_lower):
                candidates.extend(synonyms)

        # Order-preserving de-duplication; a set would make the output order
        # depend on string hash randomisation.
        extras = [term for term in dict.fromkeys(candidates) if term != query]
        if not extras:
            return query

        # Append whole terms while they fit, so the limit never cuts a term
        # in half.
        parts = [query]
        length = len(query)
        for term in extras:
            length += 1 + len(term)  # +1 for the joining space
            if length > self.MAX_QUERY_LENGTH:
                break
            parts.append(term)

        # The slice only matters when the original query itself is too long.
        return " ".join(parts)[: self.MAX_QUERY_LENGTH]

    def _extract_key_terms(self, text: str) -> List[str]:
        """
        Extract key terms from text, removing common stop words.

        Args:
            text: Text to tokenize

        Returns:
            Lower-cased words of three or more characters that are not stop
            words, in their original order (duplicates are kept).
        """
        # Simple word extraction (could be enhanced with NLP libraries)
        words = re.findall(r"\b\w+\b", text.lower())
        return [
            word for word in words if word not in self._STOP_WORDS and len(word) > 2
        ]

    def get_domain_suggestions(self, query: str) -> List[str]:
        """
        Get domain-specific suggestions for improving the query.

        Args:
            query: User's original query

        Returns:
            List of at most three suggested alternative phrasings; empty when
            the query matches none of the known patterns.
        """
        suggestions: List[str] = []
        query_lower = query.lower()

        # Specific suggestions based on common user patterns
        if "personal time" in query_lower:
            suggestions.extend(
                [
                    "How much PTO do I accrue each year?",
                    "What is my paid time off allocation?",
                    "How many vacation days do I get annually?",
                ]
            )
        if "time off" in query_lower and "how much" in query_lower:
            suggestions.extend(
                [
                    "What is my PTO accrual rate?",
                    "How many paid time off days do I earn per year?",
                ]
            )
        if "work from home" in query_lower or "remote" in query_lower:
            suggestions.extend(
                [
                    "What is the remote work policy?",
                    "Can I work from home?",
                    "What are the telecommuting guidelines?",
                ]
            )
        return suggestions[:3]  # Limit to top 3 suggestions