import hashlib
import re
from typing import Dict, List, Optional
|
|
class DataNormalizer:
    """Normalize, de-duplicate and summarise course records stored as dicts.

    Each course is a plain ``dict``.  The keys read by this class are
    ``name``, ``description``, ``program_id``, ``semester``, ``credits``,
    ``hours``, ``type`` and ``tags``; any other keys are carried through
    untouched by :meth:`normalize_courses`.
    """

    # Quote characters removed from course names (ASCII and typographic).
    _QUOTE_TABLE = str.maketrans('', '', '"“”„«»')

    # Keywords this short are abbreviations ("ml", "cv", "pm", ...).  They are
    # matched as whole words only, otherwise "ml" would fire on "html" and
    # "pm" on "development".
    _SHORT_KEYWORD_LEN = 3

    # Negated spellings that contain a "required" marker as a substring and
    # would otherwise be misclassified as required.
    _NOT_REQUIRED_MARKERS = ('необяз', 'not required', 'non-required')
    _REQUIRED_MARKERS = ('обязательная', 'required', 'обяз')
    _ELECTIVE_MARKERS = ('по выбору', 'elective', 'выбор')

    def __init__(self):
        # tag -> keywords (all lower-case) whose presence in a course's
        # name/short description assigns that tag.
        self.tag_keywords = {
            'ml': ['машинное обучение', 'machine learning', 'ml', 'алгоритм', 'модель'],
            'dl': ['глубокое обучение', 'deep learning', 'нейронная сеть', 'cnn', 'rnn', 'transformer'],
            'nlp': ['nlp', 'обработка естественного языка', 'natural language', 'текст', 'язык'],
            'cv': ['компьютерное зрение', 'computer vision', 'cv', 'изображение', 'видео'],
            'math': ['математика', 'математический', 'алгебра', 'геометрия', 'анализ'],
            'stats': ['статистика', 'вероятность', 'статистический', 'probability'],
            'product': ['продукт', 'product', 'разработка продукта', 'продуктовая'],
            'business': ['бизнес', 'business', 'менеджмент', 'управление', 'экономика'],
            'pm': ['project management', 'управление проектами', 'pm', 'проект'],
            'systems': ['система', 'system', 'архитектура', 'инфраструктура'],
            'data': ['данные', 'data', 'анализ данных', 'big data', 'база данных']
        }

    def normalize_courses(self, courses: List[Dict]) -> List[Dict]:
        """Normalize every course and drop unusable and duplicate entries.

        A course is dropped when it has no usable name.  Two courses are
        duplicates when their normalized name, ``program_id`` and semester
        coincide; the first occurrence wins.  Input order is preserved.
        """
        normalized_courses = []
        seen_hashes = set()

        for course in courses:
            normalized = self._normalize_course(course)
            if not normalized:
                continue

            course_hash = self._calculate_course_hash(normalized)
            if course_hash in seen_hashes:
                continue

            seen_hashes.add(course_hash)
            normalized_courses.append(normalized)

        return normalized_courses

    def _normalize_course(self, course: Dict) -> Optional[Dict]:
        """Return a normalized shallow copy of ``course``.

        Returns ``None`` when the name is missing or becomes empty after
        normalization.  The input dict is not modified.
        """
        # Decide on the *normalized* name so that names consisting only of
        # whitespace or quotes are rejected as well.
        name = self._normalize_name(course.get('name'))
        if not name:
            return None

        normalized = course.copy()
        normalized['name'] = name
        # Derived fields are computed from the already-cleaned name.
        normalized['short_desc'] = self._generate_short_desc(normalized)
        normalized['tags'] = self._generate_tags(normalized)

        normalized['semester'] = self._normalize_semester(course.get('semester', 1))
        normalized['credits'] = self._normalize_credits(course.get('credits', 0))
        normalized['hours'] = self._normalize_hours(course.get('hours', 0))
        normalized['type'] = self._normalize_type(course.get('type', 'required'))

        return normalized

    def _normalize_name(self, name: str) -> str:
        """Strip quotes, collapse whitespace runs and trim the course name.

        Falsy input yields an empty string.
        """
        if not name:
            return ''

        # Remove quotes first: trimming before quote removal would leave the
        # padding of names such as '" ML "' in place.
        without_quotes = str(name).translate(self._QUOTE_TABLE)
        return re.sub(r'\s+', ' ', without_quotes).strip()

    def _generate_short_desc(self, course: dict) -> str:
        """Build a short description for a course.

        Preference order: the stripped ``description`` (cut to 220 characters
        plus ``'...'`` when longer), then a name longer than 50 characters
        (cut to 220 characters), then a fixed placeholder text.
        """
        raw_desc = course.get('description', '')
        desc = str(raw_desc).strip() if raw_desc else ''
        if desc:
            if len(desc) > 220:
                return desc[:220] + '...'
            return desc

        raw_name = course.get('name', '')
        name = str(raw_name) if raw_name else ''
        if len(name) > 50:
            return name[:220]

        return 'Курс из учебного плана программы'

    def _generate_tags(self, course: Dict) -> List[str]:
        """Return the tags whose keywords occur in the name or short description.

        Matching is case-insensitive.  Keywords longer than
        ``_SHORT_KEYWORD_LEN`` match as substrings (so inflected forms still
        hit); shorter ones must appear as a whole word.  Tags are returned in
        the iteration order of ``self.tag_keywords``.
        """
        text = f"{course.get('name', '')} {course.get('short_desc', '')}".lower()
        words = set(re.findall(r'\w+', text))

        def matches(keyword: str) -> bool:
            if len(keyword) <= self._SHORT_KEYWORD_LEN:
                return keyword in words
            return keyword in text

        return [
            tag
            for tag, keywords in self.tag_keywords.items()
            if any(matches(keyword) for keyword in keywords)
        ]

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Convert ``value`` with ``int()``; return ``None`` if that fails."""
        try:
            return int(value)
        except (ValueError, TypeError, OverflowError):
            # OverflowError is raised by int(float('inf')).
            return None

    def _normalize_semester(self, semester) -> int:
        """Return ``semester`` as an int in 1..4, or 1 when invalid."""
        number = self._to_int(semester)
        if number is not None and 1 <= number <= 4:
            return number
        return 1

    def _normalize_credits(self, credits) -> int:
        """Return ``credits`` as a non-negative int, or 0 when invalid."""
        number = self._to_int(credits)
        if number is not None and number >= 0:
            return number
        return 0

    def _normalize_hours(self, hours) -> int:
        """Return ``hours`` as a non-negative int, or 0 when invalid."""
        number = self._to_int(hours)
        if number is not None and number >= 0:
            return number
        return 0

    def _normalize_type(self, course_type: str) -> str:
        """Map a free-form course type to ``'required'`` or ``'elective'``.

        Empty or unrecognised values map to ``'required'``.  When both a
        required and an elective marker are present, ``'required'`` wins.
        """
        if not course_type:
            return 'required'

        type_lower = str(course_type).lower()

        # Negated forms contain a "required" marker as a substring
        # ("необязательная" contains "обяз"), so they are checked first.
        if any(marker in type_lower for marker in self._NOT_REQUIRED_MARKERS):
            return 'elective'
        if any(marker in type_lower for marker in self._REQUIRED_MARKERS):
            return 'required'
        if any(marker in type_lower for marker in self._ELECTIVE_MARKERS):
            return 'elective'

        return 'required'

    def _calculate_course_hash(self, course: Dict) -> str:
        """Return an MD5 hex digest identifying a course for de-duplication.

        The digest covers ``name``, ``program_id`` and ``semester``.  It is
        used as a set key only, not for any security purpose.
        """
        parts = (
            course.get('name', ''),
            course.get('program_id', ''),
            course.get('semester', ''),
        )
        # A separator keeps ("MLa", "1") and ("ML", "a1") from producing the
        # same text and being merged as duplicates.
        text = '\x1f'.join(str(part) for part in parts)
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def merge_courses(self, courses_list: List[List[Dict]]) -> List[Dict]:
        """Flatten several course lists and normalize the combined list.

        Empty or ``None`` sub-lists are skipped.
        """
        all_courses = []
        for courses in courses_list:
            if courses:
                all_courses.extend(courses)

        return self.normalize_courses(all_courses)

    def validate_course(self, course: Dict) -> bool:
        """Check that a course has the fields needed for de-duplication.

        ``name``, ``program_id`` and ``semester`` must all be truthy, and the
        name must be at least 3 characters long, not counting surrounding
        whitespace.
        """
        required_fields = ('name', 'program_id', 'semester')
        if not all(course.get(field) for field in required_fields):
            return False

        # str() guards against a non-string name reaching len().
        return len(str(course['name']).strip()) >= 3

    def get_statistics(self, courses: List[Dict]) -> Dict:
        """Count courses in total and per program, semester, type and tag.

        Missing fields are counted under ``'unknown'`` (program), ``1``
        (semester) and ``'required'`` (type); a missing or ``None`` ``tags``
        value contributes no tag counts.
        """
        stats = {
            'total_courses': len(courses),
            'by_program': {},
            'by_semester': {},
            'by_type': {},
            'by_tags': {}
        }

        def bump(bucket: str, key) -> None:
            counts = stats[bucket]
            counts[key] = counts.get(key, 0) + 1

        for course in courses:
            bump('by_program', course.get('program_id', 'unknown'))
            bump('by_semester', course.get('semester', 1))
            bump('by_type', course.get('type', 'required'))

            for tag in course.get('tags') or []:
                bump('by_tags', tag)

        return stats
|
|
def main():
    """Demo entry point: normalize two sample courses and print the result.

    Prints the number of courses left after normalization and the
    statistics dict computed for them.
    """
    sample_courses = [
        dict(
            id='test_1',
            program_id='ai',
            name='Машинное обучение',
            semester=1,
            credits=6,
            type='required',
        ),
        dict(
            id='test_2',
            program_id='ai_product',
            name='Глубокое обучение',
            semester=2,
            credits=4,
            type='elective',
        ),
    ]

    normalizer = DataNormalizer()
    cleaned = normalizer.normalize_courses(sample_courses)
    summary = normalizer.get_statistics(cleaned)

    print(f'Нормализовано курсов: {len(cleaned)}')
    print(f'Статистика: {summary}')
|
|
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|