import hashlib
import json
import os
import re
from typing import Dict, List
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
|
|
class HTMLScraper:
    """Scrape ITMO master's program landing pages and collect PDF links.

    Fetches each configured program page, extracts its title, a short
    description and any links that look like PDF documents, and can
    persist the collected data as JSON.
    """

    def __init__(self):
        # One shared session gives connection pooling and a common User-Agent.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        # program_id -> program landing page URL
        self.program_urls = {
            'ai': 'https://abit.itmo.ru/program/master/ai',
            'ai_product': 'https://abit.itmo.ru/program/master/ai_product'
        }

    def scrape_programs(self) -> Dict:
        """Scrape every configured program page.

        Returns:
            Dict mapping program_id to the scraped program data.

        Failures on individual programs are reported and skipped so that
        one bad page does not abort the whole run.
        """
        programs = {}

        for program_id, url in self.program_urls.items():
            try:
                print(f'Скрапинг программы {program_id}...')
                programs[program_id] = self._scrape_program_page(url, program_id)
            except Exception as e:
                # Best-effort boundary: report and continue with the rest.
                print(f'Ошибка при скрапинге {program_id}: {e}')

        return programs

    def _scrape_program_page(self, url: str, program_id: str) -> Dict:
        """Download one program page and extract its structured data.

        Raises:
            requests.HTTPError: on a non-2xx response.
        """
        response = self.session.get(url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        return {
            'id': program_id,
            'title': self._extract_title(soup),
            'description': self._extract_description(soup),
            'url': url,
            'pdf_links': self._extract_pdf_links(soup, url),
            # Content hash lets callers detect when the page has changed.
            'hash': self._calculate_hash(response.content),
        }

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """Return the page title, preferring <h1> over <title>; '' if absent."""
        title_elem = soup.find('h1') or soup.find('title')
        return title_elem.get_text().strip() if title_elem else ''

    def _extract_description(self, soup: BeautifulSoup) -> str:
        """Return the first plausible description block, truncated to 500 chars.

        Tries progressively more generic selectors; a candidate must be
        longer than 50 characters to count as a real description.
        """
        desc_selectors = [
            '.program-description',
            '.description',
            '.program-info',
            'p',
            '.content',
        ]

        for selector in desc_selectors:
            elem = soup.select_one(selector)
            if elem:
                text = elem.get_text().strip()
                if len(text) > 50:
                    return text[:500]

        return ''

    def _extract_pdf_links(self, soup: BeautifulSoup, base_url: str) -> List[Dict]:
        """Collect links on the page that look like PDF documents."""
        pdf_links = []

        for link in soup.find_all('a', href=True):
            href = link.get('href', '')
            text = link.get_text().strip().lower()

            if self._is_pdf_link(href, text):
                pdf_links.append({
                    'url': self._make_absolute_url(href, base_url),
                    'text': text,
                    'filename': self._extract_filename(href),
                })

        return pdf_links

    def _is_pdf_link(self, href: str, text: str) -> bool:
        """Heuristic: does the URL or anchor text suggest a PDF document?

        Deliberately loose — curriculum/program keywords match too, so
        some non-PDF links may be included.
        """
        # Fix: the original list contained 'учебный план' twice.
        pdf_indicators = (
            'учебный план', 'curriculum', 'plan',
            '.pdf', 'pdf', 'программа', 'program',
        )

        href_lower = href.lower()
        return any(ind in href_lower or ind in text for ind in pdf_indicators)

    def _make_absolute_url(self, href: str, base_url: str) -> str:
        """Resolve *href* against the page URL per RFC 3986.

        urljoin correctly handles absolute URLs, protocol-relative
        '//host/x', root-relative '/x' and document-relative 'x' hrefs.
        The previous hand-rolled logic mis-resolved document-relative
        links by appending them to the full page URL instead of its
        parent path.
        """
        return urljoin(base_url, href)

    def _extract_filename(self, href: str) -> str:
        """Derive a .pdf filename from a link, ignoring query string/fragment."""
        filename = urlsplit(href).path.split('/')[-1]
        if not filename:
            filename = 'document'  # href ended with '/', no basename available
        if not filename.endswith('.pdf'):
            filename += '.pdf'
        return filename

    def _calculate_hash(self, content: bytes) -> str:
        """Return the SHA-256 hex digest of the raw response body."""
        return hashlib.sha256(content).hexdigest()

    def save_programs(self, programs: Dict, output_path: str = 'data/processed/programs.json'):
        """Write the scraped programs to *output_path* as pretty UTF-8 JSON."""
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(programs, f, ensure_ascii=False, indent=2)

        print(f'Программы сохранены в {output_path}')
|
|
def main():
    """Entry point: scrape all configured programs, persist them, report results."""
    scraper = HTMLScraper()
    programs = scraper.scrape_programs()
    scraper.save_programs(programs)

    # Per-program summary of what was found.
    for info in programs.values():
        print(f'\n{info["title"]}:')
        print(f'PDF ссылок найдено: {len(info["pdf_links"])}')
        for pdf in info['pdf_links']:
            print(f' - {pdf["filename"]}: {pdf["url"]}')


if __name__ == '__main__':
    main()
|
|