"""Crawler for the HUFS computer department bulletin boards (notice, archive, jobs)."""

import re
from base64 import b64encode
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup, Tag

from app.config import get_settings

# Matches detail-view links such as /bbs/computer/1926/12345/artclView.do.
# The named groups are required: crawl_board_list reads match.group("article_id").
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")

# Accepts dates like 2024-01-05, 2024.1.5, or 2024/01/05.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")

BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},  # notices
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},  # archive / downloads
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},  # job postings
}


@dataclass
class PostStub:
    """Lightweight record scraped from a board's list page."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    published_at: datetime | None


@dataclass
class PostDetail:
    """Full record scraped from a post's detail page."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    summary: str | None
    content_text: str | None
    attachments: list[dict]


class HufsCrawler:
    def __init__(self) -> None:
        settings = get_settings()
        self.base_url = settings.base_url
        self.max_pages_per_board = settings.max_pages_per_board
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": settings.user_agent})
        self.timeout = settings.request_timeout_seconds

    def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
        """Scrape one page of a board's list view into de-duplicated post stubs."""
        board = BOARD_CONFIG[board_key]
        list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
        response = self.session.get(
            list_url,
            params={"layout": "unknown", "page": page},
            timeout=self.timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        seen_article_ids: set[int] = set()
        posts: list[PostStub] = []
        for anchor in soup.select("a[href*='artclView.do']"):
            href = anchor.get("href") or ""
            match = ARTICLE_PATH_RE.search(href)
            if not match:
                continue
            article_id = int(match.group("article_id"))
            if article_id in seen_article_ids:  # the same article can be linked more than once
                continue
            seen_article_ids.add(article_id)
            row = anchor.find_parent("tr")  # the surrounding table row carries the date column
            posts.append(
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=article_id,
                    title=self._normalize_text(anchor.get_text(" ", strip=True)),
                    post_url=self._build_public_post_url(
                        subview_id=board["subview_id"],
                        board_id=board["board_id"],
                        article_id=article_id,
                    ),
                    published_at=self._extract_date_from_row(row),
                )
            )
        return posts

    def crawl_post_detail(self, stub: PostStub) -> PostDetail:
        """Fetch a post's detail page and extract title, body, metadata, and attachments."""
        detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
        response = self.session.get(detail_url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Selector lists are ordered from most to least specific; the first hit wins.
        title = self._first_text(
            soup,
            [
                ".artclView .title h2",
                ".artclView h2",
                ".view-title",
                "h2",
                "h3",
            ],
        ) or stub.title
        content_node = self._first_node(
            soup,
            [
                ".artclContents",
                ".fr-view",
                ".view-con",
                ".artcl-view",
                ".bbs--view",
                "#artclView",
            ],
        )
        content_text = (
            self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
        )
        # 작성자 ("author") and 등록자 ("registrant") are the Korean labels on the page.
        author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
        published_at = self._extract_date_from_soup(soup) or stub.published_at
        attachments = self._extract_attachments(soup)
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=title,
            post_url=stub.post_url,
            author=author,
            published_at=published_at,
            summary=self._make_summary(content_text),
            content_text=content_text,
            attachments=attachments,
        )
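
    # --- Added example (a sketch, not part of the original module): a convenience
    # generator chaining the two public methods above. It uses only names defined
    # in this class, and assumes an empty list page means the board is exhausted.
    def iter_posts(self, board_key: str):
        """Yield a PostDetail for every post, paging up to max_pages_per_board."""
        for page in range(1, self.max_pages_per_board + 1):
            stubs = self.crawl_board_list(board_key, page=page)
            if not stubs:
                break  # ran past the last page
            for stub in stubs:
                yield self.crawl_post_detail(stub)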

    def _build_detail_url(self, board_id: int, article_id: int) -> str:
        """Raw detail-view URL used for scraping (not the user-facing link)."""
        return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"

    def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
        """Build the user-facing subview URL.

        The public site embeds the article path in an `enc` query parameter:
        base64 of "fnct1|@@|" followed by the URL-encoded article path.
        """
        article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
        encoded_path = quote(article_path, safe="")
        enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
        return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"

    def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
        """Collect de-duplicated attachment links from a detail page."""
        attachments: list[dict] = []
        seen: set[str] = set()
        for anchor in soup.select(
            "a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"
        ):
            href = anchor.get("href") or ""
            name = self._normalize_text(anchor.get_text(" ", strip=True))
            if not href or not name:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url in seen:
                continue
            seen.add(absolute_url)
            attachments.append({"name": name, "url": absolute_url})
        return attachments

    def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
        """Find a labeled metadata value (e.g. the author) near a matching text node."""
        for label in labels:
            label_node = soup.find(string=re.compile(label, re.IGNORECASE))
            if not label_node:
                continue
            parent = label_node.parent if isinstance(label_node.parent, Tag) else None
            if not parent:
                continue
            container = parent.parent if parent.parent else parent
            candidate_text = self._normalize_text(container.get_text(" ", strip=True))
            candidate_text = candidate_text.replace(label, "").replace(":", "").strip()
            if candidate_text:
                return candidate_text
        return None

    def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
        if row is None:
            return None
        return self._parse_date(row.get_text(" ", strip=True))

    def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
        # Try the metadata containers first; fall back to scanning the whole body.
        for selector in [".artclInfo", ".view-info", ".info", "body"]:
            node = soup.select_one(selector)
            if not node:
                continue
            parsed = self._parse_date(node.get_text(" ", strip=True))
            if parsed:
                return parsed
        return None

    def _parse_date(self, text: str | None) -> datetime | None:
        if not text:
            return None
        match = DATE_RE.search(text)
        if not match:
            return None
        year, month, day = map(int, match.groups())
        return datetime(year, month, day)

    def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
        """Collapse whitespace and truncate to max_length with a trailing ellipsis."""
        if not content_text:
            return None
        normalized = " ".join(content_text.split())
        if len(normalized) <= max_length:
            return normalized
        return normalized[: max_length - 3].rstrip() + "..."

    def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
        for selector in selectors:
            node = soup.select_one(selector)
            if node:
                return node
        return None

    def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
        node = self._first_node(soup, selectors)
        if node:
            return self._normalize_text(node.get_text(" ", strip=True))
        return None

    def _normalize_text(self, value: str | None) -> str:
        if not value:
            return ""
        return re.sub(r"\s+", " ", value).strip()
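

# Example usage (a sketch, not part of the original module). Assumes
# app.config.get_settings() provides base_url, user_agent, and the timeout /
# paging fields read in __init__; "notice" is one of the BOARD_CONFIG keys.
if __name__ == "__main__":
    crawler = HufsCrawler()
    for stub in crawler.crawl_board_list("notice", page=1):
        detail = crawler.crawl_post_detail(stub)
        print(detail.published_at, detail.title, detail.post_url)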