Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
This commit is contained in:
236
app/crawler.py
Normal file
236
app/crawler.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import re
|
||||
from base64 import b64encode
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Matches article detail paths such as "/bbs/computer/1926/12345/artclView.do"
# and captures the numeric board and article identifiers.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Matches dates like "2024-03-15", "2024.3.5", "2024/03/15" (year limited to
# 20xx; separators -, ., or /).  Note: \d{1,2} does not validate calendar
# ranges, so downstream parsing must tolerate values like month 99.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")


# Boards to crawl, keyed by a short internal name.
#   board_id:   numeric id used in the bbs URL path ("/bbs/computer/<id>/...").
#   board_name: human-readable board title (Korean).
#   subview_id: id used when building the public "subview.do" post URL.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
|
||||
|
||||
|
||||
@dataclass
class PostStub:
    """Lightweight reference to a post, parsed from a board list page.

    Carries just enough information to identify the post and fetch its
    detail page later (see ``HufsCrawler.crawl_post_detail``).
    """

    # Key into BOARD_CONFIG (e.g. "notice", "archive", "jobs").
    board_key: str
    # Human-readable board title (Korean), from BOARD_CONFIG.
    board_name: str
    # Numeric board id used in the bbs URL path.
    board_id: int
    # Numeric article id extracted from the artclView.do link.
    article_id: int
    # Post title text from the list-page anchor (whitespace-normalized).
    title: str
    # Public, shareable URL of the post (subview.do form).
    post_url: str
    # Publication date parsed from the list row, or None if not found.
    published_at: datetime | None
|
||||
|
||||
|
||||
@dataclass
class PostDetail:
    """Full post record, parsed from an article detail page."""

    # Key into BOARD_CONFIG (e.g. "notice", "archive", "jobs").
    board_key: str
    # Human-readable board title (Korean), from BOARD_CONFIG.
    board_name: str
    # Numeric board id used in the bbs URL path.
    board_id: int
    # Numeric article id of this post.
    article_id: int
    # Title taken from the detail page, falling back to the list-page title.
    title: str
    # Public, shareable URL of the post (subview.do form).
    post_url: str
    # Author name extracted from the page metadata, or None if not found.
    author: str | None
    # Publication date from the detail page (or the list page), or None.
    published_at: datetime | None
    # Plain-text summary of the content, truncated; None if no content.
    summary: str | None
    # Full plain-text body of the post, or None if no content node matched.
    content_text: str | None
    # Attachment descriptors: [{"name": ..., "url": ...}, ...].
    attachments: list[dict]
|
||||
|
||||
|
||||
class HufsCrawler:
|
||||
def __init__(self) -> None:
|
||||
settings = get_settings()
|
||||
self.base_url = settings.base_url
|
||||
self.max_pages_per_board = settings.max_pages_per_board
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({"User-Agent": settings.user_agent})
|
||||
self.timeout = settings.request_timeout_seconds
|
||||
|
||||
def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
|
||||
board = BOARD_CONFIG[board_key]
|
||||
list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
|
||||
response = self.session.get(
|
||||
list_url,
|
||||
params={"layout": "unknown", "page": page},
|
||||
timeout=self.timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
seen_article_ids: set[int] = set()
|
||||
posts: list[PostStub] = []
|
||||
for anchor in soup.select("a[href*='artclView.do']"):
|
||||
href = anchor.get("href") or ""
|
||||
match = ARTICLE_PATH_RE.search(href)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
article_id = int(match.group("article_id"))
|
||||
if article_id in seen_article_ids:
|
||||
continue
|
||||
seen_article_ids.add(article_id)
|
||||
|
||||
row = anchor.find_parent("tr")
|
||||
posts.append(
|
||||
PostStub(
|
||||
board_key=board_key,
|
||||
board_name=board["board_name"],
|
||||
board_id=board["board_id"],
|
||||
article_id=article_id,
|
||||
title=self._normalize_text(anchor.get_text(" ", strip=True)),
|
||||
post_url=self._build_public_post_url(
|
||||
subview_id=board["subview_id"],
|
||||
board_id=board["board_id"],
|
||||
article_id=article_id,
|
||||
),
|
||||
published_at=self._extract_date_from_row(row),
|
||||
)
|
||||
)
|
||||
|
||||
return posts
|
||||
|
||||
def crawl_post_detail(self, stub: PostStub) -> PostDetail:
|
||||
detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
|
||||
response = self.session.get(detail_url, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
title = self._first_text(
|
||||
soup,
|
||||
[
|
||||
".artclView .title h2",
|
||||
".artclView h2",
|
||||
".view-title",
|
||||
"h2",
|
||||
"h3",
|
||||
],
|
||||
) or stub.title
|
||||
|
||||
content_node = self._first_node(
|
||||
soup,
|
||||
[
|
||||
".artclContents",
|
||||
".fr-view",
|
||||
".view-con",
|
||||
".artcl-view",
|
||||
".bbs--view",
|
||||
"#artclView",
|
||||
],
|
||||
)
|
||||
content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
|
||||
|
||||
author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
|
||||
published_at = self._extract_date_from_soup(soup) or stub.published_at
|
||||
attachments = self._extract_attachments(soup)
|
||||
|
||||
return PostDetail(
|
||||
board_key=stub.board_key,
|
||||
board_name=stub.board_name,
|
||||
board_id=stub.board_id,
|
||||
article_id=stub.article_id,
|
||||
title=title,
|
||||
post_url=stub.post_url,
|
||||
author=author,
|
||||
published_at=published_at,
|
||||
summary=self._make_summary(content_text),
|
||||
content_text=content_text,
|
||||
attachments=attachments,
|
||||
)
|
||||
|
||||
def _build_detail_url(self, board_id: int, article_id: int) -> str:
|
||||
return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"
|
||||
|
||||
def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
|
||||
article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
|
||||
encoded_path = quote(article_path, safe="")
|
||||
enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
|
||||
return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"
|
||||
|
||||
def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
|
||||
attachments: list[dict] = []
|
||||
seen: set[str] = set()
|
||||
for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
|
||||
href = anchor.get("href") or ""
|
||||
name = self._normalize_text(anchor.get_text(" ", strip=True))
|
||||
if not href or not name:
|
||||
continue
|
||||
absolute_url = urljoin(self.base_url, href)
|
||||
if absolute_url in seen:
|
||||
continue
|
||||
seen.add(absolute_url)
|
||||
attachments.append({"name": name, "url": absolute_url})
|
||||
return attachments
|
||||
|
||||
def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
|
||||
for label in labels:
|
||||
label_node = soup.find(string=re.compile(label, re.IGNORECASE))
|
||||
if not label_node:
|
||||
continue
|
||||
parent = label_node.parent if isinstance(label_node.parent, Tag) else None
|
||||
if not parent:
|
||||
continue
|
||||
container = parent.parent if parent.parent else parent
|
||||
candidate_text = self._normalize_text(container.get_text(" ", strip=True))
|
||||
candidate_text = candidate_text.replace(label, "").replace(":", "").strip()
|
||||
if candidate_text:
|
||||
return candidate_text
|
||||
return None
|
||||
|
||||
def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
|
||||
if row is None:
|
||||
return None
|
||||
return self._parse_date(row.get_text(" ", strip=True))
|
||||
|
||||
def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
|
||||
for selector in [".artclInfo", ".view-info", ".info", "body"]:
|
||||
node = soup.select_one(selector)
|
||||
if not node:
|
||||
continue
|
||||
parsed = self._parse_date(node.get_text(" ", strip=True))
|
||||
if parsed:
|
||||
return parsed
|
||||
return None
|
||||
|
||||
def _parse_date(self, text: str | None) -> datetime | None:
|
||||
if not text:
|
||||
return None
|
||||
match = DATE_RE.search(text)
|
||||
if not match:
|
||||
return None
|
||||
year, month, day = map(int, match.groups())
|
||||
return datetime(year, month, day)
|
||||
|
||||
def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
|
||||
if not content_text:
|
||||
return None
|
||||
normalized = " ".join(content_text.split())
|
||||
if len(normalized) <= max_length:
|
||||
return normalized
|
||||
return normalized[: max_length - 3].rstrip() + "..."
|
||||
|
||||
def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
|
||||
for selector in selectors:
|
||||
node = soup.select_one(selector)
|
||||
if node:
|
||||
return node
|
||||
return None
|
||||
|
||||
def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
|
||||
node = self._first_node(soup, selectors)
|
||||
if node:
|
||||
return self._normalize_text(node.get_text(" ", strip=True))
|
||||
return None
|
||||
|
||||
def _normalize_text(self, value: str | None) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
return re.sub(r"\s+", " ", value).strip()
|
||||
Reference in New Issue
Block a user