Files
hufs-notice-crawler/app/crawler.py
nkey ca460453af
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
2026-03-17 17:18:16 +09:00

237 lines
8.3 KiB
Python

import re
from base64 import b64encode
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import quote, urljoin
import requests
from bs4 import BeautifulSoup, Tag
from app.config import get_settings
# Matches article-detail paths such as /bbs/computer/1926/12345/artclView.do,
# capturing the numeric board and article identifiers.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Loose date matcher for 21st-century dates with -, ., or / separators
# (e.g. "2024-03-17", "2024.3.7"). Month/day are not range-checked here.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")
# Boards scraped from the HUFS computer-science site. `board_id` is the BBS id
# used in raw artclList/artclView URLs; `subview_id` is the public page id used
# when building shareable subview.do links.
BOARD_CONFIG = {
"notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
"archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
"jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
@dataclass
class PostStub:
board_key: str
board_name: str
board_id: int
article_id: int
title: str
post_url: str
published_at: datetime | None
@dataclass
class PostDetail:
board_key: str
board_name: str
board_id: int
article_id: int
title: str
post_url: str
author: str | None
published_at: datetime | None
summary: str | None
content_text: str | None
attachments: list[dict]
class HufsCrawler:
def __init__(self) -> None:
settings = get_settings()
self.base_url = settings.base_url
self.max_pages_per_board = settings.max_pages_per_board
self.session = requests.Session()
self.session.headers.update({"User-Agent": settings.user_agent})
self.timeout = settings.request_timeout_seconds
def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
board = BOARD_CONFIG[board_key]
list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
response = self.session.get(
list_url,
params={"layout": "unknown", "page": page},
timeout=self.timeout,
)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
seen_article_ids: set[int] = set()
posts: list[PostStub] = []
for anchor in soup.select("a[href*='artclView.do']"):
href = anchor.get("href") or ""
match = ARTICLE_PATH_RE.search(href)
if not match:
continue
article_id = int(match.group("article_id"))
if article_id in seen_article_ids:
continue
seen_article_ids.add(article_id)
row = anchor.find_parent("tr")
posts.append(
PostStub(
board_key=board_key,
board_name=board["board_name"],
board_id=board["board_id"],
article_id=article_id,
title=self._normalize_text(anchor.get_text(" ", strip=True)),
post_url=self._build_public_post_url(
subview_id=board["subview_id"],
board_id=board["board_id"],
article_id=article_id,
),
published_at=self._extract_date_from_row(row),
)
)
return posts
def crawl_post_detail(self, stub: PostStub) -> PostDetail:
detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
response = self.session.get(detail_url, timeout=self.timeout)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
title = self._first_text(
soup,
[
".artclView .title h2",
".artclView h2",
".view-title",
"h2",
"h3",
],
) or stub.title
content_node = self._first_node(
soup,
[
".artclContents",
".fr-view",
".view-con",
".artcl-view",
".bbs--view",
"#artclView",
],
)
content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
published_at = self._extract_date_from_soup(soup) or stub.published_at
attachments = self._extract_attachments(soup)
return PostDetail(
board_key=stub.board_key,
board_name=stub.board_name,
board_id=stub.board_id,
article_id=stub.article_id,
title=title,
post_url=stub.post_url,
author=author,
published_at=published_at,
summary=self._make_summary(content_text),
content_text=content_text,
attachments=attachments,
)
def _build_detail_url(self, board_id: int, article_id: int) -> str:
return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"
def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
encoded_path = quote(article_path, safe="")
enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"
def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
attachments: list[dict] = []
seen: set[str] = set()
for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
href = anchor.get("href") or ""
name = self._normalize_text(anchor.get_text(" ", strip=True))
if not href or not name:
continue
absolute_url = urljoin(self.base_url, href)
if absolute_url in seen:
continue
seen.add(absolute_url)
attachments.append({"name": name, "url": absolute_url})
return attachments
def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
for label in labels:
label_node = soup.find(string=re.compile(label, re.IGNORECASE))
if not label_node:
continue
parent = label_node.parent if isinstance(label_node.parent, Tag) else None
if not parent:
continue
container = parent.parent if parent.parent else parent
candidate_text = self._normalize_text(container.get_text(" ", strip=True))
candidate_text = candidate_text.replace(label, "").replace(":", "").strip()
if candidate_text:
return candidate_text
return None
def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
if row is None:
return None
return self._parse_date(row.get_text(" ", strip=True))
def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
for selector in [".artclInfo", ".view-info", ".info", "body"]:
node = soup.select_one(selector)
if not node:
continue
parsed = self._parse_date(node.get_text(" ", strip=True))
if parsed:
return parsed
return None
def _parse_date(self, text: str | None) -> datetime | None:
if not text:
return None
match = DATE_RE.search(text)
if not match:
return None
year, month, day = map(int, match.groups())
return datetime(year, month, day)
def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
if not content_text:
return None
normalized = " ".join(content_text.split())
if len(normalized) <= max_length:
return normalized
return normalized[: max_length - 3].rstrip() + "..."
def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
for selector in selectors:
node = soup.select_one(selector)
if node:
return node
return None
def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
node = self._first_node(soup, selectors)
if node:
return self._normalize_text(node.get_text(" ", strip=True))
return None
def _normalize_text(self, value: str | None) -> str:
if not value:
return ""
return re.sub(r"\s+", " ", value).strip()