Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s

This commit is contained in:
2026-03-17 17:18:16 +09:00
commit ca460453af
23 changed files with 1959 additions and 0 deletions

236
app/crawler.py Normal file
View File

@@ -0,0 +1,236 @@
import re
from base64 import b64encode
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import quote, urljoin
import requests
from bs4 import BeautifulSoup, Tag
from app.config import get_settings
# Article detail-view path; captures the numeric board id and article id.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Dates like "2026-03-17", "2026.3.17", or "2026/03/17" (years 2000-2099 only).
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")
# Boards to crawl: site-internal numeric board id, human-readable Korean name,
# and the "subview" id used when building public-facing post URLs.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
@dataclass
class PostStub:
    """Minimal post information extracted from a board list page."""

    board_key: str  # key into BOARD_CONFIG ("notice" / "archive" / "jobs")
    board_name: str  # human-readable board name from BOARD_CONFIG
    board_id: int  # site-internal numeric board id
    article_id: int  # numeric article id parsed from the detail-view URL
    title: str  # whitespace-normalized anchor text of the list link
    post_url: str  # public-facing URL (subview.do with base64 "enc" parameter)
    published_at: datetime | None  # date parsed from the list row, if any
@dataclass
class PostDetail:
    """Full post information extracted from an article detail page."""

    board_key: str  # key into BOARD_CONFIG (copied from the stub)
    board_name: str  # human-readable board name
    board_id: int  # site-internal numeric board id
    article_id: int  # numeric article id
    title: str  # title from the detail page, falling back to the stub title
    post_url: str  # public-facing URL copied from the stub
    author: str | None  # text found next to a writer/author label, if any
    published_at: datetime | None  # detail-page date, falling back to the stub
    summary: str | None  # content truncated to a short single-line summary
    content_text: str | None  # normalized full text of the article body
    attachments: list[dict]  # [{"name": ..., "url": ...}] download links
class HufsCrawler:
    """Crawler for the HUFS computer-science department notice boards.

    Fetches board list pages and article detail pages over a shared
    ``requests.Session`` (configured from application settings) and parses
    them with BeautifulSoup into :class:`PostStub` / :class:`PostDetail`.
    """

    def __init__(self) -> None:
        settings = get_settings()
        # Base site URL and request knobs all come from app configuration.
        self.base_url = settings.base_url
        self.max_pages_per_board = settings.max_pages_per_board
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": settings.user_agent})
        self.timeout = settings.request_timeout_seconds

    def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
        """Fetch one page of a board's article list and return post stubs.

        Args:
            board_key: Key into ``BOARD_CONFIG`` (e.g. ``"notice"``).
            page: 1-based page number of the list view.

        Returns:
            De-duplicated ``PostStub`` entries in document order.

        Raises:
            KeyError: If ``board_key`` is not in ``BOARD_CONFIG``.
            requests.HTTPError: On a non-2xx HTTP response.
        """
        board = BOARD_CONFIG[board_key]
        list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
        response = self.session.get(
            list_url,
            params={"layout": "unknown", "page": page},
            timeout=self.timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        seen_article_ids: set[int] = set()
        posts: list[PostStub] = []
        for anchor in soup.select("a[href*='artclView.do']"):
            href = anchor.get("href") or ""
            match = ARTICLE_PATH_RE.search(href)
            if not match:
                continue
            article_id = int(match.group("article_id"))
            # The same article may be linked more than once in a list page;
            # keep only the first occurrence.
            if article_id in seen_article_ids:
                continue
            seen_article_ids.add(article_id)
            # The enclosing table row (if any) usually carries the post date.
            row = anchor.find_parent("tr")
            posts.append(
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=article_id,
                    title=self._normalize_text(anchor.get_text(" ", strip=True)),
                    post_url=self._build_public_post_url(
                        subview_id=board["subview_id"],
                        board_id=board["board_id"],
                        article_id=article_id,
                    ),
                    published_at=self._extract_date_from_row(row),
                )
            )
        return posts

    def crawl_post_detail(self, stub: PostStub) -> PostDetail:
        """Fetch an article detail page and return the full post data.

        Fields that cannot be parsed from the detail page fall back to the
        values already present on ``stub``.

        Raises:
            requests.HTTPError: On a non-2xx HTTP response.
        """
        detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
        response = self.session.get(detail_url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Selector lists are ordered most-specific first; the first match wins.
        title = self._first_text(
            soup,
            [
                ".artclView .title h2",
                ".artclView h2",
                ".view-title",
                "h2",
                "h3",
            ],
        ) or stub.title
        content_node = self._first_node(
            soup,
            [
                ".artclContents",
                ".fr-view",
                ".view-con",
                ".artcl-view",
                ".bbs--view",
                "#artclView",
            ],
        )
        content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
        author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
        published_at = self._extract_date_from_soup(soup) or stub.published_at
        attachments = self._extract_attachments(soup)
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=title,
            post_url=stub.post_url,
            author=author,
            published_at=published_at,
            summary=self._make_summary(content_text),
            content_text=content_text,
            attachments=attachments,
        )

    def _build_detail_url(self, board_id: int, article_id: int) -> str:
        """Return the raw (non-public) article detail URL used for crawling."""
        return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"

    def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
        """Return the public-facing post URL.

        The site expects the article path percent-encoded, prefixed with
        "fnct1|@@|", base64-encoded, and passed as the ``enc`` query value
        of the board's subview page.
        """
        article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
        encoded_path = quote(article_path, safe="")
        enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
        return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"

    def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
        """Collect attachment download links as [{"name", "url"}] dicts.

        Links are matched by common download-href patterns, resolved to
        absolute URLs, and de-duplicated by URL.
        """
        attachments: list[dict] = []
        seen: set[str] = set()
        for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
            href = anchor.get("href") or ""
            name = self._normalize_text(anchor.get_text(" ", strip=True))
            if not href or not name:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url in seen:
                continue
            seen.add(absolute_url)
            attachments.append({"name": name, "url": absolute_url})
        return attachments

    def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
        """Return the text found next to the first matching metadata label.

        The label is located case-insensitively, so it must also be stripped
        case-insensitively: the previous case-sensitive ``str.replace`` left
        the label in the result when the page's casing differed (e.g.
        "Writer" matched by label "writer").
        """
        for label in labels:
            label_node = soup.find(string=re.compile(label, re.IGNORECASE))
            if not label_node:
                continue
            parent = label_node.parent if isinstance(label_node.parent, Tag) else None
            if not parent:
                continue
            # The value usually sits in a sibling node, so read the
            # grandparent's text and strip the label out of it.
            container = parent.parent if parent.parent else parent
            candidate_text = self._normalize_text(container.get_text(" ", strip=True))
            # Strip the label the same way it was matched: case-insensitively.
            candidate_text = re.sub(re.escape(label), "", candidate_text, flags=re.IGNORECASE)
            candidate_text = candidate_text.replace(":", "").strip()
            if candidate_text:
                return candidate_text
        return None

    def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
        """Parse a date from a list-table row, if the row exists."""
        if row is None:
            return None
        return self._parse_date(row.get_text(" ", strip=True))

    def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
        """Parse a date from the detail page, trying info blocks before body."""
        for selector in [".artclInfo", ".view-info", ".info", "body"]:
            node = soup.select_one(selector)
            if not node:
                continue
            parsed = self._parse_date(node.get_text(" ", strip=True))
            if parsed:
                return parsed
        return None

    def _parse_date(self, text: str | None) -> datetime | None:
        """Parse the first YYYY-MM-DD-style date in *text* (naive datetime)."""
        if not text:
            return None
        match = DATE_RE.search(text)
        if not match:
            return None
        year, month, day = map(int, match.groups())
        return datetime(year, month, day)

    def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
        """Collapse whitespace and truncate to *max_length* with an ellipsis."""
        if not content_text:
            return None
        normalized = " ".join(content_text.split())
        if len(normalized) <= max_length:
            return normalized
        # Reserve three characters for the "..." suffix.
        return normalized[: max_length - 3].rstrip() + "..."

    def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
        """Return the first node matching any selector, in selector order."""
        for selector in selectors:
            node = soup.select_one(selector)
            if node:
                return node
        return None

    def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
        """Return normalized text of the first matching node, if any."""
        node = self._first_node(soup, selectors)
        if node:
            return self._normalize_text(node.get_text(" ", strip=True))
        return None

    def _normalize_text(self, value: str | None) -> str:
        """Collapse all runs of whitespace to single spaces and strip ends."""
        if not value:
            return ""
        return re.sub(r"\s+", " ", value).strip()