Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
This commit is contained in:
236
app/crawler.py
Normal file
236
app/crawler.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import re
|
||||
from base64 import b64encode
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Matches article detail paths such as "/bbs/computer/1926/12345/artclView.do"
# and captures the numeric board and article identifiers.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Matches dates like "2024-03-15", "2024.3.5", "2024/03/15" (year limited to
# 20xx; separators -, ., or /).  Note: \d{1,2} does not validate calendar
# ranges, so downstream parsing must tolerate values like month 99.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")


# Boards to crawl, keyed by a short internal name.
#   board_id:   numeric id used in the bbs URL path ("/bbs/computer/<id>/...").
#   board_name: human-readable board title (Korean).
#   subview_id: id used when building the public "subview.do" post URL.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
|
||||
|
||||
|
||||
@dataclass
class PostStub:
    """Lightweight reference to a post, parsed from a board list page.

    Carries just enough information to identify the post and fetch its
    detail page later (see ``HufsCrawler.crawl_post_detail``).
    """

    # Key into BOARD_CONFIG (e.g. "notice", "archive", "jobs").
    board_key: str
    # Human-readable board title (Korean), from BOARD_CONFIG.
    board_name: str
    # Numeric board id used in the bbs URL path.
    board_id: int
    # Numeric article id extracted from the artclView.do link.
    article_id: int
    # Post title text from the list-page anchor (whitespace-normalized).
    title: str
    # Public, shareable URL of the post (subview.do form).
    post_url: str
    # Publication date parsed from the list row, or None if not found.
    published_at: datetime | None
|
||||
|
||||
|
||||
@dataclass
class PostDetail:
    """Full post record, parsed from an article detail page."""

    # Key into BOARD_CONFIG (e.g. "notice", "archive", "jobs").
    board_key: str
    # Human-readable board title (Korean), from BOARD_CONFIG.
    board_name: str
    # Numeric board id used in the bbs URL path.
    board_id: int
    # Numeric article id of this post.
    article_id: int
    # Title taken from the detail page, falling back to the list-page title.
    title: str
    # Public, shareable URL of the post (subview.do form).
    post_url: str
    # Author name extracted from the page metadata, or None if not found.
    author: str | None
    # Publication date from the detail page (or the list page), or None.
    published_at: datetime | None
    # Plain-text summary of the content, truncated; None if no content.
    summary: str | None
    # Full plain-text body of the post, or None if no content node matched.
    content_text: str | None
    # Attachment descriptors: [{"name": ..., "url": ...}, ...].
    attachments: list[dict]
|
||||
|
||||
|
||||
class HufsCrawler:
|
||||
def __init__(self) -> None:
|
||||
settings = get_settings()
|
||||
self.base_url = settings.base_url
|
||||
self.max_pages_per_board = settings.max_pages_per_board
|
||||
self.session = requests.Session()
|
||||
self.session.headers.update({"User-Agent": settings.user_agent})
|
||||
self.timeout = settings.request_timeout_seconds
|
||||
|
||||
def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
|
||||
board = BOARD_CONFIG[board_key]
|
||||
list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
|
||||
response = self.session.get(
|
||||
list_url,
|
||||
params={"layout": "unknown", "page": page},
|
||||
timeout=self.timeout,
|
||||
)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
seen_article_ids: set[int] = set()
|
||||
posts: list[PostStub] = []
|
||||
for anchor in soup.select("a[href*='artclView.do']"):
|
||||
href = anchor.get("href") or ""
|
||||
match = ARTICLE_PATH_RE.search(href)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
article_id = int(match.group("article_id"))
|
||||
if article_id in seen_article_ids:
|
||||
continue
|
||||
seen_article_ids.add(article_id)
|
||||
|
||||
row = anchor.find_parent("tr")
|
||||
posts.append(
|
||||
PostStub(
|
||||
board_key=board_key,
|
||||
board_name=board["board_name"],
|
||||
board_id=board["board_id"],
|
||||
article_id=article_id,
|
||||
title=self._normalize_text(anchor.get_text(" ", strip=True)),
|
||||
post_url=self._build_public_post_url(
|
||||
subview_id=board["subview_id"],
|
||||
board_id=board["board_id"],
|
||||
article_id=article_id,
|
||||
),
|
||||
published_at=self._extract_date_from_row(row),
|
||||
)
|
||||
)
|
||||
|
||||
return posts
|
||||
|
||||
def crawl_post_detail(self, stub: PostStub) -> PostDetail:
|
||||
detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
|
||||
response = self.session.get(detail_url, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
soup = BeautifulSoup(response.text, "html.parser")
|
||||
|
||||
title = self._first_text(
|
||||
soup,
|
||||
[
|
||||
".artclView .title h2",
|
||||
".artclView h2",
|
||||
".view-title",
|
||||
"h2",
|
||||
"h3",
|
||||
],
|
||||
) or stub.title
|
||||
|
||||
content_node = self._first_node(
|
||||
soup,
|
||||
[
|
||||
".artclContents",
|
||||
".fr-view",
|
||||
".view-con",
|
||||
".artcl-view",
|
||||
".bbs--view",
|
||||
"#artclView",
|
||||
],
|
||||
)
|
||||
content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
|
||||
|
||||
author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
|
||||
published_at = self._extract_date_from_soup(soup) or stub.published_at
|
||||
attachments = self._extract_attachments(soup)
|
||||
|
||||
return PostDetail(
|
||||
board_key=stub.board_key,
|
||||
board_name=stub.board_name,
|
||||
board_id=stub.board_id,
|
||||
article_id=stub.article_id,
|
||||
title=title,
|
||||
post_url=stub.post_url,
|
||||
author=author,
|
||||
published_at=published_at,
|
||||
summary=self._make_summary(content_text),
|
||||
content_text=content_text,
|
||||
attachments=attachments,
|
||||
)
|
||||
|
||||
def _build_detail_url(self, board_id: int, article_id: int) -> str:
|
||||
return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"
|
||||
|
||||
def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
|
||||
article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
|
||||
encoded_path = quote(article_path, safe="")
|
||||
enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
|
||||
return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"
|
||||
|
||||
def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
|
||||
attachments: list[dict] = []
|
||||
seen: set[str] = set()
|
||||
for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
|
||||
href = anchor.get("href") or ""
|
||||
name = self._normalize_text(anchor.get_text(" ", strip=True))
|
||||
if not href or not name:
|
||||
continue
|
||||
absolute_url = urljoin(self.base_url, href)
|
||||
if absolute_url in seen:
|
||||
continue
|
||||
seen.add(absolute_url)
|
||||
attachments.append({"name": name, "url": absolute_url})
|
||||
return attachments
|
||||
|
||||
def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
|
||||
for label in labels:
|
||||
label_node = soup.find(string=re.compile(label, re.IGNORECASE))
|
||||
if not label_node:
|
||||
continue
|
||||
parent = label_node.parent if isinstance(label_node.parent, Tag) else None
|
||||
if not parent:
|
||||
continue
|
||||
container = parent.parent if parent.parent else parent
|
||||
candidate_text = self._normalize_text(container.get_text(" ", strip=True))
|
||||
candidate_text = candidate_text.replace(label, "").replace(":", "").strip()
|
||||
if candidate_text:
|
||||
return candidate_text
|
||||
return None
|
||||
|
||||
def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
|
||||
if row is None:
|
||||
return None
|
||||
return self._parse_date(row.get_text(" ", strip=True))
|
||||
|
||||
def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
|
||||
for selector in [".artclInfo", ".view-info", ".info", "body"]:
|
||||
node = soup.select_one(selector)
|
||||
if not node:
|
||||
continue
|
||||
parsed = self._parse_date(node.get_text(" ", strip=True))
|
||||
if parsed:
|
||||
return parsed
|
||||
return None
|
||||
|
||||
def _parse_date(self, text: str | None) -> datetime | None:
|
||||
if not text:
|
||||
return None
|
||||
match = DATE_RE.search(text)
|
||||
if not match:
|
||||
return None
|
||||
year, month, day = map(int, match.groups())
|
||||
return datetime(year, month, day)
|
||||
|
||||
def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
|
||||
if not content_text:
|
||||
return None
|
||||
normalized = " ".join(content_text.split())
|
||||
if len(normalized) <= max_length:
|
||||
return normalized
|
||||
return normalized[: max_length - 3].rstrip() + "..."
|
||||
|
||||
def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
|
||||
for selector in selectors:
|
||||
node = soup.select_one(selector)
|
||||
if node:
|
||||
return node
|
||||
return None
|
||||
|
||||
def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
|
||||
node = self._first_node(soup, selectors)
|
||||
if node:
|
||||
return self._normalize_text(node.get_text(" ", strip=True))
|
||||
return None
|
||||
|
||||
def _normalize_text(self, value: str | None) -> str:
|
||||
if not value:
|
||||
return ""
|
||||
return re.sub(r"\s+", " ", value).strip()
|
||||
Reference in New Issue
Block a user