Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s

This commit is contained in:
2026-03-17 17:18:16 +09:00
commit ca460453af
23 changed files with 1959 additions and 0 deletions

1
app/__init__.py Normal file
View File

@@ -0,0 +1 @@

30
app/config.py Normal file
View File

@@ -0,0 +1,30 @@
from functools import lru_cache
from pydantic import Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Application settings, loaded from environment variables and `.env`."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",  # unknown environment variables are ignored, not rejected
    )
    app_name: str = "hufs-notice-crawler"
    app_env: str = "development"
    # SQLAlchemy connection URL; the default targets a local PostgreSQL via psycopg.
    database_url: str = Field(
        default="postgresql+psycopg://postgres:postgres@localhost:5432/hufs_notice_crawler"
    )
    # Root of the HUFS CS department site that gets crawled.
    base_url: str = "https://computer.hufs.ac.kr"
    request_timeout_seconds: float = 15.0
    max_pages_per_board: int = 5
    # Browser-like User-Agent sent with every crawl request.
    user_agent: str = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the Settings object once; later calls return the cached instance."""
    settings = Settings()
    return settings

236
app/crawler.py Normal file
View File

@@ -0,0 +1,236 @@
import re
from base64 import b64encode
from dataclasses import dataclass
from datetime import datetime
from urllib.parse import quote, urljoin
import requests
from bs4 import BeautifulSoup, Tag
from app.config import get_settings
# Matches article-detail paths such as /bbs/computer/1926/12345/artclView.do,
# capturing the numeric board and article identifiers.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Loose date matcher for strings like "2024-03-17", "2024.3.7" or "2024/03/17".
# Components are not range-checked here.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")
# Boards to crawl: the site-internal bbs board id plus the public subview id
# used to build shareable post URLs. board_name holds the Korean board title.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
@dataclass
class PostStub:
    """A post as seen on a board list page, before its detail page is fetched."""

    board_key: str  # key into BOARD_CONFIG ("notice" / "archive" / "jobs")
    board_name: str  # human-readable board title
    board_id: int  # site-internal bbs board id
    article_id: int  # site-internal article id
    title: str  # normalized anchor text from the list row
    post_url: str  # public subview URL for the post
    published_at: datetime | None  # date parsed from the list row, if any
@dataclass
class PostDetail:
    """Fully parsed article detail, built from a PostStub plus its detail page."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str  # detail-page title, falling back to the stub title
    post_url: str
    author: str | None  # extracted from the author/writer meta field, if found
    published_at: datetime | None
    summary: str | None  # truncated, whitespace-collapsed summary
    content_text: str | None  # normalized full text of the article body
    attachments: list[dict]  # [{"name": ..., "url": ...}, ...]
class HufsCrawler:
    """Scraper for the HUFS computer-science department bulletin boards.

    Board list pages and article detail pages are fetched through one shared
    ``requests.Session`` and parsed with BeautifulSoup into ``PostStub`` /
    ``PostDetail`` values.
    """

    def __init__(self) -> None:
        settings = get_settings()
        self.base_url = settings.base_url
        self.max_pages_per_board = settings.max_pages_per_board
        # One pooled session for every request; carries the configured
        # browser-like User-Agent header.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": settings.user_agent})
        self.timeout = settings.request_timeout_seconds

    def close(self) -> None:
        """Close the HTTP session and release its pooled connections.

        (The session was previously never closed; call this when the
        crawler is no longer needed.)
        """
        self.session.close()

    def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
        """Return the posts linked from one list page of *board_key*.

        Raises ``KeyError`` for an unknown board key and
        ``requests.HTTPError`` for a non-2xx response. Multiple anchors
        pointing at the same article id are collapsed to the first occurrence.
        """
        board = BOARD_CONFIG[board_key]
        list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
        # NOTE(review): layout="unknown" is sent verbatim; presumably it selects
        # a plain list rendering — confirm against the live site.
        response = self.session.get(
            list_url,
            params={"layout": "unknown", "page": page},
            timeout=self.timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        seen_article_ids: set[int] = set()
        posts: list[PostStub] = []
        for anchor in soup.select("a[href*='artclView.do']"):
            href = anchor.get("href") or ""
            match = ARTICLE_PATH_RE.search(href)
            if not match:
                continue
            article_id = int(match.group("article_id"))
            if article_id in seen_article_ids:
                continue
            seen_article_ids.add(article_id)
            # The surrounding table row (if any) is scanned for the post date.
            row = anchor.find_parent("tr")
            posts.append(
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=article_id,
                    title=self._normalize_text(anchor.get_text(" ", strip=True)),
                    post_url=self._build_public_post_url(
                        subview_id=board["subview_id"],
                        board_id=board["board_id"],
                        article_id=article_id,
                    ),
                    published_at=self._extract_date_from_row(row),
                )
            )
        return posts

    def crawl_post_detail(self, stub: PostStub) -> PostDetail:
        """Fetch and parse the detail page for *stub*.

        Falls back to the stub's title / published_at when the detail page
        does not yield them.
        """
        detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
        response = self.session.get(detail_url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Try increasingly generic selectors; fall back to the list-page title.
        title = self._first_text(
            soup,
            [
                ".artclView .title h2",
                ".artclView h2",
                ".view-title",
                "h2",
                "h3",
            ],
        ) or stub.title
        content_node = self._first_node(
            soup,
            [
                ".artclContents",
                ".fr-view",
                ".view-con",
                ".artcl-view",
                ".bbs--view",
                "#artclView",
            ],
        )
        content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None
        author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
        published_at = self._extract_date_from_soup(soup) or stub.published_at
        attachments = self._extract_attachments(soup)
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=title,
            post_url=stub.post_url,
            author=author,
            published_at=published_at,
            summary=self._make_summary(content_text),
            content_text=content_text,
            attachments=attachments,
        )

    def _build_detail_url(self, board_id: int, article_id: int) -> str:
        """Direct (non-subview) URL of an article's detail page."""
        return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"

    def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
        """Build the shareable subview URL for an article.

        The site embeds the article path in an ``enc`` query parameter as
        base64 of ``fnct1|@@|<percent-encoded path>``; this reproduces that
        scheme so generated links open inside the normal board page.
        """
        article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
        encoded_path = quote(article_path, safe="")
        enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
        return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"

    def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
        """Collect {"name", "url"} dicts for download-looking links,
        de-duplicated by absolute URL; anchors without text are skipped."""
        attachments: list[dict] = []
        seen: set[str] = set()
        for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
            href = anchor.get("href") or ""
            name = self._normalize_text(anchor.get_text(" ", strip=True))
            if not href or not name:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url in seen:
                continue
            seen.add(absolute_url)
            attachments.append({"name": name, "url": absolute_url})
        return attachments

    def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
        """Return the text adjacent to the first matching label, or None.

        Each label is used as a case-insensitive regex to locate the label
        node, then stripped *literally* from the surrounding container text.
        """
        for label in labels:
            label_node = soup.find(string=re.compile(label, re.IGNORECASE))
            if not label_node:
                continue
            parent = label_node.parent if isinstance(label_node.parent, Tag) else None
            if not parent:
                continue
            # Grandparent usually holds both the label cell and the value cell.
            container = parent.parent if parent.parent else parent
            candidate_text = self._normalize_text(container.get_text(" ", strip=True))
            candidate_text = candidate_text.replace(label, "").replace(":", "").strip()
            if candidate_text:
                return candidate_text
        return None

    def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
        """Parse a date out of a list-table row; None when no row was found."""
        if row is None:
            return None
        return self._parse_date(row.get_text(" ", strip=True))

    def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
        """Parse a date from the detail page, trying the info blocks first
        and the whole body last."""
        for selector in [".artclInfo", ".view-info", ".info", "body"]:
            node = soup.select_one(selector)
            if not node:
                continue
            parsed = self._parse_date(node.get_text(" ", strip=True))
            if parsed:
                return parsed
        return None

    def _parse_date(self, text: str | None) -> datetime | None:
        """Return the first 20xx-mm-dd-style date in *text* as a naive
        datetime, or None when absent or out of range."""
        if not text:
            return None
        match = DATE_RE.search(text)
        if not match:
            return None
        year, month, day = map(int, match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # DATE_RE accepts any 1-2 digit month/day (e.g. "2024-13-45"),
            # which previously crashed datetime(); treat it as "no date".
            return None

    def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
        """Whitespace-collapsed summary truncated to *max_length* characters
        (the trailing ellipsis counts toward the limit)."""
        if not content_text:
            return None
        normalized = " ".join(content_text.split())
        if len(normalized) <= max_length:
            return normalized
        return normalized[: max_length - 3].rstrip() + "..."

    def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
        """First node matching any selector, in selector priority order."""
        for selector in selectors:
            node = soup.select_one(selector)
            if node:
                return node
        return None

    def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
        """Normalized text of the first matching node, or None."""
        node = self._first_node(soup, selectors)
        if node:
            return self._normalize_text(node.get_text(" ", strip=True))
        return None

    def _normalize_text(self, value: str | None) -> str:
        """Collapse whitespace runs to single spaces; '' for falsy input."""
        if not value:
            return ""
        return re.sub(r"\s+", " ", value).strip()

33
app/db.py Normal file
View File

@@ -0,0 +1,33 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import DeclarativeBase, sessionmaker
from app.config import get_settings
class Base(DeclarativeBase):
    """Declarative base shared by every ORM model in the application."""
    pass
settings = get_settings()
# pool_pre_ping guards against stale pooled connections being handed out
# after the database restarts or drops idle connections.
engine = create_engine(
    settings.database_url,
    future=True,
    pool_pre_ping=True,
)
# Session factory: explicit flush/commit only; expire_on_commit=False keeps
# returned ORM objects readable after the session commits.
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
    future=True,
    expire_on_commit=False,
)
def get_db():
    """FastAPI dependency: yield a database session, always closing it after use."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()

30
app/main.py Normal file
View File

@@ -0,0 +1,30 @@
from contextlib import asynccontextmanager
from fastapi import Depends, FastAPI
from sqlalchemy.orm import Session
from app.config import get_settings
from app.db import Base, engine, get_db
from app.schemas import CrawlResponse
from app.service import CrawlService
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Create any missing tables at startup; nothing to tear down on shutdown."""
    Base.metadata.create_all(bind=engine)
    yield
settings = get_settings()
# The FastAPI application; the lifespan hook creates missing tables at startup.
app = FastAPI(title=settings.app_name, lifespan=lifespan)
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe endpoint."""
    payload = {"status": "ok"}
    return payload
@app.post("/api/v1/crawl", response_model=CrawlResponse)
def crawl_notices(db: Session = Depends(get_db)) -> CrawlResponse:
    """Run one crawl pass over every configured board and return its summary."""
    return CrawlService(db).crawl_new_posts()

51
app/models.py Normal file
View File

@@ -0,0 +1,51 @@
from datetime import datetime
from sqlalchemy import JSON, CheckConstraint, DateTime, Integer, String, Text, UniqueConstraint, func
from sqlalchemy.orm import Mapped, mapped_column
from app.db import Base
class ScrapedPost(Base):
    """A crawled bulletin-board post; unique per (board_key, article_id)."""

    __tablename__ = "scraped_posts"
    __table_args__ = (
        UniqueConstraint("board_key", "article_id", name="uq_scraped_posts_board_article"),
        # Restrict board_key to the boards the crawler knows about.
        CheckConstraint(
            "board_key IN ('notice', 'archive', 'jobs')",
            name="ck_scraped_posts_board_key",
        ),
    )
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    board_key: Mapped[str] = mapped_column(String(32), nullable=False)
    board_name: Mapped[str] = mapped_column(String(100), nullable=False)
    board_id: Mapped[int] = mapped_column(Integer, nullable=False)
    article_id: Mapped[int] = mapped_column(Integer, nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    post_url: Mapped[str] = mapped_column(Text, nullable=False)
    author: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Datetimes are stored naive (DateTime(timezone=False)).
    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # JSON list of {"name": ..., "url": ...} attachment dicts.
    attachments: Mapped[list[dict]] = mapped_column(JSON, nullable=False, default=list)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
class CrawlRun(Base):
    """Audit record for one crawl pass: status, counts, timing and error."""

    __tablename__ = "crawl_runs"
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    # "running" while in progress, then "success" or "failed".
    status: Mapped[str] = mapped_column(String(20), nullable=False, default="running")
    discovered_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    inserted_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

41
app/schemas.py Normal file
View File

@@ -0,0 +1,41 @@
from datetime import datetime
from pydantic import BaseModel
class AttachmentOut(BaseModel):
    """A single file attachment on a post."""

    name: str  # link text as shown on the page
    url: str  # absolute download URL
class PostOut(BaseModel):
    """Full representation of a crawled post, as returned by the API."""

    board_key: str  # "notice" / "archive" / "jobs"
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    summary: str | None  # truncated plain-text summary of content_text
    content_text: str | None
    attachments: list[AttachmentOut]
class LatestBoardPostOut(BaseModel):
    """Newest post seen on a board's first list page during a crawl."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    published_at: datetime | None
class CrawlResponse(BaseModel):
    """Result of one crawl pass across all configured boards."""

    checked_at: datetime  # when the crawl completed
    bootstrap_mode: bool  # True on the very first crawl (empty database)
    bootstrap_inserted_count: int  # rows stored during bootstrap, else 0
    new_posts_count: int  # new posts found (0 in bootstrap mode)
    new_posts: list[PostOut]  # empty in bootstrap mode
    latest_posts_by_board: list[LatestBoardPostOut]  # populated only when no new posts

142
app/service.py Normal file
View File

@@ -0,0 +1,142 @@
from datetime import UTC, datetime
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.crawler import BOARD_CONFIG, HufsCrawler, PostStub
from app.models import CrawlRun, ScrapedPost
from app.schemas import AttachmentOut, CrawlResponse, LatestBoardPostOut, PostOut
class CrawlService:
    """Runs one crawl pass: finds new posts, stores them, records the run."""

    def __init__(self, db: Session) -> None:
        self.db = db
        self.crawler = HufsCrawler()

    def crawl_new_posts(self) -> CrawlResponse:
        """Crawl every configured board and summarise the result.

        On the very first run ("bootstrap", empty scraped_posts table), all
        discovered posts are stored but reported only through
        bootstrap_inserted_count, not as new posts. A CrawlRun row records
        the outcome; on failure it is marked "failed" and the exception is
        re-raised (posts committed before the failure are kept).
        """
        bootstrap_mode = self._is_bootstrap_mode()
        run = CrawlRun(status="running", discovered_count=0, inserted_count=0)
        self.db.add(run)
        self.db.commit()
        self.db.refresh(run)
        inserted_posts: list[ScrapedPost] = []
        latest_posts_by_board_map: dict[str, PostStub] = {}
        try:
            for board_key in BOARD_CONFIG:
                board_inserted_posts, latest_stub = self._crawl_board(board_key)
                inserted_posts.extend(board_inserted_posts)
                if latest_stub is not None:
                    latest_posts_by_board_map[board_key] = latest_stub
            run.status = "success"
            run.discovered_count = len(inserted_posts)
            run.inserted_count = len(inserted_posts)
            # Stored naive to match the DateTime(timezone=False) columns.
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
        except Exception as exc:
            run.status = "failed"
            run.error_message = str(exc)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
            raise
        # In bootstrap mode the inserted posts are not surfaced as "new".
        response_posts = [] if bootstrap_mode else inserted_posts
        # Latest-post-per-board is reported only when there are no new posts
        # to show — which is always the case in bootstrap mode, where new
        # posts are suppressed above.
        response_latest_posts = (
            list(latest_posts_by_board_map.values())
            if (0 if bootstrap_mode else len(inserted_posts)) == 0
            else []
        )
        return CrawlResponse(
            checked_at=datetime.now(UTC),
            bootstrap_mode=bootstrap_mode,
            bootstrap_inserted_count=len(inserted_posts) if bootstrap_mode else 0,
            new_posts_count=0 if bootstrap_mode else len(inserted_posts),
            new_posts=[
                PostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    author=post.author,
                    published_at=post.published_at,
                    summary=post.summary,
                    content_text=post.content_text,
                    attachments=[
                        AttachmentOut(name=item["name"], url=item["url"])
                        for item in (post.attachments or [])
                    ],
                )
                for post in response_posts
            ],
            latest_posts_by_board=[
                LatestBoardPostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    published_at=post.published_at,
                )
                for post in response_latest_posts
            ],
        )

    def _is_bootstrap_mode(self) -> bool:
        """True when scraped_posts is empty, i.e. this is the first ever run."""
        first_saved_post = self.db.scalar(select(ScrapedPost.id).limit(1))
        return first_saved_post is None

    def _crawl_board(self, board_key: str) -> tuple[list[ScrapedPost], PostStub | None]:
        """Crawl one board and insert the posts not yet in the database.

        Returns the inserted rows and the newest post seen on page 1 (None
        when the board yielded nothing). Fetches up to max_pages_per_board
        pages, stopping early at the first empty page. Candidates are
        inserted oldest-first, one commit each, so a mid-board failure keeps
        the posts inserted so far.
        """
        candidates: list[PostStub] = []
        latest_stub: PostStub | None = None
        known_article_ids = {
            article_id
            for article_id in self.db.scalars(
                select(ScrapedPost.article_id).where(ScrapedPost.board_key == board_key)
            )
        }
        seen_article_ids: set[int] = set()
        for page in range(1, self.crawler.max_pages_per_board + 1):
            page_posts = self.crawler.crawl_board_list(board_key=board_key, page=page)
            if not page_posts:
                break
            # First entry of page 1 is the board's newest post.
            if page == 1 and latest_stub is None:
                latest_stub = page_posts[0]
            for stub in page_posts:
                if stub.article_id in seen_article_ids:
                    continue
                seen_article_ids.add(stub.article_id)
                if stub.article_id in known_article_ids:
                    continue
                candidates.append(stub)
        inserted_posts: list[ScrapedPost] = []
        # reversed() so the oldest new post is inserted (and numbered) first.
        for stub in reversed(candidates):
            detail = self.crawler.crawl_post_detail(stub)
            record = ScrapedPost(
                board_key=detail.board_key,
                board_name=detail.board_name,
                board_id=detail.board_id,
                article_id=detail.article_id,
                title=detail.title,
                post_url=detail.post_url,
                author=detail.author,
                published_at=detail.published_at,
                summary=detail.summary,
                content_text=detail.content_text,
                attachments=detail.attachments,
            )
            self.db.add(record)
            self.db.commit()
            self.db.refresh(record)
            inserted_posts.append(record)
        return inserted_posts, latest_stub