Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
This commit is contained in:
1
app/__init__.py
Normal file
1
app/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
30
app/config.py
Normal file
30
app/config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration, loaded from the environment / a .env file."""

    # Read overrides from a local .env file; keys not declared below are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_name: str = "hufs-notice-crawler"
    app_env: str = "development"
    # SQLAlchemy connection URL; defaults to a local PostgreSQL instance.
    database_url: str = Field(
        default="postgresql+psycopg://postgres:postgres@localhost:5432/hufs_notice_crawler"
    )
    # Root of the site being crawled.
    base_url: str = "https://computer.hufs.ac.kr"
    # Per-request timeout handed to the HTTP client, in seconds.
    request_timeout_seconds: float = 15.0
    # Upper bound on list pages fetched per board during one crawl.
    max_pages_per_board: int = 5
    # Browser-like User-Agent sent with every request.
    user_agent: str = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the process-wide Settings instance (built once, then cached)."""
    return Settings()
|
||||
236
app/crawler.py
Normal file
236
app/crawler.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import re
|
||||
from base64 import b64encode
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Article detail paths look like /bbs/computer/<board_id>/<article_id>/artclView.do.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Dates of the form YYYY-MM-DD with -, . or / separators; years 2000-2099 only.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")


# Boards to crawl: site-internal board id, display name, and the subview id
# used to build the public-facing post URL.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
|
||||
|
||||
|
||||
@dataclass
class PostStub:
    """Lightweight reference to a post as discovered on a board list page."""

    board_key: str
    board_name: str
    board_id: int
    # Site-side article identifier parsed out of the list-page link.
    article_id: int
    title: str
    # Public-facing URL (the encoded subview form, not the raw artclView path).
    post_url: str
    # Date parsed from the list row, if one was found.
    published_at: datetime | None
|
||||
|
||||
|
||||
@dataclass
class PostDetail:
    """Full content of a post, scraped from its article detail page."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    # Truncated plain-text preview derived from content_text.
    summary: str | None
    content_text: str | None
    # Attachment entries of the form {"name": ..., "url": ...}.
    attachments: list[dict]
|
||||
|
||||
|
||||
class HufsCrawler:
    """Scraper for the notice boards on the HUFS computer-science site.

    ``crawl_board_list`` discovers posts from paginated list pages and
    ``crawl_post_detail`` fetches the full article for one discovered post.
    A single ``requests.Session`` is reused for all HTTP traffic.
    """

    def __init__(self) -> None:
        settings = get_settings()
        self.base_url = settings.base_url
        self.max_pages_per_board = settings.max_pages_per_board
        # One shared session so connections and the User-Agent header are reused.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": settings.user_agent})
        self.timeout = settings.request_timeout_seconds

    def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
        """Return the post stubs found on one list page of the given board.

        Raises ``KeyError`` for an unknown ``board_key`` and
        ``requests.HTTPError`` for a non-2xx response.
        """
        board = BOARD_CONFIG[board_key]
        list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
        response = self.session.get(
            list_url,
            params={"layout": "unknown", "page": page},
            timeout=self.timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        seen_article_ids: set[int] = set()
        posts: list[PostStub] = []
        for anchor in soup.select("a[href*='artclView.do']"):
            href = anchor.get("href") or ""
            match = ARTICLE_PATH_RE.search(href)
            if not match:
                continue

            article_id = int(match.group("article_id"))
            # The same article may be linked more than once on a page.
            if article_id in seen_article_ids:
                continue
            seen_article_ids.add(article_id)

            # The surrounding table row (if any) carries the post date.
            row = anchor.find_parent("tr")
            posts.append(
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=article_id,
                    title=self._normalize_text(anchor.get_text(" ", strip=True)),
                    post_url=self._build_public_post_url(
                        subview_id=board["subview_id"],
                        board_id=board["board_id"],
                        article_id=article_id,
                    ),
                    published_at=self._extract_date_from_row(row),
                )
            )

        return posts

    def crawl_post_detail(self, stub: PostStub) -> PostDetail:
        """Fetch and parse the full article page for a discovered post.

        Falls back to the stub's title/date when the detail page does not
        yield them. Raises ``requests.HTTPError`` on a non-2xx response.
        """
        detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
        response = self.session.get(detail_url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        title = self._first_text(
            soup,
            [
                ".artclView .title h2",
                ".artclView h2",
                ".view-title",
                "h2",
                "h3",
            ],
        ) or stub.title

        content_node = self._first_node(
            soup,
            [
                ".artclContents",
                ".fr-view",
                ".view-con",
                ".artcl-view",
                ".bbs--view",
                "#artclView",
            ],
        )
        content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None

        author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
        published_at = self._extract_date_from_soup(soup) or stub.published_at
        attachments = self._extract_attachments(soup)

        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=title,
            post_url=stub.post_url,
            author=author,
            published_at=published_at,
            summary=self._make_summary(content_text),
            content_text=content_text,
            attachments=attachments,
        )

    def _build_detail_url(self, board_id: int, article_id: int) -> str:
        """Direct (non-encoded) URL of an article's detail page."""
        return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"

    def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
        """Build the public subview URL whose ``enc`` parameter is the
        base64-encoded, percent-encoded article path (site URL scheme)."""
        article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
        encoded_path = quote(article_path, safe="")
        enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
        return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"

    def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
        """Collect download links as {"name", "url"} dicts, de-duplicated by URL."""
        attachments: list[dict] = []
        seen: set[str] = set()
        for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
            href = anchor.get("href") or ""
            name = self._normalize_text(anchor.get_text(" ", strip=True))
            if not href or not name:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url in seen:
                continue
            seen.add(absolute_url)
            attachments.append({"name": name, "url": absolute_url})
        return attachments

    def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
        """Find a metadata value (e.g. author) next to one of the given labels."""
        for label in labels:
            label_node = soup.find(string=re.compile(label, re.IGNORECASE))
            if not label_node:
                continue
            parent = label_node.parent if isinstance(label_node.parent, Tag) else None
            if not parent:
                continue
            container = parent.parent if parent.parent else parent
            candidate_text = self._normalize_text(container.get_text(" ", strip=True))
            # Strip the label case-insensitively to match the lookup above
            # (previously a case-sensitive str.replace could leave "Writer" in).
            candidate_text = re.sub(label, "", candidate_text, flags=re.IGNORECASE)
            candidate_text = candidate_text.replace(":", "").strip()
            if candidate_text:
                return candidate_text
        return None

    def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
        """Parse a date out of a list-page table row, if present."""
        if row is None:
            return None
        return self._parse_date(row.get_text(" ", strip=True))

    def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
        """Parse the publication date from a detail page, scanning the most
        specific info containers first and the whole body last."""
        for selector in [".artclInfo", ".view-info", ".info", "body"]:
            node = soup.select_one(selector)
            if not node:
                continue
            parsed = self._parse_date(node.get_text(" ", strip=True))
            if parsed:
                return parsed
        return None

    def _parse_date(self, text: str | None) -> datetime | None:
        """Return the first YYYY-MM-DD-like date in *text*, or None."""
        if not text:
            return None
        match = DATE_RE.search(text)
        if not match:
            return None
        year, month, day = map(int, match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # The regex accepts impossible dates such as "2023-99-99";
            # treat those as "no date" instead of crashing the crawl.
            return None

    def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
        """Whitespace-collapse *content_text* and truncate it to *max_length*
        characters, appending "..." when truncated."""
        if not content_text:
            return None
        normalized = " ".join(content_text.split())
        if len(normalized) <= max_length:
            return normalized
        return normalized[: max_length - 3].rstrip() + "..."

    def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
        """Return the first node matched by any selector, in selector order."""
        for selector in selectors:
            node = soup.select_one(selector)
            if node:
                return node
        return None

    def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
        """Normalized text of the first node matched by any selector, or None."""
        node = self._first_node(soup, selectors)
        if node:
            return self._normalize_text(node.get_text(" ", strip=True))
        return None

    def _normalize_text(self, value: str | None) -> str:
        """Collapse all runs of whitespace to single spaces; None -> ""."""
        if not value:
            return ""
        return re.sub(r"\s+", " ", value).strip()
|
||||
33
app/db.py
Normal file
33
app/db.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base shared by all ORM models in this app."""

    pass
|
||||
|
||||
|
||||
settings = get_settings()

# Engine shared by the whole application. pool_pre_ping revalidates pooled
# connections before each checkout so stale connections are replaced quietly.
engine = create_engine(
    settings.database_url,
    future=True,
    pool_pre_ping=True,
)

# Session factory: explicit flush/commit only, and ORM objects remain usable
# after commit (expire_on_commit=False).
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
    future=True,
    expire_on_commit=False,
)
|
||||
|
||||
|
||||
def get_db():
    """FastAPI dependency that yields a DB session and always closes it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
30
app/main.py
Normal file
30
app/main.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import Depends, FastAPI
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import get_settings
|
||||
from app.db import Base, engine, get_db
|
||||
from app.schemas import CrawlResponse
|
||||
from app.service import CrawlService
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_: FastAPI):
    """App lifespan: create DB tables on startup; nothing to do on shutdown."""
    Base.metadata.create_all(bind=engine)
    yield
|
||||
|
||||
|
||||
# Application instance; the lifespan hook creates tables before serving.
settings = get_settings()
app = FastAPI(title=settings.app_name, lifespan=lifespan)
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
|
||||
|
||||
|
||||
@app.post("/api/v1/crawl", response_model=CrawlResponse)
def crawl_notices(db: Session = Depends(get_db)) -> CrawlResponse:
    """Run a crawl over all configured boards and return the outcome."""
    return CrawlService(db).crawl_new_posts()
|
||||
51
app/models.py
Normal file
51
app/models.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import JSON, CheckConstraint, DateTime, Integer, String, Text, UniqueConstraint, func
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.db import Base
|
||||
|
||||
|
||||
class ScrapedPost(Base):
    """A crawled board post, stored at most once per (board_key, article_id)."""

    __tablename__ = "scraped_posts"
    __table_args__ = (
        UniqueConstraint("board_key", "article_id", name="uq_scraped_posts_board_article"),
        # Keep board_key aligned with the boards the crawler knows about.
        CheckConstraint(
            "board_key IN ('notice', 'archive', 'jobs')",
            name="ck_scraped_posts_board_key",
        ),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Crawler board identity: "notice" / "archive" / "jobs".
    board_key: Mapped[str] = mapped_column(String(32), nullable=False)
    board_name: Mapped[str] = mapped_column(String(100), nullable=False)
    board_id: Mapped[int] = mapped_column(Integer, nullable=False)
    # Site-side article identifier extracted from the post URL.
    article_id: Mapped[int] = mapped_column(Integer, nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    post_url: Mapped[str] = mapped_column(Text, nullable=False)
    author: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Stored naive (timezone=False) — presumably site-local time; verify at callers.
    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # JSON list of {"name": ..., "url": ...} attachment entries.
    attachments: Mapped[list[dict]] = mapped_column(JSON, nullable=False, default=list)
    # Row creation timestamp set by the database server.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
|
||||
|
||||
|
||||
class CrawlRun(Base):
    """Bookkeeping record for a single crawl invocation."""

    __tablename__ = "crawl_runs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Set by the database when the row is inserted.
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    # "running" while in progress, then "success" or "failed".
    status: Mapped[str] = mapped_column(String(20), nullable=False, default="running")
    discovered_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    inserted_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # Stringified exception when status == "failed".
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
41
app/schemas.py
Normal file
41
app/schemas.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class AttachmentOut(BaseModel):
    """One downloadable attachment of a post."""

    name: str
    url: str
|
||||
|
||||
|
||||
class PostOut(BaseModel):
    """Full representation of a stored post, as returned by the API."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    summary: str | None
    content_text: str | None
    attachments: list[AttachmentOut]
|
||||
|
||||
|
||||
class LatestBoardPostOut(BaseModel):
    """Compact view of the newest post on a board (no body/attachments)."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    published_at: datetime | None
|
||||
|
||||
|
||||
class CrawlResponse(BaseModel):
    """Outcome of one crawl request."""

    checked_at: datetime
    # True when the posts table was empty before this crawl (initial backfill).
    bootstrap_mode: bool
    # Number of posts stored during a bootstrap run (0 otherwise).
    bootstrap_inserted_count: int
    new_posts_count: int
    new_posts: list[PostOut]
    # Newest post per board; populated only when there is nothing new to report.
    latest_posts_by_board: list[LatestBoardPostOut]
|
||||
142
app/service.py
Normal file
142
app/service.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.crawler import BOARD_CONFIG, HufsCrawler, PostStub
|
||||
from app.models import CrawlRun, ScrapedPost
|
||||
from app.schemas import AttachmentOut, CrawlResponse, LatestBoardPostOut, PostOut
|
||||
|
||||
|
||||
class CrawlService:
    """Orchestrates crawling the configured boards and persisting new posts."""

    def __init__(self, db: Session) -> None:
        self.db = db
        self.crawler = HufsCrawler()

    def crawl_new_posts(self) -> CrawlResponse:
        """Crawl every board in BOARD_CONFIG, store unseen posts, report results.

        In bootstrap mode (posts table empty) everything found is stored but
        reported only as a count; on later runs the newly inserted posts are
        returned in full. A CrawlRun row records success/failure either way;
        failures are re-raised after being recorded.
        """
        bootstrap_mode = self._is_bootstrap_mode()

        # Open the run record first so a crash still leaves a trace.
        run = CrawlRun(status="running", discovered_count=0, inserted_count=0)
        self.db.add(run)
        self.db.commit()
        self.db.refresh(run)

        inserted_posts: list[ScrapedPost] = []
        latest_posts_by_board_map: dict[str, PostStub] = {}
        try:
            for board_key in BOARD_CONFIG:
                board_inserted_posts, latest_stub = self._crawl_board(board_key)
                inserted_posts.extend(board_inserted_posts)
                if latest_stub is not None:
                    latest_posts_by_board_map[board_key] = latest_stub

            run.status = "success"
            # NOTE(review): discovered_count mirrors inserted_count here; if
            # "discovered" is meant to count all stubs seen (including already
            # known ones), that needs separate tracking.
            run.discovered_count = len(inserted_posts)
            run.inserted_count = len(inserted_posts)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
        except Exception as exc:
            run.status = "failed"
            run.error_message = str(exc)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
            raise

        response_posts = [] if bootstrap_mode else inserted_posts
        # Include the "latest post per board" fallback only when there are no
        # new posts to report — which is always the case in bootstrap mode.
        include_latest = bootstrap_mode or not inserted_posts
        response_latest_posts = list(latest_posts_by_board_map.values()) if include_latest else []
        return CrawlResponse(
            checked_at=datetime.now(UTC),
            bootstrap_mode=bootstrap_mode,
            bootstrap_inserted_count=len(inserted_posts) if bootstrap_mode else 0,
            new_posts_count=0 if bootstrap_mode else len(inserted_posts),
            new_posts=[
                PostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    author=post.author,
                    published_at=post.published_at,
                    summary=post.summary,
                    content_text=post.content_text,
                    attachments=[
                        AttachmentOut(name=item["name"], url=item["url"])
                        for item in (post.attachments or [])
                    ],
                )
                for post in response_posts
            ],
            latest_posts_by_board=[
                LatestBoardPostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    published_at=post.published_at,
                )
                for post in response_latest_posts
            ],
        )

    def _is_bootstrap_mode(self) -> bool:
        """True when no post has ever been stored (first-run backfill)."""
        first_saved_post = self.db.scalar(select(ScrapedPost.id).limit(1))
        return first_saved_post is None

    def _crawl_board(self, board_key: str) -> tuple[list[ScrapedPost], PostStub | None]:
        """Crawl one board and persist its unseen posts.

        Returns the inserted ORM rows and the newest stub on page 1 (or None
        when the board yielded nothing).
        """
        candidates: list[PostStub] = []
        latest_stub: PostStub | None = None
        # Article ids already stored for this board — skip them below.
        known_article_ids = {
            article_id
            for article_id in self.db.scalars(
                select(ScrapedPost.article_id).where(ScrapedPost.board_key == board_key)
            )
        }

        seen_article_ids: set[int] = set()
        for page in range(1, self.crawler.max_pages_per_board + 1):
            page_posts = self.crawler.crawl_board_list(board_key=board_key, page=page)
            if not page_posts:
                # An empty page means we've run past the end of the board.
                break
            if page == 1 and latest_stub is None:
                latest_stub = page_posts[0]

            for stub in page_posts:
                if stub.article_id in seen_article_ids:
                    continue
                seen_article_ids.add(stub.article_id)
                if stub.article_id in known_article_ids:
                    continue
                candidates.append(stub)

        inserted_posts: list[ScrapedPost] = []
        # Oldest first, committing per post so earlier inserts survive a
        # failure partway through the board.
        for stub in reversed(candidates):
            detail = self.crawler.crawl_post_detail(stub)
            record = ScrapedPost(
                board_key=detail.board_key,
                board_name=detail.board_name,
                board_id=detail.board_id,
                article_id=detail.article_id,
                title=detail.title,
                post_url=detail.post_url,
                author=detail.author,
                published_at=detail.published_at,
                summary=detail.summary,
                content_text=detail.content_text,
                attachments=detail.attachments,
            )
            self.db.add(record)
            self.db.commit()
            self.db.refresh(record)
            inserted_posts.append(record)

        return inserted_posts, latest_stub
|
||||
Reference in New Issue
Block a user