Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
This commit is contained in:
1
app/__init__.py
Normal file
1
app/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
30
app/config.py
Normal file
30
app/config.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from functools import lru_cache
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration, loaded from the environment / a .env file."""

    # Read overrides from a local .env file; keys not declared below are ignored.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    app_name: str = "hufs-notice-crawler"
    app_env: str = "development"
    # SQLAlchemy connection URL; defaults to a local PostgreSQL instance.
    database_url: str = Field(
        default="postgresql+psycopg://postgres:postgres@localhost:5432/hufs_notice_crawler"
    )
    # Root of the site being crawled.
    base_url: str = "https://computer.hufs.ac.kr"
    # Per-request timeout handed to the HTTP client, in seconds.
    request_timeout_seconds: float = 15.0
    # Upper bound on list pages fetched per board during one crawl.
    max_pages_per_board: int = 5
    # Browser-like User-Agent sent with every request.
    user_agent: str = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the process-wide Settings instance (built once, then cached)."""
    return Settings()
|
||||
236
app/crawler.py
Normal file
236
app/crawler.py
Normal file
@@ -0,0 +1,236 @@
|
||||
import re
|
||||
from base64 import b64encode
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from urllib.parse import quote, urljoin
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
# Article detail paths look like /bbs/computer/<board_id>/<article_id>/artclView.do.
ARTICLE_PATH_RE = re.compile(r"/bbs/computer/(?P<board_id>\d+)/(?P<article_id>\d+)/artclView\.do")
# Dates of the form YYYY-MM-DD with -, . or / separators; years 2000-2099 only.
DATE_RE = re.compile(r"(20\d{2})[-./](\d{1,2})[-./](\d{1,2})")


# Boards to crawl: site-internal board id, display name, and the subview id
# used to build the public-facing post URL.
BOARD_CONFIG = {
    "notice": {"board_id": 1926, "board_name": "공지사항", "subview_id": 10058},
    "archive": {"board_id": 1927, "board_name": "자료실", "subview_id": 10059},
    "jobs": {"board_id": 1929, "board_name": "취업정보", "subview_id": 10077},
}
|
||||
|
||||
|
||||
@dataclass
class PostStub:
    """Lightweight reference to a post as discovered on a board list page."""

    board_key: str
    board_name: str
    board_id: int
    # Site-side article identifier parsed out of the list-page link.
    article_id: int
    title: str
    # Public-facing URL (the encoded subview form, not the raw artclView path).
    post_url: str
    # Date parsed from the list row, if one was found.
    published_at: datetime | None
|
||||
|
||||
|
||||
@dataclass
class PostDetail:
    """Full content of a post, scraped from its article detail page."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    # Truncated plain-text preview derived from content_text.
    summary: str | None
    content_text: str | None
    # Attachment entries of the form {"name": ..., "url": ...}.
    attachments: list[dict]
|
||||
|
||||
|
||||
class HufsCrawler:
    """Scraper for the notice boards on the HUFS computer-science site.

    ``crawl_board_list`` discovers posts from paginated list pages and
    ``crawl_post_detail`` fetches the full article for one discovered post.
    A single ``requests.Session`` is reused for all HTTP traffic.
    """

    def __init__(self) -> None:
        settings = get_settings()
        self.base_url = settings.base_url
        self.max_pages_per_board = settings.max_pages_per_board
        # One shared session so connections and the User-Agent header are reused.
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": settings.user_agent})
        self.timeout = settings.request_timeout_seconds

    def crawl_board_list(self, board_key: str, page: int = 1) -> list[PostStub]:
        """Return the post stubs found on one list page of the given board.

        Raises ``KeyError`` for an unknown ``board_key`` and
        ``requests.HTTPError`` for a non-2xx response.
        """
        board = BOARD_CONFIG[board_key]
        list_url = f"{self.base_url}/bbs/computer/{board['board_id']}/artclList.do"
        response = self.session.get(
            list_url,
            params={"layout": "unknown", "page": page},
            timeout=self.timeout,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        seen_article_ids: set[int] = set()
        posts: list[PostStub] = []
        for anchor in soup.select("a[href*='artclView.do']"):
            href = anchor.get("href") or ""
            match = ARTICLE_PATH_RE.search(href)
            if not match:
                continue

            article_id = int(match.group("article_id"))
            # The same article may be linked more than once on a page.
            if article_id in seen_article_ids:
                continue
            seen_article_ids.add(article_id)

            # The surrounding table row (if any) carries the post date.
            row = anchor.find_parent("tr")
            posts.append(
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=article_id,
                    title=self._normalize_text(anchor.get_text(" ", strip=True)),
                    post_url=self._build_public_post_url(
                        subview_id=board["subview_id"],
                        board_id=board["board_id"],
                        article_id=article_id,
                    ),
                    published_at=self._extract_date_from_row(row),
                )
            )

        return posts

    def crawl_post_detail(self, stub: PostStub) -> PostDetail:
        """Fetch and parse the full article page for a discovered post.

        Falls back to the stub's title/date when the detail page does not
        yield them. Raises ``requests.HTTPError`` on a non-2xx response.
        """
        detail_url = self._build_detail_url(board_id=stub.board_id, article_id=stub.article_id)
        response = self.session.get(detail_url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        title = self._first_text(
            soup,
            [
                ".artclView .title h2",
                ".artclView h2",
                ".view-title",
                "h2",
                "h3",
            ],
        ) or stub.title

        content_node = self._first_node(
            soup,
            [
                ".artclContents",
                ".fr-view",
                ".view-con",
                ".artcl-view",
                ".bbs--view",
                "#artclView",
            ],
        )
        content_text = self._normalize_text(content_node.get_text("\n", strip=True)) if content_node else None

        author = self._extract_meta(soup, ["작성자", "writer", "등록자"])
        published_at = self._extract_date_from_soup(soup) or stub.published_at
        attachments = self._extract_attachments(soup)

        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=title,
            post_url=stub.post_url,
            author=author,
            published_at=published_at,
            summary=self._make_summary(content_text),
            content_text=content_text,
            attachments=attachments,
        )

    def _build_detail_url(self, board_id: int, article_id: int) -> str:
        """Direct (non-encoded) URL of an article's detail page."""
        return f"{self.base_url}/bbs/computer/{board_id}/{article_id}/artclView.do"

    def _build_public_post_url(self, subview_id: int, board_id: int, article_id: int) -> str:
        """Build the public subview URL whose ``enc`` parameter is the
        base64-encoded, percent-encoded article path (site URL scheme)."""
        article_path = f"/bbs/computer/{board_id}/{article_id}/artclView.do?"
        encoded_path = quote(article_path, safe="")
        enc = b64encode(f"fnct1|@@|{encoded_path}".encode("utf-8")).decode("ascii")
        return f"{self.base_url}/computer/{subview_id}/subview.do?enc={enc}"

    def _extract_attachments(self, soup: BeautifulSoup) -> list[dict]:
        """Collect download links as {"name", "url"} dicts, de-duplicated by URL."""
        attachments: list[dict] = []
        seen: set[str] = set()
        for anchor in soup.select("a[href*='download'], a[href*='fileDown'], a[href*='attach'], a[href*='FileDown']"):
            href = anchor.get("href") or ""
            name = self._normalize_text(anchor.get_text(" ", strip=True))
            if not href or not name:
                continue
            absolute_url = urljoin(self.base_url, href)
            if absolute_url in seen:
                continue
            seen.add(absolute_url)
            attachments.append({"name": name, "url": absolute_url})
        return attachments

    def _extract_meta(self, soup: BeautifulSoup, labels: list[str]) -> str | None:
        """Find a metadata value (e.g. author) next to one of the given labels."""
        for label in labels:
            label_node = soup.find(string=re.compile(label, re.IGNORECASE))
            if not label_node:
                continue
            parent = label_node.parent if isinstance(label_node.parent, Tag) else None
            if not parent:
                continue
            container = parent.parent if parent.parent else parent
            candidate_text = self._normalize_text(container.get_text(" ", strip=True))
            # Strip the label case-insensitively to match the lookup above
            # (previously a case-sensitive str.replace could leave "Writer" in).
            candidate_text = re.sub(label, "", candidate_text, flags=re.IGNORECASE)
            candidate_text = candidate_text.replace(":", "").strip()
            if candidate_text:
                return candidate_text
        return None

    def _extract_date_from_row(self, row: Tag | None) -> datetime | None:
        """Parse a date out of a list-page table row, if present."""
        if row is None:
            return None
        return self._parse_date(row.get_text(" ", strip=True))

    def _extract_date_from_soup(self, soup: BeautifulSoup) -> datetime | None:
        """Parse the publication date from a detail page, scanning the most
        specific info containers first and the whole body last."""
        for selector in [".artclInfo", ".view-info", ".info", "body"]:
            node = soup.select_one(selector)
            if not node:
                continue
            parsed = self._parse_date(node.get_text(" ", strip=True))
            if parsed:
                return parsed
        return None

    def _parse_date(self, text: str | None) -> datetime | None:
        """Return the first YYYY-MM-DD-like date in *text*, or None."""
        if not text:
            return None
        match = DATE_RE.search(text)
        if not match:
            return None
        year, month, day = map(int, match.groups())
        try:
            return datetime(year, month, day)
        except ValueError:
            # The regex accepts impossible dates such as "2023-99-99";
            # treat those as "no date" instead of crashing the crawl.
            return None

    def _make_summary(self, content_text: str | None, max_length: int = 280) -> str | None:
        """Whitespace-collapse *content_text* and truncate it to *max_length*
        characters, appending "..." when truncated."""
        if not content_text:
            return None
        normalized = " ".join(content_text.split())
        if len(normalized) <= max_length:
            return normalized
        return normalized[: max_length - 3].rstrip() + "..."

    def _first_node(self, soup: BeautifulSoup, selectors: list[str]) -> Tag | None:
        """Return the first node matched by any selector, in selector order."""
        for selector in selectors:
            node = soup.select_one(selector)
            if node:
                return node
        return None

    def _first_text(self, soup: BeautifulSoup, selectors: list[str]) -> str | None:
        """Normalized text of the first node matched by any selector, or None."""
        node = self._first_node(soup, selectors)
        if node:
            return self._normalize_text(node.get_text(" ", strip=True))
        return None

    def _normalize_text(self, value: str | None) -> str:
        """Collapse all runs of whitespace to single spaces; None -> ""."""
        if not value:
            return ""
        return re.sub(r"\s+", " ", value).strip()
|
||||
33
app/db.py
Normal file
33
app/db.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import DeclarativeBase, sessionmaker
|
||||
|
||||
from app.config import get_settings
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base shared by all ORM models in this app."""

    pass
|
||||
|
||||
|
||||
settings = get_settings()

# Engine shared by the whole application. pool_pre_ping revalidates pooled
# connections before each checkout so stale connections are replaced quietly.
engine = create_engine(
    settings.database_url,
    future=True,
    pool_pre_ping=True,
)

# Session factory: explicit flush/commit only, and ORM objects remain usable
# after commit (expire_on_commit=False).
SessionLocal = sessionmaker(
    bind=engine,
    autoflush=False,
    autocommit=False,
    future=True,
    expire_on_commit=False,
)
|
||||
|
||||
|
||||
def get_db():
    """FastAPI dependency that yields a DB session and always closes it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
30
app/main.py
Normal file
30
app/main.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import Depends, FastAPI
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.config import get_settings
|
||||
from app.db import Base, engine, get_db
|
||||
from app.schemas import CrawlResponse
|
||||
from app.service import CrawlService
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(_: FastAPI):
    """App lifespan: create DB tables on startup; nothing to do on shutdown."""
    Base.metadata.create_all(bind=engine)
    yield
|
||||
|
||||
|
||||
# Application instance; the lifespan hook creates tables before serving.
settings = get_settings()
app = FastAPI(title=settings.app_name, lifespan=lifespan)
|
||||
|
||||
|
||||
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")
|
||||
|
||||
|
||||
@app.post("/api/v1/crawl", response_model=CrawlResponse)
def crawl_notices(db: Session = Depends(get_db)) -> CrawlResponse:
    """Run a crawl over all configured boards and return the outcome."""
    return CrawlService(db).crawl_new_posts()
|
||||
51
app/models.py
Normal file
51
app/models.py
Normal file
@@ -0,0 +1,51 @@
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import JSON, CheckConstraint, DateTime, Integer, String, Text, UniqueConstraint, func
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from app.db import Base
|
||||
|
||||
|
||||
class ScrapedPost(Base):
    """A crawled board post, stored at most once per (board_key, article_id)."""

    __tablename__ = "scraped_posts"
    __table_args__ = (
        UniqueConstraint("board_key", "article_id", name="uq_scraped_posts_board_article"),
        # Keep board_key aligned with the boards the crawler knows about.
        CheckConstraint(
            "board_key IN ('notice', 'archive', 'jobs')",
            name="ck_scraped_posts_board_key",
        ),
    )

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Crawler board identity: "notice" / "archive" / "jobs".
    board_key: Mapped[str] = mapped_column(String(32), nullable=False)
    board_name: Mapped[str] = mapped_column(String(100), nullable=False)
    board_id: Mapped[int] = mapped_column(Integer, nullable=False)
    # Site-side article identifier extracted from the post URL.
    article_id: Mapped[int] = mapped_column(Integer, nullable=False)
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    post_url: Mapped[str] = mapped_column(Text, nullable=False)
    author: Mapped[str | None] = mapped_column(String(100), nullable=True)
    # Stored naive (timezone=False) — presumably site-local time; verify at callers.
    published_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    summary: Mapped[str | None] = mapped_column(Text, nullable=True)
    content_text: Mapped[str | None] = mapped_column(Text, nullable=True)
    # JSON list of {"name": ..., "url": ...} attachment entries.
    attachments: Mapped[list[dict]] = mapped_column(JSON, nullable=False, default=list)
    # Row creation timestamp set by the database server.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
|
||||
|
||||
|
||||
class CrawlRun(Base):
    """Bookkeeping record for a single crawl invocation."""

    __tablename__ = "crawl_runs"

    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Set by the database when the row is inserted.
    started_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=False),
        nullable=False,
        server_default=func.now(),
    )
    finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=False), nullable=True)
    # "running" while in progress, then "success" or "failed".
    status: Mapped[str] = mapped_column(String(20), nullable=False, default="running")
    discovered_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    inserted_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    # Stringified exception when status == "failed".
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
41
app/schemas.py
Normal file
41
app/schemas.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class AttachmentOut(BaseModel):
    """One downloadable attachment of a post."""

    name: str
    url: str
|
||||
|
||||
|
||||
class PostOut(BaseModel):
    """Full representation of a stored post, as returned by the API."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    author: str | None
    published_at: datetime | None
    summary: str | None
    content_text: str | None
    attachments: list[AttachmentOut]
|
||||
|
||||
|
||||
class LatestBoardPostOut(BaseModel):
    """Compact view of the newest post on a board (no body/attachments)."""

    board_key: str
    board_name: str
    board_id: int
    article_id: int
    title: str
    post_url: str
    published_at: datetime | None
|
||||
|
||||
|
||||
class CrawlResponse(BaseModel):
    """Outcome of one crawl request."""

    checked_at: datetime
    # True when the posts table was empty before this crawl (initial backfill).
    bootstrap_mode: bool
    # Number of posts stored during a bootstrap run (0 otherwise).
    bootstrap_inserted_count: int
    new_posts_count: int
    new_posts: list[PostOut]
    # Newest post per board; populated only when there is nothing new to report.
    latest_posts_by_board: list[LatestBoardPostOut]
|
||||
142
app/service.py
Normal file
142
app/service.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.crawler import BOARD_CONFIG, HufsCrawler, PostStub
|
||||
from app.models import CrawlRun, ScrapedPost
|
||||
from app.schemas import AttachmentOut, CrawlResponse, LatestBoardPostOut, PostOut
|
||||
|
||||
|
||||
class CrawlService:
    """Orchestrates crawling the configured boards and persisting new posts."""

    def __init__(self, db: Session) -> None:
        self.db = db
        self.crawler = HufsCrawler()

    def crawl_new_posts(self) -> CrawlResponse:
        """Crawl every board in BOARD_CONFIG, store unseen posts, report results.

        In bootstrap mode (posts table empty) everything found is stored but
        reported only as a count; on later runs the newly inserted posts are
        returned in full. A CrawlRun row records success/failure either way;
        failures are re-raised after being recorded.
        """
        bootstrap_mode = self._is_bootstrap_mode()

        # Open the run record first so a crash still leaves a trace.
        run = CrawlRun(status="running", discovered_count=0, inserted_count=0)
        self.db.add(run)
        self.db.commit()
        self.db.refresh(run)

        inserted_posts: list[ScrapedPost] = []
        latest_posts_by_board_map: dict[str, PostStub] = {}
        try:
            for board_key in BOARD_CONFIG:
                board_inserted_posts, latest_stub = self._crawl_board(board_key)
                inserted_posts.extend(board_inserted_posts)
                if latest_stub is not None:
                    latest_posts_by_board_map[board_key] = latest_stub

            run.status = "success"
            # NOTE(review): discovered_count mirrors inserted_count here; if
            # "discovered" is meant to count all stubs seen (including already
            # known ones), that needs separate tracking.
            run.discovered_count = len(inserted_posts)
            run.inserted_count = len(inserted_posts)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
        except Exception as exc:
            run.status = "failed"
            run.error_message = str(exc)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
            raise

        response_posts = [] if bootstrap_mode else inserted_posts
        # Include the "latest post per board" fallback only when there are no
        # new posts to report — which is always the case in bootstrap mode.
        include_latest = bootstrap_mode or not inserted_posts
        response_latest_posts = list(latest_posts_by_board_map.values()) if include_latest else []
        return CrawlResponse(
            checked_at=datetime.now(UTC),
            bootstrap_mode=bootstrap_mode,
            bootstrap_inserted_count=len(inserted_posts) if bootstrap_mode else 0,
            new_posts_count=0 if bootstrap_mode else len(inserted_posts),
            new_posts=[
                PostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    author=post.author,
                    published_at=post.published_at,
                    summary=post.summary,
                    content_text=post.content_text,
                    attachments=[
                        AttachmentOut(name=item["name"], url=item["url"])
                        for item in (post.attachments or [])
                    ],
                )
                for post in response_posts
            ],
            latest_posts_by_board=[
                LatestBoardPostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    published_at=post.published_at,
                )
                for post in response_latest_posts
            ],
        )

    def _is_bootstrap_mode(self) -> bool:
        """True when no post has ever been stored (first-run backfill)."""
        first_saved_post = self.db.scalar(select(ScrapedPost.id).limit(1))
        return first_saved_post is None

    def _crawl_board(self, board_key: str) -> tuple[list[ScrapedPost], PostStub | None]:
        """Crawl one board and persist its unseen posts.

        Returns the inserted ORM rows and the newest stub on page 1 (or None
        when the board yielded nothing).
        """
        candidates: list[PostStub] = []
        latest_stub: PostStub | None = None
        # Article ids already stored for this board — skip them below.
        known_article_ids = {
            article_id
            for article_id in self.db.scalars(
                select(ScrapedPost.article_id).where(ScrapedPost.board_key == board_key)
            )
        }

        seen_article_ids: set[int] = set()
        for page in range(1, self.crawler.max_pages_per_board + 1):
            page_posts = self.crawler.crawl_board_list(board_key=board_key, page=page)
            if not page_posts:
                # An empty page means we've run past the end of the board.
                break
            if page == 1 and latest_stub is None:
                latest_stub = page_posts[0]

            for stub in page_posts:
                if stub.article_id in seen_article_ids:
                    continue
                seen_article_ids.add(stub.article_id)
                if stub.article_id in known_article_ids:
                    continue
                candidates.append(stub)

        inserted_posts: list[ScrapedPost] = []
        # Oldest first, committing per post so earlier inserts survive a
        # failure partway through the board.
        for stub in reversed(candidates):
            detail = self.crawler.crawl_post_detail(stub)
            record = ScrapedPost(
                board_key=detail.board_key,
                board_name=detail.board_name,
                board_id=detail.board_id,
                article_id=detail.article_id,
                title=detail.title,
                post_url=detail.post_url,
                author=detail.author,
                published_at=detail.published_at,
                summary=detail.summary,
                content_text=detail.content_text,
                attachments=detail.attachments,
            )
            self.db.add(record)
            self.db.commit()
            self.db.refresh(record)
            inserted_posts.append(record)

        return inserted_posts, latest_stub
|
||||
Reference in New Issue
Block a user