Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s

This commit is contained in:
2026-03-17 17:18:16 +09:00
commit ca460453af
23 changed files with 1959 additions and 0 deletions

142
app/service.py Normal file
View File

@@ -0,0 +1,142 @@
from datetime import UTC, datetime
from sqlalchemy import select
from sqlalchemy.orm import Session
from app.crawler import BOARD_CONFIG, HufsCrawler, PostStub
from app.models import CrawlRun, ScrapedPost
from app.schemas import AttachmentOut, CrawlResponse, LatestBoardPostOut, PostOut
class CrawlService:
    """Orchestrates crawling of HUFS notice boards and persisting new posts.

    Each call to :meth:`crawl_new_posts` records a ``CrawlRun`` row, walks every
    board in ``BOARD_CONFIG``, inserts posts not yet stored, and returns a
    ``CrawlResponse``. On the very first run ("bootstrap mode", empty table) the
    inserted backlog is counted but not echoed back as "new" posts.
    """

    def __init__(self, db: Session) -> None:
        self.db = db
        self.crawler = HufsCrawler()

    def crawl_new_posts(self) -> CrawlResponse:
        """Run one full crawl cycle across all configured boards.

        Returns:
            CrawlResponse summarizing the run. Raises (re-raises) any crawler or
            DB exception after marking the run as failed.
        """
        bootstrap_mode = self._is_bootstrap_mode()
        run = CrawlRun(status="running", discovered_count=0, inserted_count=0)
        self.db.add(run)
        self.db.commit()
        self.db.refresh(run)

        inserted_posts: list[ScrapedPost] = []
        latest_posts_by_board: dict[str, PostStub] = {}
        try:
            for board_key in BOARD_CONFIG:
                board_inserted, latest_stub = self._crawl_board(board_key)
                inserted_posts.extend(board_inserted)
                if latest_stub is not None:
                    latest_posts_by_board[board_key] = latest_stub
            run.status = "success"
            # NOTE(review): discovered_count always equals inserted_count here.
            # If "discovered" is meant to count every candidate seen (including
            # already-known posts), a separate counter is needed — confirm
            # intended semantics with the CrawlRun model's consumers.
            run.discovered_count = len(inserted_posts)
            run.inserted_count = len(inserted_posts)
            run.finished_at = self._utc_naive_now()
            self.db.add(run)
            self.db.commit()
        except Exception as exc:
            # Persist the failure on the run row, then propagate to the caller.
            run.status = "failed"
            run.error_message = str(exc)
            run.finished_at = self._utc_naive_now()
            self.db.add(run)
            self.db.commit()
            raise

        return self._build_response(bootstrap_mode, inserted_posts, latest_posts_by_board)

    @staticmethod
    def _utc_naive_now() -> datetime:
        # Run timestamps are stored as naive UTC (tzinfo stripped), matching the
        # CrawlRun column convention; the API-facing checked_at stays tz-aware.
        return datetime.now(UTC).replace(tzinfo=None)

    def _build_response(
        self,
        bootstrap_mode: bool,
        inserted_posts: list[ScrapedPost],
        latest_posts_by_board: dict[str, PostStub],
    ) -> CrawlResponse:
        """Assemble the CrawlResponse payload for one completed crawl run."""
        new_posts = [] if bootstrap_mode else inserted_posts
        # The per-board latest-post snapshot is reported only when there is
        # nothing "new" to report: always in bootstrap mode, otherwise only
        # when this run inserted nothing.
        include_latest = bootstrap_mode or not inserted_posts
        latest_posts = list(latest_posts_by_board.values()) if include_latest else []
        return CrawlResponse(
            checked_at=datetime.now(UTC),
            bootstrap_mode=bootstrap_mode,
            bootstrap_inserted_count=len(inserted_posts) if bootstrap_mode else 0,
            new_posts_count=0 if bootstrap_mode else len(inserted_posts),
            new_posts=[self._to_post_out(post) for post in new_posts],
            latest_posts_by_board=[self._to_latest_out(post) for post in latest_posts],
        )

    @staticmethod
    def _to_post_out(post: ScrapedPost) -> PostOut:
        """Map a persisted ScrapedPost row to its API schema."""
        return PostOut(
            board_key=post.board_key,
            board_name=post.board_name,
            board_id=post.board_id,
            article_id=post.article_id,
            title=post.title,
            post_url=post.post_url,
            author=post.author,
            published_at=post.published_at,
            summary=post.summary,
            content_text=post.content_text,
            attachments=[
                AttachmentOut(name=item["name"], url=item["url"])
                for item in (post.attachments or [])
            ],
        )

    @staticmethod
    def _to_latest_out(post: PostStub) -> LatestBoardPostOut:
        """Map a board's latest-post stub to its API schema."""
        return LatestBoardPostOut(
            board_key=post.board_key,
            board_name=post.board_name,
            board_id=post.board_id,
            article_id=post.article_id,
            title=post.title,
            post_url=post.post_url,
            published_at=post.published_at,
        )

    def _is_bootstrap_mode(self) -> bool:
        """True when no post has ever been stored (first run seeds the table)."""
        first_saved_post = self.db.scalar(select(ScrapedPost.id).limit(1))
        return first_saved_post is None

    def _crawl_board(self, board_key: str) -> tuple[list[ScrapedPost], PostStub | None]:
        """Crawl one board's list pages and insert posts not yet stored.

        Returns:
            (inserted records in publish order, stub of the board's current
            latest post — None if the board listed no posts at all).
        """
        known_article_ids = set(
            self.db.scalars(
                select(ScrapedPost.article_id).where(ScrapedPost.board_key == board_key)
            )
        )
        candidates: list[PostStub] = []
        latest_stub: PostStub | None = None
        seen_article_ids: set[int] = set()
        for page in range(1, self.crawler.max_pages_per_board + 1):
            page_posts = self.crawler.crawl_board_list(board_key=board_key, page=page)
            if not page_posts:
                # An empty page means we ran past the board's last page.
                break
            if page == 1:
                # First item of the first page is the board's current latest post.
                latest_stub = page_posts[0]
            for stub in page_posts:
                # Pinned/duplicated posts can reappear across pages; count each once.
                if stub.article_id in seen_article_ids:
                    continue
                seen_article_ids.add(stub.article_id)
                if stub.article_id not in known_article_ids:
                    candidates.append(stub)

        inserted_posts: list[ScrapedPost] = []
        # Insert oldest-first so row insertion order follows publish order.
        for stub in reversed(candidates):
            detail = self.crawler.crawl_post_detail(stub)
            record = ScrapedPost(
                board_key=detail.board_key,
                board_name=detail.board_name,
                board_id=detail.board_id,
                article_id=detail.article_id,
                title=detail.title,
                post_url=detail.post_url,
                author=detail.author,
                published_at=detail.published_at,
                summary=detail.summary,
                content_text=detail.content_text,
                attachments=detail.attachments,
            )
            # Commit per record so a mid-run failure keeps earlier posts saved.
            self.db.add(record)
            self.db.commit()
            self.db.refresh(record)
            inserted_posts.append(record)
        return inserted_posts, latest_stub