Feat: [main] hufs-notice-crawler CI/CD까지 구현 완료
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
All checks were successful
hufs-notice-crawler-cicd / build_push_deploy (push) Successful in 8m35s
This commit is contained in:
142
app/service.py
Normal file
142
app/service.py
Normal file
@@ -0,0 +1,142 @@
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.crawler import BOARD_CONFIG, HufsCrawler, PostStub
|
||||
from app.models import CrawlRun, ScrapedPost
|
||||
from app.schemas import AttachmentOut, CrawlResponse, LatestBoardPostOut, PostOut
|
||||
|
||||
|
||||
class CrawlService:
    """Service layer that crawls HUFS notice boards and persists new posts.

    One instance wraps a single SQLAlchemy ``Session`` and a ``HufsCrawler``;
    ``crawl_new_posts`` is the public entry point.
    """

    def __init__(self, db: Session) -> None:
        self.db = db
        self.crawler = HufsCrawler()

    def crawl_new_posts(self) -> CrawlResponse:
        """Crawl every configured board, insert unseen posts, and log the run.

        A ``CrawlRun`` row is created up front with status ``"running"`` and
        finalized as ``"success"`` or ``"failed"``. In bootstrap mode (the
        ``scraped_posts`` table is empty) all discovered posts are inserted,
        but they are reported via ``bootstrap_inserted_count`` rather than as
        ``new_posts``, so a first run does not flood downstream consumers.

        Returns:
            CrawlResponse summarizing newly inserted posts and, when there is
            nothing new to report, the latest post seen on each board.

        Raises:
            Exception: re-raises whatever the crawl/persist step raised, after
            recording the failure on the ``CrawlRun`` row.
        """
        bootstrap_mode = self._is_bootstrap_mode()

        run = CrawlRun(status="running", discovered_count=0, inserted_count=0)
        self.db.add(run)
        self.db.commit()
        self.db.refresh(run)

        inserted_posts: list[ScrapedPost] = []
        latest_posts_by_board_map: dict[str, PostStub] = {}
        try:
            for board_key in BOARD_CONFIG:
                board_inserted_posts, latest_stub = self._crawl_board(board_key)
                inserted_posts.extend(board_inserted_posts)
                if latest_stub is not None:
                    latest_posts_by_board_map[board_key] = latest_stub

            run.status = "success"
            run.discovered_count = len(inserted_posts)
            run.inserted_count = len(inserted_posts)
            # DB columns appear to store naive UTC timestamps, so the tzinfo
            # is stripped after taking an aware UTC "now".
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
        except Exception as exc:
            # If the failure happened inside a flush/commit, the session is in
            # a pending-rollback state and any further commit would raise.
            # Roll back first so the failure record can actually be persisted.
            self.db.rollback()
            run.status = "failed"
            run.error_message = str(exc)
            run.finished_at = datetime.now(UTC).replace(tzinfo=None)
            self.db.add(run)
            self.db.commit()
            raise

        # Bootstrap runs report zero "new" posts; the inserts are surfaced
        # only through bootstrap_inserted_count.
        response_posts = [] if bootstrap_mode else inserted_posts
        # Latest-per-board stubs are included only when there is nothing new
        # to report (which is always the case for a bootstrap run).
        nothing_new = bootstrap_mode or not inserted_posts
        response_latest_posts = (
            list(latest_posts_by_board_map.values()) if nothing_new else []
        )
        return CrawlResponse(
            checked_at=datetime.now(UTC),
            bootstrap_mode=bootstrap_mode,
            bootstrap_inserted_count=len(inserted_posts) if bootstrap_mode else 0,
            new_posts_count=0 if bootstrap_mode else len(inserted_posts),
            new_posts=[
                PostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    author=post.author,
                    published_at=post.published_at,
                    summary=post.summary,
                    content_text=post.content_text,
                    attachments=[
                        AttachmentOut(name=item["name"], url=item["url"])
                        for item in (post.attachments or [])
                    ],
                )
                for post in response_posts
            ],
            latest_posts_by_board=[
                LatestBoardPostOut(
                    board_key=post.board_key,
                    board_name=post.board_name,
                    board_id=post.board_id,
                    article_id=post.article_id,
                    title=post.title,
                    post_url=post.post_url,
                    published_at=post.published_at,
                )
                for post in response_latest_posts
            ],
        )

    def _is_bootstrap_mode(self) -> bool:
        """Return True when no post has ever been saved (first-run bootstrap)."""
        first_saved_post = self.db.scalar(select(ScrapedPost.id).limit(1))
        return first_saved_post is None

    def _crawl_board(self, board_key: str) -> tuple[list[ScrapedPost], PostStub | None]:
        """Crawl one board's list pages and insert posts not yet in the DB.

        Args:
            board_key: key into ``BOARD_CONFIG`` identifying the board.

        Returns:
            A pair of (posts inserted during this call, stub of the newest
            post on page 1, or None if the board returned no posts).
        """
        candidates = []
        latest_stub: PostStub | None = None
        # Article ids already persisted for this board — used to skip known posts.
        known_article_ids = {
            article_id
            for article_id in self.db.scalars(
                select(ScrapedPost.article_id).where(ScrapedPost.board_key == board_key)
            )
        }

        # Guards against the same article appearing on multiple pages
        # (e.g. pinned notices repeated per page).
        seen_article_ids: set[int] = set()
        for page in range(1, self.crawler.max_pages_per_board + 1):
            page_posts = self.crawler.crawl_board_list(board_key=board_key, page=page)
            if not page_posts:
                break
            if page == 1 and latest_stub is None:
                latest_stub = page_posts[0]

            for stub in page_posts:
                if stub.article_id in seen_article_ids:
                    continue
                seen_article_ids.add(stub.article_id)
                if stub.article_id in known_article_ids:
                    continue
                candidates.append(stub)

        inserted_posts: list[ScrapedPost] = []
        # Insert oldest-first so DB insertion order matches publication order.
        for stub in reversed(candidates):
            detail = self.crawler.crawl_post_detail(stub)
            record = ScrapedPost(
                board_key=detail.board_key,
                board_name=detail.board_name,
                board_id=detail.board_id,
                article_id=detail.article_id,
                title=detail.title,
                post_url=detail.post_url,
                author=detail.author,
                published_at=detail.published_at,
                summary=detail.summary,
                content_text=detail.content_text,
                attachments=detail.attachments,
            )
            self.db.add(record)
            # Per-post commit: a late failure keeps earlier posts persisted.
            self.db.commit()
            self.db.refresh(record)
            inserted_posts.append(record)

        return inserted_posts, latest_stub
|
||||
Reference in New Issue
Block a user