# Tests for CrawlService: incremental crawling, no-op runs, and bootstrap mode.
from datetime import datetime

from sqlalchemy import select

from app.crawler import BOARD_CONFIG, PostDetail, PostStub
from app.models import CrawlRun, ScrapedPost
from app.service import CrawlService


class FakeCrawler:
    """Deterministic in-memory stand-in for the real crawler.

    Serves a fixed fixture set: two "notice" posts, one "archive" post,
    and nothing for "jobs". Only page 1 ever has content, which lets the
    service's pagination loop terminate immediately.
    """

    # Fixture rows per board: (article_id, title, post_url, published_at).
    _ROWS = {
        "notice": [
            (
                9001,
                "Existing notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
                datetime(2026, 3, 1),
            ),
            (
                9002,
                "New notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9002",
                datetime(2026, 3, 2),
            ),
        ],
        "archive": [
            (
                9101,
                "New archive post",
                "https://computer.hufs.ac.kr/computer/10059/subview.do?enc=archive-9101",
                datetime(2026, 3, 3),
            ),
        ],
    }

    def __init__(self):
        # Mirrors the real crawler's pagination cap, read by CrawlService.
        self.max_pages_per_board = 2

    def crawl_board_list(self, board_key: str, page: int = 1):
        """Return the fixture PostStub rows for page 1 of *board_key*.

        Pages beyond the first, and boards without fixtures (e.g. "jobs"),
        yield an empty list. Unknown board keys raise KeyError, matching
        the BOARD_CONFIG lookup the real crawler performs.
        """
        if page > 1:
            return []

        board = BOARD_CONFIG[board_key]
        return [
            PostStub(
                board_key=board_key,
                board_name=board["board_name"],
                board_id=board["board_id"],
                article_id=article_id,
                title=title,
                post_url=post_url,
                published_at=published_at,
            )
            for article_id, title, post_url, published_at in self._ROWS.get(board_key, [])
        ]

    def crawl_post_detail(self, stub: PostStub):
        """Expand *stub* into a full PostDetail with a synthetic body and one attachment."""
        attachment = {
            "name": f"{stub.article_id}.pdf",
            "url": f"https://example.com/files/{stub.article_id}.pdf",
        }
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=stub.title,
            post_url=stub.post_url,
            author="admin",
            published_at=stub.published_at,
            summary=f"{stub.title} summary",
            content_text=f"{stub.title} content",
            attachments=[attachment],
        )
|
|
|
|
|
|
def test_crawl_service_saves_only_new_posts(db_session):
    """A post already in the DB is skipped; the two unseen fake posts are persisted."""
    seeded = ScrapedPost(
        board_key="notice",
        board_name="공지사항",
        board_id=1926,
        article_id=9001,
        title="Existing notice",
        post_url="https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
        author="admin",
        published_at=datetime(2026, 3, 1),
        summary="Already stored post",
        content_text="Already stored content",
        attachments=[],
    )
    db_session.add(seeded)
    db_session.commit()

    service = CrawlService(db_session)
    service.crawler = FakeCrawler()

    response = service.crawl_new_posts()

    # Incremental mode: only the articles not already stored are reported.
    assert response.bootstrap_mode is False
    assert response.bootstrap_inserted_count == 0
    assert response.new_posts_count == 2
    assert [post.article_id for post in response.new_posts] == [9002, 9101]
    assert response.latest_posts_by_board == []

    stored = db_session.scalars(
        select(ScrapedPost).order_by(ScrapedPost.board_key, ScrapedPost.article_id)
    ).all()
    assert len(stored) == 3

    # The most recent CrawlRun records a successful incremental pass.
    latest_run = db_session.scalars(select(CrawlRun).order_by(CrawlRun.id.desc())).first()
    assert latest_run is not None
    assert latest_run.status == "success"
    assert latest_run.inserted_count == 2
|
|
|
|
|
|
def test_crawl_service_returns_zero_when_no_new_posts(db_session):
    """When every board only serves already-stored posts, nothing new is reported."""

    class NoNewPostCrawler(FakeCrawler):
        # Serves exactly one post per board — the same one seeded below.
        def crawl_board_list(self, board_key: str, page: int = 1):
            if page > 1:
                return []
            board = BOARD_CONFIG[board_key]
            return [
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=1,
                    title="Existing post",
                    post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                    published_at=datetime(2026, 3, 1),
                )
            ]

    # Seed one stored post per configured board so nothing counts as new.
    for board_key, board in BOARD_CONFIG.items():
        db_session.add(
            ScrapedPost(
                board_key=board_key,
                board_name=board["board_name"],
                board_id=board["board_id"],
                article_id=1,
                title="Existing post",
                post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                author="admin",
                published_at=datetime(2026, 3, 1),
                summary="Existing summary",
                content_text="Existing content",
                attachments=[],
            )
        )
    db_session.commit()

    service = CrawlService(db_session)
    service.crawler = NoNewPostCrawler()

    response = service.crawl_new_posts()

    assert response.bootstrap_mode is False
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # With no new posts, the service falls back to the latest post per board.
    assert [post.board_key for post in response.latest_posts_by_board] == ["notice", "archive", "jobs"]
|
|
|
|
|
|
def test_crawl_service_bootstrap_saves_posts_without_returning_them(db_session):
    """On an empty database the first crawl bootstraps: all posts are saved but none reported as new."""
    service = CrawlService(db_session)
    service.crawler = FakeCrawler()

    response = service.crawl_new_posts()

    assert response.bootstrap_mode is True
    assert response.bootstrap_inserted_count == 3
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # Only boards that actually yielded posts appear in the latest-by-board list.
    assert [post.board_key for post in response.latest_posts_by_board] == ["notice", "archive"]

    # All three fixture posts from FakeCrawler landed in the database.
    assert len(db_session.scalars(select(ScrapedPost)).all()) == 3