"""Tests for CrawlService: incremental crawl, no-op runs, and bootstrap mode."""

from datetime import datetime

from sqlalchemy import select

from app.crawler import BOARD_CONFIG, PostDetail, PostStub
from app.models import CrawlRun, ScrapedPost
from app.service import CrawlService


class FakeCrawler:
    """Deterministic stand-in for the real crawler.

    Serves a fixed set of stubs per board (pages beyond the first are empty)
    and synthesizes a predictable ``PostDetail`` for any stub.
    """

    # (article_id, title, post_url, published_at) rows keyed by board.
    _STUB_ROWS = {
        "notice": [
            (
                9001,
                "Existing notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
                datetime(2026, 3, 1),
            ),
            (
                9002,
                "New notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9002",
                datetime(2026, 3, 2),
            ),
        ],
        "archive": [
            (
                9101,
                "New archive post",
                "https://computer.hufs.ac.kr/computer/10059/subview.do?enc=archive-9101",
                datetime(2026, 3, 3),
            ),
        ],
        "jobs": [],
    }

    def __init__(self):
        self.max_pages_per_board = 2

    def crawl_board_list(self, board_key: str, page: int = 1):
        """Return the canned stubs for *board_key*; only page 1 has content."""
        if page > 1:
            return []
        cfg = BOARD_CONFIG[board_key]
        return [
            PostStub(
                board_key=board_key,
                board_name=cfg["board_name"],
                board_id=cfg["board_id"],
                article_id=article_id,
                title=title,
                post_url=post_url,
                published_at=published_at,
            )
            for article_id, title, post_url, published_at in self._STUB_ROWS.get(board_key, [])
        ]

    def crawl_post_detail(self, stub: PostStub):
        """Expand a stub into a full ``PostDetail`` with synthetic content."""
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=stub.title,
            post_url=stub.post_url,
            author="admin",
            published_at=stub.published_at,
            summary=f"{stub.title} summary",
            content_text=f"{stub.title} content",
            attachments=[
                {
                    "name": f"{stub.article_id}.pdf",
                    "url": f"https://example.com/files/{stub.article_id}.pdf",
                }
            ],
        )


def test_crawl_service_saves_only_new_posts(db_session):
    """Articles already stored are skipped; only unseen ones are inserted."""
    db_session.add(
        ScrapedPost(
            board_key="notice",
            board_name="공지사항",
            board_id=1926,
            article_id=9001,
            title="Existing notice",
            post_url="https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
            author="admin",
            published_at=datetime(2026, 3, 1),
            summary="Already stored post",
            content_text="Already stored content",
            attachments=[],
        )
    )
    db_session.commit()

    service = CrawlService(db_session)
    service.crawler = FakeCrawler()
    response = service.crawl_new_posts()

    # DB was non-empty, so this is a normal (non-bootstrap) run that should
    # surface exactly the two articles the fake crawler serves beyond 9001.
    assert response.bootstrap_mode is False
    assert response.bootstrap_inserted_count == 0
    assert response.new_posts_count == 2
    assert [post.article_id for post in response.new_posts] == [9002, 9101]
    assert response.latest_posts_by_board == []

    stored = db_session.scalars(
        select(ScrapedPost).order_by(ScrapedPost.board_key, ScrapedPost.article_id)
    ).all()
    assert len(stored) == 3

    run = db_session.scalars(select(CrawlRun).order_by(CrawlRun.id.desc())).first()
    assert run is not None
    assert run.status == "success"
    assert run.inserted_count == 2


def test_crawl_service_returns_zero_when_no_new_posts(db_session):
    """When every crawled stub is already stored, nothing new is reported."""
    db_session.add_all(
        [
            ScrapedPost(
                board_key=board_key,
                board_name=board["board_name"],
                board_id=board["board_id"],
                article_id=1,
                title="Existing post",
                post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                author="admin",
                published_at=datetime(2026, 3, 1),
                summary="Existing summary",
                content_text="Existing content",
                attachments=[],
            )
            for board_key, board in BOARD_CONFIG.items()
        ]
    )
    db_session.commit()

    class NoNewPostCrawler(FakeCrawler):
        """Variant that only ever serves the single already-stored post per board."""

        def crawl_board_list(self, board_key: str, page: int = 1):
            if page > 1:
                return []
            cfg = BOARD_CONFIG[board_key]
            return [
                PostStub(
                    board_key=board_key,
                    board_name=cfg["board_name"],
                    board_id=cfg["board_id"],
                    article_id=1,
                    title="Existing post",
                    post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                    published_at=datetime(2026, 3, 1),
                )
            ]

    service = CrawlService(db_session)
    service.crawler = NoNewPostCrawler()
    response = service.crawl_new_posts()

    assert response.bootstrap_mode is False
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # With no new posts, the service reports the latest stored post per board.
    assert [post.board_key for post in response.latest_posts_by_board] == [
        "notice",
        "archive",
        "jobs",
    ]


def test_crawl_service_bootstrap_saves_posts_without_returning_them(db_session):
    """A first run against an empty DB stores everything but reports no new posts."""
    service = CrawlService(db_session)
    service.crawler = FakeCrawler()
    response = service.crawl_new_posts()

    assert response.bootstrap_mode is True
    assert response.bootstrap_inserted_count == 3
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # The "jobs" board serves nothing, so only boards that yielded posts appear.
    assert [post.board_key for post in response.latest_posts_by_board] == ["notice", "archive"]

    assert len(db_session.scalars(select(ScrapedPost)).all()) == 3