# Tests for CrawlService: incremental crawling, no-op runs, and bootstrap mode.
from datetime import datetime

from sqlalchemy import select

from app.crawler import BOARD_CONFIG, PostDetail, PostStub
from app.models import CrawlRun, ScrapedPost
from app.service import CrawlService


class FakeCrawler:
    """Deterministic in-memory stand-in for the real crawler.

    Serves a fixed fixture set: two "notice" posts, one "archive" post,
    and nothing for "jobs". Only page 1 ever has content, which lets the
    service's pagination loop terminate immediately.
    """

    # Fixture rows per board: (article_id, title, post_url, published_at).
    _ROWS = {
        "notice": [
            (
                9001,
                "Existing notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
                datetime(2026, 3, 1),
            ),
            (
                9002,
                "New notice",
                "https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9002",
                datetime(2026, 3, 2),
            ),
        ],
        "archive": [
            (
                9101,
                "New archive post",
                "https://computer.hufs.ac.kr/computer/10059/subview.do?enc=archive-9101",
                datetime(2026, 3, 3),
            ),
        ],
    }

    def __init__(self):
        # Mirrors the real crawler's pagination cap, read by CrawlService.
        self.max_pages_per_board = 2

    def crawl_board_list(self, board_key: str, page: int = 1):
        """Return the fixture PostStub rows for page 1 of *board_key*.

        Pages beyond the first, and boards without fixtures (e.g. "jobs"),
        yield an empty list. Unknown board keys raise KeyError, matching
        the BOARD_CONFIG lookup the real crawler performs.
        """
        if page > 1:
            return []

        board = BOARD_CONFIG[board_key]
        return [
            PostStub(
                board_key=board_key,
                board_name=board["board_name"],
                board_id=board["board_id"],
                article_id=article_id,
                title=title,
                post_url=post_url,
                published_at=published_at,
            )
            for article_id, title, post_url, published_at in self._ROWS.get(board_key, [])
        ]

    def crawl_post_detail(self, stub: PostStub):
        """Expand *stub* into a full PostDetail with a synthetic body and one attachment."""
        attachment = {
            "name": f"{stub.article_id}.pdf",
            "url": f"https://example.com/files/{stub.article_id}.pdf",
        }
        return PostDetail(
            board_key=stub.board_key,
            board_name=stub.board_name,
            board_id=stub.board_id,
            article_id=stub.article_id,
            title=stub.title,
            post_url=stub.post_url,
            author="admin",
            published_at=stub.published_at,
            summary=f"{stub.title} summary",
            content_text=f"{stub.title} content",
            attachments=[attachment],
        )
|
|
|
|
|
|
def test_crawl_service_saves_only_new_posts(db_session):
    """A post already in the DB is skipped; the two unseen fake posts are persisted."""
    seeded = ScrapedPost(
        board_key="notice",
        board_name="공지사항",
        board_id=1926,
        article_id=9001,
        title="Existing notice",
        post_url="https://computer.hufs.ac.kr/computer/10058/subview.do?enc=notice-9001",
        author="admin",
        published_at=datetime(2026, 3, 1),
        summary="Already stored post",
        content_text="Already stored content",
        attachments=[],
    )
    db_session.add(seeded)
    db_session.commit()

    service = CrawlService(db_session)
    service.crawler = FakeCrawler()

    response = service.crawl_new_posts()

    # Incremental mode: only the articles not already stored are reported.
    assert response.bootstrap_mode is False
    assert response.bootstrap_inserted_count == 0
    assert response.new_posts_count == 2
    assert [post.article_id for post in response.new_posts] == [9002, 9101]
    assert response.latest_posts_by_board == []

    stored = db_session.scalars(
        select(ScrapedPost).order_by(ScrapedPost.board_key, ScrapedPost.article_id)
    ).all()
    assert len(stored) == 3

    # The most recent CrawlRun records a successful incremental pass.
    latest_run = db_session.scalars(select(CrawlRun).order_by(CrawlRun.id.desc())).first()
    assert latest_run is not None
    assert latest_run.status == "success"
    assert latest_run.inserted_count == 2
|
|
|
|
|
|
def test_crawl_service_returns_zero_when_no_new_posts(db_session):
    """When every board only serves already-stored posts, nothing new is reported."""

    class NoNewPostCrawler(FakeCrawler):
        # Serves exactly one post per board — the same one seeded below.
        def crawl_board_list(self, board_key: str, page: int = 1):
            if page > 1:
                return []
            board = BOARD_CONFIG[board_key]
            return [
                PostStub(
                    board_key=board_key,
                    board_name=board["board_name"],
                    board_id=board["board_id"],
                    article_id=1,
                    title="Existing post",
                    post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                    published_at=datetime(2026, 3, 1),
                )
            ]

    # Seed one stored post per configured board so nothing counts as new.
    for board_key, board in BOARD_CONFIG.items():
        db_session.add(
            ScrapedPost(
                board_key=board_key,
                board_name=board["board_name"],
                board_id=board["board_id"],
                article_id=1,
                title="Existing post",
                post_url=f"https://computer.hufs.ac.kr/computer/99999/subview.do?enc={board_key}-1",
                author="admin",
                published_at=datetime(2026, 3, 1),
                summary="Existing summary",
                content_text="Existing content",
                attachments=[],
            )
        )
    db_session.commit()

    service = CrawlService(db_session)
    service.crawler = NoNewPostCrawler()

    response = service.crawl_new_posts()

    assert response.bootstrap_mode is False
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # With no new posts, the service falls back to the latest post per board.
    assert [post.board_key for post in response.latest_posts_by_board] == ["notice", "archive", "jobs"]
|
|
|
|
|
|
def test_crawl_service_bootstrap_saves_posts_without_returning_them(db_session):
    """On an empty database the first crawl bootstraps: all posts are saved but none reported as new."""
    service = CrawlService(db_session)
    service.crawler = FakeCrawler()

    response = service.crawl_new_posts()

    assert response.bootstrap_mode is True
    assert response.bootstrap_inserted_count == 3
    assert response.new_posts_count == 0
    assert response.new_posts == []
    # Only boards that actually yielded posts appear in the latest-by-board list.
    assert [post.board_key for post in response.latest_posts_by_board] == ["notice", "archive"]

    # All three fixture posts from FakeCrawler landed in the database.
    assert len(db_session.scalars(select(ScrapedPost)).all()) == 3