Feat: [main] news-summary-bot 완성

2026-03-24 12:19:54 +09:00
commit dc4656e452
21 changed files with 1028 additions and 0 deletions
--- a/app/__init__.py
+++ b/app/__init__.py
--- a/app/config.py
+++ b/app/config.py
@@ -0,0 +1,12 @@
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    # Application configuration model built on pydantic-settings.
+
+    # Read values from a local `.env` file as well; entries that do not map
+    # to a field below are ignored.
+    model_config = dict(env_file=".env", extra="ignore")
+
+    # Required: neither field has a default.
+    anthropic_api_key: str
+    discord_webhook_url: str
+
+    # Optional shared secret; defaults to the empty string.
+    api_secret: str = ""
+
+
+# Module-level configuration instance, constructed once when this module is
+# first imported.
+settings = Settings()
--- a/app/discord.py
+++ b/app/discord.py
@@ -0,0 +1,109 @@
+import re
+from datetime import datetime, timezone
+
+import httpx
+
+from app.config import settings
+
+
+def _extract_video_id(video_url: str) -> str | None:
+    """Extract the YouTube video ID from a URL.
+
+    Recognises ``youtu.be/<id>`` short links, the ``v=<id>`` query parameter
+    and ``/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>`` paths.
+
+    Args:
+        video_url: URL to inspect.
+
+    Returns:
+        The video ID, or ``None`` when none of the supported forms is found.
+    """
+    # The ID ends at the first query, fragment or path delimiter, so "#t=30"
+    # or a trailing "/" never becomes part of the returned ID. "v=" must be
+    # a real query parameter ("?v=" / "&v="), not the tail of another name.
+    patterns = (
+        r"youtu\.be/([^?&#/]+)",
+        r"[?&]v=([^?&#/]+)",
+        r"/(?:shorts|embed|live)/([^?&#/]+)",
+    )
+    for pattern in patterns:
+        match = re.search(pattern, video_url)
+        if match:
+            return match.group(1)
+    return None
+
+
+def _parse_summary(summary: str) -> dict[str, str]:
+    """Split a summary into its labelled sections.
+
+    A line opens a section when it starts with a markdown heading
+    (``## 한줄 요약``), a bold label (``**한줄 요약**: ...``) or a bulleted
+    bold label (``- **한줄 요약**: ...``) naming one of 한줄 요약, 주요 내용
+    or 결론/시사점. Whitespace inside the label and around the slash is
+    tolerated. Text after the label on the same line, plus every following
+    line up to the next header, forms the section body. Lines before the
+    first header are discarded, and a repeated header replaces the earlier
+    section.
+
+    Args:
+        summary: Raw summary text.
+
+    Returns:
+        Mapping from the label with all whitespace removed (``"한줄요약"``,
+        ``"주요내용"``, ``"결론/시사점"`` or ``"결론시사점"``) to the stripped
+        section body. Empty when no header is found.
+    """
+    header_re = re.compile(
+        r"^(?:#{1,6}\s*|[-*]\s*\*\*|\*\*)"
+        r"(한줄\s*요약|주요\s*내용|결론\s*/?\s*시사점)"
+        # Greedily swallows the closing "**", the colon and padding, so the
+        # captured remainder never starts with markup.
+        r"[:\*\s]*(.*)"
+    )
+
+    sections: dict[str, str] = {}
+    current_key: str | None = None
+    current_lines: list[str] = []
+
+    # splitlines() also copes with "\r\n" input.
+    for line in summary.splitlines():
+        match = header_re.match(line)
+        if match:
+            if current_key is not None:
+                sections[current_key] = "\n".join(current_lines).strip()
+            current_key = re.sub(r"\s+", "", match.group(1))
+            rest = match.group(2).strip()
+            current_lines = [rest] if rest else []
+        elif current_key is not None:
+            current_lines.append(line)
+
+    if current_key is not None:
+        sections[current_key] = "\n".join(current_lines).strip()
+
+    return sections
+
+
+async def send_to_discord(title: str, video_url: str, summary: str) -> None:
+    """Post a summary to the configured Discord webhook as a single embed.
+
+    When the summary parses into sections, the one-line summary becomes the
+    embed description and the main points and conclusion become fields.
+    When neither a one-line summary nor main points are found, the raw
+    summary text is used as the description instead. A link field to the
+    video is always appended, and a thumbnail is attached when a video ID
+    can be extracted from the URL.
+
+    Args:
+        title: Video title shown as the embed title.
+        video_url: Link to the original video.
+        summary: Summary text, ideally in the sectioned format.
+
+    Raises:
+        httpx.HTTPStatusError: If the webhook responds with an error status.
+    """
+    # Discord rejects the whole message when an embed part exceeds its
+    # documented limit, so every free-text part is clamped before sending.
+    max_title, max_description, max_field_value = 256, 4096, 1024
+
+    video_id = _extract_video_id(video_url)
+    sections = _parse_summary(summary)
+
+    oneliner = sections.get("한줄요약", "")
+    main_points = sections.get("주요내용", "")
+    conclusion = sections.get("결론/시사점", sections.get("결론시사점", ""))
+
+    link_field = {"name": "🔗 원본 영상", "value": video_url, "inline": False}
+
+    if not oneliner and not main_points:
+        # Parsing failed: fall back to showing the raw text.
+        description = summary
+        fields = [link_field]
+    else:
+        description = f"### 💡 {oneliner}" if oneliner else ""
+        fields = [
+            {"name": name, "value": value[:max_field_value], "inline": False}
+            for name, value in (
+                ("📋 주요 내용", main_points),
+                ("🎯 결론 / 시사점", conclusion),
+            )
+            if value
+        ]
+        fields.append(link_field)
+
+    embed = {
+        "title": f"📰 {title}"[:max_title],
+        "url": video_url,
+        "description": description[:max_description],
+        "color": 0x2B2D31,
+        "fields": fields,
+        "footer": {
+            "text": "YouTube 뉴스 요약 봇",
+            "icon_url": "https://www.youtube.com/s/desktop/f5ced909/img/favicon_144x144.png",
+        },
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+    }
+
+    if video_id:
+        embed["thumbnail"] = {
+            "url": f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
+        }
+
+    async with httpx.AsyncClient() as client:
+        resp = await client.post(
+            settings.discord_webhook_url, json={"embeds": [embed]}
+        )
+        resp.raise_for_status()
--- a/app/main.py
+++ b/app/main.py
@@ -0,0 +1,36 @@
+from fastapi import FastAPI, Header, HTTPException
+from pydantic import BaseModel
+
+from app.config import settings
+from app.discord import send_to_discord
+from app.summarizer import summarize
+from app.transcript import extract_video_id, fetch_transcript
+
+# FastAPI application object; the route decorators in this module register
+# their handlers on it.
+app = FastAPI(title="News Summary Bot")
+
+
+class SummarizeRequest(BaseModel):
+    # Request body of POST /api/news/summarize.
+
+    # URL of the video to summarize (required).
+    video_url: str
+    # Optional display title; when empty the handler uses the video ID.
+    title: str = ""
+
+
+@app.post("/api/news/summarize")
+async def summarize_video(
+    req: SummarizeRequest,
+    x_api_secret: str = Header(default=""),
+):
+    """Summarize a YouTube video and post the result to Discord.
+
+    Fetches the captions for the video, asks the summarizer for a summary
+    and forwards it to the Discord webhook.
+
+    Args:
+        req: Request body carrying the video URL and an optional title.
+        x_api_secret: Value of the ``X-Api-Secret`` header; only checked when
+            ``settings.api_secret`` is non-empty.
+
+    Returns:
+        A dict with ``status``, the ``title`` that was used and the
+        ``summary_length`` in characters.
+
+    Raises:
+        HTTPException: 401 when the secret does not match, 400 when no video
+            ID can be extracted from the URL, 404 when no usable captions
+            are found.
+    """
+    import asyncio
+    import secrets
+
+    # Constant-time comparison so response timing does not leak how much of
+    # the secret matched; bytes are used because compare_digest only accepts
+    # ASCII-only str values.
+    if settings.api_secret and not secrets.compare_digest(
+        x_api_secret.encode(), settings.api_secret.encode()
+    ):
+        raise HTTPException(status_code=401, detail="Unauthorized")
+
+    try:
+        video_id = extract_video_id(req.video_url)
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    # fetch_transcript and summarize are blocking, synchronous calls; run
+    # them in a worker thread so the event loop keeps serving other requests.
+    try:
+        transcript = await asyncio.to_thread(fetch_transcript, video_id)
+    except ValueError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+
+    title = req.title or video_id
+    summary = await asyncio.to_thread(summarize, transcript, title)
+    await send_to_discord(title, req.video_url, summary)
+
+    return {"status": "ok", "title": title, "summary_length": len(summary)}
+
+
+# Health-check endpoint: returns a constant {"status": "ok"} body and calls
+# nothing else.
+@app.get("/api/news/health")
+async def health():
+    return {"status": "ok"}
--- a/app/summarizer.py
+++ b/app/summarizer.py
@@ -0,0 +1,34 @@
+import anthropic
+
+from app.config import settings
+
+# Anthropic client created once at import time with the API key from
+# settings; used by summarize() below.
+client = anthropic.Anthropic(api_key=settings.anthropic_api_key)
+
+# System prompt (in Korean). It asks for a summary with three labelled
+# parts - one-line summary, main points as 3-7 bullets, and conclusion /
+# implications - written in Korean, concise, and ignoring caption typos and
+# stutters. The bold labels under "형식" are the section headers that
+# _parse_summary in app/discord.py looks for, so change both together.
+SYSTEM_PROMPT = """너는 뉴스/경제 유튜브 영상 요약 전문가야.
+영상 자막 텍스트를 받아서 아래 형식으로 요약해줘.
+
+## 형식
+- **한줄 요약**: 영상의 핵심을 한 문장으로
+- **주요 내용**: 핵심 포인트를 3~7개 불릿으로 정리
+- **결론/시사점**: 영상이 전달하려는 메시지나 시사점
+
+## 규칙
+- 한국어로 작성
+- 간결하고 명확하게
+- 자막의 오타나 말더듬은 무시하고 의미 중심으로 정리
+"""
+
+
+def summarize(
+    transcript: str,
+    title: str,
+    *,
+    model: str = "claude-sonnet-4-20250514",
+    max_tokens: int = 2048,
+) -> str:
+    """Summarize a video transcript with the Anthropic Messages API.
+
+    Args:
+        transcript: Caption text of the video.
+        title: Video title, passed to the model as context.
+        model: Model identifier to request.
+        max_tokens: Upper bound on the length of the generated summary.
+
+    Returns:
+        The text of the model's reply.
+
+    Raises:
+        RuntimeError: If the reply contains no text content.
+    """
+    message = client.messages.create(
+        model=model,
+        max_tokens=max_tokens,
+        system=SYSTEM_PROMPT,
+        messages=[
+            {
+                "role": "user",
+                "content": f"영상 제목: {title}\n\n자막:\n{transcript}",
+            }
+        ],
+    )
+    # The reply is a list of content blocks; keep only the text ones rather
+    # than assuming the first block exists and is text.
+    text = "".join(
+        block.text
+        for block in message.content
+        if getattr(block, "type", None) == "text"
+    )
+    if not text:
+        raise RuntimeError("Anthropic response contained no text content")
+    return text
--- a/app/transcript.py
+++ b/app/transcript.py
@@ -0,0 +1,55 @@
+import httpx
+import yt_dlp
+
+
+def extract_video_id(url: str) -> str:
+    """Extract the video ID from a YouTube URL.
+
+    Supports ``youtu.be/<id>`` short links, the ``v`` query parameter and
+    ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths.
+    A URL without a scheme (``youtu.be/<id>``) is accepted as well.
+
+    Args:
+        url: URL to parse.
+
+    Returns:
+        The video ID.
+
+    Raises:
+        ValueError: If no non-empty video ID can be found in ``url``.
+    """
+    from urllib.parse import parse_qs, urlparse
+
+    candidate = url.strip()
+    if "://" not in candidate:
+        # Without a scheme urlparse would treat the host as part of the path.
+        candidate = f"https://{candidate}"
+
+    video_id = ""
+    try:
+        parsed = urlparse(candidate)
+        host = (parsed.hostname or "").lower()
+    except ValueError:
+        # Malformed netloc; fall through to the uniform error below.
+        parsed, host = None, ""
+
+    if parsed is not None:
+        segments = [part for part in parsed.path.split("/") if part]
+        if host == "youtu.be" or host.endswith(".youtu.be"):
+            video_id = segments[0] if segments else ""
+        else:
+            # Parsing the query properly avoids mistaking "sv=1" for "v=1"
+            # and keeps "#fragment" out of the ID.
+            values = parse_qs(parsed.query).get("v")
+            if values:
+                video_id = values[0]
+            elif len(segments) >= 2 and segments[0] in {
+                "shorts",
+                "embed",
+                "live",
+                "v",
+            }:
+                video_id = segments[1]
+
+    if not video_id:
+        raise ValueError(f"유효하지 않은 YouTube URL: {url}")
+    return video_id
+
+
+def fetch_transcript(video_id: str) -> str:
+    """Fetch a video's captions through yt-dlp and return them as plain text.
+
+    Auto-generated captions are tried first, then uploaded subtitles; within
+    each, Korean is preferred over English. The first candidate that offers
+    the ``json3`` format is downloaded and its text segments are joined with
+    single spaces.
+
+    Args:
+        video_id: YouTube video ID.
+
+    Returns:
+        The caption text as one space-separated string.
+
+    Raises:
+        ValueError: If no Korean or English track exists, or none of the
+            tracks found offers the ``json3`` format.
+        httpx.HTTPStatusError: If downloading the caption file fails with an
+            error status.
+    """
+    url = f"https://www.youtube.com/watch?v={video_id}"
+
+    ydl_opts = {
+        "skip_download": True,
+        "writeautomaticsub": True,
+        "subtitleslangs": ["ko", "en"],
+        "subtitlesformat": "json3",
+        "quiet": True,
+        "no_warnings": True,
+    }
+
+    # Only metadata is extracted; the caption file itself is fetched below.
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=False)
+
+    # `or {}` guards against the key being present but set to None.
+    candidates = [
+        formats
+        for source in ("automatic_captions", "subtitles")
+        for lang in ("ko", "en")
+        if (formats := (info.get(source) or {}).get(lang))
+    ]
+    if not candidates:
+        raise ValueError(f"자막을 찾을 수 없습니다: {video_id}")
+
+    sub_url = next(
+        (
+            fmt["url"]
+            for formats in candidates
+            for fmt in formats
+            if fmt.get("ext") == "json3" and fmt.get("url")
+        ),
+        None,
+    )
+    if not sub_url:
+        raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}")
+
+    resp = httpx.get(sub_url)
+    resp.raise_for_status()
+    data = resp.json()
+
+    texts = []
+    for event in data.get("events") or []:
+        for seg in event.get("segs") or []:
+            # strip() already reduces newline-only segments to "".
+            text = (seg.get("utf8") or "").strip()
+            if text:
+                texts.append(text)
+
+    return " ".join(texts)