# news-summary-bot/app/transcript.py

import os
import shutil
import tempfile
from urllib.parse import parse_qs, urlsplit

import httpx
import yt_dlp

# Path of the optional cookies file; when it exists, fetch_transcript passes a
# temporary copy of it to yt-dlp via the "cookiefile" option.
COOKIES_SRC = "/app/cookies.txt"


class SkipVideo(Exception):
    """Raised for videos that are not summarization targets (live streams, Shorts, etc.)."""


def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supports ``youtu.be/<id>`` short links and any URL that carries the ID in
    a ``v`` query parameter (for example ``youtube.com/watch?v=<id>``), with
    or without a scheme. Other query parameters, URL fragments and trailing
    slashes are ignored.

    Args:
        url: The YouTube URL to parse.

    Returns:
        The video ID.

    Raises:
        SkipVideo: If the URL points at a Shorts video.
        ValueError: If no video ID can be found in the URL.
    """
    if "/shorts/" in url:
        raise SkipVideo("쇼츠 영상은 요약 대상이 아닙니다")

    candidate = url.strip()
    try:
        parts = urlsplit(candidate)
        if not parts.netloc:
            # Scheme-less input such as "youtu.be/<id>": re-parse it as a
            # network-path reference so the host and path are split properly.
            parts = urlsplit(f"//{candidate}")
    except ValueError as exc:
        raise ValueError(f"유효하지 않은 YouTube URL: {url}") from exc

    host = (parts.hostname or "").lower()
    if host in ("youtu.be", "www.youtu.be"):
        # Short link: the ID is the first path segment.
        video_id = parts.path.strip("/").split("/")[0]
    else:
        # Look up the exact "v" parameter so names that merely end in "v="
        # (e.g. "rev=") are not mistaken for it.
        video_id = parse_qs(parts.query).get("v", [""])[0]

    if not video_id:
        raise ValueError(f"유효하지 않은 YouTube URL: {url}")
    return video_id


def fetch_transcript(video_id: str) -> str:
    """Fetch a YouTube video's auto-generated captions as plain text via yt-dlp.

    Korean captions are preferred, with English as the fallback. The caption
    track is downloaded in the ``json3`` format and its text segments are
    joined with single spaces.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The caption text as one space-separated string.

    Raises:
        SkipVideo: If the video is live or upcoming.
        ValueError: If no Korean/English automatic captions exist, or no
            ``json3`` track is offered for the chosen language.
        yt_dlp.utils.DownloadError: For any other yt-dlp extraction failure.
        httpx.HTTPError: If downloading the caption track fails.
    """
    url = f"https://www.youtube.com/watch?v={video_id}"

    ydl_opts = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
        "socket_timeout": 30,
    }

    cookie_copy = None
    try:
        if os.path.isfile(COOKIES_SRC):
            # yt-dlp gets a throwaway copy instead of COOKIES_SRC itself
            # (presumably so the original is never rewritten; TODO confirm).
            # mkstemp returns an open descriptor; close it immediately so it
            # does not leak, then copy the cookies over the empty file.
            fd, cookie_copy = tempfile.mkstemp(suffix=".txt")
            os.close(fd)
            shutil.copy2(COOKIES_SRC, cookie_copy)
            ydl_opts["cookiefile"] = cookie_copy

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, ie_key="Youtube", download=False, process=False)
    except yt_dlp.utils.DownloadError as e:
        err_msg = str(e).lower()
        if "live event" in err_msg or "is live" in err_msg or "premieres in" in err_msg:
            raise SkipVideo("라이브/예정 영상은 요약 대상이 아닙니다") from e
        raise
    finally:
        # Always remove the temporary cookie copy, even if the copy or the
        # extraction failed part-way through.
        if cookie_copy is not None:
            try:
                os.unlink(cookie_copy)
            except FileNotFoundError:
                pass

    if info.get("is_live") or info.get("live_status") in ("is_live", "is_upcoming"):
        raise SkipVideo("라이브/예정 영상은 요약 대상이 아닙니다")

    # Treat a missing or None caption map the same way: no captions.
    subs = info.get("automatic_captions") or {}
    lang = next((code for code in ("ko", "en") if code in subs), None)
    if lang is None:
        raise ValueError(f"자막을 찾을 수 없습니다: {video_id}")

    sub_url = next(
        (fmt.get("url") for fmt in subs[lang] if fmt.get("ext") == "json3"),
        None,
    )
    if not sub_url:
        raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}")

    resp = httpx.get(sub_url, timeout=30.0)
    resp.raise_for_status()
    data = resp.json()

    texts = []
    for event in data.get("events") or []:
        for seg in event.get("segs") or []:
            # strip() also drops newline-only segments, so a truthiness test
            # is enough to skip empty pieces.
            text = (seg.get("utf8") or "").strip()
            if text:
                texts.append(text)

    return " ".join(texts)