import os import shutil import tempfile import httpx import yt_dlp COOKIES_SRC = "/app/cookies.txt" class SkipVideo(Exception): """라이브/쇼츠 등 요약 대상이 아닌 영상.""" def extract_video_id(url: str) -> str: """YouTube URL에서 video ID 추출.""" if "/shorts/" in url: raise SkipVideo("쇼츠 영상은 요약 대상이 아닙니다") if "youtu.be/" in url: return url.split("youtu.be/")[1].split("?")[0] if "v=" in url: return url.split("v=")[1].split("&")[0] raise ValueError(f"유효하지 않은 YouTube URL: {url}") def fetch_transcript(video_id: str) -> str: """yt-dlp로 YouTube 자동생성 자막을 텍스트로 추출.""" url = f"https://www.youtube.com/watch?v={video_id}" ydl_opts = { "skip_download": True, "quiet": True, "no_warnings": True, "socket_timeout": 30, } if os.path.isfile(COOKIES_SRC): tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".txt") shutil.copy2(COOKIES_SRC, tmp.name) ydl_opts["cookiefile"] = tmp.name try: with yt_dlp.YoutubeDL(ydl_opts) as ydl: info = ydl.extract_info(url, ie_key="Youtube", download=False, process=False) except yt_dlp.utils.DownloadError as e: err_msg = str(e).lower() if "live event" in err_msg or "is live" in err_msg or "premieres in" in err_msg: raise SkipVideo("라이브/예정 영상은 요약 대상이 아닙니다") raise finally: if "cookiefile" in ydl_opts: os.unlink(ydl_opts["cookiefile"]) if info.get("is_live") or info.get("live_status") in ("is_live", "is_upcoming"): raise SkipVideo("라이브/예정 영상은 요약 대상이 아닙니다") subs = info.get("automatic_captions", {}) lang = "ko" if "ko" in subs else "en" if "en" in subs else None if not lang: raise ValueError(f"자막을 찾을 수 없습니다: {video_id}") sub_url = None for fmt in subs[lang]: if fmt["ext"] == "json3": sub_url = fmt["url"] break if not sub_url: raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}") resp = httpx.get(sub_url, timeout=30.0) resp.raise_for_status() data = resp.json() texts = [] for event in data.get("events", []): for seg in event.get("segs", []): text = seg.get("utf8", "").strip() if text and text != "\n": texts.append(text) return " ".join(texts)