Files
news-summary-bot/app/transcript.py
sm4640 c8c9c592cf
All checks were successful
news-summary-bot-cicd / build_push_deploy (push) Successful in 15m23s
Fix: [2.0.8] process=False로 포맷 처리 건너뛰고 자막만 추출
extract_info에 process=False 적용하여 포맷 선택 단계를 완전히 스킵.
자막 정보는 YouTube extractor에서 직접 반환되므로 포맷 처리 불필요.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-24 17:12:23 +09:00

68 lines
1.9 KiB
Python

import os
import shutil
import tempfile
from urllib.parse import parse_qs, urlsplit

import httpx
import yt_dlp

# Path of the cookies file. When this file exists, fetch_transcript copies it
# to a temporary file and hands that copy to yt-dlp as its "cookiefile".
# NOTE(review): presumably the copy keeps yt-dlp from writing to the original
# file — confirm.
COOKIES_SRC = "/app/cookies.txt"
def extract_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Supported shapes (the scheme is optional):

    * ``youtu.be/<id>``
    * any URL carrying a ``v=<id>`` query parameter (``/watch?v=<id>``)
    * ``youtube.com/shorts/<id>``, ``/embed/<id>`` and ``/live/<id>``

    The URL is parsed instead of being split on substrings, so fragments
    (``#t=10``), trailing path segments and parameters whose name merely
    ends in ``v`` (``rev=1``) can no longer leak into the result.

    Args:
        url: The YouTube URL to inspect.

    Returns:
        The video ID.

    Raises:
        ValueError: If no non-empty video ID can be found in ``url``.
    """
    error_message = f"유효하지 않은 YouTube URL: {url}"

    candidate = url.strip()
    # urlsplit only recognises the host when a scheme (or "//") is present.
    if not candidate.lower().startswith(("http://", "https://", "//")):
        candidate = f"https://{candidate}"
    try:
        parts = urlsplit(candidate)
    except ValueError:
        # e.g. malformed IPv6 host; report it with the usual message.
        raise ValueError(error_message) from None

    host = (parts.hostname or "").lower()
    segments = [segment for segment in parts.path.split("/") if segment]

    if host == "youtu.be" or host.endswith(".youtu.be"):
        # Short links keep the ID as the first path segment.
        video_id = segments[0] if segments else ""
    else:
        video_id = parse_qs(parts.query).get("v", [""])[0]
        is_youtube = host == "youtube.com" or host.endswith(".youtube.com")
        has_id_path = (
            len(segments) >= 2 and segments[0] in ("shorts", "embed", "live")
        )
        if not video_id and is_youtube and has_id_path:
            video_id = segments[1]

    if not video_id:
        raise ValueError(error_message)
    return video_id
def fetch_transcript(video_id: str) -> str:
    """Extract a YouTube video's auto-generated captions as text via yt-dlp.

    Korean captions are preferred, English is the fallback. The caption
    track is downloaded in the ``json3`` format and every non-empty text
    segment is joined with a single space.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The caption text as one space-separated string.

    Raises:
        ValueError: If the video has neither Korean nor English automatic
            captions, or the chosen language offers no ``json3`` track.
        httpx.HTTPStatusError: If the caption download returns an error
            status (raised by ``raise_for_status``).
    """
    url = f"https://www.youtube.com/watch?v={video_id}"
    ydl_opts = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
    }

    cookie_copy = None
    try:
        if os.path.isfile(COOKIES_SRC):
            # Give yt-dlp a throw-away copy of the cookies file. mkstemp
            # returns an open descriptor; close it right away so no handle
            # leaks. The copy is made inside the try so the temp file is
            # removed even when the copy itself fails.
            fd, cookie_copy = tempfile.mkstemp(suffix=".txt")
            os.close(fd)
            shutil.copy2(COOKIES_SRC, cookie_copy)
            ydl_opts["cookiefile"] = cookie_copy
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # process=False skips the format-selection step; the caption
            # listing is returned by the YouTube extractor directly.
            info = ydl.extract_info(
                url, ie_key="Youtube", download=False, process=False
            )
    finally:
        if cookie_copy is not None:
            os.unlink(cookie_copy)

    # The key may be absent or explicitly None; treat both as "no captions".
    subs = (info or {}).get("automatic_captions") or {}
    lang = next((code for code in ("ko", "en") if code in subs), None)
    if not lang:
        raise ValueError(f"자막을 찾을 수 없습니다: {video_id}")

    sub_url = None
    for fmt in subs[lang] or []:
        if fmt.get("ext") == "json3":
            sub_url = fmt.get("url")
            break
    if not sub_url:
        raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}")

    resp = httpx.get(sub_url)
    resp.raise_for_status()
    data = resp.json()

    texts = []
    for event in data.get("events", []):
        for seg in event.get("segs", []):
            # strip() already turns newline-only segments into "".
            text = (seg.get("utf8") or "").strip()
            if text:
                texts.append(text)
    return " ".join(texts)