Fix: [2.0.2] yt-dlp → youtube-transcript-api로 교체
All checks were successful
news-summary-bot-cicd / build_push_deploy (push) Successful in 9m6s
All checks were successful
news-summary-bot-cicd / build_push_deploy (push) Successful in 9m6s
OCI 서버에서 YouTube 봇 감지로 yt-dlp가 차단됨. 자막 전용 라이브러리로 교체하여 클라우드 IP 환경에서도 동작하도록 수정.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,4 @@
|
|||||||
import httpx
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
import yt_dlp
|
|
||||||
|
|
||||||
|
|
||||||
def extract_video_id(url: str) -> str:
|
def extract_video_id(url: str) -> str:
|
||||||
@@ -12,44 +11,9 @@ def extract_video_id(url: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def fetch_transcript(video_id: str) -> str:
|
def fetch_transcript(video_id: str) -> str:
|
||||||
"""yt-dlp로 YouTube 자동생성 자막을 텍스트로 추출."""
|
"""YouTube 자막을 텍스트로 추출."""
|
||||||
url = f"https://www.youtube.com/watch?v={video_id}"
|
ytt_api = YouTubeTranscriptApi()
|
||||||
|
transcript = ytt_api.fetch(video_id, languages=["ko", "en"])
|
||||||
ydl_opts = {
|
|
||||||
"skip_download": True,
|
|
||||||
"writeautomaticsub": True,
|
|
||||||
"subtitleslangs": ["ko", "en"],
|
|
||||||
"subtitlesformat": "json3",
|
|
||||||
"quiet": True,
|
|
||||||
"no_warnings": True,
|
|
||||||
}
|
|
||||||
|
|
||||||
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
|
|
||||||
info = ydl.extract_info(url, download=False)
|
|
||||||
|
|
||||||
subs = info.get("automatic_captions", {})
|
|
||||||
lang = "ko" if "ko" in subs else "en" if "en" in subs else None
|
|
||||||
if not lang:
|
|
||||||
raise ValueError(f"자막을 찾을 수 없습니다: {video_id}")
|
|
||||||
|
|
||||||
sub_url = None
|
|
||||||
for fmt in subs[lang]:
|
|
||||||
if fmt["ext"] == "json3":
|
|
||||||
sub_url = fmt["url"]
|
|
||||||
break
|
|
||||||
|
|
||||||
if not sub_url:
|
|
||||||
raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}")
|
|
||||||
|
|
||||||
resp = httpx.get(sub_url)
|
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.json()
|
|
||||||
|
|
||||||
texts = []
|
|
||||||
for event in data.get("events", []):
|
|
||||||
for seg in event.get("segs", []):
|
|
||||||
text = seg.get("utf8", "").strip()
|
|
||||||
if text and text != "\n":
|
|
||||||
texts.append(text)
|
|
||||||
|
|
||||||
|
texts = [entry.text for entry in transcript if entry.text.strip()]
|
||||||
return " ".join(texts)
|
return " ".join(texts)
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
fastapi==0.115.12
|
fastapi==0.115.12
|
||||||
uvicorn==0.34.2
|
uvicorn==0.34.2
|
||||||
yt-dlp>=2025.3.31
|
youtube-transcript-api==1.0.3
|
||||||
anthropic==0.52.0
|
anthropic==0.52.0
|
||||||
httpx==0.28.1
|
httpx==0.28.1
|
||||||
pydantic-settings==2.8.1
|
pydantic-settings==2.8.1
|
||||||
|
|||||||
Reference in New Issue
Block a user