diff --git a/app/transcript.py b/app/transcript.py
index 51c47b1..6fb19f8 100644
--- a/app/transcript.py
+++ b/app/transcript.py
@@ -1,5 +1,8 @@
-import httpx
-import yt_dlp
+from youtube_transcript_api import (
+    NoTranscriptFound,
+    TranscriptsDisabled,
+    YouTubeTranscriptApi,
+)
 
 
 def extract_video_id(url: str) -> str:
@@ -12,44 +15,25 @@ def extract_video_id(url: str) -> str:
 
 
 def fetch_transcript(video_id: str) -> str:
-    """yt-dlp로 YouTube 자동생성 자막을 텍스트로 추출."""
-    url = f"https://www.youtube.com/watch?v={video_id}"
-
-    ydl_opts = {
-        "skip_download": True,
-        "writeautomaticsub": True,
-        "subtitleslangs": ["ko", "en"],
-        "subtitlesformat": "json3",
-        "quiet": True,
-        "no_warnings": True,
-    }
-
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(url, download=False)
-
-    subs = info.get("automatic_captions", {})
-    lang = "ko" if "ko" in subs else "en" if "en" in subs else None
-    if not lang:
-        raise ValueError(f"자막을 찾을 수 없습니다: {video_id}")
-
-    sub_url = None
-    for fmt in subs[lang]:
-        if fmt["ext"] == "json3":
-            sub_url = fmt["url"]
-            break
-
-    if not sub_url:
-        raise ValueError(f"json3 자막 포맷을 찾을 수 없습니다: {video_id}")
-
-    resp = httpx.get(sub_url)
-    resp.raise_for_status()
-    data = resp.json()
-
-    texts = []
-    for event in data.get("events", []):
-        for seg in event.get("segs", []):
-            text = seg.get("utf8", "").strip()
-            if text and text != "\n":
-                texts.append(text)
+    """Return the YouTube transcript of ``video_id`` as one line of text.
+
+    Transcripts are requested for Korean first, then English.
+
+    Raises:
+        ValueError: If the video has no Korean or English transcript, or
+            captions are disabled for it.
+    """
+    ytt_api = YouTubeTranscriptApi()
+    try:
+        transcript = ytt_api.fetch(video_id, languages=["ko", "en"])
+    except (NoTranscriptFound, TranscriptsDisabled) as exc:
+        # Keep the ValueError contract of the yt-dlp implementation so that
+        # existing callers' error handling still works.
+        raise ValueError(f"자막을 찾을 수 없습니다: {video_id}") from exc
+
+    # Snippet text may carry embedded newlines and padding; collapse all
+    # whitespace so the joined result is a single clean line.
+    normalized = (" ".join(entry.text.split()) for entry in transcript)
+    texts = [text for text in normalized if text]
 
     return " ".join(texts)
diff --git a/requirements.txt b/requirements.txt
index 3076bf0..a233eb2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 fastapi==0.115.12
 uvicorn==0.34.2
-yt-dlp>=2025.3.31
+youtube-transcript-api==1.0.3
 anthropic==0.52.0
 httpx==0.28.1
 pydantic-settings==2.8.1