From 14c70359ba6a39cdc0e130e05925942780905e55 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 21 Jan 2021 19:43:29 +0100 Subject: [PATCH] Fix "video not available" being shown to the user when YouTube starts asking for captcha resolution due to receiving too many requests from the same IP. Show an appropriate message instead. To be able to keep making requests, the captcha must be solved in a browser and the browser cookie must be passed to youtube-transcript-api. --- youtube_transcript_api/__init__.py | 1 + youtube_transcript_api/_errors.py | 5 +- youtube_transcript_api/_transcripts.py | 3 + .../youtube_too_many_requests.html.static | 239 ++++++++++++++++++ youtube_transcript_api/test/test_api.py | 11 + 5 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 youtube_transcript_api/test/assets/youtube_too_many_requests.html.static diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index 1fe0f73..baefd02 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -5,6 +5,7 @@ from ._errors import ( NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable, + TooManyRequests, NotTranslatable, TranslationLanguageNotAvailable, NoTranscriptAvailable, diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index 2f83a16..f7a5658 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -37,7 +37,10 @@ class CouldNotRetrieveTranscript(Exception): class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' - + +class TooManyRequests(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = ('YouTube is receiving too many requests from this IP,' + ' and now requires that a captcha must be solved in order to continue.') class TranscriptsDisabled(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'Subtitles are disabled for this video' diff --git a/youtube_transcript_api/_transcripts.py 
b/youtube_transcript_api/_transcripts.py index 6b767ff..9400a1d 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -14,6 +14,7 @@ import re from ._html_unescaping import unescape from ._errors import ( VideoUnavailable, + TooManyRequests, NoTranscriptFound, TranscriptsDisabled, NotTranslatable, @@ -38,6 +39,8 @@ class TranscriptListFetcher(): splitted_html = html.split('"captions":') if len(splitted_html) <= 1: + if 'class="g-recaptcha"' in html: + raise TooManyRequests(video_id) if '"playabilityStatus":' not in html: raise VideoUnavailable(video_id) diff --git a/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static b/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static new file mode 100644 index 0000000..c63003f --- /dev/null +++ b/youtube_transcript_api/test/assets/youtube_too_many_requests.html.static @@ -0,0 +1,239 @@ + + + + YouTube + + + + + + + + + +
+
+

+ Perdón por la interrupción. Hemos recibido un gran número de + solicitudes de tu red. +

+

+ Para seguir disfrutando de YouTube, rellena el siguiente formulario. +

+
+
+
+
+ +
+ ES + +
+
+ +
+ + diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 5f95451..daf98f8 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -12,6 +12,7 @@ from youtube_transcript_api import ( TranscriptsDisabled, NoTranscriptFound, VideoUnavailable, + TooManyRequests, NoTranscriptAvailable, NotTranslatable, TranslationLanguageNotAvailable, @@ -134,6 +135,16 @@ class TestYouTubeTranscriptApi(TestCase): with self.assertRaises(VideoUnavailable): YouTubeTranscriptApi.get_transcript('abc') + def test_get_transcript__exception_if_youtube_request_limit_reached(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube_too_many_requests.html.static') + ) + + with self.assertRaises(TooManyRequests): + YouTubeTranscriptApi.get_transcript('abc') + def test_get_transcript__exception_if_transcripts_disabled(self): httpretty.register_uri( httpretty.GET,