From a04a7010ed112ad10d64bd6bf57bfa99fff23e54 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 17 Apr 2023 15:34:46 +0200 Subject: [PATCH] added error which is thrown if url is used as the video id --- youtube_transcript_api/__init__.py | 1 + youtube_transcript_api/_errors.py | 8 ++++++++ youtube_transcript_api/_transcripts.py | 6 ++++-- youtube_transcript_api/test/test_api.py | 11 +++++++++++ 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/youtube_transcript_api/__init__.py b/youtube_transcript_api/__init__.py index 4cf4b9c..7f703f4 100644 --- a/youtube_transcript_api/__init__.py +++ b/youtube_transcript_api/__init__.py @@ -13,4 +13,5 @@ from ._errors import ( CookiesInvalid, FailedToCreateConsentCookie, YouTubeRequestFailed, + InvalidVideoId, ) diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index cae17ad..d652c59 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -53,6 +53,14 @@ class VideoUnavailable(CouldNotRetrieveTranscript): CAUSE_MESSAGE = 'The video is no longer available' +class InvalidVideoId(CouldNotRetrieveTranscript): + CAUSE_MESSAGE = ( + 'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n' + 'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n' + 'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`' + ) + + class TooManyRequests(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( 'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. ' diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 1e0f8f1..4e712dd 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -24,6 +24,7 @@ from ._errors import ( TranslationLanguageNotAvailable, NoTranscriptAvailable, FailedToCreateConsentCookie, + InvalidVideoId, ) from ._settings import WATCH_URL @@ -41,7 +42,6 @@ class TranscriptListFetcher(object): self._http_client = http_client def fetch(self, video_id): - return TranscriptList.build( self._http_client, video_id, @@ -52,6 +52,8 @@ class TranscriptListFetcher(object): splitted_html = html.split('"captions":') if len(splitted_html) <= 1: + if video_id.startswith('http://') or video_id.startswith('https://'): + raise InvalidVideoId(video_id) if 'class="g-recaptcha"' in html: raise TooManyRequests(video_id) if '"playabilityStatus":' not in html: @@ -182,7 +184,7 @@ class TranscriptList(object): def find_generated_transcript(self, language_codes): """ - Finds a automatically generated transcript for a given language code. + Finds an automatically generated transcript for a given language code. :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 36d60a5..d6f5e0c 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -20,6 +20,7 @@ from youtube_transcript_api import ( CookiesInvalid, FailedToCreateConsentCookie, YouTubeRequestFailed, + InvalidVideoId, ) @@ -97,6 +98,16 @@ class TestYouTubeTranscriptApi(TestCase): self.assertTrue(transcript.is_generated) + def test_list_transcripts__url_as_video_id(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube_transcripts_disabled.html.static') + ) + + with self.assertRaises(InvalidVideoId): + YouTubeTranscriptApi.list_transcripts('https://www.youtube.com/watch?v=GJLlxj_dtq8') + def test_translate_transcript(self): transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en'])