From 1bc50875754d69aac0de519a07749b6ccc54eec3 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Mon, 30 Dec 2019 15:20:47 +0100 Subject: [PATCH] added public list_transcripts method --- youtube_transcript_api/_api.py | 69 ++++++++++++++++++++++---- youtube_transcript_api/_transcripts.py | 16 +++--- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 3476b9b..c1519ae 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -4,17 +4,68 @@ from ._transcripts import TranscriptListFetcher class YouTubeTranscriptApi(): + @classmethod + def list_transcripts(cls, video_id, proxies=None): + """ + Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object + which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating + over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide + metadata and can either be fetched by calling `transcript.fetch()` or translated by calling + `transcript.translate('en')`. 
Example:: + + # retrieve the available transcripts + transcript_list = YouTubeTranscriptApi.list_transcripts('video_id') + + # iterate over all available transcripts + for transcript in transcript_list: + # the Transcript object provides metadata properties + print( + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # a list of languages the transcript can be translated to + transcript.translation_languages, + ) + + # fetch the actual transcript data + print(transcript.fetch()) + + # translating the transcript will return another transcript object + print(transcript.translate('en').fetch()) + + # you can also directly filter for the language you are looking for, using the transcript list + transcript = transcript_list.find_transcript(['de', 'en']) + + # or just filter for manually created transcripts + transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + + # or automatically generated ones + transcript = transcript_list.find_generated_transcript(['de', 'en']) + + :param video_id: the youtube video id + :type video_id: str + :param proxies: a dictionary mapping of http and https proxies to be used for the network requests + :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies + :return: the list of available transcripts + :rtype TranscriptList: + """ + with requests.Session() as http_client: + http_client.proxies = proxies if proxies else {} + return TranscriptListFetcher(http_client).fetch(video_id) + @classmethod def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): """ Retrieves the transcripts for a list of videos. :param video_ids: a list of youtube video ids - :type video_ids: [str] + :type video_ids: list[str] :param languages: A list of language codes in a descending priority. 
For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type languages: list[str] :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving one of the video transcripts :type continue_after_error: bool @@ -22,7 +73,7 @@ class YouTubeTranscriptApi(): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved - :rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}) + :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]): """ data = {} unretrievable_videos = [] @@ -41,19 +92,19 @@ class YouTubeTranscriptApi(): @classmethod def get_transcript(cls, video_id, languages=('en',), proxies=None): """ - Retrieves the transcript for a single video. + Retrieves the transcript for a single video. This is just a shortcut for calling:: + + YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch() :param video_id: the youtube video id :type video_id: str :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. 
- :type languages: [str] + :type languages: list[str] :param proxies: a dictionary mapping of http and https proxies to be used for the network requests :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys - :rtype: [{'text': str, 'start': float, 'end': float}] + :rtype [{'text': str, 'start': float, 'duration': float}]: """ - with requests.Session() as http_client: - http_client.proxies = proxies if proxies else {} - return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch() + return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch() diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index 19e9044..6b767ff 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -95,7 +95,7 @@ class TranscriptList(): :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict :return: the created TranscriptList - :rtype TranscriptList + :rtype TranscriptList: """ translation_languages = [ { @@ -142,9 +142,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type language_codes: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts]) @@ -156,9 +156,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. 
For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type language_codes: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._generated_transcripts,]) @@ -170,9 +170,9 @@ class TranscriptList(): :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. - :type languages: [str] + :type language_codes: list[str] :return: the found Transcript - :rtype: Transcript + :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._manually_created_transcripts,]) @@ -252,7 +252,7 @@ class Transcript(): Loads the actual transcript data. :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys - :rtype: [{'text': str, 'start': float, 'duration': float}] + :rtype [{'text': str, 'start': float, 'duration': float}]: """ return _TranscriptParser().parse( self._http_client.get(self._url).text