added public list_transcripts method

This commit is contained in:
Jonas Depoix 2019-12-30 15:20:47 +01:00
parent 8287d1088e
commit 1bc5087575
2 changed files with 68 additions and 17 deletions

View File

@ -4,17 +4,68 @@ from ._transcripts import TranscriptListFetcher
class YouTubeTranscriptApi(): class YouTubeTranscriptApi():
@classmethod
def list_transcripts(cls, video_id, proxies=None):
    """
    Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
    which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
    over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
    metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
    `transcript.translate('en')`. Example::

        # retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')

        # iterate over all available transcripts
        for transcript in transcript_list:
            # the Transcript object provides metadata properties
            print(
                transcript.video_id,
                transcript.language,
                transcript.language_code,
                # whether it has been manually created or generated by YouTube
                transcript.is_generated,
                # a list of languages the transcript can be translated to
                transcript.translation_languages,
            )

            # fetch the actual transcript data
            print(transcript.fetch())

            # translating the transcript will return another transcript object
            print(transcript.translate('en').fetch())

        # you can also directly filter for the language you are looking for, using the transcript list
        transcript = transcript_list.find_transcript(['de', 'en'])

        # or just filter for manually created transcripts
        transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

        # or automatically generated ones
        transcript = transcript_list.find_generated_transcript(['de', 'en'])

    :param video_id: the youtube video id
    :type video_id: str
    :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
    :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
    :return: the list of available transcripts
    :rtype TranscriptList:
    """
    with requests.Session() as http_client:
        # Fall back to an empty mapping when `proxies` is None or empty.
        http_client.proxies = proxies if proxies else {}
        return TranscriptListFetcher(http_client).fetch(video_id)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
:param video_ids: a list of youtube video ids :param video_ids: a list of youtube video ids
:type video_ids: [str] :type video_ids: list[str]
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. do so.
:type languages: [str] :type languages: list[str]
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
one of the video transcripts one of the video transcripts
:type continue_after_error: bool :type continue_after_error: bool
@ -22,7 +73,7 @@ class YouTubeTranscriptApi():
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]) :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
""" """
data = {} data = {}
unretrievable_videos = [] unretrievable_videos = []
@ -41,19 +92,19 @@ class YouTubeTranscriptApi():
@classmethod @classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None): def get_transcript(cls, video_id, languages=('en',), proxies=None):
""" """
Retrieves the transcript for a single video. Retrieves the transcript for a single video. This is just a shortcut for calling::
YouTubeTranscriptApi.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
:param video_id: the youtube video id :param video_id: the youtube video id
:type video_id: str :type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. do so.
:type languages: [str] :type languages: list[str]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}] :rtype [{'text': str, 'start': float, 'duration': float}]:
""" """
with requests.Session() as http_client: return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch()
http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()

View File

@ -95,7 +95,7 @@ class TranscriptList():
:param captions_json: the JSON parsed from the YouTube pages static HTML :param captions_json: the JSON parsed from the YouTube pages static HTML
:type captions_json: dict :type captions_json: dict
:return: the created TranscriptList :return: the created TranscriptList
:rtype TranscriptList :rtype TranscriptList:
""" """
translation_languages = [ translation_languages = [
{ {
@ -142,9 +142,9 @@ class TranscriptList():
:param language_codes: A list of language codes in a descending priority. For example, if this is set to :param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so. it fails to do so.
:type language_codes: [str] :type language_codes: list[str]
:return: the found Transcript :return: the found Transcript
:rtype: Transcript :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts]) return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
@ -156,9 +156,9 @@ class TranscriptList():
:param language_codes: A list of language codes in a descending priority. For example, if this is set to :param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so. it fails to do so.
:type language_codes: [str] :type language_codes: list[str]
:return: the found Transcript :return: the found Transcript
:rtype: Transcript :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._generated_transcripts,]) return self._find_transcript(language_codes, [self._generated_transcripts,])
@ -170,9 +170,9 @@ class TranscriptList():
:param language_codes: A list of language codes in a descending priority. For example, if this is set to :param language_codes: A list of language codes in a descending priority. For example, if this is set to
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
it fails to do so. it fails to do so.
:type language_codes: [str] :type language_codes: list[str]
:return: the found Transcript :return: the found Transcript
:rtype: Transcript :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._manually_created_transcripts,]) return self._find_transcript(language_codes, [self._manually_created_transcripts,])
@ -252,7 +252,7 @@ class Transcript():
Loads the actual transcript data. Loads the actual transcript data.
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}] :rtype [{'text': str, 'start': float, 'duration': float}]:
""" """
return _TranscriptParser().parse( return _TranscriptParser().parse(
self._http_client.get(self._url).text self._http_client.get(self._url).text