60 lines
2.8 KiB
Python
60 lines
2.8 KiB
Python
import requests
|
|
|
|
from ._transcripts import TranscriptListFetcher
|
|
|
|
|
|
class YouTubeTranscriptApi:
    """Class-level entry points for retrieving YouTube video transcripts."""

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
        """
        Retrieves the transcripts for a list of videos.

        Each video is fetched through :meth:`get_transcript`, one after the other, in the order given.

        :param video_ids: a list of youtube video ids
        :type video_ids: [str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type languages: [str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while
            retrieving one of the video transcripts; the failing video id is recorded instead
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a
            list of video ids, which could not be retrieved
        :rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
        :raises Exception: whatever :meth:`get_transcript` raised for a video, unless ``continue_after_error``
            is set
        """
        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies)
            except Exception:
                if not continue_after_error:
                    # Bare ``raise`` re-raises the active exception with its original traceback intact.
                    raise
                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None):
        """
        Retrieves the transcript for a single video.

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to
            ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english
            transcript (en) if it fails to do so.
        :type languages: [str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
        # NOTE(review): the transcript entries are built in ._transcripts; the key names documented above
        # follow the ':return:' description - confirm against that module.
        # The session is scoped to this call, so it is closed as soon as the transcript has been fetched.
        with requests.Session() as http_client:
            # A falsy ``proxies`` (None or empty) means "no proxies": hand the session an empty mapping.
            http_client.proxies = proxies or {}
            return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()