commit
a1b1e001fe
|
@ -55,7 +55,7 @@ This will return a list of dictionaries looking somewhat like this:
|
|||
]
|
||||
```
|
||||
|
||||
You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to english).
|
||||
You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to English).
|
||||
|
||||
```python
|
||||
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
|
||||
|
@ -118,4 +118,4 @@ youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://us
|
|||
|
||||
If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :)
|
||||
|
||||
[](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
|
||||
[Donate via PayPal](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)
|
||||
|
|
|
@ -1 +1,3 @@
|
|||
from ._api import YouTubeTranscriptApi
|
||||
from ._transcripts import TranscriptList, Transcript
|
||||
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
|
||||
|
|
|
@ -1,55 +1,19 @@
|
|||
import sys
|
||||
|
||||
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
||||
if sys.version_info.major == 2: # pragma: no cover
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
import re
|
||||
|
||||
import requests
|
||||
|
||||
from ._html_unescaping import unescape
|
||||
from ._transcripts import TranscriptListFetcher
|
||||
|
||||
|
||||
class YouTubeTranscriptApi():
|
||||
class CouldNotRetrieveTranscript(Exception):
|
||||
"""
|
||||
Raised if a transcript could not be retrieved.
|
||||
"""
|
||||
|
||||
ERROR_MESSAGE = (
|
||||
'Could not get the transcript for the video {video_url}! '
|
||||
'This usually happens if one of the following things is the case:\n'
|
||||
' - subtitles have been disabled by the uploader\n'
|
||||
' - none of the language codes you provided are valid\n'
|
||||
' - none of the languages you provided are supported by the video\n'
|
||||
' - the video is no longer available.\n\n'
|
||||
'If none of these things is the case, please create an issue at '
|
||||
'https://github.com/jdepoix/youtube-transcript-api/issues.'
|
||||
'Please add which version of youtube_transcript_api you are using and make sure that there '
|
||||
'are no open issues which already describe your problem!'
|
||||
)
|
||||
|
||||
def __init__(self, video_id):
|
||||
super(YouTubeTranscriptApi.CouldNotRetrieveTranscript, self).__init__(
|
||||
self.ERROR_MESSAGE.format(video_url=_TranscriptFetcher.WATCH_URL.format(video_id=video_id))
|
||||
)
|
||||
self.video_id = video_id
|
||||
|
||||
@classmethod
|
||||
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
|
||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
|
||||
"""
|
||||
Retrieves the transcripts for a list of videos.
|
||||
|
||||
:param video_ids: a list of youtube video ids
|
||||
:type video_ids: [str]
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to
|
||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||
play around with the language codes a bit, to find the one which is working for you!
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
||||
one of the video transcripts
|
||||
|
@ -58,7 +22,7 @@ class YouTubeTranscriptApi():
|
|||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||
video ids, which could not be retrieved
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
|
||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
|
||||
"""
|
||||
data = {}
|
||||
unretrievable_videos = []
|
||||
|
@ -75,7 +39,7 @@ class YouTubeTranscriptApi():
|
|||
return data, unretrievable_videos
|
||||
|
||||
@classmethod
|
||||
def get_transcript(cls, video_id, languages=None, proxies=None):
|
||||
def get_transcript(cls, video_id, languages=('en',), proxies=None):
|
||||
"""
|
||||
Retrieves the transcript for a single video.
|
||||
|
||||
|
@ -83,82 +47,13 @@ class YouTubeTranscriptApi():
|
|||
:type video_id: str
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||
play around with the language codes a bit, to find the one which is working for you!
|
||||
do so.
|
||||
:type languages: [str]
|
||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||
:rtype: [{'text': str, 'start': float, 'end': float}]
|
||||
"""
|
||||
try:
|
||||
return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse()
|
||||
except Exception:
|
||||
raise YouTubeTranscriptApi.CouldNotRetrieveTranscript(video_id)
|
||||
|
||||
|
||||
class _TranscriptFetcher():
|
||||
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
||||
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
||||
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
|
||||
TIMEDTEXT_STRING = 'timedtext?v='
|
||||
|
||||
def __init__(self, video_id, languages, proxies):
|
||||
self.video_id = video_id
|
||||
self.languages = languages
|
||||
self.proxies = proxies
|
||||
|
||||
def fetch(self):
|
||||
if self.proxies:
|
||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
|
||||
else:
|
||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
||||
timedtext_splits = fetched_site.split(self.TIMEDTEXT_STRING)
|
||||
timedtext_url_start = (
|
||||
timedtext_splits[2].find(self.TIMEDTEXT_STRING)
|
||||
+ len(timedtext_splits[0])
|
||||
+ len(timedtext_splits[1])
|
||||
+ len(self.TIMEDTEXT_STRING) + 1
|
||||
)
|
||||
|
||||
for language in (self.languages if self.languages else [None,]):
|
||||
response = self._execute_api_request(fetched_site, timedtext_url_start, language)
|
||||
if response:
|
||||
return response
|
||||
|
||||
return None
|
||||
|
||||
def _execute_api_request(self, fetched_site, timedtext_url_start, language):
|
||||
url = self.API_BASE_URL.format(
|
||||
api_url=fetched_site[
|
||||
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
|
||||
].replace(
|
||||
'\\u0026', '&'
|
||||
).replace(
|
||||
'\\', ''
|
||||
)
|
||||
)
|
||||
if language:
|
||||
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
|
||||
if self.proxies:
|
||||
return requests.get(url, proxies=self.proxies).text
|
||||
else:
|
||||
return requests.get(url).text
|
||||
|
||||
|
||||
class _TranscriptParser():
|
||||
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
|
||||
|
||||
def __init__(self, plain_data):
|
||||
self.plain_data = plain_data
|
||||
|
||||
def parse(self):
|
||||
return [
|
||||
{
|
||||
'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
|
||||
'start': float(xml_element.attrib['start']),
|
||||
'duration': float(xml_element.attrib['dur']),
|
||||
}
|
||||
for xml_element in ElementTree.fromstring(self.plain_data)
|
||||
if xml_element.text is not None
|
||||
]
|
||||
with requests.Session() as http_client:
|
||||
http_client.proxies = proxies if proxies else {}
|
||||
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
from ._settings import WATCH_URL
|
||||
|
||||
|
||||
class CouldNotRetrieveTranscript(Exception):
    """
    Raised if a transcript could not be retrieved.

    The exception message is assembled from the class level templates below. Subclasses describe the concrete
    reason for the failure either by overriding ``CAUSE_MESSAGE`` or, if the description needs instance data, by
    overriding the ``cause`` property.

    :param video_id: the id of the video the transcript was requested for
    :type video_id: str
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        # the trailing space keeps the URL from running into the next sentence
        'https://github.com/jdepoix/youtube-transcript-api/issues. '
        'Please add which version of youtube_transcript_api you are using '
        'and provide the information needed to replicate the error. '
        'Also make sure that there are no open issues which already describe your problem!'
    )

    def __init__(self, video_id):
        # video_id has to be set before the base constructor runs, as the message is built from it
        self.video_id = video_id
        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())

    def _build_error_message(self):
        """
        Builds the exception message for this error.

        :return: the generic error message, followed by the cause description and the GitHub referral if a cause is
        available
        :rtype: str
        """
        cause = self.cause
        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))

        if cause:
            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL

        return error_message

    @property
    def cause(self):
        """
        :return: the description of what caused this error, an empty string if no cause is known
        :rtype: str
        """
        return self.CAUSE_MESSAGE
|
||||
|
||||
|
||||
class VideoUnavailable(CouldNotRetrieveTranscript):
    """
    Raised if the requested video is not available.
    """
    CAUSE_MESSAGE = 'The video is no longer available'
|
||||
|
||||
|
||||
class TranscriptsDisabled(CouldNotRetrieveTranscript):
    """
    Raised if subtitles are disabled for the requested video.
    """
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
|
||||
|
||||
|
||||
class NoTranscriptFound(CouldNotRetrieveTranscript):
    """
    Raised if none of the requested language codes has a transcript.

    :param video_id: the id of the video the transcript was requested for
    :type video_id: str
    :param requested_language_codes: the language codes which have been searched for
    :param transcript_data: object whose string representation is appended to the cause description
    """
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        # both values are read by `cause`, which the base constructor evaluates while building the message
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFound, self).__init__(video_id)

    @property
    def cause(self):
        """
        :return: the cause template filled with the requested language codes and the transcript data
        :rtype: str
        """
        template_values = {
            'requested_language_codes': self._requested_language_codes,
            'transcript_data': str(self._transcript_data),
        }
        return self.CAUSE_MESSAGE.format(**template_values)
|
|
@ -0,0 +1 @@
|
|||
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
|
@ -0,0 +1,276 @@
|
|||
import sys
|
||||
|
||||
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
||||
if sys.version_info.major == 2: # pragma: no cover
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
import json
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
import re
|
||||
|
||||
from ._html_unescaping import unescape
|
||||
from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
|
||||
from ._settings import WATCH_URL
|
||||
|
||||
|
||||
class TranscriptListFetcher():
    """
    Downloads the watch page of a video and builds a TranscriptList from the captions data embedded in it.
    """
    def __init__(self, http_client):
        """
        :param http_client: http client which is used to request the watch page; it is also handed on to the
        TranscriptList which is built
        """
        self._http_client = http_client

    def fetch(self, video_id):
        """
        Retrieves the list of transcripts for a video.

        :param video_id: the id of the video
        :return: the result of TranscriptList.build for the captions data found in the watch page
        :raises: VideoUnavailable, TranscriptsDisabled
        """
        watch_page = self._fetch_html(video_id)
        captions_json = self._extract_captions_json(watch_page, video_id)
        return TranscriptList.build(self._http_client, video_id, captions_json)

    def _extract_captions_json(self, html, video_id):
        """
        Cuts the captions JSON out of the watch page and parses it.

        :param html: the watch page as returned by _fetch_html
        :param video_id: the id of the video, only used for the raised errors
        :return: the value stored under 'playerCaptionsTracklistRenderer' in the parsed captions JSON
        :raises: VideoUnavailable if the page contains neither captions nor a playability status,
        TranscriptsDisabled if it only lacks the captions
        """
        html_parts = html.split('"captions":')

        if len(html_parts) > 1:
            # the captions object ends where the "videoDetails" key starts
            raw_captions = html_parts[1].split(',"videoDetails')[0].replace('\n', '')
            return json.loads(raw_captions)['playerCaptionsTracklistRenderer']

        if '"playabilityStatus":' not in html:
            raise VideoUnavailable(video_id)

        raise TranscriptsDisabled(video_id)

    def _fetch_html(self, video_id):
        """
        Requests the watch page of a video.

        :param video_id: the id of the video
        :return: the response text, with '\\u0026' sequences replaced by '&' and all remaining backslashes removed
        """
        response = self._http_client.get(WATCH_URL.format(video_id=video_id))
        return response.text.replace('\\u0026', '&').replace('\\', '')
|
||||
|
||||
|
||||
class TranscriptList():
    """
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
        """
        The constructor is only for internal use. Use the static build method instead.

        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param manually_created_transcripts: dict mapping language codes to the manually created transcripts
        :type manually_created_transcripts: dict[str, Transcript]
        :param generated_transcripts: dict mapping language codes to the generated transcripts
        :type generated_transcripts: dict[str, Transcript]
        """
        self.video_id = video_id
        self._manually_created_transcripts = manually_created_transcripts
        self._generated_transcripts = generated_transcripts

    @staticmethod
    def build(http_client, video_id, captions_json):
        """
        Factory method for TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this TranscriptList is for
        :type video_id: str
        :param captions_json: the JSON parsed from the YouTube pages static HTML
        :type captions_json: dict
        :return: the created TranscriptList
        :rtype: TranscriptList
        """
        # a payload without 'translationLanguages' is treated as "no translations available" instead of failing
        translation_languages = [
            {
                'language': translation_language['languageName']['simpleText'],
                'language_code': translation_language['languageCode'],
            } for translation_language in captions_json.get('translationLanguages', [])
        ]

        manually_created_transcripts = {}
        generated_transcripts = {}

        for caption in captions_json['captionTracks']:
            # tracks of kind 'asr' are the generated ones, everything else counts as manually created
            is_generated = caption.get('kind', '') == 'asr'
            transcript_dict = generated_transcripts if is_generated else manually_created_transcripts

            transcript_dict[caption['languageCode']] = Transcript(
                http_client,
                video_id,
                caption['baseUrl'],
                caption['name']['simpleText'],
                caption['languageCode'],
                is_generated,
                # a track without the 'isTranslatable' flag is handled as not translatable
                translation_languages if caption.get('isTranslatable', False) else []
            )

        return TranscriptList(
            video_id,
            manually_created_transcripts,
            generated_transcripts,
        )

    def __iter__(self):
        """
        Iterates over all available transcripts, the manually created ones first, followed by the generated ones.

        :return: an iterator over the transcripts
        """
        return iter(
            list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values())
        )

    def find_transcript(self, language_codes):
        """
        Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
        are found, generated transcripts are used. If you only want generated transcripts use
        find_generated_transcript instead, if you only want manually created ones use
        find_manually_created_transcript.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: [str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])

    def find_generated_transcript(self, language_codes):
        """
        Finds an automatically generated transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: [str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._generated_transcripts])

    def find_manually_created_transcript(self, language_codes):
        """
        Finds a manually created transcript for a given language code.

        :param language_codes: A list of language codes in a descending priority. For example, if this is set to
        ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
        it fails to do so.
        :type language_codes: [str]
        :return: the found Transcript
        :rtype: Transcript
        :raises: NoTranscriptFound
        """
        return self._find_transcript(language_codes, [self._manually_created_transcripts])

    def _find_transcript(self, language_codes, transcript_dicts):
        """
        Returns the first match, checking every dict in transcript_dicts for a language code before moving on to the
        next language code.

        :param language_codes: language codes in a descending priority
        :param transcript_dicts: the dicts (language code -> transcript) to search, in a descending priority
        :raises: NoTranscriptFound
        """
        for language_code in language_codes:
            for transcript_dict in transcript_dicts:
                if language_code in transcript_dict:
                    return transcript_dict[language_code]

        # the list itself is handed over, so that its string representation ends up in the error message
        raise NoTranscriptFound(
            self.video_id,
            language_codes,
            self
        )

    def __str__(self):
        return (
            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
            '(MANUALLY CREATED)\n'
            '{available_manually_created_transcript_languages}\n\n'
            '(GENERATED)\n'
            '{available_generated_transcripts}'
        ).format(
            video_id=self.video_id,
            available_manually_created_transcript_languages=self._get_language_description(
                self._manually_created_transcripts.values()
            ),
            available_generated_transcripts=self._get_language_description(
                self._generated_transcripts.values()
            ),
        )

    def _get_language_description(self, transcripts):
        """
        :param transcripts: the transcripts to describe
        :return: one ' - <transcript>' line per transcript, or 'None' if there are no transcripts
        :rtype: str
        """
        return '\n'.join(
            ' - {transcript}'.format(transcript=str(transcript))
            for transcript in transcripts
        ) if transcripts else 'None'
|
||||
|
||||
|
||||
class Transcript():
    """
    A single transcript of a video. The transcript data itself is only requested once fetch is called.
    """
    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
        TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this Transcript is for
        :type video_id: str
        :param url: the url which needs to be called to fetch the transcript
        :param language: the name of the language this transcript uses
        :param language_code: the language code of the language this transcript uses
        :param is_generated: whether this transcript has been generated automatically
        :param translation_languages: the languages this transcript can be translated to, an empty list if it is not
        translatable
        """
        # public metadata
        self.video_id = video_id
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # only needed internally to load the transcript data
        self._http_client = http_client
        self._url = url

    def fetch(self):
        """
        Loads the actual transcript data.

        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
        parser = _TranscriptParser()
        raw_transcript = self._http_client.get(self._url).text
        return parser.parse(raw_transcript)

    def __str__(self):
        description = '{language_code} ("{language}")'
        return description.format(language_code=self.language_code, language=self.language)
|
||||
|
||||
# TODO integrate translations in future release
|
||||
# @property
|
||||
# def is_translatable(self):
|
||||
# return len(self.translation_languages) > 0
|
||||
#
|
||||
#
|
||||
# class TranslatableTranscript(Transcript):
|
||||
# def __init__(self, http_client, url, translation_languages):
|
||||
# super(TranslatableTranscript, self).__init__(http_client, url)
|
||||
# self._translation_languages = translation_languages
|
||||
# self._translation_language_codes = {language['language_code'] for language in translation_languages}
|
||||
#
|
||||
#
|
||||
# def translate(self, language_code):
|
||||
# if language_code not in self._translation_language_codes:
|
||||
# raise TranslatableTranscript.TranslationLanguageNotAvailable()
|
||||
#
|
||||
# return Transcript(
|
||||
# self._http_client,
|
||||
# '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
|
||||
# )
|
||||
|
||||
|
||||
class _TranscriptParser():
    """
    Turns the XML of a transcript into a list of dictionaries.
    """
    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)

    def parse(self, plain_data):
        """
        Parses the XML of a transcript. Child elements without text are skipped; the text of all other elements is
        unescaped and stripped of HTML tags.

        :param plain_data: the XML document as a string
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
        snippets = []

        for xml_element in ElementTree.fromstring(plain_data):
            if xml_element.text is None:
                continue

            snippets.append({
                'text': self.HTML_TAG_REGEX.sub('', unescape(xml_element.text)),
                'start': float(xml_element.attrib['start']),
                'duration': float(xml_element.attrib['dur']),
            })

        return snippets
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -5,7 +5,7 @@ import os
|
|||
|
||||
import httpretty
|
||||
|
||||
from youtube_transcript_api._api import YouTubeTranscriptApi
|
||||
from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
|
||||
|
||||
|
||||
def load_asset(filename):
|
||||
|
@ -53,26 +53,40 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
def test_get_transcript__fallback_language_is_used(self):
|
||||
httpretty.register_uri(
|
||||
httpretty.GET,
|
||||
'https://www.youtube.com/api/timedtext',
|
||||
body=''
|
||||
'https://www.youtube.com/watch',
|
||||
body=load_asset('youtube_ww1_nl_en.html.static')
|
||||
)
|
||||
|
||||
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
|
||||
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en'])
|
||||
query_string = httpretty.last_request().querystring
|
||||
|
||||
self.assertIn('lang', query_string)
|
||||
self.assertEqual(len(query_string['lang']), 1)
|
||||
self.assertEqual(query_string['lang'][0], 'en')
|
||||
|
||||
def test_get_transcript__exception_is_raised_when_not_available(self):
|
||||
def test_get_transcript__exception_if_video_unavailable(self):
|
||||
httpretty.register_uri(
|
||||
httpretty.GET,
|
||||
'https://www.youtube.com/api/timedtext',
|
||||
body=''
|
||||
'https://www.youtube.com/watch',
|
||||
body=load_asset('youtube_video_unavailable.html.static')
|
||||
)
|
||||
|
||||
with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript):
|
||||
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')
|
||||
with self.assertRaises(VideoUnavailable):
|
||||
YouTubeTranscriptApi.get_transcript('abc')
|
||||
|
||||
def test_get_transcript__exception_if_transcripts_disabled(self):
|
||||
httpretty.register_uri(
|
||||
httpretty.GET,
|
||||
'https://www.youtube.com/watch',
|
||||
body=load_asset('youtube_transcripts_disabled.html.static')
|
||||
)
|
||||
|
||||
with self.assertRaises(TranscriptsDisabled):
|
||||
YouTubeTranscriptApi.get_transcript('dsMFmonKDD4')
|
||||
|
||||
def test_get_transcript__exception_if_language_unavailable(self):
|
||||
with self.assertRaises(NoTranscriptFound):
|
||||
YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz'])
|
||||
|
||||
def test_get_transcripts(self):
|
||||
video_id_1 = 'video_id_1'
|
||||
|
@ -99,8 +113,8 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
|
||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
|
||||
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None)
|
||||
|
||||
def test_get_transcript__with_proxies(self):
|
||||
proxies = {'http': '', 'https:': ''}
|
||||
|
@ -118,4 +132,4 @@ class TestYouTubeTranscriptApi(TestCase):
|
|||
)
|
||||
YouTubeTranscriptApi.get_transcript = MagicMock()
|
||||
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', None, proxies)
|
||||
YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies)
|
||||
|
|
Loading…
Reference in New Issue