Move preserve_formatting from Transcript.__init__() to Transcript.fetch()
Also remove it from TranscriptList and TranscriptListFetcher
This commit is contained in:
parent
79fd63d585
commit
ca93c48fa1
|
@ -16,7 +16,7 @@ from ._errors import (
|
||||||
|
|
||||||
class YouTubeTranscriptApi(object):
|
class YouTubeTranscriptApi(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
|
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
||||||
"""
|
"""
|
||||||
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
||||||
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
||||||
|
@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object):
|
||||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
:param cookies: a string of the path to a text file containing youtube authorization cookies
|
:param cookies: a string of the path to a text file containing youtube authorization cookies
|
||||||
:type cookies: str
|
:type cookies: str
|
||||||
:param preserve_formatting: whether to keep select HTML text formatting
|
|
||||||
:type preserve_formatting: bool
|
|
||||||
:return: the list of available transcripts
|
:return: the list of available transcripts
|
||||||
:rtype TranscriptList:
|
:rtype TranscriptList:
|
||||||
"""
|
"""
|
||||||
|
@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object):
|
||||||
if cookies:
|
if cookies:
|
||||||
http_client.cookies = cls._load_cookies(cookies, video_id)
|
http_client.cookies = cls._load_cookies(cookies, video_id)
|
||||||
http_client.proxies = proxies if proxies else {}
|
http_client.proxies = proxies if proxies else {}
|
||||||
return TranscriptListFetcher(http_client).fetch(video_id,
|
return TranscriptListFetcher(http_client).fetch(video_id)
|
||||||
preserve_formatting=preserve_formatting)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
|
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
|
||||||
|
@ -137,7 +134,7 @@ class YouTubeTranscriptApi(object):
|
||||||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||||
"""
|
"""
|
||||||
assert isinstance(video_id, str), "`video_id` must be a string"
|
assert isinstance(video_id, str), "`video_id` must be a string"
|
||||||
return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
|
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_cookies(cls, cookies, video_id):
|
def _load_cookies(cls, cookies, video_id):
|
||||||
|
|
|
@ -40,12 +40,12 @@ class TranscriptListFetcher(object):
|
||||||
def __init__(self, http_client):
|
def __init__(self, http_client):
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
|
|
||||||
def fetch(self, video_id, preserve_formatting=False):
|
def fetch(self, video_id):
|
||||||
|
|
||||||
return TranscriptList.build(
|
return TranscriptList.build(
|
||||||
self._http_client,
|
self._http_client,
|
||||||
video_id,
|
video_id,
|
||||||
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
||||||
preserve_formatting=preserve_formatting,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_captions_json(self, html, video_id):
|
def _extract_captions_json(self, html, video_id):
|
||||||
|
@ -114,7 +114,7 @@ class TranscriptList(object):
|
||||||
self._translation_languages = translation_languages
|
self._translation_languages = translation_languages
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build(http_client, video_id, captions_json, preserve_formatting=False):
|
def build(http_client, video_id, captions_json):
|
||||||
"""
|
"""
|
||||||
Factory method for TranscriptList.
|
Factory method for TranscriptList.
|
||||||
|
|
||||||
|
@ -124,8 +124,6 @@ class TranscriptList(object):
|
||||||
:type video_id: str
|
:type video_id: str
|
||||||
:param captions_json: the JSON parsed from the YouTube pages static HTML
|
:param captions_json: the JSON parsed from the YouTube pages static HTML
|
||||||
:type captions_json: dict
|
:type captions_json: dict
|
||||||
:param preserve_formatting: whether to keep select HTML text formatting
|
|
||||||
:type preserve_formatting: bool
|
|
||||||
:return: the created TranscriptList
|
:return: the created TranscriptList
|
||||||
:rtype TranscriptList:
|
:rtype TranscriptList:
|
||||||
"""
|
"""
|
||||||
|
@ -153,7 +151,6 @@ class TranscriptList(object):
|
||||||
caption['languageCode'],
|
caption['languageCode'],
|
||||||
caption.get('kind', '') == 'asr',
|
caption.get('kind', '') == 'asr',
|
||||||
translation_languages if caption.get('isTranslatable', False) else [],
|
translation_languages if caption.get('isTranslatable', False) else [],
|
||||||
preserve_formatting=preserve_formatting,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return TranscriptList(
|
return TranscriptList(
|
||||||
|
@ -253,8 +250,7 @@ class TranscriptList(object):
|
||||||
|
|
||||||
|
|
||||||
class Transcript(object):
|
class Transcript(object):
|
||||||
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
|
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
||||||
preserve_formatting=False):
|
|
||||||
"""
|
"""
|
||||||
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
||||||
TranscriptList.
|
TranscriptList.
|
||||||
|
@ -268,8 +264,6 @@ class Transcript(object):
|
||||||
:param language_code:
|
:param language_code:
|
||||||
:param is_generated:
|
:param is_generated:
|
||||||
:param translation_languages:
|
:param translation_languages:
|
||||||
:param preserve_formatting: whether to keep select HTML text formatting
|
|
||||||
:type preserve_formatting: bool
|
|
||||||
"""
|
"""
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
|
@ -282,17 +276,17 @@ class Transcript(object):
|
||||||
translation_language['language_code']: translation_language['language']
|
translation_language['language_code']: translation_language['language']
|
||||||
for translation_language in translation_languages
|
for translation_language in translation_languages
|
||||||
}
|
}
|
||||||
self.preserve_formatting = preserve_formatting
|
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self, preserve_formatting=False):
|
||||||
"""
|
"""
|
||||||
Loads the actual transcript data.
|
Loads the actual transcript data.
|
||||||
|
:param preserve_formatting: whether to keep select HTML text formatting
|
||||||
|
:type preserve_formatting: bool
|
||||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||||
"""
|
"""
|
||||||
response = self._http_client.get(self._url)
|
response = self._http_client.get(self._url)
|
||||||
return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
|
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
|
||||||
_raise_http_errors(response, self.video_id).text,)
|
_raise_http_errors(response, self.video_id).text,)
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
|
@ -321,7 +315,6 @@ class Transcript(object):
|
||||||
language_code,
|
language_code,
|
||||||
True,
|
True,
|
||||||
[],
|
[],
|
||||||
preserve_formatting=self.preserve_formatting,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue