Move preserve_formatting from Transcript.__init__() to Transcript.fetch()

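Also remove the preserve_formatting parameter from TranscriptList.build() and TranscriptListFetcher.fetch(), and from YouTubeTranscriptApi.list_transcripts(), which only passed it through.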
This commit is contained in:
E. Seiver 2023-04-13 12:46:24 -07:00
parent 79fd63d585
commit ca93c48fa1
2 changed files with 12 additions and 22 deletions

View File

@ -16,7 +16,7 @@ from ._errors import (
class YouTubeTranscriptApi(object):
@classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
def list_transcripts(cls, video_id, proxies=None, cookies=None):
"""
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@ -61,8 +61,6 @@ class YouTubeTranscriptApi(object):
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: the list of available transcripts
:rtype TranscriptList:
"""
@ -70,8 +68,7 @@ class YouTubeTranscriptApi(object):
if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id,
preserve_formatting=preserve_formatting)
return TranscriptListFetcher(http_client).fetch(video_id)
@classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
@ -137,8 +134,8 @@ class YouTubeTranscriptApi(object):
:rtype [{'text': str, 'start': float, 'end': float}]:
"""
assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
@classmethod
def _load_cookies(cls, cookies, video_id):
try:

View File

@ -40,12 +40,12 @@ class TranscriptListFetcher(object):
def __init__(self, http_client):
self._http_client = http_client
def fetch(self, video_id, preserve_formatting=False):
def fetch(self, video_id):
return TranscriptList.build(
self._http_client,
video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
preserve_formatting=preserve_formatting,
)
def _extract_captions_json(self, html, video_id):
@ -114,7 +114,7 @@ class TranscriptList(object):
self._translation_languages = translation_languages
@staticmethod
def build(http_client, video_id, captions_json, preserve_formatting=False):
def build(http_client, video_id, captions_json):
"""
Factory method for TranscriptList.
@ -124,8 +124,6 @@ class TranscriptList(object):
:type video_id: str
:param captions_json: the JSON parsed from the YouTube pages static HTML
:type captions_json: dict
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: the created TranscriptList
:rtype TranscriptList:
"""
@ -153,7 +151,6 @@ class TranscriptList(object):
caption['languageCode'],
caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [],
preserve_formatting=preserve_formatting,
)
return TranscriptList(
@ -253,8 +250,7 @@ class TranscriptList(object):
class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
preserve_formatting=False):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
"""
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList.
@ -268,8 +264,6 @@ class Transcript(object):
:param language_code:
:param is_generated:
:param translation_languages:
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
"""
self._http_client = http_client
self.video_id = video_id
@ -282,17 +276,17 @@ class Transcript(object):
translation_language['language_code']: translation_language['language']
for translation_language in translation_languages
}
self.preserve_formatting = preserve_formatting
def fetch(self):
def fetch(self, preserve_formatting=False):
"""
Loads the actual transcript data.
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'end': float}]:
"""
response = self._http_client.get(self._url)
return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,)
def __str__(self):
@ -321,7 +315,6 @@ class Transcript(object):
language_code,
True,
[],
preserve_formatting=self.preserve_formatting,
)