Propagate formatting up to user level

This commit is contained in:
E. Seiver 2023-03-15 18:20:16 -07:00
parent 1f1c8b249b
commit c1a037c39c
2 changed files with 20 additions and 12 deletions

View File

@ -16,7 +16,7 @@ from ._errors import (
class YouTubeTranscriptApi(object): class YouTubeTranscriptApi(object):
@classmethod @classmethod
def list_transcripts(cls, video_id, proxies=None, cookies=None): def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
""" """
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object):
if cookies: if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id) http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {} http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id) return TranscriptListFetcher(http_client).fetch(video_id,
preserve_formatting=preserve_formatting)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object):
return data, unretrievable_videos return data, unretrievable_videos
@classmethod @classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
""" """
Retrieves the transcript for a single video. This is just a shortcut for calling:: Retrieves the transcript for a single video. This is just a shortcut for calling::
@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object):
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
assert isinstance(video_id, str), "`video_id` must be a string" assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
@classmethod @classmethod
def _load_cookies(cls, cookies, video_id): def _load_cookies(cls, cookies, video_id):

View File

@ -53,11 +53,12 @@ class TranscriptListFetcher(object):
def __init__(self, http_client): def __init__(self, http_client):
self._http_client = http_client self._http_client = http_client
def fetch(self, video_id): def fetch(self, video_id, preserve_formatting=False):
return TranscriptList.build( return TranscriptList.build(
self._http_client, self._http_client,
video_id, video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id) self._extract_captions_json(self._fetch_video_html(video_id), video_id),
preserve_formatting=preserve_formatting,
) )
def _extract_captions_json(self, html, video_id): def _extract_captions_json(self, html, video_id):
@ -107,7 +108,8 @@ class TranscriptList(object):
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
for a given YouTube video. Also it provides functionality to search for a transcript in a given language. for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
""" """
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages,
):
""" """
The constructor is only for internal use. Use the static build method instead. The constructor is only for internal use. Use the static build method instead.
@ -126,7 +128,7 @@ class TranscriptList(object):
self._translation_languages = translation_languages self._translation_languages = translation_languages
@staticmethod @staticmethod
def build(http_client, video_id, captions_json): def build(http_client, video_id, captions_json, preserve_formatting=False):
""" """
Factory method for TranscriptList. Factory method for TranscriptList.
@ -162,7 +164,8 @@ class TranscriptList(object):
caption['name']['simpleText'], caption['name']['simpleText'],
caption['languageCode'], caption['languageCode'],
caption.get('kind', '') == 'asr', caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [] translation_languages if caption.get('isTranslatable', False) else [],
preserve_formatting=preserve_formatting,
) )
return TranscriptList( return TranscriptList(
@ -262,7 +265,8 @@ class TranscriptList(object):
class Transcript(object): class Transcript(object):
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
preserve_formatting=False):
""" """
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
TranscriptList. TranscriptList.
@ -276,6 +280,7 @@ class Transcript(object):
:param language_code: :param language_code:
:param is_generated: :param is_generated:
:param translation_languages: :param translation_languages:
:param preserve_formatting: whether to keep select HTML text formatting
""" """
self._http_client = http_client self._http_client = http_client
self.video_id = video_id self.video_id = video_id
@ -288,6 +293,7 @@ class Transcript(object):
translation_language['language_code']: translation_language['language'] translation_language['language_code']: translation_language['language']
for translation_language in translation_languages for translation_language in translation_languages
} }
self.preserve_formatting = preserve_formatting
def fetch(self): def fetch(self):
""" """
@ -297,7 +303,7 @@ class Transcript(object):
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
response = self._http_client.get(self._url) response = self._http_client.get(self._url)
return _TranscriptParser().parse( return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text, _raise_http_errors(response, self.video_id).text,
) )
@ -327,6 +333,7 @@ class Transcript(object):
language_code, language_code,
True, True,
[], [],
preserve_formatting=self.preserve_formatting,
) )
class _TranscriptParser(object): class _TranscriptParser(object):
def __init__(self, preserve_formatting=False): def __init__(self, preserve_formatting=False):