Propagate formatting up to user level
This commit is contained in:
parent
1f1c8b249b
commit
c1a037c39c
|
@ -16,7 +16,7 @@ from ._errors import (
|
||||||
|
|
||||||
class YouTubeTranscriptApi(object):
|
class YouTubeTranscriptApi(object):
|
||||||
@classmethod
|
@classmethod
|
||||||
def list_transcripts(cls, video_id, proxies=None, cookies=None):
|
def list_transcripts(cls, video_id, proxies=None, cookies=None, preserve_formatting=False):
|
||||||
"""
|
"""
|
||||||
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
|
||||||
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
|
||||||
|
@ -68,7 +68,8 @@ class YouTubeTranscriptApi(object):
|
||||||
if cookies:
|
if cookies:
|
||||||
http_client.cookies = cls._load_cookies(cookies, video_id)
|
http_client.cookies = cls._load_cookies(cookies, video_id)
|
||||||
http_client.proxies = proxies if proxies else {}
|
http_client.proxies = proxies if proxies else {}
|
||||||
return TranscriptListFetcher(http_client).fetch(video_id)
|
return TranscriptListFetcher(http_client).fetch(video_id,
|
||||||
|
preserve_formatting=preserve_formatting)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
|
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
|
||||||
|
@ -109,7 +110,7 @@ class YouTubeTranscriptApi(object):
|
||||||
return data, unretrievable_videos
|
return data, unretrievable_videos
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
|
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
|
||||||
"""
|
"""
|
||||||
Retrieves the transcript for a single video. This is just a shortcut for calling::
|
Retrieves the transcript for a single video. This is just a shortcut for calling::
|
||||||
|
|
||||||
|
@ -129,7 +130,7 @@ class YouTubeTranscriptApi(object):
|
||||||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||||
"""
|
"""
|
||||||
assert isinstance(video_id, str), "`video_id` must be a string"
|
assert isinstance(video_id, str), "`video_id` must be a string"
|
||||||
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
|
return cls.list_transcripts(video_id, proxies, cookies, preserve_formatting=preserve_formatting).find_transcript(languages).fetch()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_cookies(cls, cookies, video_id):
|
def _load_cookies(cls, cookies, video_id):
|
||||||
|
|
|
@ -53,11 +53,12 @@ class TranscriptListFetcher(object):
|
||||||
def __init__(self, http_client):
|
def __init__(self, http_client):
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
|
|
||||||
def fetch(self, video_id):
|
def fetch(self, video_id, preserve_formatting=False):
|
||||||
return TranscriptList.build(
|
return TranscriptList.build(
|
||||||
self._http_client,
|
self._http_client,
|
||||||
video_id,
|
video_id,
|
||||||
self._extract_captions_json(self._fetch_video_html(video_id), video_id)
|
self._extract_captions_json(self._fetch_video_html(video_id), video_id),
|
||||||
|
preserve_formatting=preserve_formatting,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_captions_json(self, html, video_id):
|
def _extract_captions_json(self, html, video_id):
|
||||||
|
@ -107,7 +108,8 @@ class TranscriptList(object):
|
||||||
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
||||||
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
||||||
"""
|
"""
|
||||||
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
The constructor is only for internal use. Use the static build method instead.
|
The constructor is only for internal use. Use the static build method instead.
|
||||||
|
|
||||||
|
@ -126,7 +128,7 @@ class TranscriptList(object):
|
||||||
self._translation_languages = translation_languages
|
self._translation_languages = translation_languages
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build(http_client, video_id, captions_json):
|
def build(http_client, video_id, captions_json, preserve_formatting=False):
|
||||||
"""
|
"""
|
||||||
Factory method for TranscriptList.
|
Factory method for TranscriptList.
|
||||||
|
|
||||||
|
@ -162,7 +164,8 @@ class TranscriptList(object):
|
||||||
caption['name']['simpleText'],
|
caption['name']['simpleText'],
|
||||||
caption['languageCode'],
|
caption['languageCode'],
|
||||||
caption.get('kind', '') == 'asr',
|
caption.get('kind', '') == 'asr',
|
||||||
translation_languages if caption.get('isTranslatable', False) else []
|
translation_languages if caption.get('isTranslatable', False) else [],
|
||||||
|
preserve_formatting=preserve_formatting,
|
||||||
)
|
)
|
||||||
|
|
||||||
return TranscriptList(
|
return TranscriptList(
|
||||||
|
@ -262,7 +265,8 @@ class TranscriptList(object):
|
||||||
|
|
||||||
|
|
||||||
class Transcript(object):
|
class Transcript(object):
|
||||||
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages,
|
||||||
|
preserve_formatting=False):
|
||||||
"""
|
"""
|
||||||
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
||||||
TranscriptList.
|
TranscriptList.
|
||||||
|
@ -276,6 +280,7 @@ class Transcript(object):
|
||||||
:param language_code:
|
:param language_code:
|
||||||
:param is_generated:
|
:param is_generated:
|
||||||
:param translation_languages:
|
:param translation_languages:
|
||||||
|
:param preserve_formatting: whether to keep select HTML text formatting
|
||||||
"""
|
"""
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
|
@ -288,6 +293,7 @@ class Transcript(object):
|
||||||
translation_language['language_code']: translation_language['language']
|
translation_language['language_code']: translation_language['language']
|
||||||
for translation_language in translation_languages
|
for translation_language in translation_languages
|
||||||
}
|
}
|
||||||
|
self.preserve_formatting = preserve_formatting
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self):
|
||||||
"""
|
"""
|
||||||
|
@ -297,7 +303,7 @@ class Transcript(object):
|
||||||
:rtype [{'text': str, 'start': float, 'end': float}]:
|
:rtype [{'text': str, 'start': float, 'end': float}]:
|
||||||
"""
|
"""
|
||||||
response = self._http_client.get(self._url)
|
response = self._http_client.get(self._url)
|
||||||
return _TranscriptParser().parse(
|
return _TranscriptParser(preserve_formatting=self.preserve_formatting).parse(
|
||||||
_raise_http_errors(response, self.video_id).text,
|
_raise_http_errors(response, self.video_id).text,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -327,11 +333,12 @@ class Transcript(object):
|
||||||
language_code,
|
language_code,
|
||||||
True,
|
True,
|
||||||
[],
|
[],
|
||||||
|
preserve_formatting=self.preserve_formatting,
|
||||||
)
|
)
|
||||||
class _TranscriptParser(object):
|
class _TranscriptParser(object):
|
||||||
def __init__(self, preserve_formatting=False):
|
def __init__(self, preserve_formatting=False):
|
||||||
self.preserve_formatting = preserve_formatting
|
self.preserve_formatting = preserve_formatting
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def html_regex(self):
|
def html_regex(self):
|
||||||
if self.preserve_formatting:
|
if self.preserve_formatting:
|
||||||
|
|
Loading…
Reference in New Issue