_FORMATTING_TAGS is now a class-level (static) attribute of _TranscriptParser; get_html_regex is now private (renamed to _get_html_regex); removed the preserve_formatting attribute of _TranscriptParser

This commit is contained in:
Jonas Depoix 2023-04-17 15:07:10 +02:00
parent ca93c48fa1
commit 8c62e5e276
1 changed file with 23 additions and 21 deletions

View File

@ -1,7 +1,7 @@
import sys import sys
# This can only be tested by using different python versions, therefore it is not covered by coverage.py # This can only be tested by using different python versions, therefore it is not covered by coverage.py
if sys.version_info.major == 2: # pragma: no cover if sys.version_info.major == 2: # pragma: no cover
reload(sys) reload(sys)
sys.setdefaultencoding('utf-8') sys.setdefaultencoding('utf-8')
@ -95,6 +95,7 @@ class TranscriptList(object):
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
for a given YouTube video. Also it provides functionality to search for a transcript in a given language. for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
""" """
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
""" """
The constructor is only for internal use. Use the static build method instead. The constructor is only for internal use. Use the static build method instead.
@ -191,7 +192,7 @@ class TranscriptList(object):
:rtype Transcript: :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._generated_transcripts,]) return self._find_transcript(language_codes, [self._generated_transcripts])
def find_manually_created_transcript(self, language_codes): def find_manually_created_transcript(self, language_codes):
""" """
@ -205,7 +206,7 @@ class TranscriptList(object):
:rtype Transcript: :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._manually_created_transcripts,]) return self._find_transcript(language_codes, [self._manually_created_transcripts])
def _find_transcript(self, language_codes, transcript_dicts): def _find_transcript(self, language_codes, transcript_dicts):
for language_code in language_codes: for language_code in language_codes:
@ -287,7 +288,8 @@ class Transcript(object):
""" """
response = self._http_client.get(self._url) response = self._http_client.get(self._url)
return _TranscriptParser(preserve_formatting=preserve_formatting).parse( return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text,) _raise_http_errors(response, self.video_id).text,
)
def __str__(self): def __str__(self):
return '{language_code} ("{language}"){translation_description}'.format( return '{language_code} ("{language}"){translation_description}'.format(
@ -319,24 +321,24 @@ class Transcript(object):
class _TranscriptParser(object): class _TranscriptParser(object):
def __init__(self, preserve_formatting=False): _FORMATTING_TAGS = [
self.preserve_formatting = preserve_formatting 'strong', # important
self._FORMATTING_TAGS = [ 'em', # emphasized
'strong', # important 'b', # bold
'em', # emphasized 'i', # italic
'b', # bold 'mark', # marked
'i', # italic 'small', # smaller
'mark', # marked 'del', # deleted
'small', # smaller 'ins', # inserted
'del', # deleted 'sub', # subscript
'ins', # inserted 'sup', # superscript
'sub', # subscript ]
'sup', # superscript
]
self._html_regex = self.get_html_regex()
def get_html_regex(self): def __init__(self, preserve_formatting=False):
if self.preserve_formatting: self._html_regex = self._get_html_regex(preserve_formatting)
def _get_html_regex(self, preserve_formatting):
if preserve_formatting:
formats_regex = '|'.join(self._FORMATTING_TAGS) formats_regex = '|'.join(self._FORMATTING_TAGS)
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
html_regex = re.compile(formats_regex, re.IGNORECASE) html_regex = re.compile(formats_regex, re.IGNORECASE)