Make _html_regex an instance attribute of _TranscriptParser, computed once in __init__ via get_html_regex()

Also rename TEXT_FORMATS -> _FORMATTING_TAGS
This commit is contained in:
E. Seiver 2023-04-12 14:29:19 -07:00
parent 72e9781528
commit eda8ddb38f
1 changed file with 5 additions and 5 deletions

View File

@ -27,7 +27,7 @@ from ._errors import (
) )
from ._settings import WATCH_URL from ._settings import WATCH_URL
TEXT_FORMATS = [ _FORMATTING_TAGS = [
'strong', # important 'strong', # important
'em', # emphasized 'em', # emphasized
'b', # bold 'b', # bold
@ -341,11 +341,11 @@ class Transcript(object):
class _TranscriptParser(object): class _TranscriptParser(object):
def __init__(self, preserve_formatting=False): def __init__(self, preserve_formatting=False):
self.preserve_formatting = preserve_formatting self.preserve_formatting = preserve_formatting
self._html_regex = self.get_html_regex()
@property def get_html_regex(self):
def html_regex(self):
if self.preserve_formatting: if self.preserve_formatting:
formats_regex = '|'.join(TEXT_FORMATS) formats_regex = '|'.join(_FORMATTING_TAGS)
formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
html_regex = re.compile(formats_regex, re.IGNORECASE) html_regex = re.compile(formats_regex, re.IGNORECASE)
else: else:
@ -355,7 +355,7 @@ class _TranscriptParser(object):
def parse(self, plain_data): def parse(self, plain_data):
return [ return [
{ {
'text': re.sub(self.html_regex, '', unescape(xml_element.text)), 'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']), 'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib.get('dur', '0.0')), 'duration': float(xml_element.attrib.get('dur', '0.0')),
} }