Add optional HTML formatting preservation to `_TranscriptParser`

The text-format tags that can be preserved are listed in the `TEXT_FORMATS` global variable.
The new `preserve_formatting` option defaults to False.
This commit is contained in:
E. Seiver 2023-03-15 15:44:26 -07:00 committed by GitHub
parent 6070e6165a
commit 1f1c8b249b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 26 additions and 4 deletions

View File

@ -27,6 +27,19 @@ from ._errors import (
) )
from ._settings import WATCH_URL from ._settings import WATCH_URL
# Names of the inline HTML text-formatting tags that `_TranscriptParser` leaves
# in the transcript text when it is created with `preserve_formatting=True`.
TEXT_FORMATS = [
    'strong',  # important text
    'em',      # emphasized text
    'b',       # bold text
    'i',       # italic text
    'mark',    # marked / highlighted text
    'small',   # smaller text
    'del',     # deleted text
    'ins',     # inserted text
    'sub',     # subscript text
    'sup',     # superscript text
]
def _raise_http_errors(response, video_id): def _raise_http_errors(response, video_id):
try: try:
@ -315,15 +328,24 @@ class Transcript(object):
True, True,
[], [],
) )
class _TranscriptParser(object): class _TranscriptParser(object):
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) def __init__(self, preserve_formatting=False):
self.preserve_formatting = preserve_formatting
@property
def html_regex(self):
    """
    Compiled, case-insensitive pattern matching the HTML tags to remove.

    When `preserve_formatting` is falsy, every `<...>` tag matches. Otherwise
    tags whose name is listed in `TEXT_FORMATS` (opening or closing, with or
    without attributes) are excluded from the match, so removing all matches
    of this pattern leaves those tags in the text.

    :return: the compiled regular expression
    """
    if not self.preserve_formatting:
        return re.compile(r'<[^>]*>', re.IGNORECASE)

    kept_tags = '|'.join(TEXT_FORMATS)
    # The negative lookahead skips the kept tags; its `\b` stops `b` from also
    # shielding `<br>`. The tag body is `[^>]*` so a match always ends at the
    # tag's own `>`: a lazy `.*?\b>` skips any `>` that follows a non-word
    # character (`<br/>`, `<font color="#fff">`) and runs on to a later `>`,
    # swallowing the caption text in between.
    return re.compile(
        r'<(?!\/?(?:' + kept_tags + r')\b)[^>]*>',
        re.IGNORECASE,
    )
def parse(self, plain_data): def parse(self, plain_data):
return [ return [
{ {
'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), 'text': re.sub(self.html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']), 'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib.get('dur', '0.0')), 'duration': float(xml_element.attrib.get('dur', '0.0')),
} }