From 1f1c8b249b931a27fc78f2c0f18c65993be38ef3 Mon Sep 17 00:00:00 2001 From: "E. Seiver" Date: Wed, 15 Mar 2023 15:44:26 -0700 Subject: [PATCH] Add optional HTML formatting `_TranscriptParser` Text formats in `TEXT_FORMATS` global variable Defaults to False --- youtube_transcript_api/_transcripts.py | 30 ++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index cea50c4..64925f3 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -27,6 +27,19 @@ from ._errors import ( ) from ._settings import WATCH_URL +TEXT_FORMATS = [ + 'strong', # important + 'em', # emphasized + 'b', # bold + 'i', # italic + 'mark', # marked + 'small', # smaller + 'del', # deleted + 'ins', # inserted + 'sub', # subscript + 'sup', # superscript +] + def _raise_http_errors(response, video_id): try: @@ -315,15 +328,24 @@ class Transcript(object): True, [], ) - - class _TranscriptParser(object): - HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) + def __init__(self, preserve_formatting=False): + self.preserve_formatting = preserve_formatting + + @property + def html_regex(self): + if self.preserve_formatting: + formats_regex = '|'.join(TEXT_FORMATS) + formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' + html_regex = re.compile(formats_regex, re.IGNORECASE) + else: + html_regex = re.compile(r'<[^>]*>', re.IGNORECASE) + return html_regex def parse(self, plain_data): return [ { - 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), + 'text': re.sub(self.html_regex, '', unescape(xml_element.text)), 'start': float(xml_element.attrib['start']), 'duration': float(xml_element.attrib.get('dur', '0.0')), }