From 1f1c8b249b931a27fc78f2c0f18c65993be38ef3 Mon Sep 17 00:00:00 2001
From: "E. Seiver" <ellis.seiver@gmail.com>
Date: Wed, 15 Mar 2023 15:44:26 -0700
Subject: [PATCH] Add optional HTML formatting preservation to `_TranscriptParser`

The text-formatting tags that are kept are listed in the `TEXT_FORMATS` global variable.
The new `preserve_formatting` option defaults to False.
---
 youtube_transcript_api/_transcripts.py | 30 ++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py
index cea50c4..64925f3 100644
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@@ -27,6 +27,19 @@ from ._errors import (
 )
 from ._settings import WATCH_URL
 
+TEXT_FORMATS = [  # HTML tags kept when preserve_formatting is enabled
+    'strong',  # important
+    'em',  # emphasized
+    'b',  # bold
+    'i',  # italic
+    'mark',  # marked
+    'small',  # smaller
+    'del',  # deleted
+    'ins',  # inserted
+    'sub',  # subscript
+    'sup',  # superscript
+]
+
 
 def _raise_http_errors(response, video_id):
     try:
@@ -315,15 +328,24 @@ class Transcript(object):
             True,
             [],
         )
-
-
 class _TranscriptParser(object):
-    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+    def __init__(self, preserve_formatting=False):  # True keeps TEXT_FORMATS tags
+        self.preserve_formatting = preserve_formatting
+    
+    @property
+    def html_regex(self):
+        """Tag-stripping regex; spares TEXT_FORMATS if preserve_formatting."""
+        if not self.preserve_formatting:
+            return re.compile(r'<[^>]*>', re.IGNORECASE)
+        # Use [^>]*> rather than .*?\b>: the latter skips a '>' that follows a
+        # quote or '/', and would then delete caption text up to a later tag.
+        kept = '|'.join(TEXT_FORMATS)
+        return re.compile(r'<\/?(?!\/?(' + kept + r')\b)[^>]*>', re.IGNORECASE)
 
     def parse(self, plain_data):
         return [
             {
-                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
+                'text': re.sub(self.html_regex, '', unescape(xml_element.text)),
                 'start': float(xml_element.attrib['start']),
                 'duration': float(xml_element.attrib.get('dur', '0.0')),
             }