_FORMATTING_TAGS is now a static property of _TranscriptParser; _get_html_regex is now private; removed preserve_formatting property of _TranscriptParser

2023-04-17 15:07:10 +02:00 · 2023-04-17 15:07:10 +02:00 · 8c62e5e276
parent ca93c48fa1
commit 8c62e5e276
1 changed files with 23 additions and 21 deletions
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@ -1,7 +1,7 @@
 import sys
 # This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 2: # pragma: no cover
+if sys.version_info.major == 2:  # pragma: no cover
    reload(sys)
    sys.setdefaultencoding('utf-8')
@ -95,6 +95,7 @@ class TranscriptList(object):
    This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
    for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
    """
    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        The constructor is only for internal use. Use the static build method instead.
@ -191,7 +192,7 @@ class TranscriptList(object):
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
-        return self._find_transcript(language_codes, [self._generated_transcripts,])
+        return self._find_transcript(language_codes, [self._generated_transcripts])
    def find_manually_created_transcript(self, language_codes):
        """
@ -205,7 +206,7 @@ class TranscriptList(object):
        :rtype Transcript:
        :raises: NoTranscriptFound
        """
-        return self._find_transcript(language_codes, [self._manually_created_transcripts,])
+        return self._find_transcript(language_codes, [self._manually_created_transcripts])
    def _find_transcript(self, language_codes, transcript_dicts):
        for language_code in language_codes:
@ -287,7 +288,8 @@ class Transcript(object):
        """
        response = self._http_client.get(self._url)
        return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
-            _raise_http_errors(response, self.video_id).text,)
+            _raise_http_errors(response, self.video_id).text,
+        )
    def __str__(self):
        return '{language_code} ("{language}"){translation_description}'.format(
@ -319,24 +321,24 @@ class Transcript(object):
 class _TranscriptParser(object):
-    def __init__(self, preserve_formatting=False):
+    _FORMATTING_TAGS = [
-        self.preserve_formatting = preserve_formatting
+        'strong',  # important
-        self._FORMATTING_TAGS = [
+        'em',  # emphasized
-            'strong',  # important
+        'b',  # bold
-            'em',  # emphasized
+        'i',  # italic
-            'b',  # bold
+        'mark',  # marked
-            'i',  # italic
+        'small',  # smaller
-            'mark',  # marked
+        'del',  # deleted
-            'small',  # smaller
+        'ins',  # inserted
-            'del',  # deleted
+        'sub',  # subscript
-            'ins',  # inserted
+        'sup',  # superscript
-            'sub',  # subscript
+    ]
-            'sup',  # superscript
-            ]
-        self._html_regex = self.get_html_regex()
-    def get_html_regex(self):
+    def __init__(self, preserve_formatting=False):
-        if self.preserve_formatting:
+        self._html_regex = self._get_html_regex(preserve_formatting)
+    def _get_html_regex(self, preserve_formatting):
+        if preserve_formatting:
            formats_regex = '|'.join(self._FORMATTING_TAGS)
            formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>'
            html_regex = re.compile(formats_regex, re.IGNORECASE)