Add new formatters.py module

2020-09-01 15:21:47 -07:00 · 2020-09-01 15:21:47 -07:00 · f3dc6f508f
parent 7a47fc83ad
commit f3dc6f508f
1 changed file with 91 additions and 0 deletions
--- a/youtube_transcript_api/formatters.py
+++ b/youtube_transcript_api/formatters.py
@@ -0,0 +1,91 @@
+import json
+
+
+class Formatter(object):
+    """Abstract base class shared by all transcript formatters.
+
+    Concrete formatters inherit from this class and override
+    :meth:`format` so that it returns the transcript rendered as a
+    string. A transcript is represented by a list of dictionary items.
+
+    :param transcript: list representing 1 or more transcripts
+    :type transcript: list
+    :raises TypeError: if ``transcript`` is not a ``list`` instance.
+    """
+
+    def __init__(self, transcript):
+        # Only list instances are accepted; anything else is rejected
+        # before it gets stored on the instance.
+        if isinstance(transcript, list):
+            self._transcript = transcript
+        else:
+            raise TypeError("'transcript' must be of type: List")
+
+    def format(self, **kwargs):
+        """Placeholder that subclasses are expected to override.
+
+        :raises NotImplementedError: always, in this base class.
+        """
+        message = (
+            'A subclass of Formatter must implement '
+            'their own .format() method.'
+        )
+        raise NotImplementedError(message)
+
+
+class JSONFormatter(Formatter):
+    """Formatter that serializes the transcript with ``json.dumps``."""
+
+    def format(self, **kwargs):
+        """Serialize the stored transcript to a JSON string.
+
+        :param kwargs: keyword arguments forwarded unchanged to
+            ``json.dumps``.
+        :return: A JSON string representation of the transcript.
+        :rtype: str
+        """
+        transcript = self._transcript
+        json_text = json.dumps(transcript, **kwargs)
+        return json_text
+
+
+class TextFormatter(Formatter):
+    """Formatter that keeps only the text of each transcript entry."""
+
+    def format(self, **kwargs):
+        """Join the ``'text'`` value of every entry into plain text.
+
+        Timestamps are left out; ``kwargs`` is accepted but not used.
+
+        :return: all transcript text lines separated by newline breaks.
+        :rtype: str
+        """
+        text_lines = [entry['text'] for entry in self._transcript]
+        return "\n".join(text_lines)
+
+
+class WebVTTFormatter(Formatter):
+    """Formatter that renders the transcript as a WebVTT document."""
+
+    def _seconds_to_timestamp(self, time):
+        """Helper that converts `time` into a transcript cue timestamp.
+
+        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp
+
+        :param time: time in seconds (any value ``float()`` accepts).
+        :type time: float
+        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
+        :rtype: str
+        :example:
+        >>> self._seconds_to_timestamp(6.93)
+        '00:00:06.930'
+        >>> self._seconds_to_timestamp(3725.5)
+        '01:02:05.500'
+        """
+        # Convert to whole milliseconds once, then split with divmod.
+        # This keeps minutes and seconds within 0-59 and prevents the
+        # fractional part from rounding up to a 4-digit '1000' field;
+        # any round-up carries into the seconds instead.
+        total_ms = int(round(float(time) * 1000))
+        hours, remainder = divmod(total_ms, 3600 * 1000)
+        mins, remainder = divmod(remainder, 60 * 1000)
+        secs, ms = divmod(remainder, 1000)
+        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)
+
+    def format(self, **kwargs):
+        """A basic implementation of WEBVTT formatting.
+
+        Every transcript entry becomes one cue: a timing line followed
+        by the entry's text. ``kwargs`` is accepted but not used.
+
+        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
+
+        :return: the 'WEBVTT' header, a blank line, the cues separated
+            by blank lines, and a trailing newline.
+        :rtype: str
+        """
+        transcript = self._transcript
+        last_index = len(transcript) - 1
+        cues = []
+        for i, line in enumerate(transcript):
+            if i < last_index:
+                # Looks ahead, use next start time since duration value
+                # would create an overlap between start times.
+                end = transcript[i + 1]['start']
+            else:
+                # Reached the end, cannot look ahead, use duration now.
+                end = line['start'] + line['duration']
+            cues.append("{} --> {}\n{}".format(
+                self._seconds_to_timestamp(line['start']),
+                self._seconds_to_timestamp(end),
+                line['text']
+            ))
+
+        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"