# Contents of youtube_transcript_api/formatters.py as introduced by the patch
# "Add new formatters.py module" (Chris Howell, 2020-09-01,
# commit f3dc6f508f6ff3753798f2d8d66307fbb7641830).
import json


class Formatter(object):
    """Formatter should be used as an abstract base class.

    Formatter classes should inherit from this class and implement
    their own .format() method which should return a string. A
    transcript is represented by a List of Dictionary items.

    :param transcript: list representing 1 or more transcripts
    :type transcript: list
    :raises TypeError: if `transcript` is not a list.
    """
    def __init__(self, transcript):
        if not isinstance(transcript, list):
            raise TypeError("'transcript' must be of type: List")

        self._transcript = transcript

    def format(self, **kwargs):
        """Return the transcript rendered as a string.

        The base class has no output format of its own, so this always
        raises; subclasses override it.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError('A subclass of Formatter must implement '
                                  'their own .format() method.')


class JSONFormatter(Formatter):
    """Formatter that serializes the transcript with `json.dumps`."""

    def format(self, **kwargs):
        """Converts a transcript into a JSON string.

        :param kwargs: forwarded unchanged to `json.dumps` (e.g. `indent`).
        :return: A JSON string representation of the transcript.
        :rtype: str
        """
        return json.dumps(self._transcript, **kwargs)


class TextFormatter(Formatter):
    """Formatter that keeps only the 'text' value of every transcript item."""

    def format(self, **kwargs):
        """Converts a transcript into plain text with no timestamps.

        :param kwargs: accepted for interface compatibility; not used.
        :return: all transcript text lines separated by newline breaks.
        :rtype: str
        """
        return "\n".join(line['text'] for line in self._transcript)


class WebVTTFormatter(Formatter):
    """Formatter that renders the transcript as a WEBVTT document.

    Every transcript item is read through its 'text' and 'start' keys; the
    last item is additionally read through its 'duration' key.
    """

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS'
        :rtype: str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        """
        # Round once to whole milliseconds and split the integer from there.
        # Doing the rounding first means a fraction such as .9996 carries
        # into the seconds instead of being printed as a 4-digit "1000".
        total_ms = int(round(float(time) * 1000))
        hours, remainder = divmod(total_ms, 3600 * 1000)
        # Minutes are taken from what is left after removing whole hours,
        # so they always stay within 0-59.
        mins, remainder = divmod(remainder, 60 * 1000)
        secs, ms = divmod(remainder, 1000)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format(self, **kwargs):
        """A basic implementation of WEBVTT formatting.

        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption

        :param kwargs: accepted for interface compatibility; not used.
        :return: the 'WEBVTT' header followed by one cue per transcript
            item, cues separated by blank lines.
        :rtype: str
        """
        transcript = self._transcript
        cues = []
        for i, line in enumerate(transcript):
            if i + 1 < len(transcript):
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                end = transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end = line['start'] + line['duration']
            time_text = "{} --> {}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end)
            )
            cues.append("{}\n{}".format(time_text, line['text']))

        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"