Add new formatters.py module
This commit is contained in:
parent
7a47fc83ad
commit
f3dc6f508f
|
@ -0,0 +1,91 @@
|
|||
import json
|
||||
|
||||
|
||||
class Formatter(object):
    """Base class that every transcript formatter derives from.

    Subclasses are expected to override :meth:`format` so that it returns
    the stored transcript rendered as a string. The transcript itself is
    held as a list of dictionary items.

    :param transcript: list representing 1 or more transcripts
    :type transcript: list
    :raises TypeError: if ``transcript`` is not a list
    """

    def __init__(self, transcript):
        # Only a list (or list subclass) is accepted as a transcript.
        if isinstance(transcript, list):
            self._transcript = transcript
        else:
            raise TypeError("'transcript' must be of type: List")

    def format(self, **kwargs):
        """Placeholder that concrete formatters must override.

        :param kwargs: formatter-specific options; unused in this base class
        :raises NotImplementedError: always, when called on this base class
        """
        message = (
            'A subclass of Formatter must implement '
            'their own .format() method.'
        )
        raise NotImplementedError(message)
|
||||
|
||||
|
||||
class JSONFormatter(Formatter):
    """Formatter that serialises the stored transcript with ``json.dumps``."""

    def format(self, **kwargs):
        """Render the transcript as a JSON string.

        :param kwargs: keyword arguments forwarded unchanged to
            ``json.dumps`` (for example ``indent``)
        :return: a JSON string representation of the transcript
        :rtype: str
        """
        transcript = self._transcript
        serialised = json.dumps(transcript, **kwargs)
        return serialised
|
||||
|
||||
|
||||
class TextFormatter(Formatter):
    """Formatter that keeps only the spoken text of each transcript item."""

    def format(self, **kwargs):
        """Render the transcript as plain text without timestamps.

        :param kwargs: accepted for interface compatibility; not used here
        :return: the ``'text'`` value of every transcript item, joined by
            newline characters
        :rtype: str
        """
        text_lines = [entry['text'] for entry in self._transcript]
        return "\n".join(text_lines)
|
||||
|
||||
|
||||
class WebVTTFormatter(Formatter):
    """Formatter that renders the stored transcript as a WebVTT document."""

    def _seconds_to_timestamp(self, time):
        """Helper that converts `time` into a transcript cue timestamp.

        :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

        :param time: a float representing time in seconds.
        :type time: float
        :return: a string formatted as a cue timestamp, 'HH:MM:SS.mmm'
        :rtype: str
        :example:
        >>> self._seconds_to_timestamp(6.93)
        '00:00:06.930'
        >>> self._seconds_to_timestamp(3661.5)
        '01:01:01.500'
        """
        # Derive every field from a single whole-millisecond count so that
        # carries propagate (ms -> secs -> mins -> hours) and the minutes and
        # seconds fields always stay within 00-59. Rounding to two decimals
        # before truncating absorbs binary floating point noise.
        total_ms = int(round(float(time) * 1000, 2))
        total_secs, ms = divmod(total_ms, 1000)
        total_mins, secs = divmod(total_secs, 60)
        hours, mins = divmod(total_mins, 60)
        return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)

    def format(self, **kwargs):
        """A basic implementation of WEBVTT formatting.

        :reference: https://www.w3.org/TR/webvtt1/#introduction-caption

        :param kwargs: accepted for interface compatibility; not used here
        :return: the "WEBVTT" header followed by one cue per transcript
            item, each cue being a timing line and the item's text
        :rtype: str
        """
        cues = []
        last_index = len(self._transcript) - 1
        for i, line in enumerate(self._transcript):
            if i < last_index:
                # Looks ahead, use next start time since duration value
                # would create an overlap between start times.
                end = self._transcript[i + 1]['start']
            else:
                # Reached the end, cannot look ahead, use duration now.
                end = line['start'] + line['duration']
            time_text = "{} --> {}".format(
                self._seconds_to_timestamp(line['start']),
                self._seconds_to_timestamp(end),
            )
            cues.append("{}\n{}".format(time_text, line['text']))

        return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"
|
Loading…
Reference in New Issue