Add more functionality to TranscriptFormatter base class

Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts with a given formatter.

- Can now specify a DELIMITER used to separate multiple transcripts.
- Can also specify how those items are combined by overriding the `combine` class method.

Remove unused imports
Adjust some lines to meet PEP 8
This commit is contained in:
Chris Howell 2020-07-09 00:04:08 -07:00
parent 2c79bd563c
commit 1c0d584959
1 changed files with 56 additions and 39 deletions

View File

@ -1,13 +1,9 @@
from abc import ABCMeta from abc import ABC
from abc import abstractclassmethod from abc import abstractclassmethod
from collections import defaultdict from collections import defaultdict
import json import json
import re import re
from xml.etree import ElementTree
from ._html_unescaping import unescape
def parse_timecode(time): def parse_timecode(time):
"""Converts a `time` into a formatted transcript timecode. """Converts a `time` into a formatted transcript timecode.
@ -31,15 +27,29 @@ def parse_timecode(time):
return f"{hours}:{mins}:{secs},{ms}" return f"{hours}:{mins}:{secs},{ms}"
class TranscriptFormatter(metaclass=ABCMeta): class TranscriptFormatter(ABC):
""" """Abstract Base TranscriptFormatter class
Abstract Base TranscriptFormatter class
This class should be inherited from to create additional This class should be inherited from to create additional
custom transcript formatters. custom transcript formatters.
""" """
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
DELIMITER = ''
@classmethod
def combine(cls, transcripts):
"""Subclass may override this class method.
Default behavior of this method will ''.join() the str()
of each transcript in transcripts.
:param transcripts: a list of many transcripts
:type transcript_data: list[<formatted transcript>, ...]
:return: A string joined on the `cls.DELIMITER` to combine transcripts
:rtype: str
"""
return cls.DELIMITER.join(
str(transcript) for transcript in transcripts)
@abstractclassmethod @abstractclassmethod
def format(cls, transcript_data): def format(cls, transcript_data):
@ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta):
class JSONTranscriptFormatter(TranscriptFormatter): class JSONTranscriptFormatter(TranscriptFormatter):
"""Formatter for outputting JSON data""" """Formatter for outputting JSON data"""
DELIMITER = ','
@classmethod
def combine(cls, transcripts):
return json.dumps(transcripts)
@classmethod @classmethod
def format(cls, transcript_data): def format(cls, transcript_data):
return [json.dumps(transcript_data)] if transcript_data else [] return transcript_data
class TextTranscriptFormatter(TranscriptFormatter): class TextTranscriptFormatter(TranscriptFormatter):
@ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter):
Converts the fetched transcript data into separated lines of Converts the fetched transcript data into separated lines of
plain text separated by newline breaks (\n) with no timecodes. plain text separated by newline breaks (\n) with no timecodes.
""" """
DELIMITER = '\n\n'
@classmethod @classmethod
def format(cls, transcript_data): def format(cls, transcript_data):
return ['\n'.join(line['text'] for transcript in transcript_data return '{}\n'.format('\n'.join(
for line in transcript)] line['text']for line in transcript_data))
class SRTTranscriptFormatter(TranscriptFormatter): class SRTTranscriptFormatter(TranscriptFormatter):
"""Formatter for outputting the SRT Format """Formatter for outputting the SRT Format
Converts the fetched transcript data into a simple .srt file format. Converts the fetched transcript data into a simple .srt file format.
""" """
DELIMITER = '\n\n'
@classmethod @classmethod
def format(cls, transcript_data): def format(cls, transcript_data):
contents = [] output = []
for transcript in transcript_data: for frame, item in enumerate(transcript_data, start=1):
content = []
for frame, item in enumerate(transcript, start=1):
start_time = float(item.get('start')) start_time = float(item.get('start'))
duration = float(item.get('dur', '0.0')) duration = float(item.get('dur', '0.0'))
end_time = parse_timecode(start_time + duration) end_time = parse_timecode(start_time + duration)
start_time = parse_timecode(start_time) start_time = parse_timecode(start_time)
content.append("{frame}\n".format(frame=frame)) output.append("{frame}\n".format(frame=frame))
content.append("{start_time} --> {end_time}\n".format( output.append("{start_time} --> {end_time}\n".format(
start_time=start_time, end_time=end_time)) start_time=start_time, end_time=end_time))
content.append("{text}\n\n".format(text=item.get('text'))) output.append("{text}".format(text=item.get('text')))
if frame < len(transcript_data):
output.append('\n\n')
contents.append(''.join(content)) return '{}\n'.format(''.join(output))
return ['\n\n'.join(contents)]
class TranscriptFormatterFactory: class TranscriptFormatterFactory:
@ -124,7 +141,7 @@ class TranscriptFormatterFactory:
if not issubclass(formatter_class, TranscriptFormatter): if not issubclass(formatter_class, TranscriptFormatter):
raise TypeError( raise TypeError(
f'{formatter_class} must be a subclass of TranscriptFormatter') f'{formatter_class} must be a subclass of TranscriptFormatter')
self._formatters.update({name:formatter_class}) self._formatters.update({name: formatter_class})
def add_formatters(self, formatters_dict): def add_formatters(self, formatters_dict):
"""Allow creation of multiple transcript formatters at a time. """Allow creation of multiple transcript formatters at a time.