Add more functionality to TranscriptFormatter base class
Due to the behavior of the CLI and API, more flexibility was needed for combining one or many transcripts for a given formatter. - A DELIMITER can now be specified on which multiple transcripts are separated. - How those items are combined can also be customized by overriding the `combine` class method. Remove unused imports. Adjust some lines to meet PEP 8.
This commit is contained in:
parent
2c79bd563c
commit
1c0d584959
|
@ -1,13 +1,9 @@
|
||||||
from abc import ABCMeta
|
from abc import ABC
|
||||||
from abc import abstractclassmethod
|
from abc import abstractclassmethod
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from xml.etree import ElementTree
|
|
||||||
|
|
||||||
from ._html_unescaping import unescape
|
|
||||||
|
|
||||||
|
|
||||||
def parse_timecode(time):
|
def parse_timecode(time):
|
||||||
"""Converts a `time` into a formatted transcript timecode.
|
"""Converts a `time` into a formatted transcript timecode.
|
||||||
|
@ -31,15 +27,29 @@ def parse_timecode(time):
|
||||||
return f"{hours}:{mins}:{secs},{ms}"
|
return f"{hours}:{mins}:{secs},{ms}"
|
||||||
|
|
||||||
|
|
||||||
class TranscriptFormatter(metaclass=ABCMeta):
|
class TranscriptFormatter(ABC):
|
||||||
"""
|
"""Abstract Base TranscriptFormatter class
|
||||||
Abstract Base TranscriptFormatter class
|
|
||||||
|
|
||||||
This class should be inherited from to create additional
|
This class should be inherited from to create additional
|
||||||
custom transcript formatters.
|
custom transcript formatters.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
|
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
|
||||||
|
DELIMITER = ''
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def combine(cls, transcripts):
|
||||||
|
"""Subclass may override this class method.
|
||||||
|
|
||||||
|
Default behavior of this method will ''.join() the str()
|
||||||
|
of each transcript in transcripts.
|
||||||
|
|
||||||
|
:param transcripts: a list of many transcripts
|
||||||
|
:type transcript_data: list[<formatted transcript>, ...]
|
||||||
|
:return: A string joined on the `cls.DELIMITER` to combine transcripts
|
||||||
|
:rtype: str
|
||||||
|
"""
|
||||||
|
return cls.DELIMITER.join(
|
||||||
|
str(transcript) for transcript in transcripts)
|
||||||
|
|
||||||
@abstractclassmethod
|
@abstractclassmethod
|
||||||
def format(cls, transcript_data):
|
def format(cls, transcript_data):
|
||||||
|
@ -56,9 +66,15 @@ class TranscriptFormatter(metaclass=ABCMeta):
|
||||||
|
|
||||||
class JSONTranscriptFormatter(TranscriptFormatter):
|
class JSONTranscriptFormatter(TranscriptFormatter):
|
||||||
"""Formatter for outputting JSON data"""
|
"""Formatter for outputting JSON data"""
|
||||||
|
DELIMITER = ','
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def combine(cls, transcripts):
|
||||||
|
return json.dumps(transcripts)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def format(cls, transcript_data):
|
def format(cls, transcript_data):
|
||||||
return [json.dumps(transcript_data)] if transcript_data else []
|
return transcript_data
|
||||||
|
|
||||||
|
|
||||||
class TextTranscriptFormatter(TranscriptFormatter):
|
class TextTranscriptFormatter(TranscriptFormatter):
|
||||||
|
@ -66,39 +82,40 @@ class TextTranscriptFormatter(TranscriptFormatter):
|
||||||
|
|
||||||
Converts the fetched transcript data into separated lines of
|
Converts the fetched transcript data into separated lines of
|
||||||
plain text separated by newline breaks (\n) with no timecodes.
|
plain text separated by newline breaks (\n) with no timecodes.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
DELIMITER = '\n\n'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def format(cls, transcript_data):
|
def format(cls, transcript_data):
|
||||||
return ['\n'.join(line['text'] for transcript in transcript_data
|
return '{}\n'.format('\n'.join(
|
||||||
for line in transcript)]
|
line['text']for line in transcript_data))
|
||||||
|
|
||||||
|
|
||||||
class SRTTranscriptFormatter(TranscriptFormatter):
|
class SRTTranscriptFormatter(TranscriptFormatter):
|
||||||
"""Formatter for outputting the SRT Format
|
"""Formatter for outputting the SRT Format
|
||||||
|
|
||||||
Converts the fetched transcript data into a simple .srt file format.
|
Converts the fetched transcript data into a simple .srt file format.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
DELIMITER = '\n\n'
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def format(cls, transcript_data):
|
def format(cls, transcript_data):
|
||||||
contents = []
|
output = []
|
||||||
for transcript in transcript_data:
|
for frame, item in enumerate(transcript_data, start=1):
|
||||||
content = []
|
|
||||||
for frame, item in enumerate(transcript, start=1):
|
|
||||||
start_time = float(item.get('start'))
|
start_time = float(item.get('start'))
|
||||||
duration = float(item.get('dur', '0.0'))
|
duration = float(item.get('dur', '0.0'))
|
||||||
|
|
||||||
end_time = parse_timecode(start_time + duration)
|
end_time = parse_timecode(start_time + duration)
|
||||||
start_time = parse_timecode(start_time)
|
start_time = parse_timecode(start_time)
|
||||||
|
|
||||||
content.append("{frame}\n".format(frame=frame))
|
output.append("{frame}\n".format(frame=frame))
|
||||||
content.append("{start_time} --> {end_time}\n".format(
|
output.append("{start_time} --> {end_time}\n".format(
|
||||||
start_time=start_time, end_time=end_time))
|
start_time=start_time, end_time=end_time))
|
||||||
content.append("{text}\n\n".format(text=item.get('text')))
|
output.append("{text}".format(text=item.get('text')))
|
||||||
|
if frame < len(transcript_data):
|
||||||
|
output.append('\n\n')
|
||||||
|
|
||||||
contents.append(''.join(content))
|
return '{}\n'.format(''.join(output))
|
||||||
return ['\n\n'.join(contents)]
|
|
||||||
|
|
||||||
|
|
||||||
class TranscriptFormatterFactory:
|
class TranscriptFormatterFactory:
|
||||||
|
@ -124,7 +141,7 @@ class TranscriptFormatterFactory:
|
||||||
if not issubclass(formatter_class, TranscriptFormatter):
|
if not issubclass(formatter_class, TranscriptFormatter):
|
||||||
raise TypeError(
|
raise TypeError(
|
||||||
f'{formatter_class} must be a subclass of TranscriptFormatter')
|
f'{formatter_class} must be a subclass of TranscriptFormatter')
|
||||||
self._formatters.update({name:formatter_class})
|
self._formatters.update({name: formatter_class})
|
||||||
|
|
||||||
def add_formatters(self, formatters_dict):
|
def add_formatters(self, formatters_dict):
|
||||||
"""Allow creation of multiple transcript formatters at a time.
|
"""Allow creation of multiple transcript formatters at a time.
|
||||||
|
|
Loading…
Reference in New Issue