added FormatterLoader

This commit is contained in:
Jonas Depoix 2021-03-15 17:16:15 +01:00
parent 71268dfad9
commit d314139329
3 changed files with 73 additions and 7 deletions

View File

@ -37,13 +37,15 @@ class CouldNotRetrieveTranscript(Exception):
class VideoUnavailable(CouldNotRetrieveTranscript):
    """Transcript retrieval error for a video that is no longer available."""

    # Explanation attached to this error type.
    # NOTE(review): presumably rendered by CouldNotRetrieveTranscript when
    # building the final error message - confirm against the base class.
    CAUSE_MESSAGE = "The video is no longer available"
# Transcript retrieval error whose cause message tells the user that YouTube
# now requires a captcha for this IP, and lists the possible workarounds
# (export a cookie after solving the captcha, change IP, or wait).
# NOTE(review): CAUSE_MESSAGE is ONE string literal continued with trailing
# backslashes, so any leading whitespace on the continuation lines becomes
# part of the message text - keep that in mind when re-indenting.
class TooManyRequests(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\
- Wait until the ban on your IP has been lifted")
# Transcript retrieval error for videos whose subtitles are disabled.
# NOTE(review): this class sits at the end of a diff hunk; any members
# following CAUSE_MESSAGE would not be visible here.
class TranscriptsDisabled(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'Subtitles are disabled for this video'

View File

@ -1,5 +1,7 @@
import json
import pprint
class Formatter(object):
"""Formatter should be used as an abstract base class.
@ -22,6 +24,16 @@ class Formatter(object):
'their own .format() method.')
class PrettyPrintFormatter(Formatter):
    """Formatter that renders the transcript with ``pprint.pformat``."""

    def format(self, **kwargs):
        """Pretty print the transcript held by this formatter.

        :param kwargs: Extra keyword arguments handed to ``pprint.pformat``
            unchanged.
        :return: A pretty printed string representation of the transcript.
        :rtype: str
        """
        pretty_transcript = pprint.pformat(self._transcript, **kwargs)
        return pretty_transcript
class JSONFormatter(Formatter):
def format(self, **kwargs):
"""Converts a transcript into a JSON string.
@ -72,12 +84,12 @@ class WebVTTFormatter(Formatter):
"""
lines = []
for i, line in enumerate(self._transcript):
if i < len(self._transcript)-1:
if i < len(self._transcript) - 1:
# Looks ahead, use next start time since duration value
# would create an overlap between start times.
time_text = "{} --> {}".format(
self._seconds_to_timestamp(line['start']),
self._seconds_to_timestamp(self._transcript[i+1]['start'])
self._seconds_to_timestamp(self._transcript[i + 1]['start'])
)
else:
# Reached the end, cannot look ahead, use duration now.
@ -89,3 +101,27 @@ class WebVTTFormatter(Formatter):
lines.append("{}\n{}".format(time_text, line['text']))
return "WEBVTT\n\n" + "\n\n".join(lines) + "\n"
class FormatterLoader(object):
    """Resolve a formatter type name to a formatter class and instantiate it.

    The loader is created with the short name of an output format (see
    ``TYPES``) and then builds a formatter for a given transcript through
    ``load()``.
    """

    # Supported output format names mapped to the formatter class used for
    # each of them. These names are listed in the UnknownFormatterType message.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
    }

    # The WebVTT key used to be misspelled as 'webvvt'. The old spelling is
    # still accepted so existing callers keep working, but it is deliberately
    # not advertised in TYPES or in error messages.
    _DEPRECATED_ALIASES = {
        'webvvt': 'webvtt',
    }

    class UnknownFormatterType(Exception):
        """Raised when a formatter type name is not one of the supported types."""

        def __init__(self, formatter_type):
            """Build the error message for the rejected type name.

            :param formatter_type: The unsupported name that was requested.
            """
            supported_types = ', '.join(FormatterLoader.TYPES.keys())
            super(FormatterLoader.UnknownFormatterType, self).__init__(
                f'The format \'{formatter_type}\' is not supported. '
                f'Choose one of the following formats: {supported_types}'
            )

    def __init__(self, formatter_type='pretty'):
        """Select the formatter class registered for ``formatter_type``.

        :param formatter_type: Name of the output format; defaults to
            ``'pretty'``.
        :raises FormatterLoader.UnknownFormatterType: If the name is not
            registered in ``TYPES`` (and is not a deprecated alias).
        """
        # Translate deprecated spellings to their canonical name first.
        formatter_type = FormatterLoader._DEPRECATED_ALIASES.get(
            formatter_type, formatter_type
        )
        if formatter_type not in FormatterLoader.TYPES:
            raise FormatterLoader.UnknownFormatterType(formatter_type)
        self._formatter = FormatterLoader.TYPES[formatter_type]

    def load(self, transcript):
        """Instantiate the selected formatter for ``transcript``.

        :param transcript: The transcript handed to the formatter constructor.
        :return: An instance of the formatter class chosen in ``__init__``.
        """
        return self._formatter(transcript)

View File

@ -1,12 +1,15 @@
import json
from mock import MagicMock
from unittest import TestCase
import json
import pprint
from youtube_transcript_api.formatters import (
Formatter,
JSONFormatter,
TextFormatter,
WebVTTFormatter
WebVTTFormatter,
PrettyPrintFormatter, FormatterLoader
)
@ -35,6 +38,7 @@ class TestFormatters(TestCase):
def test_webvtt_formatter_starting(self):
    # The WebVTT output must open with the "WEBVTT" header line followed
    # by an empty line.
    rendered_lines = WebVTTFormatter(self.transcript).format().split('\n')

    # test starting lines
    self.assertEqual(rendered_lines[0], "WEBVTT")
    self.assertEqual(rendered_lines[1], "")
@ -42,16 +46,40 @@ class TestFormatters(TestCase):
def test_webvtt_formatter_ending(self):
    # The WebVTT output must end with the text of the last transcript
    # entry followed by a trailing newline (hence the final empty item).
    rendered_lines = WebVTTFormatter(self.transcript).format().split('\n')

    # test ending lines
    self.assertEqual(rendered_lines[-2], self.transcript[-1]['text'])
    self.assertEqual(rendered_lines[-1], "")
def test_pretty_print_formatter(self):
    # The pretty formatter must produce exactly what pprint.pformat does
    # for the same transcript.
    formatter = PrettyPrintFormatter(self.transcript)
    actual = formatter.format()

    self.assertEqual(actual, pprint.pformat(self.transcript))
def test_json_formatter(self):
    # Decoding the JSON output must give back the original transcript.
    formatter = JSONFormatter(self.transcript)
    decoded = json.loads(formatter.format())

    self.assertEqual(decoded, self.transcript)
def test_text_formatter(self):
    # The plain text output must start with the first entry's text and
    # end with the last entry's text.
    text_lines = TextFormatter(self.transcript).format().split('\n')

    self.assertEqual(text_lines[0], self.transcript[0]["text"])
    self.assertEqual(text_lines[-1], self.transcript[-1]["text"])
def test_formatter_loader(self):
    # Loading by an explicit type name must yield the matching formatter.
    loader = FormatterLoader('json')
    formatter = loader.load(self.transcript)

    # assertIsInstance reports the actual object and the expected class on
    # failure, instead of the uninformative "False is not true".
    self.assertIsInstance(formatter, JSONFormatter)
def test_formatter_loader__default_formatter(self):
    # Without an explicit type name the loader must fall back to the
    # pretty print formatter.
    loader = FormatterLoader()
    formatter = loader.load(self.transcript)

    # assertIsInstance reports the actual object and the expected class on
    # failure, instead of the uninformative "False is not true".
    self.assertIsInstance(formatter, PrettyPrintFormatter)
def test_formatter_loader__unknown_format(self):
    # Constructing a loader with an unsupported type name must raise
    # FormatterLoader.UnknownFormatterType.
    self.assertRaises(
        FormatterLoader.UnknownFormatterType,
        FormatterLoader,
        'png',
    )