test suite and corresponding tooling added
This commit is contained in:
parent
4e228d9978
commit
94e4e4063f
|
@ -0,0 +1,30 @@
|
|||
[run]
|
||||
source = youtube_transcript_api
|
||||
|
||||
|
||||
[report]
|
||||
omit =
|
||||
*/__main__.py
|
||||
|
||||
exclude_lines =
|
||||
pragma: no cover
|
||||
|
||||
# Don't complain about missing debug-only code:
|
||||
def __unicode__
|
||||
def __repr__
|
||||
if self\.debug
|
||||
|
||||
# Don't complain if tests don't hit defensive assertion code:
|
||||
raise AssertionError
|
||||
raise NotImplementedError
|
||||
|
||||
# Don't complain if non-runnable code isn't run:
|
||||
if 0:
|
||||
if __name__ == .__main__.:
|
||||
|
||||
# Don't complain about empty stubs of abstract methods
|
||||
@abstractmethod
|
||||
@abstractclassmethod
|
||||
@abstractstaticmethod
|
||||
|
||||
show_missing = True
|
|
@ -6,3 +6,4 @@ dist
|
|||
build
|
||||
*.egg-info
|
||||
upload_new_version.sh
|
||||
.coverage
|
|
@ -0,0 +1,3 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# Run unittest discovery under coverage.py, using the coverage executable of the local .venv.
# The coverage report is only printed if the test run succeeds (&&).
.venv/bin/coverage run -m unittest discover && .venv/bin/coverage report
|
|
@ -1 +1,5 @@
|
|||
requests
|
||||
|
||||
# testing
|
||||
httpretty
|
||||
coverage
|
18
setup.py
18
setup.py
|
@ -1,3 +1,7 @@
|
|||
import os
|
||||
|
||||
import unittest
|
||||
|
||||
import setuptools
|
||||
|
||||
|
||||
|
@ -9,6 +13,15 @@ def get_long_description():
|
|||
return _get_file_content('README.md')
|
||||
|
||||
|
||||
def get_test_suite():
    """
    Discovers the unit tests of the package, so that they can be referenced by the `test_suite` option of
    `setuptools.setup`.

    :return: a suite containing all tests found in modules matching `test_*.py`
    :rtype: unittest.TestSuite
    """
    # os.path.dirname(__file__) is an empty string if this file is referenced by a bare relative name (e.g.
    # "setup.py"). Formatting that into a path would yield "/youtube_transcript_api" (relative to the filesystem
    # root), therefore the path is made absolute first and joined in a platform independent way.
    package_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'youtube_transcript_api')

    return unittest.TestLoader().discover('test', pattern='test_*.py', top_level_dir=package_dir)
|
||||
|
||||
|
||||
setuptools.setup(
|
||||
name="youtube_transcript_api",
|
||||
version="0.1.3",
|
||||
|
@ -29,6 +42,11 @@ setuptools.setup(
|
|||
install_requires=[
|
||||
'requests',
|
||||
],
|
||||
tests_require=[
|
||||
'httpretty',
|
||||
'coverage',
|
||||
],
|
||||
test_suite='setup.get_test_suite',
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'youtube_transcript_api = youtube_transcript_api.__main__:main',
|
||||
|
|
|
@ -1,62 +1,14 @@
|
|||
import sys
|
||||
|
||||
import json
|
||||
|
||||
from pprint import pprint
|
||||
|
||||
import logging
|
||||
|
||||
import argparse
|
||||
|
||||
from ._api import YouTubeTranscriptApi
|
||||
|
||||
|
||||
def parse_args(args):
|
||||
parser = argparse.ArgumentParser(
|
||||
description=(
|
||||
'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
|
||||
'It also works for automatically generated subtitles and it does not require a headless browser, like '
|
||||
'other selenium based solutions do!'
|
||||
)
|
||||
)
|
||||
parser.add_argument('video_ids', nargs='*', type=str, help='List of YouTube video IDs.')
|
||||
parser.add_argument(
|
||||
'--languages',
|
||||
nargs='*',
|
||||
default=[],
|
||||
type=str,
|
||||
help=(
|
||||
'A list of language codes in a descending priority. For example, if this is set to "de en" it will first '
|
||||
'try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to do so. '
|
||||
'As I can\'t provide a complete list of all working language codes with full certainty, you may have to '
|
||||
'play around with the language codes a bit, to find the one which is working for you!'
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
'--json',
|
||||
action='store_const',
|
||||
const=True,
|
||||
default=False,
|
||||
help='If this flag is set the output will be JSON formatted.',
|
||||
)
|
||||
|
||||
return parser.parse_args(args)
|
||||
from ._cli import YouTubeTranscriptCli
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig()
|
||||
|
||||
parsed_args = parse_args(sys.argv[1:])
|
||||
transcripts, _ = YouTubeTranscriptApi.get_transcripts(
|
||||
parsed_args.video_ids,
|
||||
languages=parsed_args.languages,
|
||||
continue_after_error=True
|
||||
)
|
||||
|
||||
if parsed_args.json:
|
||||
print(json.dumps(transcripts))
|
||||
else:
|
||||
pprint(transcripts)
|
||||
print(YouTubeTranscriptCli(sys.argv[1:]).run())
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import sys
|
||||
|
||||
if sys.version_info.major == 2:
|
||||
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
||||
if sys.version_info.major == 2: # pragma: no cover
|
||||
reload(sys)
|
||||
sys.setdefaultencoding('utf-8')
|
||||
|
||||
|
@ -36,8 +37,8 @@ class YouTubeTranscriptApi():
|
|||
)
|
||||
self.video_id = video_id
|
||||
|
||||
@staticmethod
|
||||
def get_transcripts(video_ids, languages=None, continue_after_error=False):
|
||||
@classmethod
|
||||
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False):
|
||||
"""
|
||||
Retrieves the transcripts for a list of videos.
|
||||
|
||||
|
@ -60,7 +61,7 @@ class YouTubeTranscriptApi():
|
|||
|
||||
for video_id in video_ids:
|
||||
try:
|
||||
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages)
|
||||
data[video_id] = cls.get_transcript(video_id, languages)
|
||||
except Exception as exception:
|
||||
if not continue_after_error:
|
||||
raise exception
|
||||
|
@ -69,15 +70,15 @@ class YouTubeTranscriptApi():
|
|||
|
||||
return data, unretrievable_videos
|
||||
|
||||
@staticmethod
|
||||
def get_transcript(video_id, languages=None):
|
||||
@classmethod
|
||||
def get_transcript(cls, video_id, languages=None):
|
||||
"""
|
||||
Retrieves the transcript for a single video.
|
||||
|
||||
:param video_id: the youtube video id
|
||||
:type video_id: str
|
||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to
|
||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||
play around with the language codes a bit, to find the one which is working for you!
|
||||
:type languages: [str]
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
import json
|
||||
|
||||
import pprint
|
||||
|
||||
import argparse
|
||||
|
||||
from ._api import YouTubeTranscriptApi
|
||||
|
||||
|
||||
class YouTubeTranscriptCli():
    """
    Command line front end of the YouTubeTranscriptApi. It parses the given command line arguments, fetches the
    transcripts of the requested videos and returns them as a printable string.
    """

    def __init__(self, args):
        """
        :param args: the command line arguments to parse (without the program name, e.g. `sys.argv[1:]`)
        :type args: [str]
        """
        self._args = args

    def run(self):
        """
        Fetches the transcripts for all video ids passed on the command line.

        Errors of individual videos do not abort the run (`continue_after_error=True`); the second value returned by
        `get_transcripts` is discarded, so only the successfully retrieved transcripts show up in the output.

        :return: the transcripts, JSON encoded if `--json` has been passed, pretty-printed otherwise
        :rtype: str
        """
        parsed_args = self._parse_args()

        transcripts, _ = YouTubeTranscriptApi.get_transcripts(
            parsed_args.video_ids,
            languages=parsed_args.languages,
            continue_after_error=True
        )

        if parsed_args.json:
            return json.dumps(transcripts)
        return pprint.pformat(transcripts)

    def _parse_args(self):
        """
        Parses the arguments passed to the constructor.

        At least one video id is required (`nargs='+'`); argparse exits with `SystemExit` if none is given.

        :return: a namespace providing `video_ids` ([str]), `languages` ([str]) and `json` (bool)
        :rtype: argparse.Namespace
        """
        parser = argparse.ArgumentParser(
            description=(
                'This is a python API which allows you to get the transcripts/subtitles for a given YouTube video. '
                'It also works for automatically generated subtitles and it does not require a headless browser, like '
                'other selenium based solutions do!'
            )
        )
        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
        parser.add_argument(
            '--languages',
            nargs='*',
            default=[],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
                'may have to play around with the language codes a bit, to find the one which is working for you!'
            ),
        )
        parser.add_argument(
            '--json',
            action='store_true',
            default=False,
            help='If this flag is set the output will be JSON formatted.',
        )

        return parser.parse_args(self._args)
|
|
@ -1,9 +1,11 @@
|
|||
import sys
|
||||
|
||||
if sys.version_info.major == 3 and sys.version_info.minor >= 4:
|
||||
|
||||
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
||||
if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
|
||||
# Python 3.4+
|
||||
from html import unescape
|
||||
else:
|
||||
else: # pragma: no cover
|
||||
if sys.version_info.major <= 2:
|
||||
# Python 2
|
||||
import HTMLParser
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1 @@
|
|||
|
|
@ -0,0 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<transcript>
|
||||
<text start="0" dur="1.54">Hey, this is just a test</text>
|
||||
<text start="1.54" dur="4.16">this is not the original transcript</text>
|
||||
<text start="5.7" dur="3.239">just something shorter, I made up for testing</text>
|
||||
</transcript>
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,103 @@
|
|||
from unittest import TestCase
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import os
|
||||
|
||||
import httpretty
|
||||
|
||||
from youtube_transcript_api._api import YouTubeTranscriptApi
|
||||
|
||||
|
||||
def load_asset(filename):
    """
    Reads a file from the `assets` directory located next to this module.

    :param filename: the name of the file inside of the assets directory
    :type filename: str
    :return: the content of the file
    :rtype: str
    """
    # os.path.join copes with an empty dirname (relative __file__), whereas formatting it into
    # '{dirname}/assets/...' would point to the filesystem root
    asset_path = os.path.join(os.path.dirname(__file__), 'assets', filename)

    # the encoding is stated explicitly, as the default depends on the locale of the machine running the tests
    with open(asset_path, encoding='utf-8') as asset_file:
        return asset_file.read()
|
||||
|
||||
|
||||
class TestYouTubeTranscriptApi(TestCase):
    """
    Tests for YouTubeTranscriptApi. The two YouTube URLs used by the tests are registered with httpretty and answered
    with the files from the assets directory.
    """

    def setUp(self):
        httpretty.enable()
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/watch',
            body=load_asset('youtube.html')
        )
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=load_asset('transcript.xml')
        )

    def tearDown(self):
        httpretty.disable()
        # drop the registered URIs and the recorded requests, so that tests which re-register a URI or inspect
        # httpretty.last_request() cannot influence each other
        httpretty.reset()

    def _mock_get_transcript(self, **mock_kwargs):
        """
        Replaces YouTubeTranscriptApi.get_transcript with a mock for the duration of the current test.

        Assigning a MagicMock directly to the class attribute would never be undone and therefore leak into every
        test which runs afterwards; the patch is reverted through addCleanup instead.

        :param mock_kwargs: keyword arguments used to configure the mock (e.g. side_effect)
        :return: the mock which replaces get_transcript
        """
        # imported locally, as only MagicMock is imported at module level
        from unittest.mock import patch

        patcher = patch.object(YouTubeTranscriptApi, 'get_transcript', **mock_kwargs)
        get_transcript_mock = patcher.start()
        self.addCleanup(patcher.stop)
        return get_transcript_mock

    def test_get_transcript(self):
        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')

        self.assertEqual(
            transcript,
            [
                {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
                {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
            ]
        )

    def test_get_transcript__correct_language_is_used(self):
        YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
        query_string = httpretty.last_request().querystring

        self.assertIn('lang', query_string)
        self.assertEqual(len(query_string['lang']), 1)
        self.assertEqual(query_string['lang'][0], 'de')

    def test_get_transcript__fallback_language_is_used(self):
        # an empty response body makes the request for the preferred language fail
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=''
        )

        YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
        query_string = httpretty.last_request().querystring

        self.assertIn('lang', query_string)
        self.assertEqual(len(query_string['lang']), 1)
        self.assertEqual(query_string['lang'][0], 'en')

    def test_get_transcript__exception_is_raised_when_not_available(self):
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=''
        )

        with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript):
            YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')

    def test_get_transcripts(self):
        video_id_1 = 'video_id_1'
        video_id_2 = 'video_id_2'
        languages = ['de', 'en']
        get_transcript_mock = self._mock_get_transcript()

        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)

        get_transcript_mock.assert_any_call(video_id_1, languages)
        get_transcript_mock.assert_any_call(video_id_2, languages)
        self.assertEqual(get_transcript_mock.call_count, 2)

    def test_get_transcripts__stop_on_error(self):
        get_transcript_mock = self._mock_get_transcript(side_effect=Exception('Error'))

        with self.assertRaises(Exception):
            YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])

        # the exception of the first video has to abort the processing of the remaining ones
        self.assertEqual(get_transcript_mock.call_count, 1)

    def test_get_transcripts__continue_on_error(self):
        video_id_1 = 'video_id_1'
        video_id_2 = 'video_id_2'
        get_transcript_mock = self._mock_get_transcript(side_effect=Exception('Error'))

        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], continue_after_error=True)

        get_transcript_mock.assert_any_call(video_id_1, None)
        get_transcript_mock.assert_any_call(video_id_2, None)
|
|
@ -0,0 +1,68 @@
|
|||
from unittest import TestCase
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import json
|
||||
|
||||
from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi
|
||||
|
||||
|
||||
class TestYouTubeTranscriptCli(TestCase):
    """
    Tests for the argument parsing and the output formatting of YouTubeTranscriptCli. The API itself is mocked, so
    no transcripts are actually fetched.
    """

    def _mock_get_transcripts(self, return_value):
        """
        Replaces YouTubeTranscriptApi.get_transcripts with a mock for the duration of the current test.

        Assigning a MagicMock directly to the class attribute would never be undone and therefore leak into every
        test which runs afterwards; the patch is reverted through addCleanup instead.

        :param return_value: the value the mocked get_transcripts returns
        :return: the mock which replaces get_transcripts
        """
        # imported locally, as only MagicMock is imported at module level
        from unittest.mock import patch

        patcher = patch.object(YouTubeTranscriptApi, 'get_transcripts', return_value=return_value)
        get_transcripts_mock = patcher.start()
        self.addCleanup(patcher.stop)
        return get_transcripts_mock

    def test_argument_parsing(self):
        # the optional arguments have to be recognized regardless of their position
        parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

        parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

    def test_argument_parsing__only_video_ids(self):
        parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, False)
        self.assertEqual(parsed_args.languages, [])

    def test_argument_parsing__fail_without_video_ids(self):
        with self.assertRaises(SystemExit):
            YouTubeTranscriptCli('--json'.split())._parse_args()

    def test_argument_parsing__json(self):
        parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, [])

        parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, [])

    def test_argument_parsing__languages(self):
        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, False)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

    def test_run(self):
        get_transcripts_mock = self._mock_get_transcripts(([], []))

        YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()

        get_transcripts_mock.assert_called_once_with(
            ['v1', 'v2'],
            languages=['de', 'en'],
            continue_after_error=True
        )

    def test_run__json_output(self):
        self._mock_get_transcripts(([{'boolean': True}], []))

        output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()

        # json.loads raises if the output is not valid JSON; the decoded value has to match the mocked transcripts
        self.assertEqual(json.loads(output), [{'boolean': True}])
|
Loading…
Reference in New Issue