diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..0974163 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,30 @@ +[run] +source = youtube_transcript_api + + +[report] +omit = + */__main__.py + +exclude_lines = + pragma: no cover + + # Don't complain about missing debug-only code: + def __unicode__ + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + + # Don't complain about empty stubs of abstract methods + @abstractmethod + @abstractclassmethod + @abstractstaticmethod + +show_missing = True \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8b2b8b9..542d724 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ dist build *.egg-info upload_new_version.sh +.coverage \ No newline at end of file diff --git a/coverage.sh b/coverage.sh new file mode 100755 index 0000000..c7fe42b --- /dev/null +++ b/coverage.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash + +.venv/bin/coverage run -m unittest discover && .venv/bin/coverage report diff --git a/requirements.txt b/requirements.txt index f229360..a46f363 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,5 @@ requests + +# testing +httpretty +coverage \ No newline at end of file diff --git a/setup.py b/setup.py index 38c2a9e..9fd8506 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,7 @@ +import os + +import unittest + import setuptools @@ -9,6 +13,15 @@ def get_long_description(): return _get_file_content('README.md') +def get_test_suite(): + test_loader = unittest.TestLoader() + test_suite = test_loader.discover( + 'test', pattern='test_*.py', + top_level_dir='{dirname}/youtube_transcript_api'.format(dirname=os.path.dirname(__file__)) + ) + return test_suite + + setuptools.setup( name="youtube_transcript_api", version="0.1.3", @@ -29,6 +42,11 @@ setuptools.setup( install_requires=[ 'requests', ], 
+ tests_require=[ + 'httpretty', + 'coverage', + ], + test_suite='setup.get_test_suite', entry_points={ 'console_scripts': [ 'youtube_transcript_api = youtube_transcript_api.__main__:main', diff --git a/youtube_transcript_api/__main__.py b/youtube_transcript_api/__main__.py index f011ff1..f756560 100644 --- a/youtube_transcript_api/__main__.py +++ b/youtube_transcript_api/__main__.py @@ -1,62 +1,14 @@ import sys -import json - -from pprint import pprint - import logging -import argparse - -from ._api import YouTubeTranscriptApi - - -def parse_args(args): - parser = argparse.ArgumentParser( - description=( - 'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. ' - 'It also works for automatically generated subtitles and it does not require a headless browser, like ' - 'other selenium based solutions do!' - ) - ) - parser.add_argument('video_ids', nargs='*', type=str, help='List of YouTube video IDs.') - parser.add_argument( - '--languages', - nargs='*', - default=[], - type=str, - help=( - 'A list of language codes in a descending priority. For example, if this is set to "de en" it will first ' - 'try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to do so. ' - 'As I can\'t provide a complete list of all working language codes with full certainty, you may have to ' - 'play around with the language codes a bit, to find the one which is working for you!' 
- ), - ) - parser.add_argument( - '--json', - action='store_const', - const=True, - default=False, - help='If this flag is set the output will be JSON formatted.', - ) - - return parser.parse_args(args) +from ._cli import YouTubeTranscriptCli def main(): logging.basicConfig() - parsed_args = parse_args(sys.argv[1:]) - transcripts, _ = YouTubeTranscriptApi.get_transcripts( - parsed_args.video_ids, - languages=parsed_args.languages, - continue_after_error=True - ) - - if parsed_args.json: - print(json.dumps(transcripts)) - else: - pprint(transcripts) + print(YouTubeTranscriptCli(sys.argv[1:]).run()) if __name__ == '__main__': diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index be37b61..35a5abe 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,6 +1,7 @@ import sys -if sys.version_info.major == 2: +# This can only be tested by using different python versions, therefore it is not covered by coverage.py +if sys.version_info.major == 2: # pragma: no cover reload(sys) sys.setdefaultencoding('utf-8') @@ -36,8 +37,8 @@ class YouTubeTranscriptApi(): ) self.video_id = video_id - @staticmethod - def get_transcripts(video_ids, languages=None, continue_after_error=False): + @classmethod + def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): """ Retrieves the transcripts for a list of videos. @@ -60,7 +61,7 @@ class YouTubeTranscriptApi(): for video_id in video_ids: try: - data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages) + data[video_id] = cls.get_transcript(video_id, languages) except Exception as exception: if not continue_after_error: raise exception @@ -69,15 +70,15 @@ class YouTubeTranscriptApi(): return data, unretrievable_videos - @staticmethod - def get_transcript(video_id, languages=None): + @classmethod + def get_transcript(cls, video_id, languages=None): """ Retrieves the transcript for a single video. 
:param video_id: the youtube video id :type video_id: str :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] - it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to + it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you! :type languages: [str] diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py new file mode 100644 index 0000000..dc80934 --- /dev/null +++ b/youtube_transcript_api/_cli.py @@ -0,0 +1,57 @@ +import json + +import pprint + +import argparse + +from ._api import YouTubeTranscriptApi + + +class YouTubeTranscriptCli(): + def __init__(self, args): + self._args = args + + def run(self): + parsed_args = self._parse_args() + + transcripts, _ = YouTubeTranscriptApi.get_transcripts( + parsed_args.video_ids, + languages=parsed_args.languages, + continue_after_error=True + ) + + if parsed_args.json: + return json.dumps(transcripts) + else: + return pprint.pformat(transcripts) + + def _parse_args(self): + parser = argparse.ArgumentParser( + description=( + 'This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. ' + 'It also works for automatically generated subtitles and it does not require a headless browser, like ' + 'other selenium based solutions do!' + ) + ) + parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.') + parser.add_argument( + '--languages', + nargs='*', + default=[], + type=str, + help=( + 'A list of language codes in a descending priority. 
For example, if this is set to "de en" it will ' + 'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails ' + 'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you ' + 'may have to play around with the language codes a bit, to find the one which is working for you!' + ), + ) + parser.add_argument( + '--json', + action='store_const', + const=True, + default=False, + help='If this flag is set the output will be JSON formatted.', + ) + + return parser.parse_args(self._args) diff --git a/youtube_transcript_api/_html_unescaping.py b/youtube_transcript_api/_html_unescaping.py index eb88b33..3efdf4b 100644 --- a/youtube_transcript_api/_html_unescaping.py +++ b/youtube_transcript_api/_html_unescaping.py @@ -1,9 +1,11 @@ import sys -if sys.version_info.major == 3 and sys.version_info.minor >= 4: + +# This can only be tested by using different python versions, therefore it is not covered by coverage.py +if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover # Python 3.4+ from html import unescape -else: +else: # pragma: no cover if sys.version_info.major <= 2: # Python 2 import HTMLParser diff --git a/youtube_transcript_api/test/__init__.py b/youtube_transcript_api/test/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/youtube_transcript_api/test/__init__.py @@ -0,0 +1 @@ + diff --git a/youtube_transcript_api/test/assets/__init__.py b/youtube_transcript_api/test/assets/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/youtube_transcript_api/test/assets/__init__.py @@ -0,0 +1 @@ + diff --git a/youtube_transcript_api/test/assets/transcript.xml b/youtube_transcript_api/test/assets/transcript.xml new file mode 100644 index 0000000..267ebc3 --- /dev/null +++ b/youtube_transcript_api/test/assets/transcript.xml @@ -0,0 +1,6 @@ + + + Hey, this is just a test + this is not the original transcript + just something 
shorter, I made up for testing + \ No newline at end of file diff --git a/youtube_transcript_api/test/assets/youtube.html b/youtube_transcript_api/test/assets/youtube.html new file mode 100644 index 0000000..acbd299 --- /dev/null +++ b/youtube_transcript_api/test/assets/youtube.html @@ -0,0 +1,1541 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + +Surface Go Review - It’s Awesome - YouTube + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+
+ DE +
+
+
+ +
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+
+
+ +
+
+ + +
+
+
+ +
+
+
+
+ + +
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+

+ + + + + Surface Go Review - Es ist Awesome + + +

+
+
+ + +
+ + + + + +
+
1.294.955 Aufrufe
+
+
+
+
+
+ + + + + +
+
+ + + + +
+
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+ +
+ +
+
+
+

+ + + +Wird geladen... + +

+ +
+
+
+

+Transkript +

+
+ +
+ + + +
+
+Das interaktive Transkript konnte nicht geladen werden. +
+ + +
+
+ +
+ +
+
+

+ + + +Wird geladen... + +

+ +
+
+ + +
+
+ Die Bewertungsfunktion ist nach Ausleihen des Videos verfügbar. +
+ +
+ +
+
+ Diese Funktion ist gerade nicht verfügbar. Bitte versuche es später noch einmal. +
+
+ + +
+ + +
+ + +
Am 02.08.2018 veröffentlicht

Dave2D-Rezension des Microsoft Surface Go. Dies ist die beste 2 in einem Laptop von Microsoft für Studenten mit einem strafferen Budget.
Zum Verkauf hier-https://amzn.to/2n3Y4sj

Dieser 2in1 tablet/Laptop ist unglaublich klein und hat eine Tonne Potenzial für Menschen, die ein ultratragbares Gerät benötigen, das sowohl als komfortables Tablet als auch als sehr funktionstüchtig eingesetzt werden kann. Das ist toll für Entwickler, Studenten, Arbeit oder auch für den Medienkonsum als Sekundärgerät.

Musik-Credits:
Fili-Sonntag Vibez

Folge mir:
http://twitter.com/Dave2D
http://www.instagram.com/Dave2D

+ +
+
+ +
+ + + +
+
+

+ + + +Wird geladen... + +

+ +
+ +
+ + +
+
+
+ + + +
+
+ +
+ +
+
+
+Anzeige +
+
+
+
+ + +
+
+
+
+
+ + + +Wenn Autoplay aktiviert ist, wird die Wiedergabe automatisch mit einem der aktuellen Videovorschläge fortgesetzt. + + + +
+

+ Nächstes Video +

+ + +
+
+ + +
+
+
+ +
+
+ +
+
+ +
+
+
+ + +
+ +
+ +
+
+ + +
+
+ + +
+ , um dieses Video zur Playlist "Später ansehen" hinzuzufügen. + +
+
+

+Hinzufügen +

+
+
+

+ + + + Playlists werden geladen... + +

+ +
+
+ + + + + + + \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py new file mode 100644 index 0000000..76e9d19 --- /dev/null +++ b/youtube_transcript_api/test/test_api.py @@ -0,0 +1,103 @@ +from unittest import TestCase +from unittest.mock import MagicMock + +import os + +import httpretty + +from youtube_transcript_api._api import YouTubeTranscriptApi + + +def load_asset(filename): + with open('{dirname}/assets/{filename}'.format(dirname=os.path.dirname(__file__), filename=filename)) as file: + return file.read() + + +class TestYouTubeTranscriptApi(TestCase): + def setUp(self): + httpretty.enable() + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/watch', + body=load_asset('youtube.html') + ) + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/api/timedtext', + body=load_asset('transcript.xml') + ) + + def tearDown(self): + httpretty.disable() + + def test_get_transcript(self): + transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') + + self.assertEqual( + transcript, + [ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ] + ) + + def test_get_transcript__correct_language_is_used(self): + YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) + query_string = httpretty.last_request().querystring + + self.assertIn('lang', query_string) + self.assertEqual(len(query_string['lang']), 1) + self.assertEqual(query_string['lang'][0], 'de') + + def test_get_transcript__fallback_language_is_used(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/api/timedtext', + body='' + ) + + YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) + query_string = httpretty.last_request().querystring + + self.assertIn('lang', 
query_string) + self.assertEqual(len(query_string['lang']), 1) + self.assertEqual(query_string['lang'][0], 'en') + + def test_get_transcript__exception_is_raised_when_not_available(self): + httpretty.register_uri( + httpretty.GET, + 'https://www.youtube.com/api/timedtext', + body='' + ) + + with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript): + YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') + + def test_get_transcripts(self): + video_id_1 = 'video_id_1' + video_id_2 = 'video_id_2' + languages = ['de', 'en'] + YouTubeTranscriptApi.get_transcript = MagicMock() + + YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) + + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) + self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) + + def test_get_transcripts__stop_on_error(self): + YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) + + with self.assertRaises(Exception): + YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2']) + + def test_get_transcripts__continue_on_error(self): + video_id_1 = 'video_id_1' + video_id_2 = 'video_id_2' + YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error')) + + YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) + + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) + YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) \ No newline at end of file diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py new file mode 100644 index 0000000..72c5890 --- /dev/null +++ b/youtube_transcript_api/test/test_cli.py @@ -0,0 +1,68 @@ +from unittest import TestCase +from unittest.mock import MagicMock + +import json + +from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi + + 
+class TestYouTubeTranscriptCli(TestCase):
+    """Tests for the argument parsing and the output formatting of YouTubeTranscriptCli."""
+
+    def _patch_get_transcripts(self, return_value):
+        """
+        Replaces YouTubeTranscriptApi.get_transcripts with a mock for the duration of the current test.
+
+        Assigning a MagicMock directly to the class attribute is never undone and therefore leaks into every test
+        which runs afterwards. Registering the patch for cleanup restores the real method, even if the test fails.
+
+        :param return_value: the value which the mocked get_transcripts call returns
+        :return: the mock which replaces get_transcripts
+        """
+        # imported locally, as the module itself only imports MagicMock from unittest.mock
+        from unittest.mock import patch
+
+        patcher = patch.object(YouTubeTranscriptApi, 'get_transcripts', return_value=return_value)
+        get_transcripts_mock = patcher.start()
+        self.addCleanup(patcher.stop)
+        return get_transcripts_mock
+
+    def test_argument_parsing(self):
+        # the order of the positional and the optional arguments must not influence the parsed result
+        parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, True)
+        self.assertEqual(parsed_args.languages, ['de', 'en'])
+
+        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, True)
+        self.assertEqual(parsed_args.languages, ['de', 'en'])
+
+        parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, True)
+        self.assertEqual(parsed_args.languages, ['de', 'en'])
+
+    def test_argument_parsing__only_video_ids(self):
+        parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, False)
+        self.assertEqual(parsed_args.languages, [])
+
+    def test_argument_parsing__fail_without_video_ids(self):
+        # at least one video id is required, the argument parser exits if none is given
+        with self.assertRaises(SystemExit):
+            YouTubeTranscriptCli('--json'.split())._parse_args()
+
+    def test_argument_parsing__json(self):
+        parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, True)
+        self.assertEqual(parsed_args.languages, [])
+
+        parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, True)
+        self.assertEqual(parsed_args.languages, [])
+
+    def test_argument_parsing__languages(self):
+        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args()
+        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
+        self.assertEqual(parsed_args.json, False)
+        self.assertEqual(parsed_args.languages, ['de', 'en'])
+
+    def test_run(self):
+        get_transcripts_mock = self._patch_get_transcripts(([], []))
+
+        YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
+
+        get_transcripts_mock.assert_called_once_with(
+            ['v1', 'v2'],
+            languages=['de', 'en'],
+            continue_after_error=True
+        )
+
+    def test_run__json_output(self):
+        transcripts = [{'boolean': True}]
+        self._patch_get_transcripts((transcripts, []))
+
+        output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
+
+        # json.loads fails if the output is not valid json, the comparison ensures that no data got lost
+        self.assertEqual(json.loads(output), transcripts)