From 5f96588ada0d30ac90ff3d3a0031a5bfe022a177 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Thu, 26 Sep 2024 17:56:36 +0200 Subject: [PATCH] added black formatter --- poetry.lock | 111 +++++- pyproject.toml | 2 + youtube_transcript_api/__main__.py | 2 +- youtube_transcript_api/_api.py | 42 ++- youtube_transcript_api/_cli.py | 114 +++--- youtube_transcript_api/_errors.py | 63 ++-- youtube_transcript_api/_html_unescaping.py | 4 +- youtube_transcript_api/_settings.py | 2 +- youtube_transcript_api/_transcripts.py | 157 ++++---- youtube_transcript_api/formatters.py | 78 ++-- youtube_transcript_api/test/test_api.py | 313 +++++++++------- youtube_transcript_api/test/test_cli.py | 338 +++++++++++------- .../test/test_formatters.py | 50 +-- 13 files changed, 805 insertions(+), 471 deletions(-) diff --git a/poetry.lock b/poetry.lock index f87bcde..b3fc681 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,51 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "black" +version = "24.8.0" +description = "The uncompromising code formatter." +optional = false +python-versions = ">=3.8" +files = [ + {file = "black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6"}, + {file = "black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb"}, + {file = "black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42"}, + {file = "black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a"}, + {file = "black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1"}, + {file = "black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af"}, + {file = "black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4"}, + {file = "black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af"}, + {file = "black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368"}, + {file = "black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed"}, + {file = "black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018"}, + {file = "black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2"}, + {file = "black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd"}, + {file = "black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2"}, + {file = "black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e"}, + {file = "black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920"}, + {file = "black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c"}, + {file = "black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e"}, + {file = "black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47"}, + {file = "black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb"}, + {file = "black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed"}, + {file = "black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "certifi" version = "2024.8.30" @@ -110,6 +156,20 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" @@ -302,6 +362,17 @@ build = ["blurb", "twine", "wheel"] docs = ["sphinx"] test = ["pytest", "pytest-cov"] +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + [[package]] name = "packaging" version = "24.1" @@ -313,6 +384,33 @@ files = [ {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, ] +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + [[package]] name = "pluggy" version = "1.5.0" @@ -382,6 +480,17 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + [[package]] name = "urllib3" version = "2.2.3" @@ -402,4 +511,4 @@ zstd = ["zstandard (>=0.18.0)"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.13" -content-hash = "ae3ea36431a2a24e1d07e7c6e251fe7490b86edd928c22eda084e3cb974aaa99" +content-hash = "4c2e7d294773ea148b69f961053a9469630c48b88248903ead43e41a2838ff94" diff --git a/pyproject.toml b/pyproject.toml index 5176f40..5720af7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ youtube_transcript_api = "youtube_transcript_api.__main__:main" [tool.poe.tasks] test = "pytest youtube_transcript_api" coverage.shell = "pytest youtube_transcript_api && coverage report -m" +format = "black youtube_transcript_api" [tool.poetry.dependencies] python = ">=3.8,<3.13" @@ -51,6 +52,7 @@ coverage = "^7.6.1" mock = "^5.1.0" httpretty = "^1.1.4" coveralls = "^4.0.1" +black = "^24.8.0" [tool.coverage.run] source = ["youtube_transcript_api"] diff --git a/youtube_transcript_api/__main__.py b/youtube_transcript_api/__main__.py index f756560..5b96393 100644 --- a/youtube_transcript_api/__main__.py +++ b/youtube_transcript_api/__main__.py @@ -11,5 +11,5 @@ def main(): print(YouTubeTranscriptCli(sys.argv[1:]).run()) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index 24a1236..bf1f240 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -1,17 +1,17 @@ import requests -try: # pragma: no cover + +try: # pragma: no cover import http.cookiejar as cookiejar + CookieLoadError = (FileNotFoundError, cookiejar.LoadError) -except ImportError: # pragma: no cover +except ImportError: # pragma: no cover import cookielib as cookiejar + CookieLoadError = IOError from ._transcripts import TranscriptListFetcher -from ._errors import ( - CookiePathInvalid, - CookiesInvalid -) +from ._errors import CookiePathInvalid, CookiesInvalid class YouTubeTranscriptApi(object): @@ -71,8 +71,15 @@ class YouTubeTranscriptApi(object): return TranscriptListFetcher(http_client).fetch(video_id) @classmethod - def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, - cookies=None, preserve_formatting=False): + def get_transcripts( + cls, + video_ids, + languages=("en",), + continue_after_error=False, + proxies=None, + cookies=None, + preserve_formatting=False, + ): """ Retrieves the transcripts for a list of videos. @@ -102,7 +109,9 @@ class YouTubeTranscriptApi(object): for video_id in video_ids: try: - data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting) + data[video_id] = cls.get_transcript( + video_id, languages, proxies, cookies, preserve_formatting + ) except Exception as exception: if not continue_after_error: raise exception @@ -112,7 +121,14 @@ class YouTubeTranscriptApi(object): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False): + def get_transcript( + cls, + video_id, + languages=("en",), + proxies=None, + cookies=None, + preserve_formatting=False, + ): """ Retrieves the transcript for a single video. This is just a shortcut for calling:: @@ -134,7 +150,11 @@ class YouTubeTranscriptApi(object): :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting) + return ( + cls.list_transcripts(video_id, proxies, cookies) + .find_transcript(languages) + .fetch(preserve_formatting=preserve_formatting) + ) @classmethod def _load_cookies(cls, cookies, video_id): diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index a9cbf75..09f76ba 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -13,10 +13,10 @@ class YouTubeTranscriptCli(object): parsed_args = self._parse_args() if parsed_args.exclude_manually_created and parsed_args.exclude_generated: - return '' + return "" proxies = None - if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': + if parsed_args.http_proxy != "" or parsed_args.https_proxy != "": proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} cookies = parsed_args.cookies @@ -26,25 +26,41 @@ class YouTubeTranscriptCli(object): for video_id in parsed_args.video_ids: try: - transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id)) + transcripts.append( + self._fetch_transcript(parsed_args, proxies, cookies, video_id) + ) except Exception as exception: exceptions.append(exception) - return '\n\n'.join( + return "\n\n".join( [str(exception) for exception in exceptions] - + ([FormatterLoader().load(parsed_args.format).format_transcripts(transcripts)] if transcripts else []) + + ( + [ + FormatterLoader() + .load(parsed_args.format) + .format_transcripts(transcripts) + ] + if transcripts + else [] + ) ) def _fetch_transcript(self, parsed_args, proxies, cookies, video_id): - transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies) + transcript_list = YouTubeTranscriptApi.list_transcripts( + video_id, proxies=proxies, cookies=cookies + ) if parsed_args.list_transcripts: return str(transcript_list) if parsed_args.exclude_manually_created: - transcript = transcript_list.find_generated_transcript(parsed_args.languages) + transcript = transcript_list.find_generated_transcript( + parsed_args.languages + ) elif parsed_args.exclude_generated: - transcript = transcript_list.find_manually_created_transcript(parsed_args.languages) + transcript = transcript_list.find_manually_created_transcript( + parsed_args.languages + ) else: transcript = transcript_list.find_transcript(parsed_args.languages) @@ -56,80 +72,84 @@ class YouTubeTranscriptCli(object): def _parse_args(self): parser = argparse.ArgumentParser( description=( - 'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. ' - 'It also works for automatically generated subtitles and it does not require a headless browser, like ' - 'other selenium based solutions do!' + "This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. " + "It also works for automatically generated subtitles and it does not require a headless browser, like " + "other selenium based solutions do!" ) ) parser.add_argument( - '--list-transcripts', - action='store_const', + "--list-transcripts", + action="store_const", const=True, default=False, - help='This will list the languages in which the given videos are available in.', + help="This will list the languages in which the given videos are available in.", ) - parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.') parser.add_argument( - '--languages', - nargs='*', - default=['en',], + "video_ids", nargs="+", type=str, help="List of YouTube video IDs." + ) + parser.add_argument( + "--languages", + nargs="*", + default=[ + "en", + ], type=str, help=( 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' - 'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails ' - 'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you ' - 'may have to play around with the language codes a bit, to find the one which is working for you!' + "first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails " + "to do so. As I can't provide a complete list of all working language codes with full certainty, you " + "may have to play around with the language codes a bit, to find the one which is working for you!" ), ) parser.add_argument( - '--exclude-generated', - action='store_const', + "--exclude-generated", + action="store_const", const=True, default=False, - help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.', + help="If this flag is set transcripts which have been generated by YouTube will not be retrieved.", ) parser.add_argument( - '--exclude-manually-created', - action='store_const', + "--exclude-manually-created", + action="store_const", const=True, default=False, - help='If this flag is set transcripts which have been manually created will not be retrieved.', + help="If this flag is set transcripts which have been manually created will not be retrieved.", ) parser.add_argument( - '--format', + "--format", type=str, - default='pretty', + default="pretty", choices=tuple(FormatterLoader.TYPES.keys()), ) parser.add_argument( - '--translate', - default='', + "--translate", + default="", help=( - 'The language code for the language you want this transcript to be translated to. Use the ' - '--list-transcripts feature to find out which languages are translatable and which translation ' - 'languages are available.' - ) + "The language code for the language you want this transcript to be translated to. Use the " + "--list-transcripts feature to find out which languages are translatable and which translation " + "languages are available." + ), ) parser.add_argument( - '--http-proxy', - default='', - metavar='URL', - help='Use the specified HTTP proxy.' + "--http-proxy", + default="", + metavar="URL", + help="Use the specified HTTP proxy.", ) parser.add_argument( - '--https-proxy', - default='', - metavar='URL', - help='Use the specified HTTPS proxy.' + "--https-proxy", + default="", + metavar="URL", + help="Use the specified HTTPS proxy.", ) parser.add_argument( - '--cookies', + "--cookies", default=None, - help='The cookie file that will be used for authorization with youtube.' + help="The cookie file that will be used for authorization with youtube.", ) - + return self._sanitize_video_ids(parser.parse_args(self._args)) def _sanitize_video_ids(self, args): - args.video_ids = [video_id.replace('\\', '') for video_id in args.video_ids] + args.video_ids = [video_id.replace("\\", "") for video_id in args.video_ids] return args diff --git a/youtube_transcript_api/_errors.py b/youtube_transcript_api/_errors.py index d652c59..df4b0ad 100644 --- a/youtube_transcript_api/_errors.py +++ b/youtube_transcript_api/_errors.py @@ -5,16 +5,17 @@ class CouldNotRetrieveTranscript(Exception): """ Raised if a transcript could not be retrieved. """ - ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!' - CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}' - CAUSE_MESSAGE = '' + + ERROR_MESSAGE = "\nCould not retrieve a transcript for the video {video_url}!" + CAUSE_MESSAGE_INTRO = " This is most likely caused by:\n\n{cause}" + CAUSE_MESSAGE = "" GITHUB_REFERRAL = ( - '\n\nIf you are sure that the described cause is not responsible for this error ' - 'and that a transcript should be retrievable, please create an issue at ' - 'https://github.com/jdepoix/youtube-transcript-api/issues. ' - 'Please add which version of youtube_transcript_api you are using ' - 'and provide the information needed to replicate the error. ' - 'Also make sure that there are no open issues which already describe your problem!' + "\n\nIf you are sure that the described cause is not responsible for this error " + "and that a transcript should be retrievable, please create an issue at " + "https://github.com/jdepoix/youtube-transcript-api/issues. " + "Please add which version of youtube_transcript_api you are using " + "and provide the information needed to replicate the error. " + "Also make sure that there are no open issues which already describe your problem!" ) def __init__(self, video_id): @@ -23,10 +24,14 @@ class CouldNotRetrieveTranscript(Exception): def _build_error_message(self): cause = self.cause - error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id)) + error_message = self.ERROR_MESSAGE.format( + video_url=WATCH_URL.format(video_id=self.video_id) + ) if cause: - error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL + error_message += ( + self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL + ) return error_message @@ -36,7 +41,7 @@ class CouldNotRetrieveTranscript(Exception): class YouTubeRequestFailed(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'Request to YouTube failed: {reason}' + CAUSE_MESSAGE = "Request to YouTube failed: {reason}" def __init__(self, video_id, http_error): self.reason = str(http_error) @@ -50,12 +55,12 @@ class YouTubeRequestFailed(CouldNotRetrieveTranscript): class VideoUnavailable(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'The video is no longer available' + CAUSE_MESSAGE = "The video is no longer available" class InvalidVideoId(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( - 'You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n' + "You provided an invalid video id. Make sure you are using the video id and NOT the url!\n\n" 'Do NOT run: `YouTubeTranscriptApi.get_transcript("https://www.youtube.com/watch?v=1234")`\n' 'Instead run: `YouTubeTranscriptApi.get_transcript("1234")`' ) @@ -63,48 +68,48 @@ class InvalidVideoId(CouldNotRetrieveTranscript): class TooManyRequests(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( - 'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. ' - 'One of the following things can be done to work around this:\n\ - - Manually solve the captcha in a browser and export the cookie. ' - 'Read here how to use that cookie with ' - 'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ + "YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. " + "One of the following things can be done to work around this:\n\ + - Manually solve the captcha in a browser and export the cookie. " + "Read here how to use that cookie with " + "youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ - Use a different IP address\n\ - - Wait until the ban on your IP has been lifted' + - Wait until the ban on your IP has been lifted" ) class TranscriptsDisabled(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'Subtitles are disabled for this video' + CAUSE_MESSAGE = "Subtitles are disabled for this video" class NoTranscriptAvailable(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'No transcripts are available for this video' + CAUSE_MESSAGE = "No transcripts are available for this video" class NotTranslatable(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'The requested language is not translatable' + CAUSE_MESSAGE = "The requested language is not translatable" class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'The requested translation language is not available' + CAUSE_MESSAGE = "The requested translation language is not available" class CookiePathInvalid(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded' + CAUSE_MESSAGE = "The provided cookie file was unable to be loaded" class CookiesInvalid(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)' + CAUSE_MESSAGE = "The cookies provided are not valid (may have expired)" class FailedToCreateConsentCookie(CouldNotRetrieveTranscript): - CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies' + CAUSE_MESSAGE = "Failed to automatically give consent to saving cookies" class NoTranscriptFound(CouldNotRetrieveTranscript): CAUSE_MESSAGE = ( - 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' - '{transcript_data}' + "No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n" + "{transcript_data}" ) def __init__(self, video_id, requested_language_codes, transcript_data): diff --git a/youtube_transcript_api/_html_unescaping.py b/youtube_transcript_api/_html_unescaping.py index 3efdf4b..6654d70 100644 --- a/youtube_transcript_api/_html_unescaping.py +++ b/youtube_transcript_api/_html_unescaping.py @@ -2,10 +2,10 @@ import sys # This can only be tested by using different python versions, therefore it is not covered by coverage.py -if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover +if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover # Python 3.4+ from html import unescape -else: # pragma: no cover +else: # pragma: no cover if sys.version_info.major <= 2: # Python 2 import HTMLParser diff --git a/youtube_transcript_api/_settings.py b/youtube_transcript_api/_settings.py index b1f7dfe..585b863 100644 --- a/youtube_transcript_api/_settings.py +++ b/youtube_transcript_api/_settings.py @@ -1 +1 @@ -WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' +WATCH_URL = "https://www.youtube.com/watch?v={video_id}" diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index ef1f44b..7ce4d2e 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -3,7 +3,7 @@ import sys # This can only be tested by using different python versions, therefore it is not covered by coverage.py if sys.version_info.major == 2: # pragma: no cover reload(sys) - sys.setdefaultencoding('utf-8') + sys.setdefaultencoding("utf-8") import json @@ -52,7 +52,7 @@ class TranscriptListFetcher(object): splitted_html = html.split('"captions":') if len(splitted_html) <= 1: - if video_id.startswith('http://') or video_id.startswith('https://'): + if video_id.startswith("http://") or video_id.startswith("https://"): raise InvalidVideoId(video_id) if 'class="g-recaptcha"' in html: raise TooManyRequests(video_id) @@ -62,12 +62,12 @@ class TranscriptListFetcher(object): raise TranscriptsDisabled(video_id) captions_json = json.loads( - splitted_html[1].split(',"videoDetails')[0].replace('\n', '') - ).get('playerCaptionsTracklistRenderer') + splitted_html[1].split(',"videoDetails')[0].replace("\n", "") + ).get("playerCaptionsTracklistRenderer") if captions_json is None: raise TranscriptsDisabled(video_id) - if 'captionTracks' not in captions_json: + if "captionTracks" not in captions_json: raise NoTranscriptAvailable(video_id) return captions_json @@ -76,7 +76,9 @@ class TranscriptListFetcher(object): match = re.search('name="v" value="(.*?)"', html) if match is None: raise FailedToCreateConsentCookie(video_id) - self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com') + self._http_client.cookies.set( + "CONSENT", "YES+" + match.group(1), domain=".youtube.com" + ) def _fetch_video_html(self, video_id): html = self._fetch_html(video_id) @@ -88,7 +90,9 @@ class TranscriptListFetcher(object): return html def _fetch_html(self, video_id): - response = self._http_client.get(WATCH_URL.format(video_id=video_id), headers={'Accept-Language': 'en-US'}) + response = self._http_client.get( + WATCH_URL.format(video_id=video_id), headers={"Accept-Language": "en-US"} + ) return unescape(_raise_http_errors(response, video_id).text) @@ -98,7 +102,13 @@ class TranscriptList(object): for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ - def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): + def __init__( + self, + video_id, + manually_created_transcripts, + generated_transcripts, + translation_languages, + ): """ The constructor is only for internal use. Use the static build method instead. @@ -132,28 +142,29 @@ class TranscriptList(object): """ translation_languages = [ { - 'language': translation_language['languageName']['simpleText'], - 'language_code': translation_language['languageCode'], - } for translation_language in captions_json.get('translationLanguages', []) + "language": translation_language["languageName"]["simpleText"], + "language_code": translation_language["languageCode"], + } + for translation_language in captions_json.get("translationLanguages", []) ] manually_created_transcripts = {} generated_transcripts = {} - for caption in captions_json['captionTracks']: - if caption.get('kind', '') == 'asr': + for caption in captions_json["captionTracks"]: + if caption.get("kind", "") == "asr": transcript_dict = generated_transcripts else: transcript_dict = manually_created_transcripts - transcript_dict[caption['languageCode']] = Transcript( + transcript_dict[caption["languageCode"]] = Transcript( http_client, video_id, - caption['baseUrl'], - caption['name']['simpleText'], - caption['languageCode'], - caption.get('kind', '') == 'asr', - translation_languages if caption.get('isTranslatable', False) else [], + caption["baseUrl"], + caption["name"]["simpleText"], + caption["languageCode"], + caption.get("kind", "") == "asr", + translation_languages if caption.get("isTranslatable", False) else [], ) return TranscriptList( @@ -164,7 +175,10 @@ class TranscriptList(object): ) def __iter__(self): - return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values())) + return iter( + list(self._manually_created_transcripts.values()) + + list(self._generated_transcripts.values()) + ) def find_transcript(self, language_codes): """ @@ -180,7 +194,10 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts]) + return self._find_transcript( + language_codes, + [self._manually_created_transcripts, self._generated_transcripts], + ) def find_generated_transcript(self, language_codes): """ @@ -208,7 +225,9 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._manually_created_transcripts]) + return self._find_transcript( + language_codes, [self._manually_created_transcripts] + ) def _find_transcript(self, language_codes, transcript_dicts): for language_code in language_codes: @@ -216,44 +235,54 @@ class TranscriptList(object): if language_code in transcript_dict: return transcript_dict[language_code] - raise NoTranscriptFound( - self.video_id, - language_codes, - self - ) + raise NoTranscriptFound(self.video_id, language_codes, self) def __str__(self): return ( - 'For this video ({video_id}) transcripts are available in the following languages:\n\n' - '(MANUALLY CREATED)\n' - '{available_manually_created_transcript_languages}\n\n' - '(GENERATED)\n' - '{available_generated_transcripts}\n\n' - '(TRANSLATION LANGUAGES)\n' - '{available_translation_languages}' + "For this video ({video_id}) transcripts are available in the following languages:\n\n" + "(MANUALLY CREATED)\n" + "{available_manually_created_transcript_languages}\n\n" + "(GENERATED)\n" + "{available_generated_transcripts}\n\n" + "(TRANSLATION LANGUAGES)\n" + "{available_translation_languages}" ).format( video_id=self.video_id, available_manually_created_transcript_languages=self._get_language_description( - str(transcript) for transcript in self._manually_created_transcripts.values() + str(transcript) + for transcript in self._manually_created_transcripts.values() ), available_generated_transcripts=self._get_language_description( str(transcript) for transcript in self._generated_transcripts.values() ), available_translation_languages=self._get_language_description( '{language_code} ("{language}")'.format( - language=translation_language['language'], - language_code=translation_language['language_code'], - ) for translation_language in self._translation_languages - ) + language=translation_language["language"], + language_code=translation_language["language_code"], + ) + for translation_language in self._translation_languages + ), ) def _get_language_description(self, transcript_strings): - description = '\n'.join(' - {transcript}'.format(transcript=transcript) for transcript in transcript_strings) - return description if description else 'None' + description = "\n".join( + " - {transcript}".format(transcript=transcript) + for transcript in transcript_strings + ) + return description if description else "None" class Transcript(object): - def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages): + def __init__( + self, + http_client, + video_id, + url, + language, + language_code, + is_generated, + translation_languages, + ): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. @@ -276,7 +305,7 @@ class Transcript(object): self.is_generated = is_generated self.translation_languages = translation_languages self._translation_languages_dict = { - translation_language['language_code']: translation_language['language'] + translation_language["language_code"]: translation_language["language"] for translation_language in translation_languages } @@ -288,7 +317,9 @@ class Transcript(object): :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ - response = self._http_client.get(self._url, headers={'Accept-Language': 'en-US'}) + response = self._http_client.get( + self._url, headers={"Accept-Language": "en-US"} + ) return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) @@ -297,7 +328,7 @@ class Transcript(object): return '{language_code} ("{language}"){translation_description}'.format( language=self.language, language_code=self.language_code, - translation_description='[TRANSLATABLE]' if self.is_translatable else '' + translation_description="[TRANSLATABLE]" if self.is_translatable else "", ) @property @@ -314,7 +345,9 @@ class Transcript(object): return Transcript( self._http_client, self.video_id, - '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code), + "{url}&tlang={language_code}".format( + url=self._url, language_code=language_code + ), self._translation_languages_dict[language_code], language_code, True, @@ -324,16 +357,16 @@ class Transcript(object): class _TranscriptParser(object): _FORMATTING_TAGS = [ - 'strong', # important - 'em', # emphasized - 'b', # bold - 'i', # italic - 'mark', # marked - 'small', # smaller - 'del', # deleted - 'ins', # inserted - 'sub', # subscript - 'sup', # superscript + "strong", # important + "em", # emphasized + "b", # bold + "i", # italic + "mark", # marked + "small", # smaller + "del", # deleted + "ins", # inserted + "sub", # subscript + "sup", # superscript ] def __init__(self, preserve_formatting=False): @@ -341,19 +374,19 @@ class _TranscriptParser(object): def _get_html_regex(self, preserve_formatting): if preserve_formatting: - formats_regex = '|'.join(self._FORMATTING_TAGS) - formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' + formats_regex = "|".join(self._FORMATTING_TAGS) + formats_regex = r"<\/?(?!\/?(" + formats_regex + r")\b).*?\b>" html_regex = re.compile(formats_regex, re.IGNORECASE) else: - html_regex = re.compile(r'<[^>]*>', re.IGNORECASE) + html_regex = re.compile(r"<[^>]*>", re.IGNORECASE) return html_regex def parse(self, plain_data): return [ { - 'text': re.sub(self._html_regex, '', unescape(xml_element.text)), - 'start': float(xml_element.attrib['start']), - 'duration': float(xml_element.attrib.get('dur', '0.0')), + "text": re.sub(self._html_regex, "", unescape(xml_element.text)), + "start": float(xml_element.attrib["start"]), + "duration": float(xml_element.attrib.get("dur", "0.0")), } for xml_element in ElementTree.fromstring(plain_data) if xml_element.text is not None diff --git a/youtube_transcript_api/formatters.py b/youtube_transcript_api/formatters.py index 387e565..e693d47 100644 --- a/youtube_transcript_api/formatters.py +++ b/youtube_transcript_api/formatters.py @@ -12,12 +12,16 @@ class Formatter(object): """ def format_transcript(self, transcript, **kwargs): - raise NotImplementedError('A subclass of Formatter must implement ' \ - 'their own .format_transcript() method.') + raise NotImplementedError( + "A subclass of Formatter must implement " + "their own .format_transcript() method." + ) def format_transcripts(self, transcripts, **kwargs): - raise NotImplementedError('A subclass of Formatter must implement ' \ - 'their own .format_transcripts() method.') + raise NotImplementedError( + "A subclass of Formatter must implement " + "their own .format_transcripts() method." + ) class PrettyPrintFormatter(Formatter): @@ -68,7 +72,7 @@ class TextFormatter(Formatter): :return: all transcript text lines separated by newline breaks.' :rtype str """ - return '\n'.join(line['text'] for line in transcript) + return "\n".join(line["text"] for line in transcript) def format_transcripts(self, transcripts, **kwargs): """Converts a list of transcripts into plain text with no timestamps. @@ -77,21 +81,30 @@ class TextFormatter(Formatter): :return: all transcript text lines separated by newline breaks.' :rtype str """ - return '\n\n\n'.join([self.format_transcript(transcript, **kwargs) for transcript in transcripts]) + return "\n\n\n".join( + [self.format_transcript(transcript, **kwargs) for transcript in transcripts] + ) + class _TextBasedFormatter(TextFormatter): def _format_timestamp(self, hours, mins, secs, ms): - raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ - 'their own .format_timestamp() method.') + raise NotImplementedError( + "A subclass of _TextBasedFormatter must implement " + "their own .format_timestamp() method." + ) def _format_transcript_header(self, lines): - raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ - 'their own _format_transcript_header method.') + raise NotImplementedError( + "A subclass of _TextBasedFormatter must implement " + "their own _format_transcript_header method." + ) def _format_transcript_helper(self, i, time_text, line): - raise NotImplementedError('A subclass of _TextBasedFormatter must implement ' \ - 'their own _format_transcript_helper method.') - + raise NotImplementedError( + "A subclass of _TextBasedFormatter must implement " + "their own _format_transcript_helper method." + ) + def _seconds_to_timestamp(self, time): """Helper that converts `time` into a transcript cue timestamp. @@ -109,26 +122,27 @@ class _TextBasedFormatter(TextFormatter): hours_float, remainder = divmod(time, 3600) mins_float, secs_float = divmod(remainder, 60) hours, mins, secs = int(hours_float), int(mins_float), int(secs_float) - ms = int(round((time - int(time))*1000, 2)) + ms = int(round((time - int(time)) * 1000, 2)) return self._format_timestamp(hours, mins, secs, ms) def format_transcript(self, transcript, **kwargs): """A basic implementation of WEBVTT/SRT formatting. :param transcript: - :reference: + :reference: https://www.w3.org/TR/webvtt1/#introduction-caption https://www.3playmedia.com/blog/create-srt-file/ """ lines = [] for i, line in enumerate(transcript): - end = line['start'] + line['duration'] + end = line["start"] + line["duration"] time_text = "{} --> {}".format( - self._seconds_to_timestamp(line['start']), + self._seconds_to_timestamp(line["start"]), self._seconds_to_timestamp( - transcript[i + 1]['start'] - if i < len(transcript) - 1 and transcript[i + 1]['start'] < end else end - ) + transcript[i + 1]["start"] + if i < len(transcript) - 1 and transcript[i + 1]["start"] < end + else end + ), ) lines.append(self._format_transcript_helper(i, time_text, line)) @@ -138,12 +152,12 @@ class _TextBasedFormatter(TextFormatter): class SRTFormatter(_TextBasedFormatter): def _format_timestamp(self, hours, mins, secs, ms): return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, mins, secs, ms) - + def _format_transcript_header(self, lines): return "\n\n".join(lines) + "\n" def _format_transcript_helper(self, i, time_text, line): - return "{}\n{}\n{}".format(i + 1, time_text, line['text']) + return "{}\n{}\n{}".format(i + 1, time_text, line["text"]) class WebVTTFormatter(_TextBasedFormatter): @@ -154,29 +168,29 @@ class WebVTTFormatter(_TextBasedFormatter): return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" def _format_transcript_helper(self, i, time_text, line): - return "{}\n{}".format(time_text, line['text']) + return "{}\n{}".format(time_text, line["text"]) class FormatterLoader(object): TYPES = { - 'json': JSONFormatter, - 'pretty': PrettyPrintFormatter, - 'text': TextFormatter, - 'webvtt': WebVTTFormatter, - 'srt' : SRTFormatter, + "json": JSONFormatter, + "pretty": PrettyPrintFormatter, + "text": TextFormatter, + "webvtt": WebVTTFormatter, + "srt": SRTFormatter, } class UnknownFormatterType(Exception): def __init__(self, formatter_type): super(FormatterLoader.UnknownFormatterType, self).__init__( - 'The format \'{formatter_type}\' is not supported. ' - 'Choose one of the following formats: {supported_formatter_types}'.format( + "The format '{formatter_type}' is not supported. " + "Choose one of the following formats: {supported_formatter_types}".format( formatter_type=formatter_type, - supported_formatter_types=', '.join(FormatterLoader.TYPES.keys()), + supported_formatter_types=", ".join(FormatterLoader.TYPES.keys()), ) ) - def load(self, formatter_type='pretty'): + def load(self, formatter_type="pretty"): """ Loads the Formatter for the given formatter type. diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 9b5e732..3d2e48c 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -25,8 +25,9 @@ from youtube_transcript_api import ( def load_asset(filename): - filepath = '{dirname}/assets/{filename}'.format( - dirname=os.path.dirname(__file__), filename=filename) + filepath = "{dirname}/assets/{filename}".format( + dirname=os.path.dirname(__file__), filename=filename + ) with open(filepath, mode="rb") as file: return file.read() @@ -37,13 +38,13 @@ class TestYouTubeTranscriptApi(TestCase): httpretty.enable() httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube.html.static"), ) httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/api/timedtext', - body=load_asset('transcript.xml.static') + "https://www.youtube.com/api/timedtext", + body=load_asset("transcript.xml.static"), ) def tearDown(self): @@ -51,306 +52,362 @@ class TestYouTubeTranscriptApi(TestCase): httpretty.disable() def test_get_transcript(self): - transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8') + transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8") self.assertEqual( transcript, [ - {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, - {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} - ] + {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54}, + { + "text": "this is not the original transcript", + "start": 1.54, + "duration": 4.16, + }, + { + "text": "just something shorter, I made up for testing", + "start": 5.7, + "duration": 3.239, + }, + ], ) def test_get_transcript_formatted(self): - transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True) + transcript = YouTubeTranscriptApi.get_transcript( + "GJLlxj_dtq8", preserve_formatting=True + ) self.assertEqual( transcript, [ - {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, - {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} - ] + {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54}, + { + "text": "this is not the original transcript", + "start": 1.54, + "duration": 4.16, + }, + { + "text": "just something shorter, I made up for testing", + "start": 5.7, + "duration": 3.239, + }, + ], ) def test_list_transcripts(self): - transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8") language_codes = {transcript.language_code for transcript in transcript_list} - self.assertEqual(language_codes, {'zh', 'de', 'en', 'hi', 'ja', 'ko', 'es', 'cs', 'en'}) + self.assertEqual( + language_codes, {"zh", "de", "en", "hi", "ja", "ko", "es", "cs", "en"} + ) def test_list_transcripts__find_manually_created(self): - transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') - transcript = transcript_list.find_manually_created_transcript(['cs']) + transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8") + transcript = transcript_list.find_manually_created_transcript(["cs"]) self.assertFalse(transcript.is_generated) - def test_list_transcripts__find_generated(self): - transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8") with self.assertRaises(NoTranscriptFound): - transcript_list.find_generated_transcript(['cs']) + transcript_list.find_generated_transcript(["cs"]) - transcript = transcript_list.find_generated_transcript(['en']) + transcript = transcript_list.find_generated_transcript(["en"]) self.assertTrue(transcript.is_generated) def test_list_transcripts__url_as_video_id(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_transcripts_disabled.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_transcripts_disabled.html.static"), ) with self.assertRaises(InvalidVideoId): - YouTubeTranscriptApi.list_transcripts('https://www.youtube.com/watch?v=GJLlxj_dtq8') - + YouTubeTranscriptApi.list_transcripts( + "https://www.youtube.com/watch?v=GJLlxj_dtq8" + ) def test_list_transcripts__no_translation_languages_provided(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_no_translation_languages.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_no_translation_languages.html.static"), ) - transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') + transcript_list = YouTubeTranscriptApi.list_transcripts("GJLlxj_dtq8") for transcript in transcript_list: self.assertEqual(len(transcript.translation_languages), 0) - def test_translate_transcript(self): - transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + transcript = YouTubeTranscriptApi.list_transcripts( + "GJLlxj_dtq8" + ).find_transcript(["en"]) - translated_transcript = transcript.translate('af') + translated_transcript = transcript.translate("af") - self.assertEqual(translated_transcript.language_code, 'af') - self.assertIn('&tlang=af', translated_transcript._url) + self.assertEqual(translated_transcript.language_code, "af") + self.assertIn("&tlang=af", translated_transcript._url) def test_translate_transcript__translation_language_not_available(self): - transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + transcript = YouTubeTranscriptApi.list_transcripts( + "GJLlxj_dtq8" + ).find_transcript(["en"]) with self.assertRaises(TranslationLanguageNotAvailable): - transcript.translate('xyz') + transcript.translate("xyz") def test_translate_transcript__not_translatable(self): - transcript = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8').find_transcript(['en']) + transcript = YouTubeTranscriptApi.list_transcripts( + "GJLlxj_dtq8" + ).find_transcript(["en"]) transcript.translation_languages = [] with self.assertRaises(NotTranslatable): - transcript.translate('af') + transcript.translate("af") def test_get_transcript__correct_language_is_used(self): - YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en']) + YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", ["de", "en"]) query_string = httpretty.last_request().querystring - self.assertIn('lang', query_string) - self.assertEqual(len(query_string['lang']), 1) - self.assertEqual(query_string['lang'][0], 'de') + self.assertIn("lang", query_string) + self.assertEqual(len(query_string["lang"]), 1) + self.assertEqual(query_string["lang"][0], "de") def test_get_transcript__fallback_language_is_used(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_ww1_nl_en.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_ww1_nl_en.html.static"), ) - YouTubeTranscriptApi.get_transcript('F1xioXWb8CY', ['de', 'en']) + YouTubeTranscriptApi.get_transcript("F1xioXWb8CY", ["de", "en"]) query_string = httpretty.last_request().querystring - self.assertIn('lang', query_string) - self.assertEqual(len(query_string['lang']), 1) - self.assertEqual(query_string['lang'][0], 'en') + self.assertIn("lang", query_string) + self.assertEqual(len(query_string["lang"]), 1) + self.assertEqual(query_string["lang"][0], "en") def test_get_transcript__create_consent_cookie_if_needed(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_consent_page.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_consent_page.html.static"), ) - YouTubeTranscriptApi.get_transcript('F1xioXWb8CY') + YouTubeTranscriptApi.get_transcript("F1xioXWb8CY") self.assertEqual(len(httpretty.latest_requests()), 3) for request in httpretty.latest_requests()[1:]: - self.assertEqual(request.headers['cookie'], 'CONSENT=YES+cb.20210328-17-p0.de+FX+119') + self.assertEqual( + request.headers["cookie"], "CONSENT=YES+cb.20210328-17-p0.de+FX+119" + ) def test_get_transcript__exception_if_create_consent_cookie_failed(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_consent_page.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_consent_page.html.static"), ) httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_consent_page.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_consent_page.html.static"), ) with self.assertRaises(FailedToCreateConsentCookie): - YouTubeTranscriptApi.get_transcript('F1xioXWb8CY') + YouTubeTranscriptApi.get_transcript("F1xioXWb8CY") def test_get_transcript__exception_if_consent_cookie_age_invalid(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_consent_page_invalid.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_consent_page_invalid.html.static"), ) with self.assertRaises(FailedToCreateConsentCookie): - YouTubeTranscriptApi.get_transcript('F1xioXWb8CY') + YouTubeTranscriptApi.get_transcript("F1xioXWb8CY") def test_get_transcript__exception_if_video_unavailable(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_video_unavailable.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_video_unavailable.html.static"), ) with self.assertRaises(VideoUnavailable): - YouTubeTranscriptApi.get_transcript('abc') + YouTubeTranscriptApi.get_transcript("abc") def test_get_transcript__exception_if_youtube_request_fails(self): httpretty.register_uri( - httpretty.GET, - 'https://www.youtube.com/watch', - status=500 + httpretty.GET, "https://www.youtube.com/watch", status=500 ) with self.assertRaises(YouTubeRequestFailed): - YouTubeTranscriptApi.get_transcript('abc') + YouTubeTranscriptApi.get_transcript("abc") def test_get_transcript__exception_if_youtube_request_limit_reached(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_too_many_requests.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_too_many_requests.html.static"), ) with self.assertRaises(TooManyRequests): - YouTubeTranscriptApi.get_transcript('abc') + YouTubeTranscriptApi.get_transcript("abc") def test_get_transcript__exception_if_transcripts_disabled(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_transcripts_disabled.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_transcripts_disabled.html.static"), ) with self.assertRaises(TranscriptsDisabled): - YouTubeTranscriptApi.get_transcript('dsMFmonKDD4') + YouTubeTranscriptApi.get_transcript("dsMFmonKDD4") httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_transcripts_disabled2.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_transcripts_disabled2.html.static"), ) with self.assertRaises(TranscriptsDisabled): - YouTubeTranscriptApi.get_transcript('Fjg5lYqvzUs') + YouTubeTranscriptApi.get_transcript("Fjg5lYqvzUs") def test_get_transcript__exception_if_language_unavailable(self): with self.assertRaises(NoTranscriptFound): - YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz']) + YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", languages=["cz"]) def test_get_transcript__exception_if_no_transcript_available(self): httpretty.register_uri( httpretty.GET, - 'https://www.youtube.com/watch', - body=load_asset('youtube_no_transcript_available.html.static') + "https://www.youtube.com/watch", + body=load_asset("youtube_no_transcript_available.html.static"), ) with self.assertRaises(NoTranscriptAvailable): - YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E') + YouTubeTranscriptApi.get_transcript("MwBPvcYFY2E") def test_get_transcript__with_proxy(self): - proxies = {'http': '', 'https:': ''} - transcript = YouTubeTranscriptApi.get_transcript( - 'GJLlxj_dtq8', proxies=proxies - ) + proxies = {"http": "", "https:": ""} + transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", proxies=proxies) self.assertEqual( transcript, [ - {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, - {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} - ] + {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54}, + { + "text": "this is not the original transcript", + "start": 1.54, + "duration": 4.16, + }, + { + "text": "just something shorter, I made up for testing", + "start": 5.7, + "duration": 3.239, + }, + ], ) - + def test_get_transcript__with_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) - cookies = dirname + '/example_cookies.txt' - transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookies) + cookies = dirname + "/example_cookies.txt" + transcript = YouTubeTranscriptApi.get_transcript("GJLlxj_dtq8", cookies=cookies) self.assertEqual( transcript, [ - {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, - {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} - ] + {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54}, + { + "text": "this is not the original transcript", + "start": 1.54, + "duration": 4.16, + }, + { + "text": "just something shorter, I made up for testing", + "start": 5.7, + "duration": 3.239, + }, + ], ) def test_get_transcript__assertionerror_if_input_not_string(self): with self.assertRaises(AssertionError): - YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2']) + YouTubeTranscriptApi.get_transcript(["video_id_1", "video_id_2"]) def test_get_transcripts__assertionerror_if_input_not_list(self): with self.assertRaises(AssertionError): - YouTubeTranscriptApi.get_transcripts('video_id_1') + YouTubeTranscriptApi.get_transcripts("video_id_1") - @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + @patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript") def test_get_transcripts(self, mock_get_transcript): - video_id_1 = 'video_id_1' - video_id_2 = 'video_id_2' - languages = ['de', 'en'] + video_id_1 = "video_id_1" + video_id_2 = "video_id_2" + languages = ["de", "en"] - YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) + YouTubeTranscriptApi.get_transcripts( + [video_id_1, video_id_2], languages=languages + ) mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False) mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False) self.assertEqual(mock_get_transcript.call_count, 2) - @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) + @patch( + "youtube_transcript_api.YouTubeTranscriptApi.get_transcript", + side_effect=Exception("Error"), + ) def test_get_transcripts__stop_on_error(self, mock_get_transcript): with self.assertRaises(Exception): - YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2']) + YouTubeTranscriptApi.get_transcripts(["video_id_1", "video_id_2"]) - @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) + @patch( + "youtube_transcript_api.YouTubeTranscriptApi.get_transcript", + side_effect=Exception("Error"), + ) def test_get_transcripts__continue_on_error(self, mock_get_transcript): - video_id_1 = 'video_id_1' - video_id_2 = 'video_id_2' + video_id_1 = "video_id_1" + video_id_2 = "video_id_2" - YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) + YouTubeTranscriptApi.get_transcripts( + ["video_id_1", "video_id_2"], continue_after_error=True + ) - mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False) - mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False) - - @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + mock_get_transcript.assert_any_call(video_id_1, ("en",), None, None, False) + mock_get_transcript.assert_any_call(video_id_2, ("en",), None, None, False) + + @patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript") def test_get_transcripts__with_cookies(self, mock_get_transcript): - cookies = '/example_cookies.txt' - YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False) + cookies = "/example_cookies.txt" + YouTubeTranscriptApi.get_transcripts(["GJLlxj_dtq8"], cookies=cookies) + mock_get_transcript.assert_any_call( + "GJLlxj_dtq8", ("en",), None, cookies, False + ) - @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') + @patch("youtube_transcript_api.YouTubeTranscriptApi.get_transcript") def test_get_transcripts__with_proxies(self, mock_get_transcript): - proxies = {'http': '', 'https:': ''} - YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False) + proxies = {"http": "", "https:": ""} + YouTubeTranscriptApi.get_transcripts(["GJLlxj_dtq8"], proxies=proxies) + mock_get_transcript.assert_any_call( + "GJLlxj_dtq8", ("en",), proxies, None, False + ) def test_load_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) - cookies = dirname + '/example_cookies.txt' - session_cookies = YouTubeTranscriptApi._load_cookies(cookies, 'GJLlxj_dtq8') - self.assertEqual({'TEST_FIELD': 'TEST_VALUE'}, requests.utils.dict_from_cookiejar(session_cookies)) + cookies = dirname + "/example_cookies.txt" + session_cookies = YouTubeTranscriptApi._load_cookies(cookies, "GJLlxj_dtq8") + self.assertEqual( + {"TEST_FIELD": "TEST_VALUE"}, + requests.utils.dict_from_cookiejar(session_cookies), + ) def test_load_cookies__bad_file_path(self): - bad_cookies = 'nonexistent_cookies.txt' + bad_cookies = "nonexistent_cookies.txt" with self.assertRaises(CookiePathInvalid): - YouTubeTranscriptApi._load_cookies(bad_cookies, 'GJLlxj_dtq8') + YouTubeTranscriptApi._load_cookies(bad_cookies, "GJLlxj_dtq8") def test_load_cookies__no_valid_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) - expired_cookies = dirname + '/expired_example_cookies.txt' + expired_cookies = dirname + "/expired_example_cookies.txt" with self.assertRaises(CookiesInvalid): - YouTubeTranscriptApi._load_cookies(expired_cookies, 'GJLlxj_dtq8') + YouTubeTranscriptApi._load_cookies(expired_cookies, "GJLlxj_dtq8") diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index 26ffabc..623d4a4 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -10,211 +10,269 @@ from youtube_transcript_api._cli import YouTubeTranscriptCli class TestYouTubeTranscriptCli(TestCase): def setUp(self): self.transcript_mock = MagicMock() - self.transcript_mock.fetch = MagicMock(return_value=[ - {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, - {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} - ]) + self.transcript_mock.fetch = MagicMock( + return_value=[ + {"text": "Hey, this is just a test", "start": 0.0, "duration": 1.54}, + { + "text": "this is not the original transcript", + "start": 1.54, + "duration": 4.16, + }, + { + "text": "just something shorter, I made up for testing", + "start": 5.7, + "duration": 3.239, + }, + ] + ) self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) self.transcript_list_mock = MagicMock() - self.transcript_list_mock.find_generated_transcript = MagicMock(return_value=self.transcript_mock) - self.transcript_list_mock.find_manually_created_transcript = MagicMock(return_value=self.transcript_mock) - self.transcript_list_mock.find_transcript = MagicMock(return_value=self.transcript_mock) + self.transcript_list_mock.find_generated_transcript = MagicMock( + return_value=self.transcript_mock + ) + self.transcript_list_mock.find_manually_created_transcript = MagicMock( + return_value=self.transcript_mock + ) + self.transcript_list_mock.find_transcript = MagicMock( + return_value=self.transcript_mock + ) - YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock) + YouTubeTranscriptApi.list_transcripts = MagicMock( + return_value=self.transcript_list_mock + ) def test_argument_parsing(self): - parsed_args = YouTubeTranscriptCli('v1 v2 --format json --languages de en'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, '') - self.assertEqual(parsed_args.https_proxy, '') - - parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, '') - self.assertEqual(parsed_args.https_proxy, '') - - parsed_args = YouTubeTranscriptCli(' --format json v1 v2 --languages de en'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, '') - self.assertEqual(parsed_args.https_proxy, '') + parsed_args = YouTubeTranscriptCli( + "v1 v2 --format json --languages de en".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.http_proxy, "") + self.assertEqual(parsed_args.https_proxy, "") parsed_args = YouTubeTranscriptCli( - 'v1 v2 --languages de en --format json ' - '--http-proxy http://user:pass@domain:port ' - '--https-proxy https://user:pass@domain:port'.split() + "v1 v2 --languages de en --format json".split() )._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') - self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.http_proxy, "") + self.assertEqual(parsed_args.https_proxy, "") parsed_args = YouTubeTranscriptCli( - 'v1 v2 --languages de en --format json --http-proxy http://user:pass@domain:port'.split() + " --format json v1 v2 --languages de en".split() )._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') - self.assertEqual(parsed_args.https_proxy, '') + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.http_proxy, "") + self.assertEqual(parsed_args.https_proxy, "") parsed_args = YouTubeTranscriptCli( - 'v1 v2 --languages de en --format json --https-proxy https://user:pass@domain:port'.split() + "v1 v2 --languages de en --format json " + "--http-proxy http://user:pass@domain:port " + "--https-proxy https://user:pass@domain:port".split() )._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') - self.assertEqual(parsed_args.http_proxy, '') + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port") + self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port") + + parsed_args = YouTubeTranscriptCli( + "v1 v2 --languages de en --format json --http-proxy http://user:pass@domain:port".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port") + self.assertEqual(parsed_args.https_proxy, "") + + parsed_args = YouTubeTranscriptCli( + "v1 v2 --languages de en --format json --https-proxy https://user:pass@domain:port".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port") + self.assertEqual(parsed_args.http_proxy, "") def test_argument_parsing__only_video_ids(self): - parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'pretty') - self.assertEqual(parsed_args.languages, ['en']) + parsed_args = YouTubeTranscriptCli("v1 v2".split())._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "pretty") + self.assertEqual(parsed_args.languages, ["en"]) def test_argument_parsing__video_ids_starting_with_dash(self): - parsed_args = YouTubeTranscriptCli('\-v1 \-\-v2 \--v3'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['-v1', '--v2', '--v3']) - self.assertEqual(parsed_args.format, 'pretty') - self.assertEqual(parsed_args.languages, ['en']) + parsed_args = YouTubeTranscriptCli("\-v1 \-\-v2 \--v3".split())._parse_args() + self.assertEqual(parsed_args.video_ids, ["-v1", "--v2", "--v3"]) + self.assertEqual(parsed_args.format, "pretty") + self.assertEqual(parsed_args.languages, ["en"]) def test_argument_parsing__fail_without_video_ids(self): with self.assertRaises(SystemExit): - YouTubeTranscriptCli('--format json'.split())._parse_args() + YouTubeTranscriptCli("--format json".split())._parse_args() def test_argument_parsing__json(self): - parsed_args = YouTubeTranscriptCli('v1 v2 --format json'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['en']) + parsed_args = YouTubeTranscriptCli("v1 v2 --format json".split())._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["en"]) - parsed_args = YouTubeTranscriptCli('--format json v1 v2'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'json') - self.assertEqual(parsed_args.languages, ['en']) + parsed_args = YouTubeTranscriptCli("--format json v1 v2".split())._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "json") + self.assertEqual(parsed_args.languages, ["en"]) def test_argument_parsing__languages(self): - parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'pretty') - self.assertEqual(parsed_args.languages, ['de', 'en']) + parsed_args = YouTubeTranscriptCli( + "v1 v2 --languages de en".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "pretty") + self.assertEqual(parsed_args.languages, ["de", "en"]) def test_argument_parsing__proxies(self): parsed_args = YouTubeTranscriptCli( - 'v1 v2 --http-proxy http://user:pass@domain:port'.split() + "v1 v2 --http-proxy http://user:pass@domain:port".split() )._parse_args() - self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') + self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port") parsed_args = YouTubeTranscriptCli( - 'v1 v2 --https-proxy https://user:pass@domain:port'.split() + "v1 v2 --https-proxy https://user:pass@domain:port".split() )._parse_args() - self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port") parsed_args = YouTubeTranscriptCli( - 'v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split() + "v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port".split() )._parse_args() - self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port') - self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port') + self.assertEqual(parsed_args.http_proxy, "http://user:pass@domain:port") + self.assertEqual(parsed_args.https_proxy, "https://user:pass@domain:port") - parsed_args = YouTubeTranscriptCli( - 'v1 v2'.split() - )._parse_args() - self.assertEqual(parsed_args.http_proxy, '') - self.assertEqual(parsed_args.https_proxy, '') + parsed_args = YouTubeTranscriptCli("v1 v2".split())._parse_args() + self.assertEqual(parsed_args.http_proxy, "") + self.assertEqual(parsed_args.https_proxy, "") def test_argument_parsing__list_transcripts(self): - parsed_args = YouTubeTranscriptCli('--list-transcripts v1 v2'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + parsed_args = YouTubeTranscriptCli( + "--list-transcripts v1 v2".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) self.assertTrue(parsed_args.list_transcripts) - parsed_args = YouTubeTranscriptCli('v1 v2 --list-transcripts'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + parsed_args = YouTubeTranscriptCli( + "v1 v2 --list-transcripts".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) self.assertTrue(parsed_args.list_transcripts) def test_argument_parsing__translate(self): - parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'pretty') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.translate, 'cz') + parsed_args = YouTubeTranscriptCli( + "v1 v2 --languages de en --translate cz".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "pretty") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.translate, "cz") - parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) - self.assertEqual(parsed_args.format, 'pretty') - self.assertEqual(parsed_args.languages, ['de', 'en']) - self.assertEqual(parsed_args.translate, 'cz') + parsed_args = YouTubeTranscriptCli( + "v1 v2 --translate cz --languages de en".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) + self.assertEqual(parsed_args.format, "pretty") + self.assertEqual(parsed_args.languages, ["de", "en"]) + self.assertEqual(parsed_args.translate, "cz") def test_argument_parsing__manually_or_generated(self): - parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + parsed_args = YouTubeTranscriptCli( + "v1 v2 --exclude-manually-created".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) self.assertTrue(parsed_args.exclude_manually_created) self.assertFalse(parsed_args.exclude_generated) - parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-generated'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + parsed_args = YouTubeTranscriptCli( + "v1 v2 --exclude-generated".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) self.assertFalse(parsed_args.exclude_manually_created) self.assertTrue(parsed_args.exclude_generated) - parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created --exclude-generated'.split())._parse_args() - self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) + parsed_args = YouTubeTranscriptCli( + "v1 v2 --exclude-manually-created --exclude-generated".split() + )._parse_args() + self.assertEqual(parsed_args.video_ids, ["v1", "v2"]) self.assertTrue(parsed_args.exclude_manually_created) self.assertTrue(parsed_args.exclude_generated) def test_run(self): - YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() + YouTubeTranscriptCli("v1 v2 --languages de en".split()).run() - YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None) - YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v1", proxies=None, cookies=None + ) + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v2", proxies=None, cookies=None + ) - self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en']) + self.transcript_list_mock.find_transcript.assert_any_call(["de", "en"]) def test_run__failing_transcripts(self): - YouTubeTranscriptApi.list_transcripts = MagicMock(side_effect=VideoUnavailable('video_id')) + YouTubeTranscriptApi.list_transcripts = MagicMock( + side_effect=VideoUnavailable("video_id") + ) - output = YouTubeTranscriptCli('v1 --languages de en'.split()).run() + output = YouTubeTranscriptCli("v1 --languages de en".split()).run() - self.assertEqual(output, str(VideoUnavailable('video_id'))) + self.assertEqual(output, str(VideoUnavailable("video_id"))) def test_run__exclude_generated(self): - YouTubeTranscriptCli('v1 v2 --languages de en --exclude-generated'.split()).run() + YouTubeTranscriptCli( + "v1 v2 --languages de en --exclude-generated".split() + ).run() - self.transcript_list_mock.find_manually_created_transcript.assert_any_call(['de', 'en']) + self.transcript_list_mock.find_manually_created_transcript.assert_any_call( + ["de", "en"] + ) def test_run__exclude_manually_created(self): - YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created'.split()).run() + YouTubeTranscriptCli( + "v1 v2 --languages de en --exclude-manually-created".split() + ).run() - self.transcript_list_mock.find_generated_transcript.assert_any_call(['de', 'en']) + self.transcript_list_mock.find_generated_transcript.assert_any_call( + ["de", "en"] + ) def test_run__exclude_manually_created_and_generated(self): self.assertEqual( YouTubeTranscriptCli( - 'v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split() + "v1 v2 --languages de en --exclude-manually-created --exclude-generated".split() ).run(), - '' + "", ) def test_run__translate(self): - YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split()).run(), + YouTubeTranscriptCli("v1 v2 --languages de en --translate cz".split()).run(), - self.transcript_mock.translate.assert_any_call('cz') + self.transcript_mock.translate.assert_any_call("cz") def test_run__list_transcripts(self): - YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run() + YouTubeTranscriptCli("--list-transcripts v1 v2".split()).run() - YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None) - YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None) + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v1", proxies=None, cookies=None + ) + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v2", proxies=None, cookies=None + ) def test_run__json_output(self): - output = YouTubeTranscriptCli('v1 v2 --languages de en --format json'.split()).run() + output = YouTubeTranscriptCli( + "v1 v2 --languages de en --format json".split() + ).run() # will fail if output is not valid json json.loads(output) @@ -222,31 +280,37 @@ class TestYouTubeTranscriptCli(TestCase): def test_run__proxies(self): YouTubeTranscriptCli( ( - 'v1 v2 --languages de en ' - '--http-proxy http://user:pass@domain:port ' - '--https-proxy https://user:pass@domain:port' + "v1 v2 --languages de en " + "--http-proxy http://user:pass@domain:port " + "--https-proxy https://user:pass@domain:port" ).split() ).run() YouTubeTranscriptApi.list_transcripts.assert_any_call( - 'v1', - proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}, - cookies= None + "v1", + proxies={ + "http": "http://user:pass@domain:port", + "https": "https://user:pass@domain:port", + }, + cookies=None, ) YouTubeTranscriptApi.list_transcripts.assert_any_call( - 'v2', - proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}, - cookies=None + "v2", + proxies={ + "http": "http://user:pass@domain:port", + "https": "https://user:pass@domain:port", + }, + cookies=None, ) def test_run__cookies(self): YouTubeTranscriptCli( - ( - 'v1 v2 --languages de en ' - '--cookies blahblah.txt' - ).split() + ("v1 v2 --languages de en " "--cookies blahblah.txt").split() ).run() - YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies='blahblah.txt') - YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies='blahblah.txt') - + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v1", proxies=None, cookies="blahblah.txt" + ) + YouTubeTranscriptApi.list_transcripts.assert_any_call( + "v2", proxies=None, cookies="blahblah.txt" + ) diff --git a/youtube_transcript_api/test/test_formatters.py b/youtube_transcript_api/test/test_formatters.py index b0b3ba2..7eda79a 100644 --- a/youtube_transcript_api/test/test_formatters.py +++ b/youtube_transcript_api/test/test_formatters.py @@ -10,16 +10,17 @@ from youtube_transcript_api.formatters import ( TextFormatter, SRTFormatter, WebVTTFormatter, - PrettyPrintFormatter, FormatterLoader + PrettyPrintFormatter, + FormatterLoader, ) class TestFormatters(TestCase): def setUp(self): self.transcript = [ - {'text': 'Test line 1', 'start': 0.0, 'duration': 1.50}, - {'text': 'line between', 'start': 1.5, 'duration': 2.0}, - {'text': 'testing the end line', 'start': 2.5, 'duration': 3.25} + {"text": "Test line 1", "start": 0.0, "duration": 1.50}, + {"text": "line between", "start": 1.5, "duration": 2.0}, + {"text": "testing the end line", "start": 2.5, "duration": 3.25}, ] self.transcripts = [self.transcript, self.transcript] @@ -31,27 +32,27 @@ class TestFormatters(TestCase): def test_srt_formatter_starting(self): content = SRTFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") # test starting lines self.assertEqual(lines[0], "1") self.assertEqual(lines[1], "00:00:00,000 --> 00:00:01,500") - + def test_srt_formatter_middle(self): content = SRTFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") # test middle lines self.assertEqual(lines[4], "2") self.assertEqual(lines[5], "00:00:01,500 --> 00:00:02,500") - self.assertEqual(lines[6], self.transcript[1]['text']) + self.assertEqual(lines[6], self.transcript[1]["text"]) def test_srt_formatter_ending(self): content = SRTFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") # test ending lines - self.assertEqual(lines[-2], self.transcript[-1]['text']) + self.assertEqual(lines[-2], self.transcript[-1]["text"]) self.assertEqual(lines[-1], "") def test_srt_formatter_many(self): @@ -59,22 +60,25 @@ class TestFormatters(TestCase): content = formatter.format_transcripts(self.transcripts) formatted_single_transcript = formatter.format_transcript(self.transcript) - self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript) + self.assertEqual( + content, + formatted_single_transcript + "\n\n\n" + formatted_single_transcript, + ) def test_webvtt_formatter_starting(self): content = WebVTTFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") # test starting lines self.assertEqual(lines[0], "WEBVTT") self.assertEqual(lines[1], "") - + def test_webvtt_formatter_ending(self): content = WebVTTFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") # test ending lines - self.assertEqual(lines[-2], self.transcript[-1]['text']) + self.assertEqual(lines[-2], self.transcript[-1]["text"]) self.assertEqual(lines[-1], "") def test_webvtt_formatter_many(self): @@ -82,7 +86,10 @@ class TestFormatters(TestCase): content = formatter.format_transcripts(self.transcripts) formatted_single_transcript = formatter.format_transcript(self.transcript) - self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript) + self.assertEqual( + content, + formatted_single_transcript + "\n\n\n" + formatted_single_transcript, + ) def test_pretty_print_formatter(self): content = PrettyPrintFormatter().format_transcript(self.transcript) @@ -106,7 +113,7 @@ class TestFormatters(TestCase): def test_text_formatter(self): content = TextFormatter().format_transcript(self.transcript) - lines = content.split('\n') + lines = content.split("\n") self.assertEqual(lines[0], self.transcript[0]["text"]) self.assertEqual(lines[-1], self.transcript[-1]["text"]) @@ -116,11 +123,14 @@ class TestFormatters(TestCase): content = formatter.format_transcripts(self.transcripts) formatted_single_transcript = formatter.format_transcript(self.transcript) - self.assertEqual(content, formatted_single_transcript + '\n\n\n' + formatted_single_transcript) + self.assertEqual( + content, + formatted_single_transcript + "\n\n\n" + formatted_single_transcript, + ) def test_formatter_loader(self): loader = FormatterLoader() - formatter = loader.load('json') + formatter = loader.load("json") self.assertTrue(isinstance(formatter, JSONFormatter)) @@ -132,4 +142,4 @@ class TestFormatters(TestCase): def test_formatter_loader__unknown_format(self): with self.assertRaises(FormatterLoader.UnknownFormatterType): - FormatterLoader().load('png') + FormatterLoader().load("png")