From 7159f6082361ea9c944e18b550cfc928080f59a7 Mon Sep 17 00:00:00 2001 From: Jonas Depoix Date: Tue, 31 Dec 2019 00:11:52 +0100 Subject: [PATCH] fixed bug in cli where no transcript could be retrieved if no language was specified --- README.md | 191 ++++++++++++------------ youtube_transcript_api/_cli.py | 2 +- youtube_transcript_api/test/test_cli.py | 6 +- 3 files changed, 97 insertions(+), 102 deletions(-) diff --git a/README.md b/README.md index 4292061..f4516f6 100644 --- a/README.md +++ b/README.md @@ -1,74 +1,69 @@ # YouTube Transcript/Subtitle API (including automatically generated subtitles and subtitle translations) -[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) -[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) -[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) -[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) -[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) -[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) - -This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! 
- -## Install - -It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): - -``` -pip install youtube_transcript_api -``` - -If you want to use it from source, you'll have to install the dependencies manually: - -``` -pip install -r requirements.txt -``` +[![Donate](https://img.shields.io/badge/Donate-PayPal-green.svg)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) [![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api) [![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master) [![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT) [![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) [![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/) + +This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles, supports translating subtitles and it does not require a headless browser, like other selenium based solutions do! + +## Install + +It is recommended to [install this module by using pip](https://pypi.org/project/youtube-transcript-api/): + +``` +pip install youtube_transcript_api +``` + +If you want to use it from source, you'll have to install the dependencies manually: + +``` +pip install -r requirements.txt +``` You can either integrate this module [into an existing application](#api), or just use it via an [CLI](#cli). 
- + ## API - -The easiest way to get a transcript for a given video is to execute: - -```python -from youtube_transcript_api import YouTubeTranscriptApi - -YouTubeTranscriptApi.get_transcript(video_id) -``` - -This will return a list of dictionaries looking somewhat like this: - -```python -[ - { - 'text': 'Hey there', - 'start': 7.58, - 'duration': 6.13 - }, - { - 'text': 'how are you', - 'start': 14.08, - 'duration': 7.58 - }, - # ... -] -``` - -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). - -```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) -``` - + +The easiest way to get a transcript for a given video is to execute: + +```python +from youtube_transcript_api import YouTubeTranscriptApi + +YouTubeTranscriptApi.get_transcript(video_id) +``` + +This will return a list of dictionaries looking somewhat like this: + +```python +[ + { + 'text': 'Hey there', + 'start': 7.58, + 'duration': 6.13 + }, + { + 'text': 'how are you', + 'start': 14.08, + 'duration': 7.58 + }, + # ... +] +``` + +You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + It's a list of language codes in a descending priority. In this example it will first try to fetch the german transcript (`'de'`) and then fetch the english transcript (`'en'`) if it fails to do so. If you want to find out which languages are available first, [have a look at `list_transcripts()`](#list-available-transcripts) - -To get transcripts for a list of video ids you can call: - -```python -YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) -``` - -`languages` also is optional here. 
+ +To get transcripts for a list of video ids you can call: + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) +``` + +`languages` also is optional here. ### List available transcripts @@ -81,16 +76,16 @@ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, languages=['de This will return a `TranscriptList` object which is iterable and provides methods to filter the list of transcripts for specific languages and types, like: ```python -transcript = transcript_list.find_transcript(['de', 'en']) +transcript = transcript_list.find_transcript(['de', 'en']) ``` By default this module always picks manually created transcripts over automatically created ones, if a transcript in the requested language is available both manually created and generated. The `TranscriptList` allows you to bypass this default behaviour by searching for specific transcript types: ```python -# filter for manually created transcripts -transcript = transcript_list.find_manually_created_transcript(['de', 'en']) - -# or automatically generated ones +# filter for manually created transcripts +transcript = transcript_list.find_manually_created_transcript(['de', 'en']) + +# or automatically generated ones transcript = transcript_list.find_generated_transcript(['de', 'en']) ``` @@ -98,15 +93,15 @@ The methods `find_generated_transcript`, `find_manually_created_transcript`, `fi ```python print( - transcript.video_id, - transcript.language, - transcript.language_code, - # whether it has been manually created or generated by YouTube - transcript.is_generated, - # whether this transcript can be translated or not - transcript.is_translatable, - # a list of languages the transcript can be translated to - transcript.translation_languages, + transcript.video_id, + transcript.language, + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + 
transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, ) ``` @@ -116,42 +111,42 @@ and provide the method, which allows you to fetch the actual transcript data: transcript.fetch() ``` -### Translate transcript +### Translate transcript YouTube has a feature which allows you to automatically translate subtitles. This module also makes it possible to access this feature. To do so `Transcript` objects provide a `translate()` method, which returns a new translated `Transcript` object: ```python -transcript = transcript_list.find_transcript(['en']) +transcript = transcript_list.find_transcript(['en']) translated_transcript = transcript.translate('de') print(translated_transcript.fetch()) ``` ### By example ```python -# retrieve the available transcripts -transcript_list = YouTubeTranscriptApi.get('video_id') - +# retrieve the available transcripts +transcript_list = YouTubeTranscriptApi.get('video_id') + # iterate over all available transcripts for transcript in transcript_list: # the Transcript object provides metadata properties - print( + print( transcript.video_id, transcript.language, - transcript.language_code, - # whether it has been manually created or generated by YouTube - transcript.is_generated, - # whether this transcript can be translated or not - transcript.is_translatable, - # a list of languages the transcript can be translated to - transcript.translation_languages, - ) - - # fetch the actual transcript data - print(transcript.fetch()) - - # translating the transcript will return another transcript object - print(transcript.translate('en').fetch()) + transcript.language_code, + # whether it has been manually created or generated by YouTube + transcript.is_generated, + # whether this transcript can be translated or not + transcript.is_translatable, + # a list of languages the transcript can be translated to + transcript.translation_languages, + ) + + # fetch the actual transcript data + 
print(transcript.fetch()) + + # translating the transcript will return another transcript object + print(transcript.translate('en').fetch()) # you can also directly filter for the language you are looking for, using the transcript list transcript = transcript_list.find_transcript(['de', 'en']) diff --git a/youtube_transcript_api/_cli.py b/youtube_transcript_api/_cli.py index 21f816b..043bf19 100644 --- a/youtube_transcript_api/_cli.py +++ b/youtube_transcript_api/_cli.py @@ -72,7 +72,7 @@ class YouTubeTranscriptCli(): parser.add_argument( '--languages', nargs='*', - default=[], + default=['en',], type=str, help=( 'A list of language codes in a descending priority. For example, if this is set to "de en" it will ' diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index e46789e..d2676d8 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -77,7 +77,7 @@ class TestYouTubeTranscriptCli(TestCase): parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, False) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) def test_argument_parsing__fail_without_video_ids(self): with self.assertRaises(SystemExit): @@ -87,12 +87,12 @@ class TestYouTubeTranscriptCli(TestCase): parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args() self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.json, True) - self.assertEqual(parsed_args.languages, []) + self.assertEqual(parsed_args.languages, ['en']) def test_argument_parsing__languages(self): parsed_args = 
YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args()