Added new CLI parameters to make the new features accessible from the command line
This commit is contained in:
parent
4b75a47a74
commit
f8416ab004
|
@ -72,11 +72,11 @@ class YouTubeTranscriptApi():
|
||||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||||
exceptions which occurred for the videos which could not be retrieved
|
video ids, which could not be retrieved
|
||||||
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [CouldNotRetrieveTranscript]}):
|
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
|
||||||
"""
|
"""
|
||||||
data = {}
|
data = {}
|
||||||
exceptions = []
|
unretrievable_videos = []
|
||||||
|
|
||||||
for video_id in video_ids:
|
for video_id in video_ids:
|
||||||
try:
|
try:
|
||||||
|
@ -85,9 +85,9 @@ class YouTubeTranscriptApi():
|
||||||
if not continue_after_error:
|
if not continue_after_error:
|
||||||
raise exception
|
raise exception
|
||||||
|
|
||||||
exceptions.append(exception)
|
unretrievable_videos.append(video_id)
|
||||||
|
|
||||||
return data, exceptions
|
return data, unretrievable_videos
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcript(cls, video_id, languages=('en',), proxies=None):
|
def get_transcript(cls, video_id, languages=('en',), proxies=None):
|
||||||
|
|
|
@ -14,22 +14,42 @@ class YouTubeTranscriptCli():
|
||||||
def run(self):
|
def run(self):
|
||||||
parsed_args = self._parse_args()
|
parsed_args = self._parse_args()
|
||||||
|
|
||||||
|
if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
|
||||||
|
return ''
|
||||||
|
|
||||||
proxies = None
|
proxies = None
|
||||||
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
|
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
|
||||||
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
|
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
|
||||||
|
|
||||||
transcripts, unretrievable_videos = YouTubeTranscriptApi.get_transcripts(
|
transcripts = []
|
||||||
parsed_args.video_ids,
|
exceptions = []
|
||||||
languages=parsed_args.languages,
|
|
||||||
continue_after_error=True,
|
for video_id in parsed_args.video_ids:
|
||||||
proxies=proxies
|
try:
|
||||||
)
|
transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id))
|
||||||
|
except Exception as exception:
|
||||||
|
exceptions.append(exception)
|
||||||
|
|
||||||
return '\n\n'.join(
|
return '\n\n'.join(
|
||||||
[str(exception) for exception in unretrievable_videos]
|
[str(exception) for exception in exceptions]
|
||||||
+ ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else [])
|
+ ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else [])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _fetch_transcript(self, parsed_args, proxies, video_id):
|
||||||
|
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies)
|
||||||
|
|
||||||
|
if parsed_args.exclude_manually_created:
|
||||||
|
transcript = transcript_list.find_generated_transcript(parsed_args.languages)
|
||||||
|
elif parsed_args.exclude_generated:
|
||||||
|
transcript = transcript_list.find_manually_created_transcript(parsed_args.languages)
|
||||||
|
else:
|
||||||
|
transcript = transcript_list.find_transcript(parsed_args.languages)
|
||||||
|
|
||||||
|
if parsed_args.translate:
|
||||||
|
transcript = transcript.translate(parsed_args.translate)
|
||||||
|
|
||||||
|
return transcript.fetch()
|
||||||
|
|
||||||
def _parse_args(self):
|
def _parse_args(self):
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description=(
|
description=(
|
||||||
|
@ -38,6 +58,13 @@ class YouTubeTranscriptCli():
|
||||||
'other selenium based solutions do!'
|
'other selenium based solutions do!'
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--list-transcripts',
|
||||||
|
action='store_const',
|
||||||
|
const=True,
|
||||||
|
default=False,
|
||||||
|
help='This will list the languages in which the given videos are available in.',
|
||||||
|
)
|
||||||
parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
|
parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--languages',
|
'--languages',
|
||||||
|
@ -46,11 +73,25 @@ class YouTubeTranscriptCli():
|
||||||
type=str,
|
type=str,
|
||||||
help=(
|
help=(
|
||||||
'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
|
'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
|
||||||
'first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails '
|
'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
|
||||||
'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
|
'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
|
||||||
'may have to play around with the language codes a bit, to find the one which is working for you!'
|
'may have to play around with the language codes a bit, to find the one which is working for you!'
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--exclude-generated',
|
||||||
|
action='store_const',
|
||||||
|
const=True,
|
||||||
|
default=False,
|
||||||
|
help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--exclude-manually-created',
|
||||||
|
action='store_const',
|
||||||
|
const=True,
|
||||||
|
default=False,
|
||||||
|
help='If this flag is set transcripts which have been manually created will not be retrieved.',
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--json',
|
'--json',
|
||||||
action='store_const',
|
action='store_const',
|
||||||
|
@ -59,13 +100,24 @@ class YouTubeTranscriptCli():
|
||||||
help='If this flag is set the output will be JSON formatted.',
|
help='If this flag is set the output will be JSON formatted.',
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--http-proxy', dest='http_proxy',
|
'--translate',
|
||||||
default='', metavar='URL',
|
default='',
|
||||||
|
help=(
|
||||||
|
'The language code for the language you want this transcript to be translated to. Use the '
|
||||||
|
'--list-transcripts feature to find out which languages are translatable and which translation '
|
||||||
|
'languages are available.'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--http-proxy',
|
||||||
|
default='',
|
||||||
|
metavar='URL',
|
||||||
help='Use the specified HTTP proxy.'
|
help='Use the specified HTTP proxy.'
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--https-proxy', dest='https_proxy',
|
'--https-proxy',
|
||||||
default='', metavar='URL',
|
default='',
|
||||||
|
metavar='URL',
|
||||||
help='Use the specified HTTPS proxy.'
|
help='Use the specified HTTPS proxy.'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -3,10 +3,27 @@ from mock import MagicMock
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi
|
from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable
|
||||||
|
from youtube_transcript_api._cli import YouTubeTranscriptCli
|
||||||
|
|
||||||
|
|
||||||
class TestYouTubeTranscriptCli(TestCase):
|
class TestYouTubeTranscriptCli(TestCase):
|
||||||
|
def setUp(self):
|
||||||
|
self.transcript_mock = MagicMock()
|
||||||
|
self.transcript_mock.fetch = MagicMock(return_value=[
|
||||||
|
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
|
||||||
|
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
|
||||||
|
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
|
||||||
|
])
|
||||||
|
self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)
|
||||||
|
|
||||||
|
self.transcript_list_mock = MagicMock()
|
||||||
|
self.transcript_list_mock.find_generated_transcript = MagicMock(return_value=self.transcript_mock)
|
||||||
|
self.transcript_list_mock.find_manually_created_transcript = MagicMock(return_value=self.transcript_mock)
|
||||||
|
self.transcript_list_mock.find_transcript = MagicMock(return_value=self.transcript_mock)
|
||||||
|
|
||||||
|
YouTubeTranscriptApi.list_transcripts = MagicMock(return_value=self.transcript_list_mock)
|
||||||
|
|
||||||
def test_argument_parsing(self):
|
def test_argument_parsing(self):
|
||||||
parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args()
|
parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args()
|
||||||
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
@ -106,32 +123,107 @@ class TestYouTubeTranscriptCli(TestCase):
|
||||||
self.assertEqual(parsed_args.http_proxy, '')
|
self.assertEqual(parsed_args.http_proxy, '')
|
||||||
self.assertEqual(parsed_args.https_proxy, '')
|
self.assertEqual(parsed_args.https_proxy, '')
|
||||||
|
|
||||||
|
def test_argument_parsing__list_transcripts(self):
|
||||||
|
parsed_args = YouTubeTranscriptCli('--list-transcripts v1 v2'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertTrue(parsed_args.list_transcripts)
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --list-transcripts'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertTrue(parsed_args.list_transcripts)
|
||||||
|
|
||||||
|
def test_argument_parsing__translate(self):
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertEqual(parsed_args.json, False)
|
||||||
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
self.assertEqual(parsed_args.translate, 'cz')
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --translate cz --languages de en'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertEqual(parsed_args.json, False)
|
||||||
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
self.assertEqual(parsed_args.translate, 'cz')
|
||||||
|
|
||||||
|
def test_argument_parsing__manually_or_generated(self):
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertTrue(parsed_args.exclude_manually_created)
|
||||||
|
self.assertFalse(parsed_args.exclude_generated)
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-generated'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertFalse(parsed_args.exclude_manually_created)
|
||||||
|
self.assertTrue(parsed_args.exclude_generated)
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli('v1 v2 --exclude-manually-created --exclude-generated'.split())._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertTrue(parsed_args.exclude_manually_created)
|
||||||
|
self.assertTrue(parsed_args.exclude_generated)
|
||||||
|
|
||||||
def test_run(self):
|
def test_run(self):
|
||||||
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
|
|
||||||
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
|
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
|
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
|
||||||
['v1', 'v2'],
|
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
|
||||||
languages=['de', 'en'],
|
|
||||||
continue_after_error=True,
|
self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en'])
|
||||||
proxies=None
|
|
||||||
|
def test_run__failing_transcripts(self):
|
||||||
|
YouTubeTranscriptApi.list_transcripts = MagicMock(side_effect=VideoUnavailable('video_id'))
|
||||||
|
|
||||||
|
output = YouTubeTranscriptCli('v1 --languages de en'.split()).run()
|
||||||
|
|
||||||
|
self.assertEqual(output, str(VideoUnavailable('video_id')))
|
||||||
|
|
||||||
|
def test_run__exclude_generated(self):
|
||||||
|
YouTubeTranscriptCli('v1 v2 --languages de en --exclude-generated'.split()).run()
|
||||||
|
|
||||||
|
self.transcript_list_mock.find_manually_created_transcript.assert_any_call(['de', 'en'])
|
||||||
|
|
||||||
|
def test_run__exclude_manually_created(self):
|
||||||
|
YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created'.split()).run()
|
||||||
|
|
||||||
|
self.transcript_list_mock.find_generated_transcript.assert_any_call(['de', 'en'])
|
||||||
|
|
||||||
|
def test_run__exclude_manually_created_and_generated(self):
|
||||||
|
self.assertEqual(
|
||||||
|
YouTubeTranscriptCli('v1 v2 --languages de en --exclude-manually-created --exclude-generated'.split()).run(),
|
||||||
|
''
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_run__translate(self):
|
||||||
|
YouTubeTranscriptCli('v1 v2 --languages de en --translate cz'.split()).run(),
|
||||||
|
|
||||||
|
self.transcript_mock.translate.assert_any_call('cz')
|
||||||
|
|
||||||
|
def test_run__list_transcripts(self):
|
||||||
|
YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run()
|
||||||
|
|
||||||
|
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None)
|
||||||
|
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None)
|
||||||
|
|
||||||
def test_run__json_output(self):
|
def test_run__json_output(self):
|
||||||
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([{'boolean': True}], []))
|
|
||||||
output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
|
output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
|
||||||
|
|
||||||
# will fail if output is not valid json
|
# will fail if output is not valid json
|
||||||
json.loads(output)
|
json.loads(output)
|
||||||
|
|
||||||
def test_run__proxies(self):
|
def test_run__proxies(self):
|
||||||
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
|
|
||||||
YouTubeTranscriptCli(
|
YouTubeTranscriptCli(
|
||||||
'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()).run()
|
(
|
||||||
|
'v1 v2 --languages de en '
|
||||||
|
'--http-proxy http://user:pass@domain:port '
|
||||||
|
'--https-proxy https://user:pass@domain:port'
|
||||||
|
).split()
|
||||||
|
).run()
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
|
YouTubeTranscriptApi.list_transcripts.assert_any_call(
|
||||||
['v1', 'v2'],
|
'v1',
|
||||||
languages=['de', 'en'],
|
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
|
||||||
continue_after_error=True,
|
)
|
||||||
|
|
||||||
|
YouTubeTranscriptApi.list_transcripts.assert_any_call(
|
||||||
|
'v2',
|
||||||
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
|
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue