Rebased on PR #11 and added tests

This commit is contained in:
Danny Aziz 2019-03-16 16:23:42 +00:00
parent 4a564743df
commit 86cd1666c0
5 changed files with 108 additions and 20 deletions

View File

@ -1,11 +1,5 @@
# YouTube Transcript/Subtitle API (including automatically generated subtitles) # YouTube Transcript/Subtitle API (including automatically generated subtitles)
[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api)
[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master)
[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT)
[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do! This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!
## Install ## Install
@ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application,
youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json
``` ```
### Proxy
You can route the network requests through a proxy by passing a dictionary of proxies (API) or the proxy options (CLI).
Code:
```python
from youtube_transcript_api import YouTubeTranscriptApi
YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"})
```
CLI:
```
youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port
```
Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies
## Warning ## Warning
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know! This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know!

View File

@ -38,7 +38,7 @@ class YouTubeTranscriptApi():
self.video_id = video_id self.video_id = video_id
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False): def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -55,13 +55,15 @@ class YouTubeTranscriptApi():
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]) :rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
""" """
data = {} data = {}
unretrievable_videos = [] unretrievable_videos = []
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = cls.get_transcript(video_id, languages) data[video_id] = cls.get_transcript(video_id, languages, proxies)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception
@ -71,7 +73,7 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos return data, unretrievable_videos
@classmethod @classmethod
def get_transcript(cls, video_id, languages=None): def get_transcript(cls, video_id, languages=None, proxies=None):
""" """
Retrieves the transcript for a single video. Retrieves the transcript for a single video.
@ -84,9 +86,11 @@ class YouTubeTranscriptApi():
:type languages: [str] :type languages: [str]
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}] :rtype: [{'text': str, 'start': float, 'duration': float}]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
""" """
try: try:
return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse()
except Exception: except Exception:
logger.error( logger.error(
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
@ -101,12 +105,16 @@ class _TranscriptFetcher():
API_BASE_URL = 'https://www.youtube.com/api/{api_url}' API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
def __init__(self, video_id, languages): def __init__(self, video_id, languages, proxies):
self.video_id = video_id self.video_id = video_id
self.languages = languages self.languages = languages
self.proxies = proxies
def fetch(self): def fetch(self):
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text if self.proxies:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
else:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
timedtext_url_start = fetched_site.find('timedtext') timedtext_url_start = fetched_site.find('timedtext')
for language in (self.languages if self.languages else [None,]): for language in (self.languages if self.languages else [None,]):
@ -128,7 +136,10 @@ class _TranscriptFetcher():
) )
if language: if language:
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
return requests.get(url).text if self.proxies:
return requests.get(url, proxies=self.proxies).text
else:
return requests.get(url).text
class _TranscriptParser(): class _TranscriptParser():

View File

@ -14,10 +14,13 @@ class YouTubeTranscriptCli():
def run(self): def run(self):
parsed_args = self._parse_args() parsed_args = self._parse_args()
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
transcripts, _ = YouTubeTranscriptApi.get_transcripts( transcripts, _ = YouTubeTranscriptApi.get_transcripts(
parsed_args.video_ids, parsed_args.video_ids,
languages=parsed_args.languages, languages=parsed_args.languages,
continue_after_error=True continue_after_error=True,
proxies=proxies
) )
if parsed_args.json: if parsed_args.json:
@ -53,5 +56,15 @@ class YouTubeTranscriptCli():
default=False, default=False,
help='If this flag is set the output will be JSON formatted.', help='If this flag is set the output will be JSON formatted.',
) )
parser.add_argument(
'--http-proxy', dest='http_proxy',
default='', metavar='URL',
help='Use the specified HTTP proxy.'
)
parser.add_argument(
'--https-proxy', dest='https_proxy',
default='', metavar='URL',
help='Use the specified HTTPS proxy.'
)
return parser.parse_args(self._args) return parser.parse_args(self._args)

View File

@ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages) YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages) YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2) self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
def test_get_transcripts__stop_on_error(self): def test_get_transcripts__stop_on_error(self):
@ -99,5 +99,19 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None) YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None) YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None)
def test_get_transcript__with_proxies(self):
    """
    Retrieving a transcript must still work when a proxies mapping is passed.

    The mapping is keyed by URL scheme ('http' and 'https'), the same shape
    the CLI builds from its --http-proxy/--https-proxy options.
    """
    # The keys are plain scheme names; a key such as 'https:' (with a
    # trailing colon) would not be an entry for the https scheme.
    transcript = YouTubeTranscriptApi.get_transcript(
        'GJLlxj_dtq8', proxies={'http': '', 'https': ''}
    )

    self.assertEqual(
        transcript,
        [
            {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
            {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
            {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
        ]
    )

View File

@ -23,6 +23,31 @@ class TestYouTubeTranscriptCli(TestCase):
self.assertEqual(parsed_args.json, True) self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en']) self.assertEqual(parsed_args.languages, ['de', 'en'])
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
def test_argument_parsing__only_video_ids(self): def test_argument_parsing__only_video_ids(self):
parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args() parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2']) self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
@ -50,6 +75,17 @@ class TestYouTubeTranscriptCli(TestCase):
self.assertEqual(parsed_args.json, False) self.assertEqual(parsed_args.json, False)
self.assertEqual(parsed_args.languages, ['de', 'en']) self.assertEqual(parsed_args.languages, ['de', 'en'])
def test_argument_parsing__proxies(self):
    """
    Each proxy option, given on its own after the video ids, is parsed into
    its matching attribute on the parsed arguments.
    """
    proxy_cases = (
        ('--http-proxy', 'http_proxy', 'http://user:pass@domain:port'),
        ('--https-proxy', 'https_proxy', 'https://user:pass@domain:port'),
    )

    for option, attribute, proxy_url in proxy_cases:
        # Same argument vector the space-separated command line would split into.
        cli_args = ['v1', 'v2', option, proxy_url]
        parsed_args = YouTubeTranscriptCli(cli_args)._parse_args()
        self.assertEqual(getattr(parsed_args, attribute), proxy_url)
def test_run(self): def test_run(self):
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], [])) YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
@ -57,7 +93,8 @@ class TestYouTubeTranscriptCli(TestCase):
YouTubeTranscriptApi.get_transcripts.assert_called_once_with( YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
['v1', 'v2'], ['v1', 'v2'],
languages=['de', 'en'], languages=['de', 'en'],
continue_after_error=True continue_after_error=True,
proxies={"http": "", "https": ""}
) )
def test_run__json_output(self): def test_run__json_output(self):