Merge pull request #10 from DannyAziz/master

Add proxy functionality
This commit is contained in:
jdepoix 2019-03-27 11:07:36 +01:00 committed by GitHub
commit f9d7d337f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 146 additions and 20 deletions

View File

@ -1,11 +1,5 @@
# YouTube Transcript/Subtitle API (including automatically generated subtitles)
[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api)
[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master)
[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT)
[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other Selenium-based solutions do!
## Install
@ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application,
youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json
```
### Proxy
You can pass proxies to be used for the network requests, either as a `proxies` dictionary in code or via the `--http-proxy`/`--https-proxy` CLI options.
Code:
```python
from youtube_transcript_api import YouTubeTranscriptApi
YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"})
```
CLI:
```
youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port
```
Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies
## Warning
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know!
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know!

View File

@ -38,7 +38,7 @@ class YouTubeTranscriptApi():
self.video_id = video_id
@classmethod
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False):
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
"""
Retrieves the transcripts for a list of videos.
@ -55,13 +55,15 @@ class YouTubeTranscriptApi():
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved
:rtype: ({str: [{'text': str, 'start': float, 'duration': float}]}, [str])
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
"""
data = {}
unretrievable_videos = []
for video_id in video_ids:
try:
data[video_id] = cls.get_transcript(video_id, languages)
data[video_id] = cls.get_transcript(video_id, languages, proxies)
except Exception as exception:
if not continue_after_error:
raise exception
@ -71,7 +73,7 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos
@classmethod
def get_transcript(cls, video_id, languages=None):
def get_transcript(cls, video_id, languages=None, proxies=None):
"""
Retrieves the transcript for a single video.
@ -84,9 +86,11 @@ class YouTubeTranscriptApi():
:type languages: [str]
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
"""
try:
return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse()
except Exception:
logger.error(
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
@ -101,12 +105,16 @@ class _TranscriptFetcher():
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
def __init__(self, video_id, languages):
def __init__(self, video_id, languages, proxies):
self.video_id = video_id
self.languages = languages
self.proxies = proxies
def fetch(self):
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
if self.proxies:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
else:
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
timedtext_url_start = fetched_site.find('timedtext')
for language in (self.languages if self.languages else [None,]):
@ -128,7 +136,10 @@ class _TranscriptFetcher():
)
if language:
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
return requests.get(url).text
if self.proxies:
return requests.get(url, proxies=self.proxies).text
else:
return requests.get(url).text
class _TranscriptParser():

View File

@ -14,10 +14,15 @@ class YouTubeTranscriptCli():
def run(self):
parsed_args = self._parse_args()
proxies = None
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
transcripts, _ = YouTubeTranscriptApi.get_transcripts(
parsed_args.video_ids,
languages=parsed_args.languages,
continue_after_error=True
continue_after_error=True,
proxies=proxies
)
if parsed_args.json:
@ -53,5 +58,15 @@ class YouTubeTranscriptCli():
default=False,
help='If this flag is set the output will be JSON formatted.',
)
parser.add_argument(
'--http-proxy', dest='http_proxy',
default='', metavar='URL',
help='Use the specified HTTP proxy.'
)
parser.add_argument(
'--https-proxy', dest='https_proxy',
default='', metavar='URL',
help='Use the specified HTTPS proxy.'
)
return parser.parse_args(self._args)

View File

@ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
def test_get_transcripts__stop_on_error(self):
@ -99,5 +99,23 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None)
def test_get_transcript__with_proxies(self):
    """
    Checks that a proxies dictionary is accepted by get_transcript and that get_transcripts forwards it to
    get_transcript as the third positional argument.
    """
    # the proxies mapping is keyed by URL scheme, so the key has to be 'https' (without a trailing colon)
    proxies = {'http': '', 'https': ''}
    transcript = YouTubeTranscriptApi.get_transcript(
        'GJLlxj_dtq8', proxies=proxies
    )
    self.assertEqual(
        transcript,
        [
            {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
            {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
            {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
        ]
    )

    # read the raw classmethod object from the class dict, so it can be put back unchanged after mocking;
    # otherwise the MagicMock would stay installed on the class once this test is done
    original_get_transcript = YouTubeTranscriptApi.__dict__['get_transcript']
    YouTubeTranscriptApi.get_transcript = MagicMock()
    try:
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
        YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', None, proxies)
    finally:
        YouTubeTranscriptApi.get_transcript = original_get_transcript

View File

@ -12,16 +12,49 @@ class TestYouTubeTranscriptCli(TestCase):
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, '')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
self.assertEqual(parsed_args.https_proxy, '')
parsed_args = YouTubeTranscriptCli(
'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split()
)._parse_args()
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
self.assertEqual(parsed_args.json, True)
self.assertEqual(parsed_args.languages, ['de', 'en'])
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
self.assertEqual(parsed_args.http_proxy, '')
def test_argument_parsing__only_video_ids(self):
parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
@ -50,6 +83,29 @@ class TestYouTubeTranscriptCli(TestCase):
self.assertEqual(parsed_args.json, False)
self.assertEqual(parsed_args.languages, ['de', 'en'])
def test_argument_parsing__proxies(self):
    """
    Checks the parsing of --http-proxy and --https-proxy: each flag on its own, both flags together, and
    the '' defaults when neither flag is given.
    """
    def parse(command_line):
        # mirrors how the CLI receives its arguments: a whitespace separated argument list
        return YouTubeTranscriptCli(command_line.split())._parse_args()

    http_only = parse('v1 v2 --http-proxy http://user:pass@domain:port')
    self.assertEqual(http_only.http_proxy, 'http://user:pass@domain:port')

    https_only = parse('v1 v2 --https-proxy https://user:pass@domain:port')
    self.assertEqual(https_only.https_proxy, 'https://user:pass@domain:port')

    both = parse(
        'v1 v2 --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'
    )
    self.assertEqual(both.http_proxy, 'http://user:pass@domain:port')
    self.assertEqual(both.https_proxy, 'https://user:pass@domain:port')

    neither = parse('v1 v2')
    self.assertEqual(neither.http_proxy, '')
    self.assertEqual(neither.https_proxy, '')
def test_run(self):
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
@ -57,7 +113,8 @@ class TestYouTubeTranscriptCli(TestCase):
YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
['v1', 'v2'],
languages=['de', 'en'],
continue_after_error=True
continue_after_error=True,
proxies=None
)
def test_run__json_output(self):
@ -66,3 +123,15 @@ class TestYouTubeTranscriptCli(TestCase):
# will fail if output is not valid json
json.loads(output)
def test_run__proxies(self):
    """
    Checks that run() turns the --http-proxy/--https-proxy options into a proxies dictionary and hands it
    to YouTubeTranscriptApi.get_transcripts.
    """
    command_line = (
        'v1 v2 --languages de en --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'
    )
    expected_proxies = {'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'}

    # replace the API call, so that only the arguments built by the CLI are inspected
    YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
    cli = YouTubeTranscriptCli(command_line.split())
    cli.run()

    YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
        ['v1', 'v2'],
        languages=['de', 'en'],
        continue_after_error=True,
        proxies=expected_proxies
    )