Rebased on PR #11 and added tests
This commit is contained in:
parent
4a564743df
commit
86cd1666c0
25
README.md
25
README.md
|
@ -1,11 +1,5 @@
|
||||||
# YouTube Transcript/Subtitle API (including automatically generated subtitles)
|
# YouTube Transcript/Subtitle API (including automatically generated subtitles)
|
||||||
|
|
||||||
[](https://travis-ci.org/jdepoix/youtube-transcript-api)
|
|
||||||
[](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master)
|
|
||||||
[](http://opensource.org/licenses/MIT)
|
|
||||||
[](https://pypi.org/project/youtube-transcript-api/)
|
|
||||||
[](https://pypi.org/project/youtube-transcript-api/)
|
|
||||||
|
|
||||||
This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!
|
This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
@ -90,6 +84,25 @@ If you would prefer to write it into a file or pipe it into another application,
|
||||||
youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json
|
youtube_transcript_api <first_video_id> <second_video_id> ... --languages de en --json > transcripts.json
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Proxy
|
||||||
|
|
||||||
|
You can pass a proxy to use during the network requests
|
||||||
|
|
||||||
|
Code:
|
||||||
|
```python
|
||||||
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
|
YouTubeTranscriptApi.get_transcript(video_id, proxies={"http": "http://user:pass@domain:port", "https": "https://user:pass@domain:port"})
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
CLI:
|
||||||
|
```
|
||||||
|
youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port
|
||||||
|
```
|
||||||
|
|
||||||
|
Find out more about using proxies and the type of proxies you can use here: http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
|
|
||||||
## Warning
|
## Warning
|
||||||
|
|
||||||
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to get things working again as soon as possible if that happens. So if it stops working, let me know!
|
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to get things working again as soon as possible if that happens. So if it stops working, let me know!
|
|
@ -38,7 +38,7 @@ class YouTubeTranscriptApi():
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False):
|
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
|
||||||
"""
|
"""
|
||||||
Retrieves the transcripts for a list of videos.
|
Retrieves the transcripts for a list of videos.
|
||||||
|
|
||||||
|
@ -55,13 +55,15 @@ class YouTubeTranscriptApi():
|
||||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||||
video ids, which could not be retrieved
|
video ids, which could not be retrieved
|
||||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
|
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
|
||||||
|
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||||
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
"""
|
"""
|
||||||
data = {}
|
data = {}
|
||||||
unretrievable_videos = []
|
unretrievable_videos = []
|
||||||
|
|
||||||
for video_id in video_ids:
|
for video_id in video_ids:
|
||||||
try:
|
try:
|
||||||
data[video_id] = cls.get_transcript(video_id, languages)
|
data[video_id] = cls.get_transcript(video_id, languages, proxies)
|
||||||
except Exception as exception:
|
except Exception as exception:
|
||||||
if not continue_after_error:
|
if not continue_after_error:
|
||||||
raise exception
|
raise exception
|
||||||
|
@ -71,7 +73,7 @@ class YouTubeTranscriptApi():
|
||||||
return data, unretrievable_videos
|
return data, unretrievable_videos
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_transcript(cls, video_id, languages=None):
|
def get_transcript(cls, video_id, languages=None, proxies=None):
|
||||||
"""
|
"""
|
||||||
Retrieves the transcript for a single video.
|
Retrieves the transcript for a single video.
|
||||||
|
|
||||||
|
@ -84,9 +86,11 @@ class YouTubeTranscriptApi():
|
||||||
:type languages: [str]
|
:type languages: [str]
|
||||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||||
:rtype: [{'text': str, 'start': float, 'end': float}]
|
:rtype: [{'text': str, 'start': float, 'end': float}]
|
||||||
|
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||||
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
|
return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse()
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.error(
|
logger.error(
|
||||||
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
|
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
|
||||||
|
@ -101,12 +105,16 @@ class _TranscriptFetcher():
|
||||||
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
||||||
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
|
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
|
||||||
|
|
||||||
def __init__(self, video_id, languages):
|
def __init__(self, video_id, languages, proxies):
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
self.languages = languages
|
self.languages = languages
|
||||||
|
self.proxies = proxies
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self):
|
||||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
if self.proxies:
|
||||||
|
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
|
||||||
|
else:
|
||||||
|
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
||||||
timedtext_url_start = fetched_site.find('timedtext')
|
timedtext_url_start = fetched_site.find('timedtext')
|
||||||
|
|
||||||
for language in (self.languages if self.languages else [None,]):
|
for language in (self.languages if self.languages else [None,]):
|
||||||
|
@ -128,7 +136,10 @@ class _TranscriptFetcher():
|
||||||
)
|
)
|
||||||
if language:
|
if language:
|
||||||
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
|
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
|
||||||
return requests.get(url).text
|
if self.proxies:
|
||||||
|
return requests.get(url, proxies=self.proxies).text
|
||||||
|
else:
|
||||||
|
return requests.get(url).text
|
||||||
|
|
||||||
|
|
||||||
class _TranscriptParser():
|
class _TranscriptParser():
|
||||||
|
|
|
@ -14,10 +14,13 @@ class YouTubeTranscriptCli():
|
||||||
def run(self):
|
def run(self):
|
||||||
parsed_args = self._parse_args()
|
parsed_args = self._parse_args()
|
||||||
|
|
||||||
|
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
|
||||||
|
|
||||||
transcripts, _ = YouTubeTranscriptApi.get_transcripts(
|
transcripts, _ = YouTubeTranscriptApi.get_transcripts(
|
||||||
parsed_args.video_ids,
|
parsed_args.video_ids,
|
||||||
languages=parsed_args.languages,
|
languages=parsed_args.languages,
|
||||||
continue_after_error=True
|
continue_after_error=True,
|
||||||
|
proxies=proxies
|
||||||
)
|
)
|
||||||
|
|
||||||
if parsed_args.json:
|
if parsed_args.json:
|
||||||
|
@ -53,5 +56,15 @@ class YouTubeTranscriptCli():
|
||||||
default=False,
|
default=False,
|
||||||
help='If this flag is set the output will be JSON formatted.',
|
help='If this flag is set the output will be JSON formatted.',
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--http-proxy', dest='http_proxy',
|
||||||
|
default='', metavar='URL',
|
||||||
|
help='Use the specified HTTP proxy.'
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--https-proxy', dest='https_proxy',
|
||||||
|
default='', metavar='URL',
|
||||||
|
help='Use the specified HTTPS proxy.'
|
||||||
|
)
|
||||||
|
|
||||||
return parser.parse_args(self._args)
|
return parser.parse_args(self._args)
|
||||||
|
|
|
@ -82,8 +82,8 @@ class TestYouTubeTranscriptApi(TestCase):
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
|
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages)
|
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
|
||||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages)
|
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
|
||||||
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
|
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
|
||||||
|
|
||||||
def test_get_transcripts__stop_on_error(self):
|
def test_get_transcripts__stop_on_error(self):
|
||||||
|
@ -99,5 +99,19 @@ class TestYouTubeTranscriptApi(TestCase):
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
|
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
|
||||||
|
|
||||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None)
|
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, None, None)
|
||||||
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None)
|
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, None, None)
|
||||||
|
|
||||||
|
def test_get_transcript__with_proxies(self):
|
||||||
|
transcript = YouTubeTranscriptApi.get_transcript(
|
||||||
|
'GJLlxj_dtq8', proxies={'http': '', 'https:': ''}
|
||||||
|
)
|
||||||
|
|
||||||
|
self.assertEqual(
|
||||||
|
transcript,
|
||||||
|
[
|
||||||
|
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
|
||||||
|
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
|
||||||
|
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
|
@ -23,6 +23,31 @@ class TestYouTubeTranscriptCli(TestCase):
|
||||||
self.assertEqual(parsed_args.json, True)
|
self.assertEqual(parsed_args.json, True)
|
||||||
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli(
|
||||||
|
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port'.split()
|
||||||
|
)._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertEqual(parsed_args.json, True)
|
||||||
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
|
||||||
|
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli(
|
||||||
|
'v1 v2 --languages de en --json --http-proxy http://user:pass@domain:port'.split()
|
||||||
|
)._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertEqual(parsed_args.json, True)
|
||||||
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli(
|
||||||
|
'v1 v2 --languages de en --json --https-proxy https://user:pass@domain:port'.split()
|
||||||
|
)._parse_args()
|
||||||
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
self.assertEqual(parsed_args.json, True)
|
||||||
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
|
||||||
|
|
||||||
def test_argument_parsing__only_video_ids(self):
|
def test_argument_parsing__only_video_ids(self):
|
||||||
parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
|
parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
|
||||||
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
|
||||||
|
@ -50,6 +75,17 @@ class TestYouTubeTranscriptCli(TestCase):
|
||||||
self.assertEqual(parsed_args.json, False)
|
self.assertEqual(parsed_args.json, False)
|
||||||
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
self.assertEqual(parsed_args.languages, ['de', 'en'])
|
||||||
|
|
||||||
|
def test_argument_parsing__proxies(self):
|
||||||
|
parsed_args = YouTubeTranscriptCli(
|
||||||
|
'v1 v2 --http-proxy http://user:pass@domain:port'.split()
|
||||||
|
)._parse_args()
|
||||||
|
self.assertEqual(parsed_args.http_proxy, 'http://user:pass@domain:port')
|
||||||
|
|
||||||
|
parsed_args = YouTubeTranscriptCli(
|
||||||
|
'v1 v2 --https-proxy https://user:pass@domain:port'.split()
|
||||||
|
)._parse_args()
|
||||||
|
self.assertEqual(parsed_args.https_proxy, 'https://user:pass@domain:port')
|
||||||
|
|
||||||
def test_run(self):
|
def test_run(self):
|
||||||
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
|
YouTubeTranscriptApi.get_transcripts = MagicMock(return_value=([], []))
|
||||||
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
|
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
|
||||||
|
@ -57,7 +93,8 @@ class TestYouTubeTranscriptCli(TestCase):
|
||||||
YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
|
YouTubeTranscriptApi.get_transcripts.assert_called_once_with(
|
||||||
['v1', 'v2'],
|
['v1', 'v2'],
|
||||||
languages=['de', 'en'],
|
languages=['de', 'en'],
|
||||||
continue_after_error=True
|
continue_after_error=True,
|
||||||
|
proxies={"http": "", "https": ""}
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_run__json_output(self):
|
def test_run__json_output(self):
|
||||||
|
|
Loading…
Reference in New Issue