Merge pull request #46 from danielcliu/feature/issue-45-use-authentication-cookies

Feature/issue 45 use authentication cookies
This commit is contained in:
jdepoix 2020-01-31 11:29:25 +01:00 committed by GitHub
commit 6da4d19978
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 188 additions and 49 deletions

View File

@ -213,8 +213,28 @@ Using the CLI:
``` ```
youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://user:pass@domain:port --https-proxy https://user:pass@domain:port
``` ```
## Cookies
Some videos are age-restricted, so this module won't be able to access them without some sort of authentication. To authenticate, you need to have access to the desired video in a browser and then download that page's cookies into a text file. You can use the Chrome extension [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg?hl=en) or the Firefox extension [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/).
Once you have that file, pass its path to the module to access the captions of age-restricted videos, like so:
```python
from youtube_transcript_api import YouTubeTranscriptApi
YouTubeTranscriptApi.get_transcript(video_id, cookies='/path/to/your/cookies.txt')
YouTubeTranscriptApi.get_transcripts([video_id], cookies='/path/to/your/cookies.txt')
```
Using the CLI:
```
youtube_transcript_api <first_video_id> <second_video_id> --cookies /path/to/your/cookies.txt
```
## Warning ## Warning
@ -224,4 +244,4 @@ youtube_transcript_api <first_video_id> <second_video_id> --http-proxy http://us
If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :) If this project makes you happy by reducing your development time, you can make me happy by treating me to a cup of coffee :)
[![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url) [![Donate](https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=BAENLEW8VUJ6G&source=url)

View File

@ -8,4 +8,6 @@ from ._errors import (
NotTranslatable, NotTranslatable,
TranslationLanguageNotAvailable, TranslationLanguageNotAvailable,
NoTranscriptAvailable, NoTranscriptAvailable,
CookiePathInvalid,
CookiesInvalid
) )

View File

@ -1,11 +1,21 @@
import requests import requests
try:
import http.cookiejar as cookiejar
CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
except ImportError:
import cookielib as cookiejar
CookieLoadError = IOError
from ._transcripts import TranscriptListFetcher from ._transcripts import TranscriptListFetcher
from ._errors import (
CookiePathInvalid,
CookiesInvalid
)
class YouTubeTranscriptApi(): class YouTubeTranscriptApi():
@classmethod @classmethod
def list_transcripts(cls, video_id, proxies=None): def list_transcripts(cls, video_id, proxies=None, cookies=None):
""" """
Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
@ -48,15 +58,19 @@ class YouTubeTranscriptApi():
:type video_id: str :type video_id: str
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:return: the list of available transcripts :return: the list of available transcripts
:rtype TranscriptList: :rtype TranscriptList:
""" """
with requests.Session() as http_client: with requests.Session() as http_client:
if cookies:
http_client.cookies = cls._load_cookies(cookies, video_id)
http_client.proxies = proxies if proxies else {} http_client.proxies = proxies if proxies else {}
return TranscriptListFetcher(http_client).fetch(video_id) return TranscriptListFetcher(http_client).fetch(video_id)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None): def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -71,6 +85,8 @@ class YouTubeTranscriptApi():
:type continue_after_error: bool :type continue_after_error: bool
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
@ -80,7 +96,7 @@ class YouTubeTranscriptApi():
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = cls.get_transcript(video_id, languages, proxies) data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception
@ -90,7 +106,7 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos return data, unretrievable_videos
@classmethod @classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None): def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
""" """
Retrieves the transcript for a single video. This is just a shortcut for calling:: Retrieves the transcript for a single video. This is just a shortcut for calling::
@ -104,7 +120,21 @@ class YouTubeTranscriptApi():
:type languages: list[str] :type languages: list[str]
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
return cls.list_transcripts(video_id, proxies).find_transcript(languages).fetch() return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()
@classmethod
def _load_cookies(cls, cookies, video_id):
    """
    Loads the cookies stored in a cookies.txt file into a cookie jar, which can be assigned to a requests session.

    :param cookies: a string of the path to a text file containing youtube authorization cookies
    :type cookies: str
    :param video_id: the id of the video the cookies are loaded for; only used to build the raised errors
    :type video_id: str
    :return: the cookie jar containing the loaded cookies
    :rtype MozillaCookieJar:
    :raises CookiePathInvalid: if the file could not be read or parsed
    :raises CookiesInvalid: if no cookies are left in the jar after loading the file
    """
    cookie_jar = cookiejar.MozillaCookieJar()
    try:
        cookie_jar.load(cookies)
    except (CookieLoadError, EnvironmentError):
        # EnvironmentError (an alias of OSError on Python 3) additionally covers paths which exist, but can't be
        # read (e.g. a directory or a file without read permission), so they don't leak as raw OS errors.
        raise CookiePathInvalid(video_id)

    # A jar without any cookies is falsy. The error message suggests this typically means the cookies have expired.
    if not cookie_jar:
        raise CookiesInvalid(video_id)

    return cookie_jar

View File

@ -21,12 +21,14 @@ class YouTubeTranscriptCli():
if parsed_args.http_proxy != '' or parsed_args.https_proxy != '': if parsed_args.http_proxy != '' or parsed_args.https_proxy != '':
proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy} proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}
cookies = parsed_args.cookies
transcripts = [] transcripts = []
exceptions = [] exceptions = []
for video_id in parsed_args.video_ids: for video_id in parsed_args.video_ids:
try: try:
transcripts.append(self._fetch_transcript(parsed_args, proxies, video_id)) transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
except Exception as exception: except Exception as exception:
exceptions.append(exception) exceptions.append(exception)
@ -35,8 +37,8 @@ class YouTubeTranscriptCli():
+ ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else []) + ([json.dumps(transcripts) if parsed_args.json else pprint.pformat(transcripts)] if transcripts else [])
) )
def _fetch_transcript(self, parsed_args, proxies, video_id): def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies) transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)
if parsed_args.list_transcripts: if parsed_args.list_transcripts:
return str(transcript_list) return str(transcript_list)
@ -123,5 +125,10 @@ class YouTubeTranscriptCli():
metavar='URL', metavar='URL',
help='Use the specified HTTPS proxy.' help='Use the specified HTTPS proxy.'
) )
parser.add_argument(
'--cookies',
default=None,
help='The cookie file that will be used for authorization with youtube.'
)
return parser.parse_args(self._args) return parser.parse_args(self._args)

View File

@ -55,6 +55,14 @@ class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The requested translation language is not available' CAUSE_MESSAGE = 'The requested translation language is not available'
class CookiePathInvalid(CouldNotRetrieveTranscript):
    """Raised when the cookie file at the provided path could not be loaded."""

    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
class CookiesInvalid(CouldNotRetrieveTranscript):
    """Raised when the cookie file could be loaded, but no cookies are left in the jar afterwards."""

    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
class NoTranscriptFound(CouldNotRetrieveTranscript): class NoTranscriptFound(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = ( CAUSE_MESSAGE = (
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'

View File

@ -0,0 +1,9 @@
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
# This file can be used by wget, curl, aria2c and other standard compliant tools.
# Usage Examples:
# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
# 2) curl --cookie cookies.txt "https://www.youtube.com/"
# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
#
.example.com TRUE / TRUE 3594431874 TEST_FIELD TEST_VALUE
.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE

View File

@ -0,0 +1,8 @@
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
# This file can be used by wget, curl, aria2c and other standard compliant tools.
# Usage Examples:
# 1) wget -x --load-cookies cookies.txt "https://www.youtube.com/"
# 2) curl --cookie cookies.txt "https://www.youtube.com/"
# 3) aria2c --load-cookies cookies.txt "https://www.youtube.com/"
#
.example.com TRUE / TRUE 31874 BAD_TEST_FIELD BAD_TEST_VALUE

View File

@ -1,8 +1,10 @@
from unittest import TestCase from unittest import TestCase
from mock import MagicMock from mock import patch
import os import os
import requests
import httpretty import httpretty
from youtube_transcript_api import ( from youtube_transcript_api import (
@ -13,6 +15,8 @@ from youtube_transcript_api import (
NoTranscriptAvailable, NoTranscriptAvailable,
NotTranslatable, NotTranslatable,
TranslationLanguageNotAvailable, TranslationLanguageNotAvailable,
CookiePathInvalid,
CookiesInvalid
) )
@ -151,39 +155,24 @@ class TestYouTubeTranscriptApi(TestCase):
with self.assertRaises(NoTranscriptAvailable): with self.assertRaises(NoTranscriptAvailable):
YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E') YouTubeTranscriptApi.get_transcript('MwBPvcYFY2E')
def test_get_transcripts(self): def test_get_transcript__with_proxy(self):
video_id_1 = 'video_id_1'
video_id_2 = 'video_id_2'
languages = ['de', 'en']
YouTubeTranscriptApi.get_transcript = MagicMock()
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, languages, None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, languages, None)
self.assertEqual(YouTubeTranscriptApi.get_transcript.call_count, 2)
def test_get_transcripts__stop_on_error(self):
YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
with self.assertRaises(Exception):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])
def test_get_transcripts__continue_on_error(self):
video_id_1 = 'video_id_1'
video_id_2 = 'video_id_2'
YouTubeTranscriptApi.get_transcript = MagicMock(side_effect=Exception('Error'))
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_1, ('en',), None)
YouTubeTranscriptApi.get_transcript.assert_any_call(video_id_2, ('en',), None)
def test_get_transcript__with_proxies(self):
proxies = {'http': '', 'https:': ''} proxies = {'http': '', 'https:': ''}
transcript = YouTubeTranscriptApi.get_transcript( transcript = YouTubeTranscriptApi.get_transcript(
'GJLlxj_dtq8', proxies=proxies 'GJLlxj_dtq8', proxies=proxies
) )
self.assertEqual(
transcript,
[
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
]
)
def test_get_transcript__with_cookies(self):
dirname, filename = os.path.split(os.path.abspath(__file__))
cookies = dirname + '/example_cookies.txt'
transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookies)
self.assertEqual( self.assertEqual(
transcript, transcript,
@ -193,6 +182,59 @@ class TestYouTubeTranscriptApi(TestCase):
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
] ]
) )
YouTubeTranscriptApi.get_transcript = MagicMock()
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts(self, mock_get_transcript):
    # get_transcripts() is expected to delegate to get_transcript() exactly once per requested video id
    requested_ids = ['video_id_1', 'video_id_2']
    languages = ['de', 'en']

    YouTubeTranscriptApi.get_transcripts(requested_ids, languages=languages)

    for requested_id in requested_ids:
        mock_get_transcript.assert_any_call(requested_id, languages, None, None)
    self.assertEqual(mock_get_transcript.call_count, 2)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
def test_get_transcripts__stop_on_error(self, mock_get_transcript):
    # continue_after_error is left at its default, so the error raised by get_transcript() has to propagate
    requested_ids = ['video_id_1', 'video_id_2']

    with self.assertRaises(Exception):
        YouTubeTranscriptApi.get_transcripts(requested_ids)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
def test_get_transcripts__continue_on_error(self, mock_get_transcript):
    # with continue_after_error=True every video id has to be attempted, although each attempt fails
    requested_ids = ['video_id_1', 'video_id_2']

    YouTubeTranscriptApi.get_transcripts(requested_ids, continue_after_error=True)

    for requested_id in requested_ids:
        mock_get_transcript.assert_any_call(requested_id, ('en',), None, None)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_cookies(self, mock_get_transcript):
    # the cookies path has to be handed through to get_transcript() unchanged
    cookie_path = '/example_cookies.txt'
    requested_id = 'GJLlxj_dtq8'

    YouTubeTranscriptApi.get_transcripts([requested_id], cookies=cookie_path)

    mock_get_transcript.assert_any_call(requested_id, ('en',), None, cookie_path)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_proxies(self, mock_get_transcript):
proxies = {'http': '', 'https:': ''}
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
YouTubeTranscriptApi.get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies) mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None)
def test_load_cookies(self):
    # the fixture file lives next to this test module
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    cookie_path = tests_dir + '/example_cookies.txt'

    loaded_jar = YouTubeTranscriptApi._load_cookies(cookie_path, 'GJLlxj_dtq8')

    # only the TEST_FIELD cookie is expected to end up in the jar
    self.assertEqual(
        {'TEST_FIELD': 'TEST_VALUE'},
        requests.utils.dict_from_cookiejar(loaded_jar)
    )
def test_load_cookies__bad_file_path(self):
    # a path which doesn't point to an existing file has to be reported as CookiePathInvalid
    missing_cookie_path = 'nonexistent_cookies.txt'

    with self.assertRaises(CookiePathInvalid):
        YouTubeTranscriptApi._load_cookies(missing_cookie_path, 'GJLlxj_dtq8')
def test_load_cookies__no_valid_cookies(self):
    # the fixture file lives next to this test module
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    expired_cookie_path = tests_dir + '/expired_example_cookies.txt'

    # a readable file which yields no usable cookies has to be reported as CookiesInvalid
    with self.assertRaises(CookiesInvalid):
        YouTubeTranscriptApi._load_cookies(expired_cookie_path, 'GJLlxj_dtq8')

View File

@ -164,8 +164,8 @@ class TestYouTubeTranscriptCli(TestCase):
def test_run(self): def test_run(self):
YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run() YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en']) self.transcript_list_mock.find_transcript.assert_any_call(['de', 'en'])
@ -200,8 +200,8 @@ class TestYouTubeTranscriptCli(TestCase):
def test_run__list_transcripts(self): def test_run__list_transcripts(self):
YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run() YouTubeTranscriptCli('--list-transcripts v1 v2'.split()).run()
YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None) YouTubeTranscriptApi.list_transcripts.assert_any_call('v1', proxies=None, cookies=None)
YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None) YouTubeTranscriptApi.list_transcripts.assert_any_call('v2', proxies=None, cookies=None)
def test_run__json_output(self): def test_run__json_output(self):
output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run() output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()
@ -220,10 +220,23 @@ class TestYouTubeTranscriptCli(TestCase):
YouTubeTranscriptApi.list_transcripts.assert_any_call( YouTubeTranscriptApi.list_transcripts.assert_any_call(
'v1', 'v1',
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
cookies= None
) )
YouTubeTranscriptApi.list_transcripts.assert_any_call( YouTubeTranscriptApi.list_transcripts.assert_any_call(
'v2', 'v2',
proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'} proxies={'http': 'http://user:pass@domain:port', 'https': 'https://user:pass@domain:port'},
cookies=None
) )
def test_run__cookies(self):
    # the value of --cookies has to be passed on to list_transcripts() for every requested video
    cli_args = 'v1 v2 --languages de en --cookies blahblah.txt'.split()

    YouTubeTranscriptCli(cli_args).run()

    for video_id in ('v1', 'v2'):
        YouTubeTranscriptApi.list_transcripts.assert_any_call(video_id, proxies=None, cookies='blahblah.txt')