added ability to create consent cookie

This commit is contained in:
Jonas Depoix 2021-03-31 15:59:57 +02:00
parent c90cf16484
commit 9251be8462
7 changed files with 395 additions and 11 deletions

View File

@ -10,5 +10,6 @@ from ._errors import (
TranslationLanguageNotAvailable, TranslationLanguageNotAvailable,
NoTranscriptAvailable, NoTranscriptAvailable,
CookiePathInvalid, CookiePathInvalid,
CookiesInvalid CookiesInvalid,
FailedToCreateConsentCookie,
) )

View File

@ -129,12 +129,11 @@ class YouTubeTranscriptApi(object):
@classmethod @classmethod
def _load_cookies(cls, cookies, video_id): def _load_cookies(cls, cookies, video_id):
cookie_jar = {}
try: try:
cookie_jar = cookiejar.MozillaCookieJar() cookie_jar = cookiejar.MozillaCookieJar()
cookie_jar.load(cookies) cookie_jar.load(cookies)
except CookieLoadError:
raise CookiePathInvalid(video_id)
if not cookie_jar: if not cookie_jar:
raise CookiesInvalid(video_id) raise CookiesInvalid(video_id)
return cookie_jar return cookie_jar
except CookieLoadError:
raise CookiePathInvalid(video_id)

View File

@ -40,10 +40,15 @@ class VideoUnavailable(CouldNotRetrieveTranscript):
class TooManyRequests(CouldNotRetrieveTranscript): class TooManyRequests(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\ CAUSE_MESSAGE = (
- Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\ 'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
'One of the following things can be done to work around this:\n\
- Manually solve the captcha in a browser and export the cookie. '
'Read here how to use that cookie with '
'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
- Use a different IP address\n\ - Use a different IP address\n\
- Wait until the ban on your IP has been lifted") - Wait until the ban on your IP has been lifted'
)
class TranscriptsDisabled(CouldNotRetrieveTranscript): class TranscriptsDisabled(CouldNotRetrieveTranscript):
@ -70,6 +75,10 @@ class CookiesInvalid(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)' CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
    """Error for the case where no usable ``CONSENT`` cookie could be created automatically."""

    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
class NoTranscriptFound(CouldNotRetrieveTranscript): class NoTranscriptFound(CouldNotRetrieveTranscript):
CAUSE_MESSAGE = ( CAUSE_MESSAGE = (
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n' 'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'

View File

@ -20,6 +20,7 @@ from ._errors import (
NotTranslatable, NotTranslatable,
TranslationLanguageNotAvailable, TranslationLanguageNotAvailable,
NoTranscriptAvailable, NoTranscriptAvailable,
FailedToCreateConsentCookie,
) )
from ._settings import WATCH_URL from ._settings import WATCH_URL
@ -32,7 +33,7 @@ class TranscriptListFetcher(object):
return TranscriptList.build( return TranscriptList.build(
self._http_client, self._http_client,
video_id, video_id,
self._extract_captions_json(self._fetch_html(video_id), video_id) self._extract_captions_json(self._fetch_video_html(video_id), video_id)
) )
def _extract_captions_json(self, html, video_id): def _extract_captions_json(self, html, video_id):
@ -55,6 +56,21 @@ class TranscriptListFetcher(object):
return captions_json return captions_json
def _create_consent_cookie(self, html, video_id):
    """
    Set a ``CONSENT`` cookie on the HTTP client, derived from a consent page.

    The value of the first ``name="v" value="..."`` attribute pair found in
    ``html`` is prefixed with ``'YES+'`` and stored as the ``CONSENT`` cookie
    for the domain ``.youtube.com``.

    :param html: HTML of the page that contains the consent form
    :param video_id: ID of the requested video; only passed on to the exception
    :raises FailedToCreateConsentCookie: if ``html`` contains no such attribute pair
    """
    # Raw string so the pattern stays literal if escapes are ever added to it.
    match = re.search(r'name="v" value="(.*?)"', html)
    if match is None:
        raise FailedToCreateConsentCookie(video_id)
    self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
def _fetch_video_html(self, video_id):
    """
    Fetch the HTML of the watch page, getting past the consent form if needed.

    If the first response contains the consent form, a consent cookie is
    created from it and the page is requested exactly one more time.

    :param video_id: ID of the video whose watch page is requested
    :return: the HTML as returned by ``_fetch_html``
    :raises FailedToCreateConsentCookie: if the cookie cannot be created, or if
        the consent form is still present after the second request
    """
    consent_form_marker = 'action="https://consent.youtube.com/s"'

    html = self._fetch_html(video_id)
    if consent_form_marker not in html:
        return html

    self._create_consent_cookie(html, video_id)
    html = self._fetch_html(video_id)
    # Still being served the consent form means the cookie was not accepted.
    if consent_form_marker in html:
        raise FailedToCreateConsentCookie(video_id)
    return html
def _fetch_html(self, video_id): def _fetch_html(self, video_id):
return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace( return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace(
'\\u0026', '&' '\\u0026', '&'

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -17,7 +17,8 @@ from youtube_transcript_api import (
NotTranslatable, NotTranslatable,
TranslationLanguageNotAvailable, TranslationLanguageNotAvailable,
CookiePathInvalid, CookiePathInvalid,
CookiesInvalid CookiesInvalid,
FailedToCreateConsentCookie,
) )
@ -44,6 +45,7 @@ class TestYouTubeTranscriptApi(TestCase):
) )
def tearDown(self): def tearDown(self):
httpretty.reset()
httpretty.disable() httpretty.disable()
def test_get_transcript(self): def test_get_transcript(self):
@ -125,6 +127,43 @@ class TestYouTubeTranscriptApi(TestCase):
self.assertEqual(len(query_string['lang']), 1) self.assertEqual(len(query_string['lang']), 1)
self.assertEqual(query_string['lang'][0], 'en') self.assertEqual(query_string['lang'][0], 'en')
def test_get_transcript__create_consent_cookie_if_needed(self):
    # Serve the consent page for the watch URL, then check that the follow-up
    # requests carry the CONSENT cookie derived from that page.
    httpretty.register_uri(
        httpretty.GET,
        'https://www.youtube.com/watch',
        body=load_asset('youtube_consent_page.html.static'),
    )

    YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')

    requests_made = httpretty.latest_requests()
    # NOTE(review): presumably consent page + repeated watch request + transcript
    # request; the later responses depend on registrations outside this test.
    self.assertEqual(len(requests_made), 3)
    # The first request is made before the cookie exists, so it is skipped.
    for request in requests_made[1:]:
        self.assertEqual(request.headers['cookie'], 'CONSENT=YES+cb.20210328-17-p0.de+FX+119')
def test_get_transcript__exception_if_create_consent_cookie_failed(self):
    # The same URI is registered twice on purpose.
    # NOTE(review): this relies on httpretty queueing the responses of repeated
    # registrations, so that the retry after setting the cookie is answered
    # with the consent page as well -- confirm against the httpretty version in use.
    for _ in range(2):
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/watch',
            body=load_asset('youtube_consent_page.html.static'),
        )

    with self.assertRaises(FailedToCreateConsentCookie):
        YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
def test_get_transcript__exception_if_consent_cookie_age_invalid(self):
    # NOTE(review): the "invalid" asset presumably lacks a usable `v` form value,
    # so no cookie can be derived from it -- verify against the asset file.
    invalid_consent_page = load_asset('youtube_consent_page_invalid.html.static')
    httpretty.register_uri(
        httpretty.GET,
        'https://www.youtube.com/watch',
        body=invalid_consent_page,
    )

    with self.assertRaises(FailedToCreateConsentCookie):
        YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
def test_get_transcript__exception_if_video_unavailable(self): def test_get_transcript__exception_if_video_unavailable(self):
httpretty.register_uri( httpretty.register_uri(
httpretty.GET, httpretty.GET,