added ability to create consent cookie
This commit is contained in:
parent
c90cf16484
commit
9251be8462
|
@ -10,5 +10,6 @@ from ._errors import (
|
||||||
TranslationLanguageNotAvailable,
|
TranslationLanguageNotAvailable,
|
||||||
NoTranscriptAvailable,
|
NoTranscriptAvailable,
|
||||||
CookiePathInvalid,
|
CookiePathInvalid,
|
||||||
CookiesInvalid
|
CookiesInvalid,
|
||||||
|
FailedToCreateConsentCookie,
|
||||||
)
|
)
|
||||||
|
|
|
@ -129,12 +129,11 @@ class YouTubeTranscriptApi(object):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _load_cookies(cls, cookies, video_id):
|
def _load_cookies(cls, cookies, video_id):
|
||||||
cookie_jar = {}
|
|
||||||
try:
|
try:
|
||||||
cookie_jar = cookiejar.MozillaCookieJar()
|
cookie_jar = cookiejar.MozillaCookieJar()
|
||||||
cookie_jar.load(cookies)
|
cookie_jar.load(cookies)
|
||||||
|
if not cookie_jar:
|
||||||
|
raise CookiesInvalid(video_id)
|
||||||
|
return cookie_jar
|
||||||
except CookieLoadError:
|
except CookieLoadError:
|
||||||
raise CookiePathInvalid(video_id)
|
raise CookiePathInvalid(video_id)
|
||||||
if not cookie_jar:
|
|
||||||
raise CookiesInvalid(video_id)
|
|
||||||
return cookie_jar
|
|
||||||
|
|
|
@ -40,10 +40,15 @@ class VideoUnavailable(CouldNotRetrieveTranscript):
|
||||||
|
|
||||||
|
|
||||||
class TooManyRequests(CouldNotRetrieveTranscript):
|
class TooManyRequests(CouldNotRetrieveTranscript):
|
||||||
CAUSE_MESSAGE = ("YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. One of the following things can be done to work around this:\n\
|
CAUSE_MESSAGE = (
|
||||||
- Manually solve the captcha in a browser and export the cookie. Read here how to use that cookie with youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
|
'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
|
||||||
- Use a different IP address\n\
|
'One of the following things can be done to work around this:\n\
|
||||||
- Wait until the ban on your IP has been lifted")
|
- Manually solve the captcha in a browser and export the cookie. '
|
||||||
|
'Read here how to use that cookie with '
|
||||||
|
'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n\
|
||||||
|
- Use a different IP address\n\
|
||||||
|
- Wait until the ban on your IP has been lifted'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TranscriptsDisabled(CouldNotRetrieveTranscript):
|
class TranscriptsDisabled(CouldNotRetrieveTranscript):
|
||||||
|
@ -70,6 +75,10 @@ class CookiesInvalid(CouldNotRetrieveTranscript):
|
||||||
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
|
CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
|
||||||
|
|
||||||
|
|
||||||
|
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
|
||||||
|
CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
|
||||||
|
|
||||||
|
|
||||||
class NoTranscriptFound(CouldNotRetrieveTranscript):
|
class NoTranscriptFound(CouldNotRetrieveTranscript):
|
||||||
CAUSE_MESSAGE = (
|
CAUSE_MESSAGE = (
|
||||||
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
|
'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
|
||||||
|
|
|
@ -20,6 +20,7 @@ from ._errors import (
|
||||||
NotTranslatable,
|
NotTranslatable,
|
||||||
TranslationLanguageNotAvailable,
|
TranslationLanguageNotAvailable,
|
||||||
NoTranscriptAvailable,
|
NoTranscriptAvailable,
|
||||||
|
FailedToCreateConsentCookie,
|
||||||
)
|
)
|
||||||
from ._settings import WATCH_URL
|
from ._settings import WATCH_URL
|
||||||
|
|
||||||
|
@ -32,7 +33,7 @@ class TranscriptListFetcher(object):
|
||||||
return TranscriptList.build(
|
return TranscriptList.build(
|
||||||
self._http_client,
|
self._http_client,
|
||||||
video_id,
|
video_id,
|
||||||
self._extract_captions_json(self._fetch_html(video_id), video_id)
|
self._extract_captions_json(self._fetch_video_html(video_id), video_id)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _extract_captions_json(self, html, video_id):
|
def _extract_captions_json(self, html, video_id):
|
||||||
|
@ -55,6 +56,21 @@ class TranscriptListFetcher(object):
|
||||||
|
|
||||||
return captions_json
|
return captions_json
|
||||||
|
|
||||||
|
def _create_consent_cookie(self, html, video_id):
|
||||||
|
match = re.search('name="v" value="(.*?)"', html)
|
||||||
|
if match is None:
|
||||||
|
raise FailedToCreateConsentCookie(video_id)
|
||||||
|
self._http_client.cookies.set('CONSENT', 'YES+' + match.group(1), domain='.youtube.com')
|
||||||
|
|
||||||
|
def _fetch_video_html(self, video_id):
|
||||||
|
html = self._fetch_html(video_id)
|
||||||
|
if 'action="https://consent.youtube.com/s"' in html:
|
||||||
|
self._create_consent_cookie(html, video_id)
|
||||||
|
html = self._fetch_html(video_id)
|
||||||
|
if 'action="https://consent.youtube.com/s"' in html:
|
||||||
|
raise FailedToCreateConsentCookie(video_id)
|
||||||
|
return html
|
||||||
|
|
||||||
def _fetch_html(self, video_id):
|
def _fetch_html(self, video_id):
|
||||||
return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace(
|
return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace(
|
||||||
'\\u0026', '&'
|
'\\u0026', '&'
|
||||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -17,7 +17,8 @@ from youtube_transcript_api import (
|
||||||
NotTranslatable,
|
NotTranslatable,
|
||||||
TranslationLanguageNotAvailable,
|
TranslationLanguageNotAvailable,
|
||||||
CookiePathInvalid,
|
CookiePathInvalid,
|
||||||
CookiesInvalid
|
CookiesInvalid,
|
||||||
|
FailedToCreateConsentCookie,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,6 +45,7 @@ class TestYouTubeTranscriptApi(TestCase):
|
||||||
)
|
)
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
|
httpretty.reset()
|
||||||
httpretty.disable()
|
httpretty.disable()
|
||||||
|
|
||||||
def test_get_transcript(self):
|
def test_get_transcript(self):
|
||||||
|
@ -125,6 +127,43 @@ class TestYouTubeTranscriptApi(TestCase):
|
||||||
self.assertEqual(len(query_string['lang']), 1)
|
self.assertEqual(len(query_string['lang']), 1)
|
||||||
self.assertEqual(query_string['lang'][0], 'en')
|
self.assertEqual(query_string['lang'][0], 'en')
|
||||||
|
|
||||||
|
def test_get_transcript__create_consent_cookie_if_needed(self):
|
||||||
|
httpretty.register_uri(
|
||||||
|
httpretty.GET,
|
||||||
|
'https://www.youtube.com/watch',
|
||||||
|
body=load_asset('youtube_consent_page.html.static')
|
||||||
|
)
|
||||||
|
|
||||||
|
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
|
||||||
|
self.assertEqual(len(httpretty.latest_requests()), 3)
|
||||||
|
for request in httpretty.latest_requests()[1:]:
|
||||||
|
self.assertEqual(request.headers['cookie'], 'CONSENT=YES+cb.20210328-17-p0.de+FX+119')
|
||||||
|
|
||||||
|
def test_get_transcript__exception_if_create_consent_cookie_failed(self):
|
||||||
|
httpretty.register_uri(
|
||||||
|
httpretty.GET,
|
||||||
|
'https://www.youtube.com/watch',
|
||||||
|
body=load_asset('youtube_consent_page.html.static')
|
||||||
|
)
|
||||||
|
httpretty.register_uri(
|
||||||
|
httpretty.GET,
|
||||||
|
'https://www.youtube.com/watch',
|
||||||
|
body=load_asset('youtube_consent_page.html.static')
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(FailedToCreateConsentCookie):
|
||||||
|
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
|
||||||
|
|
||||||
|
def test_get_transcript__exception_if_consent_cookie_age_invalid(self):
|
||||||
|
httpretty.register_uri(
|
||||||
|
httpretty.GET,
|
||||||
|
'https://www.youtube.com/watch',
|
||||||
|
body=load_asset('youtube_consent_page_invalid.html.static')
|
||||||
|
)
|
||||||
|
|
||||||
|
with self.assertRaises(FailedToCreateConsentCookie):
|
||||||
|
YouTubeTranscriptApi.get_transcript('F1xioXWb8CY')
|
||||||
|
|
||||||
def test_get_transcript__exception_if_video_unavailable(self):
|
def test_get_transcript__exception_if_video_unavailable(self):
|
||||||
httpretty.register_uri(
|
httpretty.register_uri(
|
||||||
httpretty.GET,
|
httpretty.GET,
|
||||||
|
|
Loading…
Reference in New Issue