refactored the way transcript information is retrieved and thereby improved error messages

2019-12-08 14:40:57 +01:00 · 2019-12-08 14:40:57 +01:00 · df417be915
parent 54ef72fafd
commit df417be915
8 changed files with 3258 additions and 118 deletions
--- a/youtube_transcript_api/__init__.py
+++ b/youtube_transcript_api/__init__.py
@ -1 +1,3 @@
 from ._api import YouTubeTranscriptApi
+from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript
+from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
--- a/youtube_transcript_api/_api.py
+++ b/youtube_transcript_api/_api.py
@ -1,44 +1,9 @@
-import sys
-
-# This can only be tested by using different python versions, therefore it is not covered by coverage.py
-if sys.version_info.major == 2: # pragma: no cover
-    reload(sys)
-    sys.setdefaultencoding('utf-8')
-
-from xml.etree import ElementTree
-
-import re
-
 import requests

-from ._html_unescaping import unescape
+from ._transcripts import TranscriptDataFetcher


 class YouTubeTranscriptApi():
-    class CouldNotRetrieveTranscript(Exception):
-        """
-        Raised if a transcript could not be retrieved.
-        """
-
-        ERROR_MESSAGE = (
-            'Could not get the transcript for the video {video_url}! '
-            'This usually happens if one of the following things is the case:\n'
-            ' - subtitles have been disabled by the uploader\n'
-            ' - none of the language codes you provided are valid\n'
-            ' - none of the languages you provided are supported by the video\n'
-            ' - the video is no longer available.\n\n'
-            'If none of these things is the case, please create an issue at '
-            'https://github.com/jdepoix/youtube-transcript-api/issues.'
-            'Please add which version of youtube_transcript_api you are using and make sure that there '
-            'are no open issues which already describe your problem!'
-        )
-
-        def __init__(self, video_id):
-            super(YouTubeTranscriptApi.CouldNotRetrieveTranscript, self).__init__(
-                self.ERROR_MESSAGE.format(video_url=_TranscriptFetcher.WATCH_URL.format(video_id=video_id))
-            )
-            self.video_id = video_id
-
    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None):
        """
@ -47,7 +12,7 @@ class YouTubeTranscriptApi():
        :param video_ids: a list of youtube video ids
        :type video_ids: [str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
-        it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to
+        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
        play around with the language codes a bit, to find the one which is working for you!
        :type languages: [str]
@ -91,78 +56,6 @@ class YouTubeTranscriptApi():
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype: [{'text': str, 'start': float, 'duration': float}]
        """
-        try:
-            return _TranscriptParser(_TranscriptFetcher(video_id, languages, proxies).fetch()).parse()
-        except Exception:
-            raise YouTubeTranscriptApi.CouldNotRetrieveTranscript(video_id)
-
-
-class _TranscriptFetcher():
-    WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
-    API_BASE_URL = 'https://www.youtube.com/api/'
-    TIMEDTEXT_STRING = 'timedtext?v='
-    NAME_REGEX = re.compile(r'&name=.*?(&)|&name=.*')
-
-    def __init__(self, video_id, languages, proxies):
-        self.video_id = video_id
-        self.languages = languages
-        self.proxies = proxies
-
-    def fetch(self):
-        if self.proxies:
-            fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id), proxies=self.proxies).text
-        else:
-            fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
-        timedtext_splits = [split[:split.find('"')]
-                .replace('\\u0026', '&')
-                .replace('\\', '') 
-                for split in fetched_site.split(self.TIMEDTEXT_STRING)]
-        matched_splits = []
-        for language in self.languages:
-            matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
-            if matched_splits:
-                break
-        if matched_splits:
-            timedtext_url = min(matched_splits, key=self._sort_splits)
-            response = self._execute_api_request(timedtext_url)
-            if response:
-                return response
-
-        return None
-
-    def _sort_splits(self, matched_split):
-        """Returns a value related to a given caption track url.
-
-        This function is used to sort the matched splits by string 
-        length because we want non-asr and non-dialect options returned first.
-        With this in mind, it is remove the 'name' arugument from the url as 
-        it could possibly make the values inaccurate to what we desire.
-
-        matched_split: The caption track url we want to return a value for.        
-        """
-        return len(re.sub(self.NAME_REGEX, r'\1', matched_split))
-
-    def _execute_api_request(self, timedtext_url):
-        url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url)
-        if self.proxies:
-            return requests.get(url, proxies=self.proxies).text
-        else:
-            return requests.get(url).text
-
-
-class _TranscriptParser():
-    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
-
-    def __init__(self, plain_data):
-        self.plain_data = plain_data
-
-    def parse(self):
-        return [
-            {
-                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
-                'start': float(xml_element.attrib['start']),
-                'duration': float(xml_element.attrib['dur']),
-            }
-            for xml_element in ElementTree.fromstring(self.plain_data)
-            if xml_element.text is not None
-        ]
+        with requests.Session() as http_client:
+            http_client.proxies = proxies if proxies else {}
+            return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
--- a/youtube_transcript_api/_errors.py
+++ b/youtube_transcript_api/_errors.py
@ -0,0 +1,62 @@
+from ._settings import WATCH_URL
+
+
+class CouldNotRetrieveTranscript(Exception):
+    """
+    Raised if a transcript could not be retrieved.
+
+    The exception message is assembled from the class level templates below. Subclasses describe the concrete
+    cause either by overriding `CAUSE_MESSAGE` or, if the cause has to be formatted with instance data, by
+    overriding the `cause` property.
+    """
+    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
+    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
+    CAUSE_MESSAGE = ''
+    GITHUB_REFERRAL = (
+        '\n\nIf you are sure that the described cause is not responsible for this error '
+        'and that a transcript should be retrievable, please create an issue at '
+        # the trailing space keeps the URL separated from the following sentence
+        'https://github.com/jdepoix/youtube-transcript-api/issues. '
+        'Please add which version of youtube_transcript_api you are using '
+        'and provide the information needed to replicate the error. '
+        'Also make sure that there are no open issues which already describe your problem!'
+    )
+
+    def __init__(self, video_id):
+        """
+        :param video_id: the id of the video for which no transcript could be retrieved
+        :type video_id: str
+        """
+        # video_id has to be set before the message is built, as _build_error_message reads it
+        self.video_id = video_id
+        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())
+
+    def _build_error_message(self):
+        """
+        Builds the exception message: the generic error line and, if a cause is available, the cause followed by
+        the referral to the issue tracker.
+
+        :return: the complete error message
+        :rtype: str
+        """
+        cause = self.cause
+        error_message = self.ERROR_MESSAGE.format(video_url=WATCH_URL.format(video_id=self.video_id))
+
+        if cause:
+            error_message += self.CAUSE_MESSAGE_INTRO.format(cause=cause) + self.GITHUB_REFERRAL
+
+        return error_message
+
+    @property
+    def cause(self):
+        """
+        :return: the description of the cause, which is `CAUSE_MESSAGE` unless a subclass overrides this property
+        :rtype: str
+        """
+        return self.CAUSE_MESSAGE
+
+
+class VideoUnavailable(CouldNotRetrieveTranscript):
+    """
+    Error whose cause message states that the video is no longer available.
+    """
+    CAUSE_MESSAGE = 'The video is no longer available'
+
+
+class TranscriptsDisabled(CouldNotRetrieveTranscript):
+    """
+    Error whose cause message states that subtitles are disabled for the video.
+    """
+    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
+
+
+class NoTranscriptFound(CouldNotRetrieveTranscript):
+    """
+    Error raised if none of the requested language codes matches an available transcript. The cause message
+    lists the requested language codes followed by the string representation of the given transcript data.
+    """
+    CAUSE_MESSAGE = (
+        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
+        '{transcript_data}'
+    )
+
+    def __init__(self, video_id, requested_language_codes, transcript_data):
+        """
+        :param video_id: the id of the video for which no transcript was found
+        :param requested_language_codes: the language codes which have been searched for
+        :param transcript_data: object describing the available transcripts; it is converted using `str()` when
+        the cause message is built
+        """
+        # These attributes have to be set before calling the parent constructor, as the parent builds the error
+        # message in its __init__, which reads the `cause` property defined below.
+        self._requested_language_codes = requested_language_codes
+        self._transcript_data = transcript_data
+        super(NoTranscriptFound, self).__init__(video_id)
+
+    @property
+    def cause(self):
+        """
+        :return: `CAUSE_MESSAGE` formatted with the requested language codes and the transcript data
+        :rtype: str
+        """
+        return self.CAUSE_MESSAGE.format(
+            requested_language_codes=self._requested_language_codes,
+            transcript_data=str(self._transcript_data),
+        )
--- a/youtube_transcript_api/_settings.py
+++ b/youtube_transcript_api/_settings.py
@ -0,0 +1 @@
+WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
--- a/youtube_transcript_api/_transcripts.py
+++ b/youtube_transcript_api/_transcripts.py
@ -0,0 +1,202 @@
+import sys
+
+# This can only be tested by using different python versions, therefore it is not covered by coverage.py
+if sys.version_info.major == 2: # pragma: no cover
+    reload(sys)
+    sys.setdefaultencoding('utf-8')
+
+import json
+
+from xml.etree import ElementTree
+
+import re
+
+from ._html_unescaping import unescape
+from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
+from ._settings import WATCH_URL
+
+
+class TranscriptDataFetcher():
+    """
+    Fetches the watch page of a video and builds a TranscriptData object from the captions JSON embedded in it.
+    """
+
+    def __init__(self, http_client):
+        """
+        :param http_client: the client used to execute the HTTP requests; only its `get` method is called here
+        """
+        self._http_client = http_client
+
+    def fetch(self, video_id):
+        """
+        Retrieves the transcript data for a single video.
+
+        :param video_id: the youtube video id
+        :type video_id: str
+        :return: the transcript data built from the captions JSON found in the watch page
+        :rtype: TranscriptData
+        """
+        return TranscriptData.build(
+            self._http_client,
+            video_id,
+            self._extract_captions_json(self._fetch_html(video_id), video_id)
+        )
+
+    def _extract_captions_json(self, html, video_id):
+        """
+        Cuts the value of the "captions" key out of the page source and parses it as JSON.
+
+        :param html: the (already unescaped) source of the watch page
+        :param video_id: the youtube video id, only used for the raised errors
+        :return: the content of the 'playerCaptionsTracklistRenderer' key of the captions JSON
+        :raises VideoUnavailable: if the page contains neither a "captions" nor a "playabilityStatus" key
+        :raises TranscriptsDisabled: if the page contains a "playabilityStatus" but no "captions" key
+        """
+        splitted_html = html.split('"captions":')
+
+        # a single element means that the '"captions":' marker does not occur in the page at all
+        if len(splitted_html) <= 1:
+            if '"playabilityStatus":' not in html:
+                raise VideoUnavailable(video_id)
+
+            raise TranscriptsDisabled(video_id)
+
+        # the captions JSON is taken to end where the ',"videoDetails' marker starts; newlines are removed
+        # before parsing
+        return json.loads(splitted_html[1].split(',"videoDetails')[0].replace('\n', ''))[
+            'playerCaptionsTracklistRenderer'
+        ]
+
+    def _fetch_html(self, video_id):
+        """
+        Requests the watch page of the video and unescapes its source.
+
+        :param video_id: the youtube video id
+        :return: the page source with escaped ampersands replaced by '&' and all backslashes removed
+        """
+        # the order matters: the escaped ampersands have to be replaced before the backslashes are stripped
+        return self._http_client.get(WATCH_URL.format(video_id=video_id)).text.replace(
+            '\\u0026', '&'
+        ).replace(
+            '\\', ''
+        )
+
+
+class TranscriptData():
+    """
+    Holds the transcripts which are available for a video, separated into manually created and generated ones,
+    and allows to look them up by language code.
+    """
+    # TODO implement iterator
+
+    def __init__(
+        self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
+    ):
+        """
+        :param http_client: the client which is passed on to the Transcript objects created by this object
+        :param video_id: the id of the video the transcripts belong to
+        :param manually_created_transcripts: list of dictionaries describing the manually created transcripts
+        (keys: 'url', 'language', 'language_code', 'is_generated', 'is_translatable')
+        :param generated_transcripts: list of dictionaries describing the generated transcripts (same keys)
+        :param translation_languages: list of dictionaries with the keys 'language' and 'language_code'
+        """
+        self._http_client = http_client
+        self.video_id = video_id
+        self._manually_created_transcripts = manually_created_transcripts
+        self._generated_transcripts = generated_transcripts
+        self._translation_languages = translation_languages
+
+    @staticmethod
+    def build(http_client, video_id, captions_json):
+        """
+        Factory method which creates a TranscriptData object from the captions JSON of a video.
+
+        :param http_client: the client which is passed on to the created object
+        :param video_id: the id of the video the captions belong to
+        :param captions_json: dictionary providing the 'captionTracks' and 'translationLanguages' lists
+        :return: the created TranscriptData object
+        :rtype: TranscriptData
+        """
+        manually_created_transcripts = []
+        generated_transcripts = []
+
+        for caption in captions_json['captionTracks']:
+            # tracks whose 'kind' is 'asr' are treated as generated, all other tracks as manually created
+            is_generated = caption.get('kind', '') == 'asr'
+            (generated_transcripts if is_generated else manually_created_transcripts).append(
+                {
+                    'url': caption['baseUrl'],
+                    'language': caption['name']['simpleText'],
+                    'language_code': caption['languageCode'],
+                    'is_generated': is_generated,
+                    'is_translatable': caption['isTranslatable'],
+                }
+            )
+
+        return TranscriptData(
+            http_client,
+            video_id,
+            manually_created_transcripts,
+            generated_transcripts,
+            [
+                {
+                    'language': translation_language['languageName']['simpleText'],
+                    'language_code': translation_language['languageCode'],
+                } for translation_language in captions_json['translationLanguages']
+            ],
+        )
+
+    def find_transcript(self, language_codes):
+        """
+        Finds a transcript for the given language codes. Manually created transcripts are preferred, generated
+        transcripts are only searched if no manually created transcript matches any of the language codes.
+
+        :param language_codes: a list of language codes in a descending priority
+        :return: the found transcript
+        :rtype: Transcript
+        :raises NoTranscriptFound: if neither a manually created nor a generated transcript matches
+        """
+        try:
+            return self.find_manually_created_transcript(language_codes)
+        except NoTranscriptFound:
+            # deliberately ignored, as the generated transcripts are used as a fallback
+            pass
+
+        return self.find_generated_transcript(language_codes)
+
+    def find_generated_transcript(self, language_codes):
+        """
+        Finds a generated transcript for the given language codes.
+
+        :param language_codes: a list of language codes in a descending priority
+        :return: the found transcript
+        :rtype: Transcript
+        :raises NoTranscriptFound: if no generated transcript matches any of the language codes
+        """
+        return self._find_transcript(language_codes, generated=True)
+
+    def find_manually_created_transcript(self, language_codes):
+        """
+        Finds a manually created transcript for the given language codes.
+
+        :param language_codes: a list of language codes in a descending priority
+        :return: the found transcript
+        :rtype: Transcript
+        :raises NoTranscriptFound: if no manually created transcript matches any of the language codes
+        """
+        return self._find_transcript(language_codes, generated=False)
+
+    def _find_transcript(self, language_codes, generated):
+        """
+        Searches either the generated or the manually created transcripts for the first matching language code.
+
+        :param language_codes: a list of language codes in a descending priority
+        :param generated: whether the generated (True) or the manually created (False) transcripts are searched
+        :return: the found transcript
+        :rtype: Transcript
+        :raises NoTranscriptFound: if none of the language codes matches
+        """
+        transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
+
+        # the language codes form the outer loop, so an earlier code always wins over a later one
+        for language_code in language_codes:
+            for transcript in transcripts:
+                if transcript['language_code'] == language_code:
+                    return Transcript(
+                        self._http_client,
+                        transcript['url'],
+                        transcript['language'],
+                        transcript['language_code'],
+                        transcript['is_generated'],
+                        self._translation_languages if transcript['is_translatable'] else []
+                    )
+
+        # self is passed, as its string representation lists the available languages in the error message
+        raise NoTranscriptFound(
+            self.video_id,
+            language_codes,
+            self
+        )
+
+    def __str__(self):
+        """
+        :return: a description listing the languages of the manually created and the generated transcripts
+        :rtype: str
+        """
+        return (
+            'For this video ({video_id}) transcripts are available in the following languages:\n\n'
+            '(MANUALLY CREATED)\n'
+            '{available_manually_created_transcript_languages}\n\n'
+            '(GENERATED)\n'
+            '{available_generated_transcripts}'
+        ).format(
+            video_id=self.video_id,
+            available_manually_created_transcript_languages=self._get_language_description(
+                self._manually_created_transcripts
+            ),
+            available_generated_transcripts=self._get_language_description(
+                self._generated_transcripts
+            ),
+        )
+
+    def _get_language_description(self, transcripts):
+        """
+        :param transcripts: list of transcript dictionaries
+        :return: one line per transcript stating its language code and language, or 'None' if the list is empty
+        :rtype: str
+        """
+        return '\n'.join(
+            ' - {language_code} ("{language}")'.format(
+                language=transcript['language'],
+                language_code=transcript['language_code'],
+            ) for transcript in transcripts
+        ) if transcripts else 'None'
+
+
+class Transcript():
+    """
+    Describes a single transcript of a video and allows to fetch its content.
+    """
+
+    def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
+        """
+        :param http_client: the client used to request the transcript; only its `get` method is called here
+        :param url: the url the transcript is requested from
+        :param language: the name of the language of the transcript
+        :param language_code: the language code of the transcript
+        :param is_generated: whether the transcript is a generated one
+        :param translation_languages: list of languages this transcript can be translated to
+        """
+        self._http_client = http_client
+        self.url = url
+        self.language = language
+        self.language_code = language_code
+        self.is_generated = is_generated
+        self.translation_languages = translation_languages
+
+    def fetch(self):
+        """
+        Requests the transcript from `self.url` and parses the response.
+
+        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
+        """
+        return _TranscriptParser().parse(
+            self._http_client.get(self.url).text
+        )
+
+# TODO integrate translations in future release
+#     @property
+#     def is_translatable(self):
+#         return len(self.translation_languages) > 0
+#
+#
+# class TranslatableTranscript(Transcript):
+#     def __init__(self, http_client, url, translation_languages):
+#         super(TranslatableTranscript, self).__init__(http_client, url)
+#         self._translation_languages = translation_languages
+#         self._translation_language_codes = {language['language_code'] for language in translation_languages}
+#
+#
+#     def translate(self, language_code):
+#         if language_code not in self._translation_language_codes:
+#             raise TranslatableTranscript.TranslationLanguageNotAvailable()
+#
+#         return Transcript(
+#             self._http_client,
+#             '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
+#         )
+
+
+class _TranscriptParser():
+    """
+    Parses the XML of a transcript into a list of dictionaries.
+    """
+    # matches anything enclosed in angle brackets; used to strip markup from the text of the snippets
+    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+
+    def parse(self, plain_data):
+        """
+        :param plain_data: the XML document as a string; each child of the root element has to provide the
+        'start' and 'dur' attributes
+        :return: one dictionary per child element which has a text, containing the unescaped text without tags
+        as 'text' and the 'start' and 'dur' attributes converted to floats as 'start' and 'duration'
+        :rtype: [{'text': str, 'start': float, 'duration': float}]
+        """
+        return [
+            {
+                'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
+                'start': float(xml_element.attrib['start']),
+                'duration': float(xml_element.attrib['dur']),
+            }
+            for xml_element in ElementTree.fromstring(plain_data)
+            # elements without text are skipped
+            if xml_element.text is not None
+        ]
--- a/youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static
+++ b/youtube_transcript_api/test/assets/youtube_transcripts_disabled.html.static
--- a/youtube_transcript_api/test/assets/youtube_video_unavailable.html.static
+++ b/youtube_transcript_api/test/assets/youtube_video_unavailable.html.static
--- a/youtube_transcript_api/test/test_api.py
+++ b/youtube_transcript_api/test/test_api.py
@ -5,7 +5,7 @@ import os

 import httpretty

-from youtube_transcript_api._api import YouTubeTranscriptApi
+from youtube_transcript_api import YouTubeTranscriptApi, VideoUnavailable, NoTranscriptFound, TranscriptsDisabled


 def load_asset(filename):
@ -64,15 +64,29 @@ class TestYouTubeTranscriptApi(TestCase):
        self.assertEqual(len(query_string['lang']), 1)
        self.assertEqual(query_string['lang'][0], 'en')

-    def test_get_transcript__exception_is_raised_when_not_available(self):
+    def test_get_transcript__exception_if_video_unavailable(self):
        httpretty.register_uri(
            httpretty.GET,
-            'https://www.youtube.com/api/timedtext',
-            body=''
+            'https://www.youtube.com/watch',
+            body=load_asset('youtube_video_unavailable.html.static')
        )

-        with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript):
-            YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')
+        with self.assertRaises(VideoUnavailable):
+            YouTubeTranscriptApi.get_transcript('abc')
+
+    def test_get_transcript__exception_if_transcripts_disabled(self):
+        # mock the watch page with the stored "transcripts disabled" asset
+        httpretty.register_uri(
+            httpretty.GET,
+            'https://www.youtube.com/watch',
+            body=load_asset('youtube_transcripts_disabled.html.static')
+        )
+
+        # requesting a transcript for this page is expected to raise TranscriptsDisabled
+        with self.assertRaises(TranscriptsDisabled):
+            YouTubeTranscriptApi.get_transcript('dsMFmonKDD4')
+
+    def test_get_transcript__exception_if_language_unavailable(self):
+        # No watch page is registered in this test itself.
+        # NOTE(review): presumably the default mocks are registered in setUp - confirm
+        with self.assertRaises(NoTranscriptFound):
+            YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', languages=['cz'])

    def test_get_transcripts(self):
        video_id_1 = 'video_id_1'
				`@ -0,0 +1 @@`
				`WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'`