import sys # This can only be tested by using different python versions, therefore it is not covered by coverage.py if sys.version_info.major == 2: # pragma: no cover # ruff: noqa: F821 reload(sys) sys.setdefaultencoding("utf-8") import json from xml.etree import ElementTree import re from requests import HTTPError from ._html_unescaping import unescape from ._errors import ( VideoUnavailable, TooManyRequests, YouTubeRequestFailed, NoTranscriptFound, TranscriptsDisabled, NotTranslatable, TranslationLanguageNotAvailable, NoTranscriptAvailable, FailedToCreateConsentCookie, InvalidVideoId, ) from ._settings import WATCH_URL def _raise_http_errors(response, video_id): try: response.raise_for_status() return response except HTTPError as error: raise YouTubeRequestFailed(error, video_id) class TranscriptListFetcher(object): def __init__(self, http_client): self._http_client = http_client def fetch(self, video_id): return TranscriptList.build( self._http_client, video_id, self._extract_captions_json(self._fetch_video_html(video_id), video_id), ) def _extract_captions_json(self, html, video_id): splitted_html = html.split('"captions":') if len(splitted_html) <= 1: if video_id.startswith("http://") or video_id.startswith("https://"): raise InvalidVideoId(video_id) if 'class="g-recaptcha"' in html: raise TooManyRequests(video_id) if '"playabilityStatus":' not in html: raise VideoUnavailable(video_id) raise TranscriptsDisabled(video_id) captions_json = json.loads( splitted_html[1].split(',"videoDetails')[0].replace("\n", "") ).get("playerCaptionsTracklistRenderer") if captions_json is None: raise TranscriptsDisabled(video_id) if "captionTracks" not in captions_json: raise NoTranscriptAvailable(video_id) return captions_json def _create_consent_cookie(self, html, video_id): match = re.search('name="v" value="(.*?)"', html) if match is None: raise FailedToCreateConsentCookie(video_id) self._http_client.cookies.set( "CONSENT", "YES+" + match.group(1), domain=".youtube.com" ) def _fetch_video_html(self, video_id): html = self._fetch_html(video_id) if 'action="https://consent.youtube.com/s"' in html: self._create_consent_cookie(html, video_id) html = self._fetch_html(video_id) if 'action="https://consent.youtube.com/s"' in html: raise FailedToCreateConsentCookie(video_id) return html def _fetch_html(self, video_id): response = self._http_client.get( WATCH_URL.format(video_id=video_id), headers={"Accept-Language": "en-US"} ) return unescape(_raise_http_errors(response, video_id).text) class TranscriptList(object): """ This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. Also it provides functionality to search for a transcript in a given language. """ def __init__( self, video_id, manually_created_transcripts, generated_transcripts, translation_languages, ): """ The constructor is only for internal use. Use the static build method instead. :param video_id: the id of the video this TranscriptList is for :type video_id: str :param manually_created_transcripts: dict mapping language codes to the manually created transcripts :type manually_created_transcripts: dict[str, Transcript] :param generated_transcripts: dict mapping language codes to the generated transcripts :type generated_transcripts: dict[str, Transcript] :param translation_languages: list of languages which can be used for translatable languages :type translation_languages: list[dict[str, str]] """ self.video_id = video_id self._manually_created_transcripts = manually_created_transcripts self._generated_transcripts = generated_transcripts self._translation_languages = translation_languages @staticmethod def build(http_client, video_id, captions_json): """ Factory method for TranscriptList. :param http_client: http client which is used to make the transcript retrieving http calls :type http_client: requests.Session :param video_id: the id of the video this TranscriptList is for :type video_id: str :param captions_json: the JSON parsed from the YouTube pages static HTML :type captions_json: dict :return: the created TranscriptList :rtype TranscriptList: """ translation_languages = [ { "language": translation_language["languageName"]["simpleText"], "language_code": translation_language["languageCode"], } for translation_language in captions_json.get("translationLanguages", []) ] manually_created_transcripts = {} generated_transcripts = {} for caption in captions_json["captionTracks"]: if caption.get("kind", "") == "asr": transcript_dict = generated_transcripts else: transcript_dict = manually_created_transcripts transcript_dict[caption["languageCode"]] = Transcript( http_client, video_id, caption["baseUrl"], caption["name"]["simpleText"], caption["languageCode"], caption.get("kind", "") == "asr", translation_languages if caption.get("isTranslatable", False) else [], ) return TranscriptList( video_id, manually_created_transcripts, generated_transcripts, translation_languages, ) def __iter__(self): return iter( list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()) ) def find_transcript(self, language_codes): """ Finds a transcript for a given language code. Manually created transcripts are returned first and only if none are found, generated transcripts are used. If you only want generated transcripts use `find_manually_created_transcript` instead. :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. :type languages: list[str] :return: the found Transcript :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript( language_codes, [self._manually_created_transcripts, self._generated_transcripts], ) def find_generated_transcript(self, language_codes): """ Finds an automatically generated transcript for a given language code. :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. :type languages: list[str] :return: the found Transcript :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript(language_codes, [self._generated_transcripts]) def find_manually_created_transcript(self, language_codes): """ Finds a manually created transcript for a given language code. :param language_codes: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to do so. :type languages: list[str] :return: the found Transcript :rtype Transcript: :raises: NoTranscriptFound """ return self._find_transcript( language_codes, [self._manually_created_transcripts] ) def _find_transcript(self, language_codes, transcript_dicts): for language_code in language_codes: for transcript_dict in transcript_dicts: if language_code in transcript_dict: return transcript_dict[language_code] raise NoTranscriptFound(self.video_id, language_codes, self) def __str__(self): return ( "For this video ({video_id}) transcripts are available in the following languages:\n\n" "(MANUALLY CREATED)\n" "{available_manually_created_transcript_languages}\n\n" "(GENERATED)\n" "{available_generated_transcripts}\n\n" "(TRANSLATION LANGUAGES)\n" "{available_translation_languages}" ).format( video_id=self.video_id, available_manually_created_transcript_languages=self._get_language_description( str(transcript) for transcript in self._manually_created_transcripts.values() ), available_generated_transcripts=self._get_language_description( str(transcript) for transcript in self._generated_transcripts.values() ), available_translation_languages=self._get_language_description( '{language_code} ("{language}")'.format( language=translation_language["language"], language_code=translation_language["language_code"], ) for translation_language in self._translation_languages ), ) def _get_language_description(self, transcript_strings): description = "\n".join( " - {transcript}".format(transcript=transcript) for transcript in transcript_strings ) return description if description else "None" class Transcript(object): def __init__( self, http_client, video_id, url, language, language_code, is_generated, translation_languages, ): """ You probably don't want to initialize this directly. Usually you'll access Transcript objects using a TranscriptList. :param http_client: http client which is used to make the transcript retrieving http calls :type http_client: requests.Session :param video_id: the id of the video this TranscriptList is for :type video_id: str :param url: the url which needs to be called to fetch the transcript :param language: the name of the language this transcript uses :param language_code: :param is_generated: :param translation_languages: """ self._http_client = http_client self.video_id = video_id self._url = url self.language = language self.language_code = language_code self.is_generated = is_generated self.translation_languages = translation_languages self._translation_languages_dict = { translation_language["language_code"]: translation_language["language"] for translation_language in translation_languages } def fetch(self, preserve_formatting=False): """ Loads the actual transcript data. :param preserve_formatting: whether to keep select HTML text formatting :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get( self._url, headers={"Accept-Language": "en-US"} ) return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) def __str__(self): return '{language_code} ("{language}"){translation_description}'.format( language=self.language, language_code=self.language_code, translation_description="[TRANSLATABLE]" if self.is_translatable else "", ) @property def is_translatable(self): return len(self.translation_languages) > 0 def translate(self, language_code): if not self.is_translatable: raise NotTranslatable(self.video_id) if language_code not in self._translation_languages_dict: raise TranslationLanguageNotAvailable(self.video_id) return Transcript( self._http_client, self.video_id, "{url}&tlang={language_code}".format( url=self._url, language_code=language_code ), self._translation_languages_dict[language_code], language_code, True, [], ) class _TranscriptParser(object): _FORMATTING_TAGS = [ "strong", # important "em", # emphasized "b", # bold "i", # italic "mark", # marked "small", # smaller "del", # deleted "ins", # inserted "sub", # subscript "sup", # superscript ] def __init__(self, preserve_formatting=False): self._html_regex = self._get_html_regex(preserve_formatting) def _get_html_regex(self, preserve_formatting): if preserve_formatting: formats_regex = "|".join(self._FORMATTING_TAGS) formats_regex = r"<\/?(?!\/?(" + formats_regex + r")\b).*?\b>" html_regex = re.compile(formats_regex, re.IGNORECASE) else: html_regex = re.compile(r"<[^>]*>", re.IGNORECASE) return html_regex def parse(self, plain_data): return [ { "text": re.sub(self._html_regex, "", unescape(xml_element.text)), "start": float(xml_element.attrib["start"]), "duration": float(xml_element.attrib.get("dur", "0.0")), } for xml_element in ElementTree.fromstring(plain_data) if xml_element.text is not None ]