diff --git a/README.md b/README.md index 8279040..b6dca7d 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,9 @@ This will return a list of dictionaries looking somewhat like this: # ... ] ``` +### Translate transcript -You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). +You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to English). ```python YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) @@ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) `languages` also is optional here. +### Preserve formatting + +You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold). + +```python +YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True) +``` + ### List available transcripts If you want to list all transcripts which are available for a given video you can call: diff --git a/youtube_transcript_api/_api.py b/youtube_transcript_api/_api.py index dfb790d..24a1236 100644 --- a/youtube_transcript_api/_api.py +++ b/youtube_transcript_api/_api.py @@ -71,7 +71,8 @@ class YouTubeTranscriptApi(object): return TranscriptListFetcher(http_client).fetch(video_id) @classmethod - def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): + def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, + cookies=None, preserve_formatting=False): """ Retrieves the transcripts for a list of videos. 
@@ -88,6 +89,8 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of video ids, which could not be retrieved :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): @@ -99,7 +102,7 @@ class YouTubeTranscriptApi(object): for video_id in video_ids: try: - data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) + data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting) except Exception as exception: if not continue_after_error: raise exception @@ -109,7 +112,7 @@ class YouTubeTranscriptApi(object): return data, unretrievable_videos @classmethod - def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): + def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False): """ Retrieves the transcript for a single video. 
This is just a shortcut for calling:: @@ -125,12 +128,14 @@ class YouTubeTranscriptApi(object): :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :param cookies: a string of the path to a text file containing youtube authorization cookies :type cookies: str + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ assert isinstance(video_id, str), "`video_id` must be a string" - return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() - + return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting) + @classmethod def _load_cookies(cls, cookies, video_id): try: diff --git a/youtube_transcript_api/_transcripts.py b/youtube_transcript_api/_transcripts.py index cea50c4..1e0f8f1 100644 --- a/youtube_transcript_api/_transcripts.py +++ b/youtube_transcript_api/_transcripts.py @@ -1,7 +1,7 @@ import sys # This can only be tested by using different python versions, therefore it is not covered by coverage.py -if sys.version_info.major == 2: # pragma: no cover +if sys.version_info.major == 2: # pragma: no cover reload(sys) sys.setdefaultencoding('utf-8') @@ -41,10 +41,11 @@ class TranscriptListFetcher(object): self._http_client = http_client def fetch(self, video_id): + return TranscriptList.build( self._http_client, video_id, - self._extract_captions_json(self._fetch_video_html(video_id), video_id) + self._extract_captions_json(self._fetch_video_html(video_id), video_id), ) def _extract_captions_json(self, html, video_id): @@ -94,6 +95,7 @@ class TranscriptList(object): This object represents a list of transcripts. It can be iterated over to list all transcripts which are available for a given YouTube video. 
Also it provides functionality to search for a transcript in a given language. """ + def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): """ The constructor is only for internal use. Use the static build method instead. @@ -149,7 +151,7 @@ class TranscriptList(object): caption['name']['simpleText'], caption['languageCode'], caption.get('kind', '') == 'asr', - translation_languages if caption.get('isTranslatable', False) else [] + translation_languages if caption.get('isTranslatable', False) else [], ) return TranscriptList( @@ -190,7 +192,7 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._generated_transcripts,]) + return self._find_transcript(language_codes, [self._generated_transcripts]) def find_manually_created_transcript(self, language_codes): """ @@ -204,7 +206,7 @@ class TranscriptList(object): :rtype Transcript: :raises: NoTranscriptFound """ - return self._find_transcript(language_codes, [self._manually_created_transcripts,]) + return self._find_transcript(language_codes, [self._manually_created_transcripts]) def _find_transcript(self, language_codes, transcript_dicts): for language_code in language_codes: @@ -276,15 +278,16 @@ class Transcript(object): for translation_language in translation_languages } - def fetch(self): + def fetch(self, preserve_formatting=False): """ Loads the actual transcript data. 
- + :param preserve_formatting: whether to keep select HTML text formatting + :type preserve_formatting: bool :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :rtype [{'text': str, 'start': float, 'end': float}]: """ response = self._http_client.get(self._url) - return _TranscriptParser().parse( + return _TranscriptParser(preserve_formatting=preserve_formatting).parse( _raise_http_errors(response, self.video_id).text, ) @@ -318,12 +321,35 @@ class Transcript(object): class _TranscriptParser(object): - HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) + _FORMATTING_TAGS = [ + 'strong', # important + 'em', # emphasized + 'b', # bold + 'i', # italic + 'mark', # marked + 'small', # smaller + 'del', # deleted + 'ins', # inserted + 'sub', # subscript + 'sup', # superscript + ] + + def __init__(self, preserve_formatting=False): + self._html_regex = self._get_html_regex(preserve_formatting) + + def _get_html_regex(self, preserve_formatting): + if preserve_formatting: + formats_regex = '|'.join(self._FORMATTING_TAGS) + formats_regex = r'<\/?(?!\/?(' + formats_regex + r')\b).*?\b>' + html_regex = re.compile(formats_regex, re.IGNORECASE) + else: + html_regex = re.compile(r'<[^>]*>', re.IGNORECASE) + return html_regex def parse(self, plain_data): return [ { - 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), + 'text': re.sub(self._html_regex, '', unescape(xml_element.text)), 'start': float(xml_element.attrib['start']), 'duration': float(xml_element.attrib.get('dur', '0.0')), } diff --git a/youtube_transcript_api/test/assets/transcript.xml.static b/youtube_transcript_api/test/assets/transcript.xml.static index ec777e7..64f9c3c 100644 --- a/youtube_transcript_api/test/assets/transcript.xml.static +++ b/youtube_transcript_api/test/assets/transcript.xml.static @@ -1,7 +1,7 @@ Hey, this is just a test - this is not the original transcript + this is <i>not</i> the original transcript just something shorter, I made up for 
testing \ No newline at end of file diff --git a/youtube_transcript_api/test/test_api.py b/youtube_transcript_api/test/test_api.py index 3bda630..36d60a5 100644 --- a/youtube_transcript_api/test/test_api.py +++ b/youtube_transcript_api/test/test_api.py @@ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase): ] ) + def test_get_transcript_formatted(self): + transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', preserve_formatting=True) + + self.assertEqual( + transcript, + [ + {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} + ] + ) + def test_list_transcripts(self): transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') @@ -254,11 +266,11 @@ class TestYouTubeTranscriptApi(TestCase): {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} ] ) - + def test_get_transcript__assertionerror_if_input_not_string(self): with self.assertRaises(AssertionError): YouTubeTranscriptApi.get_transcript(['video_id_1', 'video_id_2']) - + def test_get_transcripts__assertionerror_if_input_not_list(self): with self.assertRaises(AssertionError): YouTubeTranscriptApi.get_transcripts('video_id_1') @@ -271,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) - mock_get_transcript.assert_any_call(video_id_1, languages, None, None) - mock_get_transcript.assert_any_call(video_id_2, languages, None, None) + mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False) + mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False) self.assertEqual(mock_get_transcript.call_count, 2) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) @@ -287,20 +299,20 @@ class 
TestYouTubeTranscriptApi(TestCase): YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) - mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) - mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) + mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False) + mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') def test_get_transcripts__with_cookies(self, mock_get_transcript): cookies = '/example_cookies.txt' YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') def test_get_transcripts__with_proxies(self, mock_get_transcript): proxies = {'http': '', 'https:': ''} YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) - mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) + mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False) def test_load_cookies(self): dirname, filename = os.path.split(os.path.abspath(__file__)) diff --git a/youtube_transcript_api/test/test_cli.py b/youtube_transcript_api/test/test_cli.py index d14f331..26ffabc 100644 --- a/youtube_transcript_api/test/test_cli.py +++ b/youtube_transcript_api/test/test_cli.py @@ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase): self.transcript_mock = MagicMock() self.transcript_mock.fetch = MagicMock(return_value=[ {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, - {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, + {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, {'text': 'just something shorter, I made up for testing', 'start': 5.7, 
'duration': 3.239} ]) self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)