Merge pull request #192 from eseiver/html_formatting

Add HTML text formatting option
This commit is contained in:
Jonas Depoix 2023-04-17 15:15:46 +02:00 committed by GitHub
commit e0a9f0d3e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 78 additions and 26 deletions

View File

@ -48,8 +48,9 @@ This will return a list of dictionaries looking somewhat like this:
# ... # ...
] ]
``` ```
### Translate transcript
You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english). You can add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it defaults to english).
```python ```python
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
@ -65,6 +66,14 @@ YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
`languages` also is optional here. `languages` also is optional here.
### Preserve formatting
You can also add `preserve_formatting=True` if you'd like to keep HTML formatting elements such as `<i>` (italics) and `<b>` (bold).
```python
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'], preserve_formatting=True)
```
### List available transcripts ### List available transcripts
If you want to list all transcripts which are available for a given video you can call: If you want to list all transcripts which are available for a given video you can call:

View File

@ -71,7 +71,8 @@ class YouTubeTranscriptApi(object):
return TranscriptListFetcher(http_client).fetch(video_id) return TranscriptListFetcher(http_client).fetch(video_id)
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None): def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None,
cookies=None, preserve_formatting=False):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -88,6 +89,8 @@ class YouTubeTranscriptApi(object):
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies :param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str :type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
video ids, which could not be retrieved video ids, which could not be retrieved
:rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}): :rtype ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}):
@ -99,7 +102,7 @@ class YouTubeTranscriptApi(object):
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies) data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies, preserve_formatting)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception
@ -109,7 +112,7 @@ class YouTubeTranscriptApi(object):
return data, unretrievable_videos return data, unretrievable_videos
@classmethod @classmethod
def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None): def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None, preserve_formatting=False):
""" """
Retrieves the transcript for a single video. This is just a shortcut for calling:: Retrieves the transcript for a single video. This is just a shortcut for calling::
@ -125,11 +128,13 @@ class YouTubeTranscriptApi(object):
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
:param cookies: a string of the path to a text file containing youtube authorization cookies :param cookies: a string of the path to a text file containing youtube authorization cookies
:type cookies: str :type cookies: str
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
assert isinstance(video_id, str), "`video_id` must be a string" assert isinstance(video_id, str), "`video_id` must be a string"
return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch() return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)
@classmethod @classmethod
def _load_cookies(cls, cookies, video_id): def _load_cookies(cls, cookies, video_id):

View File

@ -41,10 +41,11 @@ class TranscriptListFetcher(object):
self._http_client = http_client self._http_client = http_client
def fetch(self, video_id): def fetch(self, video_id):
return TranscriptList.build( return TranscriptList.build(
self._http_client, self._http_client,
video_id, video_id,
self._extract_captions_json(self._fetch_video_html(video_id), video_id) self._extract_captions_json(self._fetch_video_html(video_id), video_id),
) )
def _extract_captions_json(self, html, video_id): def _extract_captions_json(self, html, video_id):
@ -94,6 +95,7 @@ class TranscriptList(object):
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
for a given YouTube video. Also it provides functionality to search for a transcript in a given language. for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
""" """
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages): def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
""" """
The constructor is only for internal use. Use the static build method instead. The constructor is only for internal use. Use the static build method instead.
@ -149,7 +151,7 @@ class TranscriptList(object):
caption['name']['simpleText'], caption['name']['simpleText'],
caption['languageCode'], caption['languageCode'],
caption.get('kind', '') == 'asr', caption.get('kind', '') == 'asr',
translation_languages if caption.get('isTranslatable', False) else [] translation_languages if caption.get('isTranslatable', False) else [],
) )
return TranscriptList( return TranscriptList(
@ -190,7 +192,7 @@ class TranscriptList(object):
:rtype Transcript: :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._generated_transcripts,]) return self._find_transcript(language_codes, [self._generated_transcripts])
def find_manually_created_transcript(self, language_codes): def find_manually_created_transcript(self, language_codes):
""" """
@ -204,7 +206,7 @@ class TranscriptList(object):
:rtype Transcript: :rtype Transcript:
:raises: NoTranscriptFound :raises: NoTranscriptFound
""" """
return self._find_transcript(language_codes, [self._manually_created_transcripts,]) return self._find_transcript(language_codes, [self._manually_created_transcripts])
def _find_transcript(self, language_codes, transcript_dicts): def _find_transcript(self, language_codes, transcript_dicts):
for language_code in language_codes: for language_code in language_codes:
@ -276,15 +278,16 @@ class Transcript(object):
for translation_language in translation_languages for translation_language in translation_languages
} }
def fetch(self): def fetch(self, preserve_formatting=False):
""" """
Loads the actual transcript data. Loads the actual transcript data.
:param preserve_formatting: whether to keep select HTML text formatting
:type preserve_formatting: bool
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype [{'text': str, 'start': float, 'end': float}]: :rtype [{'text': str, 'start': float, 'end': float}]:
""" """
response = self._http_client.get(self._url) response = self._http_client.get(self._url)
return _TranscriptParser().parse( return _TranscriptParser(preserve_formatting=preserve_formatting).parse(
_raise_http_errors(response, self.video_id).text, _raise_http_errors(response, self.video_id).text,
) )
@ -318,12 +321,35 @@ class Transcript(object):
class _TranscriptParser(object): class _TranscriptParser(object):
HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE) _FORMATTING_TAGS = [
'strong', # important
'em', # emphasized
'b', # bold
'i', # italic
'mark', # marked
'small', # smaller
'del', # deleted
'ins', # inserted
'sub', # subscript
'sup', # superscript
]
def __init__(self, preserve_formatting=False):
    """
    Prepares the parser by selecting the tag-stripping pattern once up front.

    :param preserve_formatting: whether to keep select HTML text formatting
    :type preserve_formatting: bool
    """
    # Resolve the pattern a single time so that `parse` can reuse it for every text element.
    tag_pattern = self._get_html_regex(preserve_formatting)
    self._html_regex = tag_pattern
def _get_html_regex(self, preserve_formatting):
    """
    Builds the regular expression used to strip HTML tags from transcript text.

    :param preserve_formatting: whether to keep the tags listed in `_FORMATTING_TAGS`
    :type preserve_formatting: bool
    :return: a compiled, case-insensitive pattern matching every tag that has to be removed
    """
    if not preserve_formatting:
        # Default behaviour: every tag is removed.
        return re.compile(r'<[^>]*>', re.IGNORECASE)

    preserved_tags = '|'.join(re.escape(tag) for tag in self._FORMATTING_TAGS)
    # The negative lookahead skips the opening and closing form of each preserved tag.
    # `[^>]*>` ends the match at the first `>`, exactly like the default pattern. The
    # previous `.*?\b>` required a word character directly before `>`, so tags ending in
    # a quote or slash (`<font color="#fff">`, `<br/>`) were matched up to a later `>`,
    # which removed the text in between.
    return re.compile(r'<(?!/?(?:' + preserved_tags + r')\b)[^>]*>', re.IGNORECASE)
def parse(self, plain_data): def parse(self, plain_data):
return [ return [
{ {
'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)), 'text': re.sub(self._html_regex, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']), 'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib.get('dur', '0.0')), 'duration': float(xml_element.attrib.get('dur', '0.0')),
} }

View File

@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8" ?> <?xml version="1.0" encoding="utf-8" ?>
<transcript> <transcript>
<text start="0" dur="1.54">Hey, this is just a test</text> <text start="0" dur="1.54">Hey, this is just a test</text>
<text start="1.54" dur="4.16">this is not the original transcript</text> <text start="1.54" dur="4.16">this is &lt;i>not&lt;/i> the original transcript</text>
<text start="5" dur="0.5"></text> <text start="5" dur="0.5"></text>
<text start="5.7" dur="3.239">just something shorter, I made up for testing</text> <text start="5.7" dur="3.239">just something shorter, I made up for testing</text>
</transcript> </transcript>

View File

@ -61,6 +61,18 @@ class TestYouTubeTranscriptApi(TestCase):
] ]
) )
def test_get_transcript_formatted(self):
    # With preserve_formatting=True the <i> tags of the fixture transcript must survive,
    # while the empty text element is still absent from the result.
    expected_snippets = [
        {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
        {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
        {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
    ]

    formatted_transcript = YouTubeTranscriptApi.get_transcript(
        'GJLlxj_dtq8',
        preserve_formatting=True,
    )

    self.assertEqual(formatted_transcript, expected_snippets)
def test_list_transcripts(self): def test_list_transcripts(self):
transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8') transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
@ -271,8 +283,8 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages) YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)
mock_get_transcript.assert_any_call(video_id_1, languages, None, None) mock_get_transcript.assert_any_call(video_id_1, languages, None, None, False)
mock_get_transcript.assert_any_call(video_id_2, languages, None, None) mock_get_transcript.assert_any_call(video_id_2, languages, None, None, False)
self.assertEqual(mock_get_transcript.call_count, 2) self.assertEqual(mock_get_transcript.call_count, 2)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error')) @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript', side_effect=Exception('Error'))
@ -287,20 +299,20 @@ class TestYouTubeTranscriptApi(TestCase):
YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True) YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'], continue_after_error=True)
mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None) mock_get_transcript.assert_any_call(video_id_1, ('en',), None, None, False)
mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None) mock_get_transcript.assert_any_call(video_id_2, ('en',), None, None, False)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_cookies(self, mock_get_transcript): def test_get_transcripts__with_cookies(self, mock_get_transcript):
cookies = '/example_cookies.txt' cookies = '/example_cookies.txt'
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies) YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], cookies=cookies)
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies) mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), None, cookies, False)
@patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript') @patch('youtube_transcript_api.YouTubeTranscriptApi.get_transcript')
def test_get_transcripts__with_proxies(self, mock_get_transcript): def test_get_transcripts__with_proxies(self, mock_get_transcript):
proxies = {'http': '', 'https:': ''} proxies = {'http': '', 'https:': ''}
YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies) YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'], proxies=proxies)
mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None) mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en',), proxies, None, False)
def test_load_cookies(self): def test_load_cookies(self):
dirname, filename = os.path.split(os.path.abspath(__file__)) dirname, filename = os.path.split(os.path.abspath(__file__))

View File

@ -12,7 +12,7 @@ class TestYouTubeTranscriptCli(TestCase):
self.transcript_mock = MagicMock() self.transcript_mock = MagicMock()
self.transcript_mock.fetch = MagicMock(return_value=[ self.transcript_mock.fetch = MagicMock(return_value=[
{'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54}, {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
{'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16}, {'text': 'this is <i>not</i> the original transcript', 'start': 1.54, 'duration': 4.16},
{'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239} {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
]) ])
self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock) self.transcript_mock.translate = MagicMock(return_value=self.transcript_mock)