YouTubeTranscriptApi now supports retrieving transcripts for given languages

This commit is contained in:
Jonas Depoix 2019-02-21 12:55:03 +01:00
parent 48cb31fe3e
commit 18fb0cbaec
5 changed files with 61 additions and 26 deletions

1
.gitignore vendored
View File

@ -1,5 +1,6 @@
.idea .idea
.venv .venv
virtualenv
*.pyc *.pyc
dist dist
build build

View File

@ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this:
] ]
``` ```
You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English).
```python
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
```
It's a list of language codes in descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit to find the one that works for you!
To get transcripts for a list of video ids you can call: To get transcripts for a list of video ids you can call:
```python ```python
YouTubeTranscriptApi.get_transcripts(video_ids) YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
``` ```
`languages` is also optional here.
### CLI ### CLI
Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:

View File

@ -8,13 +8,10 @@ def _get_file_content(file_name):
def get_long_description(): def get_long_description():
return _get_file_content('README.md') return _get_file_content('README.md')
def get_requirements():
return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n')))
setuptools.setup( setuptools.setup(
name="youtube_transcript_api", name="youtube_transcript_api",
version="0.1.1", version="0.1.2",
author="Jonas Depoix", author="Jonas Depoix",
author_email="jonas.depoix@web.de", author_email="jonas.depoix@web.de",
description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!", description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",
@ -29,7 +26,9 @@ setuptools.setup(
"License :: OSI Approved :: MIT License", "License :: OSI Approved :: MIT License",
"Operating System :: OS Independent", "Operating System :: OS Independent",
), ),
install_requires=get_requirements(), install_requires=[
'requests',
],
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'youtube_transcript_api = youtube_transcript_api.__main__:main', 'youtube_transcript_api = youtube_transcript_api.__main__:main',

View File

@ -1,3 +1,9 @@
import sys
if sys.version_info.major == 2:
reload(sys)
sys.setdefaultencoding('utf-8')
from xml.etree import ElementTree from xml.etree import ElementTree
import re import re
@ -30,14 +36,18 @@ class YouTubeTranscriptApi():
) )
self.video_id = video_id self.video_id = video_id
@staticmethod @staticmethod
def get_transcripts(video_ids, continue_after_error=False): def get_transcripts(video_ids, languages=None, continue_after_error=False):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
:param video_ids: a list of youtube video ids :param video_ids: a list of youtube video ids
:type video_ids: [str] :type video_ids: [str]
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str]
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
one of the video transcripts one of the video transcripts
:type continue_after_error: bool :type continue_after_error: bool
@ -50,7 +60,7 @@ class YouTubeTranscriptApi():
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id) data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception
@ -60,17 +70,22 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos return data, unretrievable_videos
@staticmethod @staticmethod
def get_transcript(video_id): def get_transcript(video_id, languages=None):
""" """
Retrieves the transcript for a single video. Retrieves the transcript for a single video.
:param video_id: the youtube video id :param video_id: the youtube video id
:type video_id: str :type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str]
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'duration': float}] :rtype: [{'text': str, 'start': float, 'duration': float}]
""" """
try: try:
return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse() return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
except Exception: except Exception:
logger.error( logger.error(
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
@ -83,17 +98,25 @@ class YouTubeTranscriptApi():
class _TranscriptFetcher(): class _TranscriptFetcher():
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
API_BASE_URL = 'https://www.youtube.com/api/{api_url}' API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
def __init__(self, video_id): def __init__(self, video_id, languages):
self.video_id = video_id self.video_id = video_id
self.languages = languages
def fetch(self): def fetch(self):
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
timedtext_url_start = fetched_site.find('timedtext') timedtext_url_start = fetched_site.find('timedtext')
return requests.get( for language in (self.languages if self.languages else [None,]):
self.API_BASE_URL.format( response = self._execute_api_request(fetched_site, timedtext_url_start, language)
if response:
return response
return None
def _execute_api_request(self, fetched_site, timedtext_url_start, language):
url = self.API_BASE_URL.format(
api_url=fetched_site[ api_url=fetched_site[
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
].replace( ].replace(
@ -102,7 +125,9 @@ class _TranscriptFetcher():
'\\', '' '\\', ''
) )
) )
).text if language:
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
return requests.get(url).text
class _TranscriptParser(): class _TranscriptParser():