YouTubeTranscriptApi now supports retrieving transcripts for given languages
This commit is contained in:
parent
48cb31fe3e
commit
18fb0cbaec
|
@ -1,5 +1,6 @@
|
||||||
.idea
|
.idea
|
||||||
.venv
|
.venv
|
||||||
|
virtualenv
|
||||||
*.pyc
|
*.pyc
|
||||||
dist
|
dist
|
||||||
build
|
build
|
||||||
|
|
12
README.md
12
README.md
|
@ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this:
|
||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
|
You can also add the `languages` parameter if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English).
|
||||||
|
|
||||||
|
```python
|
||||||
|
YouTubeTranscriptApi.get_transcript(video_id, languages=['de', 'en'])
|
||||||
|
```
|
||||||
|
|
||||||
|
It's a list of language codes in descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit to find the one that works for you!
|
||||||
|
|
||||||
To get transcripts for a list of video ids you can call:
|
To get transcripts for a list of video ids you can call:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
YouTubeTranscriptApi.get_transcripts(video_ids)
|
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`languages` is also optional here.
|
||||||
|
|
||||||
### CLI
|
### CLI
|
||||||
|
|
||||||
Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:
|
Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:
|
||||||
|
|
9
setup.py
9
setup.py
|
@ -8,13 +8,10 @@ def _get_file_content(file_name):
|
||||||
def get_long_description():
|
def get_long_description():
|
||||||
return _get_file_content('README.md')
|
return _get_file_content('README.md')
|
||||||
|
|
||||||
def get_requirements():
|
|
||||||
return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n')))
|
|
||||||
|
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name="youtube_transcript_api",
|
name="youtube_transcript_api",
|
||||||
version="0.1.1",
|
version="0.1.2",
|
||||||
author="Jonas Depoix",
|
author="Jonas Depoix",
|
||||||
author_email="jonas.depoix@web.de",
|
author_email="jonas.depoix@web.de",
|
||||||
description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",
|
description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",
|
||||||
|
@ -29,7 +26,9 @@ setuptools.setup(
|
||||||
"License :: OSI Approved :: MIT License",
|
"License :: OSI Approved :: MIT License",
|
||||||
"Operating System :: OS Independent",
|
"Operating System :: OS Independent",
|
||||||
),
|
),
|
||||||
install_requires=get_requirements(),
|
install_requires=[
|
||||||
|
'requests',
|
||||||
|
],
|
||||||
entry_points={
|
entry_points={
|
||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
'youtube_transcript_api = youtube_transcript_api.__main__:main',
|
'youtube_transcript_api = youtube_transcript_api.__main__:main',
|
||||||
|
|
|
@ -1,3 +1,9 @@
|
||||||
|
import sys
|
||||||
|
|
||||||
|
if sys.version_info.major == 2:
|
||||||
|
reload(sys)
|
||||||
|
sys.setdefaultencoding('utf-8')
|
||||||
|
|
||||||
from xml.etree import ElementTree
|
from xml.etree import ElementTree
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
@ -30,14 +36,18 @@ class YouTubeTranscriptApi():
|
||||||
)
|
)
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_transcripts(video_ids, continue_after_error=False):
|
def get_transcripts(video_ids, languages=None, continue_after_error=False):
|
||||||
"""
|
"""
|
||||||
Retrieves the transcripts for a list of videos.
|
Retrieves the transcripts for a list of videos.
|
||||||
|
|
||||||
:param video_ids: a list of youtube video ids
|
:param video_ids: a list of youtube video ids
|
||||||
:type video_ids: [str]
|
:type video_ids: [str]
|
||||||
|
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||||
|
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
|
||||||
|
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||||
|
play around with the language codes a bit, to find the one which is working for you!
|
||||||
|
:type languages: [str]
|
||||||
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
||||||
one of the video transcripts
|
one of the video transcripts
|
||||||
:type continue_after_error: bool
|
:type continue_after_error: bool
|
||||||
|
@ -50,7 +60,7 @@ class YouTubeTranscriptApi():
|
||||||
|
|
||||||
for video_id in video_ids:
|
for video_id in video_ids:
|
||||||
try:
|
try:
|
||||||
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id)
|
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages)
|
||||||
except Exception as exception:
|
except Exception as exception:
|
||||||
if not continue_after_error:
|
if not continue_after_error:
|
||||||
raise exception
|
raise exception
|
||||||
|
@ -60,17 +70,22 @@ class YouTubeTranscriptApi():
|
||||||
return data, unretrievable_videos
|
return data, unretrievable_videos
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_transcript(video_id):
|
def get_transcript(video_id, languages=None):
|
||||||
"""
|
"""
|
||||||
Retrieves the transcript for a single video.
|
Retrieves the transcript for a single video.
|
||||||
|
|
||||||
:param video_id: the youtube video id
|
:param video_id: the youtube video id
|
||||||
:type video_id: str
|
:type video_id: str
|
||||||
|
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||||
|
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
|
||||||
|
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
||||||
|
play around with the language codes a bit, to find the one which is working for you!
|
||||||
|
:type languages: [str]
|
||||||
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||||
:rtype: [{'text': str, 'start': float, 'end': float}]
|
:rtype: [{'text': str, 'start': float, 'end': float}]
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse()
|
return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.error(
|
logger.error(
|
||||||
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
|
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
|
||||||
|
@ -83,26 +98,36 @@ class YouTubeTranscriptApi():
|
||||||
class _TranscriptFetcher():
|
class _TranscriptFetcher():
|
||||||
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
||||||
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
|
||||||
|
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
|
||||||
|
|
||||||
def __init__(self, video_id):
|
def __init__(self, video_id, languages):
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
|
self.languages = languages
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self):
|
||||||
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
|
||||||
|
|
||||||
timedtext_url_start = fetched_site.find('timedtext')
|
timedtext_url_start = fetched_site.find('timedtext')
|
||||||
|
|
||||||
return requests.get(
|
for language in (self.languages if self.languages else [None,]):
|
||||||
self.API_BASE_URL.format(
|
response = self._execute_api_request(fetched_site, timedtext_url_start, language)
|
||||||
api_url=fetched_site[
|
if response:
|
||||||
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
|
return response
|
||||||
].replace(
|
|
||||||
'\\u0026', '&'
|
return None
|
||||||
).replace(
|
|
||||||
'\\', ''
|
def _execute_api_request(self, fetched_site, timedtext_url_start, language):
|
||||||
)
|
url = self.API_BASE_URL.format(
|
||||||
|
api_url=fetched_site[
|
||||||
|
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
|
||||||
|
].replace(
|
||||||
|
'\\u0026', '&'
|
||||||
|
).replace(
|
||||||
|
'\\', ''
|
||||||
)
|
)
|
||||||
).text
|
)
|
||||||
|
if language:
|
||||||
|
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
|
||||||
|
return requests.get(url).text
|
||||||
|
|
||||||
|
|
||||||
class _TranscriptParser():
|
class _TranscriptParser():
|
||||||
|
|
Loading…
Reference in New Issue