Merge pull request #7 from jdepoix/feature/ISSUE-6

YouTubeTranscriptApi now supports retrieving transcripts for given languages
This commit is contained in:
jdepoix 2019-02-21 12:57:11 +01:00 committed by GitHub
commit 22927a4d6b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 61 additions and 26 deletions

3
.gitignore vendored
View File

@ -1,7 +1,8 @@
.idea
.venv
virtualenv
*.pyc
dist
build
*.egg-info
upload_new_version.sh
upload_new_version.sh

View File

@ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this:
]
```
You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English).
```python
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
```
It's a list of language codes in a descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you!
To get transcripts for a list of video ids you can call:
```python
YouTubeTranscriptApi.get_transcripts(video_ids)
YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en'])
```
`languages` is also optional here.
### CLI
Execute the CLI script using the video ids as parameters and the results will be printed out to the command line:
@ -70,4 +80,4 @@ youtube_transcript_api --json <first_video_id> <second_video_id> ... > transcrip
## Warning
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know!
This code uses an undocumented part of the YouTube API, which is called by the YouTube web-client. So there is no guarantee that it won't stop working tomorrow, if they change how things work. I will however do my best to make things working again as soon as possible if that happens. So if it stops working, let me know!

View File

@ -8,13 +8,10 @@ def _get_file_content(file_name):
def get_long_description():
return _get_file_content('README.md')
def get_requirements():
return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n')))
setuptools.setup(
name="youtube_transcript_api",
version="0.1.1",
version="0.1.2",
author="Jonas Depoix",
author_email="jonas.depoix@web.de",
description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!",
@ -29,7 +26,9 @@ setuptools.setup(
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
),
install_requires=get_requirements(),
install_requires=[
'requests',
],
entry_points={
'console_scripts': [
'youtube_transcript_api = youtube_transcript_api.__main__:main',

View File

@ -11,7 +11,7 @@ from ._api import YouTubeTranscriptApi
def main():
logging.basicConfig()
if len(sys.argv) <= 1:
print('No YouTube video id was found')
elif sys.argv[1] == '--json':

View File

@ -1,3 +1,9 @@
import sys
if sys.version_info.major == 2:
reload(sys)
sys.setdefaultencoding('utf-8')
from xml.etree import ElementTree
import re
@ -30,14 +36,18 @@ class YouTubeTranscriptApi():
)
self.video_id = video_id
@staticmethod
def get_transcripts(video_ids, continue_after_error=False):
def get_transcripts(video_ids, languages=None, continue_after_error=False):
"""
Retrieves the transcripts for a list of videos.
:param video_ids: a list of youtube video ids
:type video_ids: [str]
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str]
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
one of the video transcripts
:type continue_after_error: bool
@ -50,7 +60,7 @@ class YouTubeTranscriptApi():
for video_id in video_ids:
try:
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id)
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages)
except Exception as exception:
if not continue_after_error:
raise exception
@ -60,17 +70,22 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos
@staticmethod
def get_transcript(video_id):
def get_transcript(video_id, languages=None):
"""
Retrieves the transcript for a single video.
:param video_id: the youtube video id
:type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you!
:type languages: [str]
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
:rtype: [{'text': str, 'start': float, 'end': float}]
"""
try:
return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse()
return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse()
except Exception:
logger.error(
YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format(
@ -83,26 +98,36 @@ class YouTubeTranscriptApi():
class _TranscriptFetcher():
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
def __init__(self, video_id):
def __init__(self, video_id, languages):
self.video_id = video_id
self.languages = languages
def fetch(self):
fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
timedtext_url_start = fetched_site.find('timedtext')
return requests.get(
self.API_BASE_URL.format(
api_url=fetched_site[
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
].replace(
'\\u0026', '&'
).replace(
'\\', ''
)
for language in (self.languages if self.languages else [None,]):
response = self._execute_api_request(fetched_site, timedtext_url_start, language)
if response:
return response
return None
def _execute_api_request(self, fetched_site, timedtext_url_start, language):
url = self.API_BASE_URL.format(
api_url=fetched_site[
timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"')
].replace(
'\\u0026', '&'
).replace(
'\\', ''
)
).text
)
if language:
url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url)
return requests.get(url).text
class _TranscriptParser():