YouTubeTranscriptApi now supports retrieving transcripts for given languages
This commit is contained in:
		
							parent
							
								
									48cb31fe3e
								
							
						
					
					
						commit
						18fb0cbaec
					
				|  | @ -1,5 +1,6 @@ | ||||||
| .idea | .idea | ||||||
| .venv | .venv | ||||||
|  | virtualenv | ||||||
| *.pyc | *.pyc | ||||||
| dist | dist | ||||||
| build | build | ||||||
|  |  | ||||||
							
								
								
									
										12
									
								
								README.md
								
								
								
								
							
							
						
						
									
										12
									
								
								README.md
								
								
								
								
							|  | @ -48,12 +48,22 @@ This will return a list of dictionaries looking somewhat like this: | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | You can also add the `languages` param if you want to make sure the transcripts are retrieved in your desired language (it usually defaults to English). | ||||||
|  | 
 | ||||||
|  | ```python | ||||||
|  | YouTubeTranscriptApi.get_transcript(video_id, languages=['de', 'en']) | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | It's a list of language codes in a descending priority. In this example it will first try to fetch the German transcript (`'de'`) and then fetch the English transcript (`'en'`) if it fails to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to play around with the language codes a bit, to find the one which is working for you! | ||||||
|  | 
 | ||||||
| To get transcripts for a list of video ids you can call: | To get transcripts for a list of video ids you can call: | ||||||
| 
 | 
 | ||||||
| ```python | ```python | ||||||
| YouTubeTranscriptApi.get_transcripts(video_ids) | YouTubeTranscriptApi.get_transcripts(video_ids, languages=['de', 'en']) | ||||||
| ``` | ``` | ||||||
| 
 | 
 | ||||||
|  | `languages` is also optional here. | ||||||
|  | 
 | ||||||
| ### CLI | ### CLI | ||||||
| 
 | 
 | ||||||
| Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: | Execute the CLI script using the video ids as parameters and the results will be printed out to the command line: | ||||||
|  |  | ||||||
							
								
								
									
										9
									
								
								setup.py
								
								
								
								
							
							
						
						
									
										9
									
								
								setup.py
								
								
								
								
							|  | @ -8,13 +8,10 @@ def _get_file_content(file_name): | ||||||
| def get_long_description(): | def get_long_description(): | ||||||
|     return _get_file_content('README.md') |     return _get_file_content('README.md') | ||||||
| 
 | 
 | ||||||
| def get_requirements(): |  | ||||||
|     return list(filter(lambda line: line != '' and not line.startswith('#'), _get_file_content('requirements.txt').split('\n'))) |  | ||||||
| 
 |  | ||||||
| 
 | 
 | ||||||
| setuptools.setup( | setuptools.setup( | ||||||
|     name="youtube_transcript_api", |     name="youtube_transcript_api", | ||||||
|     version="0.1.1", |     version="0.1.2", | ||||||
|     author="Jonas Depoix", |     author="Jonas Depoix", | ||||||
|     author_email="jonas.depoix@web.de", |     author_email="jonas.depoix@web.de", | ||||||
|     description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!", |     description="This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!", | ||||||
|  | @ -29,7 +26,9 @@ setuptools.setup( | ||||||
|         "License :: OSI Approved :: MIT License", |         "License :: OSI Approved :: MIT License", | ||||||
|         "Operating System :: OS Independent", |         "Operating System :: OS Independent", | ||||||
|     ), |     ), | ||||||
|     install_requires=get_requirements(), |     install_requires=[ | ||||||
|  |         'requests', | ||||||
|  |     ], | ||||||
|     entry_points={ |     entry_points={ | ||||||
|         'console_scripts': [ |         'console_scripts': [ | ||||||
|             'youtube_transcript_api = youtube_transcript_api.__main__:main', |             'youtube_transcript_api = youtube_transcript_api.__main__:main', | ||||||
|  |  | ||||||
|  | @ -1,3 +1,9 @@ | ||||||
|  | import sys | ||||||
|  | 
 | ||||||
|  | if sys.version_info.major == 2: | ||||||
|  |     reload(sys) | ||||||
|  |     sys.setdefaultencoding('utf-8') | ||||||
|  | 
 | ||||||
| from xml.etree import ElementTree | from xml.etree import ElementTree | ||||||
| 
 | 
 | ||||||
| import re | import re | ||||||
|  | @ -30,14 +36,18 @@ class YouTubeTranscriptApi(): | ||||||
|             ) |             ) | ||||||
|             self.video_id = video_id |             self.video_id = video_id | ||||||
| 
 | 
 | ||||||
| 
 |  | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def get_transcripts(video_ids, continue_after_error=False): |     def get_transcripts(video_ids, languages=None, continue_after_error=False): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcripts for a list of videos. |         Retrieves the transcripts for a list of videos. | ||||||
| 
 | 
 | ||||||
|         :param video_ids: a list of youtube video ids |         :param video_ids: a list of youtube video ids | ||||||
|         :type video_ids: [str] |         :type video_ids: [str] | ||||||
|  |         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||||
|  |         it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to | ||||||
|  |         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||||
|  |         play around with the language codes a bit, to find the one which is working for you! | ||||||
|  |         :type languages: [str] | ||||||
|         :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving |         :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving | ||||||
|         one of the video transcripts |         one of the video transcripts | ||||||
|         :type continue_after_error: bool |         :type continue_after_error: bool | ||||||
|  | @ -50,7 +60,7 @@ class YouTubeTranscriptApi(): | ||||||
| 
 | 
 | ||||||
|         for video_id in video_ids: |         for video_id in video_ids: | ||||||
|             try: |             try: | ||||||
|                 data[video_id] = YouTubeTranscriptApi.get_transcript(video_id) |                 data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages) | ||||||
|             except Exception as exception: |             except Exception as exception: | ||||||
|                 if not continue_after_error: |                 if not continue_after_error: | ||||||
|                     raise exception |                     raise exception | ||||||
|  | @ -60,17 +70,22 @@ class YouTubeTranscriptApi(): | ||||||
|         return data, unretrievable_videos |         return data, unretrievable_videos | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def get_transcript(video_id): |     def get_transcript(video_id, languages=None): | ||||||
|         """ |         """ | ||||||
|         Retrieves the transcript for a single video. |         Retrieves the transcript for a single video. | ||||||
| 
 | 
 | ||||||
|         :param video_id: the youtube video id |         :param video_id: the youtube video id | ||||||
|         :type video_id: str |         :type video_id: str | ||||||
|  |         :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] | ||||||
|  |         it will first try to fetch the German transcript (de) and then fetch the English transcript (en) if it fails to | ||||||
|  |         do so. As I can't provide a complete list of all working language codes with full certainty, you may have to | ||||||
|  |         play around with the language codes a bit, to find the one which is working for you! | ||||||
|  |         :type languages: [str] | ||||||
|         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys |         :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys | ||||||
|         :rtype: [{'text': str, 'start': float, 'duration': float}] |         :rtype: [{'text': str, 'start': float, 'duration': float}] | ||||||
|         """ |         """ | ||||||
|         try: |         try: | ||||||
|             return _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse() |             return _TranscriptParser(_TranscriptFetcher(video_id, languages).fetch()).parse() | ||||||
|         except Exception: |         except Exception: | ||||||
|             logger.error( |             logger.error( | ||||||
|                 YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( |                 YouTubeTranscriptApi.CouldNotRetrieveTranscript.ERROR_MESSAGE.format( | ||||||
|  | @ -83,17 +98,25 @@ class YouTubeTranscriptApi(): | ||||||
| class _TranscriptFetcher(): | class _TranscriptFetcher(): | ||||||
|     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' |     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' | ||||||
|     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' |     API_BASE_URL = 'https://www.youtube.com/api/{api_url}' | ||||||
|  |     LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)') | ||||||
| 
 | 
 | ||||||
|     def __init__(self, video_id): |     def __init__(self, video_id, languages): | ||||||
|         self.video_id = video_id |         self.video_id = video_id | ||||||
|  |         self.languages = languages | ||||||
| 
 | 
 | ||||||
|     def fetch(self): |     def fetch(self): | ||||||
|         fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text |         fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text | ||||||
| 
 |  | ||||||
|         timedtext_url_start = fetched_site.find('timedtext') |         timedtext_url_start = fetched_site.find('timedtext') | ||||||
| 
 | 
 | ||||||
|         return requests.get( |         for language in (self.languages if self.languages else [None,]): | ||||||
|             self.API_BASE_URL.format( |             response = self._execute_api_request(fetched_site, timedtext_url_start, language) | ||||||
|  |             if response: | ||||||
|  |                 return response | ||||||
|  | 
 | ||||||
|  |         return None | ||||||
|  | 
 | ||||||
|  |     def _execute_api_request(self, fetched_site, timedtext_url_start, language): | ||||||
|  |         url = self.API_BASE_URL.format( | ||||||
|             api_url=fetched_site[ |             api_url=fetched_site[ | ||||||
|                 timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') |                 timedtext_url_start:timedtext_url_start + fetched_site[timedtext_url_start:].find('"') | ||||||
|             ].replace( |             ].replace( | ||||||
|  | @ -102,7 +125,9 @@ class _TranscriptFetcher(): | ||||||
|                 '\\', '' |                 '\\', '' | ||||||
|             ) |             ) | ||||||
|         ) |         ) | ||||||
|         ).text |         if language: | ||||||
|  |             url = re.sub(self.LANGUAGE_REGEX, '&lang={language}&'.format(language=language), url) | ||||||
|  |         return requests.get(url).text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class _TranscriptParser(): | class _TranscriptParser(): | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue