Removed unnecessary language variables; sort split matches by length while ignoring the name argument

This commit is contained in:
danielcliu 2019-11-06 21:20:51 -08:00
parent 7ac7d3266b
commit c7cb3117be
1 changed files with 17 additions and 11 deletions

View File

@ -40,7 +40,7 @@ class YouTubeTranscriptApi():
self.video_id = video_id
@classmethod
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None):
"""
Retrieves the transcripts for a list of videos.
@ -75,7 +75,7 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos
@classmethod
def get_transcript(cls, video_id, languages=None, proxies=None):
def get_transcript(cls, video_id, languages=['en'], proxies=None):
"""
Retrieves the transcript for a single video.
@ -100,14 +100,14 @@ class YouTubeTranscriptApi():
class _TranscriptFetcher():
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
API_BASE_URL = 'https://www.youtube.com/api/'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
TIMEDTEXT_STRING = 'timedtext?v='
NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)')
def __init__(self, video_id, languages, proxies):
self.video_id = video_id
self.languages = languages
print(languages)
self.proxies = proxies
self.matched_splits = []
def fetch(self):
if self.proxies:
@ -118,19 +118,25 @@ class _TranscriptFetcher():
.replace('\\u0026', '&')
.replace('\\', '')
for split in fetched_site.split(self.TIMEDTEXT_STRING)]
for language in (self.languages if self.languages else ['en']):
self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
if self.matched_splits:
matched_splits = []
for language in self.languages:
matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
if matched_splits:
break
if self.matched_splits:
timedtext_url = min(self.matched_splits, key=len)
response = self._execute_api_request(timedtext_url, language)
if matched_splits:
timedtext_url = min(matched_splits, key=self._sort_splits)
response = self._execute_api_request(timedtext_url)
if response:
return response
return None
def _execute_api_request(self, timedtext_url, language):
# Sort the matched splits by string length because we want non-ASR options returned first.
# However, we don't want to include the length of the 'name' argument, as it could possibly throw this off.
def _sort_splits(self, matched_split):
return len(re.sub(self.NAME_REGEX, r'\1', matched_split))
def _execute_api_request(self, timedtext_url):
url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url)
if self.proxies:
return requests.get(url, proxies=self.proxies).text