Removed unnecessary language variables, sort split matches by len while ignoring name argument

This commit is contained in:
danielcliu 2019-11-06 21:20:51 -08:00
parent 7ac7d3266b
commit c7cb3117be
1 changed file with 17 additions and 11 deletions

View File

@ -40,7 +40,7 @@ class YouTubeTranscriptApi():
self.video_id = video_id self.video_id = video_id
@classmethod @classmethod
def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None): def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -75,7 +75,7 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos return data, unretrievable_videos
@classmethod @classmethod
def get_transcript(cls, video_id, languages=None, proxies=None): def get_transcript(cls, video_id, languages=['en'], proxies=None):
""" """
Retrieves the transcript for a single video. Retrieves the transcript for a single video.
@ -100,14 +100,14 @@ class YouTubeTranscriptApi():
class _TranscriptFetcher(): class _TranscriptFetcher():
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}' WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
API_BASE_URL = 'https://www.youtube.com/api/' API_BASE_URL = 'https://www.youtube.com/api/'
LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
TIMEDTEXT_STRING = 'timedtext?v=' TIMEDTEXT_STRING = 'timedtext?v='
NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)')
def __init__(self, video_id, languages, proxies): def __init__(self, video_id, languages, proxies):
self.video_id = video_id self.video_id = video_id
self.languages = languages self.languages = languages
print(languages)
self.proxies = proxies self.proxies = proxies
self.matched_splits = []
def fetch(self): def fetch(self):
if self.proxies: if self.proxies:
@ -118,19 +118,25 @@ class _TranscriptFetcher():
.replace('\\u0026', '&') .replace('\\u0026', '&')
.replace('\\', '') .replace('\\', '')
for split in fetched_site.split(self.TIMEDTEXT_STRING)] for split in fetched_site.split(self.TIMEDTEXT_STRING)]
for language in (self.languages if self.languages else ['en']): matched_splits = []
self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split] for language in self.languages:
if self.matched_splits: matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
if matched_splits:
break break
if self.matched_splits: if matched_splits:
timedtext_url = min(self.matched_splits, key=len) timedtext_url = min(matched_splits, key=self._sort_splits)
response = self._execute_api_request(timedtext_url, language) response = self._execute_api_request(timedtext_url)
if response: if response:
return response return response
return None return None
def _execute_api_request(self, timedtext_url, language): #Sorting the matched splits by string length because we want non-asr options returned first
#However, we don't want to include the length of the 'name' argument as it could possibly throw this off
def _sort_splits(self, matched_split):
return len(re.sub(self.NAME_REGEX, r'\1', matched_split))
def _execute_api_request(self, timedtext_url):
url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url) url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url)
if self.proxies: if self.proxies:
return requests.get(url, proxies=self.proxies).text return requests.get(url, proxies=self.proxies).text