Removed unnecessary language variables, sort split matches by len while ignoring name argument
parent 7ac7d3266b
commit c7cb3117be
@@ -40,7 +40,7 @@ class YouTubeTranscriptApi():
             self.video_id = video_id
 
     @classmethod
-    def get_transcripts(cls, video_ids, languages=None, continue_after_error=False, proxies=None):
+    def get_transcripts(cls, video_ids, languages=['en'], continue_after_error=False, proxies=None):
         """
         Retrieves the transcripts for a list of videos.
 
@@ -75,7 +75,7 @@ class YouTubeTranscriptApi():
         return data, unretrievable_videos
 
     @classmethod
-    def get_transcript(cls, video_id, languages=None, proxies=None):
+    def get_transcript(cls, video_id, languages=['en'], proxies=None):
         """
         Retrieves the transcript for a single video.
 
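With this change the English default lives in the signature itself, so callers that omit languages get ['en'] without the None fallback that fetch() used to apply. A minimal usage sketch (the video ID is a placeholder):

from youtube_transcript_api import YouTubeTranscriptApi

# Uses the new default languages=['en']
transcript = YouTubeTranscriptApi.get_transcript('VIDEO_ID')

# Explicit preference order: try German first, then English
transcript = YouTubeTranscriptApi.get_transcript('VIDEO_ID', languages=['de', 'en'])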
@@ -100,14 +100,14 @@ class _TranscriptFetcher():
 class _TranscriptFetcher():
     WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
     API_BASE_URL = 'https://www.youtube.com/api/'
-    LANGUAGE_REGEX = re.compile(r'(&lang=.*&)|(&lang=.*)')
     TIMEDTEXT_STRING = 'timedtext?v='
+    NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)')
 
     def __init__(self, video_id, languages, proxies):
         self.video_id = video_id
         self.languages = languages
+        print(languages)
         self.proxies = proxies
-        self.matched_splits = []
 
     def fetch(self):
         if self.proxies:
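For reference, this is roughly how the new NAME_REGEX substitution behaves on a candidate timedtext query string (the sample string is invented). With r'\1' as the replacement, a trailing &name=... argument falls under group 2; group 1 is then unmatched, and re.sub on Python 3.5+ substitutes an empty string for it, stripping the name:

import re

NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)')

# Invented timedtext query string with a trailing 'name' argument
split = 'abc123&lang=en&name=English'

stripped = re.sub(NAME_REGEX, r'\1', split)
print(stripped)       # abc123&lang=en
print(len(stripped))  # 14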
@@ -118,19 +118,25 @@ class _TranscriptFetcher():
                 .replace('\\u0026', '&')
                 .replace('\\', '')
             for split in fetched_site.split(self.TIMEDTEXT_STRING)]
-        for language in (self.languages if self.languages else ['en']):
-            self.matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
-            if self.matched_splits:
+        matched_splits = []
+        for language in self.languages:
+            matched_splits = [split for split in timedtext_splits if '&lang={}'.format(language) in split]
+            if matched_splits:
                 break
-        if self.matched_splits:
-            timedtext_url = min(self.matched_splits, key=len)
-            response = self._execute_api_request(timedtext_url, language)
+        if matched_splits:
+            timedtext_url = min(matched_splits, key=self._sort_splits)
+            response = self._execute_api_request(timedtext_url)
             if response:
                 return response
 
         return None
 
-    def _execute_api_request(self, timedtext_url, language):
+    # Sorting the matched splits by string length because we want non-asr options returned first.
+    # However, we don't want to include the length of the 'name' argument as it could possibly throw this off.
+    def _sort_splits(self, matched_split):
+        return len(re.sub(self.NAME_REGEX, r'\1', matched_split))
+
+    def _execute_api_request(self, timedtext_url):
         url = '{}{}{}'.format(self.API_BASE_URL, self.TIMEDTEXT_STRING, timedtext_url)
         if self.proxies:
             return requests.get(url, proxies=self.proxies).text
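To see why min() now takes _sort_splits instead of len as its key, consider two invented candidates: a manually created track whose long name argument inflates its raw length, and an auto-generated (kind=asr) track. Stripping the trailing name before measuring lets the non-asr option win; a standalone sketch of that comparison:

import re

NAME_REGEX = re.compile(r'(&name=.*&)|(&name=.*)')

def sort_key(matched_split):
    # Length with a trailing '&name=...' argument ignored
    return len(re.sub(NAME_REGEX, r'\1', matched_split))

splits = [
    'abc123&lang=en&name=English%20(manual)',  # manual track, long name
    'abc123&lang=en&kind=asr',                 # auto-generated track
]

print(min(splits, key=len))       # picks the asr split (shorter raw string)
print(min(splits, key=sort_key))  # picks the manual split (name ignored)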