fixed bug; added docstrings for public methods
This commit is contained in:
parent
df417be915
commit
c2c49c3c17
|
@ -1,3 +1,3 @@
|
||||||
from ._api import YouTubeTranscriptApi
|
from ._api import YouTubeTranscriptApi
|
||||||
from ._transcripts import TranscriptDataFetcher, TranscriptData, Transcript
|
from ._transcripts import TranscriptList, Transcript
|
||||||
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
|
from ._errors import TranscriptsDisabled, NoTranscriptFound, CouldNotRetrieveTranscript, VideoUnavailable
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from ._transcripts import TranscriptDataFetcher
|
from ._transcripts import TranscriptListFetcher
|
||||||
|
|
||||||
|
|
||||||
class YouTubeTranscriptApi():
|
class YouTubeTranscriptApi():
|
||||||
|
@ -13,8 +13,7 @@ class YouTubeTranscriptApi():
|
||||||
:type video_ids: [str]
|
:type video_ids: [str]
|
||||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
do so.
|
||||||
play around with the language codes a bit, to find the one which is working for you!
|
|
||||||
:type languages: [str]
|
:type languages: [str]
|
||||||
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
:param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
|
||||||
one of the video transcripts
|
one of the video transcripts
|
||||||
|
@ -23,7 +22,7 @@ class YouTubeTranscriptApi():
|
||||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
:return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
|
||||||
video ids, which could not be retrieved
|
video ids, which could not be retrieved
|
||||||
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]}
|
:rtype: ({str: [{'text': str, 'start': float, 'end': float}]}, [str]})
|
||||||
"""
|
"""
|
||||||
data = {}
|
data = {}
|
||||||
unretrievable_videos = []
|
unretrievable_videos = []
|
||||||
|
@ -48,8 +47,7 @@ class YouTubeTranscriptApi():
|
||||||
:type video_id: str
|
:type video_id: str
|
||||||
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
|
||||||
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
|
||||||
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
|
do so.
|
||||||
play around with the language codes a bit, to find the one which is working for you!
|
|
||||||
:type languages: [str]
|
:type languages: [str]
|
||||||
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
:param proxies: a dictionary mapping of http and https proxies to be used for the network requests
|
||||||
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
:type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
|
||||||
|
@ -58,4 +56,4 @@ class YouTubeTranscriptApi():
|
||||||
"""
|
"""
|
||||||
with requests.Session() as http_client:
|
with requests.Session() as http_client:
|
||||||
http_client.proxies = proxies if proxies else {}
|
http_client.proxies = proxies if proxies else {}
|
||||||
return TranscriptDataFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
return TranscriptListFetcher(http_client).fetch(video_id).find_transcript(languages).fetch()
|
||||||
|
|
|
@ -16,12 +16,12 @@ from ._errors import VideoUnavailable, NoTranscriptFound, TranscriptsDisabled
|
||||||
from ._settings import WATCH_URL
|
from ._settings import WATCH_URL
|
||||||
|
|
||||||
|
|
||||||
class TranscriptDataFetcher():
|
class TranscriptListFetcher():
|
||||||
def __init__(self, http_client):
|
def __init__(self, http_client):
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
|
|
||||||
def fetch(self, video_id):
|
def fetch(self, video_id):
|
||||||
return TranscriptData.build(
|
return TranscriptList.build(
|
||||||
self._http_client,
|
self._http_client,
|
||||||
video_id,
|
video_id,
|
||||||
self._extract_captions_json(self._fetch_html(video_id), video_id)
|
self._extract_captions_json(self._fetch_html(video_id), video_id)
|
||||||
|
@ -48,48 +48,89 @@ class TranscriptDataFetcher():
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class TranscriptData():
|
class TranscriptList():
|
||||||
|
"""
|
||||||
|
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
||||||
|
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
||||||
|
"""
|
||||||
|
|
||||||
# TODO implement iterator
|
# TODO implement iterator
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, video_id, manually_created_transcripts, generated_transcripts):
|
||||||
self, http_client, video_id, manually_created_transcripts, generated_transcripts, translation_languages
|
"""
|
||||||
):
|
The constructor is only for internal use. Use the static build method instead.
|
||||||
self._http_client = http_client
|
|
||||||
|
:param video_id: the id of the video this TranscriptList is for
|
||||||
|
:type video_id: str
|
||||||
|
:param manually_created_transcripts: dict mapping language codes to the manually created transcripts
|
||||||
|
:type manually_created_transcripts: dict[str, Transcript]
|
||||||
|
:param generated_transcripts: dict mapping language codes to the generated transcripts
|
||||||
|
:type generated_transcripts: dict[str, Transcript]
|
||||||
|
"""
|
||||||
self.video_id = video_id
|
self.video_id = video_id
|
||||||
self._manually_created_transcripts = manually_created_transcripts
|
self._manually_created_transcripts = manually_created_transcripts
|
||||||
self._generated_transcripts = generated_transcripts
|
self._generated_transcripts = generated_transcripts
|
||||||
self._translation_languages = translation_languages
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build(http_client, video_id, captions_json):
|
def build(http_client, video_id, captions_json):
|
||||||
manually_created_transcripts = []
|
"""
|
||||||
generated_transcripts = []
|
Factory method for TranscriptList.
|
||||||
|
|
||||||
for caption in captions_json['captionTracks']:
|
:param http_client: http client which is used to make the transcript retrieving http calls
|
||||||
(generated_transcripts if caption.get('kind', '') == 'asr' else generated_transcripts).append(
|
:type http_client: requests.Session
|
||||||
{
|
:param video_id: the id of the video this TranscriptList is for
|
||||||
'url': caption['baseUrl'],
|
:type video_id: str
|
||||||
'language': caption['name']['simpleText'],
|
:param captions_json: the JSON parsed from the YouTube page's static HTML
|
||||||
'language_code': caption['languageCode'],
|
:type captions_json: dict
|
||||||
'is_generated': caption.get('kind', '') == 'asr',
|
:return: the created TranscriptList
|
||||||
'is_translatable': caption['isTranslatable'],
|
:rtype: TranscriptList
|
||||||
}
|
"""
|
||||||
)
|
translation_languages = [
|
||||||
|
|
||||||
return TranscriptData(
|
|
||||||
http_client,
|
|
||||||
video_id,
|
|
||||||
manually_created_transcripts,
|
|
||||||
generated_transcripts,
|
|
||||||
[
|
|
||||||
{
|
{
|
||||||
'language': translation_language['languageName']['simpleText'],
|
'language': translation_language['languageName']['simpleText'],
|
||||||
'language_code': translation_language['languageCode'],
|
'language_code': translation_language['languageCode'],
|
||||||
} for translation_language in captions_json['translationLanguages']
|
} for translation_language in captions_json['translationLanguages']
|
||||||
],
|
]
|
||||||
|
|
||||||
|
manually_created_transcripts = {}
|
||||||
|
generated_transcripts = {}
|
||||||
|
|
||||||
|
for caption in captions_json['captionTracks']:
|
||||||
|
if caption.get('kind', '') == 'asr':
|
||||||
|
transcript_dict = generated_transcripts
|
||||||
|
else:
|
||||||
|
transcript_dict = manually_created_transcripts
|
||||||
|
|
||||||
|
transcript_dict[caption['languageCode']] = Transcript(
|
||||||
|
http_client,
|
||||||
|
video_id,
|
||||||
|
caption['baseUrl'],
|
||||||
|
caption['name']['simpleText'],
|
||||||
|
caption['languageCode'],
|
||||||
|
caption.get('kind', '') == 'asr',
|
||||||
|
translation_languages if caption['isTranslatable'] else []
|
||||||
|
)
|
||||||
|
|
||||||
|
return TranscriptList(
|
||||||
|
video_id,
|
||||||
|
manually_created_transcripts,
|
||||||
|
generated_transcripts,
|
||||||
)
|
)
|
||||||
|
|
||||||
def find_transcript(self, language_codes):
|
def find_transcript(self, language_codes):
|
||||||
|
"""
|
||||||
|
Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
|
||||||
|
are found, generated transcripts are used. If you only want manually created transcripts use
|
||||||
|
find_manually_created_transcript instead.
|
||||||
|
|
||||||
|
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||||
|
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||||
|
it fails to do so.
|
||||||
|
:type language_codes: [str]
|
||||||
|
:return: the found Transcript
|
||||||
|
:rtype: Transcript
|
||||||
|
:raises: NoTranscriptFound
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
return self.find_manually_created_transcript(language_codes)
|
return self.find_manually_created_transcript(language_codes)
|
||||||
except NoTranscriptFound:
|
except NoTranscriptFound:
|
||||||
|
@ -98,25 +139,39 @@ class TranscriptData():
|
||||||
return self.find_generated_transcript(language_codes)
|
return self.find_generated_transcript(language_codes)
|
||||||
|
|
||||||
def find_generated_transcript(self, language_codes):
|
def find_generated_transcript(self, language_codes):
|
||||||
|
"""
|
||||||
|
Finds an automatically generated transcript for a given language code.
|
||||||
|
|
||||||
|
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||||
|
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||||
|
it fails to do so.
|
||||||
|
:type language_codes: [str]
|
||||||
|
:return: the found Transcript
|
||||||
|
:rtype: Transcript
|
||||||
|
:raises: NoTranscriptFound
|
||||||
|
"""
|
||||||
return self._find_transcript(language_codes, generated=True)
|
return self._find_transcript(language_codes, generated=True)
|
||||||
|
|
||||||
def find_manually_created_transcript(self, language_codes):
|
def find_manually_created_transcript(self, language_codes):
|
||||||
|
"""
|
||||||
|
Finds a manually created transcript for a given language code.
|
||||||
|
|
||||||
|
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
||||||
|
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
||||||
|
it fails to do so.
|
||||||
|
:type language_codes: [str]
|
||||||
|
:return: the found Transcript
|
||||||
|
:rtype: Transcript
|
||||||
|
:raises: NoTranscriptFound
|
||||||
|
"""
|
||||||
return self._find_transcript(language_codes, generated=False)
|
return self._find_transcript(language_codes, generated=False)
|
||||||
|
|
||||||
def _find_transcript(self, language_codes, generated):
|
def _find_transcript(self, language_codes, generated):
|
||||||
transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
|
transcripts = self._generated_transcripts if generated else self._manually_created_transcripts
|
||||||
|
|
||||||
for language_code in language_codes:
|
for language_code in language_codes:
|
||||||
for transcript in transcripts:
|
if language_code in transcripts:
|
||||||
if transcript['language_code'] == language_code:
|
return transcripts[language_code]
|
||||||
return Transcript(
|
|
||||||
self._http_client,
|
|
||||||
transcript['url'],
|
|
||||||
transcript['language'],
|
|
||||||
transcript['language_code'],
|
|
||||||
transcript['is_generated'],
|
|
||||||
self._translation_languages if transcript['is_translatable'] else []
|
|
||||||
)
|
|
||||||
|
|
||||||
raise NoTranscriptFound(
|
raise NoTranscriptFound(
|
||||||
self.video_id,
|
self.video_id,
|
||||||
|
@ -134,34 +189,59 @@ class TranscriptData():
|
||||||
).format(
|
).format(
|
||||||
video_id=self.video_id,
|
video_id=self.video_id,
|
||||||
available_manually_created_transcript_languages=self._get_language_description(
|
available_manually_created_transcript_languages=self._get_language_description(
|
||||||
self._manually_created_transcripts
|
self._manually_created_transcripts.values()
|
||||||
),
|
),
|
||||||
available_generated_transcripts=self._get_language_description(
|
available_generated_transcripts=self._get_language_description(
|
||||||
self._generated_transcripts
|
self._generated_transcripts.values()
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_language_description(self, transcripts):
|
def _get_language_description(self, transcripts):
|
||||||
return '\n'.join(
|
return '\n'.join(
|
||||||
' - {language_code} ("{language}")'.format(
|
' - {transcript}'.format(transcript=str(transcript))
|
||||||
language=transcript['language'],
|
for transcript in transcripts
|
||||||
language_code=transcript['language_code'],
|
|
||||||
) for transcript in transcripts
|
|
||||||
) if transcripts else 'None'
|
) if transcripts else 'None'
|
||||||
|
|
||||||
|
|
||||||
class Transcript():
|
class Transcript():
|
||||||
def __init__(self, http_client, url, language, language_code, is_generated, translation_languages):
|
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
|
||||||
|
"""
|
||||||
|
You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
|
||||||
|
TranscriptList.
|
||||||
|
|
||||||
|
:param http_client: http client which is used to make the transcript retrieving http calls
|
||||||
|
:type http_client: requests.Session
|
||||||
|
:param video_id: the id of the video this Transcript is for
|
||||||
|
:type video_id: str
|
||||||
|
:param url: the url which needs to be called to fetch the transcript
|
||||||
|
:param language: the name of the language this transcript uses
|
||||||
|
:param language_code:
|
||||||
|
:param is_generated:
|
||||||
|
:param translation_languages:
|
||||||
|
"""
|
||||||
self._http_client = http_client
|
self._http_client = http_client
|
||||||
self.url = url
|
self.video_id = video_id
|
||||||
|
self._url = url
|
||||||
self.language = language
|
self.language = language
|
||||||
self.language_code = language_code
|
self.language_code = language_code
|
||||||
self.is_generated = is_generated
|
self.is_generated = is_generated
|
||||||
self.translation_languages = translation_languages
|
self.translation_languages = translation_languages
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self):
|
||||||
|
"""
|
||||||
|
Loads the actual transcript data.
|
||||||
|
|
||||||
|
:return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
|
||||||
|
:rtype: [{'text': str, 'start': float, 'duration': float}]
|
||||||
|
"""
|
||||||
return _TranscriptParser().parse(
|
return _TranscriptParser().parse(
|
||||||
self._http_client.get(self.url).text
|
self._http_client.get(self._url).text
|
||||||
|
)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return '{language_code} ("{language}")'.format(
|
||||||
|
language=self.language,
|
||||||
|
language_code=self.language_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
# TODO integrate translations in future release
|
# TODO integrate translations in future release
|
||||||
|
|
Loading…
Reference in New Issue