transcript api implemented

2018-04-20 13:23:14 +02:00 · 2018-04-20 13:23:14 +02:00 · 55d76a158a
parent 119e694f3f
commit 55d76a158a
6 changed files with 86 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1,3 @@
 .idea
 .venv
+*.pyc
--- a/requirements.txt
+++ b/requirements.txt
@ -1 +1 @@
-selenium==3.11.0
+requests==2.18.4
--- a/src/__init__.py
+++ b/src/__init__.py
@ -0,0 +1 @@
+
--- a/src/transcript_api.py
+++ b/src/transcript_api.py
@ -0,0 +1,72 @@
+from xml.etree import ElementTree
+
+import re
+
+import logging
+
+import requests
+
+
+logger = logging.getLogger(__name__)
+
+
+class YouTubeTranscriptApi():
+    """Public entry point for retrieving transcripts of YouTube videos."""
+
+    @staticmethod
+    def get(*video_ids):
+        """Fetch and parse the transcript of every given video.
+
+        :param video_ids: IDs of the videos whose transcripts are requested.
+        :return: dict mapping each video ID whose transcript could be
+            retrieved to the list of dicts produced by ``_TranscriptParser``
+            (keys ``'text'``, ``'start'`` and ``'duration'``). Videos that
+            fail are logged and left out of the result.
+        """
+        data = {}
+
+        for video_id in video_ids:
+            try:
+                data[video_id] = _TranscriptParser(_TranscriptFetcher(video_id).fetch()).parse()
+            except Exception:
+                # Best effort: a failing video must not abort the remaining
+                # ones. logger.exception logs at ERROR level like before, but
+                # attaches the traceback so the actual cause (network error,
+                # malformed XML, ...) is not hidden behind the generic text.
+                logger.exception(
+                    'Could not get the transcript for the video {video_url}! '
+                    'Most likely subtitles have been disabled by the uploader or the video is no longer '
+                    'available.'.format(
+                        video_url=_TranscriptFetcher.WATCH_URL.format(video_id=video_id)
+                    )
+                )
+
+        return data
+
+
+class _TranscriptFetcher():
+    """Downloads the raw transcript data of a single video."""
+
+    WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
+    API_BASE_URL = 'https://www.youtube.com/api/{api_url}'
+
+    def __init__(self, video_id):
+        """Remember the ID of the video whose transcript will be fetched."""
+        self.video_id = video_id
+
+    def fetch(self):
+        """Return the body of the transcript response for ``self.video_id``.
+
+        The watch page is downloaded first; the relative transcript URL
+        found in it is then requested below ``API_BASE_URL``.
+
+        :return: text of the transcript response.
+        :raises ValueError: if the watch page contains no transcript URL.
+        """
+        fetched_site = requests.get(self.WATCH_URL.format(video_id=self.video_id)).text
+
+        return requests.get(
+            self.API_BASE_URL.format(api_url=self._extract_api_url(fetched_site))
+        ).text
+
+    @staticmethod
+    def _extract_api_url(fetched_site):
+        """Cut the relative transcript URL out of the watch page source.
+
+        The URL runs from the first occurrence of ``timedtext`` up to (not
+        including) the next double quote.
+
+        :param fetched_site: source text of the watch page.
+        :return: the URL with ``\\u0026`` turned into ``&`` and all remaining
+            backslashes removed.
+        :raises ValueError: if either delimiter is missing.
+        """
+        url_start = fetched_site.find('timedtext')
+        # str.find signals "not found" with -1; slicing with that value would
+        # silently yield a bogus URL and trigger a useless second request.
+        if url_start == -1:
+            raise ValueError('No transcript URL found in the watch page.')
+
+        url_end = fetched_site.find('"', url_start)
+        if url_end == -1:
+            raise ValueError('Transcript URL in the watch page is not terminated.')
+
+        return fetched_site[url_start:url_end].replace(
+            '\\u0026', '&'
+        ).replace(
+            '\\', ''
+        )
+
+
+class _TranscriptParser():
+    """Turns the XML transcript data into a list of plain dicts."""
+
+    # Matches anything that looks like a markup tag inside a transcript line.
+    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)
+
+    def __init__(self, plain_data):
+        """Store the XML document (as text) that ``parse`` will read."""
+        self.plain_data = plain_data
+
+    def parse(self):
+        """Parse ``self.plain_data`` into transcript entries.
+
+        Every child of the XML root that has text becomes one dict:
+
+        * ``'text'``: the element text with markup tags stripped,
+        * ``'start'``: the ``start`` attribute as float,
+        * ``'duration'``: the ``dur`` attribute as float, ``0.0`` if absent.
+
+        Children without any text (e.g. ``<text start="1"/>``) are skipped;
+        their ``text`` is ``None`` and would otherwise make the regex
+        substitution raise and discard the whole transcript.
+
+        :return: list of entry dicts in document order.
+        :raises xml.etree.ElementTree.ParseError: if the data is not
+            well-formed XML.
+        """
+        return [
+            {
+                'text': self.HTML_TAG_REGEX.sub('', xml_element.text),
+                'start': float(xml_element.attrib['start']),
+                # A missing 'dur' attribute must not cost the whole
+                # transcript, so it falls back to a zero duration.
+                'duration': float(xml_element.attrib.get('dur', '0.0')),
+            }
+            for xml_element in ElementTree.fromstring(self.plain_data)
+            if xml_element.text is not None
+        ]
--- a/transcripts.json
+++ b/transcripts.json
--- a/youtube_transcript_api.py
+++ b/youtube_transcript_api.py
@ -0,0 +1,10 @@
+#!./.venv/bin/python
+
+import sys
+
+import json
+
+from src.transcript_api import YouTubeTranscriptApi
+
+if __name__ == '__main__':
+    # Every command line argument is treated as a video ID; the resulting
+    # mapping of video ID to transcript is written to stdout as JSON.
+    requested_video_ids = sys.argv[1:]
+    transcripts = YouTubeTranscriptApi.get(*requested_video_ids)
+    print(json.dumps(transcripts))