Merge pull request #11 from jdepoix/feature/ci

Setup CI
This commit is contained in:
jdepoix 2019-03-14 17:47:41 +01:00 committed by GitHub
commit 9fa8bb0d70
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 1875 additions and 59 deletions

30
.coveragerc Normal file
View File

@ -0,0 +1,30 @@
[run]
source = youtube_transcript_api
[report]
omit =
*/__main__.py
exclude_lines =
pragma: no cover
# Don't complain about missing debug-only code:
def __unicode__
def __repr__
if self\.debug
# Don't complain if tests don't hit defensive assertion code:
raise AssertionError
raise NotImplementedError
# Don't complain if non-runnable code isn't run:
if 0:
if __name__ == .__main__.:
# Don't complain about empty stubs of abstract methods
@abstractmethod
@abstractclassmethod
@abstractstaticmethod
show_missing = True

1
.gitignore vendored
View File

@ -6,3 +6,4 @@ dist
build build
*.egg-info *.egg-info
upload_new_version.sh upload_new_version.sh
.coverage

18
.travis.yml Normal file
View File

@ -0,0 +1,18 @@
language: python
python:
- "2.7"
- "3.3"
- "3.4"
- "3.5"
- "3.6"
matrix:
include:
- python: 3.7
dist: xenial
sudo: true
install:
- pip install -r requirements.txt
script:
- coverage run -m unittest discover
after_success:
- coveralls

View File

@ -1,5 +1,11 @@
# YouTube Transcript/Subtitle API (including automatically generated subtitles) # YouTube Transcript/Subtitle API (including automatically generated subtitles)
[![Build Status](https://travis-ci.org/jdepoix/youtube-transcript-api.svg)](https://travis-ci.org/jdepoix/youtube-transcript-api)
[![Coverage Status](https://coveralls.io/repos/github/jdepoix/youtube-transcript-api/badge.svg?branch=master)](https://coveralls.io/github/jdepoix/youtube-transcript-api?branch=master)
[![MIT license](http://img.shields.io/badge/license-MIT-brightgreen.svg?style=flat)](http://opensource.org/licenses/MIT)
[![image](https://img.shields.io/pypi/v/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
[![image](https://img.shields.io/pypi/pyversions/youtube-transcript-api.svg)](https://pypi.org/project/youtube-transcript-api/)
This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do! This is a Python API which allows you to get the transcripts/subtitles for a given YouTube video. It also works for automatically generated subtitles and it does not require a headless browser, like other selenium based solutions do!
## Install ## Install

3
coverage.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
.venv/bin/coverage run -m unittest discover && .venv/bin/coverage report

View File

@ -1 +1,7 @@
requests requests
# testing
mock
httpretty
coverage
coveralls

View File

@ -1,3 +1,7 @@
import os
import unittest
import setuptools import setuptools
@ -9,6 +13,15 @@ def get_long_description():
return _get_file_content('README.md') return _get_file_content('README.md')
def get_test_suite():
    """
    Builds the test suite referenced by the `test_suite` option of `setuptools.setup`.

    Discovery starts at `test`, only considers modules matching `test_*.py` and uses the `youtube_transcript_api`
    directory located next to this file as the top level directory.

    :return: the discovered tests
    :rtype: unittest.TestSuite
    """
    # __file__ may be a bare relative name such as 'setup.py'. Its dirname is then '' and a formatted
    # '{dirname}/youtube_transcript_api' would resolve to the absolute path '/youtube_transcript_api'. Making the
    # location absolute before joining avoids that.
    project_root = os.path.dirname(os.path.abspath(__file__))
    return unittest.TestLoader().discover(
        'test',
        pattern='test_*.py',
        top_level_dir=os.path.join(project_root, 'youtube_transcript_api'),
    )
setuptools.setup( setuptools.setup(
name="youtube_transcript_api", name="youtube_transcript_api",
version="0.1.3", version="0.1.3",
@ -29,6 +42,13 @@ setuptools.setup(
install_requires=[ install_requires=[
'requests', 'requests',
], ],
tests_require=[
'mock',
'httpretty',
'coverage',
'coveralls',
],
test_suite='setup.get_test_suite',
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'youtube_transcript_api = youtube_transcript_api.__main__:main', 'youtube_transcript_api = youtube_transcript_api.__main__:main',

View File

@ -1,62 +1,14 @@
import sys import sys
import json
from pprint import pprint
import logging import logging
import argparse from ._cli import YouTubeTranscriptCli
from ._api import YouTubeTranscriptApi
def parse_args(args):
    """
    Parses the command line arguments of the CLI.

    :param args: the raw command line arguments, without the program name
    :type args: [str]
    :return: the parsed arguments, providing the attributes `video_ids`, `languages` and `json`
    :rtype: argparse.Namespace
    """
    description = (
        'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
        'It also works for automatically generated subtitles and it does not require a headless browser, like '
        'other selenium based solutions do!'
    )
    languages_help = (
        'A list of language codes in a descending priority. For example, if this is set to "de en" it will first '
        'try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to do so. '
        'As I can\'t provide a complete list of all working language codes with full certainty, you may have to '
        'play around with the language codes a bit, to find the one which is working for you!'
    )

    arg_parser = argparse.ArgumentParser(description=description)
    arg_parser.add_argument('video_ids', nargs='*', type=str, help='List of YouTube video IDs.')
    arg_parser.add_argument('--languages', nargs='*', default=[], type=str, help=languages_help)
    # store_true stores True when the flag is present and defaults to False otherwise
    arg_parser.add_argument(
        '--json',
        action='store_true',
        help='If this flag is set the output will be JSON formatted.',
    )

    return arg_parser.parse_args(args)
def main(): def main():
logging.basicConfig() logging.basicConfig()
parsed_args = parse_args(sys.argv[1:]) print(YouTubeTranscriptCli(sys.argv[1:]).run())
transcripts, _ = YouTubeTranscriptApi.get_transcripts(
parsed_args.video_ids,
languages=parsed_args.languages,
continue_after_error=True
)
if parsed_args.json:
print(json.dumps(transcripts))
else:
pprint(transcripts)
if __name__ == '__main__': if __name__ == '__main__':

View File

@ -1,6 +1,7 @@
import sys import sys
if sys.version_info.major == 2: # This can only be tested by using different python versions, therefore it is not covered by coverage.py
if sys.version_info.major == 2: # pragma: no cover
reload(sys) reload(sys)
sys.setdefaultencoding('utf-8') sys.setdefaultencoding('utf-8')
@ -36,8 +37,8 @@ class YouTubeTranscriptApi():
) )
self.video_id = video_id self.video_id = video_id
@staticmethod @classmethod
def get_transcripts(video_ids, languages=None, continue_after_error=False): def get_transcripts(cls, video_ids, languages=None, continue_after_error=False):
""" """
Retrieves the transcripts for a list of videos. Retrieves the transcripts for a list of videos.
@ -60,7 +61,7 @@ class YouTubeTranscriptApi():
for video_id in video_ids: for video_id in video_ids:
try: try:
data[video_id] = YouTubeTranscriptApi.get_transcript(video_id, languages) data[video_id] = cls.get_transcript(video_id, languages)
except Exception as exception: except Exception as exception:
if not continue_after_error: if not continue_after_error:
raise exception raise exception
@ -69,15 +70,15 @@ class YouTubeTranscriptApi():
return data, unretrievable_videos return data, unretrievable_videos
@staticmethod @classmethod
def get_transcript(video_id, languages=None): def get_transcript(cls, video_id, languages=None):
""" """
Retrieves the transcript for a single video. Retrieves the transcript for a single video.
:param video_id: the youtube video id :param video_id: the youtube video id
:type video_id: str :type video_id: str
:param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en'] :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
it will first try to fetch the german transcript (de) and then fetch the english transcipt (en) if it fails to it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
do so. As I can't provide a complete list of all working language codes with full certainty, you may have to do so. As I can't provide a complete list of all working language codes with full certainty, you may have to
play around with the language codes a bit, to find the one which is working for you! play around with the language codes a bit, to find the one which is working for you!
:type languages: [str] :type languages: [str]

View File

@ -0,0 +1,57 @@
import json
import pprint
import argparse
from ._api import YouTubeTranscriptApi
class YouTubeTranscriptCli():
    """
    Command line interface built on top of `YouTubeTranscriptApi.get_transcripts`.

    :param args: the raw command line arguments, without the program name
    :type args: [str]
    """

    def __init__(self, args):
        self._args = args

    def run(self):
        """
        Parses the arguments, fetches the transcripts of all requested videos and renders them as a string.

        Fetching continues after a failing video (`continue_after_error=True`). The second value returned by
        `get_transcripts` is discarded, so videos that could not be retrieved do not show up in the output.

        :return: the transcripts, JSON encoded if `--json` was passed and pretty printed otherwise
        :rtype: str
        """
        parsed_args = self._parse_args()

        transcripts, _ = YouTubeTranscriptApi.get_transcripts(
            parsed_args.video_ids,
            languages=parsed_args.languages,
            continue_after_error=True
        )

        if parsed_args.json:
            return json.dumps(transcripts)
        return pprint.pformat(transcripts)

    def _parse_args(self):
        """
        Parses the arguments this instance has been created with.

        At least one video id is required. argparse exits the process (`SystemExit`) if none is given.

        :return: the parsed arguments, providing the attributes `video_ids`, `languages` and `json`
        :rtype: argparse.Namespace
        """
        parser = argparse.ArgumentParser(
            description=(
                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
                'It also works for automatically generated subtitles and it does not require a headless browser, like '
                'other selenium based solutions do!'
            )
        )
        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
        parser.add_argument(
            '--languages',
            nargs='*',
            default=[],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
                'may have to play around with the language codes a bit, to find the one which is working for you!'
            ),
        )
        # store_true stores True when the flag is present and defaults to False otherwise
        parser.add_argument(
            '--json',
            action='store_true',
            help='If this flag is set the output will be JSON formatted.',
        )

        return parser.parse_args(self._args)

View File

@ -1,9 +1,11 @@
import sys import sys
if sys.version_info.major == 3 and sys.version_info.minor >= 4:
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
if sys.version_info.major == 3 and sys.version_info.minor >= 4: # pragma: no cover
# Python 3.4+ # Python 3.4+
from html import unescape from html import unescape
else: else: # pragma: no cover
if sys.version_info.major <= 2: if sys.version_info.major <= 2:
# Python 2 # Python 2
import HTMLParser import HTMLParser

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8" ?>
<transcript>
<text start="0" dur="1.54">Hey, this is just a test</text>
<text start="1.54" dur="4.16">this is not the original transcript</text>
<text start="5.7" dur="3.239">just something shorter, I made up for testing</text>
</transcript>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,103 @@
from unittest import TestCase
from mock import MagicMock
import os
import httpretty
from youtube_transcript_api._api import YouTubeTranscriptApi
def load_asset(filename):
    """
    Reads a test fixture from the `assets` directory located next to this module.

    :param filename: name of the file inside the `assets` directory
    :type filename: str
    :return: the content of the file
    :rtype: str
    """
    # __file__ can be a bare relative name, in which case its dirname is '' and a formatted
    # '{dirname}/assets/{filename}' would resolve to the absolute path '/assets/<filename>'. Making the location
    # absolute before joining avoids that.
    assets_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'assets')
    with open(os.path.join(assets_dir, filename)) as asset_file:
        return asset_file.read()
class TestYouTubeTranscriptApi(TestCase):
    """
    Tests for `YouTubeTranscriptApi`, run against canned HTTP responses which are served by httpretty.
    """

    def setUp(self):
        httpretty.enable()
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/watch',
            body=load_asset('youtube.html')
        )
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=load_asset('transcript.xml')
        )

    def tearDown(self):
        httpretty.disable()
        # forget the registered URIs and recorded requests, so that no state is carried over to the next test
        httpretty.reset()

    def _mock_get_transcript(self, **mock_kwargs):
        """
        Replaces `YouTubeTranscriptApi.get_transcript` with a `MagicMock` for the duration of the current test.

        Assigning the mock without restoring the original would leave the class patched for every test that runs
        afterwards, which makes the result of the suite depend on the execution order.

        :param mock_kwargs: keyword arguments which are passed on to `MagicMock`
        :return: the mock which has been installed
        :rtype: MagicMock
        """
        # read the raw classmethod object from the class dict, so that restoring it keeps it a classmethod
        original = vars(YouTubeTranscriptApi)['get_transcript']
        self.addCleanup(setattr, YouTubeTranscriptApi, 'get_transcript', original)

        get_transcript_mock = MagicMock(**mock_kwargs)
        YouTubeTranscriptApi.get_transcript = get_transcript_mock
        return get_transcript_mock

    def test_get_transcript(self):
        transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')

        self.assertEqual(
            transcript,
            [
                {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
                {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
                {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
            ]
        )

    def test_get_transcript__correct_language_is_used(self):
        YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
        query_string = httpretty.last_request().querystring

        self.assertIn('lang', query_string)
        self.assertEqual(len(query_string['lang']), 1)
        self.assertEqual(query_string['lang'][0], 'de')

    def test_get_transcript__fallback_language_is_used(self):
        # an empty body for the transcript request makes the API move on to the next language
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=''
        )

        YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', ['de', 'en'])
        query_string = httpretty.last_request().querystring

        self.assertIn('lang', query_string)
        self.assertEqual(len(query_string['lang']), 1)
        self.assertEqual(query_string['lang'][0], 'en')

    def test_get_transcript__exception_is_raised_when_not_available(self):
        httpretty.register_uri(
            httpretty.GET,
            'https://www.youtube.com/api/timedtext',
            body=''
        )

        with self.assertRaises(YouTubeTranscriptApi.CouldNotRetrieveTranscript):
            YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8')

    def test_get_transcripts(self):
        video_id_1 = 'video_id_1'
        video_id_2 = 'video_id_2'
        languages = ['de', 'en']
        get_transcript_mock = self._mock_get_transcript()

        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], languages=languages)

        get_transcript_mock.assert_any_call(video_id_1, languages)
        get_transcript_mock.assert_any_call(video_id_2, languages)
        self.assertEqual(get_transcript_mock.call_count, 2)

    def test_get_transcripts__stop_on_error(self):
        get_transcript_mock = self._mock_get_transcript(side_effect=Exception('Error'))

        with self.assertRaises(Exception):
            YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'])

        # the first failure has to abort the loop, so the second video must never be requested
        self.assertEqual(get_transcript_mock.call_count, 1)

    def test_get_transcripts__continue_on_error(self):
        video_id_1 = 'video_id_1'
        video_id_2 = 'video_id_2'
        get_transcript_mock = self._mock_get_transcript(side_effect=Exception('Error'))

        YouTubeTranscriptApi.get_transcripts([video_id_1, video_id_2], continue_after_error=True)

        get_transcript_mock.assert_any_call(video_id_1, None)
        get_transcript_mock.assert_any_call(video_id_2, None)

View File

@ -0,0 +1,68 @@
from unittest import TestCase
from mock import MagicMock
import json
from youtube_transcript_api._cli import YouTubeTranscriptCli, YouTubeTranscriptApi
class TestYouTubeTranscriptCli(TestCase):
    """
    Tests for the argument parsing and the output formatting of `YouTubeTranscriptCli`.
    """

    def _mock_get_transcripts(self, return_value):
        """
        Replaces `YouTubeTranscriptApi.get_transcripts` with a `MagicMock` for the duration of the current test.

        Assigning the mock without restoring the original would leave the class patched for every test that runs
        afterwards, which makes the result of the suite depend on the execution order.

        :param return_value: the value the mocked `get_transcripts` returns
        :return: the mock which has been installed
        :rtype: MagicMock
        """
        # read the raw classmethod object from the class dict, so that restoring it keeps it a classmethod
        original = vars(YouTubeTranscriptApi)['get_transcripts']
        self.addCleanup(setattr, YouTubeTranscriptApi, 'get_transcripts', original)

        get_transcripts_mock = MagicMock(return_value=return_value)
        YouTubeTranscriptApi.get_transcripts = get_transcripts_mock
        return get_transcripts_mock

    def test_argument_parsing(self):
        # the order of flags and positional arguments must not influence the result
        parsed_args = YouTubeTranscriptCli('v1 v2 --json --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

        parsed_args = YouTubeTranscriptCli(' --json v1 v2 --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

    def test_argument_parsing__only_video_ids(self):
        parsed_args = YouTubeTranscriptCli('v1 v2'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, False)
        self.assertEqual(parsed_args.languages, [])

    def test_argument_parsing__fail_without_video_ids(self):
        with self.assertRaises(SystemExit):
            YouTubeTranscriptCli('--json'.split())._parse_args()

    def test_argument_parsing__json(self):
        parsed_args = YouTubeTranscriptCli('v1 v2 --json'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, [])

        parsed_args = YouTubeTranscriptCli('--json v1 v2'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, True)
        self.assertEqual(parsed_args.languages, [])

    def test_argument_parsing__languages(self):
        parsed_args = YouTubeTranscriptCli('v1 v2 --languages de en'.split())._parse_args()
        self.assertEqual(parsed_args.video_ids, ['v1', 'v2'])
        self.assertEqual(parsed_args.json, False)
        self.assertEqual(parsed_args.languages, ['de', 'en'])

    def test_run(self):
        get_transcripts_mock = self._mock_get_transcripts(([], []))

        YouTubeTranscriptCli('v1 v2 --languages de en'.split()).run()

        get_transcripts_mock.assert_called_once_with(
            ['v1', 'v2'],
            languages=['de', 'en'],
            continue_after_error=True
        )

    def test_run__json_output(self):
        self._mock_get_transcripts(([{'boolean': True}], []))

        output = YouTubeTranscriptCli('v1 v2 --languages de en --json'.split()).run()

        # json.loads fails if the output is not valid JSON; the decoded value has to match the mocked transcripts
        self.assertEqual(json.loads(output), [{'boolean': True}])