Added support for HTML unescaping on all Python versions

This commit is contained in:
Jonas Depoix 2018-04-26 13:36:14 +02:00
parent fe1783688e
commit 2b3a6f3a71
2 changed files with 22 additions and 1 deletions

19
src/html_unescaping.py Normal file
View File

@ -0,0 +1,19 @@
"""Version-independent ``unescape`` for HTML character references.

Exposes a single callable, ``unescape(string)``, regardless of which
Python version is running. Interpreters that ship ``html.unescape`` use it
directly; older ones fall back to the ``unescape`` method of an
``HTMLParser`` instance.
"""
import sys

# Compare version tuples rather than individual ``major``/``minor`` fields:
# the previous ``major == 3 and minor >= 4`` test rejected every major
# version other than 3, which would route newer interpreters into the
# legacy fallback below.
if sys.version_info >= (3, 4):
    # Python 3.4+
    from html import unescape
else:
    if sys.version_info < (3,):
        # Python 2
        import HTMLParser
        html_parser = HTMLParser.HTMLParser()
    else:
        # Python 3.0 - 3.3
        import html.parser
        html_parser = html.parser.HTMLParser()

    def unescape(string):
        """Return ``string`` with HTML character references unescaped.

        Delegates to the ``unescape`` method of the module-level
        ``html_parser`` instance.
        """
        return html_parser.unescape(string)

View File

@ -6,6 +6,8 @@ import logging
import requests import requests
from .html_unescaping import unescape
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -112,7 +114,7 @@ class _TranscriptParser():
def parse(self): def parse(self):
return [ return [
{ {
'text': re.sub(self.HTML_TAG_REGEX, '', xml_element.text), 'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
'start': float(xml_element.attrib['start']), 'start': float(xml_element.attrib['start']),
'duration': float(xml_element.attrib['dur']), 'duration': float(xml_element.attrib['dur']),
} }