Added support for HTML unescaping on all Python versions
This commit is contained in:
parent
fe1783688e
commit
2b3a6f3a71
|
@ -0,0 +1,19 @@
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Pick an HTML-unescaping implementation by feature detection instead of
# comparing version numbers: an interpreter that provides ``html.unescape``
# uses it, and everything else falls back to the parser-based helper.
try:
    # Python 3.4+
    from html import unescape
except ImportError:
    try:
        # Python 2
        import HTMLParser

        html_parser = HTMLParser.HTMLParser()
    except ImportError:
        # Python 3.0 - 3.3
        import html.parser

        html_parser = html.parser.HTMLParser()

    def unescape(string):
        """Return ``string`` with HTML character references unescaped.

        Delegates to the ``unescape`` method of the module-level
        ``html_parser`` instance created above.

        :param string: text that may contain HTML entities
        :return: the result of ``html_parser.unescape(string)``
        """
        return html_parser.unescape(string)
|
|
@ -6,6 +6,8 @@ import logging
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
from .html_unescaping import unescape
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -112,7 +114,7 @@ class _TranscriptParser():
|
||||||
def parse(self):
|
def parse(self):
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
'text': re.sub(self.HTML_TAG_REGEX, '', xml_element.text),
|
'text': re.sub(self.HTML_TAG_REGEX, '', unescape(xml_element.text)),
|
||||||
'start': float(xml_element.attrib['start']),
|
'start': float(xml_element.attrib['start']),
|
||||||
'duration': float(xml_element.attrib['dur']),
|
'duration': float(xml_element.attrib['dur']),
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue