cpython: 7b9235852b3b (original) (raw)

Mercurial > cpython

changeset 87275:7b9235852b3b

#2927: Added the unescape() function to the html module. [#2927]

Ezio Melotti ezio.melotti@gmail.com
date Tue, 19 Nov 2013 20:28:45 +0200
parents ee2c80eeca2a
children 2012e85638d9
files Doc/library/html.entities.rst Doc/library/html.rst Lib/html/__init__.py Lib/html/parser.py Lib/test/test_html.py Lib/test/test_htmlparser.py Misc/NEWS
diffstat 7 files changed, 215 insertions(+), 49 deletions(-)[+] [-] Doc/library/html.entities.rst 1 Doc/library/html.rst 11 Lib/html/__init__.py 114 Lib/html/parser.py 38 Lib/test/test_html.py 86 Lib/test/test_htmlparser.py 12 Misc/NEWS 2

line wrap: on

line diff

--- a/Doc/library/html.entities.rst +++ b/Doc/library/html.entities.rst @@ -20,6 +20,7 @@ This module defines four dictionaries, : Note that the trailing semicolon is included in the name (e.g. 'gt;'), however some of the names are accepted by the standard even without the semicolon: in this case the name is present with and without the ';'.

--- a/Doc/library/html.rst +++ b/Doc/library/html.rst @@ -20,6 +20,17 @@ This module defines utilities to manipul .. versionadded:: 3.2 + +.. function:: unescape(s) +

-------------- Submodules in the html package are:

--- a/Lib/html/__init__.py +++ b/Lib/html/__init__.py @@ -2,7 +2,12 @@ General functions for HTML manipulation. """ -# NB: this is a candidate for a bytes/string polymorphic interface +import re as _re +from html.entities import html5 as _html5 + + +__all__ = ['escape', 'unescape'] + def escape(s, quote=True): """ @@ -18,3 +23,110 @@ def escape(s, quote=True): s = s.replace('"', "&quot;") s = s.replace('\'', "&#x27;") return s + + +# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references + +_invalid_charrefs = {

+} + +_invalid_codepoints = {

+} + + +def _replace_charref(s):

+ + +_charref = _re.compile(r'&(#[0-9]+;?'

+ +def unescape(s):

--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -8,9 +8,12 @@

and CDATA (character data -- only end tags are special).

-import _markupbase import re import warnings +import _markupbase + +from html import unescape + __all__ = ['HTMLParser'] @@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase) attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue:

@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase) def unknown_decl(self, data): if self.strict: self.error("unknown declaration: %r" % (data,)) -

-

--- a/Lib/test/test_html.py +++ b/Lib/test/test_html.py @@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase): html.escape('\'<script>"&foo;"</script>\'', False), '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

-def test_main():

if __name__ == '__main__':

--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLPar for html, expected in data: self._run_check(html, expected)

- def test_broken_comments(self): html = ('<! not really a comment >' '<! not a comment either -->'

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -59,6 +59,8 @@ Library