cpython: 7b9235852b3b (original) (raw)
Mercurial > cpython
changeset 87275:7b9235852b3b
#2927: Added the unescape() function to the html module. [#2927]
Ezio Melotti ezio.melotti@gmail.com | |
---|---|
date | Tue, 19 Nov 2013 20:28:45 +0200 |
parents | ee2c80eeca2a |
children | 2012e85638d9 |
files | Doc/library/html.entities.rst Doc/library/html.rst Lib/html/__init__.py Lib/html/parser.py Lib/test/test_html.py Lib/test/test_htmlparser.py Misc/NEWS |
diffstat | 7 files changed, 215 insertions(+), 49 deletions(-)[+] [-] Doc/library/html.entities.rst 1 Doc/library/html.rst 11 Lib/html/__init__.py 114 Lib/html/parser.py 38 Lib/test/test_html.py 86 Lib/test/test_htmlparser.py 12 Misc/NEWS 2 |
line wrap: on
line diff
--- a/Doc/library/html.entities.rst
+++ b/Doc/library/html.entities.rst
@@ -20,6 +20,7 @@ This module defines four dictionaries, :
Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
however some of the names are accepted by the standard even without the
semicolon: in this case the name is present with and without the ``';'``.
--- a/Doc/library/html.rst +++ b/Doc/library/html.rst @@ -20,6 +20,17 @@ This module defines utilities to manipul .. versionadded:: 3.2 + +.. function:: unescape(s) +
- Convert all named and numeric character references (e.g. ``&gt;``,
- ``&#62;``, ``&#x3e;``) in the string *s* to the corresponding unicode
- characters. This function uses the rules defined by the HTML 5 standard
- for both valid and invalid character references, and the :data:`list of
- HTML 5 named character references <html.entities.html5>`. +
- .. versionadded:: 3.4 +
--------------
Submodules in the html
package are:
--- a/Lib/html/__init__.py +++ b/Lib/html/__init__.py @@ -2,7 +2,12 @@ General functions for HTML manipulation. """ -# NB: this is a candidate for a bytes/string polymorphic interface +import re as _re +from html.entities import html5 as _html5 + + +__all__ = ['escape', 'unescape'] + def escape(s, quote=True): """ @@ -18,3 +23,110 @@ def escape(s, quote=True): s = s.replace('"', "&quot;") s = s.replace('\'', "&#x27;") return s + + +# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references[](#l3.23) + +_invalid_charrefs = {
- 0x00: '\ufffd', # REPLACEMENT CHARACTER
- 0x0d: '\r', # CARRIAGE RETURN
- 0x80: '\u20ac', # EURO SIGN
- 0x81: '\x81', #
- 0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK
- 0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK
- 0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK
- 0x85: '\u2026', # HORIZONTAL ELLIPSIS
- 0x86: '\u2020', # DAGGER
- 0x87: '\u2021', # DOUBLE DAGGER
- 0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT
- 0x89: '\u2030', # PER MILLE SIGN
- 0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON
- 0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
- 0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE
- 0x8d: '\x8d', #
- 0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON
- 0x8f: '\x8f', #
- 0x90: '\x90', #
- 0x91: '\u2018', # LEFT SINGLE QUOTATION MARK
- 0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK
- 0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK
- 0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK
- 0x95: '\u2022', # BULLET
- 0x96: '\u2013', # EN DASH
- 0x97: '\u2014', # EM DASH
- 0x98: '\u02dc', # SMALL TILDE
- 0x99: '\u2122', # TRADE MARK SIGN
- 0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON
- 0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
- 0x9c: '\u0153', # LATIN SMALL LIGATURE OE
- 0x9d: '\x9d', #
- 0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON
- 0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS
- }
-
- _invalid_codepoints = {
- # 0x0001 to 0x0008
- 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
- # 0x000E to 0x001F
- 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
- 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
- # 0x007F to 0x009F
- 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
- 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
- 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
- # 0xFDD0 to 0xFDEF
- 0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
- 0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
- 0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
- 0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
- # others
- 0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
- 0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
- 0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
- 0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
- 0x10fffe, 0x10ffff
+} + + +def _replace_charref(s):
- s = s.group(1)
- if s[0] == '#':
# numeric charref[](#l3.89)
if s[1] in 'xX':[](#l3.90)
num = int(s[2:].rstrip(';'), 16)[](#l3.91)
else:[](#l3.92)
num = int(s[1:].rstrip(';'))[](#l3.93)
if num in _invalid_charrefs:[](#l3.94)
return _invalid_charrefs[num][](#l3.95)
if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:[](#l3.96)
return '\uFFFD'[](#l3.97)
if num in _invalid_codepoints:[](#l3.98)
return ''[](#l3.99)
return chr(num)[](#l3.100)
- else:
# named charref[](#l3.102)
if s in _html5:[](#l3.103)
return _html5[s][](#l3.104)
# find the longest matching name (as defined by the standard)[](#l3.105)
for x in range(len(s)-1, 1, -1):[](#l3.106)
if s[:x] in _html5:[](#l3.107)
return _html5[s[:x]] + s[x:][](#l3.108)
else:[](#l3.109)
return '&' + s[](#l3.110)
+ + +_charref = _re.compile(r'&(#[0-9]+;?'
r'|#[xX][0-9a-fA-F]+;?'[](#l3.114)
r'|[^\t\n\f <&#;]{1,32};?)')[](#l3.115)
- def unescape(s):
- """
- Convert all named and numeric character references (e.g. &gt;, &#62;,
- &#x3e;) in the string s to the corresponding unicode characters.
- This function uses the rules defined by the HTML 5 standard
- for both valid and invalid character references, and the list of
- HTML 5 named character references defined in html.entities.html5.
- """
- if '&' not in s:
return s[](#l3.126)
- return _charref.sub(_replace_charref, s)
--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -8,9 +8,12 @@
and CDATA (character data -- only end tags are special).
-import _markupbase import re import warnings +import _markupbase + +from html import unescape + __all__ = ['HTMLParser'] @@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase) attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue:
attrvalue = self.unescape(attrvalue)[](#l4.21)
attrvalue = unescape(attrvalue)[](#l4.22) attrs.append((attrname.lower(), attrvalue))[](#l4.23) k = m.end()[](#l4.24)
@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase) def unknown_decl(self, data): if self.strict: self.error("unknown declaration: %r" % (data,)) -
- # Internal -- helper to remove special character quoting
- def unescape(self, s):
if '&' not in s:[](#l4.33)
return s[](#l4.34)
def replaceEntities(s):[](#l4.35)
s = s.groups()[0][](#l4.36)
try:[](#l4.37)
if s[0] == "#":[](#l4.38)
s = s[1:][](#l4.39)
if s[0] in ['x','X']:[](#l4.40)
c = int(s[1:].rstrip(';'), 16)[](#l4.41)
else:[](#l4.42)
c = int(s.rstrip(';'))[](#l4.43)
return chr(c)[](#l4.44)
except ValueError:[](#l4.45)
return '&#' + s[](#l4.46)
else:[](#l4.47)
from html.entities import html5[](#l4.48)
if s in html5:[](#l4.49)
return html5[s][](#l4.50)
elif s.endswith(';'):[](#l4.51)
return '&' + s[](#l4.52)
for x in range(2, len(s)):[](#l4.53)
if s[:x] in html5:[](#l4.54)
return html5[s[:x]] + s[x:][](#l4.55)
else:[](#l4.56)
return '&' + s[](#l4.57)
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",[](#l4.59)
replaceEntities, s, flags=re.ASCII)[](#l4.60)
--- a/Lib/test/test_html.py +++ b/Lib/test/test_html.py @@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase): html.escape('''', False), ''<script>"&foo;"</script>'')
- def test_unescape(self):
numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;'][](#l5.8)
errmsg = 'unescape(%r) should have returned %r'[](#l5.9)
def check(text, expected):[](#l5.10)
self.assertEqual(html.unescape(text), expected,[](#l5.11)
msg=errmsg % (text, expected))[](#l5.12)
def check_num(num, expected):[](#l5.13)
for format in numeric_formats:[](#l5.14)
text = format % num[](#l5.15)
self.assertEqual(html.unescape(text), expected,[](#l5.16)
msg=errmsg % (text, expected))[](#l5.17)
# check text with no character references[](#l5.18)
check('no character references', 'no character references')[](#l5.19)
# check & followed by invalid chars[](#l5.20)
check('&\n&\t& &&', '&\n&\t& &&')[](#l5.21)
# check & followed by numbers and letters[](#l5.22)
check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')[](#l5.23)
# check incomplete entities at the end of the string[](#l5.24)
for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:[](#l5.25)
check(x, x)[](#l5.26)
check(x+';', x+';')[](#l5.27)
# check several combinations of numeric character references,[](#l5.28)
# possibly followed by different characters[](#l5.29)
formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',[](#l5.30)
'&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',[](#l5.31)
'&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;'][](#l5.32)
for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],[](#l5.33)
['A', 'a', '"', '&', '\u2603', '\U00101234']):[](#l5.34)
for s in formats:[](#l5.35)
check(s % num, char)[](#l5.36)
for end in [' ', 'X']:[](#l5.37)
check((s+end) % num, char+end)[](#l5.38)
# check invalid codepoints[](#l5.39)
for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:[](#l5.40)
check_num(cp, '\uFFFD')[](#l5.41)
# check more invalid codepoints[](#l5.42)
for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:[](#l5.43)
check_num(cp, '')[](#l5.44)
# check invalid numbers[](#l5.45)
for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):[](#l5.46)
check_num(num, ch)[](#l5.47)
# check small numbers[](#l5.48)
check_num(0, '\uFFFD')[](#l5.49)
check_num(9, '\t')[](#l5.50)
# check a big number[](#l5.51)
check_num(1000000000000000000, '\uFFFD')[](#l5.52)
# check that multiple trailing semicolons are handled correctly[](#l5.53)
for e in ['";', '";', '";', '";']:[](#l5.54)
check(e, '";')[](#l5.55)
# check that semicolons in the middle don't create problems[](#l5.56)
for e in ['"quot;', '"quot;', '"quot;', '"quot;']:[](#l5.57)
check(e, '"quot;')[](#l5.58)
# check triple adjacent charrefs[](#l5.59)
for e in ['"', '"', '"', '"']:[](#l5.60)
check(e*3, '"""')[](#l5.61)
check((e+';')*3, '"""')[](#l5.62)
# check that the case is respected[](#l5.63)
for e in ['&', '&', '&', '&']:[](#l5.64)
check(e, '&')[](#l5.65)
for e in ['&Amp', '&Amp;']:[](#l5.66)
check(e, e)[](#l5.67)
# check that non-existent named entities are returned unchanged[](#l5.68)
check('&svadilfari;', '&svadilfari;')[](#l5.69)
# the following examples are in the html5 specs[](#l5.70)
check('¬it', '¬it')[](#l5.71)
check('¬it;', '¬it;')[](#l5.72)
check('¬in', '¬in')[](#l5.73)
check('∉', '∉')[](#l5.74)
# a similar example with a long name[](#l5.75)
check('¬ReallyAnExistingNamedCharacterReference;',[](#l5.76)
'¬ReallyAnExistingNamedCharacterReference;')[](#l5.77)
# longest valid name[](#l5.78)
check('∳', '∳')[](#l5.79)
# check a charref that maps to two unicode chars[](#l5.80)
check('∾̳', '\u223E\u0333')[](#l5.81)
check('&acE', '&acE')[](#l5.82)
# see #12888[](#l5.83)
check('{ ' * 1050, '{ ' * 1050)[](#l5.84)
# see #15156[](#l5.85)
check('ÉricÉric&alphacentauriαcentauri',[](#l5.86)
'ÉricÉric&alphacentauriαcentauri')[](#l5.87)
check('&co;', '&co;')[](#l5.88)
--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLPar for html, expected in data: self._run_check(html, expected)
- def test_unescape_function(self):
p = self.get_collector()[](#l6.8)
self.assertEqual(p.unescape('&#bad;'),'&#bad;')[](#l6.9)
self.assertEqual(p.unescape('&'),'&')[](#l6.10)
# see #12888[](#l6.11)
self.assertEqual(p.unescape('{ ' * 1050), '{ ' * 1050)[](#l6.12)
# see #15156[](#l6.13)
self.assertEqual(p.unescape('ÉricÉric'[](#l6.14)
'&alphacentauriαcentauri'),[](#l6.15)
'ÉricÉric&alphacentauriαcentauri')[](#l6.16)
self.assertEqual(p.unescape('&co;'), '&co;')[](#l6.17)
- def test_broken_comments(self): html = ('<! not really a comment >' '<! not a comment either -->'
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -59,6 +59,8 @@ Library
- Issue #19449: in csv's writerow, handle non-string keys when generating the error message that certain keys are not in the 'fieldnames' list. +- Issue #2927: Added the unescape() function to the html module. +
- Issue #8402: Added the escape() function to the glob module.
- Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.