cpython: 7b9235852b3b (original) (raw)

Mercurial > cpython

changeset 87275:7b9235852b3b

#2927: Added the unescape() function to the html module. [#2927]

Ezio Melotti ezio.melotti@gmail.com
date	Tue, 19 Nov 2013 20:28:45 +0200
parents	ee2c80eeca2a
children	2012e85638d9
files	Doc/library/html.entities.rst Doc/library/html.rst Lib/html/__init__.py Lib/html/parser.py Lib/test/test_html.py Lib/test/test_htmlparser.py Misc/NEWS
diffstat	7 files changed, 215 insertions(+), 49 deletions(-)[+] [-] Doc/library/html.entities.rst 1 Doc/library/html.rst 11 Lib/html/__init__.py 114 Lib/html/parser.py 38 Lib/test/test_html.py 86 Lib/test/test_htmlparser.py 12 Misc/NEWS 2

line wrap: on

line diff

--- a/Doc/library/html.entities.rst +++ b/Doc/library/html.entities.rst @@ -20,6 +20,7 @@ This module defines four dictionaries, : Note that the trailing semicolon is included in the name (e.g. 'gt;'), however some of the names are accepted by the standard even without the semicolon: in this case the name is present with and without the ';'.

See also :func:`html.unescape`.

.. versionadded:: 3.3

--- a/Doc/library/html.rst +++ b/Doc/library/html.rst @@ -20,6 +20,17 @@ This module defines utilities to manipul .. versionadded:: 3.2 + +.. function:: unescape(s) +

Convert all named and numeric character references (e.g. ``&gt;``,
``&#62;``, ``&#x3e;``) in the string *s* to the corresponding unicode
characters. This function uses the rules defined by the HTML 5 standard
for both valid and invalid character references, and the :data:`list of
HTML 5 named character references <html.entities.html5>`. +
.. versionadded:: 3.4 +

-------------- Submodules in the html package are:

--- a/Lib/html/__init__.py +++ b/Lib/html/__init__.py @@ -2,7 +2,12 @@ General functions for HTML manipulation. """ -# NB: this is a candidate for a bytes/string polymorphic interface +import re as _re +from html.entities import html5 as _html5 + + +__all__ = ['escape', 'unescape'] + def escape(s, quote=True): """ @@ -18,3 +23,110 @@ def escape(s, quote=True): s = s.replace('"', "&quot;") s = s.replace('\'', "&#x27;") return s + + +# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references + +_invalid_charrefs = {

0x00: '\ufffd', # REPLACEMENT CHARACTER
0x0d: '\r', # CARRIAGE RETURN
0x80: '\u20ac', # EURO SIGN
0x81: '\x81', #
0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK
0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK
0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK
0x85: '\u2026', # HORIZONTAL ELLIPSIS
0x86: '\u2020', # DAGGER
0x87: '\u2021', # DOUBLE DAGGER
0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT
0x89: '\u2030', # PER MILLE SIGN
0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON
0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE
0x8d: '\x8d', #
0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON
0x8f: '\x8f', #
0x90: '\x90', #
0x91: '\u2018', # LEFT SINGLE QUOTATION MARK
0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK
0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK
0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK
0x95: '\u2022', # BULLET
0x96: '\u2013', # EN DASH
0x97: '\u2014', # EM DASH
0x98: '\u02dc', # SMALL TILDE
0x99: '\u2122', # TRADE MARK SIGN
0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON
0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9c: '\u0153', # LATIN SMALL LIGATURE OE
0x9d: '\x9d', #
0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON
0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS

+} + +_invalid_codepoints = {

# 0x0001 to 0x0008
0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
# 0x000E to 0x001F
0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
# 0x007F to 0x009F
0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
# 0xFDD0 to 0xFDEF
0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
# others
0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
0x10fffe, 0x10ffff

+} + + +def _replace_charref(s):

s = s.group(1)
if s[0] == '#':
```
   # numeric charref[](#l3.89)
```
```
   if s[1] in 'xX':[](#l3.90)
```

       num = int(s[2:].rstrip(';'), 16)[](#l3.91)

```
   else:[](#l3.92)
```

       num = int(s[1:].rstrip(';'))[](#l3.93)

   if num in _invalid_charrefs:[](#l3.94)

       return _invalid_charrefs[num][](#l3.95)

   if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:[](#l3.96)

```
       return '\uFFFD'[](#l3.97)
```

   if num in _invalid_codepoints:[](#l3.98)

```
       return ''[](#l3.99)
```
```
   return chr(num)[](#l3.100)
```
else:
```
   # named charref[](#l3.102)
```
```
   if s in _html5:[](#l3.103)
```
```
       return _html5[s][](#l3.104)
```

   # find the longest matching name (as defined by the standard)[](#l3.105)

   for x in range(len(s)-1, 1, -1):[](#l3.106)

```
       if s[:x] in _html5:[](#l3.107)
```

           return _html5[s[:x]] + s[x:][](#l3.108)

```
   else:[](#l3.109)
```
```
       return '&' + s[](#l3.110)
```

+ + +_charref = _re.compile(r'&(#[0-9]+;?'

                  r'|#[xX][0-9a-fA-F]+;?'[](#l3.114)

                  r'|[^\t\n\f <&#;]{1,32};?)')[](#l3.115)

+ +def unescape(s):

"""
Convert all named and numeric character references (e.g. &gt;, &#62;,
&#x3e;) in the string s to the corresponding unicode characters.
This function uses the rules defined by the HTML 5 standard
for both valid and invalid character references, and the list of
HTML 5 named character references defined in html.entities.html5.
"""
if '&' not in s:
```
   return s[](#l3.126)
```
return _charref.sub(_replace_charref, s)

--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -8,9 +8,12 @@

# and CDATA (character data -- only end tags are special).

-import _markupbase import re import warnings +import _markupbase + +from html import unescape + __all__ = ['HTMLParser'] @@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase) attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] if attrvalue:

           attrvalue = self.unescape(attrvalue)[](#l4.21)

           attrvalue = unescape(attrvalue)[](#l4.22)
       attrs.append((attrname.lower(), attrvalue))[](#l4.23)
       k = m.end()[](#l4.24)

@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase) def unknown_decl(self, data): if self.strict: self.error("unknown declaration: %r" % (data,)) -

# Internal -- helper to remove special character quoting
def unescape(self, s):
```
   if '&' not in s:[](#l4.33)
```
```
       return s[](#l4.34)
```
```
   def replaceEntities(s):[](#l4.35)
```
```
       s = s.groups()[0][](#l4.36)
```
```
       try:[](#l4.37)
```
```
           if s[0] == "#":[](#l4.38)
```
```
               s = s[1:][](#l4.39)
```

               if s[0] in ['x','X']:[](#l4.40)

                   c = int(s[1:].rstrip(';'), 16)[](#l4.41)

```
               else:[](#l4.42)
```

                   c = int(s.rstrip(';'))[](#l4.43)

```
               return chr(c)[](#l4.44)
```
```
       except ValueError:[](#l4.45)
```
```
           return '&#' + s[](#l4.46)
```
```
       else:[](#l4.47)
```

           from html.entities import html5[](#l4.48)

```
           if s in html5:[](#l4.49)
```

               return html5[s][](#l4.50)

           elif s.endswith(';'):[](#l4.51)

```
               return '&' + s[](#l4.52)
```

           for x in range(2, len(s)):[](#l4.53)

               if s[:x] in html5:[](#l4.54)

                   return html5[s[:x]] + s[x:][](#l4.55)

```
           else:[](#l4.56)
```
```
               return '&' + s[](#l4.57)
```

   return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",[](#l4.59)

                 replaceEntities, s, flags=re.ASCII)[](#l4.60)

--- a/Lib/test/test_html.py +++ b/Lib/test/test_html.py @@ -16,9 +16,89 @@ class HtmlTests(unittest.TestCase): html.escape('\'<script>"&foo;"</script>\'', False), '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

def test_unescape(self):

   numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;'][](#l5.8)

   errmsg = 'unescape(%r) should have returned %r'[](#l5.9)

```
   def check(text, expected):[](#l5.10)
```

       self.assertEqual(html.unescape(text), expected,[](#l5.11)

                        msg=errmsg % (text, expected))[](#l5.12)

   def check_num(num, expected):[](#l5.13)

       for format in numeric_formats:[](#l5.14)

           text = format % num[](#l5.15)

           self.assertEqual(html.unescape(text), expected,[](#l5.16)

                            msg=errmsg % (text, expected))[](#l5.17)

   # check text with no character references[](#l5.18)

   check('no character references', 'no character references')[](#l5.19)

   # check & followed by invalid chars[](#l5.20)

   check('&\n&\t& &&', '&\n&\t& &&')[](#l5.21)

   # check & followed by numbers and letters[](#l5.22)

   check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')[](#l5.23)

   # check incomplete entities at the end of the string[](#l5.24)

   for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:[](#l5.25)

```
       check(x, x)[](#l5.26)
```
```
       check(x+';', x+';')[](#l5.27)
```

   # check several combinations of numeric character references,[](#l5.28)

   # possibly followed by different characters[](#l5.29)

   formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',[](#l5.30)

              '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',[](#l5.31)

              '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;'][](#l5.32)

   for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],[](#l5.33)

                        ['A', 'a', '"', '&', '\u2603', '\U00101234']):[](#l5.34)

```
       for s in formats:[](#l5.35)
```

           check(s % num, char)[](#l5.36)

           for end in [' ', 'X']:[](#l5.37)

               check((s+end) % num, char+end)[](#l5.38)

```
   # check invalid codepoints[](#l5.39)
```

   for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:[](#l5.40)

       check_num(cp, '\uFFFD')[](#l5.41)

   # check more invalid codepoints[](#l5.42)

   for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:[](#l5.43)

```
       check_num(cp, '')[](#l5.44)
```
```
   # check invalid numbers[](#l5.45)
```

   for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):[](#l5.46)

```
       check_num(num, ch)[](#l5.47)
```
```
   # check small numbers[](#l5.48)
```
```
   check_num(0, '\uFFFD')[](#l5.49)
```
```
   check_num(9, '\t')[](#l5.50)
```
```
   # check a big number[](#l5.51)
```

   check_num(1000000000000000000, '\uFFFD')[](#l5.52)

   # check that multiple trailing semicolons are handled correctly[](#l5.53)

   for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:[](#l5.54)

```
       check(e, '";')[](#l5.55)
```

   # check that semicolons in the middle don't create problems[](#l5.56)

   for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:[](#l5.57)

```
       check(e, '"quot;')[](#l5.58)
```

   # check triple adjacent charrefs[](#l5.59)

   for e in ['&quot', '&#34', '&#x22', '&#X22']:[](#l5.60)

```
       check(e*3, '"""')[](#l5.61)
```

       check((e+';')*3, '"""')[](#l5.62)

   # check that the case is respected[](#l5.63)

   for e in ['&amp', '&amp;', '&AMP', '&AMP;']:[](#l5.64)

```
       check(e, '&')[](#l5.65)
```

   for e in ['&Amp', '&Amp;']:[](#l5.66)

```
       check(e, e)[](#l5.67)
```

   # check that non-existent named entities are returned unchanged[](#l5.68)

   check('&svadilfari;', '&svadilfari;')[](#l5.69)

   # the following examples are in the html5 specs[](#l5.70)

```
   check('&notit', '¬it')[](#l5.71)
```
```
   check('&notit;', '¬it;')[](#l5.72)
```
```
   check('&notin', '¬in')[](#l5.73)
```
```
   check('&notin;', '∉')[](#l5.74)
```

   # a similar example with a long name[](#l5.75)

   check('&notReallyAnExistingNamedCharacterReference;',[](#l5.76)

         '¬ReallyAnExistingNamedCharacterReference;')[](#l5.77)

```
   # longest valid name[](#l5.78)
```

   check('&CounterClockwiseContourIntegral;', '∳')[](#l5.79)

   # check a charref that maps to two unicode chars[](#l5.80)

   check('&acE;', '\u223E\u0333')[](#l5.81)

```
   check('&acE', '&acE')[](#l5.82)
```
```
   # see #12888[](#l5.83)
```

   check('&#123; ' * 1050, '{ ' * 1050)[](#l5.84)

```
   # see #15156[](#l5.85)
```

   check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',[](#l5.86)

         'ÉricÉric&alphacentauriαcentauri')[](#l5.87)

```
   check('&co;', '&co;')[](#l5.88)
```

-def test_main():

run_unittest(HtmlTests)

if __name__ == '__main__':

test_main()

unittest.main()

--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -569,18 +569,6 @@ class HTMLParserTolerantTestCase(HTMLPar for html, expected in data: self._run_check(html, expected)

def test_unescape_function(self):
```
   p = self.get_collector()[](#l6.8)
```

   self.assertEqual(p.unescape('&#bad;'),'&#bad;')[](#l6.9)

   self.assertEqual(p.unescape('&#0038;'),'&')[](#l6.10)

```
   # see #12888[](#l6.11)
```

   self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)[](#l6.12)

```
   # see #15156[](#l6.13)
```

   self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'[](#l6.14)

                               '&alphacentauri&alpha;centauri'),[](#l6.15)

                               'ÉricÉric&alphacentauriαcentauri')[](#l6.16)

   self.assertEqual(p.unescape('&co;'), '&co;')[](#l6.17)

- def test_broken_comments(self): html = ('<! not really a comment >' '<! not a comment either -->'

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -59,6 +59,8 @@ Library

Issue #19449: in csv's writerow, handle non-string keys when generating the error message that certain keys are not in the 'fieldnames' list. +- Issue #2927: Added the unescape() function to the html module. +
Issue #8402: Added the escape() function to the glob module.
Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.

cpython: 7b9235852b3b (original) (raw)

Mercurial > cpython

changeset 87275:7b9235852b3b

0x0001 to 0x0008

0x000E to 0x001F

0x007F to 0x009F

0xFDD0 to 0xFDEF

others

and CDATA (character data -- only end tags are special).

Internal -- helper to remove special character quoting