cpython: 1575f2dd08c4 (original) (raw)

--- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,14 +16,21 @@ This module defines a class :class:HTMLParser which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False) +.. class:: HTMLParser(strict=False, *, convert_charrefs=False) +

Create a parser instance.

Create a parser instance. If strict is False (the default), the parser
will accept and parse invalid markup. If strict is True the parser
will raise an :exc:~html.parser.HTMLParseError exception instead [#]_ when
it's not able to parse the markup.
The use of strict=True is discouraged and the strict argument is
deprecated.

If convert_charrefs is True (default: False), all character
references (except the ones in script/style elements) are
automatically converted to the corresponding Unicode characters.
The use of convert_charrefs=True is encouraged and will become
the default in Python 3.5. +
If strict is False (the default), the parser will accept and parse
invalid markup. If strict is True the parser will raise an
:exc:~html.parser.HTMLParseError exception instead [#]_ when it's not
able to parse the markup. The use of strict=True is discouraged and
the strict argument is deprecated. An :class:.HTMLParser instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -34,12 +41,15 @@ parsing text files formatted in HTML (Hy handler for elements which are closed implicitly by closing an outer element. .. versionchanged:: 3.2

```
 *strict* keyword added.[](#l1.36)
```

```
 *strict* argument added.[](#l1.37)
```

.. deprecated-removed:: 3.3 3.5 The strict argument and the strict mode have been deprecated. The parser is now able to accept and parse invalid markup too.

.. versionchanged:: 3.4

 *convert_charrefs* keyword argument added.[](#l1.44)

+ An exception is defined as well: @@ -181,7 +191,8 @@ implementations do nothing (except for : This method is called to process a named character reference of the form &name; (e.g. &gt;), where name is a general entity reference

(e.g. 'gt').

(e.g. 'gt'). This method is never called if convert_charrefs is
True.

.. method:: HTMLParser.handle_charref(name) @@ -189,7 +200,8 @@ implementations do nothing (except for : This method is called to process decimal and hexadecimal numeric character references of the form &#NNN; and &#xNNN;. For example, the decimal equivalent for &gt; is &#62;, whereas the hexadecimal is &#x3E;;

in this case the method will receive '62' or 'x3E'.

in this case the method will receive '62' or 'x3E'. This method
is never called if convert_charrefs is True.

.. method:: HTMLParser.handle_comment(data) @@ -324,7 +336,8 @@ correct char (note: these 3 references a Num ent : > Feeding incomplete chunks to :meth:~HTMLParser.feed works, but -:meth:~HTMLParser.handle_data might be called more than once:: +:meth:~HTMLParser.handle_data might be called more than once +(unless convert_charrefs is set to True):: >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']: ... parser.feed(chunk)

--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -97,7 +97,7 @@ class HTMLParseError(Exception): return result -_strict_sentinel = object() +_default_sentinel = object() class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. @@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase) self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data

may be split up in arbitrary chunks). Entity references are
passed by calling self.handle_entityref() with the entity
reference as the argument. Numeric character references are
passed to self.handle_charref() with the string containing the
reference as the argument.

may be split up in arbitrary chunks). If convert_charrefs is
True the character references are converted automatically to the
corresponding Unicode character (and self.handle_data() is no
longer split in chunks), otherwise they are passed by calling
self.handle_entityref() or self.handle_charref() with the string
containing respectively the named or numeric reference as the
argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style")

def init(self, strict=_strict_sentinel):

def init(self, strict=_default_sentinel, *,

            convert_charrefs=_default_sentinel):[](#l2.34)
   """Initialize and reset this instance.[](#l2.35)

   If convert_charrefs is True (default: False), all character references[](#l2.37)

   are automatically converted to the corresponding Unicode characters.[](#l2.38)
   If strict is set to False (the default) the parser will parse invalid[](#l2.39)
   markup, otherwise it will raise an error.  Note that the strict mode[](#l2.40)
   and argument are deprecated.[](#l2.41)
   """[](#l2.42)

   if strict is not _strict_sentinel:[](#l2.43)

   if strict is not _default_sentinel:[](#l2.44)
       warnings.warn("The strict argument and mode are deprecated.",[](#l2.45)
                     DeprecationWarning, stacklevel=2)[](#l2.46)
   else:[](#l2.47)
       strict = False  # default[](#l2.48)
   self.strict = strict[](#l2.49)

   if convert_charrefs is _default_sentinel:[](#l2.50)

       convert_charrefs = False  # default[](#l2.51)

       warnings.warn("The value of convert_charrefs will become True in "[](#l2.52)

                     "3.5. You are encouraged to set the value explicitly.",[](#l2.53)

                     DeprecationWarning, stacklevel=2)[](#l2.54)

   self.convert_charrefs = convert_charrefs[](#l2.55)
   self.reset()[](#l2.56)

def reset(self): @@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase) i = 0 n = len(rawdata) while i < n:

       match = self.interesting.search(rawdata, i) # < or &[](#l2.63)

```
       if match:[](#l2.64)
```
```
           j = match.start()[](#l2.65)
```

       if self.convert_charrefs and not self.cdata_elem:[](#l2.66)

           j = rawdata.find('<', i)[](#l2.67)

```
           if j < 0:[](#l2.68)
```
```
               if not end:[](#l2.69)
```

                   break  # wait till we get all the text[](#l2.70)

               j = n[](#l2.71)
       else:[](#l2.72)

           if self.cdata_elem:[](#l2.73)

```
               break[](#l2.74)
```
```
           j = n[](#l2.75)
```

       if i < j: self.handle_data(rawdata[i:j])[](#l2.76)

           match = self.interesting.search(rawdata, i)  # < or &[](#l2.77)

```
           if match:[](#l2.78)
```

               j = match.start()[](#l2.79)

```
           else:[](#l2.80)
```

               if self.cdata_elem:[](#l2.81)

```
                   break[](#l2.82)
```
```
               j = n[](#l2.83)
```
```
       if i < j:[](#l2.84)
```

           if self.convert_charrefs and not self.cdata_elem:[](#l2.85)

               self.handle_data(unescape(rawdata[i:j]))[](#l2.86)

```
           else:[](#l2.87)
```

               self.handle_data(rawdata[i:j])[](#l2.88)
       i = self.updatepos(i, j)[](#l2.89)
       if i == n: break[](#l2.90)
       startswith = rawdata.startswith[](#l2.91)

@@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase) k = i + 1 else: k += 1

               self.handle_data(rawdata[i:k])[](#l2.96)

               if self.convert_charrefs and not self.cdata_elem:[](#l2.97)

                   self.handle_data(unescape(rawdata[i:k]))[](#l2.98)

```
               else:[](#l2.99)
```

                   self.handle_data(rawdata[i:k])[](#l2.100)
           i = self.updatepos(i, k)[](#l2.101)
       elif startswith("&#", i):[](#l2.102)
           match = charref.match(rawdata, i)[](#l2.103)

@@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase) assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem:

       self.handle_data(rawdata[i:n])[](#l2.108)

       if self.convert_charrefs and not self.cdata_elem:[](#l2.109)

           self.handle_data(unescape(rawdata[i:n]))[](#l2.110)

```
       else:[](#l2.111)
```

           self.handle_data(rawdata[i:n])[](#l2.112)
       i = self.updatepos(i, n)[](#l2.113)
   self.rawdata = rawdata[i:][](#l2.114)

--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): +

def get_events(self):
```
   return self.events[](#l3.10)
```

def handle_charref(self, data):

   self.fail('This should never be called with convert_charrefs=True')[](#l3.13)

def handle_entityref(self, data):

   self.fail('This should never be called with convert_charrefs=True')[](#l3.16)

+ + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase): parser.close() events = parser.get_events() if events != expected_events:

       self.fail("received events did not match expected events\n"[](#l3.26)

                 "Expected:\n" + pprint.pformat(expected_events) +[](#l3.27)

       self.fail("received events did not match expected events" +[](#l3.28)

                 "\nSource:\n" + repr(source) +[](#l3.29)

                 "\nExpected:\n" + pprint.pformat(expected_events) +[](#l3.30)
                 "\nReceived:\n" + pprint.pformat(events))[](#l3.31)

def _run_check_extra(self, source, events):

   self._run_check(source, events, EventCollectorExtra())[](#l3.34)

   self._run_check(source, events,[](#l3.35)

                   EventCollectorExtra(convert_charrefs=False))[](#l3.36)

def _parse_error(self, source): def parse(source=source): @@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):

       return EventCollector(strict=True)[](#l3.44)

       return EventCollector(strict=True, convert_charrefs=False)[](#l3.45)

def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -335,7 +349,7 @@ text self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)],

                       collector=Collector())[](#l3.53)

                       collector=Collector(convert_charrefs=False))[](#l3.54)

def test_comments(self): html = ("" @@ -363,14 +377,54 @@ text ('comment', '[if lte IE 7]>pretty?<![endif]')] self._run_check(html, expected)

def test_convert_charrefs(self):

   collector = lambda: EventCollectorCharrefs(convert_charrefs=True)[](#l3.63)

   self.assertTrue(collector().convert_charrefs)[](#l3.64)

   charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'][](#l3.65)

   # check charrefs in the middle of the text/attributes[](#l3.66)

   expected = [('starttag', 'a', [('href', 'foo"zar')]),[](#l3.67)

               ('data', 'a"z'), ('endtag', 'a')][](#l3.68)

```
   for charref in charrefs:[](#l3.69)
```

       self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),[](#l3.70)

                       expected, collector=collector())[](#l3.71)

   # check charrefs at the beginning/end of the text/attributes[](#l3.72)

```
   expected = [('data', '"'),[](#l3.73)
```

               ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),[](#l3.74)

               ('data', '"'), ('endtag', 'a'), ('data', '"')][](#l3.75)

```
   for charref in charrefs:[](#l3.76)
```

       self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'[](#l3.77)

                       '{0}</a>{0}'.format(charref),[](#l3.78)

                       expected, collector=collector())[](#l3.79)

   # check charrefs in <script>/<style> elements[](#l3.80)

```
   for charref in charrefs:[](#l3.81)
```

       text = 'X'.join([charref]*3)[](#l3.82)

       expected = [('data', '"'),[](#l3.83)

                   ('starttag', 'script', []), ('data', text),[](#l3.84)

                   ('endtag', 'script'), ('data', '"'),[](#l3.85)

                   ('starttag', 'style', []), ('data', text),[](#l3.86)

                   ('endtag', 'style'), ('data', '"')][](#l3.87)

       self._run_check('{1}<script>{0}</script>{1}'[](#l3.88)

                       '<style>{0}</style>{1}'.format(text, charref),[](#l3.89)

                       expected, collector=collector())[](#l3.90)

   # check truncated charrefs at the end of the file[](#l3.91)

```
   html = '&quo &# &#x'[](#l3.92)
```

   for x in range(1, len(html)):[](#l3.93)

       self._run_check(html[:x], [('data', html[:x])],[](#l3.94)

                       collector=collector())[](#l3.95)

   # check a string with no charrefs[](#l3.96)

   self._run_check('no charrefs here', [('data', 'no charrefs here')],[](#l3.97)

                   collector=collector())[](#l3.98)

+ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self):

```
   return EventCollector()[](#l3.104)
```

   return EventCollector(convert_charrefs=False)[](#l3.105)

def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning):

       EventCollector()  # convert_charrefs not passed explicitly[](#l3.109)

   with self.assertWarns(DeprecationWarning):[](#l3.110)
       EventCollector(strict=True)[](#l3.111)
   with self.assertWarns(DeprecationWarning):[](#l3.112)
       EventCollector(strict=False)[](#l3.113)

@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):

       return EventCollector(strict=True)[](#l3.118)

       return EventCollector(strict=True, convert_charrefs=False)[](#l3.119)

def test_attr_syntax(self): output = [ @@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseB class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self):

```
   return EventCollector()[](#l3.127)
```

   return EventCollector(convert_charrefs=False)[](#l3.128)

def test_attr_funky_names2(self): self._run_check(

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -132,6 +132,9 @@ Library

Issue #19449: in csv's writerow, handle non-string keys when generating the error message that certain keys are not in the 'fieldnames' list. +- Issue #13633: Added a new convert_charrefs keyword arg to HTMLParser that,

when True, automatically converts all character references. +

Issue #2927: Added the unescape() function to the html module.
Issue #8402: Added the escape() function to the glob module.