cpython: 1575f2dd08c4 (original) (raw)

--- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,14 +16,21 @@ This module defines a class :class:HTMLParser which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False) +.. class:: HTMLParser(strict=False, *, convert_charrefs=False) +

.. deprecated-removed:: 3.3 3.5 The strict argument and the strict mode have been deprecated. The parser is now able to accept and parse invalid markup too.

+ An exception is defined as well: @@ -181,7 +191,8 @@ implementations do nothing (except for : This method is called to process a named character reference of the form &name; (e.g. &gt;), where name is a general entity reference

.. method:: HTMLParser.handle_charref(name) @@ -189,7 +200,8 @@ implementations do nothing (except for : This method is called to process decimal and hexadecimal numeric character references of the form &#NNN; and &#xNNN;. For example, the decimal equivalent for &gt; is &#62;, whereas the hexadecimal is &#x3E;;

.. method:: HTMLParser.handle_comment(data) @@ -324,7 +336,8 @@ correct char (note: these 3 references a Num ent : > Feeding incomplete chunks to :meth:~HTMLParser.feed works, but -:meth:~HTMLParser.handle_data might be called more than once:: +:meth:~HTMLParser.handle_data might be called more than once +(unless convert_charrefs is set to True):: >>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']: ... parser.feed(chunk)

--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -97,7 +97,7 @@ class HTMLParseError(Exception): return result -_strict_sentinel = object() +_default_sentinel = object() class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. @@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase) self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data

def reset(self): @@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase) i = 0 n = len(rawdata) while i < n:

@@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase) k = i + 1 else: k += 1

@@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase) assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem:

--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): +

+

+

+ + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase): parser.close() events = parser.get_events() if events != expected_events:

def _run_check_extra(self, source, events):

def _parse_error(self, source): def parse(source=source): @@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):

def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -335,7 +349,7 @@ text self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)],

def test_comments(self): html = ("" @@ -363,14 +377,54 @@ text ('comment', '[if lte IE 7]>pretty?<![endif]')] self._run_check(html, expected)

+ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self):

def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning):

@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):

def test_attr_syntax(self): output = [ @@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseB class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self):

def test_attr_funky_names2(self): self._run_check(

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -132,6 +132,9 @@ Library