cpython: 1575f2dd08c4 (original) (raw)
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -16,14 +16,21 @@
This module defines a class :class:`HTMLParser`
which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=False)
+.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
+
- Create a parser instance. If strict is
False
(the default), the parser - will accept and parse invalid markup. If strict is
True
the parser - will raise an :exc:
~html.parser.HTMLParseError
exception instead [#]_ when - it's not able to parse the markup.
- The use of
strict=True
is discouraged and the strict argument is - deprecated.
- If convert_charrefs is
True
(default: False
), all character - references (except the ones in
script
/style
elements) are - automatically converted to the corresponding Unicode characters.
- The use of
convert_charrefs=True
is encouraged and will become - the default in Python 3.5. +
- If strict is
False
(the default), the parser will accept and parse - invalid markup. If strict is
True
the parser will raise an - :exc:
~html.parser.HTMLParseError
exception instead [#]_ when it's not - able to parse the markup. The use of
strict=True
is discouraged and - the strict argument is deprecated.
An :class:
.HTMLParser
instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -34,12 +41,15 @@ parsing text files formatted in HTML (Hy handler for elements which are closed implicitly by closing an outer element. .. versionchanged:: 3.2
*strict* keyword added.[](#l1.36)
*strict* argument added.[](#l1.37)
.. deprecated-removed:: 3.3 3.5 The strict argument and the strict mode have been deprecated. The parser is now able to accept and parse invalid markup too.
+
An exception is defined as well:
@@ -181,7 +191,8 @@ implementations do nothing (except for :
This method is called to process a named character reference of the form
&name;
(e.g. &gt;
), where name is a general entity reference
.. method:: HTMLParser.handle_charref(name)
@@ -189,7 +200,8 @@ implementations do nothing (except for :
This method is called to process decimal and hexadecimal numeric character
references of the form &#NNN;
and &#xNNN;
. For example, the decimal
equivalent for &gt;
is &#62;
, whereas the hexadecimal is &#x3E;
;
- in this case the method will receive
'62'
or 'x3E'
. This method - is never called if convert_charrefs is
True
.
.. method:: HTMLParser.handle_comment(data)
@@ -324,7 +336,8 @@ correct char (note: these 3 references a
Num ent : >
Feeding incomplete chunks to :meth:`~HTMLParser.feed`
works, but
-:meth:`~HTMLParser.handle_data`
might be called more than once::
+:meth:`~HTMLParser.handle_data`
might be called more than once
+(unless convert_charrefs is set to True
)::
>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:
... parser.feed(chunk)
--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -97,7 +97,7 @@ class HTMLParseError(Exception): return result -_strict_sentinel = object() +_default_sentinel = object() class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. @@ -112,28 +112,39 @@ class HTMLParser(_markupbase.ParserBase) self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data
- may be split up in arbitrary chunks). Entity references are
- passed by calling self.handle_entityref() with the entity
- reference as the argument. Numeric character references are
- passed to self.handle_charref() with the string containing the
- reference as the argument.
- may be split up in arbitrary chunks). If convert_charrefs is
- True the character references are converted automatically to the
- corresponding Unicode character (and self.handle_data() is no
- longer split in chunks), otherwise they are passed by calling
- self.handle_entityref() or self.handle_charref() with the string
- containing respectively the named or numeric reference as the
- argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style")
- def __init__(self, strict=_default_sentinel, *,
convert_charrefs=_default_sentinel):[](#l2.34) """Initialize and reset this instance.[](#l2.35)
If convert_charrefs is True (default: False), all character references[](#l2.37)
are automatically converted to the corresponding Unicode characters.[](#l2.38) If strict is set to False (the default) the parser will parse invalid[](#l2.39) markup, otherwise it will raise an error. Note that the strict mode[](#l2.40) and argument are deprecated.[](#l2.41) """[](#l2.42)
if strict is not _strict_sentinel:[](#l2.43)
if strict is not _default_sentinel:[](#l2.44) warnings.warn("The strict argument and mode are deprecated.",[](#l2.45) DeprecationWarning, stacklevel=2)[](#l2.46) else:[](#l2.47) strict = False # default[](#l2.48) self.strict = strict[](#l2.49)
if convert_charrefs is _default_sentinel:[](#l2.50)
convert_charrefs = False # default[](#l2.51)
warnings.warn("The value of convert_charrefs will become True in "[](#l2.52)
"3.5. You are encouraged to set the value explicitly.",[](#l2.53)
DeprecationWarning, stacklevel=2)[](#l2.54)
self.convert_charrefs = convert_charrefs[](#l2.55) self.reset()[](#l2.56)
def reset(self): @@ -184,14 +195,25 @@ class HTMLParser(_markupbase.ParserBase) i = 0 n = len(rawdata) while i < n:
match = self.interesting.search(rawdata, i) # < or &[](#l2.63)
if match:[](#l2.64)
j = match.start()[](#l2.65)
if self.convert_charrefs and not self.cdata_elem:[](#l2.66)
j = rawdata.find('<', i)[](#l2.67)
if j < 0:[](#l2.68)
if not end:[](#l2.69)
break # wait till we get all the text[](#l2.70)
j = n[](#l2.71) else:[](#l2.72)
if self.cdata_elem:[](#l2.73)
break[](#l2.74)
j = n[](#l2.75)
if i < j: self.handle_data(rawdata[i:j])[](#l2.76)
match = self.interesting.search(rawdata, i) # < or &[](#l2.77)
if match:[](#l2.78)
j = match.start()[](#l2.79)
else:[](#l2.80)
if self.cdata_elem:[](#l2.81)
break[](#l2.82)
j = n[](#l2.83)
if i < j:[](#l2.84)
if self.convert_charrefs and not self.cdata_elem:[](#l2.85)
self.handle_data(unescape(rawdata[i:j]))[](#l2.86)
else:[](#l2.87)
self.handle_data(rawdata[i:j])[](#l2.88) i = self.updatepos(i, j)[](#l2.89) if i == n: break[](#l2.90) startswith = rawdata.startswith[](#l2.91)
@@ -226,7 +248,10 @@ class HTMLParser(_markupbase.ParserBase) k = i + 1 else: k += 1
self.handle_data(rawdata[i:k])[](#l2.96)
if self.convert_charrefs and not self.cdata_elem:[](#l2.97)
self.handle_data(unescape(rawdata[i:k]))[](#l2.98)
else:[](#l2.99)
self.handle_data(rawdata[i:k])[](#l2.100) i = self.updatepos(i, k)[](#l2.101) elif startswith("&#", i):[](#l2.102) match = charref.match(rawdata, i)[](#l2.103)
@@ -277,7 +302,10 @@ class HTMLParser(_markupbase.ParserBase) assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n])[](#l2.108)
if self.convert_charrefs and not self.cdata_elem:[](#l2.109)
self.handle_data(unescape(rawdata[i:n]))[](#l2.110)
else:[](#l2.111)
self.handle_data(rawdata[i:n])[](#l2.112) i = self.updatepos(i, n)[](#l2.113) self.rawdata = rawdata[i:][](#l2.114)
--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ class EventCollectorExtra(EventCollector self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): +
- def handle_charref(self, data):
self.fail('This should never be called with convert_charrefs=True')[](#l3.13)
- def handle_entityref(self, data):
self.fail('This should never be called with convert_charrefs=True')[](#l3.16)
+ + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,12 +96,14 @@ class TestCaseBase(unittest.TestCase): parser.close() events = parser.get_events() if events != expected_events:
self.fail("received events did not match expected events\n"[](#l3.26)
"Expected:\n" + pprint.pformat(expected_events) +[](#l3.27)
self.fail("received events did not match expected events" +[](#l3.28)
"\nSource:\n" + repr(source) +[](#l3.29)
"\nExpected:\n" + pprint.pformat(expected_events) +[](#l3.30) "\nReceived:\n" + pprint.pformat(events))[](#l3.31)
def _run_check_extra(self, source, events):
self._run_check(source, events, EventCollectorExtra())[](#l3.34)
self._run_check(source, events,[](#l3.35)
EventCollectorExtra(convert_charrefs=False))[](#l3.36)
def _parse_error(self, source): def parse(source=source): @@ -105,7 +119,7 @@ class HTMLParserStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):
return EventCollector(strict=True)[](#l3.44)
return EventCollector(strict=True, convert_charrefs=False)[](#l3.45)
def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -335,7 +349,7 @@ text self._run_check(s, [("starttag", element_lower, []), ("data", content), ("endtag", element_lower)],
collector=Collector())[](#l3.53)
collector=Collector(convert_charrefs=False))[](#l3.54)
def test_comments(self): html = ("" @@ -363,14 +377,54 @@ text ('comment', '[if lte IE 7]>pretty?<![endif]')] self._run_check(html, expected)
- def test_convert_charrefs(self):
collector = lambda: EventCollectorCharrefs(convert_charrefs=True)[](#l3.63)
self.assertTrue(collector().convert_charrefs)[](#l3.64)
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'][](#l3.65)
# check charrefs in the middle of the text/attributes[](#l3.66)
expected = [('starttag', 'a', [('href', 'foo"zar')]),[](#l3.67)
('data', 'a"z'), ('endtag', 'a')][](#l3.68)
for charref in charrefs:[](#l3.69)
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),[](#l3.70)
expected, collector=collector())[](#l3.71)
# check charrefs at the beginning/end of the text/attributes[](#l3.72)
expected = [('data', '"'),[](#l3.73)
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),[](#l3.74)
('data', '"'), ('endtag', 'a'), ('data', '"')][](#l3.75)
for charref in charrefs:[](#l3.76)
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'[](#l3.77)
'{0}</a>{0}'.format(charref),[](#l3.78)
expected, collector=collector())[](#l3.79)
# check charrefs in <script>/<style> elements[](#l3.80)
for charref in charrefs:[](#l3.81)
text = 'X'.join([charref]*3)[](#l3.82)
expected = [('data', '"'),[](#l3.83)
('starttag', 'script', []), ('data', text),[](#l3.84)
('endtag', 'script'), ('data', '"'),[](#l3.85)
('starttag', 'style', []), ('data', text),[](#l3.86)
('endtag', 'style'), ('data', '"')][](#l3.87)
self._run_check('{1}<script>{0}</script>{1}'[](#l3.88)
'<style>{0}</style>{1}'.format(text, charref),[](#l3.89)
expected, collector=collector())[](#l3.90)
# check truncated charrefs at the end of the file[](#l3.91)
html = '&quo &# &#x'[](#l3.92)
for x in range(1, len(html)):[](#l3.93)
self._run_check(html[:x], [('data', html[:x])],[](#l3.94)
collector=collector())[](#l3.95)
# check a string with no charrefs[](#l3.96)
self._run_check('no charrefs here', [('data', 'no charrefs here')],[](#l3.97)
collector=collector())[](#l3.98)
+ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self):
return EventCollector()[](#l3.104)
return EventCollector(convert_charrefs=False)[](#l3.105)
def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning):
EventCollector() # convert_charrefs not passed explicitly[](#l3.109)
with self.assertWarns(DeprecationWarning):[](#l3.110) EventCollector(strict=True)[](#l3.111) with self.assertWarns(DeprecationWarning):[](#l3.112) EventCollector(strict=False)[](#l3.113)
@@ -630,7 +684,7 @@ class AttributesStrictTestCase(TestCaseB def get_collector(self): with support.check_warnings(("", DeprecationWarning), quite=False):
return EventCollector(strict=True)[](#l3.118)
return EventCollector(strict=True, convert_charrefs=False)[](#l3.119)
def test_attr_syntax(self): output = [ @@ -691,7 +745,7 @@ class AttributesStrictTestCase(TestCaseB class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self):
return EventCollector()[](#l3.127)
return EventCollector(convert_charrefs=False)[](#l3.128)
def test_attr_funky_names2(self): self._run_check(