cpython: 8dd2f5754b2f (original) (raw)
Mercurial > cpython
changeset 77630:8dd2f5754b2f
#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup. [#15114]
Ezio Melotti ezio.melotti@gmail.com | |
---|---|
date | Sat, 23 Jun 2012 15:27:51 +0200 |
parents | 9945d7dfa72c |
children | 0e8285321659 |
files | Doc/library/html.parser.rst Lib/html/parser.py Lib/test/test_htmlparser.py Misc/NEWS |
diffstat | 4 files changed, 35 insertions(+), 18 deletions(-)[+] [-] Doc/library/html.parser.rst 21 Lib/html/parser.py 21 Lib/test/test_htmlparser.py 6 Misc/NEWS 5 |
line wrap: on
line diff
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -16,13 +16,14 @@
 This module defines a class :class:`HTMLParser` which serves as the basis for
 parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=True)
+.. class:: HTMLParser(strict=False)
-   Create a parser instance.  If *strict* is ``True`` (the default), invalid
-   HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_.  If
-   *strict* is ``False``, the parser uses heuristics to make a best guess at
-   the intention of any invalid HTML it encounters, similar to the way most
-   browsers do.  Using ``strict=False`` is advised.
+   Create a parser instance.  If *strict* is ``False`` (the default), the parser
+   will accept and parse invalid markup.  If *strict* is ``True`` the parser
+   will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
+   it's not able to parse the markup.
+   The use of ``strict=True`` is discouraged and the *strict* argument is
+   deprecated.
 An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
 when start tags, end tags, text, comments, and other markup elements are
@@ -34,6 +35,10 @@ parsing text files formatted in HTML (Hy
    .. versionchanged:: 3.2
       *strict* keyword added

+   .. deprecated-removed:: 3.3 3.5
+      The *strict* argument and the strict mode have been deprecated.
+      The parser is now able to accept and parse invalid markup too.
+
An exception is defined as well:
@@ -46,6 +51,10 @@ An exception is defined as well:
 detected, and :attr:`offset` is the number of characters into the line at
 which the construct starts.

+   .. deprecated-removed:: 3.3 3.5
+      This exception has been deprecated because it's never raised by the parser
+      (when the default non-strict mode is used).
+
 Example HTML Parser Application
 -------------------------------
--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -10,6 +10,7 @@ import _markupbase import re +import warnings
Regular expressions used for parsing
@@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase) CDATA_CONTENT_ELEMENTS = ("script", "style")
If strict is set to True (the default), errors are raised when invalid[](#l2.19)
HTML is encountered. If set to False, an attempt is instead made to[](#l2.20)
continue parsing, making "best guesses" about the intended meaning, in[](#l2.21)
a fashion similar to what browsers typically do.[](#l2.22)
If strict is set to False (the default) the parser will parse invalid[](#l2.23)
markup, otherwise it will raise an error. Note that the strict mode[](#l2.24)
is deprecated.[](#l2.25) """[](#l2.26)
if strict:[](#l2.27)
warnings.warn("The strict mode is deprecated.",[](#l2.28)
DeprecationWarning, stacklevel=2)[](#l2.29) self.strict = strict[](#l2.30) self.reset()[](#l2.31)
@@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase) # See also parse_declaration in _markupbase def parse_html_declaration(self, i): rawdata = self.rawdata
if rawdata[i:i+2] != '<!':[](#l2.37)
self.error('unexpected call to parse_html_declaration()')[](#l2.38)
assert rawdata[i:i+2] == '<!', ('unexpected call to '[](#l2.39)
'parse_html_declaration()')[](#l2.40) if rawdata[i:i+4] == '<!--':[](#l2.41) # this case is actually already handled in goahead()[](#l2.42) return self.parse_comment(i)[](#l2.43)
@@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase) # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state[](#l2.45) def parse_bogus_comment(self, i, report=1): rawdata = self.rawdata
if rawdata[i:i+2] not in ('<!', '</'):[](#l2.48)
self.error('unexpected call to parse_comment()')[](#l2.49)
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '[](#l2.50)
'parse_comment()')[](#l2.51) pos = rawdata.find('>', i+2)[](#l2.52) if pos == -1:[](#l2.53) return -1[](#l2.54)
--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase): class HTMLParserStrictTestCase(TestCaseBase): def get_collector(self):
return EventCollector(strict=True)[](#l3.7)
with support.check_warnings(("", DeprecationWarning), quite=False):[](#l3.8)
return EventCollector(strict=True)[](#l3.9)
def test_processing_instruction_only(self): self._run_check("<?processing instruction>", [ @@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLPar class AttributesStrictTestCase(TestCaseBase): def get_collector(self):
return EventCollector(strict=True)[](#l3.17)
with support.check_warnings(("", DeprecationWarning), quite=False):[](#l3.18)
return EventCollector(strict=True)[](#l3.19)
def test_attr_syntax(self): output = [
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -43,6 +43,9 @@ Core and Builtins
 Library
 -------
+- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
+  have been deprecated now that the parser is able to parse invalid markup.
+
- Issue #3665: \u and \U escapes are now supported in unicode regular expressions. Patch by Serhiy Storchaka. @@ -78,7 +81,7 @@ Library
- Issue #9527: datetime.astimezone() method will now supply a class timezone instance corresponding to the system local timezone when called with no arguments.
- +
- Issue #14653: email.utils.mktime_tz() no longer relies on system mktime() when timezone offset is supplied.