(original) (raw)

changeset: 91963:4425024f2e01 user: Ezio Melotti <ezio.melotti@gmail.com> date: Sat Aug 02 18:36:12 2014 +0300 files: Doc/library/html.parser.rst Lib/html/parser.py Lib/test/test_htmlparser.py Misc/NEWS description: #21047: set the default value for the *convert_charrefs* argument of HTMLParser to True. Patch by Berker Peksag. diff -r 5abe28a9c8fe -r 4425024f2e01 Doc/library/html.parser.rst --- a/Doc/library/html.parser.rst Sat Aug 02 15:15:02 2014 +0300 +++ b/Doc/library/html.parser.rst Sat Aug 02 18:36:12 2014 +0300 @@ -16,15 +16,13 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(*, convert_charrefs=False) +.. class:: HTMLParser(*, convert_charrefs=True) Create a parser instance able to parse invalid markup. - If *convert_charrefs* is ``True`` (default: ``False``), all character + If *convert_charrefs* is ``True`` (the default), all character references (except the ones in ``script``/``style`` elements) are automatically converted to the corresponding Unicode characters. - The use of ``convert_charrefs=True`` is encouraged and will become - the default in Python 3.5. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -37,6 +35,9 @@ .. versionchanged:: 3.4 *convert_charrefs* keyword argument added. + .. versionchanged:: 3.5 + The default value for argument *convert_charrefs* is now ``True``. + Example HTML Parser Application ------------------------------- diff -r 5abe28a9c8fe -r 4425024f2e01 Lib/html/parser.py --- a/Lib/html/parser.py Sat Aug 02 15:15:02 2014 +0300 +++ b/Lib/html/parser.py Sat Aug 02 18:36:12 2014 +0300 @@ -59,7 +59,6 @@ endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') -_default_sentinel = object() class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. 
@@ -85,17 +84,12 @@ CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, *, convert_charrefs=_default_sentinel): + def __init__(self, *, convert_charrefs=True): """Initialize and reset this instance. - If convert_charrefs is True (default: False), all character references + If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. """ - if convert_charrefs is _default_sentinel: - convert_charrefs = False # default - warnings.warn("The value of convert_charrefs will become True in " - "3.5. You are encouraged to set the value explicitly.", - DeprecationWarning, stacklevel=2) self.convert_charrefs = convert_charrefs self.reset() diff -r 5abe28a9c8fe -r 4425024f2e01 Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Sat Aug 02 15:15:02 2014 +0300 +++ b/Lib/test/test_htmlparser.py Sat Aug 02 18:36:12 2014 +0300 @@ -346,7 +346,8 @@ self._run_check(html, expected) def test_convert_charrefs(self): - collector = lambda: EventCollectorCharrefs(convert_charrefs=True) + # default value for convert_charrefs is now True + collector = lambda: EventCollectorCharrefs() self.assertTrue(collector().convert_charrefs) charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22'] # check charrefs in the middle of the text/attributes @@ -383,10 +384,6 @@ self._run_check('no charrefs here', [('data', 'no charrefs here')], collector=collector()) - def test_deprecation_warnings(self): - with self.assertWarns(DeprecationWarning): - EventCollector() # convert_charrefs not passed explicitly - # the remaining tests were for the "tolerant" parser (which is now # the default), and check various kind of broken markup def test_tolerant_parsing(self): diff -r 5abe28a9c8fe -r 4425024f2e01 Misc/NEWS --- a/Misc/NEWS Sat Aug 02 15:15:02 2014 +0300 +++ b/Misc/NEWS Sat Aug 02 18:36:12 2014 +0300 @@ -121,6 +121,9 @@ Library ------- +- Issue #21047: set the default value for the *convert_charrefs* argument + of HTMLParser to 
True. Patch by Berker Peksag. + - Add an __all__ to html.entities. - Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,