(original) (raw)

changeset:   69189:225400cb6e84
branch:      3.2
parent:      69186:3d7c9b38fbfd
user:        Ezio Melotti
date:        Thu Apr 07 22:03:31 2011 +0300
files:       Lib/html/parser.py Lib/test/test_htmlparser.py Misc/NEWS
description:
#7311: fix html.parser to accept non-ASCII attribute values.


diff -r 3d7c9b38fbfd -r 225400cb6e84 Lib/html/parser.py
--- a/Lib/html/parser.py	Thu Apr 07 10:45:07 2011 -0400
+++ b/Lib/html/parser.py	Thu Apr 07 22:03:31 2011 +0300
@@ -28,7 +28,7 @@
 # make it correctly strict without breaking backward compatibility.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
-    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$_#=~@]*))?')
+    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
diff -r 3d7c9b38fbfd -r 225400cb6e84 Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py	Thu Apr 07 10:45:07 2011 -0400
+++ b/Lib/test/test_htmlparser.py	Thu Apr 07 22:03:31 2011 +0300
@@ -217,6 +217,23 @@
             ("starttag", "a", [("href", "mailto:xyz@example.com")]),
             ])
 
+    def test_attr_nonascii(self):
+        # see issue 7311
+        self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
+            ("starttag", "img", [("src", "/foo/bar.png"),
+                                 ("alt", "\u4e2d\u6587")]),
+            ])
+        self._run_check("<a title='\u30c6\u30b9\u30c8' "
+                        "href='\u30c6\u30b9\u30c8.html'>", [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+        self._run_check('<a title="\u30c6\u30b9\u30c8" '
+                        'href="\u30c6\u30b9\u30c8.html">', [
+            ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
+                               ("href", "\u30c6\u30b9\u30c8.html")]),
+            ])
+
     def test_attr_entity_replacement(self):
         self._run_check("""<a b='&amp;&gt;&lt;&quot;&apos;'>""", [
             ("starttag", "a", [("b", "&><\"'")]),
diff -r 3d7c9b38fbfd -r 225400cb6e84 Misc/NEWS
--- a/Misc/NEWS	Thu Apr 07 10:45:07 2011 -0400
+++ b/Misc/NEWS	Thu Apr 07 22:03:31 2011 +0300
@@ -49,6 +49,8 @@
 Library
 -------
 
+- Issue #7311: fix html.parser to accept non-ASCII attribute values.
+
 - Issue #11605: email.parser.BytesFeedParser was incorrectly converting
   multipart subpararts with an 8bit CTE into unicode instead of preserving the
   bytes.