<\"'")])]) + + def test_attr_funky_names(self): + self._run_check( + "", + [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) + + def test_entityrefs_in_attributes(self): + self._run_check( + "", + [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) + + + +class AttributesTolerantTestCase(AttributesStrictTestCase): + + def get_collector(self): + return EventCollector(strict=False) + + def test_attr_funky_names2(self): + self._run_check( + "", + [("starttag", "a", [("$", None)]), + ("starttag", "b", [("$", "%")]), + ("starttag", "c", [("\\", "/")])]) + + def test_entities_in_attribute_value(self): + # see #1200313 + for entity in ['&', '&', '&', '&']: + self._run_check('' % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + + def test_malformed_attributes(self): + # see #13357 + html = ( + "test - bad1" + "test - bad2" + "test - bad3" + "test - bad4" + ) + expected = [ + ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), + ('data', 'test - bad1'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), + ('data', 'test - bad2'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), + ('data', 'test - bad3'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), + ('data', 'test - bad4'), ('endtag', 'a') + ] + self._run_check(html, expected) + + def test_malformed_adjacent_attributes(self): + # see #12629 + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('o""', None)]), + ('endtag', 'x')]) + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('""', None)]), + ('endtag', 'x')]) + + # see #755670 for the following 3 tests + def test_adjacent_attributes(self): + self._run_check('', + [("starttag", "a", + [("width", "100%"), ("cellspacing","0")])]) + 
+ self._run_check('', + [("starttag", "a", + [("id", "foo"), ("class","bar")])]) + + def test_missing_attribute_value(self): + self._run_check('', + [("starttag", "a", [("v", "")])]) + + def test_javascript_attribute_value(self): + self._run_check("", + [("starttag", "a", + [("href", "javascript:popup('/popup/help.html')")])]) + + def test_end_tag_in_attribute_value(self): + # see #1745761 + self._run_check("spam", + [("starttag", "a", + [("href", "http://www.example.org/\">;")]), + ("data", "spam"), ("endtag", "a")]) + + + def test_main(): - support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase) + support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase, + AttributesStrictTestCase, AttributesTolerantTestCase) if __name__ == "__main__": diff -r 410115400838 -r 426f7a2b1826 Misc/NEWS --- a/Misc/NEWS Mon Nov 14 01:18:24 2011 +0200 +++ b/Misc/NEWS Mon Nov 14 18:56:11 2011 +0200 @@ -365,6 +365,9 @@ Library ------- +- Issues #1745761, #755670, #13357, #12629, #1200313: HTMLParser now correctly + handles non-valid attributes, including adjacent and unquoted attributes. + - Issue #13193: Fix distutils.filelist.FileList and packaging.manifest.Manifest under Windows. The "recursive-include" directive now recognizes both legal path separators.">

(original) (raw)

""", output) + self._run_check("""""", output) + self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output) + self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output) + + def test_attr_values(self): + self._run_check("""</a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te></a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", + [("starttag", "a", [("b", "xxx\n\txxx"), + ("c", "yyy\t\nyyy"), + ("d", "\txyz\n")])]) + self._run_check("""""", + [("starttag", "a", [("b", ""), ("c", "")])]) + # Regression test for SF patch #669683. + self._run_check("", + [("starttag", "e", [("a", "rgb(1,2,3)")])]) + # Regression test for SF bug #921657. + self._run_check( + "", + [("starttag", "a", [("href", "mailto:xyz@example.com")])]) + + def test_attr_nonascii(self): + # see issue 7311 + self._run_check( + "\u4e2d\u6587", + [("starttag", "img", [("src", "/foo/bar.png"), + ("alt", "\u4e2d\u6587")])]) + self._run_check( + "", + [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")])]) + self._run_check( + '', + [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"), + ("href", "\u30c6\u30b9\u30c8.html")])]) + + def test_attr_entity_replacement(self): + self._run_check( + "", + [("starttag", "a", [("b", "&><\"'")])]) + + def test_attr_funky_names(self): + self._run_check( + "", + [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) + + def test_entityrefs_in_attributes(self): + self._run_check( + "", + [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) + + + +class AttributesTolerantTestCase(AttributesStrictTestCase): + + def get_collector(self): + return EventCollector(strict=False) + + def test_attr_funky_names2(self): + self._run_check( + "", + [("starttag", "a", [("$", None)]), + ("starttag", "b", [("$", "%")]), + ("starttag", "c", [("\\", "/")])]) + + def test_entities_in_attribute_value(self): + # see #1200313 + for entity in ['&', '&', '&', '&']: + self._run_check('** **' % entity, + [("starttag", "a", [("href", "&")])]) + 
self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + + def test_malformed_attributes(self): + # see #13357 + html = ( + "test - bad1" + "test - bad2" + "[test - bad3](test' style='color:red;bad3')" + "[test - bad4](test' style='color:red;bad4')" + ) + expected = [ + ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), + ('data', 'test - bad1'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), + ('data', 'test - bad2'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]), + ('data', 'test - bad3'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]), + ('data', 'test - bad4'), ('endtag', 'a') + ] + self._run_check(html, expected) + + def test_malformed_adjacent_attributes(self): + # see #12629 + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('o""', None)]), + ('endtag', 'x')]) + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('""', None)]), + ('endtag', 'x')]) + + # see #755670 for the following 3 tests + def test_adjacent_attributes(self): + self._run_check('', + [("starttag", "a", + [("width", "100%"), ("cellspacing","0")])]) + + self._run_check('', + [("starttag", "a", + [("id", "foo"), ("class","bar")])]) + + def test_missing_attribute_value(self): + self._run_check('', + [("starttag", "a", [("v", "")])]) + + def test_javascript_attribute_value(self): + self._run_check("", + [("starttag", "a", + [("href", "javascript:popup('/popup/help.html')")])]) + + def test_end_tag_in_attribute_value(self): + # see #1745761 + self._run_check("spam", + [("starttag", "a", + [("href", "http://www.example.org/\">;")]), + ("data", "spam"), ("endtag", "a")]) + + + def test_main(): - support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase) + 
support.run_unittest(HTMLParserStrictTestCase, HTMLParserTolerantTestCase, + AttributesStrictTestCase, AttributesTolerantTestCase) if __name__ == "__main__": diff -r 410115400838 -r 426f7a2b1826 Misc/NEWS --- a/Misc/NEWS Mon Nov 14 01:18:24 2011 +0200 +++ b/Misc/NEWS Mon Nov 14 18:56:11 2011 +0200 @@ -365,6 +365,9 @@ Library ------- +- Issues #1745761, #755670, #13357, #12629, #1200313: HTMLParser now correctly + handles non-valid attributes, including adjacent and unquoted attributes. + - Issue #13193: Fix distutils.filelist.FileList and packaging.manifest.Manifest under Windows. The "recursive-include" directive now recognizes both legal path separators.