(original) (raw)

changeset: 74859:242b697449d8 branch: 3.2 parent: 74848:a352e24b9907 user: Ezio Melotti <ezio.melotti@gmail.com> date: Fri Feb 10 10:45:44 2012 +0200 files: Lib/html/parser.py Lib/test/test_htmlparser.py Misc/NEWS description: #13960: HTMLParser is now able to handle broken comments when strict=False. diff -r a352e24b9907 -r 242b697449d8 Lib/html/parser.py --- a/Lib/html/parser.py Thu Feb 09 18:26:59 2012 +0800 +++ b/Lib/html/parser.py Fri Feb 10 10:45:44 2012 +0200 @@ -184,7 +184,17 @@ elif startswith("<?", i): k = self.parse_pi(i) elif startswith("<!", i): - k = self.parse_declaration(i) + # this might fail with things like <! not a comment > or + # <! -- space before '--' -->. When strict is True an + # error is raised, when it's False they will be considered + # as bogus comments and parsed (see parse_bogus_comment). + if self.strict: + k = self.parse_declaration(i) + else: + try: + k = self.parse_declaration(i) + except HTMLParseError: + k = self.parse_bogus_comment(i) elif (i + 1) < n: self.handle_data("<") k = i + 1 @@ -256,6 +266,19 @@ i = self.updatepos(i, n) self.rawdata = rawdata[i:] + # Internal -- parse bogus comment, return length or -1 if not terminated + # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + def parse_bogus_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+2] != '<!': + self.error('unexpected call to parse_comment()') + pos = rawdata.find('>', i+2) + if pos == -1: + return -1 + if report: + self.handle_comment(rawdata[i+2:pos]) + return pos + 1 + # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata diff -r a352e24b9907 -r 242b697449d8 Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Thu Feb 09 18:26:59 2012 +0800 +++ b/Lib/test/test_htmlparser.py Fri Feb 10 10:45:44 2012 +0200 @@ -323,6 +323,23 @@ ("endtag", element_lower)], collector=Collector()) + def test_comments(self): + html = ("<!-- I'm a valid comment -->" + '<!--me too!-->' + '<!------>' + '<!---->' + '<!----I have many hyphens---->' + '<!-- I have a > in the middle -->' + '<!-- and I have -- in the middle! -->') + expected = [('comment', " I'm a valid comment "), + ('comment', 'me too!'), + ('comment', '--'), + ('comment', ''), + ('comment', '--I have many hyphens--'), + ('comment', ' I have a > in the middle '), + ('comment', ' and I have -- in the middle! ')] + self._run_check(html, expected) + def test_condcoms(self): html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->' '<!--[if IE 8]>condcoms<![endif]-->' @@ -426,6 +443,19 @@ # see #12888 self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) + def test_broken_comments(self): + html = ('<! not really a comment >' + '<! not a comment either -->' + '<! -- close enough -->' + '<!!! another bogus comment !!!>') + expected = [ + ('comment', ' not really a comment '), + ('comment', ' not a comment either --'), + ('comment', ' -- close enough --'), + ('comment', '!! another bogus comment !!!'), + ] + self._run_check(html, expected) + def test_broken_condcoms(self): # these condcoms are missing the '--' after '<!' and before the '>' html = ('<![if !(IE)]>broken condcom<![endif]>' diff -r a352e24b9907 -r 242b697449d8 Misc/NEWS --- a/Misc/NEWS Thu Feb 09 18:26:59 2012 +0800 +++ b/Misc/NEWS Fri Feb 10 10:45:44 2012 +0200 @@ -113,6 +113,9 @@ Library ------- +- Issue #13960: HTMLParser is now able to handle broken comments when + strict=False. + - Issue #9021: Add an introduction to the copy module documentation. - Issue #6005: Examples in the socket library documentation use sendall, where @@ -123,7 +126,7 @@ - Issue #10881: Fix test_site failure with OS X framework builds. -- Issue #964437 Make IDLE help window non-modal. +- Issue #964437: Make IDLE help window non-modal. Patch by Guilherme Polo and Roger Serwy. - Issue #2945: Make the distutils upload command aware of bdist_rpm products.