(original) (raw)

changeset: 74908:333e3acf2008 branch: 2.7 parent: 74900:6653328a02d0 user: Ezio Melotti ezio.melotti@gmail.com date: Mon Feb 13 16:10:44 2012 +0200 files: Lib/HTMLParser.py Lib/test/test_htmlparser.py Misc/NEWS description: #13960: HTMLParser is now able to handle broken comments. diff -r 6653328a02d0 -r 333e3acf2008 Lib/HTMLParser.py --- a/Lib/HTMLParser.py Sun Feb 12 15:59:35 2012 -0800 +++ b/Lib/HTMLParser.py Mon Feb 13 16:10:44 2012 +0200 @@ -160,7 +160,7 @@ elif startswith("+ gtpos = rawdata.find('>', 9) + if gtpos == -1: + return -1 + self.handle_decl(rawdata[i+2:gtpos]) + return gtpos+1 + else: + return self.parse_bogus_comment(i) + + # Internal -- parse bogus comment, return length or -1 if not terminated + # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state + def parse_bogus_comment(self, i, report=1): + rawdata = self.rawdata + if rawdata[i:i+2] != '', i+2) + if pos == -1: + return -1 + if report: + self.handle_comment(rawdata[i+2:pos]) + return pos + 1 + # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): rawdata = self.rawdata diff -r 6653328a02d0 -r 333e3acf2008 Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Sun Feb 12 15:59:35 2012 -0800 +++ b/Lib/test/test_htmlparser.py Mon Feb 13 16:10:44 2012 +0200 @@ -114,7 +114,7 @@sample text “ -+ """, [ ("data", "\n"), @@ -142,24 +142,6 @@ ("data", " foo"), ]) - def test_doctype_decl(self): - inside = """\ -DOCTYPE html [ - - - - - - - %paramEntity; - -]""" - self._run_check("" % inside, [ - ("decl", inside), - ]) - def test_bad_nesting(self): # Strangely, this *is* supposed to test that overlapping # elements are allowed. HTMLParser is more geared toward @@ -182,7 +164,8 @@ ]) def test_illegal_declarations(self): - self._parse_error('') + self._run_check('', + [('comment', 'spacer type="block" height="25"')]) def test_starttag_end_boundary(self): self._run_check("""""", [("starttag", "a", [("b", "<")])]) @@ -233,7 +216,7 @@ self._parse_error("", [ @@ -449,6 +432,39 @@ [("href", ";")\]" title="undefined" rel="noopener noreferrer">http://www.example.org/\\">;")\]), ("data", "spam"), ("endtag", "a")]) + def test_comments(self): + html = ("" + '' + '' + '' + '' + '' + '') + expected = [('comment', " I'm a valid comment "), + ('comment', 'me too!'), + ('comment', '--'), + ('comment', ''), + ('comment', '--I have many hyphens--'), + ('comment', ' I have a > in the middle '), + ('comment', ' and I have -- in the middle! ')] + self._run_check(html, expected) + + def test_broken_comments(self): + html = ('' + '' + '' + '' + '') + expected = [ + ('comment', ' not really a comment '), + ('comment', ' not a comment either --'), + ('comment', ' -- close enough --'), + ('comment', ''), + ('comment', '<-- this was an empty comment'), + ('comment', '!! another bogus comment !!!'), + ] + self._run_check(html, expected) + def test_condcoms(self): html = ('' '' diff -r 6653328a02d0 -r 333e3acf2008 Misc/NEWS --- a/Misc/NEWS Sun Feb 12 15:59:35 2012 -0800 +++ b/Misc/NEWS Mon Feb 13 16:10:44 2012 +0200 @@ -90,6 +90,8 @@ Library ------- +- Issue #13960: HTMLParser is now able to handle broken comments. + - Issue #9750: Fix sqlite3.Connection.iterdump on tables and fields with a name that is a keyword or contains quotes. Patch by Marko Kohtala./ezio.melotti@gmail.com