cpython: a349448474ea (original) (raw)

--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -23,6 +23,9 @@
 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
+# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
+# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
+tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
 attrfind = re.compile(
     r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
@@ -243,7 +246,7 @@ class HTMLParser(markupbase.ParserBase):
     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
     def parse_bogus_comment(self, i, report=1):
         rawdata = self.rawdata

@@ -353,23 +356,38 @@ class HTMLParser(markupbase.ParserBase):
         match = endendtag.search(rawdata, i+1) # >
         if not match:
             return -1

         elem = match.group(1).lower() # script or style
         if self.cdata_elem is not None:
             if elem != self.cdata_elem:

         self.handle_endtag(elem)
         self.clear_cdata_mode()

     # Overridable -- finish processing of start+end tag: <tag.../>
     def handle_startendtag(self, tag, attrs):

--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -202,12 +202,12 @@ text
         self._run_check(["", ""], output)

     def test_starttag_junk_chars(self):

@@ -232,6 +232,44 @@ text
                 ("endtag", "p"),
             ])

+
     def test_get_starttag_text(self):
         s = """<foo:bar \n one="1"\ttwo=2 >"""
         self._run_check_extra(s, [

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -90,6 +90,8 @@ Core and Builtins
 Library
 -------

+- Issue #13993: HTMLParser is now able to handle broken end tags.
+