cpython: 225400cb6e84 (original) (raw)

--- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -28,7 +28,7 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0

# make it correctly strict without breaking backward compatibility.

attrfind = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'

- r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')

+ r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')

attrfind_tolerant = re.compile( r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')

--- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -217,6 +217,23 @@ DOCTYPE html [ ("starttag", "a", [("href", "mailto:xyz@example.com")]), ])

def test_attr_nonascii(self):
    # see issue 7311
    # Attribute values containing non-ASCII characters must come back
    # unchanged in the start-tag event.  Each call hands
    # self._run_check (defined elsewhere in the test class) a markup
    # string together with the list of events expected for it.

    # Unquoted attribute value.
    self._run_check("<img src=/foo/bar.png alt=\u4e2d\u6587>", [
        ("starttag", "img", [("src", "/foo/bar.png"),
                             ("alt", "\u4e2d\u6587")]),
        ])

    # Single-quoted attribute values.
    self._run_check("<a title='\u30c6\u30b9\u30c8' "
                    "href='\u30c6\u30b9\u30c8.html'>", [
        ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
                           ("href", "\u30c6\u30b9\u30c8.html")]),
        ])

    # Double-quoted attribute values.
    self._run_check('<a title="\u30c6\u30b9\u30c8" '
                    'href="\u30c6\u30b9\u30c8.html">', [
        ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
                           ("href", "\u30c6\u30b9\u30c8.html")]),
        ])

+ def test_attr_entity_replacement(self): self._run_check("""""", [ ("starttag", "a", [("b", "&><"'")]),

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -49,6 +49,8 @@ Core and Builtins Library ------- +- Issue #7311: fix html.parser to accept non-ASCII attribute values. +

Issue #11605: email.parser.BytesFeedParser was incorrectly converting multipart subparts with an 8bit CTE into unicode instead of preserving the bytes.