cpython: b07488490001
changeset 76425:b07488490001 (branch 3.2)
Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the first two lines have non-UTF-8 characters without an encoding declaration.
| field | value |
|---|---|
| author | Martin v. Löwis <martin@v.loewis.de> |
| date | Fri, 20 Apr 2012 14:36:47 +0200 |
| parents | 41c64c700e1e |
| children | 98a6a57c5876 cdcc6b489862 |
| files | Lib/test/test_tokenize.py, Lib/tokenize.py, Misc/NEWS |
| diffstat | 3 files changed, 18 insertions(+), 2 deletions(-): Lib/test/test_tokenize.py (10), Lib/tokenize.py (7), Misc/NEWS (3) |
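
In practice, the change means `tokenize.detect_encoding()` now fails fast on undeclared non-UTF-8 source instead of defaulting to `'utf-8'` and letting the failure surface later. A minimal sketch of the new behaviour (assuming a Python build that includes this fix):

```python
# Source whose first line contains non-UTF-8 bytes and has no coding
# cookie is now rejected up front by detect_encoding().
import io
from tokenize import detect_encoding

source = b'print("\xdf")\n'  # 0xdf is Latin-1, not valid UTF-8
try:
    detect_encoding(io.BytesIO(source).readline)
except SyntaxError as exc:
    print("rejected:", exc)  # invalid or missing encoding declaration
```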
```diff
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -825,6 +825,16 @@ class TestDetectEncoding(TestCase):
         found, consumed_lines = detect_encoding(rl)
         self.assertEqual(found, "iso-8859-1")
 
+    def test_syntaxerror_latin1(self):
+        # Issue 14629: need to raise SyntaxError if the first
+        # line(s) have non-UTF-8 characters
+        lines = (
+            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
+            )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
     def test_utf8_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
```
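
`get_readline()` here is a private helper of `TestDetectEncoding`, not a public API. For readers who want to reproduce the test outside the suite, a rough stand-in (the name `make_readline` is hypothetical) could look like:

```python
# Hypothetical stand-in for the test suite's private get_readline():
# returns a readline-style callable that yields each byte string once,
# then b'' on exhaustion, mimicking EOF.
from tokenize import detect_encoding

def make_readline(lines):
    it = iter(lines)
    return lambda: next(it, b'')

readline = make_readline((b'print("\xdf")',))
# detect_encoding(readline) now raises SyntaxError, as the new test asserts.
```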
```diff
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -292,9 +292,12 @@ def detect_encoding(readline):
 
     def find_cookie(line):
         try:
-            line_string = line.decode('ascii')
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be UTF-8
+            # per default encoding.
+            line_string = line.decode('utf-8')
         except UnicodeDecodeError:
-            return None
+            raise SyntaxError("invalid or missing encoding declaration")
 
         matches = cookie_re.findall(line_string)
         if not matches:
```
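
The new comment in `find_cookie()` spells out the invariant: a valid coding cookie is pure ASCII, so decoding the line as UTF-8 cannot hide one. A sketch of the path that still succeeds, where a declared codec overrides the UTF-8 default:

```python
# A pure-ASCII coding cookie on the first line still wins, so later
# Latin-1 bytes in the stream are acceptable.
import io
from tokenize import detect_encoding

source = b'# -*- coding: latin-1 -*-\nprint("\xdf")\n'
encoding, consumed = detect_encoding(io.BytesIO(source).readline)
print(encoding)  # 'iso-8859-1' -- the normalized name for latin-1
```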
```diff
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the
+  first two lines have non-UTF-8 characters without an encoding declaration.
+
```