cpython: b07488490001
changeset 76425:b07488490001 (branch 3.2)
Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the first two lines have non-UTF-8 characters without an encoding declaration.
| field | value |
|---|---|
| author | Martin v. Löwis <martin@v.loewis.de> |
| date | Fri, 20 Apr 2012 14:36:47 +0200 |
| parents | 41c64c700e1e |
| children | 98a6a57c5876 cdcc6b489862 |
| files | Lib/test/test_tokenize.py, Lib/tokenize.py, Misc/NEWS |
| diffstat | 3 files changed, 18 insertions(+), 2 deletions(-): Lib/test/test_tokenize.py (10), Lib/tokenize.py (7), Misc/NEWS (3) |
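
In practice, the change means `tokenize.detect_encoding()` now fails fast on undeclared non-UTF-8 source instead of defaulting to `'utf-8'` and letting the failure surface later. A minimal sketch of the new behaviour (assuming a Python build that includes this fix):

```python
# Source whose first line contains non-UTF-8 bytes and has no coding
# cookie is now rejected up front by detect_encoding().
import io
from tokenize import detect_encoding

source = b'print("\xdf")\n'  # 0xdf is Latin-1, not valid UTF-8
try:
    detect_encoding(io.BytesIO(source).readline)
except SyntaxError as exc:
    print("rejected:", exc)  # invalid or missing encoding declaration
```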
```diff
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -825,6 +825,16 @@ class TestDetectEncoding(TestCase):
         found, consumed_lines = detect_encoding(rl)
         self.assertEqual(found, "iso-8859-1")
 
+    def test_syntaxerror_latin1(self):
+        # Issue 14629: need to raise SyntaxError if the first
+        # line(s) have non-UTF-8 characters
+        lines = (
+            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
+            )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
     def test_utf8_normalization(self):
         # See get_normal_name() in tokenizer.c.
         encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
```
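
`get_readline()` here is a private helper of `TestDetectEncoding`, not a public API. For readers who want to reproduce the test outside the suite, a rough stand-in (the name `make_readline` is hypothetical) could look like:

```python
# Hypothetical stand-in for the test suite's private get_readline():
# returns a readline-style callable that yields each byte string once,
# then b'' on exhaustion, mimicking EOF.
from tokenize import detect_encoding

def make_readline(lines):
    it = iter(lines)
    return lambda: next(it, b'')

readline = make_readline((b'print("\xdf")',))
# detect_encoding(readline) now raises SyntaxError, as the new test asserts.
```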
```diff
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -292,9 +292,12 @@ def detect_encoding(readline):
 
     def find_cookie(line):
         try:
-            line_string = line.decode('ascii')
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be UTF-8
+            # per default encoding.
+            line_string = line.decode('utf-8')
         except UnicodeDecodeError:
-            return None
+            raise SyntaxError("invalid or missing encoding declaration")
 
         matches = cookie_re.findall(line_string)
         if not matches:
```
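
The new comment in `find_cookie()` spells out the invariant: a valid coding cookie is pure ASCII, so decoding the line as UTF-8 cannot hide one. A sketch of the path that still succeeds, where a declared codec overrides the UTF-8 default:

```python
# A pure-ASCII coding cookie on the first line still wins, so later
# Latin-1 bytes in the stream are acceptable.
import io
from tokenize import detect_encoding

source = b'# -*- coding: latin-1 -*-\nprint("\xdf")\n'
encoding, consumed = detect_encoding(io.BytesIO(source).readline)
print(encoding)  # 'iso-8859-1' -- the normalized name for latin-1
```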
```diff
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -47,6 +47,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the
+  first two lines have non-UTF-8 characters without an encoding declaration.
+
```