(original) (raw)
changeset: 88368:1bdcaf6c0eb5 branch: 3.3 parent: 88365:d28242a636c7 user: Serhiy Storchaka storchaka@gmail.com date: Thu Jan 09 18:36:09 2014 +0200 files: Lib/idlelib/IOBinding.py Lib/lib2to3/pgen2/tokenize.py Lib/test/test_tokenize.py Lib/tokenize.py Misc/NEWS Parser/tokenizer.c Tools/scripts/findnocoding.py description: Issue #18960: Fix bugs with Python source code encoding in the second line. * The first line of a Python script could be executed twice when the source encoding (not equal to 'utf-8') was specified on the second line. * Now the source encoding declaration on the second line isn't effective if the first line contains anything except a comment. * As a consequence, 'python -x' works now again with files with the source encoding declarations specified on the second line, and can be used again to make Python batch files on Windows. * The tokenize module now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * IDLE now ignores the source encoding declaration on the second line if the first line contains anything except a comment. * 2to3 and the findnocoding.py script now ignore the source encoding declaration on the second line if the first line contains anything except a comment. diff -r d28242a636c7 -r 1bdcaf6c0eb5 Lib/idlelib/IOBinding.py --- a/Lib/idlelib/IOBinding.py Thu Jan 09 09:36:10 2014 -0600 +++ b/Lib/idlelib/IOBinding.py Thu Jan 09 18:36:09 2014 +0200 @@ -64,6 +64,7 @@ ### 'encoding' is used below in encode(), check! coding_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def coding_spec(data): """Return the encoding declaration according to PEP 263. 
@@ -93,6 +94,8 @@ match = coding_re.match(line) if match is not None: break + if not blank_re.match(line): + return None else: return None name = match.group(1) diff -r d28242a636c7 -r 1bdcaf6c0eb5 Lib/lib2to3/pgen2/tokenize.py --- a/Lib/lib2to3/pgen2/tokenize.py Thu Jan 09 09:36:10 2014 -0600 +++ b/Lib/lib2to3/pgen2/tokenize.py Thu Jan 09 18:36:09 2014 +0200 @@ -237,6 +237,7 @@ toks_append(tokval) cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" @@ -309,6 +310,8 @@ encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: diff -r d28242a636c7 -r 1bdcaf6c0eb5 Lib/test/test_tokenize.py --- a/Lib/test/test_tokenize.py Thu Jan 09 09:36:10 2014 -0600 +++ b/Lib/test/test_tokenize.py Thu Jan 09 18:36:09 2014 +0200 @@ -885,6 +885,39 @@ readline = self.get_readline(lines) self.assertRaises(SyntaxError, detect_encoding, readline) + def test_cookie_second_line_noncommented_first_line(self): + lines = ( + b"print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + expected = [b"print('\xc2\xa3')\n"] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_commented_first_line(self): + lines = ( + b"#print('\xc2\xa3')\n", + b'# vim: set fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + + def test_cookie_second_line_empty_first_line(self): + lines = ( + b'\n', + b'# vim: set 
fileencoding=iso8859-15 :\n', + b"print('\xe2\x82\xac')\n" + ) + encoding, consumed_lines = detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] + self.assertEqual(consumed_lines, expected) + def test_latin1_normalization(self): # See get_normal_name() in tokenizer.c. encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix", diff -r d28242a636c7 -r 1bdcaf6c0eb5 Lib/tokenize.py --- a/Lib/tokenize.py Thu Jan 09 09:36:10 2014 -0600 +++ b/Lib/tokenize.py Thu Jan 09 18:36:09 2014 +0200 @@ -32,6 +32,7 @@ import collections from io import TextIOWrapper cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII) +blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) import token __all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding", @@ -409,6 +410,8 @@ encoding = find_cookie(first) if encoding: return encoding, [first] + if not blank_re.match(first): + return default, [first] second = read_or_stop() if not second: diff -r d28242a636c7 -r 1bdcaf6c0eb5 Misc/NEWS --- a/Misc/NEWS Thu Jan 09 09:36:10 2014 -0600 +++ b/Misc/NEWS Thu Jan 09 18:36:09 2014 +0200 @@ -10,6 +10,13 @@ Core and Builtins ----------------- +- Issue #18960: The first line of Python script could be executed twice when + the source encoding was specified on the second line. Now the source encoding + declaration on the second line isn't effective if the first line contains + anything except a comment. 'python -x' works now again with files with the + source encoding declarations, and can be used to make Python batch files + on Windows. 
+ - Issue #19081: When a zipimport .zip file in sys.path being imported from is modified during the lifetime of the Python process after zipimport has already cached the zip's table of contents we detect this and recover @@ -36,6 +43,9 @@ Library ------- +- Issue #18960: The tokenize module now ignore the source encoding declaration + on the second line if the first line contains anything except a comment. + - Issue #20078: Reading malformed zipfiles no longer hangs with 100% CPU consumption. @@ -204,6 +214,9 @@ IDLE ---- +- Issue #18960: IDLE now ignores the source encoding declaration on the second + line if the first line contains anything except a comment. + - Issue #20058: sys.stdin.readline() in IDLE now always returns only one line. - Issue #19481: print() of string subclass instance in IDLE no longer hangs. @@ -281,6 +294,13 @@ - Add workaround for VS 2010 nmake clean issue. VS 2010 doesn't set up PATH for nmake.exe correctly. +Tools/Demos +----------- + +- Issue #18960: 2to3 and the findnocoding.py script now ignore the source + encoding declaration on the second line if the first line contains anything + except a comment. + What's New in Python 3.3.3? =========================== diff -r d28242a636c7 -r 1bdcaf6c0eb5 Parser/tokenizer.c --- a/Parser/tokenizer.c Thu Jan 09 09:36:10 2014 -0600 +++ b/Parser/tokenizer.c Thu Jan 09 18:36:09 2014 +0200 @@ -283,13 +283,27 @@ char *cs; int r = 1; - if (tok->cont_line) + if (tok->cont_line) { /* It's a continuation line, so it can't be a coding spec. */ + tok->read_coding_spec = 1; return 1; + } if (!get_coding_spec(line, &cs, size, tok)) return 0; - if (!cs) + if (!cs) { + Py_ssize_t i; + for (i = 0; i < size; i++) { + if (line[i] == '#' || line[i] == '\n' || line[i] == '\r') + break; + if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') { + /* Stop checking coding spec after a line containing + * anything except a comment. 
*/ + tok->read_coding_spec = 1; + break; + } + } return 1; + } tok->read_coding_spec = 1; if (tok->encoding == NULL) { assert(tok->decoding_state == STATE_RAW); @@ -476,13 +490,17 @@ _Py_IDENTIFIER(open); _Py_IDENTIFIER(readline); int fd; + long pos; io = PyImport_ImportModuleNoBlock("io"); if (io == NULL) goto cleanup; fd = fileno(tok->fp); - if (lseek(fd, 0, SEEK_SET) == (off_t)-1) { + /* Due to buffering the file offset for fd can be different from the file + * position of tok->fp. */ + pos = ftell(tok->fp); + if (pos == -1 || lseek(fd, (off_t)pos, SEEK_SET) == (off_t)-1) { PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL); goto cleanup; } @@ -751,7 +769,7 @@ if (newl[0]) { if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) return error_ret(tok); - if (tok->enc == NULL && newl[1]) { + if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) { if (!check_coding_spec(newl[0]+1, newl[1] - newl[0], tok, buf_setreadl)) return error_ret(tok); diff -r d28242a636c7 -r 1bdcaf6c0eb5 Tools/scripts/findnocoding.py --- a/Tools/scripts/findnocoding.py Thu Jan 09 09:36:10 2014 -0600 +++ b/Tools/scripts/findnocoding.py Thu Jan 09 18:36:09 2014 +0200 @@ -33,6 +33,7 @@ decl_re = re.compile(rb'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)') +blank_re = re.compile(rb'^[ \t\f]*(?:[#\r\n]|$)') def get_declaration(line): match = decl_re.match(line) @@ -58,7 +59,8 @@ line1 = infile.readline() line2 = infile.readline() - if get_declaration(line1) or get_declaration(line2): + if (get_declaration(line1) or + blank_re.match(line1) and get_declaration(line2)): # the file does have an encoding declaration, so trust it return False /storchaka@gmail.com