(original) (raw)

changeset: 95751:03b2259c6cd3
parent: 95749:475c6a4dfab3
parent: 95750:414e08c478f4
user: Benjamin Peterson <benjamin@python.org>
date: Tue Apr 21 12:07:06 2015 -0400
files: Lib/test/test_compile.py Misc/NEWS Parser/tokenizer.c
description:
merge 3.4 (#24022)

diff -r 475c6a4dfab3 -r 03b2259c6cd3 Lib/test/test_compile.py
--- a/Lib/test/test_compile.py Tue Apr 21 10:57:41 2015 +0200
+++ b/Lib/test/test_compile.py Tue Apr 21 12:07:06 2015 -0400
@@ -1,9 +1,11 @@
 import math
+import os
 import unittest
 import sys
 import _ast
+import tempfile
 import types
-from test import support
+from test import support, script_helper
 
 class TestSpecifics(unittest.TestCase):
 
@@ -492,6 +494,16 @@
         self.assertInvalidSingle('f()\nxy # blah\nblah()')
         self.assertInvalidSingle('x = 5 # comment\nx = 6\n')
 
+    def test_particularly_evil_undecodable(self):
+        # Issue 24022
+        src = b'0000\x00\n00000000000\n\x00\n\x9e\n'
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, "bad.py")
+            with open(fn, "wb") as fp:
+                fp.write(src)
+            res = script_helper.run_python_until_end(fn)[0]
+        self.assertIn(b"Non-UTF-8", res.err)
+
     @support.cpython_only
     def test_compiler_recursion_limit(self):
         # Expected limit is sys.getrecursionlimit() * the scaling factor
diff -r 475c6a4dfab3 -r 03b2259c6cd3 Misc/NEWS
--- a/Misc/NEWS Tue Apr 21 10:57:41 2015 +0200
+++ b/Misc/NEWS Tue Apr 21 12:07:06 2015 -0400
@@ -10,6 +10,8 @@
 Core and Builtins
 -----------------
 
+- Issue #24022: Fix tokenizer crash when processing undecodable source code.
+
 Library
 -------
 
diff -r 475c6a4dfab3 -r 03b2259c6cd3 Parser/tokenizer.c
--- a/Parser/tokenizer.c Tue Apr 21 10:57:41 2015 +0200
+++ b/Parser/tokenizer.c Tue Apr 21 12:07:06 2015 -0400
@@ -1307,6 +1307,8 @@
 {
     PyObject *s;
     int result;
+    if (tok->decoding_erred)
+        return 0;
     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
     if (s == NULL || PyUnicode_READY(s) == -1) {
         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
@@ -1475,11 +1477,8 @@
             c = tok_nextc(tok);
         }
         tok_backup(tok, c);
-        if (nonascii &&
-            !verify_identifier(tok)) {
-            tok->done = E_IDENTIFIER;
+        if (nonascii && !verify_identifier(tok))
             return ERRORTOKEN;
-        }
         *p_start = tok->start;
         *p_end = tok->cur;
         return NAME;