cpython: 259745f9a1e4 (original) (raw)

Mercurial > cpython

changeset 104848:259745f9a1e4

Issue 28128: Print out better error/warning messages for invalid string escapes.

Eric V. Smith eric@trueblade.com
date	Mon, 31 Oct 2016 09:22:08 -0400
parents	66f255754ce9
children	fb672afd0151
files	Include/bytesobject.h Include/unicodeobject.h Lib/test/test_string_literals.py Lib/test/test_unicode.py Misc/NEWS Objects/bytesobject.c Objects/unicodeobject.c Python/ast.c
diffstat	8 files changed, 173 insertions(+), 22 deletions(-)[+] [-] Include/bytesobject.h 5 Include/unicodeobject.h 11 Lib/test/test_string_literals.py 27 Lib/test/test_unicode.py 7 Misc/NEWS 4 Objects/bytesobject.c 37 Objects/unicodeobject.c 38 Python/ast.c 66

line wrap: on

line diff

--- a/Include/bytesobject.h +++ b/Include/bytesobject.h @@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex( PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, const char *, Py_ssize_t, const char *); +/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */ +PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,

                                        const char *, Py_ssize_t,[](#l1.9)

                                        const char *,[](#l1.10)

                                        const char **);[](#l1.11)

/* Macro, trading safety for speed */ #ifndef Py_LIMITED_API

--- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUn const char *errors /* error handling */ ); +/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape

chars. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(

   const char *string,     /* Unicode-Escape encoded string */[](#l2.10)

   Py_ssize_t length,      /* size of string */[](#l2.11)

   const char *errors,     /* error handling */[](#l2.12)

   const char **first_invalid_escape  /* on return, points to first[](#l2.13)

                                         invalid escaped char in[](#l2.14)

                                         string. */[](#l2.15)

+); + PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( PyObject *unicode /* Unicode object */ );

--- a/Lib/test/test_string_literals.py +++ b/Lib/test/test_string_literals.py @@ -31,6 +31,7 @@ import os import sys import shutil import tempfile +import warnings import unittest @@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" '\U000000' """) self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)

def test_eval_str_invalid_escape(self):
```
   for b in range(1, 128):[](#l3.16)
```

       if b in b"""\n\r"'01234567NU\\abfnrtuvx""":[](#l3.17)

```
           continue[](#l3.18)
```

       with self.assertWarns(DeprecationWarning):[](#l3.19)

           self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b))[](#l3.20)

   with warnings.catch_warnings(record=True) as w:[](#l3.21)

       warnings.simplefilter('always', category=DeprecationWarning)[](#l3.22)

```
       eval("'''\n\\z'''")[](#l3.23)
```

   self.assertEqual(len(w), 1)[](#l3.24)

   self.assertEqual(w[0].filename, '<string>')[](#l3.25)

   self.assertEqual(w[0].lineno, 2)[](#l3.26)

+ def test_eval_str_raw(self): self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') @@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" b'\x' """) self.assertRaises(SyntaxError, eval, r""" b'\x0' """)

def test_eval_bytes_invalid_escape(self):
```
   for b in range(1, 128):[](#l3.36)
```

       if b in b"""\n\r"'01234567\\abfnrtvx""":[](#l3.37)

```
           continue[](#l3.38)
```

       with self.assertWarns(DeprecationWarning):[](#l3.39)

           self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b]))[](#l3.40)

   with warnings.catch_warnings(record=True) as w:[](#l3.41)

       warnings.simplefilter('always', category=DeprecationWarning)[](#l3.42)

```
       eval("b'''\n\\z'''")[](#l3.43)
```

   self.assertEqual(len(w), 1)[](#l3.44)

   self.assertEqual(w[0].filename, '<string>')[](#l3.45)

   self.assertEqual(w[0].lineno, 2)[](#l3.46)

+ def test_eval_bytes_raw(self): self.assertEqual(eval(""" br'x' """), b'x') self.assertEqual(eval(""" rb'x' """), b'x')

--- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTes support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, reversed, str)

def test_invalid_sequences(self):

   for letter in string.ascii_letters + "89": # 0-7 are octal escapes[](#l4.8)

       if letter in "abfnrtuvxNU":[](#l4.9)

```
           continue[](#l4.10)
```

       with self.assertWarns(DeprecationWarning):[](#l4.11)

           eval(r"'\%s'" % letter)[](#l4.12)

- class CAPITest(unittest.TestCase):

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1 Core and Builtins ----------------- +- Issue #28128: Deprecation warning for invalid str and byte escape

sequences now prints better information about where the error
occurs. Patch by Serhiy Storchaka and Eric Smith. +

Issue #28509: dict.update() no longer allocate unnecessary large memory.
Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug

--- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1105,11 +1105,12 @@ static char * return p; } -PyObject *PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, Py_ssize_t unicode,

                           const char *recode_encoding)[](#l6.12)

                           const char *recode_encoding,[](#l6.13)

                           const char **first_invalid_escape)[](#l6.14)

{ int c; char *p; @@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const cha return NULL; writer.overallocate = 1;

*first_invalid_escape = NULL;

+ end = s + len; while (s < end) { if (*s != '\\') { @@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const cha break; default:

       if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)[](#l6.31)

```
           goto failed;[](#l6.32)
```

       if (*first_invalid_escape == NULL) {[](#l6.33)

           *first_invalid_escape = s-1; /* Back up one char, since we've[](#l6.34)

                                           already incremented s. */[](#l6.35)

       }[](#l6.36)
       *p++ = '\\';[](#l6.37)

       s--;[](#l6.38)
       goto non_esc; /* an arbitrary number of unescaped[](#l6.39)
                        UTF-8 bytes may follow. */[](#l6.40)
   }[](#l6.41)

@@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const cha return NULL; } +PyObject *PyBytes_DecodeEscape(const char *s,

                           Py_ssize_t len,[](#l6.47)

                           const char *errors,[](#l6.48)

                           Py_ssize_t unicode,[](#l6.49)

                           const char *recode_encoding)[](#l6.50)

const char* first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,

                                        recode_encoding,[](#l6.54)

                                        &first_invalid_escape);[](#l6.55)

if (result == NULL)
```
   return NULL;[](#l6.57)
```
if (first_invalid_escape != NULL) {

   if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,[](#l6.59)

                        "invalid escape sequence '\\%c'",[](#l6.60)

                        *first_invalid_escape) < 0) {[](#l6.61)

```
       Py_DECREF(result);[](#l6.62)
```
```
       return NULL;[](#l6.63)
```
```
   }[](#l6.64)
```
}
return result;

+ +} /* -------------------------------------------------------------------- */ /* object api */

--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5877,9 +5877,10 @@ PyUnicode_AsUTF16String(PyObject *unicod static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s,

                         Py_ssize_t size,[](#l7.8)

                         const char *errors)[](#l7.9)

+_PyUnicode_DecodeUnicodeEscape(const char *s,

                          Py_ssize_t size,[](#l7.11)

                          const char *errors,[](#l7.12)

                          const char **first_invalid_escape)[](#l7.13)

{ const char *starts = s; _PyUnicodeWriter writer; @@ -5887,6 +5888,9 @@ PyUnicode_DecodeUnicodeEscape(const char PyObject *errorHandler = NULL; PyObject *exc = NULL;

// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;

+ if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); } @@ -6061,9 +6065,10 @@ PyUnicode_DecodeUnicodeEscape(const char goto error; default:

       if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,[](#l7.31)

                            "invalid escape sequence '\\%c'", c) < 0)[](#l7.32)

```
           goto onError;[](#l7.33)
```

       if (*first_invalid_escape == NULL) {[](#l7.34)

           *first_invalid_escape = s-1; /* Back up one char, since we've[](#l7.35)

                                           already incremented s. */[](#l7.36)

       }[](#l7.37)
       WRITE_ASCII_CHAR('\\');[](#l7.38)
       WRITE_CHAR(c);[](#l7.39)
       continue;[](#l7.40)

@@ -6098,6 +6103,27 @@ PyUnicode_DecodeUnicodeEscape(const char return NULL; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s,

                         Py_ssize_t size,[](#l7.47)

                         const char *errors)[](#l7.48)

const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,

                                                 &first_invalid_escape);[](#l7.52)

if (result == NULL)
```
   return NULL;[](#l7.54)
```
if (first_invalid_escape != NULL) {

   if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,[](#l7.56)

                        "invalid escape sequence '\\%c'",[](#l7.57)

                        *first_invalid_escape) < 0) {[](#l7.58)

```
       Py_DECREF(result);[](#l7.59)
```
```
       return NULL;[](#l7.60)
```
```
   }[](#l7.61)
```
}
return result;

+} + /* Return a Unicode-Escape string version of the Unicode object. If quotes is true, the string is enclosed in u"" or u'' quotes as

--- a/Python/ast.c +++ b/Python/ast.c @@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const c return PyUnicode_DecodeUTF8(t, s - t, NULL); } +static int +warn_invalid_escape_sequence(struct compiling *c, const node *n,

                        char first_invalid_escape_char)[](#l8.9)

PyObject *msg = PyUnicode_FromFormat("invalid escape sequence \\%c",

                                    first_invalid_escape_char);[](#l8.12)

if (msg == NULL) {
```
   return -1;[](#l8.14)
```
}
if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,

                              c->c_filename, LINENO(n),[](#l8.17)

                              NULL, NULL) < 0 &&[](#l8.18)

   PyErr_ExceptionMatches(PyExc_DeprecationWarning))[](#l8.19)

{

   const char *s = PyUnicode_AsUTF8(msg);[](#l8.21)

```
   if (s != NULL) {[](#l8.22)
```
```
       ast_error(c, n, s);[](#l8.23)
```
```
   }[](#l8.24)
```
```
   Py_DECREF(msg);[](#l8.25)
```
```
   return -1;[](#l8.26)
```
}
Py_DECREF(msg);
return 0;

+} + static PyObject * -decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) +decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,

                       size_t len)[](#l8.35)

{ PyObject *v, *u; char *buf; @@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compi len = p - buf; s = buf;

v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);

const char *first_invalid_escape;
v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);

if (v != NULL && first_invalid_escape != NULL) {

   if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {[](#l8.48)

       /* We have not decref u before because first_invalid_escape points[](#l8.49)

```
          inside u. */[](#l8.50)
```
```
       Py_XDECREF(u);[](#l8.51)
```
```
       Py_DECREF(v);[](#l8.52)
```
```
       return NULL;[](#l8.53)
```
```
   }[](#l8.54)
```
} Py_XDECREF(u); return v; }

+static PyObject * +decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,

                     size_t len)[](#l8.62)

const char *first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL,

                                        &first_invalid_escape);[](#l8.66)

if (result == NULL)
```
   return NULL;[](#l8.68)
```

if (first_invalid_escape != NULL) {

   if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {[](#l8.71)

```
       Py_DECREF(result);[](#l8.72)
```
```
       return NULL;[](#l8.73)
```
```
   }[](#l8.74)
```
}
return result;

+} + /* Compile this expression in to an expr_ty. Add parens around the expression, in order to allow leading spaces in the expression. */ static expr_ty @@ -4310,7 +4366,7 @@ done: literal_end-literal_start, NULL, NULL); else

       *literal = decode_unicode_with_escapes(c, literal_start,[](#l8.86)

       *literal = decode_unicode_with_escapes(c, n, literal_start,[](#l8.87)
                                              literal_end-literal_start);[](#l8.88)
   if (!*literal)[](#l8.89)
       return -1;[](#l8.90)

@@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node if (*rawmode) *result = PyBytes_FromStringAndSize(s, len); else

       *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL);[](#l8.95)

```
       *result = decode_bytes_with_escapes(c, n, s, len);[](#l8.96)
```
} else { if (*rawmode) *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); else

       *result = decode_unicode_with_escapes(c, s, len);[](#l8.101)

       *result = decode_unicode_with_escapes(c, n, s, len);[](#l8.102)

} return *result == NULL ? -1 : 0; }