cpython: 01d4dd412581 (original) (raw)

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 3.3.1?
 Core and Builtins
 -----------------
 
+- Issue #10156: In the interpreter's initialization phase, unicode globals
+  are now initialized dynamically as needed.
+
 - Issue #16980: Fix processing of escaped non-ascii bytes in the
   unicode-escape-decode decoder.

--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -57,8 +57,9 @@ OF OR IN CONNECTION WITH THE USE OR PERF /* --- Globals ------------------------------------------------------------

 The globals are initialized by the _PyUnicode_Init() API and should
 not be used before calling that API.
 
+NOTE: In the interpreter's initialization phase, some globals are currently
+      initialized dynamically as needed. In the process Unicode objects may
+      be created before the Unicode type is ready.
+

 */
@@ -179,17 +180,36 @@ extern "C" {
    Another way to look at this is that to say that the actual reference
    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
 */
-static PyObject *interned;
+static PyObject *interned = NULL;
 
 /* The empty Unicode object is shared to improve performance. */
-static PyObject *unicode_empty;
+static PyObject *unicode_empty = NULL;
+
+#define _Py_INCREF_UNICODE_EMPTY()                      \
+    do {                                                \
+        if (unicode_empty != NULL)                      \
+            Py_INCREF(unicode_empty);                   \
+        else {                                          \
+            unicode_empty = PyUnicode_New(0, 0);        \
+            if (unicode_empty != NULL) {                \
+                Py_INCREF(unicode_empty);               \
+                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
+            }                                           \
+        }                                               \
+    } while (0)
+
+#define _Py_RETURN_UNICODE_EMPTY()                      \
+    do {                                                \
+        _Py_INCREF_UNICODE_EMPTY();                     \
+        return unicode_empty;                           \
+    } while (0)

 /* List of static strings. */
-static _Py_Identifier *static_strings;
+static _Py_Identifier *static_strings = NULL;
 
 /* Single character Unicode strings in the Latin-1 range are being
    shared as well. */
-static PyObject *unicode_latin1[256];
+static PyObject *unicode_latin1[256] = {NULL};
 
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
@@ -416,9 +436,8 @@ unicode_result_wchar(PyObject *unicode)
     len = _PyUnicode_WSTR_LENGTH(unicode);
     if (len == 0) {

   Py_INCREF(unicode_empty);[](#l2.60)
   Py_DECREF(unicode);[](#l2.61)

```
   return unicode_empty;[](#l2.62)
```

```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.63)
```
} if (len == 1) { @@ -450,8 +469,8 @@ unicode_result_ready(PyObject *unicode) length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { if (unicode != unicode_empty) {

       Py_INCREF(unicode_empty);[](#l2.71)
       Py_DECREF(unicode);[](#l2.72)

       _Py_RETURN_UNICODE_EMPTY();[](#l2.73)
   }[](#l2.74)
   return unicode_empty;[](#l2.75)

} @@ -528,7 +547,7 @@ static OSVERSIONINFOEX winver;

#define BLOOM_MASK unsigned long -static BLOOM_MASK bloom_linebreak; +static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) @@ -1582,9 +1601,11 @@ unicode_resize(PyObject **p_unicode, Py_ return 0; if (length == 0) {

   _Py_INCREF_UNICODE_EMPTY();[](#l2.90)

```
   if (!unicode_empty)[](#l2.91)
```

       return -1;[](#l2.92)
   Py_DECREF(*p_unicode);[](#l2.93)
   *p_unicode = unicode_empty;[](#l2.94)

```
   Py_INCREF(*p_unicode);[](#l2.95)
   return 0;[](#l2.96)
```
} @@ -1731,10 +1752,8 @@ PyUnicode_FromUnicode(const Py_UNICODE * some optimizations which share commonly used objects. */

/* Optimization for empty strings */

if (size == 0 && unicode_empty != NULL) {
```
   Py_INCREF(unicode_empty);[](#l2.104)
```
```
   return unicode_empty;[](#l2.105)
```
}

if (size == 0)

   _Py_RETURN_UNICODE_EMPTY();[](#l2.108)

/* Single character Unicode objects in the Latin-1 range are shared when using this constructor */
@@ -1893,10 +1912,8 @@ static PyObject*
 PyObject *res;
 unsigned char max_char;

if (size == 0) {
```
   Py_INCREF(unicode_empty);[](#l2.117)
```
```
   return unicode_empty;[](#l2.118)
```
}

if (size == 0)
```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.121)
```
assert(size > 0); if (size == 1) return get_latin1_char(u[0]);

@@ -1916,10 +1933,8 @@ static PyObject* PyObject *res; Py_UCS2 max_char;

if (size == 0) {
```
   Py_INCREF(unicode_empty);[](#l2.130)
```
```
   return unicode_empty;[](#l2.131)
```
}

if (size == 0)
```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.134)
```
assert(size > 0); if (size == 1) { Py_UCS4 ch = u[0];

@@ -1954,10 +1969,8 @@ static PyObject* PyObject *res; Py_UCS4 max_char;

if (size == 0) {
```
   Py_INCREF(unicode_empty);[](#l2.143)
```
```
   return unicode_empty;[](#l2.144)
```
}

if (size == 0)
```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.147)
```
assert(size > 0); if (size == 1) { Py_UCS4 ch = u[0];

@@ -2249,10 +2262,8 @@ PyObject * PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) { if (w == NULL) {

```
   if (size == 0) {[](#l2.155)
```

       Py_INCREF(unicode_empty);[](#l2.156)

```
       return unicode_empty;[](#l2.157)
```
```
   }[](#l2.158)
```

```
   if (size == 0)[](#l2.159)
```
```
       _Py_RETURN_UNICODE_EMPTY();[](#l2.160)
   PyErr_BadInternalCall();[](#l2.161)
   return NULL;[](#l2.162)
```
} @@ -3007,15 +3018,11 @@ PyUnicode_FromEncodedObject(register PyO /* Decoding bytes objects is the most common case and should be fast */ if (PyBytes_Check(obj)) {

   if (PyBytes_GET_SIZE(obj) == 0) {[](#l2.168)

       Py_INCREF(unicode_empty);[](#l2.169)

```
       v = unicode_empty;[](#l2.170)
```
```
   }[](#l2.171)
```
```
   else {[](#l2.172)
```
```
       v = PyUnicode_Decode([](#l2.173)
```

               PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),[](#l2.174)

               encoding, errors);[](#l2.175)

```
   }[](#l2.176)
```

   if (PyBytes_GET_SIZE(obj) == 0)[](#l2.177)

       _Py_RETURN_UNICODE_EMPTY();[](#l2.178)

```
   v = PyUnicode_Decode([](#l2.179)
```

           PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),[](#l2.180)

```
           encoding, errors);[](#l2.181)
   return v;[](#l2.182)
```
} @@ -3035,12 +3042,11 @@ PyUnicode_FromEncodedObject(register PyO } if (buffer.len == 0) {

```
   Py_INCREF(unicode_empty);[](#l2.189)
```
```
   v = unicode_empty;[](#l2.190)
```
}
else

   v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);[](#l2.193)

   PyBuffer_Release(&buffer);[](#l2.195)

   _Py_RETURN_UNICODE_EMPTY();[](#l2.196)

}

v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); PyBuffer_Release(&buffer); return v; }

@@ -4720,8 +4726,7 @@ PyUnicode_DecodeUTF8Stateful(const char if (size == 0) { if (consumed) *consumed = 0;

```
   Py_INCREF(unicode_empty);[](#l2.207)
```
```
   return unicode_empty;[](#l2.208)
```

```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.209)
```
} /* ASCII is equivalent to the first 128 ordinals in Unicode. */ @@ -5232,8 +5237,7 @@ PyUnicode_DecodeUTF16Stateful(const char if (q == e) { if (consumed) *consumed = size;

```
   Py_INCREF(unicode_empty);[](#l2.217)
```
```
   return unicode_empty;[](#l2.218)
```

   _Py_RETURN_UNICODE_EMPTY();[](#l2.219)

}

#ifdef BYTEORDER_IS_LITTLE_ENDIAN @@ -6558,10 +6562,8 @@ PyUnicode_DecodeASCII(const char *s, PyObject *errorHandler = NULL; PyObject *exc = NULL;

if (size == 0) {
```
   Py_INCREF(unicode_empty);[](#l2.228)
```
```
   return unicode_empty;[](#l2.229)
```
}

if (size == 0)

   _Py_RETURN_UNICODE_EMPTY();[](#l2.232)

/* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) @@ -6940,8 +6942,7 @@ decode_code_page_stateful(int code_page, if (chunk_size == 0 && done) { if (v != NULL) break;

       Py_INCREF(unicode_empty);[](#l2.240)

```
       return unicode_empty;[](#l2.241)
```

       _Py_RETURN_UNICODE_EMPTY();[](#l2.242)
   }[](#l2.243)

@@ -9503,9 +9504,7 @@ PyUnicode_Join(PyObject *separator, PyOb
 /* If empty sequence, return u"". */
 if (seqlen == 0) {
     Py_DECREF(fseq);

```
   Py_INCREF(unicode_empty);[](#l2.250)
```
```
   res = unicode_empty;[](#l2.251)
```
```
   return res;[](#l2.252)
```

```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.253)
```
} /* If singleton sequence with an exact Unicode, return that. */ @@ -10205,7 +10204,9 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) {

       Py_INCREF(unicode_empty);[](#l2.261)

       _Py_INCREF_UNICODE_EMPTY();[](#l2.262)

```
       if (!unicode_empty)[](#l2.263)
```

           goto error;[](#l2.264)
       u = unicode_empty;[](#l2.265)
       goto done;[](#l2.266)
   }[](#l2.267)

@@ -11672,10 +11673,8 @@ PyUnicode_Substring(PyObject *self, Py_s PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; }

if (start >= length || end < start) {
```
   Py_INCREF(unicode_empty);[](#l2.273)
```
```
   return unicode_empty;[](#l2.274)
```
}

if (start >= length || end < start)

   _Py_RETURN_UNICODE_EMPTY();[](#l2.277)

length = end - start; if (PyUnicode_IS_ASCII(self)) { @@ -11802,10 +11801,8 @@ unicode_repeat(PyObject *str, Py_ssize_t PyObject *u; Py_ssize_t nchars, n;

if (len < 1) {
```
   Py_INCREF(unicode_empty);[](#l2.286)
```
```
   return unicode_empty;[](#l2.287)
```
}

if (len < 1)

   _Py_RETURN_UNICODE_EMPTY();[](#l2.290)

/* no repeat, return original string */ if (len == 1) @@ -12924,8 +12921,7 @@ PyObject * { if (writer->pos == 0) { Py_XDECREF(writer->buffer);

```
   Py_INCREF(unicode_empty);[](#l2.298)
```
```
   return unicode_empty;[](#l2.299)
```

```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.300)
```
} if (writer->readonly) { assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);

@@ -13143,8 +13139,7 @@ unicode_subscript(PyObject* self, PyObje } if (slicelength <= 0) {

       Py_INCREF(unicode_empty);[](#l2.308)

```
       return unicode_empty;[](#l2.309)
```

       _Py_RETURN_UNICODE_EMPTY();[](#l2.310)
   } else if (start == 0 && step == 1 &&[](#l2.311)
              slicelength == PyUnicode_GET_LENGTH(self)) {[](#l2.312)
       return unicode_result_unchanged(self);[](#l2.313)

@@ -13974,10 +13969,8 @@ unicode_new(PyTypeObject *type, PyObject if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", kwlist, &x, &encoding, &errors)) return NULL;

if (x == NULL) {
```
   Py_INCREF(unicode_empty);[](#l2.319)
```
```
   return unicode_empty;[](#l2.320)
```
}

if (x == NULL)
```
   _Py_RETURN_UNICODE_EMPTY();[](#l2.323)
```
if (encoding == NULL && errors == NULL) return PyObject_Str(x); else @@ -14146,8 +14139,6 @@ PyTypeObject PyUnicode_Type = { int _PyUnicode_Init(void) {

-    int i;
-
     /* XXX - move this array to unicodectype.c ? */
     Py_UCS2 linebreak[] = {
         0x000A, /* LINE FEED */
@@ -14161,13 +14152,11 @@ int _PyUnicode_Init(void)
     };
 
     /* Init the implementation */
-    unicode_empty = PyUnicode_New(0, 0);
+    _Py_INCREF_UNICODE_EMPTY();
     if (!unicode_empty)
         Py_FatalError("Can't create empty string");
-    assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
-
-    for (i = 0; i < 256; i++)
-        unicode_latin1[i] = NULL;
+    Py_DECREF(unicode_empty);
+
     if (PyType_Ready(&PyUnicode_Type) < 0)
         Py_FatalError("Can't initialize 'unicode'");
@@ -14207,15 +14196,10 @@ void
 {
     int i;
 

-    Py_XDECREF(unicode_empty);
-    unicode_empty = NULL;
+    Py_CLEAR(unicode_empty);
 
-    for (i = 0; i < 256; i++) {
-        if (unicode_latin1[i]) {
-            Py_DECREF(unicode_latin1[i]);
-            unicode_latin1[i] = NULL;
-        }
-    }
+    for (i = 0; i < 256; i++)
+        Py_CLEAR(unicode_latin1[i]);
     _PyUnicode_ClearStaticStrings();
     (void)PyUnicode_ClearFreeList();
 }
@@ -14344,8 +14328,7 @@ void
             "mortal/immortal\n", mortal_size, immortal_size);
     Py_DECREF(keys);
     PyDict_Clear(interned);
-    Py_DECREF(interned);
-    interned = NULL;
+    Py_CLEAR(interned);
 }