cpython: 01d4dd412581 (original) (raw)
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -12,6 +12,9 @@ What's New in Python 3.3.1? Core and Builtins ----------------- +- Issue #10156: In the interpreter's initialization phase, unicode globals
--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -57,8 +57,9 @@ OF OR IN CONNECTION WITH THE USE OR PERF /* --- Globals ------------------------------------------------------------
- The globals are initialized by the _PyUnicode_Init() API and should
- not be used before calling that API. +NOTE: In the interpreter's initialization phase, some globals are currently
initialized dynamically as needed. In the process Unicode objects may
be created before the Unicode type is ready.
*/ @@ -179,17 +180,36 @@ extern "C" { Another way to look at this is that to say that the actual reference count of a string is: s->ob_refcnt + (s->state ? 2 : 0) */ -static PyObject *interned; +static PyObject interned = NULL; / The empty Unicode object is shared to improve performance. */ -static PyObject *unicode_empty; +static PyObject *unicode_empty = NULL; + +#define _Py_INCREF_UNICODE_EMPTY() [](#l2.26)
- do { [](#l2.27)
if (unicode_empty != NULL) \[](#l2.28)
Py_INCREF(unicode_empty); \[](#l2.29)
else { \[](#l2.30)
unicode_empty = PyUnicode_New(0, 0); \[](#l2.31)
if (unicode_empty != NULL) { \[](#l2.32)
Py_INCREF(unicode_empty); \[](#l2.33)
assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \[](#l2.34)
} \[](#l2.35)
} \[](#l2.36)
- } while (0)
+ +#define _Py_RETURN_UNICODE_EMPTY() [](#l2.39)
- do { [](#l2.40)
_Py_INCREF_UNICODE_EMPTY(); \[](#l2.41)
return unicode_empty; \[](#l2.42)
- } while (0)
/* List of static strings. */ -static _Py_Identifier *static_strings; +static _Py_Identifier static_strings = NULL; / Single character Unicode strings in the Latin-1 range are being shared as well. */ -static PyObject *unicode_latin1[256]; +static PyObject unicode_latin1[256] = {NULL}; / Fast detection of the most frequent whitespace characters */ const unsigned char _Py_ascii_whitespace[] = { @@ -416,9 +436,8 @@ unicode_result_wchar(PyObject *unicode) len = _PyUnicode_WSTR_LENGTH(unicode); if (len == 0) {
Py_INCREF(unicode_empty);[](#l2.60) Py_DECREF(unicode);[](#l2.61)
return unicode_empty;[](#l2.62)
} if (len == 1) { @@ -450,8 +469,8 @@ unicode_result_ready(PyObject *unicode) length = PyUnicode_GET_LENGTH(unicode); if (length == 0) { if (unicode != unicode_empty) {_Py_RETURN_UNICODE_EMPTY();[](#l2.63)
Py_INCREF(unicode_empty);[](#l2.71) Py_DECREF(unicode);[](#l2.72)
} @@ -528,7 +547,7 @@ static OSVERSIONINFOEX winver;_Py_RETURN_UNICODE_EMPTY();[](#l2.73) }[](#l2.74) return unicode_empty;[](#l2.75)
#define BLOOM_MASK unsigned long -static BLOOM_MASK bloom_linebreak; +static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0; #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) @@ -1582,9 +1601,11 @@ unicode_resize(PyObject **p_unicode, Py_ return 0; if (length == 0) {
_Py_INCREF_UNICODE_EMPTY();[](#l2.90)
if (!unicode_empty)[](#l2.91)
return -1;[](#l2.92) Py_DECREF(*p_unicode);[](#l2.93) *p_unicode = unicode_empty;[](#l2.94)
} @@ -1731,10 +1752,8 @@ PyUnicode_FromUnicode(const Py_UNICODE * some optimizations which share commonly used objects. */Py_INCREF(*p_unicode);[](#l2.95) return 0;[](#l2.96)
/* Optimization for empty strings */
- if (size == 0 && unicode_empty != NULL) {
Py_INCREF(unicode_empty);[](#l2.104)
return unicode_empty;[](#l2.105)
- }
/* Single character Unicode objects in the Latin-1 range are shared when using this constructor / @@ -1893,10 +1912,8 @@ static PyObject PyObject *res; unsigned char max_char;
- if (size == 0)
assert(size > 0); if (size == 1) return get_latin1_char(u[0]);_Py_RETURN_UNICODE_EMPTY();[](#l2.121)
@@ -1916,10 +1933,8 @@ static PyObject* PyObject *res; Py_UCS2 max_char;
- if (size == 0)
assert(size > 0); if (size == 1) { Py_UCS4 ch = u[0];_Py_RETURN_UNICODE_EMPTY();[](#l2.134)
@@ -1954,10 +1969,8 @@ static PyObject* PyObject *res; Py_UCS4 max_char;
- if (size == 0)
assert(size > 0); if (size == 1) { Py_UCS4 ch = u[0];_Py_RETURN_UNICODE_EMPTY();[](#l2.147)
@@ -2249,10 +2262,8 @@ PyObject * PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size) { if (w == NULL) {
if (size == 0) {[](#l2.155)
Py_INCREF(unicode_empty);[](#l2.156)
return unicode_empty;[](#l2.157)
}[](#l2.158)
if (size == 0)[](#l2.159)
} @@ -3007,15 +3018,11 @@ PyUnicode_FromEncodedObject(register PyO /* Decoding bytes objects is the most common case and should be fast */ if (PyBytes_Check(obj)) {_Py_RETURN_UNICODE_EMPTY();[](#l2.160) PyErr_BadInternalCall();[](#l2.161) return NULL;[](#l2.162)
if (PyBytes_GET_SIZE(obj) == 0) {[](#l2.168)
Py_INCREF(unicode_empty);[](#l2.169)
v = unicode_empty;[](#l2.170)
}[](#l2.171)
else {[](#l2.172)
v = PyUnicode_Decode([](#l2.173)
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),[](#l2.174)
encoding, errors);[](#l2.175)
}[](#l2.176)
if (PyBytes_GET_SIZE(obj) == 0)[](#l2.177)
_Py_RETURN_UNICODE_EMPTY();[](#l2.178)
v = PyUnicode_Decode([](#l2.179)
PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),[](#l2.180)
} @@ -3035,12 +3042,11 @@ PyUnicode_FromEncodedObject(register PyO } if (buffer.len == 0) {encoding, errors);[](#l2.181) return v;[](#l2.182)
Py_INCREF(unicode_empty);[](#l2.189)
v = unicode_empty;[](#l2.190)
- }
- else
v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);[](#l2.193)
- v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors); PyBuffer_Release(&buffer); return v; }
@@ -4720,8 +4726,7 @@ PyUnicode_DecodeUTF8Stateful(const char if (size == 0) { if (consumed) *consumed = 0;
Py_INCREF(unicode_empty);[](#l2.207)
return unicode_empty;[](#l2.208)
} /* ASCII is equivalent to the first 128 ordinals in Unicode. */ @@ -5232,8 +5237,7 @@ PyUnicode_DecodeUTF16Stateful(const char if (q == e) { if (consumed) *consumed = size;_Py_RETURN_UNICODE_EMPTY();[](#l2.209)
Py_INCREF(unicode_empty);[](#l2.217)
return unicode_empty;[](#l2.218)
#ifdef BYTEORDER_IS_LITTLE_ENDIAN @@ -6558,10 +6562,8 @@ PyUnicode_DecodeASCII(const char *s, PyObject *errorHandler = NULL; PyObject *exc = NULL;
/* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) @@ -6940,8 +6942,7 @@ decode_code_page_stateful(int code_page, if (chunk_size == 0 && done) { if (v != NULL) break;
Py_INCREF(unicode_empty);[](#l2.240)
return unicode_empty;[](#l2.241)
_Py_RETURN_UNICODE_EMPTY();[](#l2.242) }[](#l2.243)
@@ -9503,9 +9504,7 @@ PyUnicode_Join(PyObject separator, PyOb / If empty sequence, return u"". */ if (seqlen == 0) { Py_DECREF(fseq);
Py_INCREF(unicode_empty);[](#l2.250)
res = unicode_empty;[](#l2.251)
return res;[](#l2.252)
} /* If singleton sequence with an exact Unicode, return that. */ @@ -10205,7 +10204,9 @@ replace(PyObject *self, PyObject *str1, } new_size = slen + n * (len2 - len1); if (new_size == 0) {_Py_RETURN_UNICODE_EMPTY();[](#l2.253)
Py_INCREF(unicode_empty);[](#l2.261)
_Py_INCREF_UNICODE_EMPTY();[](#l2.262)
if (!unicode_empty)[](#l2.263)
goto error;[](#l2.264) u = unicode_empty;[](#l2.265) goto done;[](#l2.266) }[](#l2.267)
@@ -11672,10 +11673,8 @@ PyUnicode_Substring(PyObject *self, Py_s PyErr_SetString(PyExc_IndexError, "string index out of range"); return NULL; }
- if (start >= length || end < start) {
Py_INCREF(unicode_empty);[](#l2.273)
return unicode_empty;[](#l2.274)
- }
length = end - start; if (PyUnicode_IS_ASCII(self)) { @@ -11802,10 +11801,8 @@ unicode_repeat(PyObject *str, Py_ssize_t PyObject *u; Py_ssize_t nchars, n;
/* no repeat, return original string */ if (len == 1) @@ -12924,8 +12921,7 @@ PyObject * { if (writer->pos == 0) { Py_XDECREF(writer->buffer);
Py_INCREF(unicode_empty);[](#l2.298)
return unicode_empty;[](#l2.299)
} if (writer->readonly) { assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);_Py_RETURN_UNICODE_EMPTY();[](#l2.300)
@@ -13143,8 +13139,7 @@ unicode_subscript(PyObject* self, PyObje } if (slicelength <= 0) {
Py_INCREF(unicode_empty);[](#l2.308)
return unicode_empty;[](#l2.309)
_Py_RETURN_UNICODE_EMPTY();[](#l2.310) } else if (start == 0 && step == 1 &&[](#l2.311) slicelength == PyUnicode_GET_LENGTH(self)) {[](#l2.312) return unicode_result_unchanged(self);[](#l2.313)
@@ -13974,10 +13969,8 @@ unicode_new(PyTypeObject *type, PyObject if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str", kwlist, &x, &encoding, &errors)) return NULL;
- if (x == NULL)
if (encoding == NULL && errors == NULL) return PyObject_Str(x); else @@ -14146,8 +14139,6 @@ PyTypeObject PyUnicode_Type = { int _PyUnicode_Init(void) {_Py_RETURN_UNICODE_EMPTY();[](#l2.323)
- /* XXX - move this array to unicodectype.c ? / Py_UCS2 linebreak[] = { 0x000A, / LINE FEED / @@ -14161,13 +14152,11 @@ int _PyUnicode_Init(void) }; / Init the implementation */
+ if (PyType_Ready(&PyUnicode_Type) < 0) Py_FatalError("Can't initialize 'unicode'"); @@ -14207,15 +14196,10 @@ void { int i;
- for (i = 0; i < 256; i++) {
if (unicode_latin1[i]) {[](#l2.361)
Py_DECREF(unicode_latin1[i]);[](#l2.362)
unicode_latin1[i] = NULL;[](#l2.363)
}[](#l2.364)
- }