cpython: 123f2dc08b3e (original) (raw)
--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -46,10 +46,6 @@ OF OR IN CONNECTION WITH THE USE OR PERF #include <windows.h> #endif -#ifdef Py_DEBUG -# define DONT_MAKE_RESULT_READY -#endif - /* Endianness switches; defaults to little endian */ #ifdef WORDS_BIGENDIAN @@ -118,11 +114,6 @@ extern "C" { 0 : [](#l1.15) _PyUnicode_Ready(op))) -#define _PyUnicode_READY_REPLACE(p_obj) [](#l1.18)
- (assert(_PyUnicode_CHECK(*p_obj)), [](#l1.19)
(PyUnicode_IS_READY(*p_obj) ? \[](#l1.20)
0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))[](#l1.21)
- #define _PyUnicode_SHARE_UTF8(op) [](#l1.23) (assert(_PyUnicode_CHECK(op)), [](#l1.24) assert(!PyUnicode_IS_COMPACT_ASCII(op)), [](#l1.25) @@ -232,9 +223,6 @@ static void copy_characters( PyObject *to, Py_ssize_t to_start, PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many); -#ifdef Py_DEBUG -static int unicode_is_singleton(PyObject *unicode); -#endif static PyObject * unicode_fromascii(const unsigned char s, Py_ssize_t size); @@ -425,6 +413,90 @@ int } #endif +static PyObject +unicode_result_wchar(PyObject *unicode) +{ +#ifndef Py_DEBUG
- len = _PyUnicode_WSTR_LENGTH(unicode);
- if (len == 0) {
Py_INCREF(unicode_empty);[](#l1.50)
Py_DECREF(unicode);[](#l1.51)
return unicode_empty;[](#l1.52)
- }
- if (len == 1) {
wchar_t ch = _PyUnicode_WSTR(unicode)[0];[](#l1.56)
if (ch < 256) {[](#l1.57)
PyObject *latin1_char = get_latin1_char((unsigned char)ch);[](#l1.58)
Py_DECREF(unicode);[](#l1.59)
return latin1_char;[](#l1.60)
}[](#l1.61)
- }
- /* don't make the result ready in debug mode to ensure that the caller
makes the string ready before using it */[](#l1.70)
- assert(_PyUnicode_CheckConsistency(unicode, 1));
+} + +static PyObject* +unicode_result_ready(PyObject *unicode) +{
- length = PyUnicode_GET_LENGTH(unicode);
- if (length == 0) {
if (unicode != unicode_empty) {[](#l1.83)
Py_INCREF(unicode_empty);[](#l1.84)
Py_DECREF(unicode);[](#l1.85)
}[](#l1.86)
return unicode_empty;[](#l1.87)
- }
- if (length == 1) {
Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);[](#l1.91)
if (ch < 256) {[](#l1.92)
PyObject *latin1_char = unicode_latin1[ch];[](#l1.93)
if (latin1_char != NULL) {[](#l1.94)
if (unicode != latin1_char) {[](#l1.95)
Py_INCREF(latin1_char);[](#l1.96)
Py_DECREF(unicode);[](#l1.97)
}[](#l1.98)
return latin1_char;[](#l1.99)
}[](#l1.100)
else {[](#l1.101)
assert(_PyUnicode_CheckConsistency(unicode, 1));[](#l1.102)
Py_INCREF(unicode);[](#l1.103)
unicode_latin1[ch] = unicode;[](#l1.104)
return unicode;[](#l1.105)
}[](#l1.106)
}[](#l1.107)
- }
+} + +static PyObject* +unicode_result(PyObject *unicode) +{
- assert(_PyUnicode_CHECK(unicode));
- if (PyUnicode_IS_READY(unicode))
return unicode_result_ready(unicode);[](#l1.119)
- else
return unicode_result_wchar(unicode);[](#l1.121)
+} + #ifdef HAVE_MBCS static OSVERSIONINFOEX winver; #endif @@ -1271,10 +1343,9 @@ find_maxchar_surrogates(const wchar_t *b static int unicode_ready_calls = 0; #endif -static int -unicode_ready(PyObject **p_obj, int replace) -{
+int +_PyUnicode_Ready(PyObject *unicode) +{ wchar_t *end; Py_UCS4 maxchar = 0; Py_ssize_t num_surrogates; @@ -1282,9 +1353,6 @@ unicode_ready(PyObject **p_obj, int repl Py_ssize_t length_wo_surrogates; #endif
- /* _PyUnicode_Ready() is only intended for old-style API usage where strings were created using _PyObject_New() and where no canonical representation (the str field) has been set yet aka strings @@ -1301,32 +1369,6 @@ unicode_ready(PyObject **p_obj, int repl ++unicode_ready_calls; #endif -#ifdef Py_DEBUG
- if (replace) {
Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);[](#l1.162)
wchar_t *wstr = _PyUnicode_WSTR(unicode);[](#l1.163)
/* Optimization for empty strings */[](#l1.164)
if (len == 0) {[](#l1.165)
Py_INCREF(unicode_empty);[](#l1.166)
Py_DECREF(*p_obj);[](#l1.167)
*p_obj = unicode_empty;[](#l1.168)
return 0;[](#l1.169)
}[](#l1.170)
if (len == 1 && wstr[0] < 256) {[](#l1.171)
PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);[](#l1.172)
if (latin1_char == NULL)[](#l1.173)
return -1;[](#l1.174)
Py_DECREF(*p_obj);[](#l1.175)
*p_obj = latin1_char;[](#l1.176)
return 0;[](#l1.177)
}[](#l1.178)
- }
- end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode); if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end, &maxchar, &num_surrogates) == -1) @@ -1430,18 +1472,6 @@ unicode_ready(PyObject **p_obj, int repl return 0; } -int -_PyUnicode_ReadyReplace(PyObject **op) -{
-} - -int -_PyUnicode_Ready(PyObject *op) -{
-} - static void unicode_dealloc(register PyObject *unicode) { @@ -1681,8 +1711,7 @@ PyUnicode_FromUnicode(const Py_UNICODE * assert(0 && "Impossible state"); }
} PyObject @@ -1755,6 +1784,8 @@ void } } +/ Internal function, don't check maximum character / + static PyObject unicode_fromascii(const unsigned char* s, Py_ssize_t size) { @@ -1795,11 +1826,16 @@ static PyObject* _PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size) { PyObject *res;
- if (size == 0) {
Py_INCREF(unicode_empty);[](#l1.232)
return unicode_empty;[](#l1.233)
- }
- assert(size > 0); if (size == 1) return get_latin1_char(u[0]);
+ max_char = ucs1lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); if (!res) @@ -1813,11 +1849,16 @@ static PyObject* _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size) { PyObject *res;
- if (size == 0) {
Py_INCREF(unicode_empty);[](#l1.252)
return unicode_empty;[](#l1.253)
- }
- assert(size > 0); if (size == 1 && u[0] < 256) return get_latin1_char((unsigned char)u[0]);
+ max_char = ucs2lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); if (!res) @@ -1836,11 +1877,16 @@ static PyObject* _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size) { PyObject *res;
- if (size == 0) {
Py_INCREF(unicode_empty);[](#l1.272)
return unicode_empty;[](#l1.273)
- }
- assert(size > 0); if (size == 1 && u[0] < 256)
return get_latin1_char(u[0]);[](#l1.277)
return get_latin1_char((unsigned char)u[0]);[](#l1.278)
+ max_char = ucs4lib_find_max_char(u, u + size); res = PyUnicode_New(size, max_char); if (!res) @@ -2640,8 +2686,7 @@ PyUnicode_FromFormatV(const char *format PyObject_Free(callresults); if (numberresults) PyObject_Free(numberresults);
fail: if (callresults) { PyObject **callresult2 = callresults; @@ -2936,14 +2981,7 @@ PyUnicode_Decode(const char *s, goto onError; } Py_DECREF(buffer); -#ifndef DONT_MAKE_RESULT_READY
onError: Py_XDECREF(buffer); @@ -2969,8 +3007,7 @@ PyUnicode_AsDecodedObject(PyObject *unic v = PyCodec_Decode(unicode, encoding, errors); if (v == NULL) goto onError;
onError: return NULL; @@ -3002,8 +3039,7 @@ PyUnicode_AsDecodedUnicode(PyObject *uni Py_DECREF(v); goto onError; }
onError: return NULL; @@ -4002,14 +4038,7 @@ utf7Error: Py_XDECREF(errorHandler); Py_XDECREF(exc); -#ifndef DONT_MAKE_RESULT_READY
onError: Py_XDECREF(errorHandler); @@ -4358,18 +4387,26 @@ PyUnicode_DecodeUTF8Stateful(const char return (PyObject *)PyUnicode_New(0, 0); } maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
- /* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8[](#l1.350)
sequence at the end of the ASCII block. */[](#l1.351)
- if (maxchar < 128 && size == unicode_size) {
if (size == 1)[](#l1.353)
return get_latin1_char((unsigned char)s[0]);[](#l1.354)
unicode = PyUnicode_New(unicode_size, maxchar);[](#l1.356)
if (!unicode)[](#l1.357)
return NULL;[](#l1.358)
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);[](#l1.359)
assert(_PyUnicode_CheckConsistency(unicode, 1));[](#l1.360)
return unicode;[](#l1.361)
- }
+ /* In case of errors, maxchar and size computation might be incorrect; code below refits and resizes as necessary. */ unicode = PyUnicode_New(unicode_size, maxchar); if (!unicode) return NULL;
- /* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8[](#l1.370)
sequence at the end of the ASCII block. */[](#l1.371)
- if (maxchar < 128 && size == unicode_size) {
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);[](#l1.373)
return unicode;[](#l1.374)
- } kind = PyUnicode_KIND(unicode); data = PyUnicode_DATA(unicode); @@ -5052,14 +5089,7 @@ PyUnicode_DecodeUTF32Stateful(const char Py_XDECREF(errorHandler); Py_XDECREF(exc); -#ifndef DONT_MAKE_RESULT_READY
- if (_PyUnicode_READY_REPLACE(&unicode)) {
Py_DECREF(unicode);[](#l1.385)
return NULL;[](#l1.386)
- }
onError: Py_DECREF(unicode); @@ -5423,8 +5453,7 @@ PyUnicode_DecodeUTF16Stateful(const char Py_XDECREF(errorHandler); Py_XDECREF(exc);
onError: Py_DECREF(unicode); @@ -5843,14 +5872,7 @@ PyUnicode_DecodeUnicodeEscape(const char goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); -#ifndef DONT_MAKE_RESULT_READY
ucnhashError: PyErr_SetString( @@ -6111,8 +6133,7 @@ PyUnicode_DecodeRawUnicodeEscape(const c goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc);
onError: Py_XDECREF(v); @@ -6301,8 +6322,7 @@ PyObject * goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc);
onError: Py_XDECREF(v); @@ -6686,6 +6706,11 @@ PyUnicode_DecodeASCII(const char *s, PyObject *errorHandler = NULL; PyObject *exc = NULL;
+ /* ASCII is equivalent to the first 128 ordinals in Unicode. */ if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]); @@ -7115,14 +7140,7 @@ decode_code_page_stateful(int code_page, size -= converted; } while (!done); -#ifndef DONT_MAKE_RESULT_READY
} PyObject * @@ -7723,8 +7741,7 @@ PyUnicode_DecodeCharmap(const char *s, goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc);
onError: Py_XDECREF(errorHandler); @@ -8658,8 +8675,12 @@ PyObject * repunicode = unicode_translate_call_errorhandler(errors, &errorHandler, reason, input, &exc, collstart, collend, &newpos);
if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))[](#l1.483)
if (repunicode == NULL)[](#l1.484) goto onError;[](#l1.485)
if (PyUnicode_READY(repunicode) < 0) {[](#l1.486)
Py_DECREF(repunicode);[](#l1.487)
goto onError;[](#l1.488)
}[](#l1.489) /* generate replacement */[](#l1.490) repsize = PyUnicode_GET_LENGTH(repunicode);[](#l1.491) if (charmaptranslate_makespace(&output, &osize,[](#l1.492)
@@ -8812,8 +8833,7 @@ PyUnicode_TransformDecimalToASCII(Py_UNI } PyUnicode_WRITE(kind, data, i, ch); }
} /* --- Decimal Encoder ---------------------------------------------------- */ @@ -10801,8 +10821,7 @@ PyUnicode_Append(PyObject **p_left, PyOb if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right))) { unicode_append_inplace(p_left, right);
if (p_left != NULL)[](#l1.507)
assert(_PyUnicode_CheckConsistency(*p_left, 1));[](#l1.508)
} @@ -11012,14 +11031,7 @@ unicode_expandtabs(PyObject *self, PyObj } } assert (j == PyUnicode_GET_LENGTH(u)); -#ifndef DONT_MAKE_RESULT_READYassert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));[](#l1.509) return;[](#l1.510) }[](#l1.511)
overflow: PyErr_SetString(PyExc_OverflowError, "new string is too long"); @@ -13876,9 +13888,9 @@ int _PyUnicode_Init(void) /* Init the implementation */ unicode_empty = PyUnicode_New(0, 0);
- assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); if (!unicode_empty) Py_FatalError("Can't create empty string");
for (i = 0; i < 256; i++) unicode_latin1[i] = NULL; @@ -13946,11 +13958,6 @@ PyUnicode_InternInPlace(PyObject **p) return; if (PyUnicode_CHECK_INTERNED(s)) return;