cpython: edf029fc9591 (original) (raw)

Mercurial > cpython

changeset 83433:edf029fc9591

Close #17694: Add minimum length to _PyUnicodeWriter

* Add also min_char attribute to _PyUnicodeWriter structure (currently unused)
* _PyUnicodeWriter_Init() has no more argument (except the writer itself): min_length and overallocate must be set explicitly
* In error handlers, only enable overallocation if the replacement string is longer than 1 character
* CJK decoders don't use overallocation anymore
* Set min_length, instead of preallocating memory using _PyUnicodeWriter_Prepare(), in many decoders
* _PyUnicode_DecodeUnicodeInternal() checks for integer overflow

[#17694]

Victor Stinner victor.stinner@gmail.com
date	Wed, 17 Apr 2013 23:02:17 +0200
parents	5755ee168bd7
children	7eb52460c999
files	Include/unicodeobject.h Modules/cjkcodecs/multibytecodec.c Objects/complexobject.c Objects/floatobject.c Objects/longobject.c Objects/stringlib/unicode_format.h Objects/unicodeobject.c
diffstat	7 files changed, 85 insertions(+), 75 deletions(-)[+] [-] Include/unicodeobject.h 20 Modules/cjkcodecs/multibytecodec.c 9 Objects/complexobject.c 2 Objects/floatobject.c 2 Objects/longobject.c 2 Objects/stringlib/unicode_format.h 6 Objects/unicodeobject.c 119

line wrap: on

line diff

--- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -898,22 +898,28 @@ typedef struct { Py_UCS4 maxchar; Py_ssize_t size; Py_ssize_t pos;

/* minimum length of the buffer when overallocation is enabled,

  see _PyUnicodeWriter_Init() */

/* minimum number of allocated characters (default: 0) */ Py_ssize_t min_length; +
/* minimum character (default: 127, ASCII) */
Py_UCS4 min_char;

/* If non-zero, overallocate the buffer by 25% (default: 0). */ unsigned char overallocate; + /* If readonly is 1, buffer is a shared string (cannot be modified) and size is set to 0. */ unsigned char readonly; } _PyUnicodeWriter ; /* Initialize a Unicode writer. -

If min_length is greater than zero, _PyUnicodeWriter_Prepare()
overallocates the buffer and min_length is the minimum length in characters
of the buffer. */

+ *
+ * By default, the minimum buffer size is 0 character and overallocation is
+ * disabled. Set min_length, min_char and overallocate attributes to control
+ * the allocation of the buffer. */ PyAPI_FUNC(void) -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length); +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer); /* Prepare the buffer to write 'length' characters with the specified maximum character.

--- a/Modules/cjkcodecs/multibytecodec.c +++ b/Modules/cjkcodecs/multibytecodec.c @@ -633,7 +633,8 @@ MultibyteCodec_Decode(MultibyteCodecObje return make_tuple(PyUnicode_New(0, 0), 0); }

_PyUnicodeWriter_Init(&buf.writer, datalen);

_PyUnicodeWriter_Init(&buf.writer);
buf.writer.min_length = datalen; buf.excobj = NULL; buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf_end = buf.inbuf_top + datalen; @@ -839,7 +840,7 @@ decoder_prepare_buffer(MultibyteDecodeBu { buf->inbuf = buf->inbuf_top = (const unsigned char *)data; buf->inbuf_end = buf->inbuf_top + size;

_PyUnicodeWriter_Init(&buf->writer, size);

buf->writer.min_length += size; return 0; }

@@ -1037,7 +1038,7 @@ mbidecoder_decode(MultibyteIncrementalDe data = pdata.buf; size = pdata.len;

_PyUnicodeWriter_Init(&buf.writer, 1);

_PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; origpending = self->pendingsize; @@ -1241,7 +1242,7 @@ mbstreamreader_iread(MultibyteStreamRead if (sizehint == 0) return PyUnicode_New(0, 0);

_PyUnicodeWriter_Init(&buf.writer, 1);

_PyUnicodeWriter_Init(&buf.writer); buf.excobj = NULL; cres = NULL;

--- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -705,7 +705,7 @@ complex__format__(PyObject* self, PyObje if (!PyArg_ParseTuple(args, "U:format", &format_spec)) return NULL;

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); ret = _PyComplex_FormatAdvancedWriter( &writer, self,

--- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -1711,7 +1711,7 @@ float__format__(PyObject *self, PyObject if (!PyArg_ParseTuple(args, "U:format", &format_spec)) return NULL;

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); ret = _PyFloat_FormatAdvancedWriter( &writer, self,

--- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -4379,7 +4379,7 @@ long__format__(PyObject *self, PyObject if (!PyArg_ParseTuple(args, "U:format", &format_spec)) return NULL;

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); ret = _PyLong_FormatAdvancedWriter( &writer, self,

--- a/Objects/stringlib/unicode_format.h +++ b/Objects/stringlib/unicode_format.h @@ -906,7 +906,6 @@ build_string(SubString *input, PyObject int recursion_depth, AutoNumber *auto_number) { _PyUnicodeWriter writer;

Py_ssize_t minlen;

/* check the recursion level */ if (recursion_depth <= 0) { @@ -915,8 +914,9 @@ build_string(SubString *input, PyObject return NULL; }

minlen = PyUnicode_GET_LENGTH(input->str) + 100;
_PyUnicodeWriter_Init(&writer, minlen);

_PyUnicodeWriter_Init(&writer);
writer.overallocate = 1;
writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;

if (!do_markup(input, args, kwargs, &writer, recursion_depth, auto_number)) {

--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -2665,7 +2665,9 @@ PyUnicode_FromFormatV(const char *format const char *f; _PyUnicodeWriter writer;

_PyUnicodeWriter_Init(&writer, strlen(format) + 100);

_PyUnicodeWriter_Init(&writer);
writer.min_length = strlen(format) + 100;
writer.overallocate = 1;

/* va_list may be an array (of 1 item) on some platforms (ex: AMD64). Copy it to be able to pass a reference to a subfunction. */ @@ -4117,7 +4119,10 @@ unicode_decode_call_errorhandler_writer( goto onError; }

- writer->overallocate = 1;
+ if (PyUnicode_READY(repunicode) < 0)
+     goto onError;
+ if (PyUnicode_GET_LENGTH(repunicode) > 1)
+     writer->overallocate = 1;
if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1) return

@@ -4256,9 +4261,8 @@ PyUnicode_DecodeUTF7Stateful(const char } /* Start off assuming it's all ASCII. Widen later as necessary. */

_PyUnicodeWriter_Init(&writer, 0);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
```
   goto onError;[](#l7.32)
```

_PyUnicodeWriter_Init(&writer);
writer.min_length = size;

shiftOutStart = 0; e = s + size; @@ -4655,7 +4659,7 @@ PyUnicode_DecodeUTF8Stateful(const char return get_latin1_char((unsigned char)s[0]); }

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError;

@@ -4910,7 +4914,7 @@ PyUnicode_DecodeUTF32Stateful(const char le = bo <= 0; #endif

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 3) / 4, 127) == -1) goto onError;

@@ -5149,7 +5153,7 @@ PyUnicode_DecodeUTF16Stateful(const char /* Note: size will always be longer than the resulting Unicode character count */

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, (e - q + 1) / 2, 127) == -1) goto onError;

@@ -5420,11 +5424,9 @@ PyUnicode_DecodeUnicodeEscape(const char and we determined it's exact size (common case) or it contains \x, \u, ... escape sequences. then we create a legacy wchar string and resize it at the end of this function. */

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); if (len > 0) {

   if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)[](#l7.72)

```
       goto onError;[](#l7.73)
```

   assert(writer.kind == PyUnicode_1BYTE_KIND);[](#l7.74)

```
   writer.min_length = len;[](#l7.75)
```
} else { /* Escaped strings will always be longer than the resulting

@@ -5432,8 +5434,7 @@ PyUnicode_DecodeUnicodeEscape(const char length after conversion to the true value. (but if the error callback returns a long replacement string we'll have to allocate more space) */

   if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)[](#l7.83)

```
       goto onError;[](#l7.84)
```

```
   writer.min_length = size;[](#l7.85)
```
} if (size == 0) @@ -5461,10 +5462,6 @@ PyUnicode_DecodeUnicodeEscape(const char if (s > end) c = '\0'; /* Invalid after \ */

   /* The only case in which i == ascii_length is a backslash[](#l7.93)

      followed by a newline. */[](#l7.94)

   assert(writer.pos < writer.size || (writer.pos == writer.size && c == '\n'));[](#l7.95)

- switch (c) { /* \x escapes */ @@ -5787,9 +5784,8 @@ PyUnicode_DecodeRawUnicodeEscape(const c Unicode string, so we start with size here and then reduce the length after conversion to the true value. (But decoding error handler might have to resize the string) */

_PyUnicodeWriter_Init(&writer, 1);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
```
   goto onError;[](#l7.106)
```

_PyUnicodeWriter_Init(&writer);
writer.min_length = size;

end = s + size; while (s < end) { @@ -5982,12 +5978,14 @@ PyObject * if (size == 0) _Py_RETURN_UNICODE_EMPTY();

/* XXX overflow detection missing */
_PyUnicodeWriter_Init(&writer, 0);
if (_PyUnicodeWriter_Prepare(&writer, (size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127) == -1)

_PyUnicodeWriter_Init(&writer);
if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {

   PyErr_NoMemory();[](#l7.121)
   goto onError;[](#l7.122)

}
writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

+ end = s + size; - while (s < end) { Py_UNICODE uch; Py_UCS4 ch; @@ -6429,9 +6427,9 @@ PyUnicode_DecodeASCII(const char *s, if (size == 1 && (unsigned char)s[0] < 128) return get_latin1_char((unsigned char)s[0]);

_PyUnicodeWriter_Init(&writer, 0);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
```
   goto onError;[](#l7.137)
```

_PyUnicodeWriter_Init(&writer);
if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0)
```
   return NULL;[](#l7.140)
```

e = s + size; data = writer.data; @@ -7280,7 +7278,7 @@ PyUnicode_DecodeCharmap(const char *s, if (size == 0) _Py_RETURN_UNICODE_EMPTY();

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) goto onError;

@@ -7312,7 +7310,7 @@ PyUnicode_DecodeCharmap(const char *s, ch = *s; x = mapdata_ucs1[ch]; if (x > maxchar) {

               if (_PyUnicodeWriter_PrepareInternal(&writer, 1, 0xff) == -1)[](#l7.157)

               if (_PyUnicodeWriter_Prepare(&writer, 1, 0xff) == -1)[](#l7.158)
                   goto onError;[](#l7.159)
               maxchar = writer.maxchar;[](#l7.160)
               outdata = (Py_UCS1 *)writer.data;[](#l7.161)

@@ -12841,21 +12839,27 @@ unicode_endswith(PyObject *self, Py_LOCAL_INLINE(void) _PyUnicodeWriter_Update(_PyUnicodeWriter *writer) {

- writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ if (!writer->readonly)
+     writer->size = PyUnicode_GET_LENGTH(writer->buffer);
+ else {
+     /* Copy-on-write mode: set buffer size to 0 so
+      * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
+      * next write. */
+     writer->size = 0;
+ }
writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer); writer->data = PyUnicode_DATA(writer->buffer); writer->kind = PyUnicode_KIND(writer->buffer); }

void -_PyUnicodeWriter_Init(_PyUnicodeWriter *writer, Py_ssize_t min_length) +_PyUnicodeWriter_Init(_PyUnicodeWriter *writer) { memset(writer, 0, sizeof(*writer)); #ifdef Py_DEBUG writer->kind = 5; /* invalid kind */ #endif

writer->min_length = Py_MAX(min_length, 100);
writer->overallocate = (min_length > 0);

writer->min_char = 127;

} int @@ -12873,29 +12877,28 @@ int } newlen = writer->pos + length;

maxchar = MAX_MAXCHAR(maxchar, writer->min_char);

+ if (writer->buffer == NULL) {

   if (writer->overallocate) {[](#l7.201)

   assert(!writer->readonly);[](#l7.202)

   if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {[](#l7.203)
       /* overallocate 25% to limit the number of resize */[](#l7.204)

       if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))[](#l7.205)

           newlen += newlen / 4;[](#l7.206)

       if (newlen < writer->min_length)[](#l7.207)

           newlen = writer->min_length;[](#l7.208)

```
   }[](#l7.209)
```

```
       newlen += newlen / 4;[](#l7.210)
```
```
   }[](#l7.211)
```

   if (newlen < writer->min_length)[](#l7.212)

       newlen = writer->min_length;[](#l7.213)

+ writer->buffer = PyUnicode_New(newlen, maxchar); if (writer->buffer == NULL) return -1;

   _PyUnicodeWriter_Update(writer);[](#l7.218)

```
   return 0;[](#l7.219)
```
}

if (newlen > writer->size) {

   if (writer->overallocate) {[](#l7.223)

}
else if (newlen > writer->size) {

   if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {[](#l7.226)
       /* overallocate 25% to limit the number of resize */[](#l7.227)

       if (newlen <= (PY_SSIZE_T_MAX - newlen / 4))[](#l7.228)

           newlen += newlen / 4;[](#l7.229)

       if (newlen < writer->min_length)[](#l7.230)

           newlen = writer->min_length;[](#l7.231)

```
   }[](#l7.232)
```

```
       newlen += newlen / 4;[](#l7.233)
```
```
   }[](#l7.234)
```

   if (newlen < writer->min_length)[](#l7.235)

       newlen = writer->min_length;[](#l7.236)

if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */ @@ -12913,7 +12916,6 @@ int return -1; } writer->buffer = newbuffer;

```
   _PyUnicodeWriter_Update(writer);[](#l7.244)
```
} else if (maxchar > writer->maxchar) { assert(!writer->readonly);

@@ -12924,8 +12926,8 @@ int writer->buffer, 0, writer->pos); Py_DECREF(writer->buffer); writer->buffer = newbuffer;

   _PyUnicodeWriter_Update(writer);[](#l7.252)

}

}
_PyUnicodeWriter_Update(writer); return 0; }

@@ -12959,11 +12961,10 @@ int maxchar = PyUnicode_MAX_CHAR_VALUE(str); if (maxchar > writer->maxchar || len > writer->size - writer->pos) { if (writer->buffer == NULL && !writer->overallocate) {

       writer->readonly = 1;[](#l7.263)
       Py_INCREF(str);[](#l7.264)
       writer->buffer = str;[](#l7.265)
       _PyUnicodeWriter_Update(writer);[](#l7.266)

```
       writer->readonly = 1;[](#l7.267)
```

       writer->size = 0;[](#l7.268)
       writer->pos += len;[](#l7.269)
       return 0;[](#l7.270)
   }[](#l7.271)

@@ -13080,7 +13081,7 @@ unicode__format__(PyObject* self, PyObje if (PyUnicode_READY(self) == -1) return NULL;

_PyUnicodeWriter_Init(&writer, 0);

_PyUnicodeWriter_Init(&writer); ret = _PyUnicode_FormatAdvancedWriter(&writer, self, format_spec, 0, PyUnicode_GET_LENGTH(format_spec));

@@ -14164,7 +14165,9 @@ PyUnicode_Format(PyObject *format, PyObj ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr); ctx.fmtpos = 0;

_PyUnicodeWriter_Init(&ctx.writer, ctx.fmtcnt + 100);

_PyUnicodeWriter_Init(&ctx.writer);
ctx.writer.min_length = ctx.fmtcnt + 100;
ctx.writer.overallocate = 1;

if (PyTuple_Check(args)) { ctx.arglen = PyTuple_Size(args);